xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (103)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +47 -18
  3. xinference/api/oauth2/types.py +1 -0
  4. xinference/api/restful_api.py +34 -7
  5. xinference/client/oscar/actor_client.py +4 -3
  6. xinference/client/restful/restful_client.py +20 -4
  7. xinference/conftest.py +13 -2
  8. xinference/core/supervisor.py +48 -1
  9. xinference/core/worker.py +139 -20
  10. xinference/deploy/cmdline.py +119 -20
  11. xinference/model/embedding/core.py +1 -2
  12. xinference/model/llm/__init__.py +4 -6
  13. xinference/model/llm/ggml/llamacpp.py +2 -10
  14. xinference/model/llm/llm_family.json +877 -13
  15. xinference/model/llm/llm_family.py +15 -0
  16. xinference/model/llm/llm_family_modelscope.json +571 -0
  17. xinference/model/llm/pytorch/chatglm.py +2 -0
  18. xinference/model/llm/pytorch/core.py +22 -26
  19. xinference/model/llm/pytorch/deepseek_vl.py +232 -0
  20. xinference/model/llm/pytorch/internlm2.py +2 -0
  21. xinference/model/llm/pytorch/omnilmm.py +153 -0
  22. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  23. xinference/model/llm/pytorch/yi_vl.py +4 -2
  24. xinference/model/llm/utils.py +53 -5
  25. xinference/model/llm/vllm/core.py +54 -6
  26. xinference/model/rerank/core.py +3 -0
  27. xinference/thirdparty/deepseek_vl/__init__.py +31 -0
  28. xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
  29. xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
  30. xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
  31. xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
  32. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
  33. xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
  34. xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
  35. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
  36. xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
  37. xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
  38. xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
  39. xinference/thirdparty/omnilmm/__init__.py +0 -0
  40. xinference/thirdparty/omnilmm/chat.py +216 -0
  41. xinference/thirdparty/omnilmm/constants.py +4 -0
  42. xinference/thirdparty/omnilmm/conversation.py +332 -0
  43. xinference/thirdparty/omnilmm/model/__init__.py +1 -0
  44. xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
  45. xinference/thirdparty/omnilmm/model/resampler.py +166 -0
  46. xinference/thirdparty/omnilmm/model/utils.py +563 -0
  47. xinference/thirdparty/omnilmm/train/__init__.py +13 -0
  48. xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
  49. xinference/thirdparty/omnilmm/utils.py +134 -0
  50. xinference/types.py +15 -19
  51. xinference/web/ui/build/asset-manifest.json +3 -3
  52. xinference/web/ui/build/index.html +1 -1
  53. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  54. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  73. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
  74. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
  75. xinference/model/llm/ggml/ctransformers.py +0 -281
  76. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  77. xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
  78. xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  99. /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  100. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  101. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  102. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  103. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
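
The most visible changes in this release are two new multimodal chat model implementations (deepseek_vl.py and omnilmm.py plus their thirdparty support code), a large batch of new entries in llm_family.json, streaming tool-call support in the vLLM backend, and the removal of the ctransformers backend. A minimal, hedged sketch of launching one of the new vision models through the RESTful client follows; the endpoint and the registered model name "deepseek-vl-chat" are assumptions, not something this diff states.

# Hedged sketch only. Assumes a running xinference endpoint on the default port
# and that the new DeepSeek-VL model is registered under "deepseek-vl-chat".
from xinference.client import Client

client = Client("http://localhost:9997")                        # assumed local endpoint
model_uid = client.launch_model(model_name="deepseek-vl-chat")  # assumed model name
model = client.get_model(model_uid)                             # handle exposing chat(...)
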
xinference/model/llm/pytorch/deepseek_vl.py
@@ -0,0 +1,232 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import tempfile
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+import torch
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekVLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._vl_chat_processor = None
+        self._type = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "deepseek" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoModelForCausalLM
+
+        from ....thirdparty.deepseek_vl.models import (
+            MultiModalityCausalLM,
+            VLChatProcessor,
+        )
+
+        self._device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(self._device)
+        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+
+        # specify the path to the model
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            self.model_path
+        )
+        self._tokenizer = self._vl_chat_processor.tokenizer
+
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            self.model_path, trust_remote_code=True, device_map=self._device
+        )
+        self._model = vl_gpt.to(self._type).eval()
+
+    @staticmethod
+    def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+
+                return _url
+
+        def _download(_images):
+            local_images = []
+
+            # To make requests.get works
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+            }
+            with ThreadPoolExecutor() as executor:
+                for url in images:
+                    try:
+                        if os.path.exists(url):
+                            local_images.append(url)
+                            continue
+                    except Exception as e:
+                        logger.debug("Image is remote: %s, e: %s", url, e)
+                        pass
+                    # Append a placeholder
+                    local_images.append(None)
+
+                    def _fill_placeholder(_url, _index):
+                        response = requests.get(url, headers=headers)
+                        local_images[_index] = BytesIO(response.content)
+
+                    executor.submit(_fill_placeholder, url, len(local_images) - 1)
+            return local_images
+
+        if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
+
+            images = []
+            new_content = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "image_url":
+                    images.append(_ensure_url(c["image_url"]["url"]))
+                elif c_type == "text":
+                    new_content.append(c["text"])
+            if images:
+                new_content.insert(0, "<image_placeholder>")
+                images = _download(images)
+            return "".join(new_content), images
+        return content, []
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        prompt, images = self._message_content_to_deepseek(prompt)
+        prompt_messages: List[Dict[str, Any]] = [
+            {
+                "role": "User",
+                "content": prompt,
+            },
+            {"role": "Assistant", "content": ""},
+        ]
+        if images:
+            prompt_messages[0]["images"] = images
+
+        # Convert openai history to qwen vl history
+        deepseek_history = []
+        for h in chat_history or []:
+            role = h["role"]
+            if role == "user":
+                content, images = self._message_content_to_deepseek(h["content"])
+                msg: Dict[str, Any] = {
+                    "role": "User",
+                    "content": content,
+                }
+                if images:
+                    msg["images"] = images
+                deepseek_history.append(msg)
+            elif role == "assistant":
+                deepseek_history.append({"role": "Assistant", "content": h["content"]})
+            else:
+                logger.error("Unexpected msg in chat history: %s", h)
+
+        deepseek_history.extend(prompt_messages)
+
+        from ....thirdparty.deepseek_vl.utils.io import load_pil_images
+
+        # load images and prepare for inputs
+        pil_images = load_pil_images(deepseek_history)
+        prepare_inputs = self._vl_chat_processor(
+            conversations=deepseek_history, images=pil_images, force_batchify=True
+        ).to(self._model.device, self._model.dtype)
+
+        # run image encoder to get the image embeddings
+        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
+
+        # run the model to get the response
+        outputs = self._model.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=self._tokenizer.eos_token_id,
+            bos_token_id=self._tokenizer.bos_token_id,
+            eos_token_id=self._tokenizer.eos_token_id,
+            max_new_tokens=512,
+            do_sample=True,
+            top_p=0.95,
+            temperature=0.2,
+            repetition_penalty=1.1,
+            use_cache=True,
+        )
+
+        answer = self._tokenizer.decode(
+            outputs[0].cpu().tolist(), skip_special_tokens=True
+        )
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
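
The chat() method above accepts either a plain string or OpenAI-vision-style content parts, which _message_content_to_deepseek() splits into text and image URLs (http(s) URLs under 2048 characters, or base64 data: URLs). A small, hedged example of that content shape; the URL and the model handle are illustrative only.

# Content-part layout parsed by _message_content_to_deepseek() above.
prompt_content = [
    {"type": "text", "text": "What is shown in this image?"},
    {
        "type": "image_url",
        # May also be a data: URL, see _ensure_url(); this URL is hypothetical.
        "image_url": {"url": "https://example.com/cat.png"},
    },
]
# `model` would be a launched DeepSeek-VL chat model handle (assumption); streaming
# is rejected by this model, so no stream=True in generate_config.
# completion = model.chat(prompt=prompt_content)
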

xinference/model/llm/pytorch/internlm2.py
@@ -114,6 +114,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             ]
         else:
             input_history = []
+        if system_prompt:
+            kwargs["meta_instruction"] = system_prompt
         if stream:
 
             def _stream_generator():
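
A small illustration of what the two added lines do: a system_prompt supplied to chat() is now forwarded to InternLM2 as meta_instruction instead of being dropped. The snippet below only mirrors that mapping; the assumption that InternLM2's remote-code chat API accepts a meta_instruction argument comes from its upstream modeling code, not from this diff.

# Mirrors the added mapping; values are illustrative.
system_prompt = "You answer in exactly one sentence."
kwargs = {}
if system_prompt:
    kwargs["meta_instruction"] = system_prompt
print(kwargs)  # {'meta_instruction': 'You answer in exactly one sentence.'}
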

xinference/model/llm/pytorch/omnilmm.py
@@ -0,0 +1,153 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import json
+import logging
+import operator
+import tempfile
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ...utils import select_device
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class OmniLMMModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._model = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "OmniLMM" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+
+        self._model = OmniLMMChat(self.model_path, device_map=device)
+
+    def _message_content_to_OmniLMM(
+        self, content
+    ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+                return _url
+
+        if not isinstance(content, str):
+            images = []
+            other_content = []
+
+            for c in content:
+                if c.get("type") == "image_url":
+                    images.append(
+                        {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
+                    )
+                else:
+                    other_content.append(c)
+
+            images = sorted(images, key=operator.itemgetter("type"))
+            other_content = sorted(other_content, key=operator.itemgetter("type"))
+
+            return images, other_content
+        return [], [{"type": "text", "text": content}]
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        image_first, prompt = self._message_content_to_OmniLMM(prompt)
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        image_another = []
+        for h in chat_history or []:
+            role = h["role"]
+            image_tmp, content = self._message_content_to_OmniLMM(h["content"])
+            if image_tmp != []:
+                image_another = image_tmp
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        if image_first != []:
+            image = image_first
+        if image_another != []:
+            image = image_another
+        im_64 = img2base64(image[0]["image"])
+        msgs.append({"role": "user", "content": prompt[0]["text"]})
+        input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
+        answer = self._model.chat(input=input)
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
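
Both new vision models accept base64 data: URLs in the image_url field (see _ensure_url() above, which decodes them to a temporary file). A hedged sketch of building such a URL; the helper name and file path are illustrative, not part of the package.

import base64

def to_data_url(path: str, mime: str = "image/png") -> str:
    # Produces the "data:image/...;base64,..." form that _ensure_url() decodes.
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"

# e.g. {"type": "image_url", "image_url": {"url": to_data_url("cat.png")}}
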

xinference/model/llm/pytorch/qwen_vl.py
@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
 
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,

xinference/model/llm/pytorch/yi_vl.py
@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):
 
         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device
 
         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )
 
         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),

xinference/model/llm/utils.py
@@ -163,7 +163,7 @@ class ChatModelMixin:
 
             for i, message in enumerate(chat_history):
                 role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
                 tool_calls = message.get("tool_calls")
                 if tool_calls:
                     content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
             ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
             for message in chat_history:
                 role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
 
                 ret += prompt_style.intra_message_sep
                 if tools:
@@ -421,6 +421,16 @@ Begin!"""
                 else:
                     ret += f"{role}".rstrip()
             return ret
+        elif prompt_style.style_name == "MINICPM-2B":
+            ret = ""
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if role == "user":
+                    ret += "<用户>" + content.strip()
+                else:
+                    ret += "<AI>" + content.strip()
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -436,6 +446,11 @@ Begin!"""
                     "index": i,
                     "delta": {
                         "content": choice["text"],
+                        **(
+                            {"tool_calls": choice["tool_calls"]}
+                            if "tool_calls" in choice
+                            else {}
+                        ),
                     },
                     "finish_reason": choice["finish_reason"],
                 }
@@ -582,10 +597,9 @@ Begin!"""
         return text, None, None
 
     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
         family = model_family.model_family or model_family.model_name
-        if "gorilla-openfunctions-v1" == family:
+        if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
         elif "chatglm3" == family:
             content, func, args = cls._eval_chatglm3_arguments(c, tools)
@@ -596,7 +610,41 @@ Begin!"""
                 f"Model {model_family.model_name} is not support tool calls."
             )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
 
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
+
+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
         if func:
             m = {
                 "role": "assistant",

xinference/model/llm/vllm/core.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,25 +83,36 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
     "vicuna-v1.5",
     "baichuan-chat",
+    "baichuan-2-chat",
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
+    "deepseek-chat",
+    "deepseek-coder-instruct",
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
@@ -110,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -290,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -316,16 +334,46 @@ class VLLMModel(LLM):
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
            async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -413,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -426,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)
 
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
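
With async_generate() now accepting tools and the delta spread added in utils.py, a streamed chat that resolves to a function call ends with a chunk whose finish_reason is "tool_calls" and whose delta carries the parsed call. A hedged sketch of roughly what such a chunk looks like after _async_to_chat_completion_chunks(); every value below is illustrative.

# Illustrative shape only; the id, model uid, function name and arguments are made up.
final_chunk = {
    "object": "chat.completion.chunk",
    "model": "qwen1.5-chat-0",  # assumed model_uid
    "choices": [
        {
            "index": 0,
            "delta": {
                "content": None,
                "tool_calls": [
                    {
                        "id": "a-uuid4-string",
                        "type": "function",
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"location": "Beijing"}',
                        },
                    }
                ],
            },
            "finish_reason": "tool_calls",
        }
    ],
}
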

xinference/model/rerank/core.py
@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]