xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
  46. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,6 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import base64
- import json
  import logging
  import time
  import uuid
@@ -124,29 +123,60 @@ class MiniCPMV26Model(PytorchChatModel):
  else:
  return Image.open(BytesIO(response.content)).convert("RGB")

+ MAX_NUM_FRAMES = 64
+
+ def encode_video(video_path):
+ from decord import VideoReader, cpu
+
+ def uniform_sample(l, n):
+ gap = len(l) / n
+ idxs = [int(i * gap + gap / 2) for i in range(n)]
+ return [l[i] for i in idxs]
+
+ vr = VideoReader(video_path, ctx=cpu(0))
+ sample_fps = round(vr.get_avg_fps() / 1) # FPS
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
+ if len(frame_idx) > MAX_NUM_FRAMES:
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+ frames = vr.get_batch(frame_idx).asnumpy()
+ frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+ print("num frames:", len(frames))
+ return frames
+
+ def _load_video(_url):
+ frames = None
+ if _url.startswith("data:"):
+ raise RuntimeError("Only video url format is supported")
+ else:
+ frames = encode_video(_url)
+ return frames
+
  if not isinstance(content, str):
  texts = []
  image_urls = []
+ video_urls = []
  for c in content:
  c_type = c.get("type")
  if c_type == "text":
  texts.append(c["text"])
  elif c_type == "image_url":
  image_urls.append(c["image_url"]["url"])
+ elif c_type == "video_url":
+ video_urls.append(c["video_url"]["url"])
  image_futures = []
  with ThreadPoolExecutor() as executor:
  for image_url in image_urls:
  fut = executor.submit(_load_image, image_url)
  image_futures.append(fut)
  images = [fut.result() for fut in image_futures]
+ frames = []
+ if len(video_urls) > 1:
+ raise RuntimeError("Only one video per message is supported")
+ for v in video_urls:
+ frames = _load_video(v)
  text = " ".join(texts)
- if len(images) == 0:
- return text, []
- elif len(images) == 1:
- return text, images
- else:
- raise RuntimeError("Only one image per message is supported")
- return content, []
+ return text, images, frames
+ return content, [], []

  def chat(
  self,
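For orientation on the video path added above: encode_video keeps roughly one frame per second, then uniform_sample caps the result at MAX_NUM_FRAMES (64). A standalone sketch of just the sampling step, using a made-up frame count and no decord dependency:

```python
# Illustrative only: mirrors the sampling logic in the hunk above.
MAX_NUM_FRAMES = 64

def uniform_sample(l, n):
    # pick n indices spread evenly across l, taking the middle of each bucket
    gap = len(l) / n
    idxs = [int(i * gap + gap / 2) for i in range(n)]
    return [l[i] for i in idxs]

# hypothetical 5-minute clip at 30 fps: one candidate frame per second -> 300 indices
frame_idx = list(range(0, 300 * 30, 30))
if len(frame_idx) > MAX_NUM_FRAMES:
    frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)

print(len(frame_idx))  # 64 frames would be decoded and handed to the model
```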
@@ -156,36 +186,51 @@ class MiniCPMV26Model(PytorchChatModel):
  generate_config: Optional[PytorchGenerateConfig] = None,
  ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
  stream = generate_config.get("stream", False) if generate_config else False
- content, images_chat = self._message_content_to_chat(prompt)
+ videoExisted = False
+
+ content, images_chat, video_frames = self._message_content_to_chat(prompt)
+ if len(video_frames) > 0:
+ videoExisted = True
+ images_chat = video_frames

  msgs = []
  query_to_response: List[Dict] = []
- images_history = []
  for h in chat_history or []:
+ images_history = []
  role = h["role"]
- content_h, images_tmp = self._message_content_to_chat(h["content"])
+ content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+ h["content"]
+ )
  if images_tmp != []:
  images_history = images_tmp
+ if len(video_frames_h) > 0:
+ videoExisted = True
+ images_history = video_frames_h
  if len(query_to_response) == 0 and role == "user":
- query_to_response.append({"role": "user", "content": content_h})
+ query_to_response.append(
+ {"role": "user", "content": images_history + [content_h]}
+ )
  if len(query_to_response) == 1 and role == "assistant":
- query_to_response.append({"role": "assistant", "content": content_h})
+ query_to_response.append(
+ {"role": "assistant", "content": images_history + [content_h]}
+ )
  if len(query_to_response) == 2:
  msgs.extend(query_to_response)
  query_to_response = []
- image = None
- if len(images_chat) > 0:
- image = images_chat[0]
- elif len(images_history) > 0:
- image = images_history[0]
- msgs.append({"role": "user", "content": content})
+ msgs.append({"role": "user", "content": images_chat + [content]})
+
+ # Set decode params for video
+ params = {}
+ if videoExisted:
+ params = {"use_image_id": False, "max_slice_nums": 1}

  chat = self._model.chat(
- image=image,
- msgs=json.dumps(msgs, ensure_ascii=True),
+ image=None,
+ msgs=msgs,
  tokenizer=self._tokenizer,
  sampling=True,
- **generate_config
+ **generate_config,
+ **params,
  )
  if stream:
  it = self.chat_stream(chat)
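With the two hunks above, MiniCPM-V 2.6 accepts a video_url content part and passes the sampled frames to model.chat. A minimal client-side sketch of such a request through the OpenAI-compatible endpoint; the host/port, model uid and video URL are placeholders, not values taken from this diff:

```python
# Hedged sketch: server address, model uid and URL are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="MiniCPM-V-2.6",  # hypothetical model uid
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what happens in this clip."},
                # plain URLs only: data: URIs raise RuntimeError in _load_video above
                {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```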
@@ -40,8 +40,7 @@ from ....types import (
  )

  if TYPE_CHECKING:
- from ...llm.pytorch.core import PytorchModel
-
+ from ...llm.transformers.core import PytorchModel

  logger = logging.getLogger(__name__)

@@ -11,14 +11,19 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import base64
  import functools
  import json
  import logging
  import os
  import time
  import uuid
+ from io import BytesIO
  from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast

+ import requests
+ from PIL import Image
+
  from ...types import (
  SPECIAL_TOOL_PROMPT,
  ChatCompletion,
@@ -28,7 +33,7 @@ from ...types import (
  CompletionChunk,
  )
  from .llm_family import (
- GgmlLLMSpecV1,
+ LlamaCppLLMSpecV1,
  LLMFamilyV1,
  LLMSpecV1,
  PromptStyleV1,
@@ -60,7 +65,7 @@ class ChatModelMixin:
  chat_history: List[ChatCompletionMessage],
  prompt_style: PromptStyleV1,
  tools: Optional[List[Dict]] = None,
- ) -> str:
+ ):
  """
  Inspired by FastChat. Format chat history into a prompt according to the prompty style of
  different models.
@@ -92,17 +97,6 @@ class ChatModelMixin:
  else:
  ret += role + ":"
  return ret
- elif prompt_style.style_name == "ADD_COLON_TWO":
- seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
- ret = prompt_style.system_prompt + seps[0]
- for i, message in enumerate(chat_history):
- role = get_role(message["role"])
- content = message["content"]
- if content:
- ret += role + ": " + content + seps[i % 2]
- else:
- ret += role + ":"
- return ret
  elif prompt_style.style_name == "NO_COLON_TWO":
  seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
  ret = prompt_style.system_prompt
@@ -144,21 +138,6 @@ class ChatModelMixin:
  else:
  ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}"
  return ret
- elif prompt_style.style_name == "FALCON":
- ret = prompt_style.system_prompt
- for message in chat_history:
- role = get_role(message["role"])
- content = message["content"]
- if content:
- ret += (
- role
- + ": "
- + content.replace("\r\n", "\n").replace("\n\n", "\n")
- )
- ret += "\n\n"
- else:
- ret += role + ":"
- return ret
  elif prompt_style.style_name == "MIXTRAL_V01":
  ret = ""
  for i, message in enumerate(chat_history):
@@ -168,22 +147,6 @@ class ChatModelMixin:
  else: # assistant
  ret += f"{content} </s>"
  return ret
- elif prompt_style.style_name == "CHATGLM":
- round_add_n = 1 if prompt_style.intra_message_sep == "\n\n" else 0
- if prompt_style.system_prompt:
- ret = prompt_style.system_prompt + prompt_style.intra_message_sep
- else:
- ret = ""
- for i, message in enumerate(chat_history):
- role = get_role(message["role"])
- content = message["content"]
- if i % 2 == 0:
- ret += f"[Round {i // 2 + round_add_n}]{prompt_style.intra_message_sep}"
- if content:
- ret += role + ":" + content + prompt_style.intra_message_sep
- else:
- ret += role + ":"
- return ret
  elif prompt_style.style_name == "CHATGLM3":
  prompts = (
  [f"<|system|>\n {prompt_style.system_prompt}"]
@@ -323,25 +286,6 @@ Begin!"""
  else:
  ret += role + "\n"
  return ret
- elif prompt_style.style_name == "INTERNLM":
- seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
- ret = ""
- for i, message in enumerate(chat_history[:-2]):
- if i % 2 == 0:
- ret += "<s>"
- role = get_role(message["role"])
- content = message["content"]
- ret += role + ":" + str(content) + seps[i % 2]
- if len(ret) == 0:
- ret += "<s>"
- ret += (
- chat_history[-2]["role"]
- + ":"
- + str(chat_history[-2]["content"])
- + seps[0]
- )
- ret += chat_history[-1]["role"] + ":"
- return ret
  elif prompt_style.style_name == "INTERNLM2":
  ret = (
  "<s>"
@@ -370,9 +314,6 @@ Begin!"""
  else:
  ret += role + ": Let's think step by step."
  return ret
- elif prompt_style.style_name == "INSTRUCTION":
- message = chat_history[-2]
- return prompt_style.system_prompt.format(message["content"])
  elif prompt_style.style_name == "DEEPSEEK_CHAT":
  seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
  ret = prompt_style.system_prompt
@@ -504,6 +445,52 @@ Begin!"""
  else:
  ret += role
  return ret
+ elif prompt_style.style_name == "INTERNVL":
+ ret = (
+ "<s>"
+ if prompt_style.system_prompt == ""
+ else "<s><|im_start|>system\n"
+ + prompt_style.system_prompt
+ + prompt_style.intra_message_sep
+ + "\n"
+ )
+ images = [] # type: ignore
+ for message in chat_history:
+ role = get_role(message["role"])
+ content = message["content"]
+ if isinstance(content, str):
+ ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+ elif isinstance(content, list):
+ text = ""
+ image_urls = []
+ for c in content:
+ c_type = c.get("type")
+ if c_type == "text":
+ text = c["text"]
+ elif c_type == "image_url":
+ image_urls.append(c["image_url"]["url"])
+ image_futures = []
+ from concurrent.futures import ThreadPoolExecutor
+
+ with ThreadPoolExecutor() as executor:
+ for image_url in image_urls:
+ fut = executor.submit(_decode_image, image_url)
+ image_futures.append(fut)
+ images = [fut.result() for fut in image_futures]
+ if len(image_futures) == 0:
+ ret += (
+ role + "\n" + text + prompt_style.intra_message_sep + "\n"
+ )
+ else:
+ ret += (
+ role
+ + "\n"
+ + f"<image>\n{text}"
+ + prompt_style.intra_message_sep
+ + "\n"
+ )
+
+ return (ret, images)
  else:
  raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
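The new INTERNVL branch returns a (prompt, images) tuple rather than a plain string, and prefixes each image-bearing turn with an <image> placeholder. A rough sketch of the string it assembles for a single user turn with one image; the system prompt, role tags and intra_message_sep value are assumptions about the registered prompt style, not values from this diff:

```python
# Illustrative only: role/separator strings are assumed, not from llm_family.json.
system_prompt = "You are a helpful assistant."
intra_message_sep = "<|im_end|>"

ret = "<s><|im_start|>system\n" + system_prompt + intra_message_sep + "\n"
# a user turn that carried one image_url part: its text gets an <image> placeholder
ret += "<|im_start|>user" + "\n" + "<image>\nWhat is in this picture?" + intra_message_sep + "\n"
print(ret)  # the decoded PIL images are returned alongside this string
```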

@@ -706,7 +693,7 @@ Begin!"""
  family = model_family.model_family or model_family.model_name
  if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
  content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
- elif family in ["chatglm3"] + GLM4_TOOL_CALL_FAMILY:
+ elif family in GLM4_TOOL_CALL_FAMILY:
  content, func, args = cls._eval_glm_chat_arguments(c, tools)
  elif family in QWEN_TOOL_CALL_FAMILY:
  content, func, args = cls._eval_qwen_chat_arguments(c, tools)
@@ -870,10 +857,10 @@ def get_file_location(
  is_cached = cache_status
  assert isinstance(is_cached, bool)

- if spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+ if spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
  return cache_dir, is_cached
- elif spec.model_format in ["ggmlv3", "ggufv2"]:
- assert isinstance(spec, GgmlLLMSpecV1)
+ elif spec.model_format in ["ggufv2"]:
+ assert isinstance(spec, LlamaCppLLMSpecV1)
  filename = spec.model_file_name_template.format(quantization=quantization)
  model_path = os.path.join(cache_dir, filename)
  return model_path, is_cached
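For ggufv2 specs, get_file_location now resolves the concrete file by formatting the spec's model_file_name_template with the chosen quantization. A tiny illustration with made-up values; the template and paths here are hypothetical, not from llm_family.json:

```python
# Hypothetical values, for illustration only.
import os

cache_dir = "/home/user/.xinference/cache/qwen2-instruct-ggufv2-7b"
model_file_name_template = "qwen2-7b-instruct-{quantization}.gguf"
quantization = "q4_k_m"

filename = model_file_name_template.format(quantization=quantization)
print(os.path.join(cache_dir, filename))
# /home/user/.xinference/cache/qwen2-instruct-ggufv2-7b/qwen2-7b-instruct-q4_k_m.gguf
```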
@@ -885,3 +872,22 @@ def get_model_version(
  llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
  ) -> str:
  return f"{llm_family.model_name}--{llm_spec.model_size_in_billions}B--{llm_spec.model_format}--{quantization}"
+
+
+ def _decode_image(_url):
+ if _url.startswith("data:"):
+ logging.info("Parse url by base64 decoder.")
+ # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+ # e.g. f"data:image/jpeg;base64,{base64_image}"
+ _type, data = _url.split(";")
+ _, ext = _type.split("/")
+ data = data[len("base64,") :]
+ data = base64.b64decode(data.encode("utf-8"))
+ return Image.open(BytesIO(data)).convert("RGB")
+ else:
+ try:
+ response = requests.get(_url)
+ except requests.exceptions.MissingSchema:
+ return Image.open(_url).convert("RGB")
+ else:
+ return Image.open(BytesIO(response.content)).convert("RGB")
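The new module-level _decode_image helper accepts an OpenAI-style base64 data: URI, an http(s) URL, or a bare local path (the MissingSchema fallback). A small sketch of producing a data: URI in the format its comment references; the file name is a placeholder:

```python
# Sketch only: "photo.jpg" is a placeholder path.
import base64

with open("photo.jpg", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

url = f"data:image/jpeg;base64,{b64}"
# _decode_image(url) strips the "data:image/jpeg;base64," prefix, base64-decodes
# the remainder and returns a PIL image converted to RGB.
```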
@@ -21,6 +21,7 @@ import time
  import uuid
  from typing import (
  TYPE_CHECKING,
+ Any,
  AsyncGenerator,
  Dict,
  Iterable,
@@ -88,11 +89,12 @@ try:
  except ImportError:
  VLLM_INSTALLED = False

+ VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
+ "internvl2",
+ ]
  VLLM_SUPPORTED_MODELS = [
  "llama-2",
  "llama-3",
- "baichuan",
- "internlm-16k",
  "mistral-v0.1",
  "codestral-v0.1",
  "Yi",
@@ -105,13 +107,7 @@ VLLM_SUPPORTED_MODELS = [
  VLLM_SUPPORTED_CHAT_MODELS = [
  "llama-2-chat",
  "llama-3-instruct",
- "vicuna-v1.3",
- "vicuna-v1.5",
- "baichuan-chat",
  "baichuan-2-chat",
- "internlm-chat-7b",
- "internlm-chat-8k",
- "internlm-chat-20b",
  "internlm2-chat",
  "internlm2.5-chat",
  "internlm2.5-chat-1m",
@@ -338,7 +334,7 @@ class VLLMModel(LLM):
  return False
  if not cls._is_linux():
  return False
- if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+ if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
  return False
  if llm_spec.model_format == "pytorch":
  if quantization != "none" and not (quantization is None):
@@ -421,7 +417,7 @@ class VLLMModel(LLM):

  async def async_generate(
  self,
- prompt: str,
+ prompt: Union[str, Dict[str, Any]],
  generate_config: Optional[Dict] = None,
  tools: object = False,
  ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
@@ -558,7 +554,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  def match(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
- if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+ if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
  return False
  if llm_spec.model_format == "pytorch":
  if quantization != "none" and not (quantization is None):
@@ -644,3 +640,106 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  self.model_family, self.model_uid, c, tools
  )
  return self._to_chat_completion(c)
+
+
+ class VLLMVisionModel(VLLMModel, ChatModelMixin):
+ def load(self):
+ try:
+ import vllm
+ from vllm.engine.arg_utils import AsyncEngineArgs
+ from vllm.engine.async_llm_engine import AsyncLLMEngine
+ except ImportError:
+ error_message = "Failed to import module 'vllm'"
+ installation_guide = [
+ "Please make sure 'vllm' is installed. ",
+ "You can install it by `pip install vllm`\n",
+ ]
+ raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+ if vllm.__version__ >= "0.3.1":
+ # from vllm v0.3.1, it uses cupy as NCCL backend
+ # in which cupy will fork a process
+ # only for xoscar >= 0.3.0, new process is allowed in subpool
+ # besides, xinference set start method as forkserver for unix
+ # we need to set it to fork to make cupy NCCL work
+ multiprocessing.set_start_method("fork", force=True)
+
+ self._model_config = self._sanitize_model_config(self._model_config)
+
+ logger.info(
+ f"Loading {self.model_uid} with following model config: {self._model_config}"
+ )
+
+ engine_args = AsyncEngineArgs(
+ model=self.model_path,
+ **self._model_config,
+ )
+ self._engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+ @classmethod
+ def match(
+ cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ if llm_spec.model_format != "pytorch":
+ return False
+ if llm_spec.model_format == "pytorch":
+ if quantization != "none" and not (quantization is None):
+ return False
+ if isinstance(llm_family, CustomLLMFamilyV1):
+ if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+ return False
+ else:
+ if llm_family.model_name not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+ return False
+ if "vision" not in llm_family.model_ability:
+ return False
+ return VLLM_INSTALLED
+
+ def _sanitize_chat_config(
+ self,
+ generate_config: Optional[Dict] = None,
+ ) -> Dict:
+ if not generate_config:
+ generate_config = {}
+ if self.model_family.prompt_style:
+ if self.model_family.prompt_style.stop_token_ids:
+ generate_config.setdefault(
+ "stop_token_ids",
+ self.model_family.prompt_style.stop_token_ids.copy(),
+ )
+ return generate_config
+
+ async def async_chat(
+ self,
+ prompt: str,
+ system_prompt: Optional[str] = None,
+ chat_history: Optional[List[ChatCompletionMessage]] = None,
+ generate_config: Optional[Dict] = None,
+ ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+ # only support single image, waiting vllm support multi images
+ assert self.model_family.prompt_style is not None
+ prompt_style = self.model_family.prompt_style.copy()
+ chat_history = chat_history or []
+ prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
+ logger.info(f"messages:{prompt}")
+ if len(images) == 0:
+ inputs = {
+ "prompt": prompt,
+ }
+ else:
+ inputs = {
+ "prompt": prompt,
+ "multi_modal_data": {"image": images[-1]}, # type: ignore
+ }
+ generate_config = self._sanitize_chat_config(generate_config)
+
+ stream = generate_config.get("stream", None)
+
+ if stream:
+ agen = await self.async_generate(inputs, generate_config)
+ assert isinstance(agen, AsyncGenerator)
+ return self._async_to_chat_completion_chunks(agen)
+ else:
+ c = await self.async_generate(inputs, generate_config)
+ assert not isinstance(c, AsyncGenerator)
+ return self._to_chat_completion(c)
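VLLMVisionModel routes internvl2 (pytorch format, unquantized) through vLLM's AsyncLLMEngine and attaches at most one image per request via multi_modal_data. A hedged sketch of exercising it from the Python client; the host/port, model size and the exact launch_model keyword set are assumptions based on the usual xinference client API rather than on this diff:

```python
# Assumption-laden sketch; adjust size/quantization to your deployment.
from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")
model_uid = client.launch_model(
    model_name="internvl2",
    model_engine="vllm",          # picks the new VLLMVisionModel code path
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
)

model = client.get_model(model_uid)
result = model.chat(
    prompt=[
        {"type": "text", "text": "What is shown in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
    generate_config={"max_tokens": 256},
)
print(result["choices"][0]["message"]["content"])
```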
xinference/model/utils.py CHANGED
@@ -14,13 +14,11 @@
  import json
  import logging
  import os
- import shutil
  from json import JSONDecodeError
  from pathlib import Path
  from typing import Any, Callable, Dict, Optional, Tuple, Union

  import huggingface_hub
- from fsspec import AbstractFileSystem

  from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
  from ..device_utils import get_available_device, is_device_available
@@ -220,12 +218,7 @@ def is_valid_model_uri(model_uri: Optional[str]) -> bool:
  return True


- def cache_from_uri(
- model_spec: CacheableModelSpec,
- self_hosted_storage: bool = False,
- ) -> str:
- from fsspec import AbstractFileSystem, filesystem
-
+ def cache_from_uri(model_spec: CacheableModelSpec) -> str:
  cache_dir = os.path.realpath(
  os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
  )
@@ -247,48 +240,6 @@ def cache_from_uri(
  os.makedirs(XINFERENCE_CACHE_DIR, exist_ok=True)
  os.symlink(src_root, cache_dir, target_is_directory=True)
  return cache_dir
- elif src_scheme in ["s3"]:
- # use anonymous connection for self-hosted storage.
- src_fs: AbstractFileSystem = filesystem(src_scheme, anon=self_hosted_storage)
- local_fs: AbstractFileSystem = filesystem("file")
-
- files_to_download = []
- os.makedirs(cache_dir, exist_ok=True)
-
- for path, _, files in src_fs.walk(model_spec.model_uri):
- for file in files:
- src_path = f"{path}/{file}"
- local_path = src_path.replace(src_root, cache_dir)
- files_to_download.append((src_path, local_path))
-
- from concurrent.futures import ThreadPoolExecutor
-
- failed = False
- with ThreadPoolExecutor(max_workers=min(len(files_to_download), 4)) as executor:
- futures = [
- (
- src_path,
- executor.submit(
- copy_from_src_to_dst, src_fs, src_path, local_fs, local_path
- ),
- )
- for src_path, local_path in files_to_download
- ]
- for src_path, future in futures:
- if failed:
- future.cancel()
- else:
- try:
- future.result()
- except:
- logger.error(f"Download {src_path} failed", exc_info=True)
- failed = True
-
- if failed:
- logger.warning(f"Removing cache directory: {cache_dir}")
- shutil.rmtree(cache_dir, ignore_errors=True)
- raise RuntimeError(f"Failed to download model '{model_spec.model_name}' ")
- return cache_dir
  else:
  raise ValueError(f"Unsupported URL scheme: {src_scheme}")

@@ -346,51 +297,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
  return cache_dir


- def copy_from_src_to_dst(
- _src_fs: "AbstractFileSystem",
- _src_path: str,
- dst_fs: "AbstractFileSystem",
- dst_path: str,
- max_attempt: int = 3,
- ):
- from tqdm import tqdm
-
- for attempt in range(max_attempt):
- logger.info(f"Copy from {_src_path} to {dst_path}, attempt: {attempt}")
- try:
- with _src_fs.open(_src_path, "rb") as src_file:
- file_size = _src_fs.info(_src_path)["size"]
-
- dst_fs.makedirs(os.path.dirname(dst_path), exist_ok=True)
- with dst_fs.open(dst_path, "wb") as dst_file:
- chunk_size = 1024 * 1024 # 1 MB
-
- with tqdm(
- total=file_size,
- unit="B",
- unit_scale=True,
- unit_divisor=1024,
- desc=_src_path,
- ) as pbar:
- while True:
- chunk = src_file.read(chunk_size)
- if not chunk:
- break
- dst_file.write(chunk)
- pbar.update(len(chunk))
- logger.info(
- f"Copy from {_src_path} to {dst_path} finished, attempt: {attempt}"
- )
- break
- except:
- logger.error(
- f"Failed to copy from {_src_path} to {dst_path} on attempt {attempt + 1}",
- exc_info=True,
- )
- if attempt + 1 == max_attempt:
- raise
-
-
  def patch_trust_remote_code():
  """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
  models will fail to load, e.g. jina-embeddings-v2-base-en