xinference 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/METADATA +16 -14
  55. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/cogvlm2.py
@@ -1,442 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import logging
- import uuid
- from concurrent.futures import ThreadPoolExecutor
- from typing import Dict, Iterator, List, Optional, Tuple, Union
-
- import torch
-
- from ....core.scheduler import InferenceRequest
- from ....model.utils import select_device
- from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import (
-     _decode_image,
-     generate_chat_completion,
-     generate_completion_chunk,
-     parse_messages,
- )
- from .core import PytorchChatModel, PytorchGenerateConfig
- from .utils import cache_clean, get_max_src_len
-
- logger = logging.getLogger(__name__)
-
-
- LANGUAGE_TOKEN_TYPE = 0
- VISION_TOKEN_TYPE = 1
-
-
- def recur_move_to(item, tgt, criterion_func):
-     """
-     This function is copied from https://github.com/THUDM/CogVLM2/blob/main/basic_demo/cli_demo_batch_inference.py
-     """
-     if criterion_func(item):
-         device_copy = item.to(tgt)
-         return device_copy
-     elif isinstance(item, list):
-         return [recur_move_to(v, tgt, criterion_func) for v in item]
-     elif isinstance(item, tuple):
-         return tuple([recur_move_to(v, tgt, criterion_func) for v in item])
-     elif isinstance(item, dict):
-         return {k: recur_move_to(v, tgt, criterion_func) for k, v in item.items()}
-     else:
-         return item
-
-
- class CogVLM2Model(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._torch_type = None
-         self._device = None
-         self._tokenizer = None
-         self._model = None
-
-     @classmethod
-     def match_json(
-         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-     ) -> bool:
-         family = model_family.model_family or model_family.model_name
-         if "cogvlm2" in family.lower() and "video" not in family.lower():
-             return True
-         return False
-
-     def load(self):
-         from transformers import AutoModelForCausalLM, AutoTokenizer
-         from transformers.generation import GenerationConfig
-
-         device = self._pytorch_model_config.get("device", "auto")
-         self._device = select_device(device)
-         self._torch_type = (
-             torch.bfloat16
-             if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-             else torch.float16
-         )
-
-         if self._check_tensorizer_integrity():
-             self._model, self._tokenizer = self._load_tensorizer()
-             return
-
-         kwargs = self.apply_bnb_quantization()
-
-         self._tokenizer = AutoTokenizer.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-         )
-
-         self._model = AutoModelForCausalLM.from_pretrained(
-             self.model_path,
-             torch_dtype=self._torch_type,
-             trust_remote_code=True,
-             low_cpu_mem_usage=True,
-             device_map="auto",
-             **kwargs
-         ).eval()
-
-         # Specify hyperparameters for generation
-         self._model.generation_config = GenerationConfig.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-         )
-         self._save_tensorizer()
-
-     def _message_content_to_cogvlm2(self, content):
-         if not isinstance(content, str):
-             texts = []
-             image_urls = []
-             for c in content:
-                 c_type = c.get("type")
-                 if c_type == "text":
-                     texts.append(c["text"])
-                 elif c_type == "image_url":
-                     image_urls.append(c["image_url"]["url"])
-             image_futures = []
-             with ThreadPoolExecutor() as executor:
-                 for image_url in image_urls:
-                     fut = executor.submit(_decode_image, image_url)
-                     image_futures.append(fut)
-             images = [fut.result() for fut in image_futures]
-             text = " ".join(texts)
-             if len(images) == 0:
-                 return text, None
-             elif len(images) == 1:
-                 return text, images
-             else:
-                 raise RuntimeError(
-                     "Only one image per message is supported by CogVLM2."
-                 )
-         return content, None
-
-     def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]):
-         query = system_prompt
-         history: List[Tuple] = []
-         pixel_values = None
-         for i in range(0, len(chat_history), 2):
-             user = chat_history[i]["content"]
-             if isinstance(user, List):
-                 for content in user:
-                     c_type = content.get("type")
-                     if c_type == "text":
-                         user = content["text"]
-                     elif c_type == "image_url" and not pixel_values:
-                         pixel_values = _decode_image(content["image_url"]["url"])
-             assistant = chat_history[i + 1]["content"]
-             history.append((user, assistant))
-             query = assistant  # type: ignore
-         return query, history, [pixel_values]
-
-     def get_query_and_history(
-         self,
-         prompt: Union[str, List[Dict]],
-         system_prompt: Optional[str] = None,
-         chat_history: Optional[List[Dict]] = None,
-     ):
-         content, image = self._message_content_to_cogvlm2(prompt)
-
-         history = []
-         history_image = None
-         if chat_history:
-             query, history, history_image = self._history_content_to_cogvlm2(
-                 system_prompt, chat_history  # type: ignore
-             )
-
-         if image and history_image:
-             history = []
-             query = content
-         else:
-             image = image if image else history_image
-             query = content
-         return query, image, history
-
-     @cache_clean
-     def chat(
-         self,
-         messages: List[Dict],
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         system_prompt = ""
-         if messages[0]["role"] == "system":
-             system_prompt = messages[0]["content"]
-         stream = generate_config.get("stream", False) if generate_config else False
-
-         sanitized_config = {
-             "pad_token_id": 128002,
-             "max_new_tokens": generate_config.get("max_tokens", 512)
-             if generate_config
-             else 512,
-         }
-
-         prompt, _, chat_history = parse_messages(messages)
-         query, image, history = self.get_query_and_history(
-             prompt, system_prompt=system_prompt, chat_history=chat_history
-         )
-
-         input_by_model = self._model.build_conversation_input_ids(
-             self._tokenizer,
-             query=query,
-             history=history,
-             images=image,
-             template_version="chat",
-         )
-
-         inputs = {
-             "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self._device),
-             "token_type_ids": input_by_model["token_type_ids"]
-             .unsqueeze(0)
-             .to(self._device),
-             "attention_mask": input_by_model["attention_mask"]
-             .unsqueeze(0)
-             .to(self._device),
-             "images": [
-                 [input_by_model["images"][0].to(self._device).to(self._torch_type)]
-             ]
-             if image is not None
-             else None,
-         }
-
-         if stream:
-             it = self._streaming_chat_response(inputs, sanitized_config)
-             return self._to_chat_completion_chunks(it)
-         else:
-             with torch.no_grad():
-                 outputs = self._model.generate(**inputs, **sanitized_config)
-                 outputs = outputs[:, inputs["input_ids"].shape[1] :]
-                 response = self._tokenizer.decode(outputs[0])
-                 response = response.split("<|end_of_text|>")[0]
-
-             return generate_chat_completion(self.model_uid, response)
-
-     def _streaming_chat_response(
-         self, inputs: Dict, config: Dict
-     ) -> Iterator[CompletionChunk]:
-         from threading import Thread
-
-         from transformers import TextIteratorStreamer
-
-         streamer = TextIteratorStreamer(
-             self._tokenizer, skip_prompt=True, skip_special_tokens=True
-         )
-         generation_kwargs = {
-             "input_ids": inputs["input_ids"],
-             "attention_mask": inputs["attention_mask"],
-             "token_type_ids": inputs["token_type_ids"],
-             "images": inputs["images"],
-             "max_new_tokens": config["max_new_tokens"],
-             "pad_token_id": config["pad_token_id"],
-             "streamer": streamer,
-         }
-
-         thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
-         thread.start()
-
-         completion_id = str(uuid.uuid1())
-         for new_text in streamer:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-             )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
-         )
-
-     @staticmethod
-     def build_position_ids(x, attention_mask=None):
-         """
-         Copied from https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B-int4/blob/main/modeling_cogvlm.py
-         """
-         # Fix: follow the official open-source implementation
-         if attention_mask is not None:
-             tmp = x.clone()
-             tmp[~(attention_mask.bool())] = -1
-         else:
-             tmp = x.clone()
-         # image boi eoi token as LANGUAGE_TOKEN_TYPE
-         is_boi_eoi = torch.zeros_like(x, dtype=torch.bool)
-         is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & (
-             tmp[:, :-1] == LANGUAGE_TOKEN_TYPE
-         )
-         is_boi_eoi[:, 0] |= tmp[:, 0] == VISION_TOKEN_TYPE
-         is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & (
-             tmp[:, 1:] == LANGUAGE_TOKEN_TYPE
-         )
-         is_boi_eoi[:, -1] |= tmp[:, -1] == VISION_TOKEN_TYPE
-         tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE
-         # final position ids
-         y = torch.zeros_like(x, dtype=torch.long)
-         y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | (
-             (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE)
-         )
-         y = y.cumsum(dim=-1)
-         return y
-
-     def get_dtype(self):
-         return self._torch_type
-
-     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
-         prompt, system_prompt, chat_history = parse_messages(messages)
-         system_prompt = system_prompt or ""
-         query, image, history = self.get_query_and_history(
-             prompt, system_prompt=system_prompt, chat_history=chat_history
-         )
-
-         input_by_model: dict = self._model.build_conversation_input_ids(  # type: ignore
-             self._tokenizer,
-             query=query,
-             history=history,
-             images=image,
-             template_version="chat",
-         )
-         return {
-             "input_ids": input_by_model["input_ids"],  # seq_len
-             "token_type_ids": input_by_model["token_type_ids"],  # seq_len
-             "attention_mask": input_by_model["attention_mask"],  # seq_len
-             "images": input_by_model["images"],
-         }
-
-     def prepare_sanitize_generate_config(self, req: InferenceRequest):
-         """
-         See https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/main/generation_config.json
-         """
-         raw_config = req.inference_kwargs.get("raw_params", {})
-         temperature = raw_config.get("temperature", None)
-         if temperature is None:
-             raw_config["temperature"] = 0.6
-         top_p = raw_config.get("top_p", None)
-         if top_p is None:
-             raw_config["top_p"] = 0.9
-         return raw_config
-
-     def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
-         context_len = self.get_context_len()
-         assert isinstance(prompts[0], dict)
-         images = []
-         max_length = float("-inf")
-         for i, feature in enumerate(prompts):
-             req = req_list[i]
-             if "images" in feature:
-                 images.append(feature.pop("images", None))
-             max_src_len = get_max_src_len(context_len, req)
-             input_ids = feature["input_ids"][-max_src_len:]
-             req.prompt_tokens = input_ids.tolist()
-             feature["input_ids"] = input_ids
-             feature["token_type_ids"] = feature["token_type_ids"][-max_src_len:]
-             feature["attention_mask"] = feature["attention_mask"][-max_src_len:]
-             req.extra_kwargs["attention_mask_seq_len"] = feature[
-                 "attention_mask"
-             ].shape[0]
-             max_length = max(len(input_ids), max_length)
-
-         def pad_to_max_length_internal(feature, max_len, idx):
-             padding_length = max_len - len(feature["input_ids"])
-             req_list[idx].padding_len = padding_length
-             feature["input_ids"] = torch.cat(
-                 [torch.full((padding_length,), 0), feature["input_ids"]]
-             )
-             feature["token_type_ids"] = torch.cat(
-                 [
-                     torch.zeros(padding_length, dtype=torch.long),
-                     feature["token_type_ids"],
-                 ]
-             )
-             feature["attention_mask"] = torch.cat(
-                 [
-                     torch.zeros(padding_length, dtype=torch.long),
-                     feature["attention_mask"],
-                 ]
-             )
-             return feature
-
-         features = [
-             pad_to_max_length_internal(feature, max_length, i)
-             for i, feature in enumerate(prompts)
-         ]
-         batch = {
-             key: torch.stack([feature[key] for feature in features])
-             for key in features[0].keys()
-         }
-
-         position_ids = self.build_position_ids(batch["token_type_ids"])
-         batch["position_ids"] = position_ids
-
-         for i in range(len(prompts)):
-             req = req_list[i]
-             req.extra_kwargs["max_position_id"] = position_ids[i : i + 1, -1].item()
-
-         if images:
-             batch["images"] = images
-
-         batch = recur_move_to(
-             batch, self._device, lambda x: isinstance(x, torch.Tensor)
-         )
-         dtype = self.get_dtype()
-         if dtype:
-             batch = recur_move_to(
-                 batch,
-                 dtype,
-                 lambda x: isinstance(x, torch.Tensor) and torch.is_floating_point(x),
-             )
-         return batch
-
-     def build_decode_token_type_ids(
-         self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
-     ):
-         token_type_ids = torch.full(
-             (batch_size, 1), fill_value=1, dtype=torch.long, device=self._device
-         )
-         return token_type_ids
-
-     def build_decode_position_ids(
-         self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
-     ):
-         tmp = []
-         for r in reqs:
-             r.extra_kwargs["max_position_id"] += 1
-             tmp.append(r.extra_kwargs["max_position_id"])
-         position_ids = torch.as_tensor(
-             tmp, device=self._device, dtype=torch.long
-         ).unsqueeze(1)
-         return position_ids