xinference 1.6.0.post1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/METADATA +6 -4
  55. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
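The per-file counts above come from comparing the archive contents of the two wheels. Below is a minimal sketch of how such a file-level comparison can be reproduced locally; the wheel filenames are assumptions (adjust them to wherever the wheels were downloaded).

# Sketch: compare the file lists of two locally downloaded wheels.
# The wheel filenames are assumptions; adjust to your download paths.
import zipfile

OLD_WHEEL = "xinference-1.6.0.post1-py3-none-any.whl"
NEW_WHEEL = "xinference-1.6.1-py3-none-any.whl"


def file_set(path: str) -> set:
    # A wheel is a zip archive; namelist() returns every packaged file.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())


old_files = file_set(OLD_WHEEL)
new_files = file_set(NEW_WHEEL)

print("added:", len(new_files - old_files))
print("removed:", len(old_files - new_files))
for name in sorted(new_files - old_files):
    print("+", name)
for name in sorted(old_files - new_files):
    print("-", name)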
xinference/model/llm/transformers/multimodal/intern_vl.py (new file)
@@ -0,0 +1,412 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ import math
+ from concurrent.futures import ThreadPoolExecutor
+ from threading import Thread
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+ import torch
+
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ...utils import _decode_image, parse_messages
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_transformer
+ @register_non_default_model("InternVL3")
+ class InternVLChatModel(PytorchMultiModalModel):
+     IMAGENET_MEAN = (0.485, 0.456, 0.406)
+     IMAGENET_STD = (0.229, 0.224, 0.225)
+
+     @classmethod
+     def match_json(
+         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         family = model_family.model_family or model_family.model_name
+         if "internvl3" in family.lower():
+             return True
+         return False
+
+     def decide_device(self):
+         from transformers import AutoConfig
+
+         device_map = {}
+         world_size = torch.cuda.device_count()
+         # single gpu
+         if world_size == 1:
+             self._device = device_map
+             return
+         config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+         num_layers = config.llm_config.num_hidden_layers
+
+         # Since the first GPU will be used for ViT, treat it as half a GPU.
+         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+         num_layers_per_gpu = [num_layers_per_gpu] * world_size
+         num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+         layer_cnt = 0
+         for i, num_layer in enumerate(num_layers_per_gpu):
+             for j in range(num_layer):
+                 device_map[f"language_model.model.layers.{layer_cnt}"] = i
+                 layer_cnt += 1
+         device_map["vision_model"] = 0
+         device_map["mlp1"] = 0
+         device_map["language_model.model.tok_embeddings"] = 0
+         device_map["language_model.model.embed_tokens"] = 0
+         device_map["language_model.output"] = 0
+         device_map["language_model.model.norm"] = 0
+         device_map["language_model.lm_head"] = 0
+         device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
+         self._device = device_map
+
+     def load_processor(self):
+         from transformers import AutoTokenizer
+
+         self._tokenizer = AutoTokenizer.from_pretrained(
+             self.model_path, trust_remote_code=True, use_fast=False
+         )
+
+     def load_multimodal_model(self):
+         from transformers import AutoModel
+
+         kwargs: Dict[str, Any] = {
+             "torch_dtype": torch.bfloat16,
+             "low_cpu_mem_usage": True,
+             "trust_remote_code": True,
+         }
+         if self._device:
+             kwargs["device_map"] = self._device
+         kwargs = self.apply_bnb_quantization(kwargs)
+
+         self._model = AutoModel.from_pretrained(self.model_path, **kwargs).eval()
+
+         if not self._device and "none" in self.quantization.lower():
+             self._model.cuda()
+
+     def _build_transform(self, input_size=448):
+         import torchvision.transforms as T
+         from torchvision.transforms.functional import InterpolationMode
+
+         MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+         transform = T.Compose(
+             [
+                 T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                 T.Resize(
+                     (input_size, input_size), interpolation=InterpolationMode.BICUBIC
+                 ),
+                 T.ToTensor(),
+                 T.Normalize(mean=MEAN, std=STD),
+             ]
+         )
+         return transform
+
+     # video multi-round conversation
+     @staticmethod
+     def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+         import numpy as np
+
+         if bound:
+             start, end = bound[0], bound[1]
+         else:
+             start, end = -100000, 100000
+         start_idx = max(first_idx, round(start * fps))
+         end_idx = min(round(end * fps), max_frame)
+         seg_size = float(end_idx - start_idx) / num_segments
+         frame_indices = np.array(
+             [
+                 int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+                 for idx in range(num_segments)
+             ]
+         )
+         return frame_indices
+
+     def _find_closest_aspect_ratio(
+         self, aspect_ratio, target_ratios, width, height, image_size
+     ):
+         best_ratio_diff = float("inf")
+         best_ratio = (1, 1)
+         area = width * height
+         for ratio in target_ratios:
+             target_aspect_ratio = ratio[0] / ratio[1]
+             ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+             if ratio_diff < best_ratio_diff:
+                 best_ratio_diff = ratio_diff
+                 best_ratio = ratio
+             elif ratio_diff == best_ratio_diff:
+                 if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                     best_ratio = ratio
+         return best_ratio
+
+     def _dynamic_preprocess(
+         self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+     ):
+         orig_width, orig_height = image.size
+         aspect_ratio = orig_width / orig_height
+
+         # calculate the existing image aspect ratio
+         target_ratios = set(
+             (i, j)
+             for n in range(min_num, max_num + 1)
+             for i in range(1, n + 1)
+             for j in range(1, n + 1)
+             if i * j <= max_num and i * j >= min_num
+         )
+         target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+         # find the closest aspect ratio to the target
+         target_aspect_ratio = self._find_closest_aspect_ratio(
+             aspect_ratio, target_ratios, orig_width, orig_height, image_size
+         )
+
+         # calculate the target width and height
+         target_width = image_size * target_aspect_ratio[0]
+         target_height = image_size * target_aspect_ratio[1]
+         blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+         # resize the image
+         resized_img = image.resize((target_width, target_height))
+         processed_images = []
+         for i in range(blocks):
+             box = (
+                 (i % (target_width // image_size)) * image_size,
+                 (i // (target_width // image_size)) * image_size,
+                 ((i % (target_width // image_size)) + 1) * image_size,
+                 ((i // (target_width // image_size)) + 1) * image_size,
+             )
+             # split the image
+             split_img = resized_img.crop(box)
+             processed_images.append(split_img)
+         assert len(processed_images) == blocks
+         if use_thumbnail and len(processed_images) != 1:
+             thumbnail_img = image.resize((image_size, image_size))
+             processed_images.append(thumbnail_img)
+         return processed_images
+
+     def _load_video(
+         self, video_path, bound=None, input_size=448, max_num=1, num_segments=32
+     ):
+         from decord import VideoReader, cpu
+         from PIL import Image
+
+         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+         max_frame = len(vr) - 1
+         fps = float(vr.get_avg_fps())
+
+         pixel_values_list, num_patches_list = [], []
+         transform = self._build_transform(input_size=input_size)
+         frame_indices = self._get_index(
+             bound, fps, max_frame, first_idx=0, num_segments=num_segments
+         )
+         for frame_index in frame_indices:
+             img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+             img = self._dynamic_preprocess(
+                 img, image_size=input_size, use_thumbnail=True, max_num=max_num
+             )
+             pixel_values = [transform(tile) for tile in img]
+             pixel_values = torch.stack(pixel_values)
+             pixel_values = pixel_values.to(torch.bfloat16).cuda()
+             num_patches_list.append(pixel_values.shape[0])
+             pixel_values_list.append(pixel_values)
+         pixel_values = torch.cat(pixel_values_list)
+         return pixel_values, num_patches_list
+
+     def _message_content_to_intern(self, content, image_cnt):
+         if not isinstance(content, str):
+             texts = []
+             image_urls = []
+             video_urls = []
+             for c in content:
+                 c_type = c.get("type")
+                 if c_type == "text":
+                     texts.append(c["text"])
+                 elif c_type == "image_url":
+                     image_urls.append(c["image_url"]["url"])
+                 elif c_type == "video_url":
+                     video_urls.append(c["video_url"]["url"])
+             if len(video_urls) > 1:
+                 raise RuntimeError("Only one video per message is supported")
+             image_futures = []
+             with ThreadPoolExecutor() as executor:
+                 for image_url in image_urls:
+                     fut = executor.submit(_decode_image, image_url)
+                     image_futures.append(fut)
+             images = [fut.result() for fut in image_futures]
+             videos = []
+             for vid_url in video_urls:
+                 videos.append(self._load_video(vid_url, num_segments=8, max_num=1))
+             prefix = ""
+             for i, _ in enumerate(images):
+                 prefix += f"Image-{image_cnt + i + 1}: <image>\n\n"
+
+             if len(videos) > 0:
+                 prefix = "".join(
+                     [f"Frame{i + 1}: <image>\n" for i in range(len(videos[0][1]))]
+                 )
+
+             text = prefix + " ".join(texts)
+             return text, images, videos
+         return content, [], []
+
+     def _get_prompt_and_chat_history(
+         self,
+         prompt: Union[str, List[Dict]],
+         chat_history: Optional[List[Dict]] = None,
+     ):
+         # Convert openai history to intern vl history
+         images = []
+         videos = []
+         history = []
+         image_cnt = 0
+         for h1, h2 in zip(*[iter(chat_history or [])] * 2):
+             content1, img, vid = self._message_content_to_intern(
+                 h1["content"], image_cnt
+             )
+             content2, _, _ = self._message_content_to_intern(h2["content"], image_cnt)
+             history.append([content1, content2])
+             images.extend(img)
+             image_cnt += len(img)
+             videos.extend(vid)
+
+         question, img, vid = self._message_content_to_intern(prompt, image_cnt)
+         images.extend(img)
+         videos.extend(vid)
+         return question, history, images, videos
+
+     def _load_image(self, image_file, input_size=448, max_num=12):
+         image = image_file.convert("RGB")
+         transform = self._build_transform(input_size=input_size)
+         images = self._dynamic_preprocess(
+             image, image_size=input_size, use_thumbnail=True, max_num=max_num
+         )
+         pixel_values = [transform(image) for image in images]
+         pixel_values = torch.stack(pixel_values)
+         return pixel_values
+
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         from .....thirdparty.internvl.conversation import get_conv_template
+
+         prompt, _, chat_history = parse_messages(messages)
+         content, history, images, videos = self._get_prompt_and_chat_history(
+             prompt, chat_history
+         )
+         num_patches_list = []
+         if len(images) == 1:
+             content = content.replace("Image-1: <image>\n\n", "<image>\n")
+             history = [
+                 [item[0].replace("Image-1: <image>\n\n", "<image>\n"), item[1]]
+                 for item in history
+             ]
+             pixel_values = (
+                 self._load_image(images[-1], max_num=12).to(torch.bfloat16).cuda()
+             )
+             num_patches_list = (
+                 [pixel_values.shape[0]] if pixel_values is not None else []
+             )
+         elif len(images) > 1:
+             pixel_values = [
+                 self._load_image(img, max_num=12).to(torch.bfloat16).cuda()
+                 for img in images
+             ]
+             num_patches_list = [values.size(0) for values in pixel_values]
+             pixel_values = torch.cat(pixel_values, dim=0)
+         else:
+             pixel_values = None
+
+         if len(videos) > 0:
+             pixel_values = videos[0][0]
+             num_patches_list = videos[0][1]
+
+         assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+         IMG_START_TOKEN = "<img>"
+         IMG_END_TOKEN = "</img>"
+         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+
+         img_context_token_id = self._tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+         self._model.img_context_token_id = img_context_token_id
+
+         template = get_conv_template(self._model.template)
+         template.system_message = self._model.system_message
+         eos_token_id = self._tokenizer.convert_tokens_to_ids(template.sep)
+
+         history = [] if history is None else history
+         for old_question, old_answer in history:
+             template.append_message(template.roles[0], old_question)
+             template.append_message(template.roles[1], old_answer)
+         template.append_message(template.roles[0], content)
+         template.append_message(template.roles[1], None)
+         query = template.get_prompt()
+
+         for num_patches in num_patches_list:
+             image_tokens = (
+                 IMG_START_TOKEN
+                 + IMG_CONTEXT_TOKEN * self._model.num_image_token * num_patches
+                 + IMG_END_TOKEN
+             )
+             query = query.replace("<image>", image_tokens, 1)
+
+         model_inputs = self._tokenizer(query, return_tensors="pt")
+         input_ids = model_inputs["input_ids"].cuda()
+         attention_mask = model_inputs["attention_mask"].cuda()
+
+         return {
+             "pixel_values": pixel_values,
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "eos_token_id": eos_token_id,
+         }
+
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         return {
+             "max_new_tokens": generate_config.get("max_tokens", 1024),
+             "do_sample": False,
+             "temperature": generate_config.get("temperature", None),
+         }
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         from transformers import TextIteratorStreamer
+
+         # Initialize the streamer
+         streamer = TextIteratorStreamer(
+             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
+         )
+
+         configs = self.build_generate_kwargs(generate_config)
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         generate_kwargs = {**inputs, **configs, "streamer": streamer}
+         thread = Thread(
+             target=self._model.generate,
+             kwargs=generate_kwargs,
+         )
+         thread.start()
+         return streamer, len(inputs["input_ids"][0])
+
+     def check_conditions(self, new_text: str) -> Tuple[str, bool]:
+         if new_text == self._model.conv_template.sep:
+             return "", True
+         return new_text, False
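The file above registers a new InternVL3 backend for the Transformers engine. As a rough orientation, the sketch below shows how such a model is typically driven through xinference's RESTful client once a local server is running; the endpoint URL, model size, and image URL are placeholder assumptions, and the exact launch parameters should be checked against the xinference 1.6.1 documentation.

# Hypothetical usage sketch (not part of the diff): chat with an image via the
# RESTful client. Endpoint, model size and image URL are assumptions.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="InternVL3",       # matches register_non_default_model("InternVL3") above
    model_engine="transformers",  # the engine implemented by the new file
    model_size_in_billions=8,     # placeholder; pick a size the family actually provides
)
model = client.get_model(model_uid)

response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])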
xinference/model/llm/transformers/multimodal/minicpmv26.py (renamed from xinference/model/llm/transformers/minicpmv26.py)
@@ -1,4 +1,4 @@
- # Copyright 2022-2023 XProbe Inc.
+ # Copyright 2022-2025 XProbe Inc.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -12,42 +12,28 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import logging
- import uuid
  from concurrent.futures import ThreadPoolExecutor
- from typing import Dict, Iterator, List, Optional, Tuple, Union
+ from typing import Any, Dict, Iterator, List, Optional, Tuple

  import torch
  from PIL import Image

- from ....core.scheduler import InferenceRequest
- from ....types import (
-     ChatCompletion,
-     ChatCompletionChunk,
-     CompletionChunk,
-     PytorchModelConfig,
- )
- from ...utils import select_device
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import (
-     _decode_image,
-     generate_chat_completion,
-     generate_completion_chunk,
-     parse_messages,
- )
- from .core import PytorchChatModel, PytorchGenerateConfig
- from .utils import cache_clean
+ from .....core.model import register_batching_multimodal_models
+ from .....core.scheduler import InferenceRequest
+ from .....model.utils import select_device
+ from .....types import PytorchModelConfig
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ...utils import _decode_image, parse_messages
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel

  logger = logging.getLogger(__name__)


- class MiniCPMV26Model(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._device = None
-         self._tokenizer = None
-         self._model = None
-         self._processor = None
-
+ @register_batching_multimodal_models("MiniCPM-V-2.6")
+ @register_transformer
+ @register_non_default_model("MiniCPM-V-2.6")
+ class Glm4VModel(PytorchMultiModalModel):
      @classmethod
      def match_json(
          cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
@@ -66,15 +52,7 @@ class MiniCPMV26Model(PytorchChatModel):
          pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
          return pytorch_model_config

-     def _get_model_class(self):
-         from transformers import AutoModel
-
-         return AutoModel
-
-     def load(self):
-         from transformers import AutoModel, AutoProcessor, AutoTokenizer
-         from transformers.generation import GenerationConfig
-
+     def decide_device(self):
          device = self._pytorch_model_config.get("device", "auto")
          self._device = select_device(device)
          self._device = (
@@ -83,15 +61,25 @@ class MiniCPMV26Model(PytorchChatModel):
              else self._device
          )

-         if "int4" in self.model_path and device == "mps":
-             logger.error(
-                 "Error: running int4 model with bitsandbytes on Mac is not supported right now."
-             )
-             exit()
+     def load_processor(self):
+         from transformers import AutoProcessor, AutoTokenizer

-         if self._check_tensorizer_integrity():
-             self._model, self._tokenizer = self._load_tensorizer()
-             return
+         min_pixels = self._pytorch_model_config.get("min_pixels")
+         max_pixels = self._pytorch_model_config.get("max_pixels")
+         self._processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             trust_remote_code=True,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+
+         self._tokenizer = AutoTokenizer.from_pretrained(
+             self.model_path, trust_remote_code=True
+         )
+
+     def load_multimodal_model(self):
+         from transformers import AutoModel
+         from transformers.generation import GenerationConfig

          if "int4" in self.model_path:
              model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
@@ -104,27 +92,13 @@ class MiniCPMV26Model(PytorchChatModel):
                  device_map=self._device,
                  **kwargs,
              )
-         tokenizer = AutoTokenizer.from_pretrained(
-             self.model_path, trust_remote_code=True
-         )
          self._model = model.eval()
-         self._tokenizer = tokenizer
-
          # Specify hyperparameters for generation
          self._model.generation_config = GenerationConfig.from_pretrained(
              self.model_path,
              trust_remote_code=True,
          )
-         min_pixels = self._pytorch_model_config.get("min_pixels")
-         max_pixels = self._pytorch_model_config.get("max_pixels")
-         self._processor = AutoProcessor.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-             min_pixels=min_pixels,
-             max_pixels=max_pixels,
-         )
          self._device = self._model.device
-         self._save_tensorizer()

      def _message_content_to_chat(self, content):
          MAX_NUM_FRAMES = 64
@@ -220,58 +194,37 @@ class MiniCPMV26Model(PytorchChatModel):
          msgs.append({"role": "user", "content": images_chat + [content]})
          return msgs, video_existed

-     @cache_clean
-     def chat(
+     def build_inputs_from_messages(
          self,
          messages: List[Dict],
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         stream = generate_config.get("stream", False) if generate_config else False
+         generate_config: Dict,
+     ):
          msgs, video_existed = self._convert_to_specific_style(messages)
-
          # Set decode params for video
          params = {}
          if video_existed:
              params = {"use_image_id": False, "max_slice_nums": 1}
+         return dict(msgs=msgs, image=None, **params)

-         chat = self._model.chat(
-             image=None,
-             msgs=msgs,
-             tokenizer=self._tokenizer,
-             sampling=True,
-             **generate_config,
-             **params,
-         )
-         if stream:
-             it = self.chat_stream(chat)
-             return self._to_chat_completion_chunks(it)
-         else:
-             return generate_chat_completion(self.model_uid, chat)
-
-     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
-         completion_id = str(uuid.uuid1())
-         for new_text in chat:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-             )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         return dict(**generate_config)
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         config = self.build_generate_kwargs(generate_config)
+         chat_iter = self._model.chat(
+             **inputs, **config, tokenizer=self._tokenizer, sampling=True
          )

+         return chat_iter, -1
+
      def prepare_sanitize_generate_config(self, req: InferenceRequest):
          """
          Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
@@ -376,7 +329,7 @@ class MiniCPMV26Model(PytorchChatModel):
          because the specific inference process is performed by `self._model.llm`,
          not `self._model` itself
          """
-         from .utils import batch_inference_one_step
+         from ..utils import batch_inference_one_step

          self.prepare_batch_inference(req_list)
          batch_inference_one_step(
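Taken together, the two diffs show the shape of the new multimodal backend API in this release: instead of overriding load() and chat() wholesale, each model fills in a small set of hooks on PytorchMultiModalModel. The skeleton below is an illustrative sketch inferred from the code above; the authoritative contract is the base class in xinference/model/llm/transformers/multimodal/core.py, which is not part of this excerpt.

# Illustrative skeleton only, inferred from the intern_vl.py and minicpmv26.py
# diffs above; the hook names match the code shown, everything else is placeholder.
from typing import Any, Dict, Iterator, List, Tuple


class MyMultiModalModel:  # in the real code: a PytorchMultiModalModel subclass
    @classmethod
    def match_json(cls, model_family, model_spec, quantization: str) -> bool:
        # Decide whether this backend handles the requested model family.
        family = model_family.model_family or model_family.model_name
        return "my-model" in family.lower()

    def decide_device(self):
        # Choose a device or build a device_map and store it on self._device.
        ...

    def load_processor(self):
        # Load the tokenizer / processor into self._tokenizer / self._processor.
        ...

    def load_multimodal_model(self):
        # Load the actual weights into self._model.
        ...

    def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict):
        # Convert OpenAI-style messages (text / image_url / video_url parts)
        # into whatever the underlying model's generate()/chat() expects.
        ...

    def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
        # Map generate_config (max_tokens, temperature, ...) to model kwargs.
        ...

    def build_streaming_iter(self, messages: List[Dict], generate_config: Dict) -> Tuple[Iterator, int]:
        # Return a token iterator plus the prompt length (-1 if unknown).
        ...

    def check_conditions(self, new_text: str) -> Tuple[str, bool]:
        # Optionally post-process streamed text and signal a stop condition.
        return new_text, False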