xinference 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of xinference has been flagged as potentially problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/METADATA +16 -14
  55. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/intern_vl.py (deleted)
@@ -1,526 +0,0 @@
-# Copyright 2022-2023 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import uuid
-from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Union
-
-import torch
-
-from ....types import ChatCompletion, ChatCompletionChunk
-from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
-from ..utils import (
-    _decode_image,
-    generate_chat_completion,
-    generate_completion_chunk,
-    parse_messages,
-)
-from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
-from .utils import cache_clean
-
-logger = logging.getLogger(__name__)
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
-
-def _message_content_to_intern(content, image_cnt):
-    if not isinstance(content, str):
-        texts = []
-        image_urls = []
-        video_urls = []
-        for c in content:
-            c_type = c.get("type")
-            if c_type == "text":
-                texts.append(c["text"])
-            elif c_type == "image_url":
-                image_urls.append(c["image_url"]["url"])
-            elif c_type == "video_url":
-                video_urls.append(c["video_url"]["url"])
-        if len(video_urls) > 1:
-            raise RuntimeError("Only one video per message is supported")
-        image_futures = []
-        with ThreadPoolExecutor() as executor:
-            for image_url in image_urls:
-                fut = executor.submit(_decode_image, image_url)
-                image_futures.append(fut)
-        images = [fut.result() for fut in image_futures]
-        videos = []
-        for vid_url in video_urls:
-            videos.append(_load_video(vid_url, num_segments=8, max_num=1))
-        prefix = ""
-        for i, _ in enumerate(images):
-            prefix += f"Image-{image_cnt + i + 1}: <image>\n\n"
-
-        if len(videos) > 0:
-            prefix = "".join(
-                [f"Frame{i+1}: <image>\n" for i in range(len(videos[0][1]))]
-            )
-
-        text = prefix + " ".join(texts)
-        return text, images, videos
-    return content, [], []
-
-
-def _get_prompt_and_chat_history(
-    prompt: Union[str, List[Dict]],
-    chat_history: Optional[List[Dict]] = None,
-):
-    # Convert openai history to intern vl history
-    images = []
-    videos = []
-    history = []
-    image_cnt = 0
-    for h1, h2 in zip(*[iter(chat_history or [])] * 2):
-        content1, img, vid = _message_content_to_intern(h1["content"], image_cnt)
-        content2, _, _ = _message_content_to_intern(h2["content"], image_cnt)
-        history.append([content1, content2])
-        images.extend(img)
-        image_cnt += len(img)
-        videos.extend(vid)
-
-    question, img, vid = _message_content_to_intern(prompt, image_cnt)
-    images.extend(img)
-    videos.extend(vid)
-    return question, history, images, videos
-
-
-def _build_transform(input_size=448):
-    import torchvision.transforms as T
-    from torchvision.transforms.functional import InterpolationMode
-
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose(
-        [
-            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
-            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-    return transform
-
-
-def _find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def _dynamic_preprocess(
-    image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
-):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-
-    # calculate the existing image aspect ratio
-    target_ratios = set(
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if i * j <= max_num and i * j >= min_num
-    )
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = _find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    assert len(processed_images) == blocks
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-
-
-def _load_image(image_file, input_size=448, max_num=12):
-    image = image_file.convert("RGB")
-    transform = _build_transform(input_size=input_size)
-    images = _dynamic_preprocess(
-        image, image_size=input_size, use_thumbnail=True, max_num=max_num
-    )
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values)
-    return pixel_values
-
-
-# video multi-round conversation
-def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
-    import numpy as np
-
-    if bound:
-        start, end = bound[0], bound[1]
-    else:
-        start, end = -100000, 100000
-    start_idx = max(first_idx, round(start * fps))
-    end_idx = min(round(end * fps), max_frame)
-    seg_size = float(end_idx - start_idx) / num_segments
-    frame_indices = np.array(
-        [
-            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
-            for idx in range(num_segments)
-        ]
-    )
-    return frame_indices
-
-
-def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
-    from decord import VideoReader, cpu
-    from PIL import Image
-
-    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
-    max_frame = len(vr) - 1
-    fps = float(vr.get_avg_fps())
-
-    pixel_values_list, num_patches_list = [], []
-    transform = _build_transform(input_size=input_size)
-    frame_indices = _get_index(
-        bound, fps, max_frame, first_idx=0, num_segments=num_segments
-    )
-    for frame_index in frame_indices:
-        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
-        img = _dynamic_preprocess(
-            img, image_size=input_size, use_thumbnail=True, max_num=max_num
-        )
-        pixel_values = [transform(tile) for tile in img]
-        pixel_values = torch.stack(pixel_values)
-        pixel_values = pixel_values.to(torch.bfloat16).cuda()
-        num_patches_list.append(pixel_values.shape[0])
-        pixel_values_list.append(pixel_values)
-    pixel_values = torch.cat(pixel_values_list)
-    return pixel_values, num_patches_list
-
-
-@register_transformer
-@register_non_default_model(
-    "internvl-chat", "internvl2", "Internvl2.5", "Internvl2.5-MPO", "InternVL3"
-)
-class InternVLChatModel(PytorchChatModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._tokenizer = None
-        self._model = None
-
-    @classmethod
-    def match_json(
-        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-    ) -> bool:
-        family = model_family.model_family or model_family.model_name
-        if "internvl" not in family.lower():
-            return False
-        return True
-
-    def _get_model_class(self):
-        from transformers import AutoModel
-
-        return AutoModel
-
-    # Copy from InternVL page
-    # reference: https://huggingface.co/OpenGVLab/InternVL2-8B
-    def _split_model(self):
-        import math
-
-        from transformers import AutoConfig
-
-        device_map = {}
-        world_size = torch.cuda.device_count()
-        # single gpu
-        if world_size == 1:
-            return None
-        model_size = f"{self.model_spec.model_size_in_billions}B"
-        model_name = self.model_family.model_name.lower().replace("-mpo", "")
-        model_name = f"{model_name}-{model_size}"
-        if "internvl3" in model_name.lower():
-            config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
-            num_layers = config.llm_config.num_hidden_layers
-        else:
-            num_layers = {
-                "internvl2-1B": 24,
-                "internvl2-2B": 24,
-                "internvl2-4B": 32,
-                "internvl2-8B": 32,
-                "internvl2-26B": 48,
-                "internvl2-40B": 60,
-                "internvl2-76B": 80,
-                "internvl2.5-1B": 24,
-                "internvl2.5-2B": 24,
-                "internvl2.5-4B": 36,
-                "internvl2.5-8B": 32,
-                "internvl2.5-26B": 48,
-                "internvl2.5-38B": 64,
-                "internvl2.5-78B": 80,
-            }[model_name]
-
-        # Since the first GPU will be used for ViT, treat it as half a GPU.
-        num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-        num_layers_per_gpu = [num_layers_per_gpu] * world_size
-        num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-        layer_cnt = 0
-        for i, num_layer in enumerate(num_layers_per_gpu):
-            for j in range(num_layer):
-                device_map[f"language_model.model.layers.{layer_cnt}"] = i
-                layer_cnt += 1
-        device_map["vision_model"] = 0
-        device_map["mlp1"] = 0
-        device_map["language_model.model.tok_embeddings"] = 0
-        device_map["language_model.model.embed_tokens"] = 0
-        device_map["language_model.output"] = 0
-        device_map["language_model.model.norm"] = 0
-        device_map["language_model.lm_head"] = 0
-        device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
-        return device_map
-
-    def load(self):
-        from transformers import AutoModel, AutoTokenizer
-
-        if self._check_tensorizer_integrity():
-            self._model, self._tokenizer = self._load_tensorizer()
-            return
-
-        device = self._split_model()
-
-        kwargs = {
-            "torch_dtype": torch.bfloat16,
-            "low_cpu_mem_usage": True,
-            "trust_remote_code": True,
-        }
-
-        if device is not None:
-            kwargs["device_map"] = device
-
-        kwargs = self.apply_bnb_quantization(kwargs)
-
-        self._model = AutoModel.from_pretrained(self.model_path, **kwargs).eval()
-
-        if device is None and "none" in self.quantization.lower():
-            self._model.cuda()
-
-        self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path, trust_remote_code=True, use_fast=False
-        )
-
-    @cache_clean
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        from ....thirdparty.internvl.conversation import get_conv_template
-
-        IMG_START_TOKEN = "<img>"
-        IMG_END_TOKEN = "</img>"
-        IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
-
-        generate_config = generate_config if isinstance(generate_config, dict) else {}
-
-        generation_config = {
-            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
-            "do_sample": False,
-            "temperature": generate_config.get("temperature", None),
-        }
-
-        stream = (
-            generate_config.get("stream", False)
-            if isinstance(generate_config, dict)
-            else False
-        )
-        stream_options = (
-            generate_config.get("stream_options", None)
-            if isinstance(generate_config, dict)
-            else False
-        )
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        prompt, _, chat_history = parse_messages(messages)
-        content, history, images, videos = _get_prompt_and_chat_history(
-            prompt, chat_history
-        )
-
-        num_patches_list = []
-        if len(images) == 1:
-            content = content.replace("Image-1: <image>\n\n", "<image>\n")
-            history = [
-                [item[0].replace("Image-1: <image>\n\n", "<image>\n"), item[1]]
-                for item in history
-            ]
-            pixel_values = _load_image(images[-1], max_num=12).to(torch.bfloat16).cuda()
-            num_patches_list = (
-                [pixel_values.shape[0]] if pixel_values is not None else []
-            )
-        elif len(images) > 1:
-            pixel_values = [
-                _load_image(img, max_num=12).to(torch.bfloat16).cuda() for img in images
-            ]
-            num_patches_list = [values.size(0) for values in pixel_values]
-            pixel_values = torch.cat(pixel_values, dim=0)
-        else:
-            pixel_values = None
-
-        if len(videos) > 0:
-            pixel_values = videos[0][0]
-            num_patches_list = videos[0][1]
-
-        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
-
-        img_context_token_id = self._tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
-        self._model.img_context_token_id = img_context_token_id
-
-        template = get_conv_template(self._model.template)
-        template.system_message = self._model.system_message
-        eos_token_id = self._tokenizer.convert_tokens_to_ids(template.sep)
-
-        history = [] if history is None else history
-        for old_question, old_answer in history:
-            template.append_message(template.roles[0], old_question)
-            template.append_message(template.roles[1], old_answer)
-        template.append_message(template.roles[0], content)
-        template.append_message(template.roles[1], None)
-        query = template.get_prompt()
-
-        for num_patches in num_patches_list:
-            image_tokens = (
-                IMG_START_TOKEN
-                + IMG_CONTEXT_TOKEN * self._model.num_image_token * num_patches
-                + IMG_END_TOKEN
-            )
-            query = query.replace("<image>", image_tokens, 1)
-
-        model_inputs = self._tokenizer(query, return_tensors="pt")
-        input_ids = model_inputs["input_ids"].cuda()
-        attention_mask = model_inputs["attention_mask"].cuda()
-        generation_config["eos_token_id"] = eos_token_id
-        generate_kwargs = {
-            "pixel_values": pixel_values,
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-        generate_kwargs.update(generation_config)
-
-        if stream:
-            chunk = self._generate_stream(generate_kwargs, input_ids, include_usage)
-            return self._to_chat_completion_chunks(chunk)
-        else:
-            return self._generate(generate_kwargs, input_ids, template)
-
-    def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion:
-        prompt_tokens = len(input_ids[0])
-        generation_output = self._model.generate(**generate_kwargs)
-        completion_tokens = len(generation_output[0])
-        response = self._tokenizer.batch_decode(
-            generation_output, skip_special_tokens=True
-        )[0]
-        response = response.split(template.sep)[0].strip()
-        return generate_chat_completion(
-            self.model_uid,
-            response,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-
-    def _generate_stream(self, generate_kwargs, input_ids, include_usage):
-        from threading import Thread
-
-        from transformers import TextIteratorStreamer
-
-        # Initialize the streamer
-        streamer = TextIteratorStreamer(
-            self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
-        )
-
-        # Define the generation configuration
-        generate_kwargs["streamer"] = streamer
-        # Start the model chat in a separate thread
-        thread = Thread(
-            target=self._model.generate,
-            kwargs=generate_kwargs,
-        )
-        thread.start()
-
-        completion_id = str(uuid.uuid1())
-        prompt_tokens = len(input_ids[0])
-        total_tokens, completion_tokens = 0, 0
-        # Loop through the streamer to get the new text as it is generated
-        for i, new_text in enumerate(streamer):
-            if new_text == self._model.conv_template.sep:
-                break
-            completion_tokens = max(completion_tokens, len(streamer.token_cache))
-            total_tokens = prompt_tokens + completion_tokens
-            yield generate_completion_chunk(
-                chunk_text=new_text,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-            )
-        yield generate_completion_chunk(
-            chunk_text=None,
-            finish_reason="stop",
-            chunk_id=completion_id,
-            model_uid=self.model_uid,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-            has_choice=True,
-            has_content=False,
-        )
-
-        if include_usage:
-            yield generate_completion_chunk(
-                chunk_text=None,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                has_choice=False,
-                has_content=False,
-            )
xinference/model/llm/transformers/internlm2.py (deleted)
@@ -1,94 +0,0 @@
-# Copyright 2022-2023 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-from ....core.scheduler import InferenceRequest
-from ....types import LoRA
-from ..llm_family import LLMFamilyV1, LLMSpecV1
-from .core import PytorchChatModel, PytorchModelConfig
-
-
-class Internlm2PytorchChatModel(PytorchChatModel):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model: Optional[List[LoRA]] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            pytorch_model_config=pytorch_model_config,
-            peft_model=peft_model,
-        )
-
-    def _get_model_class(self):
-        from transformers import AutoModel
-
-        return AutoModel
-
-    def _load_model(self, **kwargs):
-        try:
-            from transformers import AutoModel, AutoTokenizer
-        except ImportError:
-            error_message = "Failed to import module 'transformers'"
-            installation_guide = [
-                "Please make sure 'transformers' is installed. ",
-                "You can install it by `pip install transformers`\n",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=kwargs["trust_remote_code"],
-            encode_special_tokens=True,
-            revision=kwargs["revision"],
-        )
-        model = AutoModel.from_pretrained(
-            self.model_path,
-            **kwargs,
-        )
-        return model, tokenizer
-
-    @classmethod
-    def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
-    ) -> bool:
-        model_family = llm_family.model_family or llm_family.model_name
-        if model_family in ["internlm2-chat", "internlm2.5-chat"]:
-            return True
-        return False
-
-    def prepare_sanitize_generate_config(self, req: InferenceRequest):
-        """
-        Overwrite this func for this special model.
-        Cannot use the default configuration, which works poorly on this model.
-        """
-        raw_config = req.inference_kwargs.get("raw_params", {})
-        temperature = raw_config.get("temperature", None)
-        if temperature is None:
-            raw_config["temperature"] = 0.8
-        top_p = raw_config.get("top_p", None)
-        if top_p is None:
-            raw_config["top_p"] = 0.8
-        return raw_config