xinference 1.4.0 → 1.4.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +4 -0
  4. xinference/core/model.py +23 -3
  5. xinference/core/supervisor.py +6 -0
  6. xinference/core/worker.py +54 -11
  7. xinference/model/llm/__init__.py +4 -2
  8. xinference/model/llm/core.py +1 -0
  9. xinference/model/llm/llama_cpp/core.py +6 -1
  10. xinference/model/llm/llm_family.json +117 -1
  11. xinference/model/llm/llm_family_modelscope.json +125 -1
  12. xinference/model/llm/reasoning_parser.py +3 -3
  13. xinference/model/llm/sglang/core.py +111 -13
  14. xinference/model/llm/transformers/core.py +1 -0
  15. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  16. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  17. xinference/model/llm/utils.py +26 -14
  18. xinference/model/llm/vllm/core.py +149 -8
  19. xinference/model/llm/vllm/distributed_executor.py +314 -0
  20. xinference/model/rerank/core.py +16 -11
  21. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  22. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  23. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  24. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  25. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  26. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  27. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  28. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  29. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  30. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  31. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  32. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  33. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  34. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  35. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  36. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  37. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  38. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  39. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  40. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  41. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  42. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  43. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/{main.3cea968e.js → main.5ca4eea1.js} +3 -3
  47. xinference/web/ui/build/static/js/main.5ca4eea1.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +1 -0
  50. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/METADATA +4 -4
  51. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/RECORD +56 -31
  52. xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
  55. /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.5ca4eea1.js.LICENSE.txt} +0 -0
  56. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/LICENSE +0 -0
  57. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/WHEEL +0 -0
  58. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/entry_points.txt +0 -0
  59. {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/sglang/core.py
@@ -25,6 +25,7 @@ from xoscar.utils import get_next_port
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -95,7 +96,6 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "gemma-it",
     "gemma-2-it",
     "gemma-3-1b-it",
-    "gemma-3-it",
     "deepseek-v2.5",
     "deepseek-v2-chat",
     "deepseek-v2-chat-0628",
@@ -108,6 +108,12 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v3",
     "deepseek-r1",
 ]
+SGLANG_SUPPORTED_VISION_MODEL_LIST = [
+    "qwen2.5-vl-instruct",
+    "gemma-3-it",
+    "MiniCPM-V",
+    "llama-3.2-vision-instruct",
+]


 class SGLANGModel(LLM):
@@ -303,10 +309,6 @@ class SGLANGModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format in ["gptq", "awq"]:
-            # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
-            if "4" not in quantization:
-                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
                 return False
@@ -371,12 +373,18 @@ class SGLANGModel(LLM):
         sampling_params.pop("lora_name", None)
         return sampling_params

-    async def _stream_generate(self, prompt: str, **sampling_params):
+    async def _stream_generate(
+        self,
+        prompt: str,
+        image_data: Optional[Union[List[str], str]] = None,
+        **sampling_params,
+    ):
         import aiohttp

         sampling_params = self._filter_sampling_params(sampling_params)
         json_data = {
             "text": prompt,
+            "image_data": image_data,
             "sampling_params": sampling_params,
             "stream": True,
         }
@@ -404,12 +412,18 @@ class SGLANGModel(LLM):
             if need_stop:
                 break

-    async def _non_stream_generate(self, prompt: str, **sampling_params) -> dict:
+    async def _non_stream_generate(
+        self,
+        prompt: str,
+        image_data: Optional[Union[List[str], str]] = None,
+        **sampling_params,
+    ) -> dict:
         import aiohttp

         sampling_params = self._filter_sampling_params(sampling_params)
         json_data = {
             "text": prompt,
+            "image_data": image_data,
             "sampling_params": sampling_params,
         }
         async with aiohttp.ClientSession(trust_env=True) as session:
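Editor's note: for context, a minimal sketch of the request shape this patched method now sends to a running SGLang server's native /generate endpoint. The URL, prompt, and image payload below are illustrative assumptions, not values taken from this diff.

import requests

# Hypothetical payload mirroring the JSON built by _non_stream_generate:
# "image_data" (a base64 string, URL, or list of them) rides alongside the
# prompt text and the filtered sampling parameters.
payload = {
    "text": "USER: <image>\nDescribe this picture. ASSISTANT:",
    "image_data": ["<base64-encoded JPEG>"],  # placeholder, not real data
    "sampling_params": {"max_new_tokens": 128, "temperature": 0.7},
}
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json()["text"])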
@@ -421,6 +435,7 @@ class SGLANGModel(LLM):
     async def async_generate(
         self,
         prompt: str,
+        image_data: Optional[Union[List[str], str]] = None,
         generate_config: Optional[SGLANGGenerateConfig] = None,
         request_id: Optional[str] = None,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
@@ -439,7 +454,9 @@ class SGLANGModel(LLM):
         if not request_id:
             request_id = str(uuid.uuid1())
         if not stream:
-            state = await self._non_stream_generate(prompt, **sanitized_generate_config)
+            state = await self._non_stream_generate(
+                prompt, image_data, **sanitized_generate_config
+            )
             return self._convert_state_to_completion(
                 request_id,
                 model=self.model_uid,
@@ -452,7 +469,7 @@ class SGLANGModel(LLM):
             prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
             finish_reason = None
             async for meta_info, out in self._stream_generate(
-                prompt, **sanitized_generate_config
+                prompt, image_data, **sanitized_generate_config
             ):
                 chunk = self._convert_state_to_completion_chunk(
                     request_id, self.model_uid, output_text=out
@@ -515,10 +532,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format in ["gptq", "awq"]:
-            # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
-            if "4" not in quantization:
-                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
                 return False
@@ -559,3 +572,88 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
             c = await self.async_generate(full_prompt, generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
             return self._to_chat_completion(c, self.reasoning_parser)
+
+
+class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if not cls._has_cuda_device():
+            return False
+        if not cls._is_linux():
+            return False
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+            return False
+        if llm_spec.model_format == "pytorch":
+            if quantization != "none" and not (quantization is None):
+                return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        else:
+            if llm_family.model_name not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        if "vision" not in llm_family.model_ability:
+            return False
+        return SGLANG_INSTALLED
+
+    def _sanitize_chat_config(
+        self,
+        generate_config: Optional[Dict] = None,
+    ) -> Dict:
+        if not generate_config:
+            generate_config = {}
+        if self.model_family.stop:
+            if (not generate_config.get("stop")) and self.model_family.stop:
+                generate_config["stop"] = self.model_family.stop.copy()
+        return generate_config
+
+    async def async_chat(
+        self,
+        messages: List[ChatCompletionMessage],  # type: ignore
+        generate_config: Optional[Dict] = None,
+        request_id: Optional[str] = None,
+    ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        import base64
+        from io import BytesIO
+
+        from PIL import Image
+        from qwen_vl_utils import process_vision_info
+
+        messages = self._transform_messages(messages)
+
+        chat_template: str = (
+            self.model_family.chat_template if self.model_family.chat_template else ""
+        )
+
+        prompt = self.get_full_context(messages, chat_template)
+        images, video_inputs = process_vision_info(messages)
+        if video_inputs:
+            raise ValueError("Not support video input now.")
+
+        base64_images: Optional[List[str]] = None
+        if images:
+            base64_images = []
+            for image in images:
+                if isinstance(image, Image.Image):
+                    buffered = BytesIO()
+                    image.save(buffered, format="JPEG", quality=100)
+                    base64_images.append(base64.b64encode(buffered.getvalue()).decode())
+                elif isinstance(image, str):
+                    base64_images.append(image)
+                else:
+                    raise ValueError(
+                        f"Unsupported image type: {type(image)}, only support PIL.Image and base64 string"
+                    )
+
+        generate_config = self._sanitize_chat_config(generate_config)
+        stream = generate_config.get("stream", None)
+        if stream:
+            agen = await self.async_generate(prompt, base64_images, generate_config)  # type: ignore
+            assert isinstance(agen, AsyncGenerator)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
+        else:
+            c = await self.async_generate(prompt, base64_images, generate_config)  # type: ignore
+            assert not isinstance(c, AsyncGenerator)
+            return self._to_chat_completion(c, self.reasoning_parser)
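Editor's note: how the new class surfaces to users, as a hedged sketch. It assumes a local Xinference server on the default port with a model from SGLANG_SUPPORTED_VISION_MODEL_LIST launched on the SGLang engine; the endpoint, model uid, and image URL are assumptions, not taken from this diff.

import openai

# Xinference exposes an OpenAI-compatible API; api_key just needs to be non-empty.
client = openai.Client(api_key="not empty", base_url="http://127.0.0.1:9997/v1")

completion = client.chat.completions.create(
    model="qwen2.5-vl-instruct",  # hypothetical model uid
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    # A remote URL or data: URI; SGLANGVisionModel re-encodes
                    # PIL images to base64 before forwarding them as
                    # "image_data" to the SGLang server.
                    "image_url": {"url": "https://example.com/cat.jpg"},
                },
            ],
        }
    ],
)
print(completion.choices[0].message.content)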
xinference/model/llm/__init__.py
@@ -81,6 +81,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "cogagent",
     "gemma-3-1b-it",
     "gemma-3-it",
+    "deepseek-vl2",
 ]


xinference/model/llm/transformers/deepseek_vl.py
@@ -46,7 +46,7 @@ class DeepSeekVLChatModel(PytorchChatModel):
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
-        if "deepseek-vl" in llm_family:
+        if "deepseek-vl" == llm_family.lower():
             return True
         return False

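Editor's note: the motivation for tightening this matcher from a substring test to an exact comparison is that the deepseek-vl2 family added in this release would otherwise also match. A two-line illustration:

family = "deepseek-vl2"
assert "deepseek-vl" in family                    # old check: false positive on vl2
assert not ("deepseek-vl" == family.lower())      # new check correctly rejects it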
xinference/model/llm/transformers/deepseek_vl2.py (new file)
@@ -0,0 +1,287 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import tempfile
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+import torch
+
+from ....model.utils import select_device
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekVL2ChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._vl_chat_processor = None
+        self._type = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        llm_family = model_family.model_family or model_family.model_name
+        if "deepseek-vl2" == llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoModelForCausalLM
+
+        from ....thirdparty.deepseek_vl2.models import (
+            DeepseekVLV2ForCausalLM,
+            DeepseekVLV2Processor,
+        )
+
+        self._device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(self._device)
+        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+
+        # specify the path to the model
+        self._vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(  # type: ignore
+            self.model_path
+        )
+        self._tokenizer = self._vl_chat_processor.tokenizer
+
+        vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
+            self.model_path, trust_remote_code=True, device_map=self._device
+        )
+        self._model = vl_gpt.to(torch.bfloat16).cuda().eval()
+
+    @staticmethod
+    def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+
+                return _url
+
+        def _download(_images):
+            local_images = []
+
+            # To make requests.get works
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+            }
+            with ThreadPoolExecutor() as executor:
+                for url in images:
+                    try:
+                        if os.path.exists(url):
+                            local_images.append(url)
+                            continue
+                    except Exception as e:
+                        logger.debug("Image is remote: %s, e: %s", url, e)
+                        pass
+                    # Append a placeholder
+                    local_images.append(None)
+
+                    def _fill_placeholder(_url, _index):
+                        response = requests.get(url, headers=headers)
+                        local_images[_index] = BytesIO(response.content)
+
+                    executor.submit(_fill_placeholder, url, len(local_images) - 1)
+            return local_images
+
+        if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
+
+            images = []
+            new_content = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "image_url":
+                    images.append(_ensure_url(c["image_url"]["url"]))
+                elif c_type == "text":
+                    new_content.append(c["text"])
+            if images:
+                new_content.insert(0, "<image_placeholder>")
+                images = _download(images)
+            return "".join(new_content), images
+        return content, []
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        prompt = ""
+        deepseek_messages = []
+        for i, message in enumerate(messages):
+            role = message["role"]
+            content = message["content"]
+            if role == "user":
+                if isinstance(content, str):
+                    deepseek_messages.append(
+                        {
+                            "role": "<|User|>",
+                            "content": "<image>\n<|ref|>" + content + "<|/ref|>",
+                        }
+                    )
+                else:
+                    content, images = self._message_content_to_deepseek(content)
+                    msg: Dict[str, Any] = {
+                        "role": "<|User|>",
+                        "content": "<image>\n<|ref|>" + content + "<|/ref|>",
+                    }
+                    if images:
+                        msg["images"] = images
+                    deepseek_messages.append(msg)
+                deepseek_messages.append({"role": "<|Assistant|>", "content": ""})
+                if i == len(messages) - 1:
+                    prompt = "<image>\n<|ref|>" + content + "<|/ref|>"
+            elif role == "assistant":
+                deepseek_messages.append({"role": "<|Assistant|>", "content": content})
+            else:
+                logger.error(
+                    f"Unexpected message in messages: role: {role}, message: {message}"
+                )
+
+        from ....thirdparty.deepseek_vl2.utils.io import load_pil_images
+
+        # load images and prepare for inputs
+        pil_images = load_pil_images(deepseek_messages)
+        prepare_inputs = self._vl_chat_processor(
+            conversations=deepseek_messages,
+            images=pil_images,
+            force_batchify=True,
+            system_prompt="",
+        ).to(self._model.device, self._model.dtype)
+
+        # run image encoder to get the image embeddings
+        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
+
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+
+        streamer = self._model.language.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=self._tokenizer.eos_token_id,
+            bos_token_id=self._tokenizer.bos_token_id,
+            eos_token_id=self._tokenizer.eos_token_id,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+            use_cache=True,
+        )
+
+        if stream:
+            it = self._generate_stream(streamer, stop_str, include_usage, prompt)
+            return self._to_chat_completion_chunks(it)
+        else:
+            return self._generate(streamer, stop_str)
+
+    def _generate(self, streamer, stop_str) -> ChatCompletion:
+        generated_text = ""
+
+        for new_text in streamer:
+            if isinstance(new_text, torch.Tensor):
+                new_text = self._tokenizer.decode(
+                    new_text.cpu().tolist(), skip_special_tokens=True
+                )
+
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+
+            generated_text += new_text
+
+        return generate_chat_completion(self.model_uid, generated_text)
+
+    def _generate_stream(
+        self, streamer, stop_str, include_usage, prompt
+    ) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt).input_ids
+        prompt_tokens = len(input_ids)
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+                has_choice=True,
+                has_content=True,
+            )
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
+        )
+
+        if include_usage:
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
+            )
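Editor's note: a hedged usage sketch for the new model class, assuming a running Xinference endpoint and that the deepseek-vl2 spec registered elsewhere in this release resolves locally. The endpoint, uid, and image URL are illustrative assumptions.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="deepseek-vl2",
    model_engine="transformers",  # served by DeepSeekVL2ChatModel above
)
model = client.get_model(uid)
completion = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(completion["choices"][0]["message"]["content"])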
xinference/model/llm/utils.py
@@ -255,19 +255,26 @@ class ChatModelMixin:
             and choices
             and "delta" in choices[0]
         ):
-            if reasoning_parser is not None:
-                # process parsing reasoning content
-                assert previous_texts is not None
+            if choices[0]["finish_reason"] is None:
+                if reasoning_parser is not None:
+                    # process parsing reasoning content
+                    assert previous_texts is not None
+                    delta = choices[0]["delta"]  # type: ignore
+                    if text := delta.get("content"):
+                        current_text = previous_texts[-1] + text
+                        delta = reasoning_parser.extract_reasoning_content_streaming(
+                            previous_text=previous_texts[-1],
+                            current_text=current_text,
+                            delta_text=text,
+                        )
+                        previous_texts[-1] = current_text
+                        choices[0]["delta"] = delta  # type: ignore
+            elif choices[0]["finish_reason"] is not None:
                 delta = choices[0]["delta"]  # type: ignore
-                if text := delta.get("content"):
-                    current_text = previous_texts[-1] + text
-                    delta = reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_texts[-1],
-                        current_text=current_text,
-                        delta_text=text,
-                    )
-                    previous_texts[-1] = current_text
-                    choices[0]["delta"] = delta  # type: ignore
+                if "content" not in delta:
+                    delta["content"] = ""  # type: ignore
+                if reasoning_parser is not None:
+                    delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
@@ -286,7 +293,11 @@ class ChatModelMixin:
                         delta_text=choice["text"],
                     )
                     previous_texts[-1] = current_text
-                if "tool_calls" in choice:
+                elif "text" in choice and choice["finish_reason"] is not None:
+                    delta["content"] = choice["text"]
+                    if reasoning_parser is not None:
+                        delta["reasoning_content"] = None
+                elif "tool_calls" in choice:
                     delta["tool_calls"] = choice["tool_calls"]
                 choices_list.append(
                     {
@@ -319,8 +330,9 @@ class ChatModelMixin:
     ) -> ChatCompletionChunk:
         choices_list = []
         for i, choice in enumerate(chunk["choices"]):
-            delta = {"role": "assistant", "content": ""}
+            delta = ChatCompletionChunkDelta(role="assistant", content="")
             if reasoning_parser is not None:
+                delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
                 {
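Editor's note: the net effect of these utils.py changes on a stream's final chunk, as a hedged sketch using plain dicts in place of the typed ChatCompletionChunk structures.

# Before the patch, a finish chunk could arrive with an empty delta, e.g.
# {"delta": {}, "finish_reason": "stop"}, so clients indexing
# delta["content"] would break. The patched code normalizes it; when a
# reasoning parser is active, "reasoning_content" is set to None as well.
final_delta = {}  # stand-in for the delta of a finish_reason != None chunk
if "content" not in final_delta:
    final_delta["content"] = ""
reasoning_parser = object()  # stand-in: a reasoning parser is configured
if reasoning_parser is not None:
    final_delta["reasoning_content"] = None
assert final_delta == {"content": "", "reasoning_content": None}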