xinference 1.3.1__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
  version_json = '''
  {
- "date": "2025-03-09T12:06:50+0800",
+ "date": "2025-03-11T12:00:36+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "5d6ec937ce2aca2511e9e0debc4c2ab06ca41f09",
- "version": "1.3.1"
+ "full-revisionid": "2ef99fbb5450a76a6ba07a909f58b8c2e4c22a28",
+ "version": "1.3.1.post1"
  }
  ''' # END VERSION_JSON
 
xinference/core/chat_interface.py CHANGED
@@ -113,6 +113,7 @@ class GradioInterface:
      max_tokens: int,
      temperature: float,
      lora_name: str,
+     stream: bool,
  ) -> Generator:
      from ..client import RESTfulClient
 
@@ -123,29 +124,40 @@ class GradioInterface:
  messages = to_chat(flatten(history))
  messages.append(dict(role="user", content=message))
 
- response_content = ""
- for chunk in model.chat(
-     messages,
-     generate_config={
-         "max_tokens": int(max_tokens),
-         "temperature": temperature,
-         "stream": True,
-         "lora_name": lora_name,
-     },
- ):
-     assert isinstance(chunk, dict)
-     delta = chunk["choices"][0]["delta"]
-     if "content" not in delta:
-         continue
-     else:
-         # some model like deepseek-r1-distill-qwen
-         # will generate <think>...</think> ...
-         # in gradio, no output will be rendered,
-         # thus escape html tags in advance
-         response_content += html.escape(delta["content"])
-         yield response_content
-
- yield response_content
+ if stream:
+     response_content = ""
+     for chunk in model.chat(
+         messages,
+         generate_config={
+             "max_tokens": int(max_tokens),
+             "temperature": temperature,
+             "stream": True,
+             "lora_name": lora_name,
+         },
+     ):
+         assert isinstance(chunk, dict)
+         delta = chunk["choices"][0]["delta"]
+         if "content" not in delta:
+             continue
+         else:
+             # some model like deepseek-r1-distill-qwen
+             # will generate <think>...</think> ...
+             # in gradio, no output will be rendered,
+             # thus escape html tags in advance
+             response_content += html.escape(delta["content"])
+             yield response_content
+
+     yield response_content
+ else:
+     result = model.chat(
+         messages,
+         generate_config={
+             "max_tokens": int(max_tokens),
+             "temperature": temperature,
+             "lora_name": lora_name,
+         },
+     )
+     yield html.escape(result["choices"][0]["message"]["content"])  # type: ignore
 
  return gr.ChatInterface(
      fn=generate_wrapper,
@@ -153,7 +165,9 @@ class GradioInterface:
  gr.Slider(
      minimum=1,
      maximum=self.context_length,
-     value=512,
+     value=512
+     if "reasoning" not in self.model_ability
+     else self.context_length // 2,
      step=1,
      label="Max Tokens",
  ),
@@ -161,6 +175,7 @@ class GradioInterface:
      minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
  ),
  gr.Text(label="LoRA Name"),
+ gr.Checkbox(label="Stream", value=True),
  ],
  title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
  css="""
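The else-branch added above returns the whole completion in one response instead of incremental deltas. A rough usage sketch of that non-streaming path (not taken from this diff): it assumes a running Xinference server at localhost:9997, and "my-model-uid" is a placeholder for an already-launched chat model.

# Rough sketch of the non-streaming path the new Stream checkbox toggles.
# Assumes a running server at localhost:9997; "my-model-uid" is a placeholder.
from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")
model = client.get_model("my-model-uid")

# Without "stream": True, chat() returns the full completion at once,
# which is what the new else-branch in generate_wrapper relies on.
result = model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    generate_config={"max_tokens": 256, "temperature": 1.0},
)
print(result["choices"][0]["message"]["content"])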
xinference/model/llm/core.py CHANGED
@@ -25,8 +25,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
  from ...core.utils import parse_replica_model_uid
  from ...types import PeftModelConfig
  from ..core import ModelDescription
- from .reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
- from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
+ from .reasoning_parser import ReasoningParser
 
  if TYPE_CHECKING:
      from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -123,9 +122,7 @@ class LLM(abc.ABC):
  def prepare_parse_reasoning_content(self, reasoning_content):
      # Initialize reasoning parser if model has reasoning ability
      if "reasoning" in self.model_family.model_ability and reasoning_content:
-         module_name = self.model_family.model_family or self.model_family.model_name
-         self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-         self.reasoning_parser = self.reasoning_parser(
+         self.reasoning_parser = ReasoningParser(
              self.model_family.reasoning_start_tag,
              self.model_family.reasoning_end_tag,
          )
xinference/model/llm/llama_cpp/core.py CHANGED
@@ -43,7 +43,7 @@ class _Sentinel:
      pass
 
 
- class XllamaCppModel(LLM):
+ class XllamaCppModel(LLM, ChatModelMixin):
      def __init__(
          self,
          model_uid: str,
@@ -83,6 +83,7 @@ class XllamaCppModel(LLM):
          llamacpp_model_config.setdefault("n_gpu_layers", -1)
      elif self._is_linux():
          llamacpp_model_config.setdefault("n_gpu_layers", -1)
+     llamacpp_model_config.setdefault("reasoning_content", False)
 
      return llamacpp_model_config
 
@@ -131,6 +132,9 @@ class XllamaCppModel(LLM):
 
      raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+ reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+ self.prepare_parse_reasoning_content(reasoning_content)
+
  if os.path.isfile(self.model_path):
      # mostly passed from --model_path
      model_path = os.path.realpath(self.model_path)
@@ -274,9 +278,11 @@ class XllamaCppModel(LLM):
          while (r := q.get()) is not _Sentinel:
              yield r
 
-     return _to_iterator()
+     return self._to_chat_completion_chunks(
+         _to_iterator(), self.reasoning_parser
+     )
  else:
-     return q.get()
+     return self._to_chat_completion(q.get(), self.reasoning_parser)
 
 
  class LlamaCppModel(LLM):
xinference/model/llm/llm_family.json CHANGED
@@ -9449,7 +9449,7 @@
  },
  {
      "version": 1,
-     "context_length": 32768,
+     "context_length": 131072,
      "model_name": "QwQ-32B",
      "model_lang": [
          "en",
@@ -9496,15 +9496,99 @@
      "model_size_in_billions": 32,
      "quantizations": [
          "fp16",
-         "Q2_k",
-         "Q3_K_M",
-         "Q4_0",
-         "Q4_K_M",
-         "Q5_0",
-         "Q5_K_M",
-         "Q6_K",
-         "Q8_0"
+         "q2_k",
+         "q3_k_m",
+         "q4_0",
+         "q4_k_m",
+         "q5_0",
+         "q5_k_m",
+         "q6_k",
+         "q8_0"
      ],
+     "quantization_parts": {
+         "fp16": [
+             "00001-of-000017",
+             "00002-of-000017",
+             "00003-of-000017",
+             "00004-of-000017",
+             "00005-of-000017",
+             "00006-of-000017",
+             "00007-of-000017",
+             "00008-of-000017",
+             "00009-of-000017",
+             "00010-of-000017",
+             "00011-of-000017",
+             "00012-of-000017",
+             "00013-of-000017",
+             "00014-of-000017",
+             "00015-of-000017",
+             "00016-of-000017",
+             "00017-of-000017"
+         ],
+         "q2_k": [
+             "00001-of-00004",
+             "00002-of-00004",
+             "00003-of-00004",
+             "00004-of-00004"
+         ],
+         "q3_k_m": [
+             "00001-of-00005",
+             "00002-of-00005",
+             "00003-of-00005",
+             "00004-of-00005",
+             "00005-of-00005"
+         ],
+         "q4_0": [
+             "00001-of-00005",
+             "00002-of-00005",
+             "00003-of-00005",
+             "00004-of-00005",
+             "00005-of-00005"
+         ],
+         "q4_k_m": [
+             "00001-of-00005",
+             "00002-of-00005",
+             "00003-of-00005",
+             "00004-of-00005",
+             "00005-of-00005"
+         ],
+         "q5_0": [
+             "00001-of-00006",
+             "00002-of-00006",
+             "00003-of-00006",
+             "00004-of-00006",
+             "00005-of-00006",
+             "00006-of-00006"
+         ],
+         "q5_k_m": [
+             "00001-of-00006",
+             "00002-of-00006",
+             "00003-of-00006",
+             "00004-of-00006",
+             "00005-of-00006",
+             "00006-of-00006"
+         ],
+         "q6_k": [
+             "00001-of-00007",
+             "00002-of-00007",
+             "00003-of-00007",
+             "00004-of-00007",
+             "00005-of-00007",
+             "00006-of-00007",
+             "00007-of-00007"
+         ],
+         "q8_0": [
+             "00001-of-00009",
+             "00002-of-00009",
+             "00003-of-00009",
+             "00004-of-00009",
+             "00005-of-00009",
+             "00006-of-00009",
+             "00007-of-00009",
+             "00008-of-00009",
+             "00009-of-00009"
+         ]
+     },
      "model_id": "Qwen/QwQ-32B-GGUF",
      "model_file_name_template": "qwq-32b-{quantization}.gguf"
  }
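The quantizations were lowercased because the GGUF shards in the Qwen/QwQ-32B-GGUF repo use lowercase names, so `model_file_name_template` only resolves to real file names with lowercase values. Below is a small sketch of how the template and the new `quantization_parts` map could combine into shard file names; the `-{part}` suffix scheme is an assumption for illustration, not taken from xinference's loader.

# Sketch: expanding model_file_name_template with quantization_parts.
# The "-{part}" shard suffix is an illustrative assumption, not the actual
# naming scheme from xinference's loader.
from typing import Dict, List

template = "qwq-32b-{quantization}.gguf"
quantization_parts: Dict[str, List[str]] = {
    "q4_k_m": [
        "00001-of-00005", "00002-of-00005", "00003-of-00005",
        "00004-of-00005", "00005-of-00005",
    ],
}

def shard_names(quantization: str) -> List[str]:
    base = template.format(quantization=quantization)
    parts = quantization_parts.get(quantization)
    if not parts:
        # single-file quantizations resolve to one name
        return [base]
    return [f"{base}-{part}" for part in parts]

print(shard_names("q4_k_m")[0])  # qwq-32b-q4_k_m.gguf-00001-of-00005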
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -7217,7 +7217,7 @@
      ],
      "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
      "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
-     "model_hub": "modelscope"
+     "model_hub": "modelscope"
  }
  ],
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -7234,7 +7234,7 @@
  },
  {
      "version": 1,
-     "context_length": 32768,
+     "context_length": 131072,
      "model_name": "QwQ-32B",
      "model_lang": [
          "en",
@@ -7284,14 +7284,14 @@
      "model_size_in_billions": 32,
      "quantizations": [
          "fp16",
-         "Q2_k",
-         "Q3_K_M",
-         "Q4_0",
-         "Q4_K_M",
-         "Q5_0",
-         "Q5_K_M",
-         "Q6_K",
-         "Q8_0"
+         "q2_k",
+         "q3_k_m",
+         "q4_0",
+         "q4_k_m",
+         "q5_0",
+         "q5_k_m",
+         "q6_k",
+         "q8_0"
      ],
      "model_id": "Qwen/QwQ-32B-GGUF",
      "model_file_name_template": "qwq-32b-{quantization}.gguf",
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py → xinference/model/llm/reasoning_parser.py RENAMED
@@ -1,20 +1,17 @@
  import re
  from typing import Optional, Tuple, Union
 
- from ....types import ChatCompletionChunkDelta, CompletionChoice
- from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+ from ...types import ChatCompletionChunkDelta, CompletionChoice
 
 
- @ReasoningParserManager.register_module("deepseek-v3")
- @ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
- @ReasoningParserManager.register_module("deepseek-r1-distill-llama")
- class DeepSeekR1ReasoningParser(ReasoningParser):
-     """Reasoning parser for DeepSeek-R1 model."""
+ class ReasoningParser:
+     """Reasoning parser for reasoning model."""
 
      def __init__(
          self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
      ):
-         super().__init__(reasoning_start_tag, reasoning_end_tag)
+         self.reasoning_start_tag = reasoning_start_tag
+         self.reasoning_end_tag = reasoning_end_tag
          self.reasoning_regex = re.compile(
              rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
          )
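The diff stops before the method bodies, so here is a standalone sketch of the tag-splitting idea (re-implemented for illustration, not quoted from the source): the regex compiled in `__init__` pulls the reasoning block out and leaves the remainder as the answer.

# Standalone sketch of non-streaming extraction; mirrors the regex compiled in
# __init__ above, but is not the actual method body from reasoning_parser.py.
import re
from typing import Optional, Tuple

class TagSplitSketch:
    def __init__(self, start_tag: str = "<think>", end_tag: str = "</think>"):
        self.reasoning_regex = re.compile(rf"{start_tag}(.*?){end_tag}", re.DOTALL)

    def extract(self, model_output: str) -> Tuple[Optional[str], Optional[str]]:
        match = self.reasoning_regex.search(model_output)
        if match is None:
            # no reasoning block: everything is answer content
            return None, model_output
        reasoning = match.group(1)
        content = model_output[match.end():].lstrip("\n")
        return reasoning, content or None

parser = TagSplitSketch()
print(parser.extract("<think>check both digits</think>The answer is 42."))
# ('check both digits', 'The answer is 42.')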
xinference/model/llm/utils.py CHANGED
@@ -55,7 +55,7 @@ from .llm_family import (
      _get_cache_dir,
      get_cache_status,
  )
- from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
+ from .reasoning_parser import ReasoningParser
 
  logger = logging.getLogger(__name__)
 
@@ -250,8 +250,30 @@ class ChatModelMixin:
      reasoning_parser: Optional[ReasoningParser] = None,
      previous_texts: Optional[List[str]] = None,
  ) -> ChatCompletionChunk:
+     choices = chunk.get("choices")
+     if (
+         chunk.get("object") == "chat.completion.chunk"
+         and choices
+         and "delta" in choices[0]
+     ):
+         if reasoning_parser is not None:
+             # process parsing reasoning content
+             assert previous_texts is not None
+             delta = choices[0]["delta"]  # type: ignore
+             if text := delta.get("content"):
+                 current_text = previous_texts[-1] + text
+                 delta = reasoning_parser.extract_reasoning_content_streaming(
+                     previous_text=previous_texts[-1],
+                     current_text=current_text,
+                     delta_text=text,
+                 )
+                 previous_texts[-1] = current_text
+                 choices[0]["delta"] = delta  # type: ignore
+         # Already a ChatCompletionChunk, we don't need to convert chunk.
+         return cast(ChatCompletionChunk, chunk)
+
      choices_list = []
-     for i, choice in enumerate(chunk["choices"]):
+     for i, choice in enumerate(choices):  # type: ignore
          delta = ChatCompletionChunkDelta()
          if "text" in choice and choice["finish_reason"] is None:
              if reasoning_parser is None:
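The streaming branch above delegates the per-delta split to `extract_reasoning_content_streaming`, whose body is not shown in this diff. A minimal sketch of the state logic implied by the `previous_text`/`current_text`/`delta_text` contract, assuming `<think>` tags and ignoring start-tag stripping:

# Minimal sketch of per-delta routing between reasoning_content and content;
# the real extract_reasoning_content_streaming is not shown in this diff.
END_TAG = "</think>"

def route_delta(previous_text: str, current_text: str, delta_text: str) -> dict:
    if END_TAG in previous_text:
        # End tag already seen: everything from now on is answer content.
        return {"role": "assistant", "content": delta_text}
    if END_TAG in current_text:
        # Tag completed inside this delta: split at the tag boundary.
        before, _, after = current_text.partition(END_TAG)
        seen = len(previous_text)
        return {
            "role": "assistant",
            "reasoning_content": before[seen:],
            "content": after,
        }
    # Still inside the reasoning block.
    return {"role": "assistant", "reasoning_content": delta_text}

print(route_delta("<think>abc", "<think>abcd</think>Ans", "d</think>Ans"))
# {'role': 'assistant', 'reasoning_content': 'd', 'content': 'Ans'}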
@@ -345,9 +367,10 @@ class ChatModelMixin:
      if not choices:
          yield cls._get_final_chat_completion_chunk(chunk)
      else:
-         yield cls._to_chat_completion_chunk(
+         r = cls._to_chat_completion_chunk(
              chunk, reasoning_parse, previous_texts
          )
+         yield r
 
  @classmethod
  def _tools_to_messages_for_deepseek(
@@ -405,6 +428,21 @@ class ChatModelMixin:
  def _to_chat_completion(
      completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
  ) -> ChatCompletion:
+     if completion.get("object") == "chat.completion" and completion.get("choices"):
+         # Already a ChatCompletion
+         if reasoning_parser is not None:
+             for choice in completion["choices"]:
+                 message = choice["message"]  # type: ignore
+                 text = message["content"]
+                 (
+                     reasoning_content,
+                     content,
+                 ) = reasoning_parser.extract_reasoning_content(text)
+                 message["content"] = content
+                 if reasoning_content is not None:
+                     message["reasoning_content"] = reasoning_content
+         return cast(ChatCompletion, completion)
+
      choices = []
      for i, choice in enumerate(completion["choices"]):
          content = choice["text"]
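The effect of this early-return branch is easiest to see on a finished message; an illustrative before/after shape (the strings are made up, the field layout follows the hunk above):

# Illustrative before/after of the reasoning split on a finished message.
raw_message = {
    "role": "assistant",
    "content": "<think>compare the decimals</think>9.9 is larger.",
}

# After _to_chat_completion runs with a reasoning_parser:
split_message = {
    "role": "assistant",
    "content": "9.9 is larger.",
    "reasoning_content": "compare the decimals",
}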
xinference/model/llm/vllm/core.py CHANGED
@@ -576,6 +576,8 @@ class VLLMModel(LLM):
 
  sanitized_generate_config = self._sanitize_generate_config(generate_config)
  if self.reasoning_parser:
+     # For reasoning models, the </think> tag may be split across multiple
+     # tokens when a `stop` param is passed, so we pop it from the config.
      sanitized_generate_config.pop("stop")
  logger.debug(
      "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
xinference-1.3.1.dist-info/METADATA → xinference-1.3.1.post1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: xinference
- Version: 1.3.1
+ Version: 1.3.1.post1
  Summary: Model Serving Made Easy
  Home-page: https://github.com/xorbitsai/inference
  Author: Qin Xuye
@@ -292,6 +292,7 @@ potential of cutting-edge AI models.
 
  ## 🔥 Hot Topics
  ### Framework Enhancements
+ - [Xllamacpp](https://github.com/xorbitsai/xllamacpp): a new llama.cpp Python binding, maintained by the Xinference team, that supports continuous batching and is more production-ready: [#2997](https://github.com/xorbitsai/inference/pull/2997)
  - Distributed inference: running models across workers: [#2877](https://github.com/xorbitsai/inference/pull/2877)
  - VLLM enhancement: Shared KV cache across multiple replicas: [#2732](https://github.com/xorbitsai/inference/pull/2732)
  - Support Continuous batching for Transformers engine: [#1724](https://github.com/xorbitsai/inference/pull/1724)
@@ -299,8 +300,8 @@ potential of cutting-edge AI models.
  - Support specifying worker and GPU indexes for launching models: [#1195](https://github.com/xorbitsai/inference/pull/1195)
  - Support SGLang backend: [#1161](https://github.com/xorbitsai/inference/pull/1161)
  - Support LoRA for LLM and image models: [#1080](https://github.com/xorbitsai/inference/pull/1080)
- - Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
  ### New Models
+ - Built-in support for [QwQ-32B](https://qwenlm.github.io/blog/qwq-32b/): [#3005](https://github.com/xorbitsai/inference/pull/3005)
  - Built-in support for [DeepSeek V3 and R1](https://github.com/deepseek-ai/DeepSeek-R1): [#2864](https://github.com/xorbitsai/inference/pull/2864)
  - Built-in support for [InternVL2.5](https://internvl.github.io/blog/2024-12-05-InternVL-2.5/): [#2776](https://github.com/xorbitsai/inference/pull/2776)
  - Built-in support for [DeepSeek-R1-Distill-Llama](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2811](https://github.com/xorbitsai/inference/pull/2811)
@@ -308,7 +309,6 @@ potential of cutting-edge AI models.
  - Built-in support for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M): [#2790](https://github.com/xorbitsai/inference/pull/2790)
  - Built-in support for [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788)
  - Built-in support for [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789)
- - Built-in support for [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760)
  ### Integrations
  - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
  - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
xinference-1.3.1.dist-info/RECORD → xinference-1.3.1.post1.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
  xinference/__init__.py,sha256=nmTTrYbIpj964ZF6ojtgOM7E85JBOj1EyQbmYjbj1jw,915
  xinference/_compat.py,sha256=URSJQLXrcsTO9B_4x0wVDPijYQDhuVJmZ95npID560w,4197
- xinference/_version.py,sha256=Qg0ZT0v5l0WLBFPFak_ZgiMLc3mfmofYyFkTV9kMIUE,497
+ xinference/_version.py,sha256=qMz600g9USAjrV1nTxM3bBcpOiTBWs1VJPEtdMggVGg,503
  xinference/conftest.py,sha256=ZB7li77s4_H4ZEQpDo2PX-b4zrs8-bIpvh59P_CaSoo,9691
  xinference/constants.py,sha256=mEW4HDzjXtDXN61Mt6TtJrJ4ljbB6VUkh97e3oDbNx4,3905
  xinference/device_utils.py,sha256=ELsqvnjvz9wYthTyQFzKSV4mZsaASz6hj_IsfMmfMWc,4447
@@ -21,7 +21,7 @@ xinference/client/restful/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7
  xinference/client/restful/restful_client.py,sha256=DofFF0ZaOmBpCVp9qtAeYDGbvd-KS5u4_GMGp8AbbM4,53994
  xinference/core/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
  xinference/core/cache_tracker.py,sha256=3ubjYCU5aZToSp2GEuzedECVrg-PR4kThTefrFUkb9g,6971
- xinference/core/chat_interface.py,sha256=5fUr9-OLrFTZ5TvFGE8gX4N_-N4EmYRp74b5fD6cyAU,21048
+ xinference/core/chat_interface.py,sha256=X5ZC91M_uKIG8NW1xupKUDNoqzUHMpLp4-ijf-YhjbE,21766
  xinference/core/event.py,sha256=42F38H2WOl6aPxp2oxX6WNxHRRxbnvYRmbt4Ar7NP4U,1640
  xinference/core/image_interface.py,sha256=5Iuoiw3g2TvgOYi3gRIAGApve2nNzfMPduRrBHvd1NY,13755
  xinference/core/metrics.py,sha256=ScmTG15Uq3h_ob72ybZSMWdnk8P4sUZFcm60f4ikSXc,2631
@@ -87,23 +87,21 @@ xinference/model/image/stable_diffusion/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17k
  xinference/model/image/stable_diffusion/core.py,sha256=V3BaASwx8q1YERb4jhhaYEDFiwh3BuPAz8pVZTuktAQ,24717
  xinference/model/image/stable_diffusion/mlx.py,sha256=GZsozzGB04NfHAdU9MI6gwWE1t_A-s_Ddn_ic8DlkKQ,7476
  xinference/model/llm/__init__.py,sha256=UJOSz9zr5mAj8Fm09yoZbEe4xBWYnSxUV9aGE50e5dc,14184
- xinference/model/llm/core.py,sha256=SLtv3VHedM4ZSfqa_3oQN5qZfbPhWq2hBr4U17gvPPc,8908
- xinference/model/llm/llm_family.json,sha256=sCYtQ7d2ai9Q7zHyUM4VL4fPv8GeiylXVP_VHaLEsaE,372173
+ xinference/model/llm/core.py,sha256=2AYRKdiJ5L1iKU9CE_C09IbEtE2KrsIy4dqkqg2txes,8626
+ xinference/model/llm/llm_family.json,sha256=xszAQbwI5lvkdcxRSoowNddvPfuFF2aT6xnvzeyzo8w,374447
  xinference/model/llm/llm_family.py,sha256=SrgTmEKspAELhVqmMs7Rz6xUk7rmc9V61urvbWAZOVE,39214
  xinference/model/llm/llm_family_csghub.json,sha256=zMKWbihsxQNVB1u5iKJbZUkbOfQ4IPNq1KQ-8IDPQQA,8759
- xinference/model/llm/llm_family_modelscope.json,sha256=NRwwsLNUYIwJvRnHlf-4kAThSm3Mqdtzq65wHGPnxP4,304584
+ xinference/model/llm/llm_family_modelscope.json,sha256=2OuPertAGKnryliUofjnqemLrhHW1aaq6-4tPsrbKNI,304592
  xinference/model/llm/llm_family_openmind_hub.json,sha256=jl9pfbe5DztoxgEwKBxDk1Wd7TziTiJ48_Ie_lJdYjA,67872
  xinference/model/llm/memory.py,sha256=GLNmXBI-AtMbuaJfEf50fnhN4rdbOZjLyT6L_Vjqa5g,10206
- xinference/model/llm/utils.py,sha256=5Gp6b0AwaNNFdwb72QnVD_l7uKSlDd3ORhnRNm_ZYnM,31102
+ xinference/model/llm/reasoning_parser.py,sha256=Gqf2WS5olkD2SfJ3wlCAQsmQUZOIc1185h_01pIs7TQ,6067
+ xinference/model/llm/utils.py,sha256=QYYGfaPYr3GG7dyowNgs4qyayid-gOtJ_ToXqAMVhSU,32846
  xinference/model/llm/llama_cpp/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
- xinference/model/llm/llama_cpp/core.py,sha256=0ChfDq8PC2f2bo4EqbAdLEnvmxtXx5imK1_73oy7f6o,20815
+ xinference/model/llm/llama_cpp/core.py,sha256=rUVyaP5tE6xz9jy0m4iZ0ys99vl3sicnWZsEnbaXPfw,21181
  xinference/model/llm/lmdeploy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  xinference/model/llm/lmdeploy/core.py,sha256=WvSP3x6t-HBv6hKh1qWZatFAzlcZCyyKqvc3ua8yPTI,19835
  xinference/model/llm/mlx/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
  xinference/model/llm/mlx/core.py,sha256=l4_MKw5UckM81kaCwgriy0KZU3zPN38p36P3J9USmgA,23568
- xinference/model/llm/reasoning_parsers/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py,sha256=YtOOVbSl6fLugn3vmzo_AQbbjl6H5kX9DPpP9KP3gnY,3004
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py,sha256=HmnAsNcoeUpyUSTNF0j_0Z4Am7OKiGrJnhNj-BhtQf0,6323
  xinference/model/llm/sglang/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
  xinference/model/llm/sglang/core.py,sha256=tMbvQOwQu5uBXBTMK5Vh-FR2Gc-Nbc0HIhp2iy47wCA,20606
  xinference/model/llm/transformers/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
@@ -130,7 +128,7 @@ xinference/model/llm/transformers/tensorizer_utils.py,sha256=VXSYbPZtCbd8lVvsnjD
  xinference/model/llm/transformers/utils.py,sha256=KETjuVR_RpF--fno0KxT068fD1v4REFhe-0wy_sCwRs,19584
  xinference/model/llm/transformers/yi_vl.py,sha256=iCdRLw-wizbU-qXXc8CT4DhC0Pt-uYg0vFwXEhAZjQg,8961
  xinference/model/llm/vllm/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
- xinference/model/llm/vllm/core.py,sha256=NIGZy-v5k3tAKFOWpeyJnKDIC7ghaAHvzCD9a2uidtM,38113
+ xinference/model/llm/vllm/core.py,sha256=j5sdlrctBnouLJAfBs0Ofa1JbngTlYsDzrs2ManQN0o,38261
  xinference/model/llm/vllm/utils.py,sha256=LKOmwfFRrlSecawxT-uE39tC2RQbf1UIiSH9Uz90X6w,1313
  xinference/model/llm/vllm/xavier/__init__.py,sha256=CyLLkbImZouAk4lePIgKXT4WQoqyauIEwdqea5IOUVU,581
  xinference/model/llm/vllm/xavier/allocator.py,sha256=SJ2eCOxF6CWTBZIP39FRxeK6fxIE8pRshOPnSRc72d4,2691
@@ -15726,9 +15724,9 @@ xinference/web/ui/node_modules/yup/package.json,sha256=xRFSROB9NKxqSWHEVFvSTsPs9
  xinference/web/ui/node_modules/yup/node_modules/type-fest/package.json,sha256=JTv2zTTVgxQ2H82m1-6qEpdMv08lHjFx4Puf_MsbB_Q,1134
  xinference/web/ui/src/locales/en.json,sha256=5MN-GKLcPOeUAsDbv_MRvD4uf86WsvUC6rhzTAtQevA,8925
  xinference/web/ui/src/locales/zh.json,sha256=27HeH4Qc96KuDJ9cgb4OybpUFG-GuuUiaD0ASsg3lyQ,8666
- xinference-1.3.1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
- xinference-1.3.1.dist-info/METADATA,sha256=YDvNiU9VGF9Kw0t9iy_RCk-ORuGiCz5sbY-9gXYC3L0,24298
- xinference-1.3.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- xinference-1.3.1.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
- xinference-1.3.1.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
- xinference-1.3.1.dist-info/RECORD,,
+ xinference-1.3.1.post1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+ xinference-1.3.1.post1.dist-info/METADATA,sha256=aag4egJ0PONQQ2mu6fWiY9KDHMpqqTzSTHAOPO11vb8,24447
+ xinference-1.3.1.post1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ xinference-1.3.1.post1.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
+ xinference-1.3.1.post1.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
+ xinference-1.3.1.post1.dist-info/RECORD,,
xinference/model/llm/reasoning_parsers/__init__.py DELETED
@@ -1,13 +0,0 @@
- # Copyright 2022-2024 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py DELETED
@@ -1,98 +0,0 @@
- from abc import ABC, abstractmethod
- from typing import Dict, Optional, Tuple, Type, Union
-
- from ....types import ChatCompletionChunkDelta, CompletionChoice
-
-
- class ReasoningParser(ABC):
-     """Abstract base class for reasoning content parsers."""
-
-     def __init__(
-         self,
-         reasoning_start_tag: str = "<think>",
-         reasoning_end_tag: str = "</think>",
-     ):
-         """Initialize the reasoning parser.
-
-         Args:
-             reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
-             reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
-         """
-         self.reasoning_start_tag = reasoning_start_tag
-         self.reasoning_end_tag = reasoning_end_tag
-
-     @abstractmethod
-     def extract_reasoning_content_streaming(
-         self,
-         previous_text: str,
-         current_text: str,
-         delta_text: str,
-     ) -> ChatCompletionChunkDelta:
-         """Extract reasoning content from model output in a streaming fashion.
-
-         Args:
-             content (str): The model output content to parse.
-
-         Yields:
-             str: Extracted reasoning content chunks.
-         """
-         pass
-
-     @abstractmethod
-     def extract_reasoning_content(
-         self, model_output: Union[str, CompletionChoice]
-     ) -> Tuple[Optional[str], Optional[str]]:
-         """Extract reasoning content from model output.
-
-         Args:
-             content (str): The model output content to parse.
-
-         Returns:
-             Optional[str]: Extracted reasoning content, or None if no reasoning content found.
-         """
-         pass
-
-
- class ReasoningParserManager:
-     """Manager class for reasoning parsers."""
-
-     _parsers: Dict[str, Type[ReasoningParser]] = {}
-
-     @classmethod
-     def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
-         """Register a reasoning parser for a specific model.
-
-         Args:
-             model_name (str): The name of the model.
-             parser_cls (Type[ReasoningParser]): The parser class to register.
-         """
-         cls._parsers[model_name] = parser_cls
-
-     @classmethod
-     def register_module(cls, model_name: str):
-         """Decorator for registering a reasoning parser for a specific model.
-
-         Args:
-             model_name (str): The name of the model.
-
-         Returns:
-             Callable: The decorator function.
-         """
-
-         def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
-             cls.register(model_name, parser_cls)
-             return parser_cls
-
-         return _register
-
-     @classmethod
-     def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
-         """Get the registered parser for a specific model.
-
-         Args:
-             model_name (str): The name of the model.
-
-         Returns:
-             Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
-         """
-         return cls._parsers.get(model_name)