xinference 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (38)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +462 -3
  3. xinference/client/restful/async_restful_client.py +158 -5
  4. xinference/client/restful/restful_client.py +131 -0
  5. xinference/core/supervisor.py +12 -0
  6. xinference/model/audio/model_spec.json +20 -20
  7. xinference/model/image/model_spec.json +159 -159
  8. xinference/model/llm/__init__.py +2 -2
  9. xinference/model/llm/llm_family.json +843 -180
  10. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  11. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  12. xinference/model/llm/sglang/core.py +20 -6
  13. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  14. xinference/model/llm/transformers/chatglm.py +3 -0
  15. xinference/model/llm/transformers/core.py +129 -36
  16. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  17. xinference/model/llm/transformers/utils.py +23 -0
  18. xinference/model/llm/utils.py +37 -24
  19. xinference/model/llm/vllm/core.py +128 -69
  20. xinference/model/utils.py +74 -31
  21. xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
  22. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  23. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  24. xinference/types.py +9 -0
  25. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  26. xinference/ui/web/ui/build/index.html +1 -1
  27. xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.45e78536.js} +3 -3
  28. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  29. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  30. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/METADATA +7 -5
  31. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/RECORD +36 -35
  32. xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +0 -1
  33. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
  34. /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -0
  35. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  36. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  37. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  38. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/core.py
@@ -162,3 +162,44 @@ class DistributedModelMixin:
         self.layers = self.layers[: self.end_idx]
         self.layers[: self.start_idx] = [None] * self.start_idx
         self.num_layers = len(self.layers) - self.start_idx
+
+
+class SafeKVCache:
+    """
+    A safe wrapper around mlx_lm's KVCache that handles None keys gracefully.
+    This is needed because mlx_lm's generate function accesses cache.state
+    before the cache is properly initialized.
+    """
+
+    def __init__(self):
+        from mlx_lm.models.cache import KVCache
+
+        self._cache = KVCache()
+
+    @property
+    def state(self):
+        # Safe access to state property
+        if self._cache.keys is None:
+            return None, None
+        if self._cache.offset == self._cache.keys.shape[2]:
+            return self._cache.keys, self._cache.values
+        else:
+            return (
+                self._cache.keys[..., : self._cache.offset, :],
+                self._cache.values[..., : self._cache.offset, :],
+            )
+
+    @state.setter
+    def state(self, v):
+        # Safe setter for state property
+        if v is None or v[0] is None:
+            self._cache.keys = None
+            self._cache.values = None
+            self._cache.offset = 0
+        else:
+            self._cache.keys, self._cache.values = v
+            self._cache.offset = self._cache.keys.shape[2]
+
+    def __getattr__(self, name):
+        # Delegate all other attributes and methods to the underlying cache
+        return getattr(self._cache, name)
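The pattern used here, guarding a lazily initialized state property while forwarding everything else through __getattr__, can be illustrated with a minimal self-contained sketch. _PlainCache and SafeWrapper below are hypothetical stand-ins, not part of the diff; only the guarded property and the delegation mirror the change above.

class _PlainCache:
    """Hypothetical stand-in for a cache whose keys are populated lazily."""

    def __init__(self):
        self.keys = None      # not set until the first update
        self.values = None
        self.offset = 0

    @property
    def state(self):
        # Mirrors the failure mode: raises if accessed before initialization.
        return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]


class SafeWrapper:
    """Guard `state` and delegate everything else to the wrapped cache."""

    def __init__(self, cache):
        self._cache = cache

    @property
    def state(self):
        if self._cache.keys is None:
            return None, None   # safe default instead of a TypeError
        return self._cache.state

    def __getattr__(self, name):
        # Only called when normal lookup fails, so it forwards the rest.
        return getattr(self._cache, name)


wrapped = SafeWrapper(_PlainCache())
print(wrapped.state)    # (None, None) rather than an exception
print(wrapped.offset)   # 0, reached via __getattr__ delegation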
xinference/model/llm/mlx/distributed_models/qwen2.py
@@ -46,11 +46,10 @@ class Qwen2Model(_Qwen2Model, DistributedModelMixin):
 
         pipeline_rank = self.rank
         pipeline_size = self.world_size
-        if mask is None:
-            mask = create_attention_mask(h, cache)
 
         if cache is None:
             cache = [None] * self.num_layers
+        mask = create_attention_mask(h, cache[0])
 
         # Receive from the previous process in the pipeline
 
xinference/model/llm/sglang/core.py
@@ -362,9 +362,16 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion_chunk(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> CompletionChunk:
-        finish_reason = meta_info.get("finish_reason", None)
-        if isinstance(finish_reason, dict) and "type" in finish_reason:
-            finish_reason = finish_reason["type"]
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices: List[CompletionChoice] = [
             CompletionChoice(
                 text=output_text,
@@ -392,9 +399,16 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> Completion:
-        finish_reason = meta_info.get("finish_reason", None)
-        if isinstance(finish_reason, dict) and "type" in finish_reason:
-            finish_reason = finish_reason["type"]
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices = [
             CompletionChoice(
                 text=output_text,
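Both hunks apply the same coercion, so the behaviour can be summarised in one standalone sketch. The helper name and the sample meta_info dicts below are illustrative, not taken from the codebase; the point is that a dict-shaped, string-shaped, or missing finish_reason all normalize to Optional[str].

from typing import Any, Dict, Optional

def normalize_finish_reason(meta_info: Dict[str, Any]) -> Optional[str]:
    """Coerce a finish_reason that may be a dict, a str, or absent to Optional[str]."""
    raw = meta_info.get("finish_reason", None)
    if isinstance(raw, dict) and "type" in raw:
        return str(raw["type"]) if raw["type"] is not None else None
    if isinstance(raw, str):
        return raw
    return None

print(normalize_finish_reason({"finish_reason": {"type": "stop"}}))  # "stop"
print(normalize_finish_reason({"finish_reason": "length"}))          # "length"
print(normalize_finish_reason({}))                                   # None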
xinference/model/llm/tool_parsers/qwen_tool_parser.py
@@ -59,10 +59,28 @@ class QwenToolParser(ToolParser):
         Returns:
             str: Extracted JSON string or original string if no match found.
         """
+        # First try to find complete tool calls
         function_calls = self.tool_call_complete_regex.findall(function_call_str)
-        if len(function_calls) == 0:
-            return function_call_str
-        return function_calls[-1]
+        if len(function_calls) > 0:
+            return function_calls[-1]
+
+        # If no complete tool calls found, try to extract from incomplete tool calls
+        # Handle cases like <tool_call><tool_call>_city
+        if self.tool_call_start_token in function_call_str:
+            # Extract content between the last tool_call start token and end of string
+            last_start = function_call_str.rfind(self.tool_call_start_token)
+            potential_json = function_call_str[
+                last_start + len(self.tool_call_start_token) :
+            ]
+            # Remove any trailing tool_call end tokens
+            if self.tool_call_end_token in potential_json:
+                potential_json = potential_json.split(self.tool_call_end_token)[0]
+            # Clean up any extra whitespace
+            potential_json = potential_json.strip()
+            if potential_json:
+                return potential_json
+
+        return function_call_str
 
     def _parse_json_function_call_stream(
         self,
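A standalone sketch of the same fallback logic, assuming the Qwen markers are <tool_call> / </tool_call> and using a simplified regex in place of the parser's tool_call_complete_regex (both are assumptions for illustration, not the parser's actual attributes):

import re

TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"
# Simplified stand-in for the parser's complete-call regex.
COMPLETE_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)

def extract_tool_call_json(text: str) -> str:
    matches = COMPLETE_RE.findall(text)
    if matches:
        # Complete call present: return the last fully delimited JSON payload.
        return matches[-1]
    if TOOL_CALL_START in text:
        # Truncated stream chunk: take everything after the last start marker.
        candidate = text[text.rfind(TOOL_CALL_START) + len(TOOL_CALL_START):]
        candidate = candidate.split(TOOL_CALL_END)[0].strip()
        if candidate:
            return candidate
    return text

print(extract_tool_call_json('<tool_call>{"name": "get_weather", "arguments": {}}</tool_call>'))
print(extract_tool_call_json('<tool_call>{"name": "get_weather", "argu'))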
@@ -229,7 +247,14 @@ class QwenToolParser(ToolParser):
             try:
                 parsed_json = self._parse_json_function_call(function_call)
                 res = json.loads(parsed_json, strict=False)
-                results.append((None, res["name"], res["arguments"]))
+                # Validate that we have the required fields
+                if "name" in res and "arguments" in res:
+                    results.append((None, res["name"], res["arguments"]))
+                else:
+                    logger.warning(
+                        "Invalid tool call format, missing required fields: %s", res
+                    )
+                    results.append((function_call, None, None))
             except Exception as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
xinference/model/llm/transformers/chatglm.py
@@ -472,6 +472,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             r.prompt = self._process_messages(
                 r.prompt, tools=tools, tool_choice=tool_choice
             )
+            assert isinstance(
+                r.prompt, list
+            ), "r.prompt must be a list after processing"
             r.full_prompt = self.get_full_context(
                 r.prompt,
                 self.model_family.chat_template,  # type: ignore
xinference/model/llm/transformers/core.py
@@ -48,6 +48,7 @@ from ..utils import (
 )
 from .utils import (
     _get_pad_param,
+    convert_to_cache_cls,
     get_context_length,
     get_max_src_len,
     pad_prefill_tokens,
@@ -548,31 +549,48 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
+        # For decode phase, attention mask should match the full KV cache sequence length
+        # All requests in batch should have attention mask of length `seq_length`
+        for r in reqs:
+            # Get the actual sequence length for this request from its tracking
+            if "attention_mask_seq_len" not in r.extra_kwargs:
+                # Initialize with the current sequence length (full KV cache length)
+                r.extra_kwargs["attention_mask_seq_len"] = seq_length
+            else:
+                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
+                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
+                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
+
+        # For decode phase after KV cache merge, all requests should have attention mask
+        # that matches the merged sequence length
         for r in reqs:
-            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = max_len - real_len
 
-            if self._tokenizer.padding_side == "left":
-                x = torch.cat(
-                    [
-                        (
-                            torch.full((pad_len,), 0, dtype=torch.long)
-                            if pad_len > 0
-                            else torch.tensor([], dtype=torch.long)
-                        ),
-                        torch.ones((real_len,), dtype=torch.long),
-                    ]
-                )
+            # The attention mask should cover the full sequence length
+            if real_len < seq_length:
+                # Pad with zeros on the left to reach full sequence length
+                pad_len = seq_length - real_len
+
+                if self._tokenizer.padding_side == "left":
+                    x = torch.cat(
+                        [
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                            torch.ones((real_len,), dtype=torch.long),
+                        ]
+                    )
+                else:
+                    x = torch.cat(
+                        [
+                            torch.ones((real_len,), dtype=torch.long),
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                        ]
+                    )
             else:
-                x = torch.cat(
-                    [
-                        torch.ones((real_len,), dtype=torch.long),
-                        torch.full((pad_len,), 0, dtype=torch.long),
-                    ]
-                )
+                # Already at correct length
+                x = torch.ones((real_len,), dtype=torch.long)
+
             data.append(x)
+
         return torch.stack(data).to(self._device)
 
     def build_prefill_position_ids(
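Taken out of the diff context, the new decode-phase mask logic boils down to padding every request's mask with zeros up to the shared KV-cache length. A small standalone sketch with plain torch (the helper name and toy lengths are illustrative, assuming left padding):

import torch

def build_decode_masks(real_lens, seq_length, padding_side="left"):
    """Pad each request's attention mask with zeros to the shared KV length."""
    masks = []
    for real_len in real_lens:
        if real_len < seq_length:
            pad = torch.zeros(seq_length - real_len, dtype=torch.long)
            ones = torch.ones(real_len, dtype=torch.long)
            x = torch.cat([pad, ones]) if padding_side == "left" else torch.cat([ones, pad])
        else:
            x = torch.ones(real_len, dtype=torch.long)
        masks.append(x)
    return torch.stack(masks)

# Two requests of lengths 3 and 5 sharing a merged KV cache of length 5.
print(build_decode_masks([3, 5], 5))
# tensor([[0, 0, 1, 1, 1],
#         [1, 1, 1, 1, 1]])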
@@ -713,30 +731,105 @@ class PytorchModel(LLM):
         from torch.nn.functional import pad
         from transformers import DynamicCache
 
+        # Handle case where past_cache is None
+        if past_cache is None:
+            return new_cache
+
+        # Convert both caches to DynamicCache if not already
+        if not isinstance(past_cache, DynamicCache):
+            past_cache = convert_to_cache_cls(past_cache)
+        if not isinstance(new_cache, DynamicCache):
+            new_cache = convert_to_cache_cls(new_cache)
+
         _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
-        past_seq_len = past_cache[0][0].shape[seq_len_idx]
-        new_seq_len = new_cache[0][0].shape[seq_len_idx]
+
+        # Handle empty caches
+        if len(past_cache) == 0:
+            return new_cache
+        if len(new_cache) == 0:
+            return past_cache
+
+        # Get first layer seq_len safely
+        past_first = past_cache[0] if len(past_cache) > 0 else (None, None)
+        new_first = new_cache[0] if len(new_cache) > 0 else (None, None)
+
+        if past_first[0] is None or past_first[1] is None:
+            return new_cache
+        if new_first[0] is None or new_first[1] is None:
+            return past_cache
+
+        past_seq_len = past_first[0].shape[seq_len_idx]
+        new_seq_len = new_first[0].shape[seq_len_idx]
+
+        # Pad the shorter cache
         if past_seq_len != new_seq_len:
-            padding_target = new_cache if past_seq_len > new_seq_len else past_cache
-            padding_len = abs(past_seq_len - new_seq_len)
+            if past_seq_len > new_seq_len:
+                padding_target = new_cache
+                padding_len = past_seq_len - new_seq_len
+            else:
+                padding_target = past_cache
+                padding_len = new_seq_len - past_seq_len
+
             pad_param = _get_pad_param(seq_len_idx, padding_len)
             for idx in range(len(padding_target)):
                 k = padding_target.key_cache[idx]
                 v = padding_target.value_cache[idx]
-                _k = pad(k, pad_param)
-                _v = pad(v, pad_param)
-                padding_target.key_cache[idx] = _k
-                padding_target.value_cache[idx] = _v
+                if k is not None and v is not None:
+                    padding_target.key_cache[idx] = pad(k, pad_param)
+                    padding_target.value_cache[idx] = pad(v, pad_param)
 
+        # Merge caches
         ret_kv = DynamicCache()
-        for idx in range(len(past_cache)):
-            k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
-            v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-            ret_kv.update(
-                torch.cat((k1, k2), 0).contiguous(),
-                torch.cat((v1, v2), 0).contiguous(),
-                idx,
-            )
+        max_layers = max(len(past_cache), len(new_cache))
+
+        for idx in range(max_layers):
+            past_k = past_cache.key_cache[idx] if idx < len(past_cache) else None
+            past_v = past_cache.value_cache[idx] if idx < len(past_cache) else None
+            new_k = new_cache.key_cache[idx] if idx < len(new_cache) else None
+            new_v = new_cache.value_cache[idx] if idx < len(new_cache) else None
+
+            if past_k is not None and new_k is not None:
+                # Both layers exist - validate tensor dimensions before concatenation
+                if past_k.dim() != new_k.dim():
+                    logger.error(
+                        f"KV cache tensor dimension mismatch at layer {idx}: "
+                        f"past_k.dim()={past_k.dim()}, new_k.dim()={new_k.dim()}"
+                    )
+                    # Use the cache with higher batch size
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+                    continue
+
+                if past_k.shape[1:] == new_k.shape[1:]:
+                    # Shapes are compatible, concatenate along batch dimension
+                    ret_kv.update(
+                        torch.cat((new_k, past_k), 0).contiguous(),
+                        torch.cat((new_v, past_v), 0).contiguous(),
+                        idx,
+                    )
+                else:
+                    # Detailed logging for shape mismatch
+                    logger.warning(
+                        f"KV cache shape mismatch at layer {idx}: "
+                        f"past_k.shape={past_k.shape}, new_k.shape={new_k.shape}. "
+                        f"This may be due to inconsistent batch sizes in continuous batching."
+                    )
+
+                    # Choose the cache with larger batch size to preserve more data
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+            elif past_k is not None:
+                ret_kv.update(past_k, past_v, idx)
+            elif new_k is not None:
+                ret_kv.update(new_k, new_v, idx)
+            else:
+                # both None, fill with None
+                ret_kv.update(None, None, idx)
+
         return ret_kv
 
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
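Reduced to its core, the merge pads the shorter cache along the sequence axis and then concatenates per layer along the batch axis, new requests first. A standalone sketch with toy tensor sizes, assuming the usual [batch, heads, seq_len, head_dim] layout and transformers' public DynamicCache.update / key_cache / value_cache API (the make_cache helper and the sizes are illustrative, not part of the diff):

import torch
from torch.nn.functional import pad
from transformers import DynamicCache

def make_cache(batch, seq_len, layers=2, heads=2, head_dim=4):
    """Toy DynamicCache with [batch, heads, seq_len, head_dim] tensors per layer."""
    cache = DynamicCache()
    for idx in range(layers):
        cache.update(
            torch.zeros(batch, heads, seq_len, head_dim),
            torch.zeros(batch, heads, seq_len, head_dim),
            idx,
        )
    return cache

past, new = make_cache(batch=2, seq_len=7), make_cache(batch=1, seq_len=5)

merged = DynamicCache()
for idx in range(len(past)):
    past_k, past_v = past.key_cache[idx], past.value_cache[idx]
    new_k, new_v = new.key_cache[idx], new.value_cache[idx]
    # Left-pad the shorter tensors along the sequence axis (dim 2) ...
    pad_len = past_k.shape[2] - new_k.shape[2]
    if pad_len > 0:
        new_k = pad(new_k, (0, 0, pad_len, 0))
        new_v = pad(new_v, (0, 0, pad_len, 0))
    # ... then stack the two batches per layer, new requests first.
    merged.update(
        torch.cat((new_k, past_k), 0).contiguous(),
        torch.cat((new_v, past_v), 0).contiguous(),
        idx,
    )

print(merged.key_cache[0].shape)  # torch.Size([3, 2, 7, 4])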