xinference 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (38)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +462 -3
  3. xinference/client/restful/async_restful_client.py +158 -5
  4. xinference/client/restful/restful_client.py +131 -0
  5. xinference/core/supervisor.py +12 -0
  6. xinference/model/audio/model_spec.json +20 -20
  7. xinference/model/image/model_spec.json +159 -159
  8. xinference/model/llm/__init__.py +2 -2
  9. xinference/model/llm/llm_family.json +843 -180
  10. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  11. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  12. xinference/model/llm/sglang/core.py +20 -6
  13. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  14. xinference/model/llm/transformers/chatglm.py +3 -0
  15. xinference/model/llm/transformers/core.py +129 -36
  16. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  17. xinference/model/llm/transformers/utils.py +23 -0
  18. xinference/model/llm/utils.py +37 -24
  19. xinference/model/llm/vllm/core.py +128 -69
  20. xinference/model/utils.py +74 -31
  21. xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
  22. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  23. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  24. xinference/types.py +9 -0
  25. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  26. xinference/ui/web/ui/build/index.html +1 -1
  27. xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.45e78536.js} +3 -3
  28. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  29. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  30. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/METADATA +7 -5
  31. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/RECORD +36 -35
  32. xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +0 -1
  33. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
  34. /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -0
  35. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  36. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  37. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  38. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/core.py
@@ -162,3 +162,44 @@ class DistributedModelMixin:
         self.layers = self.layers[: self.end_idx]
         self.layers[: self.start_idx] = [None] * self.start_idx
         self.num_layers = len(self.layers) - self.start_idx
+
+
+class SafeKVCache:
+    """
+    A safe wrapper around mlx_lm's KVCache that handles None keys gracefully.
+    This is needed because mlx_lm's generate function accesses cache.state
+    before the cache is properly initialized.
+    """
+
+    def __init__(self):
+        from mlx_lm.models.cache import KVCache
+
+        self._cache = KVCache()
+
+    @property
+    def state(self):
+        # Safe access to state property
+        if self._cache.keys is None:
+            return None, None
+        if self._cache.offset == self._cache.keys.shape[2]:
+            return self._cache.keys, self._cache.values
+        else:
+            return (
+                self._cache.keys[..., : self._cache.offset, :],
+                self._cache.values[..., : self._cache.offset, :],
+            )
+
+    @state.setter
+    def state(self, v):
+        # Safe setter for state property
+        if v is None or v[0] is None:
+            self._cache.keys = None
+            self._cache.values = None
+            self._cache.offset = 0
+        else:
+            self._cache.keys, self._cache.values = v
+            self._cache.offset = self._cache.keys.shape[2]
+
+    def __getattr__(self, name):
+        # Delegate all other attributes and methods to the underlying cache
+        return getattr(self._cache, name)
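The pattern used here, guarding a lazily initialized state property while forwarding everything else through __getattr__, can be illustrated with a minimal self-contained sketch. _PlainCache and SafeWrapper below are hypothetical stand-ins, not part of the diff; only the guarded property and the delegation mirror the change above.

class _PlainCache:
    """Hypothetical stand-in for a cache whose keys are populated lazily."""

    def __init__(self):
        self.keys = None      # not set until the first update
        self.values = None
        self.offset = 0

    @property
    def state(self):
        # Mirrors the failure mode: raises if accessed before initialization.
        return self.keys[..., : self.offset, :], self.values[..., : self.offset, :]


class SafeWrapper:
    """Guard `state` and delegate everything else to the wrapped cache."""

    def __init__(self, cache):
        self._cache = cache

    @property
    def state(self):
        if self._cache.keys is None:
            return None, None   # safe default instead of a TypeError
        return self._cache.state

    def __getattr__(self, name):
        # Only called when normal lookup fails, so it forwards the rest.
        return getattr(self._cache, name)


wrapped = SafeWrapper(_PlainCache())
print(wrapped.state)    # (None, None) rather than an exception
print(wrapped.offset)   # 0, reached via __getattr__ delegation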
xinference/model/llm/mlx/distributed_models/qwen2.py
@@ -46,11 +46,10 @@ class Qwen2Model(_Qwen2Model, DistributedModelMixin):
 
         pipeline_rank = self.rank
         pipeline_size = self.world_size
-        if mask is None:
-            mask = create_attention_mask(h, cache)
 
         if cache is None:
             cache = [None] * self.num_layers
+        mask = create_attention_mask(h, cache[0])
 
         # Receive from the previous process in the pipeline
 
xinference/model/llm/sglang/core.py
@@ -362,9 +362,16 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion_chunk(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> CompletionChunk:
-        finish_reason = meta_info.get("finish_reason", None)
-        if isinstance(finish_reason, dict) and "type" in finish_reason:
-            finish_reason = finish_reason["type"]
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices: List[CompletionChoice] = [
             CompletionChoice(
                 text=output_text,
@@ -392,9 +399,16 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> Completion:
-        finish_reason = meta_info.get("finish_reason", None)
-        if isinstance(finish_reason, dict) and "type" in finish_reason:
-            finish_reason = finish_reason["type"]
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices = [
             CompletionChoice(
                 text=output_text,
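Both hunks apply the same coercion, so the behaviour can be summarised in one standalone sketch. The helper name and the sample meta_info dicts below are illustrative, not taken from the codebase; the point is that a dict-shaped, string-shaped, or missing finish_reason all normalize to Optional[str].

from typing import Any, Dict, Optional

def normalize_finish_reason(meta_info: Dict[str, Any]) -> Optional[str]:
    """Coerce a finish_reason that may be a dict, a str, or absent to Optional[str]."""
    raw = meta_info.get("finish_reason", None)
    if isinstance(raw, dict) and "type" in raw:
        return str(raw["type"]) if raw["type"] is not None else None
    if isinstance(raw, str):
        return raw
    return None

print(normalize_finish_reason({"finish_reason": {"type": "stop"}}))  # "stop"
print(normalize_finish_reason({"finish_reason": "length"}))          # "length"
print(normalize_finish_reason({}))                                   # None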
xinference/model/llm/tool_parsers/qwen_tool_parser.py
@@ -59,10 +59,28 @@ class QwenToolParser(ToolParser):
         Returns:
             str: Extracted JSON string or original string if no match found.
         """
+        # First try to find complete tool calls
         function_calls = self.tool_call_complete_regex.findall(function_call_str)
-        if len(function_calls) == 0:
-            return function_call_str
-        return function_calls[-1]
+        if len(function_calls) > 0:
+            return function_calls[-1]
+
+        # If no complete tool calls found, try to extract from incomplete tool calls
+        # Handle cases like <tool_call><tool_call>_city
+        if self.tool_call_start_token in function_call_str:
+            # Extract content between the last tool_call start token and end of string
+            last_start = function_call_str.rfind(self.tool_call_start_token)
+            potential_json = function_call_str[
+                last_start + len(self.tool_call_start_token) :
+            ]
+            # Remove any trailing tool_call end tokens
+            if self.tool_call_end_token in potential_json:
+                potential_json = potential_json.split(self.tool_call_end_token)[0]
+            # Clean up any extra whitespace
+            potential_json = potential_json.strip()
+            if potential_json:
+                return potential_json
+
+        return function_call_str
 
     def _parse_json_function_call_stream(
         self,
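A standalone sketch of the same fallback logic, assuming the Qwen markers are <tool_call> / </tool_call> and using a simplified regex in place of the parser's tool_call_complete_regex (both are assumptions for illustration, not the parser's actual attributes):

import re

TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"
# Simplified stand-in for the parser's complete-call regex.
COMPLETE_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)

def extract_tool_call_json(text: str) -> str:
    matches = COMPLETE_RE.findall(text)
    if matches:
        # Complete call present: return the last fully delimited JSON payload.
        return matches[-1]
    if TOOL_CALL_START in text:
        # Truncated stream chunk: take everything after the last start marker.
        candidate = text[text.rfind(TOOL_CALL_START) + len(TOOL_CALL_START):]
        candidate = candidate.split(TOOL_CALL_END)[0].strip()
        if candidate:
            return candidate
    return text

print(extract_tool_call_json('<tool_call>{"name": "get_weather", "arguments": {}}</tool_call>'))
print(extract_tool_call_json('<tool_call>{"name": "get_weather", "argu'))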
@@ -229,7 +247,14 @@ class QwenToolParser(ToolParser):
             try:
                 parsed_json = self._parse_json_function_call(function_call)
                 res = json.loads(parsed_json, strict=False)
-                results.append((None, res["name"], res["arguments"]))
+                # Validate that we have the required fields
+                if "name" in res and "arguments" in res:
+                    results.append((None, res["name"], res["arguments"]))
+                else:
+                    logger.warning(
+                        "Invalid tool call format, missing required fields: %s", res
+                    )
+                    results.append((function_call, None, None))
             except Exception as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
xinference/model/llm/transformers/chatglm.py
@@ -472,6 +472,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             r.prompt = self._process_messages(
                 r.prompt, tools=tools, tool_choice=tool_choice
             )
+            assert isinstance(
+                r.prompt, list
+            ), "r.prompt must be a list after processing"
             r.full_prompt = self.get_full_context(
                 r.prompt,
                 self.model_family.chat_template,  # type: ignore
xinference/model/llm/transformers/core.py
@@ -48,6 +48,7 @@ from ..utils import (
 )
 from .utils import (
     _get_pad_param,
+    convert_to_cache_cls,
     get_context_length,
     get_max_src_len,
     pad_prefill_tokens,
@@ -548,31 +549,48 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
+        # For decode phase, attention mask should match the full KV cache sequence length
+        # All requests in batch should have attention mask of length `seq_length`
+        for r in reqs:
+            # Get the actual sequence length for this request from its tracking
+            if "attention_mask_seq_len" not in r.extra_kwargs:
+                # Initialize with the current sequence length (full KV cache length)
+                r.extra_kwargs["attention_mask_seq_len"] = seq_length
+            else:
+                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
+                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
+                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
+
+        # For decode phase after KV cache merge, all requests should have attention mask
+        # that matches the merged sequence length
         for r in reqs:
-            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = max_len - real_len
 
-            if self._tokenizer.padding_side == "left":
-                x = torch.cat(
-                    [
-                        (
-                            torch.full((pad_len,), 0, dtype=torch.long)
-                            if pad_len > 0
-                            else torch.tensor([], dtype=torch.long)
-                        ),
-                        torch.ones((real_len,), dtype=torch.long),
-                    ]
-                )
+            # The attention mask should cover the full sequence length
+            if real_len < seq_length:
+                # Pad with zeros on the left to reach full sequence length
+                pad_len = seq_length - real_len
+
+                if self._tokenizer.padding_side == "left":
+                    x = torch.cat(
+                        [
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                            torch.ones((real_len,), dtype=torch.long),
+                        ]
+                    )
+                else:
+                    x = torch.cat(
+                        [
+                            torch.ones((real_len,), dtype=torch.long),
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                        ]
+                    )
             else:
-                x = torch.cat(
-                    [
-                        torch.ones((real_len,), dtype=torch.long),
-                        torch.full((pad_len,), 0, dtype=torch.long),
-                    ]
-                )
+                # Already at correct length
+                x = torch.ones((real_len,), dtype=torch.long)
+
             data.append(x)
+
         return torch.stack(data).to(self._device)
 
     def build_prefill_position_ids(
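Taken out of the diff context, the new decode-phase mask logic boils down to padding every request's mask with zeros up to the shared KV-cache length. A small standalone sketch with plain torch (the helper name and toy lengths are illustrative, assuming left padding):

import torch

def build_decode_masks(real_lens, seq_length, padding_side="left"):
    """Pad each request's attention mask with zeros to the shared KV length."""
    masks = []
    for real_len in real_lens:
        if real_len < seq_length:
            pad = torch.zeros(seq_length - real_len, dtype=torch.long)
            ones = torch.ones(real_len, dtype=torch.long)
            x = torch.cat([pad, ones]) if padding_side == "left" else torch.cat([ones, pad])
        else:
            x = torch.ones(real_len, dtype=torch.long)
        masks.append(x)
    return torch.stack(masks)

# Two requests of lengths 3 and 5 sharing a merged KV cache of length 5.
print(build_decode_masks([3, 5], 5))
# tensor([[0, 0, 1, 1, 1],
#         [1, 1, 1, 1, 1]])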
@@ -713,30 +731,105 @@ class PytorchModel(LLM):
         from torch.nn.functional import pad
         from transformers import DynamicCache
 
+        # Handle case where past_cache is None
+        if past_cache is None:
+            return new_cache
+
+        # Convert both caches to DynamicCache if not already
+        if not isinstance(past_cache, DynamicCache):
+            past_cache = convert_to_cache_cls(past_cache)
+        if not isinstance(new_cache, DynamicCache):
+            new_cache = convert_to_cache_cls(new_cache)
+
         _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
-        past_seq_len = past_cache[0][0].shape[seq_len_idx]
-        new_seq_len = new_cache[0][0].shape[seq_len_idx]
+
+        # Handle empty caches
+        if len(past_cache) == 0:
+            return new_cache
+        if len(new_cache) == 0:
+            return past_cache
+
+        # Get first layer seq_len safely
+        past_first = past_cache[0] if len(past_cache) > 0 else (None, None)
+        new_first = new_cache[0] if len(new_cache) > 0 else (None, None)
+
+        if past_first[0] is None or past_first[1] is None:
+            return new_cache
+        if new_first[0] is None or new_first[1] is None:
+            return past_cache
+
+        past_seq_len = past_first[0].shape[seq_len_idx]
+        new_seq_len = new_first[0].shape[seq_len_idx]
+
+        # Pad the shorter cache
         if past_seq_len != new_seq_len:
-            padding_target = new_cache if past_seq_len > new_seq_len else past_cache
-            padding_len = abs(past_seq_len - new_seq_len)
+            if past_seq_len > new_seq_len:
+                padding_target = new_cache
+                padding_len = past_seq_len - new_seq_len
+            else:
+                padding_target = past_cache
+                padding_len = new_seq_len - past_seq_len
+
             pad_param = _get_pad_param(seq_len_idx, padding_len)
             for idx in range(len(padding_target)):
                 k = padding_target.key_cache[idx]
                 v = padding_target.value_cache[idx]
-                _k = pad(k, pad_param)
-                _v = pad(v, pad_param)
-                padding_target.key_cache[idx] = _k
-                padding_target.value_cache[idx] = _v
+                if k is not None and v is not None:
+                    padding_target.key_cache[idx] = pad(k, pad_param)
+                    padding_target.value_cache[idx] = pad(v, pad_param)
 
+        # Merge caches
         ret_kv = DynamicCache()
-        for idx in range(len(past_cache)):
-            k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
-            v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-            ret_kv.update(
-                torch.cat((k1, k2), 0).contiguous(),
-                torch.cat((v1, v2), 0).contiguous(),
-                idx,
-            )
+        max_layers = max(len(past_cache), len(new_cache))
+
+        for idx in range(max_layers):
+            past_k = past_cache.key_cache[idx] if idx < len(past_cache) else None
+            past_v = past_cache.value_cache[idx] if idx < len(past_cache) else None
+            new_k = new_cache.key_cache[idx] if idx < len(new_cache) else None
+            new_v = new_cache.value_cache[idx] if idx < len(new_cache) else None
+
+            if past_k is not None and new_k is not None:
+                # Both layers exist - validate tensor dimensions before concatenation
+                if past_k.dim() != new_k.dim():
+                    logger.error(
+                        f"KV cache tensor dimension mismatch at layer {idx}: "
+                        f"past_k.dim()={past_k.dim()}, new_k.dim()={new_k.dim()}"
+                    )
+                    # Use the cache with higher batch size
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+                    continue
+
+                if past_k.shape[1:] == new_k.shape[1:]:
+                    # Shapes are compatible, concatenate along batch dimension
+                    ret_kv.update(
+                        torch.cat((new_k, past_k), 0).contiguous(),
+                        torch.cat((new_v, past_v), 0).contiguous(),
+                        idx,
+                    )
+                else:
+                    # Detailed logging for shape mismatch
+                    logger.warning(
+                        f"KV cache shape mismatch at layer {idx}: "
+                        f"past_k.shape={past_k.shape}, new_k.shape={new_k.shape}. "
+                        f"This may be due to inconsistent batch sizes in continuous batching."
+                    )
+
+                    # Choose the cache with larger batch size to preserve more data
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+            elif past_k is not None:
+                ret_kv.update(past_k, past_v, idx)
+            elif new_k is not None:
+                ret_kv.update(new_k, new_v, idx)
+            else:
+                # both None, fill with None
+                ret_kv.update(None, None, idx)
+
         return ret_kv
 
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
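Reduced to its core, the merge pads the shorter cache along the sequence axis and then concatenates per layer along the batch axis, new requests first. A standalone sketch with toy tensor sizes, assuming the usual [batch, heads, seq_len, head_dim] layout and transformers' public DynamicCache.update / key_cache / value_cache API (the make_cache helper and the sizes are illustrative, not part of the diff):

import torch
from torch.nn.functional import pad
from transformers import DynamicCache

def make_cache(batch, seq_len, layers=2, heads=2, head_dim=4):
    """Toy DynamicCache with [batch, heads, seq_len, head_dim] tensors per layer."""
    cache = DynamicCache()
    for idx in range(layers):
        cache.update(
            torch.zeros(batch, heads, seq_len, head_dim),
            torch.zeros(batch, heads, seq_len, head_dim),
            idx,
        )
    return cache

past, new = make_cache(batch=2, seq_len=7), make_cache(batch=1, seq_len=5)

merged = DynamicCache()
for idx in range(len(past)):
    past_k, past_v = past.key_cache[idx], past.value_cache[idx]
    new_k, new_v = new.key_cache[idx], new.value_cache[idx]
    # Left-pad the shorter tensors along the sequence axis (dim 2) ...
    pad_len = past_k.shape[2] - new_k.shape[2]
    if pad_len > 0:
        new_k = pad(new_k, (0, 0, pad_len, 0))
        new_v = pad(new_v, (0, 0, pad_len, 0))
    # ... then stack the two batches per layer, new requests first.
    merged.update(
        torch.cat((new_k, past_k), 0).contiguous(),
        torch.cat((new_v, past_v), 0).contiguous(),
        idx,
    )

print(merged.key_cache[0].shape)  # torch.Size([3, 2, 7, 4])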