PyPI - xinference - Versions diffs - 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (328)

xinference/model/llm/mlx/distributed_models/core.py CHANGED Viewed

@@ -162,3 +162,44 @@ class DistributedModelMixin:
         self.layers = self.layers[: self.end_idx]
         self.layers[: self.start_idx] = [None] * self.start_idx
         self.num_layers = len(self.layers) - self.start_idx
+class SafeKVCache:
+    """
+    A safe wrapper around mlx_lm's KVCache that handles None keys gracefully.
+    This is needed because mlx_lm's generate function accesses cache.state
+    before the cache is properly initialized.
+    """
+    def __init__(self):
+        from mlx_lm.models.cache import KVCache
+        self._cache = KVCache()
+    @property
+    def state(self):
+        # Safe access to state property
+        if self._cache.keys is None:
+            return None, None
+        if self._cache.offset == self._cache.keys.shape[2]:
+            return self._cache.keys, self._cache.values
+        else:
+            return (
+                self._cache.keys[..., : self._cache.offset, :],
+                self._cache.values[..., : self._cache.offset, :],
+            )
+    @state.setter
+    def state(self, v):
+        # Safe setter for state property
+        if v is None or v[0] is None:
+            self._cache.keys = None
+            self._cache.values = None
+            self._cache.offset = 0
+        else:
+            self._cache.keys, self._cache.values = v
+            self._cache.offset = self._cache.keys.shape[2]
+    def __getattr__(self, name):
+        # Delegate all other attributes and methods to the underlying cache
+        return getattr(self._cache, name)

xinference/model/llm/mlx/distributed_models/qwen2.py CHANGED Viewed

@@ -46,11 +46,10 @@ class Qwen2Model(_Qwen2Model, DistributedModelMixin):
         pipeline_rank = self.rank
         pipeline_size = self.world_size
-        if mask is None:
-            mask = create_attention_mask(h, cache)
         if cache is None:
             cache = [None] * self.num_layers
+        mask = create_attention_mask(h, cache[0])
         # Receive from the previous process in the pipeline

xinference/model/llm/sglang/core.py CHANGED Viewed

@@ -73,6 +73,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     json_schema: Optional[dict]
+    response_format: dict
 try:
@@ -317,13 +318,16 @@ class SGLANGModel(LLM):
         stream_options = generate_config.get("stream_options")
         generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)
-        json_schema = (
-            generate_config.pop("response_format", {})  # type: ignore
-            .pop("json_schema", {})
-            .pop("schema", {})
-        )
-        if json_schema:
-            generate_config.setdefault("json_schema", json.dumps(json_schema))  # type: ignore
+        response_format = generate_config.pop("response_format", None)
+        if response_format:
+            json_schema_config = response_format.pop("json_schema", None)
+            json_schema = None
+            if "schema_" in json_schema_config:
+                json_schema = json_schema_config.pop("schema_")
+            elif "schema" in json_schema_config:
+                json_schema = json_schema_config.pop("schema")
+            if json_schema:
+                generate_config.setdefault("json_schema", json.dumps(json_schema))  # type: ignore
         return generate_config
@@ -356,22 +360,38 @@ class SGLANGModel(LLM):
     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str
+        request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> CompletionChunk:
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices: List[CompletionChoice] = [
             CompletionChoice(
                 text=output_text,
                 index=0,
                 logprobs=None,
-                finish_reason=None,
+                finish_reason=finish_reason,
             )
         ]
+        usage = CompletionUsage(
+            prompt_tokens=meta_info["prompt_tokens"],
+            completion_tokens=meta_info["completion_tokens"],
+            total_tokens=meta_info["prompt_tokens"] + meta_info["completion_tokens"],
+        )
         chunk = CompletionChunk(
             id=request_id,
             object="text_completion",
             created=int(time.time()),
             model=model,
             choices=choices,
+            usage=usage,
         )
         return chunk
@@ -379,12 +399,22 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> Completion:
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices = [
             CompletionChoice(
                 text=output_text,
                 index=0,
                 logprobs=None,
-                finish_reason=None,
+                finish_reason=finish_reason,
             )
         ]
@@ -513,7 +543,10 @@ class SGLANGModel(LLM):
                     prompt, image_data, **sanitized_generate_config
                 ):
                     chunk = self._convert_state_to_completion_chunk(
-                        request_id, self.model_uid, output_text=out
+                        request_id,
+                        self.model_uid,
+                        output_text=out,
+                        meta_info=meta_info,
                     )
                     complete_response += out
                     finish_reason = meta_info["finish_reason"]

xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py CHANGED Viewed

@@ -23,12 +23,27 @@ class DeepseekR1ToolParser(ToolParser):
         Initialize the DeepSeek R1 tool parser.
         """
         super().__init__()
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
+        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"
         # Regex pattern to match DeepSeek R1 tool call format
         self.tool_calls_regex = (
             r"<\｜tool▁call▁begin｜>function<\｜tool▁sep｜>([^\n]+)\n"
             r"```json\n(.*?)\n```<\｜tool▁call▁end｜>"
         )
+        # Regex pattern to match the entire tool-calls wrapper block.
+        # We intentionally do NOT match <think> blocks here so that the
+        # "text before" chunk will include both the think block and any
+        # narrative text up to the tool calls wrapper, yielding exactly two
+        # blocks when there is a single tool calls section:
+        # [before_text_including_think, tool_calls_wrapper_block]
+        self.content_regex = r"(<\｜tool▁calls▁begin｜>.*?<\｜tool▁calls▁end｜>)"
     def extract_tool_calls(
         self, model_output: str
     ) -> List[Tuple[Optional[str], Optional[str], Optional[dict]]]:
@@ -56,49 +71,96 @@ class DeepseekR1ToolParser(ToolParser):
             >>> print(result)
             [(None, 'get_current_weather', {'location': 'Beijing'})]
         """
-        matches = re.findall(self.tool_calls_regex, model_output, re.DOTALL)
-        if not matches:
-            # No tool calls found, return the original output as content
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
             return [(model_output, None, None)]
+        # Get all content blocks (text, thinking blocks, tool calls)
+        function_calls = self._get_function_calls(model_output)
         # Use set for deduplication of identical tool calls
         tool_calls = set()
         results: List[Tuple[Optional[str], Optional[str], Optional[dict]]] = []
-        for func_name, raw_json in matches:
-            func_and_args = None
-            try:
-                # Parse JSON arguments
-                func_and_args = json.loads(raw_json)
-                # Create hashable representation for deduplication
-                arguments_hashable = frozenset(func_and_args.items())
-                tool_call_tuple = (
-                    None,  # No content error
-                    func_name,
-                    func_and_args,
+        for content_block in function_calls:
+            # Check if this block is a tool call
+            if (
+                self.tool_call_start_token in content_block
+                and self.tool_call_end_token in content_block
+            ):
+                # Extract function name and arguments from tool call block
+                matches = re.findall(self.tool_calls_regex, content_block, re.DOTALL)
+                if not matches:
+                    # Malformed tool call, treat as regular content
+                    results.append((content_block, None, None))
+                    continue
+                func_name, raw_json = matches[0]  # Take the first match
+                func_and_args = None
+                try:
+                    # Parse JSON arguments
+                    func_and_args = json.loads(raw_json)
+                    # Create hashable representation for deduplication
+                    arguments_hashable = frozenset(func_and_args.items())
+                    tool_call_tuple = (
+                        None,  # No content error
+                        func_name,
+                        func_and_args,
+                    )
+                except Exception as e:
+                    # JSON parsing failed, treat as raw content
+                    logger.warning(
+                        f"Failed to parse tool call JSON: {raw_json}, error: {e}"
+                    )
+                    tool_call_tuple = (raw_json, None, None)
+                    arguments_hashable = None
+                # Create deduplication key
+                dedup_key = (
+                    (func_name, arguments_hashable)
+                    if func_and_args is not None
+                    else raw_json
                 )
-            except Exception as e:
-                # JSON parsing failed, treat as raw content
-                logger.warning(
-                    f"Failed to parse tool call JSON: {raw_json}, error: {e}"
-                )
-                tool_call_tuple = (raw_json, None, None)
-                arguments_hashable = None
-            # Create deduplication key
-            dedup_key = (
-                (func_name, arguments_hashable)
-                if func_and_args is not None
-                else raw_json
-            )
-            # Add to results if not already seen
-            if dedup_key not in tool_calls:
-                tool_calls.add(dedup_key)
-                results.append(tool_call_tuple)
+                # Add to results if not already seen
+                if dedup_key not in tool_calls:
+                    tool_calls.add(dedup_key)
+                    results.append(tool_call_tuple)
+            else:
+                # This is regular content (text or thinking block), add as-is
+                if content_block.strip():  # Only add non-empty content
+                    results.append((content_block, None, None))
         return results
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+        Args:
+            model_output (str): The complete model output to parse.
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
     def extract_tool_calls_streaming(
         self, previous_text: List[str], current_text: str, delta_text: str
     ) -> Optional[Any]:

xinference/model/llm/tool_parsers/qwen_tool_parser.py CHANGED Viewed

@@ -59,10 +59,28 @@ class QwenToolParser(ToolParser):
         Returns:
             str: Extracted JSON string or original string if no match found.
         """
+        # First try to find complete tool calls
         function_calls = self.tool_call_complete_regex.findall(function_call_str)
-        if len(function_calls) == 0:
-            return function_call_str
-        return function_calls[-1]
+        if len(function_calls) > 0:
+            return function_calls[-1]
+        # If no complete tool calls found, try to extract from incomplete tool calls
+        # Handle cases like <tool_call><tool_call>_city
+        if self.tool_call_start_token in function_call_str:
+            # Extract content between the last tool_call start token and end of string
+            last_start = function_call_str.rfind(self.tool_call_start_token)
+            potential_json = function_call_str[
+                last_start + len(self.tool_call_start_token) :
+            ]
+            # Remove any trailing tool_call end tokens
+            if self.tool_call_end_token in potential_json:
+                potential_json = potential_json.split(self.tool_call_end_token)[0]
+            # Clean up any extra whitespace
+            potential_json = potential_json.strip()
+            if potential_json:
+                return potential_json
+        return function_call_str
     def _parse_json_function_call_stream(
         self,
@@ -229,7 +247,14 @@ class QwenToolParser(ToolParser):
                 try:
                     parsed_json = self._parse_json_function_call(function_call)
                     res = json.loads(parsed_json, strict=False)
-                    results.append((None, res["name"], res["arguments"]))
+                    # Validate that we have the required fields
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Invalid tool call format, missing required fields: %s", res
+                        )
+                        results.append((function_call, None, None))
                 except Exception as e:
                     logger.error(
                         "Can't parse single qwen tool call output: %s. Error: %s",

xinference/model/llm/transformers/chatglm.py CHANGED Viewed

@@ -472,6 +472,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                     r.prompt = self._process_messages(
                         r.prompt, tools=tools, tool_choice=tool_choice
                     )
+                    assert isinstance(
+                        r.prompt, list
+                    ), "r.prompt must be a list after processing"
                     r.full_prompt = self.get_full_context(
                         r.prompt,
                         self.model_family.chat_template,  # type: ignore

xinference/model/llm/transformers/core.py CHANGED Viewed

@@ -48,6 +48,7 @@ from ..utils import (
 )
 from .utils import (
     _get_pad_param,
+    convert_to_cache_cls,
     get_context_length,
     get_max_src_len,
     pad_prefill_tokens,
@@ -548,31 +549,48 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
+        # For decode phase, attention mask should match the full KV cache sequence length
+        # All requests in batch should have attention mask of length `seq_length`
+        for r in reqs:
+            # Get the actual sequence length for this request from its tracking
+            if "attention_mask_seq_len" not in r.extra_kwargs:
+                # Initialize with the current sequence length (full KV cache length)
+                r.extra_kwargs["attention_mask_seq_len"] = seq_length
+            else:
+                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
+                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
+                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
+        # For decode phase after KV cache merge, all requests should have attention mask
+        # that matches the merged sequence length
         for r in reqs:
-            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = max_len - real_len
-            if self._tokenizer.padding_side == "left":
-                x = torch.cat(
-                    [
-                        (
-                            torch.full((pad_len,), 0, dtype=torch.long)
-                            if pad_len > 0
-                            else torch.tensor([], dtype=torch.long)
-                        ),
-                        torch.ones((real_len,), dtype=torch.long),
-                    ]
-                )
+            # The attention mask should cover the full sequence length
+            if real_len < seq_length:
+                # Pad with zeros on the left to reach full sequence length
+                pad_len = seq_length - real_len
+                if self._tokenizer.padding_side == "left":
+                    x = torch.cat(
+                        [
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                            torch.ones((real_len,), dtype=torch.long),
+                        ]
+                    )
+                else:
+                    x = torch.cat(
+                        [
+                            torch.ones((real_len,), dtype=torch.long),
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                        ]
+                    )
             else:
-                x = torch.cat(
-                    [
-                        torch.ones((real_len,), dtype=torch.long),
-                        torch.full((pad_len,), 0, dtype=torch.long),
-                    ]
-                )
+                # Already at correct length
+                x = torch.ones((real_len,), dtype=torch.long)
             data.append(x)
         return torch.stack(data).to(self._device)
     def build_prefill_position_ids(
@@ -713,30 +731,105 @@ class PytorchModel(LLM):
         from torch.nn.functional import pad
         from transformers import DynamicCache
+        # Handle case where past_cache is None
+        if past_cache is None:
+            return new_cache
+        # Convert both caches to DynamicCache if not already
+        if not isinstance(past_cache, DynamicCache):
+            past_cache = convert_to_cache_cls(past_cache)
+        if not isinstance(new_cache, DynamicCache):
+            new_cache = convert_to_cache_cls(new_cache)
         _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
-        past_seq_len = past_cache[0][0].shape[seq_len_idx]
-        new_seq_len = new_cache[0][0].shape[seq_len_idx]
+        # Handle empty caches
+        if len(past_cache) == 0:
+            return new_cache
+        if len(new_cache) == 0:
+            return past_cache
+        # Get first layer seq_len safely
+        past_first = past_cache[0] if len(past_cache) > 0 else (None, None)
+        new_first = new_cache[0] if len(new_cache) > 0 else (None, None)
+        if past_first[0] is None or past_first[1] is None:
+            return new_cache
+        if new_first[0] is None or new_first[1] is None:
+            return past_cache
+        past_seq_len = past_first[0].shape[seq_len_idx]
+        new_seq_len = new_first[0].shape[seq_len_idx]
+        # Pad the shorter cache
         if past_seq_len != new_seq_len:
-            padding_target = new_cache if past_seq_len > new_seq_len else past_cache
-            padding_len = abs(past_seq_len - new_seq_len)
+            if past_seq_len > new_seq_len:
+                padding_target = new_cache
+                padding_len = past_seq_len - new_seq_len
+            else:
+                padding_target = past_cache
+                padding_len = new_seq_len - past_seq_len
             pad_param = _get_pad_param(seq_len_idx, padding_len)
             for idx in range(len(padding_target)):
                 k = padding_target.key_cache[idx]
                 v = padding_target.value_cache[idx]
-                _k = pad(k, pad_param)
-                _v = pad(v, pad_param)
-                padding_target.key_cache[idx] = _k
-                padding_target.value_cache[idx] = _v
+                if k is not None and v is not None:
+                    padding_target.key_cache[idx] = pad(k, pad_param)
+                    padding_target.value_cache[idx] = pad(v, pad_param)
+        # Merge caches
         ret_kv = DynamicCache()
-        for idx in range(len(past_cache)):
-            k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
-            v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-            ret_kv.update(
-                torch.cat((k1, k2), 0).contiguous(),
-                torch.cat((v1, v2), 0).contiguous(),
-                idx,
-            )
+        max_layers = max(len(past_cache), len(new_cache))
+        for idx in range(max_layers):
+            past_k = past_cache.key_cache[idx] if idx < len(past_cache) else None
+            past_v = past_cache.value_cache[idx] if idx < len(past_cache) else None
+            new_k = new_cache.key_cache[idx] if idx < len(new_cache) else None
+            new_v = new_cache.value_cache[idx] if idx < len(new_cache) else None
+            if past_k is not None and new_k is not None:
+                # Both layers exist - validate tensor dimensions before concatenation
+                if past_k.dim() != new_k.dim():
+                    logger.error(
+                        f"KV cache tensor dimension mismatch at layer {idx}: "
+                        f"past_k.dim()={past_k.dim()}, new_k.dim()={new_k.dim()}"
+                    )
+                    # Use the cache with higher batch size
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+                    continue
+                if past_k.shape[1:] == new_k.shape[1:]:
+                    # Shapes are compatible, concatenate along batch dimension
+                    ret_kv.update(
+                        torch.cat((new_k, past_k), 0).contiguous(),
+                        torch.cat((new_v, past_v), 0).contiguous(),
+                        idx,
+                    )
+                else:
+                    # Detailed logging for shape mismatch
+                    logger.warning(
+                        f"KV cache shape mismatch at layer {idx}: "
+                        f"past_k.shape={past_k.shape}, new_k.shape={new_k.shape}. "
+                        f"This may be due to inconsistent batch sizes in continuous batching."
+                    )
+                    # Choose the cache with larger batch size to preserve more data
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+            elif past_k is not None:
+                ret_kv.update(past_k, past_v, idx)
+            elif new_k is not None:
+                ret_kv.update(new_k, new_v, idx)
+            else:
+                # both None, fill with None
+                ret_kv.update(None, None, idx)
         return ret_kv
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

Potentially problematic release.

xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl