xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +8 -4
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +7 -5
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/local.py +5 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/deploy/worker.py +6 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/core.py +3 -0
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +18 -4
- xinference/model/embedding/vllm/core.py +36 -9
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +178 -1
- xinference/model/image/stable_diffusion/core.py +155 -23
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +688 -11
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +20 -18
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +44 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +87 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +156 -0
- xinference/model/utils.py +108 -0
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +2 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
- xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/harmony.py (new file)

@@ -0,0 +1,245 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, Union
+
+if TYPE_CHECKING:
+    from ...types import ChatCompletion, ChatCompletionChunk
+
+
+class HarmonyStreamParser:
+    def __init__(self):
+        # Current channel: either 'analysis', 'final', or None if not started yet
+        self.current_channel = None
+        # Buffer for accumulating text when looking for 'assistantfinal' marker
+        self.buffer = ""
+
+    def feed(self, text):
+        """
+        Feed a chunk of text into the parser and return parsed segments.
+
+        Each segment is a dict:
+            {
+                "channel": "analysis" | "final",
+                "content": <string>
+            }
+
+        The parser detects 'assistantfinal' markers inside reasoning text,
+        splits the reasoning and final content correctly, and switches the channel.
+        """
+        segments = []
+
+        # If we are currently in 'analysis' mode
+        if self.current_channel == "analysis":
+            # Add text to buffer and check for 'assistantfinal' marker
+            self.buffer += text
+            if "assistantfinal" in self.buffer:
+                # Split reasoning and final content
+                before, after = self.buffer.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                # Switch to final channel
+                self.current_channel = "final"
+                self.buffer = ""
+                if after:
+                    segments.append({"channel": "final", "content": after})
+                return segments
+            else:
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    return segments
+                else:
+                    # Emit what we have so far and keep buffer for next time
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+                    return segments
+
+        # If we are currently in 'final' mode
+        if self.current_channel == "final":
+            # Check if this is actually a new message starting with 'analysis'
+            if text.startswith("analysis"):
+                # Reset parser state for new message
+                self.current_channel = None
+                self.buffer = ""
+                # Re-process this text with the new state
+                return self.feed(text)
+            else:
+                segments.append({"channel": "final", "content": text})
+                return segments
+
+        # If no channel has been started yet
+        if text.startswith("analysis"):
+            self.current_channel = "analysis"
+            rest = text[len("analysis") :]
+            if "assistantfinal" in rest:
+                # Split immediately if marker is found in the first chunk
+                before, after = rest.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                self.current_channel = "final"
+                if after:
+                    segments.append({"channel": "final", "content": after})
+            else:
+                # Start buffering for potential 'assistantfinal' marker
+                self.buffer = rest
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    pass
+                else:
+                    # Emit what we have so far
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+        elif text.startswith("assistantfinal"):
+            self.current_channel = "final"
+            rest = text[len("assistantfinal") :]
+            if rest:
+                segments.append({"channel": "final", "content": rest})
+
+        return segments
+
+
+async def async_stream_harmony_chat_completion(
+    chunks: Union[
+        "ChatCompletion",
+        AsyncGenerator["ChatCompletionChunk", None],
+    ],
+) -> AsyncGenerator["ChatCompletion", None]:
+    """
+    Parse Harmony-formatted content from either a full ChatCompletion (non-streaming)
+    or an async stream of ChatCompletionChunk (streaming), using the HarmonyStreamParser defined in this file.
+
+    Yields parsed objects incrementally.
+    """
+
+    # --- Non-streaming: ChatCompletion ---
+    if isinstance(chunks, dict) and chunks.get("object") == "chat.completion":
+        out_data = deepcopy(chunks)
+
+        for choice in out_data["choices"]:
+            parser = HarmonyStreamParser()
+            msg = choice["message"]
+
+            # Backup original content & reasoning
+            original_content = msg.get("content") or ""
+            original_reasoning = msg.get("reasoning_content") or ""
+
+            # Reset fields before parsing
+            msg["content"] = ""
+            msg["reasoning_content"] = ""
+            msg.setdefault("tool_calls", [])
+
+            # Feed original content
+            for seg in parser.feed(original_content):
+                ch, c = seg["channel"], seg["content"]
+                if ch == "final":
+                    msg["content"] += c
+                elif ch == "analysis":
+                    msg["reasoning_content"] += c
+                elif ch == "tool":
+                    msg["tool_calls"].append(c)
+
+            # Feed original reasoning_content
+            for seg in parser.feed(original_reasoning):
+                if seg["channel"] == "analysis":
+                    msg["reasoning_content"] += seg["content"]
+                elif seg["channel"] == "tool":
+                    msg["tool_calls"].append(seg["content"])
+
+            # Clean up reasoning_content: set to None if no reasoning content was parsed
+            if not msg["reasoning_content"] and not original_reasoning:
+                msg["reasoning_content"] = None  # type: ignore
+
+        yield out_data
+
+    else:
+        # Streaming: handle async generator
+        parsers_per_choice = {}
+
+        async for chunk in chunks:  # type: ignore
+            out_chunk = {  # type: ignore
+                "id": chunk["id"],
+                "model": chunk["model"],
+                "object": chunk["object"],
+                "created": chunk["created"],
+                "choices": [],
+            }
+
+            for i, choice in enumerate(chunk["choices"]):
+                delta = choice.get("delta", {})
+                text = delta.get("content") or ""  # type: ignore
+
+                if i not in parsers_per_choice:
+                    parsers_per_choice[i] = HarmonyStreamParser()
+
+                # Feed text to parser and collect current delta only
+                curr_delta: Dict[str, object] = {
+                    "content": "",
+                    "reasoning_content": "",
+                    "tool_calls": [],
+                }
+
+                for seg in parsers_per_choice[i].feed(text):
+                    ch = seg["channel"]
+                    c = seg["content"]
+                    if ch == "final":
+                        curr_delta["content"] += c  # type: ignore
+                    elif ch == "analysis":
+                        curr_delta["reasoning_content"] += c  # type: ignore
+                    elif ch == "tool":
+                        curr_delta["tool_calls"].append(c)  # type: ignore
+
+                if curr_delta["reasoning_content"]:
+                    if not curr_delta["content"]:
+                        curr_delta["content"] = None
+
+                elif curr_delta["content"]:
+                    if not curr_delta["reasoning_content"]:
+                        curr_delta["reasoning_content"] = None
+
+                elif (
+                    choice.get("finish_reason") is not None
+                    and not curr_delta["reasoning_content"]
+                ):
+                    # For the final chunk, if there's no new reasoning content,
+                    # don't include empty reasoning_content to avoid clearing existing state
+                    curr_delta["reasoning_content"] = None
+
+                out_chunk["choices"].append(  # type: ignore
+                    {
+                        "index": i,
+                        "delta": curr_delta,
+                        "finish_reason": choice.get("finish_reason"),
+                    }
+                )
+
+            # Only yield if we have either content or reasoning_content
+            has_content = any(
+                choice["delta"].get("content")  # type: ignore
+                or choice["delta"].get("reasoning_content")  # type: ignore
+                or choice.get("finish_reason") is not None  # type: ignore
+                for choice in out_chunk["choices"]  # type: ignore
+            )
+            if has_content:
+                yield out_chunk  # type: ignore
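For reference, a minimal usage sketch (not part of the diff, assuming harmony.py is importable as xinference.model.llm.harmony): it shows how HarmonyStreamParser.feed splits gpt-oss "Harmony" output into reasoning and final-answer segments even when the assistantfinal marker is split across streamed chunks. The same parser drives async_stream_harmony_chat_completion above, which rewrites each choice's delta into content / reasoning_content fields.

from xinference.model.llm.harmony import HarmonyStreamParser

parser = HarmonyStreamParser()
# Simulated streamed deltas; note the "assistantfinal" marker arrives in two pieces.
chunks = ["analysisWe need to add 2 and 2.", "assistant", "final", "2 + 2 = 4"]
for chunk in chunks:
    for seg in parser.feed(chunk):
        print(seg["channel"], repr(seg["content"]))
# Prints:
#   analysis 'We need to add 2 and 2.'
#   final '2 + 2 = 4'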
xinference/model/llm/llama_cpp/core.py

@@ -19,11 +19,11 @@ import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
-import orjson
+from packaging import version
 
 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
 
@@ -98,10 +98,19 @@ class XllamaCppModel(LLM, ChatModelMixin):
             from xllamacpp import (
                 CommonParams,
                 Server,
+                __version__,
                 estimate_gpu_layers,
                 get_device_info,
                 ggml_backend_dev_type,
             )
+
+            try:
+                if version.parse(__version__) < version.parse("0.2.0"):
+                    raise RuntimeError(
+                        "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
+                    )
+            except version.InvalidVersion:
+                pass  # If the version parse failed, we just skip the version check.
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -160,6 +169,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
+            params.use_jinja = True
         # This is the default value, could be overwritten by _llamacpp_model_config
         params.n_parallel = min(8, os.cpu_count() or 1)
         for k, v in self._llamacpp_model_config.items():
@@ -208,7 +218,8 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 )
                 logger.info("Estimate num gpu layers: %s", estimate)
                 if estimate.tensor_split:
-                    params.tensor_split = estimate.tensor_split
+                    for i in range(len(estimate.tensor_split)):
+                        params.tensor_split[i] = estimate.tensor_split[i]
                 else:
                     params.n_gpu_layers = estimate.layers
             except Exception as e:
@@ -242,28 +253,18 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 {
                     "prompt": prompt,
                     "stream": stream,
+                    "model": self.model_uid,
                 }
             )
-
-
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-                except Exception as e:
-                    logger.exception("handle_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
+                self._llm.handle_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
@@ -296,6 +297,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()
 
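The new chat_context_var.set(...) call stashes the per-request chat-template kwargs before generation starts, so code that later renders the prompt can read them without every call signature being widened. A rough sketch of that hand-off pattern, assuming chat_context_var is a plain contextvars.ContextVar (the variable and kwarg names below are illustrative, not copied from core.py):

from contextvars import ContextVar
from typing import Any, Dict

# Hypothetical stand-in for the chat_context_var imported from ..core above.
chat_context_var: ContextVar[Dict[str, Any]] = ContextVar("chat_context", default={})

def render_prompt(messages):
    # Downstream rendering code reads the per-request kwargs from the context.
    kwargs = chat_context_var.get()
    return f"messages={messages!r}, chat_template_kwargs={kwargs!r}"

chat_context_var.set({"enable_thinking": True})
print(render_prompt([{"role": "user", "content": "hi"}]))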
@@ -310,30 +320,21 @@ class XllamaCppModel(LLM, ChatModelMixin):
                     "messages": messages,
                     "stream": stream,
                     "tools": tools,
+                    "model": self.model_uid,
                 }
             )
-
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
 
-
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-                except Exception as e:
-                    logger.exception("handle_chat_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_chat_completions(
-                prompt_json, _error_callback, _ok_callback
-            )
+                self._llm.handle_chat_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
                 q.put(_Error(str(ex)))