xinference 1.10.1__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +462 -3
- xinference/client/restful/async_restful_client.py +158 -5
- xinference/client/restful/restful_client.py +131 -0
- xinference/core/supervisor.py +12 -0
- xinference/model/audio/model_spec.json +20 -20
- xinference/model/image/model_spec.json +159 -159
- xinference/model/llm/__init__.py +2 -2
- xinference/model/llm/llm_family.json +843 -180
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +20 -6
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +93 -16
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/utils.py +3 -0
- xinference/model/llm/utils.py +37 -24
- xinference/model/llm/vllm/core.py +128 -69
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.e4d9a9e1.js} +3 -3
- xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.map → main.e4d9a9e1.js.map} +1 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e6770a05771952175c9fbf48fce283c9bb1bc8b5763e39edc36d099d1fe16b4a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/METADATA +8 -5
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/RECORD +37 -36
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.e4d9a9e1.js.LICENSE.txt} +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/WHEEL +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py
CHANGED
@@ -75,6 +75,8 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-VL-Thinking",
     "Qwen3-Next-Instruct",
     "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -100,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -143,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
            messages = self.convert_messages_with_content_list_to_str_conversion(
                messages
            )
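The widened condition above keeps structured (list-form) message content intact for vision and, now, audio models, and only flattens it for text-only ones. As an illustration of what that flattening step amounts to, here is a minimal sketch with a hypothetical `flatten_content_lists` helper standing in for xinference's `convert_messages_with_content_list_to_str_conversion`; the real implementation may differ:

```python
# Hypothetical helper for illustration only; the real
# convert_messages_with_content_list_to_str_conversion may differ in detail.
from typing import Any, Dict, List


def flatten_content_lists(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collapse OpenAI-style content lists into plain strings for text-only models."""
    flattened = []
    for msg in messages:
        content = msg.get("content")
        if isinstance(content, list):
            # Keep only the text parts; image/audio parts mean nothing to a
            # text-only chat template.
            text = "".join(
                part.get("text", "")
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
            msg = {**msg, "content": text}
        flattened.append(msg)
    return flattened


msgs = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
print(flatten_content_lists(msgs))  # [{'role': 'user', 'content': 'hi'}]
```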
@@ -186,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -853,11 +853,11 @@ class ChatModelMixin:
                 "tool_calls": tool_calls,
             }
 
-
-
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
             usage = None
+        else:
+            usage = c.get("usage")
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -882,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-
-        c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-
-            reasoning_content,
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -913,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if
-                failed_contents.append(
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-
-
-
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
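To make the new usage handling above easier to follow, here is a small, self-contained sketch of the fallback it implements; `resolve_usage` is a hypothetical name, and `chunk` is a dict shaped like the completion chunks the mixin receives:

```python
# Illustrative only: mirrors the usage fallback in the hunks above.
from typing import Any, Dict, Optional


def resolve_usage(chunk: Dict[str, Any], finish_reason: Optional[str]) -> Optional[Dict[str, int]]:
    if finish_reason == "tool_calls":
        # Tool-call completion chunks carry no meaningful token accounting.
        return None
    usage = chunk.get("usage")
    if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
        # Placeholder values when the backend did not report usage.
        usage = {"prompt_tokens": -1, "completion_tokens": -1, "total_tokens": -1}
    return usage


print(resolve_usage({"usage": {"prompt_tokens": 10, "completion_tokens": 3}}, "stop"))
print(resolve_usage({}, "tool_calls"))  # None
```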
xinference/model/llm/vllm/core.py
CHANGED
@@ -131,7 +131,7 @@ except ImportError:
     VLLM_INSTALLED = False
     VLLM_VERSION = None
 
-
+VLLM_SUPPORTED_MULTI_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -229,34 +229,37 @@ if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.5.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.1"):
-
-
-
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5-MPO")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL3")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.3"):
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
-
-
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("llama-3.2-vision-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("QvQ-72B-Preview")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.2"):
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-vl-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwenLong-l1")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-omni")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("gemma-3-it")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.4"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
@@ -272,7 +275,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.1"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Ernie4.5")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.1v-thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
@@ -280,7 +283,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.5v")
     VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
@@ -291,9 +294,11 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Thinking")
 
-if VLLM_INSTALLED and VLLM_VERSION
-
-
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.11.0"):
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Thinking")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Thinking")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Instruct")
 
 
 class VLLMModel(LLM):
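The pattern in these hunks is to register model families into capability lists only when the detected vLLM version is new enough. A condensed sketch of that pattern, using made-up model names and a pinned version instead of runtime detection:

```python
# Sketch of the version-gating pattern above; the names and the pinned
# version are assumptions for the example, not xinference's real lists.
from typing import List

from packaging import version

VLLM_VERSION = version.parse("0.11.0")  # assume detected at import time
SUPPORTED_MULTI_MODEL_LIST: List[str] = []

if VLLM_VERSION >= version.parse("0.10.2"):
    SUPPORTED_MULTI_MODEL_LIST.append("example-vl-model")
if VLLM_VERSION >= version.parse("0.11.0"):
    SUPPORTED_MULTI_MODEL_LIST.append("example-omni-model")

print(SUPPORTED_MULTI_MODEL_LIST)
```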
@@ -545,7 +550,7 @@ class VLLMModel(LLM):
             # patch vllm Executor.get_class
             Executor.get_class = lambda vllm_config: executor_cls
             self._engine = AsyncLLMEngine.from_engine_args(engine_args)
-        except:
+        except:  # noqa: E722
            logger.exception("Creating vllm engine failed")
            self._loading_error = sys.exc_info()
 
@@ -714,7 +719,7 @@ class VLLMModel(LLM):
         logger.info("Detecting vLLM is not health, prepare to quit the process")
         try:
             self.stop()
-        except:
+        except:  # noqa: E722
             # ignore error when stop
             pass
         # Just kill the process and let xinference auto-recover the model
@@ -857,7 +862,7 @@ class VLLMModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
@@ -988,7 +993,10 @@ class VLLMModel(LLM):
             from vllm import TokensPrompt
 
             token_ids = await asyncio.to_thread(
-                self._tokenize,
+                self._tokenize,
+                tokenizer,
+                prompt,  # type: ignore
+                config,
             )
             return TokensPrompt(prompt_token_ids=token_ids)
 
@@ -1082,18 +1090,43 @@ class VLLMModel(LLM):
                 logger.warning(f"Failed to create GuidedDecodingParams: {e}")
                 guided_options = None
 
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                import inspect
+
+                sp_sig = inspect.signature(SamplingParams)
+                # For v0.9.2 and similar versions, prioritize guided_decoding over structured_outputs
+                # structured_outputs was introduced later (around v0.11.0) and may not accept
+                # GuidedDecodingParams in earlier versions even if the parameter exists
+                if "guided_decoding" in sp_sig.parameters:
+                    sampling_params = SamplingParams(
+                        guided_decoding=guided_options, **sanitized_generate_config
+                    )
+                elif "structured_outputs" in sp_sig.parameters:
+                    try:
+                        sampling_params = SamplingParams(
+                            structured_outputs=guided_options,
+                            **sanitized_generate_config,
+                        )
+                    except TypeError as e:
+                        if "structured_outputs" in str(e):
+                            # structured_outputs parameter exists but doesn't accept GuidedDecodingParams
+                            # Fall back to no guided decoding
+                            logger.warning(
+                                f"structured_outputs parameter failed: {e}. "
+                                "Falling back to no guided decoding for vLLM version compatibility."
+                            )
+                            sampling_params = SamplingParams(
+                                **sanitized_generate_config
+                            )
+                        else:
+                            raise
+                else:
+                    sampling_params = SamplingParams(**sanitized_generate_config)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to create SamplingParams with guided decoding: {e}"
                 )
+                sampling_params = SamplingParams(**sanitized_generate_config)
             else:
                 # ignore generate configs for older versions
                 sanitized_generate_config.pop("guided_json", None)
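The compatibility shim above decides at runtime whether `SamplingParams` takes `guided_decoding` or the newer `structured_outputs` keyword. Here is the same introspection idea in isolation, with dummy classes standing in for vLLM's `SamplingParams` so the snippet runs without vLLM installed:

```python
# Dummy classes are stand-ins; only the inspect-based dispatch mirrors the hunk above.
import inspect


class OldStyleParams:  # accepts guided_decoding (per the diff, older vLLM releases)
    def __init__(self, guided_decoding=None, **kwargs):
        self.guided_decoding = guided_decoding
        self.extra = kwargs


class NewStyleParams:  # accepts structured_outputs (per the diff, around vLLM 0.11.0)
    def __init__(self, structured_outputs=None, **kwargs):
        self.structured_outputs = structured_outputs
        self.extra = kwargs


def build_params(cls, guided_options, **config):
    params = inspect.signature(cls).parameters
    if "guided_decoding" in params:
        return cls(guided_decoding=guided_options, **config)
    if "structured_outputs" in params:
        return cls(structured_outputs=guided_options, **config)
    # Neither keyword exists: fall back to plain sampling parameters.
    return cls(**config)


print(type(build_params(OldStyleParams, {"json": "{}"}, temperature=0.7)).__name__)
print(type(build_params(NewStyleParams, {"json": "{}"}, temperature=0.7)).__name__)
```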
@@ -1111,7 +1144,9 @@ class VLLMModel(LLM):
             # this requires tokenizing
             tokenizer = await self._get_tokenizer(lora_request)
             prompt_or_token_ids = await self._gen_tokens_prompt(
-                tokenizer,
+                tokenizer,
+                prompt,
+                sanitized_generate_config,  # type: ignore
             )
             sampling_params.max_tokens = max_tokens = self._context_length - len(  # type: ignore
                 prompt_or_token_ids["prompt_token_ids"]  # type: ignore
@@ -1266,11 +1301,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
@@ -1430,7 +1464,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             return self._to_chat_completion(c, self.reasoning_parser)
 
 
-class
+class VLLMMultiModel(VLLMModel, ChatModelMixin):
     @classmethod
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
@@ -1442,11 +1476,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
@@ -1456,12 +1489,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             if "4" not in quantization:
                 return False
         if isinstance(llm_family, CustomLLMFamilyV2):
-            if llm_family.model_family not in
+            if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
         else:
-            if llm_family.model_name not in
+            if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
-        if
+        if (
+            "vision" not in llm_family.model_ability
+            and "audio" not in llm_family.model_ability
+            and "omni" not in llm_family.model_ability
+        ):
             return False
         return VLLM_INSTALLED
 
@@ -1470,13 +1507,21 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     ) -> VLLMModelConfig:
         model_config = super()._sanitize_model_config(model_config)
         if VLLM_VERSION >= version.parse("0.5.5"):
-            model_config
-
-
-
-
-
+            if model_config.get("limit_mm_per_prompt"):
+                model_config["limit_mm_per_prompt"] = json.loads(
+                    model_config.get("limit_mm_per_prompt")  # type: ignore
+                )
+            else:
+                if "omni" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {
+                        "image": 2,
+                        "video": 2,
+                        "audio": 2,
+                    }
+                elif "vision" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2}
+                elif "audio" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"audio": 2}
         return model_config
 
     def _sanitize_chat_config(
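In other words, an explicit `limit_mm_per_prompt` value is parsed as a JSON string, and otherwise a default is derived from the model's abilities. A runnable sketch of that selection logic (a simplified stand-in for the `_sanitize_model_config` branch above, not the method itself):

```python
# Simplified stand-in; the ability strings ("omni", "vision", "audio")
# mirror the ones used in the diff.
import json
from typing import Any, Dict, List


def default_mm_limits(model_config: Dict[str, Any], abilities: List[str]) -> Dict[str, Any]:
    if model_config.get("limit_mm_per_prompt"):
        # A user-supplied value arrives as a JSON string, e.g. '{"image": 4}'.
        model_config["limit_mm_per_prompt"] = json.loads(
            model_config["limit_mm_per_prompt"]
        )
    elif "omni" in abilities:
        model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2, "audio": 2}
    elif "vision" in abilities:
        model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2}
    elif "audio" in abilities:
        model_config["limit_mm_per_prompt"] = {"audio": 2}
    return model_config


print(default_mm_limits({}, ["chat", "vision"]))
print(default_mm_limits({"limit_mm_per_prompt": '{"image": 4}'}, ["chat", "vision"]))
```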
@@ -1510,7 +1555,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             multi_modal_data = prompt.get("multi_modal_data")
 
             token_ids = await asyncio.to_thread(
-                self._tokenize,
+                self._tokenize,
+                tokenizer,
+                prompt_str,
+                config,  # type: ignore
             )
             return TokensPrompt(
                 prompt_token_ids=token_ids, multi_modal_data=multi_modal_data
@@ -1526,9 +1574,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
 
         model_family = self.model_family.model_family or self.model_family.model_name
-
+        audios, images, videos = None, None, None
         if "internvl" not in model_family.lower():
-            from
+            from qwen_omni_utils import (
+                process_audio_info,
+                process_mm_info,
+                process_vision_info,
+            )
 
             messages = self._transform_messages(messages)
 
@@ -1543,29 +1595,36 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             if tools and model_family in QWEN_TOOL_CALL_FAMILY:
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None
+            if "omni" in self.model_family.model_ability:
+                audios, images, videos = process_mm_info(
+                    messages, use_audio_in_video=True
+                )
+            elif "audio" in self.model_family.model_ability:
+                audios = process_audio_info(messages, use_audio_in_video=False)
+            elif "vision" in self.model_family.model_ability:
+                images, videos = process_vision_info(  # type: ignore
+                    messages, return_video_kwargs=False
+                )
+
 
             prompt = self.get_full_context(
                 messages, self.model_family.chat_template, **full_context_kwargs
             )
-            images, video_inputs = process_vision_info(messages)
-            if video_inputs:
-                raise ValueError("Not support video input now.")
-        else:
-            prompt, images = self.get_specific_prompt(model_family, messages)
 
-        if not images:
-            inputs = {
-                "prompt": prompt,
-            }
-        elif len(images) == 1:
-            inputs = {
-                "prompt": prompt,
-                "multi_modal_data": {"image": images[-1]},  # type: ignore
-            }
         else:
-
-
-
-
+            prompt, images = self.get_specific_prompt(model_family, messages)
+        inputs = {"prompt": prompt, "multi_modal_data": {}, "mm_processor_kwargs": {}}
+        if images:
+            inputs["multi_modal_data"]["image"] = images
+        if videos:
+            inputs["multi_modal_data"]["video"] = videos
+        if audios:
+            inputs["multi_modal_data"]["audio"] = audios
+        if "omni" in self.model_family.model_ability:
+            inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
+        if inputs["multi_modal_data"] == {}:
+            inputs.pop("multi_modal_data")
+        if inputs["mm_processor_kwargs"] == {}:
+            inputs.pop("mm_processor_kwargs")
         generate_config = self._sanitize_chat_config(generate_config)
 
         stream = generate_config.get("stream", None)
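The rebuilt chat path now always starts from a full inputs dict and prunes what is unused. A standalone sketch of that assembly step; `build_inputs` is a hypothetical helper and the placeholder values stand in for the outputs of the chat template and the qwen_omni_utils processors:

```python
# Hypothetical helper illustrating the inputs-dict assembly in the hunk above.
from typing import Any, Dict


def build_inputs(prompt: str, images=None, videos=None, audios=None, omni=False) -> Dict[str, Any]:
    inputs: Dict[str, Any] = {
        "prompt": prompt,
        "multi_modal_data": {},
        "mm_processor_kwargs": {},
    }
    if images:
        inputs["multi_modal_data"]["image"] = images
    if videos:
        inputs["multi_modal_data"]["video"] = videos
    if audios:
        inputs["multi_modal_data"]["audio"] = audios
    if omni:
        inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
    # Drop empty containers before handing the dict to the engine.
    if not inputs["multi_modal_data"]:
        inputs.pop("multi_modal_data")
    if not inputs["mm_processor_kwargs"]:
        inputs.pop("mm_processor_kwargs")
    return inputs


print(build_inputs("Hello"))                          # text-only: {'prompt': 'Hello'}
print(build_inputs("Describe this", images=["<img>"]))  # includes multi_modal_data
```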
xinference/model/utils.py
CHANGED
@@ -315,6 +315,11 @@ def set_all_random_seed(seed: int):
 
 
 class CancellableDownloader:
+    _global_lock = threading.Lock()
+    _active_instances = 0
+    _original_update = None  # Class-level original update method
+    _patch_lock = threading.Lock()  # Additional lock for patching operations
+
     def __init__(
         self,
         cancel_error_cls: Type[BaseException] = asyncio.CancelledError,
@@ -325,23 +330,23 @@ class CancellableDownloader:
         self._cancelled = threading.Event()
         self._done_event = threading.Event()
         self._cancel_error_cls = cancel_error_cls
-        self._original_update = None
         # progress for tqdm that is main
         self._main_progresses: Set[tqdm] = set()
         # progress for file downloader
         # mainly when tqdm unit is set
         self._download_progresses: Set[tqdm] = set()
-        # tqdm
-        self.
+        # Instance-specific tqdm tracking
+        self._patched_instances: Set[int] = set()
 
     def reset(self):
         self._main_progresses.clear()
         self._download_progresses.clear()
 
     def get_progress(self) -> float:
-        if self.
-            # directly return 1.0 when
+        if self.done:
+            # directly return 1.0 when finished
             return 1.0
+        # Don't return 1.0 when cancelled, calculate actual progress
 
         tasks = finished_tasks = 0
         for main_progress in self._main_progresses:
@@ -376,6 +381,7 @@ class CancellableDownloader:
 
     def cancel(self):
         self._cancelled.set()
+        self._done_event.set()
 
     @property
     def cancelled(self):
@@ -392,39 +398,76 @@ class CancellableDownloader:
             raise self._cancel_error_cls(error_msg)
 
     def patch_tqdm(self):
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Use class-level patching to avoid conflicts
+        with self._patch_lock:
+            if self._original_update is None:
+                self._original_update = original_update = tqdm.update
+
+                # Thread-safe patched update
+                def patched_update(tqdm_instance, n):
+                    import gc
+
+                    # Get all CancellableDownloader instances and check for cancellation
+                    downloaders = [
+                        obj
+                        for obj in gc.get_objects()
+                        if isinstance(obj, CancellableDownloader)
+                    ]
+
+                    for downloader in downloaders:
+                        # if download cancelled, throw error
+                        if getattr(downloader, "cancelled", False):
+                            downloader.raise_error()
+
+                        progresses = None
+                        if not getattr(tqdm_instance, "disable", False):
+                            unit = getattr(tqdm_instance, "unit", "it")
+                            if unit == "it":
+                                progresses = getattr(
+                                    downloader, "_main_progresses", None
+                                )
+                            else:
+                                progresses = getattr(
+                                    downloader, "_download_progresses", None
+                                )
+
+                            if progresses is not None:
+                                progresses.add(tqdm_instance)
+                            else:
+                                logger.debug(
+                                    f"No progresses found for downloader {downloader}"
+                                )
+
+                    # Call original update with safety check
+                    return original_update(tqdm_instance, n)
+
+                tqdm.update = patched_update
 
     def unpatch_tqdm(self):
-
-
-
-
+        with self._patch_lock:
+            if self._original_update is not None and self._active_instances == 0:
+                tqdm.update = self._original_update
+                self._original_update = None
 
     def __enter__(self):
-
+        # Use global lock to prevent concurrent patching
+        with self._global_lock:
+            if self._active_instances == 0:
+                self.patch_tqdm()
+            self._active_instances += 1
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-
-        self.
-
+        # Use global lock to prevent concurrent unpatching
+        with self._global_lock:
+            self._active_instances -= 1
+            if self._active_instances == 0:
+                self.unpatch_tqdm()
+        try:
+            self._done_event.set()
+            self.reset()
+        except Exception as e:
+            logger.debug(f"Error during CancellableDownloader cleanup: {e}")
 
 
 def get_engine_params_by_name(
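For readers unfamiliar with the approach, the hunk above amounts to reference-counted monkey-patching: the first active downloader installs a patched `tqdm.update`, and the last one to exit restores the original. A compact, self-contained sketch of that pattern (an illustration, not the `CancellableDownloader` implementation itself):

```python
# Minimal sketch of reference-counted tqdm patching under a class-level lock.
import threading

from tqdm import tqdm


class PatchedTqdmScope:
    _lock = threading.Lock()
    _active = 0
    _original_update = None

    def __enter__(self):
        with PatchedTqdmScope._lock:
            if PatchedTqdmScope._active == 0:
                original = PatchedTqdmScope._original_update = tqdm.update

                def patched_update(instance, n=1):
                    # Hook point: check for cancellation, record progress, ...
                    return original(instance, n)

                tqdm.update = patched_update
            PatchedTqdmScope._active += 1
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        with PatchedTqdmScope._lock:
            PatchedTqdmScope._active -= 1
            if PatchedTqdmScope._active == 0 and PatchedTqdmScope._original_update:
                tqdm.update = PatchedTqdmScope._original_update
                PatchedTqdmScope._original_update = None


with PatchedTqdmScope():
    for _ in tqdm(range(3)):
        pass
```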
|