PyPI - xinference - Versions diffs - 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

xinference 1.1.1-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (125)

xinference/core/worker.py CHANGED Viewed

@@ -22,6 +22,7 @@ import signal
 import threading
 import time
 from collections import defaultdict
+from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -58,6 +59,11 @@ else:
     MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
+@dataclass
+class ModelStatus:
+    last_error: str = ""
 class WorkerActor(xo.StatelessActor):
     def __init__(
         self,
@@ -90,6 +96,7 @@ class WorkerActor(xo.StatelessActor):
         # attributes maintained after model launched:
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
+        self._model_uid_to_model_status: Dict[str, ModelStatus] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
         # Dict structure: gpu_index: {(replica_model_uid, model_type)}
@@ -866,6 +873,9 @@ class WorkerActor(xo.StatelessActor):
             )
             try:
+                xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
+                if xavier_config is not None:
+                    xavier_config["rank_address"] = subpool_address
                 model, model_description = await asyncio.to_thread(
                     create_model_instance,
                     subpool_address,
@@ -893,6 +903,7 @@ class WorkerActor(xo.StatelessActor):
                     model=model,
                     model_description=model_description,
                     request_limits=request_limits,
+                    xavier_config=xavier_config,
                 )
                 await model_ref.load()
             except:
@@ -902,6 +913,7 @@ class WorkerActor(xo.StatelessActor):
                 raise
             self._model_uid_to_model[model_uid] = model_ref
             self._model_uid_to_model_spec[model_uid] = model_description
+            self._model_uid_to_model_status[model_uid] = ModelStatus()
             self._model_uid_to_addr[model_uid] = subpool_address
             self._model_uid_to_recover_count.setdefault(
                 model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
@@ -921,6 +933,7 @@ class WorkerActor(xo.StatelessActor):
             origin_uid,
             {"model_ability": abilities, "status": LaunchStatus.READY.name},
         )
+        return subpool_address
     @log_async(logger=logger, level=logging.INFO)
     async def terminate_model(self, model_uid: str, is_model_die=False):
@@ -976,6 +989,7 @@ class WorkerActor(xo.StatelessActor):
                 status = LaunchStatus.ERROR.name
             else:
                 status = LaunchStatus.TERMINATED.name
+                self._model_uid_to_model_status.pop(model_uid, None)
             if self._status_guard_ref is None:
                 _ = await self.get_supervisor_ref()
@@ -1010,6 +1024,9 @@ class WorkerActor(xo.StatelessActor):
     @log_sync(logger=logger)
     def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
+        model_status = self._model_uid_to_model_status.get(model_uid)
+        if model_status and model_status.last_error:
+            raise Exception(model_status.last_error)
         model_ref = self._model_uid_to_model.get(model_uid, None)
         if model_ref is None:
             raise ValueError(f"Model not found, uid: {model_uid}")
@@ -1138,6 +1155,21 @@ class WorkerActor(xo.StatelessActor):
         }
         return ret
+    def update_model_status(self, model_uid: str, **kwargs):
+        model_status = self._model_uid_to_model_status.get(model_uid)
+        if model_status is not None:
+            for k, v in kwargs.items():
+                setattr(model_status, k, v)
+    def get_model_status(self, model_uid: str):
+        return self._model_uid_to_model_status.get(model_uid)
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
+    async def start_transfer_for_vllm(
+        self, rep_model_uid: str, rank_addresses: List[str]
+    ):
+        model_ref = self._model_uid_to_model[rep_model_uid]
+        await model_ref.start_transfer_for_vllm(rank_addresses)

xinference/model/image/model_spec.json CHANGED Viewed

@@ -167,6 +167,24 @@
     ],
     "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
   },
+  {
+    "model_name": "HunyuanDiT-v1.2",
+    "model_family": "stable_diffusion",
+    "model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
+    "model_revision": "5e96094e0ad19e7f475de8711f03634ca0ccc40c",
+    "model_ability": [
+      "text2image"
+    ]
+  },
+  {
+    "model_name": "HunyuanDiT-v1.2-Distilled",
+    "model_family": "stable_diffusion",
+    "model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled",
+    "model_revision": "ba991d1546d8c50936c4c16398ed0a87b9b99fb1",
+    "model_ability": [
+      "text2image"
+    ]
+  },
   {
     "model_name": "sd-turbo",
     "model_family": "stable_diffusion",

xinference/model/image/model_spec_modelscope.json CHANGED Viewed

@@ -173,6 +173,26 @@
     ],
     "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
   },
+  {
+    "model_name": "HunyuanDiT-v1.2",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image"
+    ]
+  },
+  {
+    "model_name": "HunyuanDiT-v1.2-Distilled",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers-Distilled",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image"
+    ]
+  },
   {
     "model_name": "sd-turbo",
     "model_family": "stable_diffusion",

xinference/model/llm/__init__.py CHANGED Viewed

@@ -134,6 +134,7 @@ def _install():
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogagent import CogAgentChatModel
     from .transformers.cogvlm2 import CogVLM2Model
     from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
@@ -195,6 +196,7 @@ def _install():
             DeepSeekV2PytorchChatModel,
             OptPytorchModel,
             GlmEdgeVModel,
+            CogAgentChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -8989,5 +8989,101 @@
       "<|im_end|>",
       "<|endoftext|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "marco-o1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "AIDC-AI/Marco-o1"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "QuantFactory/Marco-o1-GGUF",
+        "model_file_name_template": "Marco-o1.{quantization}.gguf"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手，你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n        \n## 重要！！！！！\n当你回答问题时，你的思考应该在<Thought>内完成，<Output>内输出你的结果。\n<Thought>应该尽可能是英文，但是有2个特例，一个是对原文中的引用，另一个是是数学应该使用markdown格式，<Output>内的输出需要遵循用户输入的语言。\n        <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "cogagent",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "9",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/cogagent-9b-20241220"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151329,
+      151336,
+      151338
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
   }
 ]

xinference/model/llm/llm_family_modelscope.json CHANGED Viewed

@@ -6722,5 +6722,104 @@
       "<|im_end|>",
       "<|endoftext|>"
     ]
+  },
+   {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "marco-o1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "AIDC-AI/Marco-o1",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "Marco-o1.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "QuantFactory/Marco-o1-GGUF"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手，你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n        \n## 重要！！！！！\n当你回答问题时，你的思考应该在<Thought>内完成，<Output>内输出你的结果。\n<Thought>应该尽可能是英文，但是有2个特例，一个是对原文中的引用，另一个是是数学应该使用markdown格式，<Output>内的输出需要遵循用户输入的语言。\n        <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "cogagent",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "9",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/cogagent-9b-20241220",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151329,
+      151336,
+      151338
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
   }
 ]

xinference/model/llm/mlx/core.py CHANGED Viewed

@@ -477,39 +477,6 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         self._model, self._processor = self._load_model(**kwargs)
         self._tokenizer = self._processor.tokenizer
-    def _generate_stream_inner_no_image(self, **kwargs):
-        import mlx.nn as nn
-        from mlx_lm.utils import make_sampler, stream_generate
-        # For mlx-lm, the model(inputs) will return logits,
-        # but the language model in mlx-vlm will return an object
-        # https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
-        # so we cannot pass the language model to stream_generate directly
-        # we wrap here to just let model(inputs) return logits to pass stream_generate
-        class ModelWrapper(nn.Module):
-            def __init__(self, model):
-                super().__init__()
-                self._model = model.language_model
-            @property
-            def layers(self):
-                return self._model.layers
-            def __call__(self, *args, **kwargs):
-                return self._model(*args, **kwargs).logits
-        sampler = make_sampler(
-            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
-        )
-        prompt_token_ids = kwargs.pop("prompt_token_ids")
-        yield from stream_generate(
-            ModelWrapper(self._model),
-            self._tokenizer,
-            prompt_token_ids,
-            sampler=sampler,
-            **kwargs,
-        )
     def _generate_stream_inner(self, **kwargs):
         import mlx.core as mx
         from mlx_lm.utils import GenerationResponse
@@ -517,27 +484,8 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         inputs = kwargs["prompt_token_ids"]
-        if not isinstance(inputs, tuple):
-            # no images
-            yield from self._generate_stream_inner_no_image(**kwargs)
-            return
         max_tokens = kwargs.pop("max_tokens")
-        input_ids, pixel_values, mask = inputs[:3]
-        kwargs = {
-            k: v
-            for k, v in zip(
-                [
-                    "image_grid_thw",
-                    "image_sizes",
-                    "aspect_ratio_ids",
-                    "aspect_ratio_mask",
-                    "cross_attention_mask",
-                ],
-                inputs[3:],
-            )
-        }
+        input_ids, pixel_values, mask, kwargs = inputs
         tokenizer = self._processor.tokenizer
         detokenizer = self._processor.detokenizer
@@ -583,37 +531,39 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
     ) -> Tuple[Any, int]:
+        import mlx.core as mx
         from mlx_vlm import prepare_inputs
         prompt_str = prompt.get("prompt")  # type: ignore
         images = prompt.get("multi_modal_data", {}).get("image")  # type: ignore
         if images and not isinstance(images, list):
             images = [images]
-        if hasattr(self._model.config, "image_token_index"):
-            image_token_index = self._model.config.image_token_index
-        else:
-            image_token_index = None
+        resize_shape = kwargs.pop("resize_shape", None)
+        image_token_index = getattr(self._model.config, "image_token_index", None)
+        processor = self._processor
+        tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
+        prompt_tokens = mx.array(tokenizer.encode(prompt_str))
         if not images:
-            prompt = prompt["prompt"]  # type: ignore
-            prompt_token_ids = self._tokenizer.encode(prompt)
-            prompt_token_ids = self._get_prompt_cache(
-                prompt_token_ids,
-                kwargs.get("lora_name"),
-                model=self._model.language_model,
-            )
-            return prompt_token_ids, len(prompt_token_ids)
+            input_ids = prompt_tokens[None, :]
+            pixel_values = mask = None
+            kwargs = {}
+            input_token_len = input_ids.size
         else:
             inputs = prepare_inputs(
-                None,
-                self._processor,
-                images,
-                prompt_str,
-                image_token_index,
-                kwargs.get("resize_shape"),
+                processor, images, prompt_str, image_token_index, resize_shape
             )
-            input_ids = inputs[0]
-            return inputs, len(input_ids)
+            input_ids = inputs["input_ids"]
+            pixel_values = inputs["pixel_values"]
+            mask = inputs["attention_mask"]
+            kwargs = {
+                k: v
+                for k, v in inputs.items()
+                if k not in ["input_ids", "pixel_values", "attention_mask"]
+            }
+            input_token_len = int(mask.sum())
+        return (input_ids, pixel_values, mask, kwargs), input_token_len
     def chat(
         self,

xinference 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

Potentially problematic release.

xinference 1.1.1-py3-none-any.whl → 1.2.0-py3-none-any.whl