xinference 0.12.3__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.

Files changed (71)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +6 -6
  3. xinference/client/restful/restful_client.py +0 -2
  4. xinference/core/model.py +21 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/worker.py +74 -45
  7. xinference/deploy/utils.py +33 -2
  8. xinference/model/llm/__init__.py +5 -0
  9. xinference/model/llm/llm_family.json +240 -1
  10. xinference/model/llm/llm_family.py +32 -8
  11. xinference/model/llm/llm_family_modelscope.json +192 -0
  12. xinference/model/llm/mlx/__init__.py +13 -0
  13. xinference/model/llm/mlx/core.py +408 -0
  14. xinference/model/llm/pytorch/chatglm.py +2 -9
  15. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  16. xinference/model/llm/pytorch/core.py +213 -40
  17. xinference/model/llm/pytorch/glm4v.py +171 -15
  18. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  19. xinference/model/llm/pytorch/utils.py +53 -62
  20. xinference/model/llm/utils.py +24 -5
  21. xinference/model/rerank/core.py +5 -0
  22. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  23. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  24. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  25. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  26. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  27. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  28. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  29. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
  33. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  49. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
  50. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
  51. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  52. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  67. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
  68. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
  69. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
  70. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
  71. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-06-28T15:25:07+0800",
+ "date": "2024-07-05T18:19:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "3d9c261a7d5c4941091d1711cb732ce17b34e7f1",
- "version": "0.12.3"
+ "full-revisionid": "007408c55272bc343821dd152df780de5dc9c037",
+ "version": "0.13.0"
 }
 ''' # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -1477,14 +1477,14 @@ class RESTfulAPI:
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))

-        from ..model.llm.utils import QWEN_TOOL_CALL_FAMILY
+        from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY

         model_family = desc.get("model_family", "")
-        function_call_models = [
-            "chatglm3",
-            "glm4-chat",
-            "gorilla-openfunctions-v1",
-        ] + QWEN_TOOL_CALL_FAMILY
+        function_call_models = (
+            ["chatglm3", "gorilla-openfunctions-v1"]
+            + QWEN_TOOL_CALL_FAMILY
+            + GLM4_TOOL_CALL_FAMILY
+        )

         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family

xinference/client/restful/restful_client.py CHANGED
@@ -182,8 +182,6 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
                 f"Failed to rerank documents, detail: {response.json()['detail']}"
             )
         response_data = response.json()
-        for r in response_data["results"]:
-            r["document"] = documents[r["index"]]
         return response_data

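Note for client users: with the loop above removed, rerank now returns the server payload as served, so results no longer carry a client-injected "document" field. A minimal caller-side sketch for restoring the mapping (the endpoint URL and model uid are illustrative assumptions):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model = client.get_model("my-rerank-model")      # hypothetical model uid

    documents = ["Paris is in France.", "Berlin is in Germany."]
    result = model.rerank(documents=documents, query="Where is Paris?")
    for r in result["results"]:
        # each result still carries "index" into the original documents list
        print(r["relevance_score"], documents[r["index"]])
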
xinference/core/model.py CHANGED
@@ -65,6 +65,9 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError


+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -268,11 +271,25 @@ class ModelActor(xo.StatelessActor):

         model_ability = self._model_description.get("model_ability", [])

-        return (
-            XINFERENCE_TRANSFORMERS_ENABLE_BATCHING
-            and isinstance(self._model, PytorchModel)
-            and "vision" not in model_ability
+        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
+            self._model, PytorchModel
         )
+        if condition and "vision" in model_ability:
+            if (
+                self._model.model_family.model_name
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+                or self._model.model_family.model_family
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+            ):
+                return True
+            else:
+                logger.warning(
+                    f"Currently for multimodal models, "
+                    f"xinference only supports {', '.join(XINFERENCE_BATCHING_ALLOWED_VISION_MODELS)} for batching. "
+                    f"Your model {self._model.model_family.model_name} with model family {self._model.model_family.model_family} is disqualified."
+                )
+                return False
+        return condition

     async def load(self):
         self._model.load()
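The effect is that continuous batching stays opt-in: it requires the XINFERENCE_TRANSFORMERS_ENABLE_BATCHING switch plus a Transformers (PytorchModel) backend, and vision models must additionally be allow-listed. A condensed sketch of the decision rule (simplified from the method above; not the ModelActor code itself):

    ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]

    def allow_batching(batching_enabled: bool, is_pytorch_model: bool,
                       model_ability: list, name: str, family: str) -> bool:
        condition = batching_enabled and is_pytorch_model
        if condition and "vision" in model_ability:
            # multimodal models are batched only when explicitly allow-listed
            return name in ALLOWED_VISION_MODELS or family in ALLOWED_VISION_MODELS
        return condition
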
xinference/core/scheduler.py CHANGED
@@ -82,6 +82,8 @@ class InferenceRequest:
         # Record error message when this request has error.
         # Must set stopped=True when this field is set.
         self.error_msg: Optional[str] = None
+        # For compatibility. Record some extra parameters for some special cases.
+        self.extra_kwargs = {}

         # check the integrity of args passed upstream
         self._check_args()
xinference/core/worker.py CHANGED
@@ -73,6 +73,9 @@ class WorkerActor(xo.StatelessActor):
         self._main_pool.recover_sub_pool = self.recover_sub_pool

         # internal states.
+        # temporary placeholder during model launch process:
+        self._model_uid_launching_guard: Dict[str, bool] = {}
+        # attributes maintained after model launched:
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
@@ -594,10 +597,14 @@
         launch_args.pop("kwargs")
         launch_args.update(kwargs)

-        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        try:
+            origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        except Exception as e:
+            logger.exception(e)
+            raise
         try:
             await self._event_collector_ref.report_event(
-                event_model_uid,
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),
@@ -640,50 +647,55 @@
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)

-        subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
-        )
+        if self.get_model_launch_status(model_uid) is not None:
+            raise ValueError(f"{model_uid} is running")

         try:
-            origin_uid, _, _ = parse_replica_model_uid(model_uid)
-            model, model_description = await asyncio.to_thread(
-                create_model_instance,
-                subpool_address,
-                devices,
-                model_uid,
-                model_type,
-                model_name,
-                model_engine,
-                model_format,
-                model_size_in_billions,
-                quantization,
-                peft_model_config,
-                **kwargs,
-            )
-            await self.update_cache_status(model_name, model_description)
-            model_ref = await xo.create_actor(
-                ModelActor,
-                address=subpool_address,
-                uid=model_uid,
-                worker_address=self.address,
-                model=model,
-                model_description=model_description,
-                request_limits=request_limits,
+            self._model_uid_launching_guard[model_uid] = True
+            subpool_address, devices = await self._create_subpool(
+                model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
             )
-            await model_ref.load()
-        except:
-            logger.error(f"Failed to load model {model_uid}", exc_info=True)
-            self.release_devices(model_uid=model_uid)
-            await self._main_pool.remove_sub_pool(subpool_address)
-            raise

-        self._model_uid_to_model[model_uid] = model_ref
-        self._model_uid_to_model_spec[model_uid] = model_description
-        self._model_uid_to_addr[model_uid] = subpool_address
-        self._model_uid_to_recover_count.setdefault(
-            model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
-        )
-        self._model_uid_to_launch_args[model_uid] = launch_args
+            try:
+                model, model_description = await asyncio.to_thread(
+                    create_model_instance,
+                    subpool_address,
+                    devices,
+                    model_uid,
+                    model_type,
+                    model_name,
+                    model_engine,
+                    model_format,
+                    model_size_in_billions,
+                    quantization,
+                    peft_model_config,
+                    **kwargs,
+                )
+                await self.update_cache_status(model_name, model_description)
+                model_ref = await xo.create_actor(
+                    ModelActor,
+                    address=subpool_address,
+                    uid=model_uid,
+                    worker_address=self.address,
+                    model=model,
+                    model_description=model_description,
+                    request_limits=request_limits,
+                )
+                await model_ref.load()
+            except:
+                logger.error(f"Failed to load model {model_uid}", exc_info=True)
+                self.release_devices(model_uid=model_uid)
+                await self._main_pool.remove_sub_pool(subpool_address)
+                raise
+            self._model_uid_to_model[model_uid] = model_ref
+            self._model_uid_to_model_spec[model_uid] = model_description
+            self._model_uid_to_addr[model_uid] = subpool_address
+            self._model_uid_to_recover_count.setdefault(
+                model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
+            )
+            self._model_uid_to_launch_args[model_uid] = launch_args
+        finally:
+            del self._model_uid_launching_guard[model_uid]

         # update status to READY
         abilities = await self._get_model_ability(model, model_type)
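The launch path above wraps the whole subpool/model/actor setup in a launching guard: duplicate launches are rejected up front, and the guard entry is removed in a finally block whether the launch succeeds or fails. A condensed sketch of the pattern (names simplified; not the WorkerActor code itself):

    from typing import Dict

    class LaunchGuard:
        """Reject concurrent launches of the same uid and always clean up."""

        def __init__(self) -> None:
            self._launching: Dict[str, bool] = {}

        async def launch(self, uid: str, do_launch) -> None:
            if uid in self._launching:
                raise ValueError(f"{uid} is running")
            self._launching[uid] = True
            try:
                await do_launch(uid)  # may raise; guard is cleared either way
            finally:
                del self._launching[uid]
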
@@ -694,10 +706,13 @@

     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
-        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        # Terminate model while its launching is not allow
+        if model_uid in self._model_uid_launching_guard:
+            raise ValueError(f"{model_uid} is launching")
+        origin_uid, _, __ = parse_replica_model_uid(model_uid)
         try:
             await self._event_collector_ref.report_event(
-                event_model_uid,
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),
@@ -708,7 +723,6 @@
             # Report callback error can be log and ignore, should not interrupt the Process
             logger.error("report_event error: %s" % (e))

-        origin_uid, _, _ = parse_replica_model_uid(model_uid)
         await self._status_guard_ref.update_instance_info(
             origin_uid, {"status": LaunchStatus.TERMINATING.name}
         )
@@ -740,6 +754,21 @@
             origin_uid, {"status": LaunchStatus.TERMINATED.name}
         )

+    # Provide an interface for future version of supervisor to call
+    def get_model_launch_status(self, model_uid: str) -> Optional[str]:
+        """
+        returns:
+            CREATING: model is launching
+            RREADY: model is running
+            None: model is not running (launch error might have happened)
+        """
+
+        if model_uid in self._model_uid_launching_guard:
+            return LaunchStatus.CREATING.name
+        if model_uid in self._model_uid_to_model:
+            return LaunchStatus.READY.name
+        return None
+
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
         ret = {}
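get_model_launch_status gives callers a three-state probe over the two maps above. A hedged sketch of how a supervisor-side caller might use it (the actor handle and uid are illustrative):

    # status values map directly onto the worker's internal state:
    #   "CREATING" -> uid is in _model_uid_launching_guard
    #   "READY"    -> uid is in _model_uid_to_model
    #   None       -> unknown, failed, or terminated
    status = await worker_ref.get_model_launch_status("my-model-0")  # hypothetical replica uid
    if status is None:
        print("not running; safe to launch")
    elif status == "CREATING":
        print("still launching; terminate_model would raise")
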
xinference/deploy/utils.py CHANGED
@@ -79,6 +79,12 @@ def get_config_dict(
             "stream": "ext://sys.stderr",
             "filters": ["logger_name_filter"],
         },
+        "console_handler": {
+            "class": "logging.StreamHandler",
+            "formatter": "formatter",
+            "level": log_level,
+            "stream": "ext://sys.stderr",
+        },
         "file_handler": {
             "class": "logging.handlers.RotatingFileHandler",
             "formatter": "formatter",
@@ -95,7 +101,32 @@
             "handlers": ["stream_handler", "file_handler"],
             "level": log_level,
             "propagate": False,
-        }
+        },
+        "uvicorn": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.error": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.access": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "transformers": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "vllm": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
     },
     "root": {
         "level": "WARN",
@@ -127,7 +158,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
     while attempts < max_attempts:
         time.sleep(sleep_interval)
         try:
-            from xinference.core.supervisor import SupervisorActor
+            from ..core.supervisor import SupervisorActor

             supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
                 address=address, uid=SupervisorActor.uid()
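These logger entries matter because the dict returned by get_config_dict is standard logging.config input: routing "uvicorn", "transformers", and "vllm" through xinference's handlers with propagate set to False keeps third-party records in the same stream and rotating file as xinference's own. A minimal self-contained sketch of the mechanism (the formatter string is an assumption):

    import logging
    import logging.config

    config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "formatter": {"format": "%(asctime)s %(name)s %(levelname)s %(message)s"},
        },
        "handlers": {
            "console_handler": {
                "class": "logging.StreamHandler",
                "formatter": "formatter",
                "level": "INFO",
                "stream": "ext://sys.stderr",
            },
        },
        "loggers": {
            # captured here instead of propagating to the root logger
            "vllm": {"handlers": ["console_handler"], "level": "INFO", "propagate": False},
        },
    }
    logging.config.dictConfig(config)
    logging.getLogger("vllm").info("routed through the shared handler")
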
xinference/model/llm/__init__.py CHANGED
@@ -34,6 +34,7 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
@@ -42,6 +43,7 @@ from .llm_family import (
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
+    MLXLLMSpecV1,
     PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,
@@ -112,6 +114,7 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .mlx.core import MLXChatModel, MLXModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.cogvlm2 import CogVLM2Model
@@ -147,6 +150,7 @@
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -176,6 +180,7 @@
     SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
+    SUPPORTED_ENGINES["MLX"] = MLX_CLASSES

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
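Together with the new xinference/model/llm/mlx/core.py module and the mlx-format specs added to llm_family.json below, this registers MLX as a selectable engine. A hedged launch sketch via the REST client (endpoint and parameters are illustrative; MLX targets Apple-silicon hosts):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    uid = client.launch_model(
        model_name="qwen2-instruct",
        model_engine="MLX",            # the key registered in SUPPORTED_ENGINES
        model_format="mlx",
        model_size_in_billions="0_5",  # matches the "0_5" spec below
        quantization="4-bit",
    )
    model = client.get_model(uid)
    print(model.chat("Hello!"))
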
xinference/model/llm/llm_family.json CHANGED
@@ -944,7 +944,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
-        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+        "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
       }
     ],
     "prompt_style": {
@@ -2549,6 +2549,38 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2565,6 +2597,82 @@
         ],
         "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
         "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q5_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q5_k_m": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q6_k": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -2618,6 +2726,34 @@
           "Int4"
         ],
         "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -5809,6 +5945,16 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
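The stop_token_ids added above are the token-id counterparts of the string stops; for Qwen2-style vocabularies, 151643/151644/151645 correspond to <|endoftext|>, <|im_start|>, and <|im_end|>. A generic sketch of how a decode loop consumes both fields (illustrative; not the engine code):

    STOP_TOKEN_IDS = {151643, 151644, 151645}
    STOP_STRINGS = ("<|endoftext|>", "<|im_start|>", "<|im_end|>")

    def should_stop(next_token_id: int, decoded_tail: str) -> bool:
        # stop on a special token id, or when a stop string appears in the text
        return next_token_id in STOP_TOKEN_IDS or any(
            s in decoded_tail for s in STOP_STRINGS
        )
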
@@ -5997,6 +6143,99 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-2-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-9b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-27b-it"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-fp16"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-fp16"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,