xinference 0.16.1__py3-none-any.whl → 0.16.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/conftest.py +0 -8
- xinference/constants.py +2 -0
- xinference/core/model.py +34 -2
- xinference/core/supervisor.py +5 -5
- xinference/core/utils.py +9 -10
- xinference/core/worker.py +8 -5
- xinference/deploy/cmdline.py +5 -0
- xinference/deploy/utils.py +7 -4
- xinference/model/audio/core.py +6 -2
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/core.py +3 -1
- xinference/model/embedding/core.py +6 -2
- xinference/model/image/core.py +6 -2
- xinference/model/image/ocr/got_ocr2.py +3 -0
- xinference/model/llm/__init__.py +33 -0
- xinference/model/llm/core.py +4 -4
- xinference/model/llm/llm_family.json +87 -0
- xinference/model/llm/llm_family.py +68 -2
- xinference/model/llm/llm_family_modelscope.json +91 -0
- xinference/model/llm/llm_family_openmind_hub.json +1359 -0
- xinference/model/llm/vllm/core.py +2 -1
- xinference/model/rerank/core.py +9 -1
- xinference/model/utils.py +7 -0
- xinference/model/video/core.py +6 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.b76aeeb7.js → main.2f269bb3.js} +3 -3
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +1 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/METADATA +5 -4
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/RECORD +37 -36
- xinference/web/ui/build/static/js/main.b76aeeb7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/32ea2c04cf0bba2761b4883d2c40cc259952c94d2d6bb774e510963ca37aac0a.json +0 -1
- /xinference/web/ui/build/static/js/{main.b76aeeb7.js.LICENSE.txt → main.2f269bb3.js.LICENSE.txt} +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/LICENSE +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/WHEEL +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-
+ "date": "2024-11-07T16:55:36+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.16.1"
+ "full-revisionid": "85ab86bf1c0967e45fbec995534cd5a0c9a9c439",
+ "version": "0.16.3"
 }
 ''' # END VERSION_JSON

xinference/conftest.py
CHANGED
@@ -58,10 +58,6 @@ TEST_LOGGING_CONF = {
             "propagate": False,
         }
     },
-    "root": {
-        "level": "WARN",
-        "handlers": ["stream_handler"],
-    },
 }

 TEST_LOG_FILE_PATH = get_log_file(f"test_{get_timestamp_ms()}")
@@ -102,10 +98,6 @@ TEST_FILE_LOGGING_CONF = {
             "propagate": False,
         }
     },
-    "root": {
-        "level": "WARN",
-        "handlers": ["stream_handler", "file_handler"],
-    },
 }

xinference/constants.py
CHANGED
@@ -39,6 +39,7 @@ def get_xinference_home() -> str:
         # if user has already set `XINFERENCE_HOME` env, change huggingface and modelscope default download path
         os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface")
         os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope")
+        os.environ["XDG_CACHE_HOME"] = os.path.join(home_path, "openmind_hub")
         # In multi-tenant mode,
         # gradio's temporary files are stored in their respective home directories,
         # to prevent insufficient permissions
@@ -86,3 +87,4 @@ XINFERENCE_DOWNLOAD_MAX_ATTEMPTS = int(
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
     XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
 )
+XINFERENCE_LAUNCH_MODEL_RETRY = 3
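
The new `XDG_CACHE_HOME` line follows the pattern of the Hugging Face and ModelScope lines above it: point each hub client's default cache inside the Xinference home. A minimal sketch of the XDG fallback this presumably relies on (the helper and paths below are illustrative, not from the codebase; the assumption is that openmind_hub derives its cache directory from `XDG_CACHE_HOME` like other XDG-aware clients):

    import os

    def resolve_cache_dir(app_name: str) -> str:
        # XDG convention: honor XDG_CACHE_HOME if set, else fall back to ~/.cache
        base = os.environ.get(
            "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")
        )
        return os.path.join(base, app_name)

    # After constants.py runs with XINFERENCE_HOME set, caches resolve under it:
    os.environ["XDG_CACHE_HOME"] = "/data/xinference/openmind_hub"  # hypothetical home
    print(resolve_cache_dir("openmind"))  # /data/xinference/openmind_hub/openmind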
xinference/core/model.py
CHANGED
@@ -40,7 +40,10 @@ from typing import (
 import sse_starlette.sse
 import xoscar as xo

-from ..constants import XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
+from ..constants import (
+    XINFERENCE_LAUNCH_MODEL_RETRY,
+    XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE,
+)

 if TYPE_CHECKING:
     from .progress_tracker import ProgressTrackerActor
@@ -134,6 +137,8 @@ def oom_check(fn):


 class ModelActor(xo.StatelessActor):
+    _replica_model_uid: Optional[str]
+
     @classmethod
     def gen_uid(cls, model: "LLM"):
         return f"{model.__class__}-model-actor"
@@ -192,6 +197,7 @@ class ModelActor(xo.StatelessActor):
         supervisor_address: str,
         worker_address: str,
         model: "LLM",
+        replica_model_uid: str,
         model_description: Optional["ModelDescription"] = None,
         request_limits: Optional[int] = None,
     ):
@@ -203,6 +209,7 @@ class ModelActor(xo.StatelessActor):

         self._supervisor_address = supervisor_address
         self._worker_address = worker_address
+        self._replica_model_uid = replica_model_uid
         self._model = model
         self._model_description = (
             model_description.to_dict() if model_description else {}
@@ -257,6 +264,9 @@ class ModelActor(xo.StatelessActor):
                 uid=FluxBatchSchedulerActor.gen_uid(self.model_uid()),
             )

+    def __repr__(self) -> str:
+        return f"ModelActor({self._replica_model_uid})"
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -374,7 +384,28 @@ class ModelActor(xo.StatelessActor):
         return condition

     async def load(self):
-        self._model.load()
+        try:
+            # Change process title for model
+            import setproctitle
+
+            setproctitle.setproctitle(f"Model: {self._replica_model_uid}")
+        except ImportError:
+            pass
+        i = 0
+        while True:
+            i += 1
+            try:
+                self._model.load()
+                break
+            except Exception as e:
+                if (
+                    i < XINFERENCE_LAUNCH_MODEL_RETRY
+                    and str(e).find("busy or unavailable") >= 0
+                ):
+                    await asyncio.sleep(5)
+                    logger.warning("Retry to load model {model_uid}: %d times", i)
+                    continue
+                raise
         if self.allow_batching():
             await self._scheduler_ref.set_model(self._model)
             logger.debug(
@@ -385,6 +416,7 @@ class ModelActor(xo.StatelessActor):
         logger.debug(
             f"Batching enabled for model: {self.model_uid()}, max_num_images: {self._model.get_max_num_images_for_batching()}"
         )
+        logger.info(f"{self} loaded")

     def model_uid(self):
         return (
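
The rewritten `load()` now retries transient failures: up to `XINFERENCE_LAUNCH_MODEL_RETRY` (3) attempts with a five-second sleep in between, and only when the error message contains "busy or unavailable"; anything else still raises immediately. Note the shipped warning logs the literal text `{model_uid}` because the string is not an f-string. A standalone sketch of the same pattern, not the shipped method (`model` here is any object with a blocking `.load()`, and the placeholder is filled in):

    import asyncio
    import logging

    logger = logging.getLogger(__name__)
    LAUNCH_MODEL_RETRY = 3  # mirrors XINFERENCE_LAUNCH_MODEL_RETRY

    async def load_with_retry(model, replica_model_uid: str) -> None:
        attempt = 0
        while True:
            attempt += 1
            try:
                model.load()
                break
            except Exception as e:
                # Retry only errors that look transient, a bounded number of times;
                # everything else propagates to the caller on the first attempt.
                if attempt < LAUNCH_MODEL_RETRY and "busy or unavailable" in str(e):
                    await asyncio.sleep(5)
                    logger.warning(
                        "Retry to load model %s: %d times", replica_model_uid, attempt
                    )
                    continue
                raise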
xinference/core/supervisor.py
CHANGED
@@ -970,7 +970,7 @@ class SupervisorActor(xo.StatelessActor):
                 raise ValueError(
                     f"Model is already in the model list, uid: {_replica_model_uid}"
                 )
-            replica_gpu_idx = assign_replica_gpu(_replica_model_uid, gpu_idx)
+            replica_gpu_idx = assign_replica_gpu(_replica_model_uid, replica, gpu_idx)
             nonlocal model_type

             worker_ref = (
@@ -1084,7 +1084,7 @@ class SupervisorActor(xo.StatelessActor):
                 dead_models,
             )
             for replica_model_uid in dead_models:
-                model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+                model_uid, _ = parse_replica_model_uid(replica_model_uid)
                 self._model_uid_to_replica_info.pop(model_uid, None)
                 self._replica_model_uid_to_worker.pop(
                     replica_model_uid, None
@@ -1137,7 +1137,7 @@ class SupervisorActor(xo.StatelessActor):
            raise ValueError(f"Model not found in the model list, uid: {model_uid}")

         replica_model_uid = build_replica_model_uid(
-            model_uid, replica_info.replica, next(replica_info.scheduler)
+            model_uid, next(replica_info.scheduler)
         )

         worker_ref = self._replica_model_uid_to_worker.get(replica_model_uid, None)
@@ -1154,7 +1154,7 @@ class SupervisorActor(xo.StatelessActor):
         raise ValueError(f"Model not found in the model list, uid: {model_uid}")
         # Use rep id 0 to instead of next(replica_info.scheduler) to avoid
         # consuming the generator.
-        replica_model_uid = build_replica_model_uid(model_uid, replica_info.replica, 0)
+        replica_model_uid = build_replica_model_uid(model_uid, 0)
         worker_ref = self._replica_model_uid_to_worker.get(replica_model_uid, None)
         if worker_ref is None:
             raise ValueError(
@@ -1260,7 +1260,7 @@ class SupervisorActor(xo.StatelessActor):
             uids_to_remove.append(model_uid)

         for replica_model_uid in uids_to_remove:
-            model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+            model_uid, _ = parse_replica_model_uid(replica_model_uid)
             self._model_uid_to_replica_info.pop(model_uid, None)
             self._replica_model_uid_to_worker.pop(replica_model_uid, None)

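
`next(replica_info.scheduler)` picks the replica for each `get_model` call round-robin, which is also why the second hunk pins rep id 0 rather than peeking: reading the scheduler would consume a slot. A sketch of the behavior (assumption: the real `ReplicaInfo` wraps something like `itertools.cycle` over rep ids; this mirrors the observable behavior, not the actual class):

    import itertools
    from dataclasses import dataclass, field
    from typing import Iterator

    @dataclass
    class ReplicaInfo:
        replica: int
        scheduler: Iterator[int] = field(init=False)

        def __post_init__(self):
            # round-robin over rep ids 0..replica-1
            self.scheduler = itertools.cycle(range(self.replica))

    info = ReplicaInfo(replica=3)
    print([next(info.scheduler) for _ in range(5)])  # [0, 1, 2, 0, 1]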
xinference/core/utils.py
CHANGED
@@ -146,27 +146,26 @@ def iter_replica_model_uid(model_uid: str, replica: int) -> Generator[str, None, None]:
     """
     replica = int(replica)
     for rep_id in range(replica):
-        yield f"{model_uid}-{replica}-{rep_id}"
+        yield f"{model_uid}-{rep_id}"


-def build_replica_model_uid(model_uid: str, replica: int, rep_id: int) -> str:
+def build_replica_model_uid(model_uid: str, rep_id: int) -> str:
     """
     Build a replica model uid.
     """
-    return f"{model_uid}-{replica}-{rep_id}"
+    return f"{model_uid}-{rep_id}"


-def parse_replica_model_uid(replica_model_uid: str) -> Tuple[str, int, int]:
+def parse_replica_model_uid(replica_model_uid: str) -> Tuple[str, int]:
     """
-    Parse replica model uid to model uid, replica and rep id.
+    Parse replica model uid to model uid and rep id.
     """
     parts = replica_model_uid.split("-")
     if len(parts) == 1:
-        return replica_model_uid, -1, -1
+        return replica_model_uid, -1
     rep_id = int(parts.pop())
-    replica = int(parts.pop())
     model_uid = "-".join(parts)
-    return model_uid, replica, rep_id
+    return model_uid, rep_id


 def is_valid_model_uid(model_uid: str) -> bool:
@@ -261,9 +260,9 @@ def get_nvidia_gpu_info() -> Dict:


 def assign_replica_gpu(
-    _replica_model_uid: str, gpu_idx: Union[int, List[int]]
+    _replica_model_uid: str, replica: int, gpu_idx: Union[int, List[int]]
 ) -> List[int]:
-    model_uid, replica, rep_id = parse_replica_model_uid(_replica_model_uid)
+    model_uid, rep_id = parse_replica_model_uid(_replica_model_uid)
     rep_id, replica = int(rep_id), int(replica)
     if isinstance(gpu_idx, int):
         gpu_idx = [gpu_idx]
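
Taken together, these hunks drop the middle `replica` segment from replica uids: `uid-replica-repid` becomes `uid-repid`, and `build_replica_model_uid`, `parse_replica_model_uid`, and `assign_replica_gpu` lose or gain a parameter to match. A round-trip using the functions as they appear in this diff:

    from xinference.core.utils import build_replica_model_uid, parse_replica_model_uid

    rep_uid = build_replica_model_uid("qwen2-instruct", 1)
    print(rep_uid)                           # qwen2-instruct-1
    print(parse_replica_model_uid(rep_uid))  # ('qwen2-instruct', 1)

    # Dashes inside the model uid survive: parsing only pops the trailing integer.
    print(parse_replica_model_uid("my-model-0"))  # ('my-model', 0)

One caveat visible in the code: a bare uid whose last dash-separated part is numeric (say `gpt-4`) would parse as `('gpt', 4)`, since the `-1` fallback only triggers when the uid contains no dash at all.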
xinference/core/worker.py
CHANGED
@@ -157,7 +157,7 @@ class WorkerActor(xo.StatelessActor):
                 model_uid,
                 recover_count - 1,
             )
-            event_model_uid, _, _ = parse_replica_model_uid(model_uid)
+            event_model_uid, _ = parse_replica_model_uid(model_uid)
             try:
                 if self._event_collector_ref is not None:
                     await self._event_collector_ref.report_event(
@@ -377,7 +377,7 @@ class WorkerActor(xo.StatelessActor):
         return len(self._model_uid_to_model)

     async def is_model_vllm_backend(self, model_uid: str) -> bool:
-        _model_uid, _, _ = parse_replica_model_uid(model_uid)
+        _model_uid, _ = parse_replica_model_uid(model_uid)
         supervisor_ref = await self.get_supervisor_ref()
         model_ref = await supervisor_ref.get_model(_model_uid)
         return await model_ref.is_vllm_backend()
@@ -785,7 +785,9 @@ class WorkerActor(xo.StatelessActor):
         peft_model_config: Optional[PeftModelConfig] = None,
         request_limits: Optional[int] = None,
         gpu_idx: Optional[Union[int, List[int]]] = None,
-        download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+        download_hub: Optional[
+            Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+        ] = None,
         model_path: Optional[str] = None,
         **kwargs,
     ):
@@ -798,7 +800,7 @@ class WorkerActor(xo.StatelessActor):
         launch_args.update(kwargs)

         try:
-            origin_uid, _, _ = parse_replica_model_uid(model_uid)
+            origin_uid, _ = parse_replica_model_uid(model_uid)
         except Exception as e:
             logger.exception(e)
             raise
@@ -887,6 +889,7 @@ class WorkerActor(xo.StatelessActor):
                 uid=model_uid,
                 supervisor_address=self._supervisor_address,
                 worker_address=self.address,
+                replica_model_uid=model_uid,
                 model=model,
                 model_description=model_description,
                 request_limits=request_limits,
@@ -924,7 +927,7 @@ class WorkerActor(xo.StatelessActor):
         # Terminate model while its launching is not allow
         if model_uid in self._model_uid_launching_guard:
             raise ValueError(f"{model_uid} is launching")
-        origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        origin_uid, _ = parse_replica_model_uid(model_uid)
         try:
             _ = await self.get_supervisor_ref()
             if self._event_collector_ref is not None:
xinference/deploy/cmdline.py
CHANGED
@@ -43,6 +43,7 @@ from .utils import (
     get_log_file,
     get_timestamp_ms,
     handle_click_args_type,
+    set_envs,
 )

 try:
@@ -106,6 +107,8 @@ def start_local_cluster(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    # refer to https://huggingface.co/docs/transformers/main_classes/logging
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     main(
         host=host,
@@ -280,6 +283,7 @@ def supervisor(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     main(
         host=host,
@@ -342,6 +346,7 @@ def worker(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     endpoint = get_endpoint(endpoint)

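
All three entry points (local cluster, supervisor, worker) now propagate the CLI `--log-level` to transformers via the `TRANSFORMERS_VERBOSITY` environment variable, which transformers reads when its logging module is first used. A quick check of the effect (assuming transformers is installed; `10` is `logging.DEBUG`):

    import os

    # what the CLI does before spawning subprocesses, for --log-level debug
    os.environ["TRANSFORMERS_VERBOSITY"] = "debug"

    import transformers

    print(transformers.logging.get_verbosity())  # 10 == logging.DEBUG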
xinference/deploy/utils.py
CHANGED
@@ -134,10 +134,6 @@ def get_config_dict(
                 "propagate": False,
             },
         },
-        "root": {
-            "level": "WARN",
-            "handlers": ["stream_handler", "file_handler"],
-        },
     }
     return config_dict

@@ -220,3 +216,10 @@ def handle_click_args_type(arg: str) -> Any:
         pass

     return arg
+
+
+def set_envs(key: str, value: str):
+    """
+    Environment variables are set by the parent process and inherited by child processes
+    """
+    os.environ[key] = value
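
`set_envs` is a thin wrapper over `os.environ`; its point is the docstring's: setting the variable in the parent CLI process means every child it spawns (supervisor, worker, model subprocesses) inherits it. Usage, as the cmdline changes above employ it:

    from xinference.deploy.utils import set_envs

    # refer to https://huggingface.co/docs/transformers/main_classes/logging
    set_envs("TRANSFORMERS_VERBOSITY", "debug")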
xinference/model/audio/core.py
CHANGED
@@ -100,7 +100,9 @@ def generate_audio_description(

 def match_audio(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> AudioModelFamilyV1:
     from ..utils import download_from_modelscope
     from . import BUILTIN_AUDIO_MODELS, MODELSCOPE_AUDIO_MODELS
@@ -152,7 +154,9 @@ def create_audio_model_instance(
     devices: List[str],
     model_uid: str,
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
xinference/model/audio/model_spec.json
CHANGED
@@ -127,7 +127,7 @@
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
-    "model_revision": "
+    "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": "text-to-audio",
     "multilingual": true
   },
xinference/model/core.py
CHANGED
@@ -55,7 +55,9 @@ def create_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
xinference/model/embedding/core.py
CHANGED
@@ -433,7 +433,9 @@ class EmbeddingModel:

 def match_embedding(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> EmbeddingModelSpec:
     from ..utils import download_from_modelscope
     from . import BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS
@@ -469,7 +471,9 @@ def create_embedding_model_instance(
     devices: List[str],
     model_uid: str,
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[EmbeddingModel, EmbeddingModelDescription]:
xinference/model/image/core.py
CHANGED
@@ -125,7 +125,9 @@ def generate_image_description(

 def match_diffusion(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> ImageModelFamilyV1:
     from ..utils import download_from_modelscope
     from . import BUILTIN_IMAGE_MODELS, MODELSCOPE_IMAGE_MODELS
@@ -213,7 +215,9 @@ def create_image_model_instance(
     model_uid: str,
     model_name: str,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
xinference/model/image/ocr/got_ocr2.py
CHANGED
@@ -71,6 +71,9 @@ class GotOCR2Model:
         logger.info("Got OCR 2.0 kwargs: %s", kwargs)
         if "ocr_type" not in kwargs:
             kwargs["ocr_type"] = "ocr"
+        if image.mode == "RGBA" or image.mode == "CMYK":
+            # convert to RGB
+            image = image.convert("RGB")
         assert self._model is not None
         # This chat API limits the max new tokens inside.
         return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
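
The GOT-OCR2 fix normalizes 4-channel (RGBA) and CMYK inputs to 3-channel RGB before inference, since the model expects RGB input. The same normalization standalone with Pillow (`scan.png` is a hypothetical input; note that `convert("RGB")` simply drops the alpha channel rather than compositing it):

    from PIL import Image

    image = Image.open("scan.png")  # hypothetical input file
    if image.mode in ("RGBA", "CMYK"):
        image = image.convert("RGB")
    print(image.mode)  # RGB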
xinference/model/llm/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from .llm_family import (
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
+    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -258,6 +259,36 @@ def _install():
         if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)

+    openmind_hub_json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family_openmind_hub.json"
+    )
+    for json_obj in json.load(
+        codecs.open(openmind_hub_json_path, "r", encoding="utf-8")
+    ):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES.append(model_spec)
+
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.chat_template, str)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
     csghub_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
     )
@@ -288,6 +319,7 @@ def _install():
     for llm_specs in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for llm_spec in llm_specs:
@@ -298,6 +330,7 @@ def _install():
     for families in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for family in families:
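
The comment "if duplicated with huggingface json, keep it as the huggingface style" is implemented by the `not in BUILTIN_LLM_PROMPT_STYLE` guard: registration is first-writer-wins, and the Hugging Face JSON is loaded before the openMind Hub one. A minimal sketch of that guard in isolation (the dict here is a local stand-in for the module-level registry, and the template strings are made up):

    BUILTIN_LLM_PROMPT_STYLE = {}

    def register_prompt_style(name: str, style: dict) -> None:
        if name not in BUILTIN_LLM_PROMPT_STYLE:  # keep the earlier (HF) entry
            BUILTIN_LLM_PROMPT_STYLE[name] = style

    register_prompt_style("qwen2-instruct", {"chat_template": "hf-template"})
    register_prompt_style("qwen2-instruct", {"chat_template": "openmind-template"})
    print(BUILTIN_LLM_PROMPT_STYLE["qwen2-instruct"]["chat_template"])  # hf-template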
xinference/model/llm/core.py
CHANGED
@@ -52,9 +52,7 @@ class LLM(abc.ABC):
         *args,
         **kwargs,
     ):
-        self.model_uid, self.replica, self.rep_id = parse_replica_model_uid(
-            replica_model_uid
-        )
+        self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
         self.model_family = model_family
         self.model_spec = model_spec
         self.quantization = quantization
@@ -193,7 +191,9 @@ def create_llm_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
xinference/model/llm/llm_family.json
CHANGED
@@ -1312,6 +1312,93 @@
       "<|eom_id|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "llama-3.2-vision-instruct",
+    "model_lang": [
+      "en",
+      "de",
+      "fr",
+      "it",
+      "pt",
+      "hi",
+      "es",
+      "th"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 11,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Llama-3.2-11B-Vision-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 90,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Llama-3.2-90B-Vision-Instruct"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{ '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+    "stop_token_ids": [
+      128001,
+      128008,
+      128009
+    ],
+    "stop": [
+      "<|end_of_text|>",
+      "<|eot_id|>",
+      "<|eom_id|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "llama-3.2-vision",
+    "model_lang": [
+      "en",
+      "de",
+      "fr",
+      "it",
+      "pt",
+      "hi",
+      "es",
+      "th"
+    ],
+    "model_ability": [
+      "generate",
+      "vision"
+    ],
+    "model_description": "The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 11,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3.2-11B-Vision"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 90,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3.2-90B-Vision"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 2048,