xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic by the registry.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/embedding/__init__.py
@@ -13,57 +13,99 @@
 # limitations under the License.

 import logging
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Type, Union

 if TYPE_CHECKING:
-    from .core import EmbeddingModel,
+    from .core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1

 FLAG_EMBEDDER_CLASSES: List[Type["EmbeddingModel"]] = []
 SENTENCE_TRANSFORMER_CLASSES: List[Type["EmbeddingModel"]] = []
 VLLM_CLASSES: List[Type["EmbeddingModel"]] = []
+LLAMA_CPP_CLASSES: List[Type["EmbeddingModel"]] = []

-BUILTIN_EMBEDDING_MODELS: Dict[str,
-MODELSCOPE_EMBEDDING_MODELS: Dict[str, Any] = {}
+BUILTIN_EMBEDDING_MODELS: Dict[str, "EmbeddingModelFamilyV2"] = {}

 logger = logging.getLogger(__name__)


-# Desc: this file used to manage embedding models information.
 def match_embedding(
     model_name: str,
+    model_format: Optional[str] = None,
+    quantization: Optional[str] = None,
     download_hub: Optional[
         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
     ] = None,
-) -> "
+) -> "EmbeddingModelFamilyV2":
     from ..utils import download_from_modelscope
-
-    # The model info has benn init by __init__.py with model_spec.json file
     from .custom import get_user_defined_embeddings

-
-
-
-
-
-    if download_hub == "modelscope" and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif download_hub == "huggingface" and model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
-    elif download_from_modelscope() and model_name in MODELSCOPE_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in ModelScope.")
-        return MODELSCOPE_EMBEDDING_MODELS[model_name]
-    elif model_name in BUILTIN_EMBEDDING_MODELS:
-        logger.debug(f"Embedding model {model_name} found in Huggingface.")
-        return BUILTIN_EMBEDDING_MODELS[model_name]
+    target_family = None
+
+    if model_name in BUILTIN_EMBEDDING_MODELS:
+        target_family = BUILTIN_EMBEDDING_MODELS[model_name]
     else:
+        for model_family in get_user_defined_embeddings():
+            if model_name == model_family.model_name:
+                target_family = model_family
+                break
+
+    if target_family is None:
         raise ValueError(
-            f"Embedding model {model_name} not found, available"
-            f"
-            f"ModelScope: {MODELSCOPE_EMBEDDING_MODELS.keys()}"
+            f"Embedding model {model_name} not found, available "
+            f"models: {BUILTIN_EMBEDDING_MODELS.keys()}"
         )

+    if download_hub == "modelscope" or download_from_modelscope():
+        specs = [
+            x for x in target_family.model_specs if x.model_hub == "modelscope"
+        ] + [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+    else:
+        specs = [x for x in target_family.model_specs if x.model_hub == "huggingface"]
+
+    def _match_quantization(q: Union[str, None], _quantization: str):
+        # Currently, the quantization name could include both uppercase and lowercase letters,
+        # so it is necessary to ensure that the case sensitivity does not
+        # affect the matching results.
+        if q is None:
+            return None
+        return _quantization if q.lower() == _quantization.lower() else None
+
+    def _apply_format_to_model_id(
+        _spec: "EmbeddingSpecV1", q: str
+    ) -> "EmbeddingSpecV1":
+        # Different quantized versions of some models use different model ids,
+        # Here we check the `{}` in the model id to format the id.
+        if _spec.model_id and "{" in _spec.model_id:
+            _spec.model_id = _spec.model_id.format(quantization=q)
+        return _spec
+
+    for spec in specs:
+        matched_quantization = _match_quantization(quantization, spec.quantization)
+        if (
+            model_format
+            and model_format != spec.model_format
+            or quantization
+            and matched_quantization is None
+        ):
+            continue
+        # Copy spec to avoid _apply_format_to_model_id modify the original spec.
+        spec = spec.copy()
+        _family = target_family.copy()
+        if quantization:
+            _family.model_specs = [
+                _apply_format_to_model_id(spec, matched_quantization)
+            ]
+            return _family
+        else:
+            # TODO: If user does not specify quantization, just use the first one
+            _q = "none" if spec.model_format == "pytorch" else spec.quantization
+            _family.model_specs = [_apply_format_to_model_id(spec, _q)]
+            return _family
+
+    raise ValueError(
+        f"Embedding model {model_name} with format {model_format} and quantization {quantization} not found."
+    )
+

 # { embedding model name -> { engine name -> engine params } }
 EMBEDDING_ENGINES: Dict[str, Dict[str, List[Dict[str, Type["EmbeddingModel"]]]]] = {}
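Reading the hunk above: match_embedding now resolves a single family and narrows family.model_specs down to the one spec matching the requested format and quantization, instead of returning per-hub dicts. A minimal sketch of how the new signature might be exercised; the model name and quantization label are placeholders, not values taken from this diff:

from xinference.model.embedding import match_embedding

# Placeholder family name and quantization label; any built-in embedding
# family and one of its quantizations would do here.
family = match_embedding(
    "bge-large-zh-v1.5",
    model_format="ggufv2",
    quantization="Q4_K_M",
)
spec = family.model_specs[0]  # narrowed to the single matching spec
print(spec.model_hub, spec.model_format, spec.model_id)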
@@ -71,8 +113,10 @@ SUPPORTED_ENGINES: Dict[str, List[Type["EmbeddingModel"]]] = {}


 def check_engine_by_model_name_and_engine(
-    model_name: str,
     model_engine: str,
+    model_name: str,
+    model_format: Optional[str],
+    quantization: Optional[str],
 ) -> Type["EmbeddingModel"]:
     def get_model_engine_from_spell(engine_str: str) -> str:
         for engine in EMBEDDING_ENGINES[model_name].keys():
@@ -87,6 +131,11 @@ def check_engine_by_model_name_and_engine(
         raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
     match_params = EMBEDDING_ENGINES[model_name][model_engine]
     for param in match_params:
-        if model_name
-
+        if model_name != param["model_name"]:
+            continue
+        if (model_format and model_format != param["model_format"]) or (
+            quantization and quantization != param["quantization"]
+        ):
+            continue
+        return param["embedding_class"]
     raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
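Engine dispatch for embeddings now filters on format and quantization as well. A sketch of calling the updated helper; the engine key and model name are illustrative assumptions, and passing None leaves that dimension unfiltered, per the loop above:

from xinference.model.embedding import check_engine_by_model_name_and_engine

embedding_cls = check_engine_by_model_name_and_engine(
    "llama.cpp",          # model_engine (illustrative key)
    "bge-large-zh-v1.5",  # model_name (placeholder)
    "ggufv2",             # model_format; None would skip this filter
    None,                 # quantization; None matches any
)
print(embedding_cls.__name__)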
xinference/model/embedding/flag/core.py
@@ -30,7 +30,7 @@ except ImportError:

 from ....device_utils import get_available_device
 from ....types import Embedding, EmbeddingData, EmbeddingUsage
-from ..core import EmbeddingModel,
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1

 FLAG_EMBEDDER_MODEL_LIST = support_native_bge_model_list() if flag_installed else []
 logger = logging.getLogger(__name__)
@@ -41,12 +41,20 @@ class FlagEmbeddingModel(EmbeddingModel):
         self,
         model_uid: str,
         model_path: str,
-
+        model_family: EmbeddingModelFamilyV2,
+        quantization: Optional[str] = None,
         device: Optional[str] = None,
         return_sparse: bool = False,
         **kwargs,
     ):
-        super().__init__(
+        super().__init__(
+            model_uid,
+            model_path,
+            model_family,
+            quantization,
+            device,
+            **kwargs,
+        )
         self._return_sparse = return_sparse

     def load(self):
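For context on the constructor change: engine backends now forward a model family and a quantization to the base class instead of a bare spec. A hypothetical subclass showing the positional order FlagEmbeddingModel uses above (every name other than EmbeddingModel is made up for illustration):

from typing import Optional

from xinference.model.embedding.core import EmbeddingModel

class MyEngineEmbeddingModel(EmbeddingModel):
    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_family,  # an EmbeddingModelFamilyV2
        quantization: Optional[str] = None,
        device: Optional[str] = None,
        **kwargs,
    ):
        # Forward positionally in the same order FlagEmbeddingModel does.
        super().__init__(
            model_uid, model_path, model_family, quantization, device, **kwargs
        )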
@@ -276,7 +284,15 @@ class FlagEmbeddingModel(EmbeddingModel):
         return importlib.util.find_spec("FlagEmbedding") is not None

     @classmethod
-    def match_json(
-
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if (
+            model_spec.model_format in ["pytorch"]
+            and model_family.model_name in FLAG_EMBEDDER_MODEL_LIST
+        ):
             return True
         return False
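Taken together with check_lib, match_json lets each engine declare which (family, spec, quantization) triples it can serve. A simplified illustration of how the per-engine class registries could be scanned; the actual selection in this release goes through embed_family.py and check_engine_by_model_name_and_engine:

from xinference.model.embedding import (
    FLAG_EMBEDDER_CLASSES,
    LLAMA_CPP_CLASSES,
    SENTENCE_TRANSFORMER_CLASSES,
    VLLM_CLASSES,
)

def pick_embedding_class(family, spec, quantization):
    # First registered engine class whose library imports and whose
    # match_json accepts the spec wins.
    for cls in (
        FLAG_EMBEDDER_CLASSES
        + SENTENCE_TRANSFORMER_CLASSES
        + VLLM_CLASSES
        + LLAMA_CPP_CLASSES
    ):
        if cls.check_lib() and cls.match_json(family, spec, quantization):
            return cls
    raise ValueError("no installed engine matches this embedding spec")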
xinference/model/embedding/llama_cpp/__init__.py
File without changes
xinference/model/embedding/llama_cpp/core.py
@@ -0,0 +1,234 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import importlib.util
+import logging
+import os
+import platform
+import pprint
+import queue
+import sys
+from typing import List, Optional, Union
+
+import orjson
+
+from ....types import Embedding
+from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
+
+logger = logging.getLogger(__name__)
+
+
+class _Done:
+    pass
+
+
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
+class XllamaCppEmbeddingModel(EmbeddingModel):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+        llamacpp_model_config = self._kwargs.get("llamacpp_model_config")
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
+
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = {}
+
+        llamacpp_model_config.setdefault("embedding", True)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+
+        return llamacpp_model_config
+
+    def _is_darwin_and_apple_silicon(self):
+        return sys.platform == "darwin" and platform.processor() == "arm"
+
+    def _is_linux(self):
+        return sys.platform.startswith("linux")
+
+    def load(self):
+        try:
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                estimate_gpu_layers,
+                get_device_info,
+                ggml_backend_dev_type,
+                llama_pooling_type,
+            )
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        # handle legacy cache.
+        if (
+            self._model_spec.model_file_name_split_template
+            and self._quantization in self._model_spec.quantization_parts
+        ):
+            part = self._model_spec.quantization_parts[self._quantization]
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_split_template.format(
+                    quantization=self._quantization, part=part[0]
+                ),
+            )
+        else:
+            model_path = os.path.join(
+                self._model_path,
+                self._model_spec.model_file_name_template.format(
+                    quantization=self._quantization
+                ),
+            )
+
+        try:
+            params = CommonParams()
+            params.embedding = True
+            # Compatible with xllamacpp changes
+            try:
+                params.model = model_path
+            except Exception:
+                params.model.path = model_path
+
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = min(8, os.cpu_count() or 1)
+            params.pooling_type = llama_pooling_type.LLAMA_POOLING_TYPE_LAST
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    if "." in k:
+                        parts = k.split(".")
+                        sub_param = params
+                        for p in parts[:-1]:
+                            sub_param = getattr(sub_param, p)
+                        setattr(sub_param, parts[-1], v)
+                    else:
+                        setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+                try:
+                    device_info = get_device_info()
+                    gpus = [
+                        info
+                        for info in device_info
+                        if info["type"]
+                        == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                    ]
+                    if gpus:
+                        logger.info(
+                            "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                            params.n_ctx,
+                            params.n_batch,
+                            params.n_parallel,
+                            pprint.pformat(gpus),
+                        )
+                        estimate = estimate_gpu_layers(
+                            gpus=gpus,
+                            model_path=model_path,
+                            projectors=[],
+                            context_length=params.n_ctx,
+                            batch_size=params.n_batch,
+                            num_parallel=params.n_parallel,
+                            kv_cache_type="",
+                        )
+                        logger.info("Estimate num gpu layers: %s", estimate)
+                        if estimate.tensor_split:
+                            params.tensor_split = estimate.tensor_split
+                        else:
+                            params.n_gpu_layers = estimate.layers
+                except Exception as e:
+                    logger.exception(
+                        "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                    )
+
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self._model_name} failed")
+
+    def create_embedding(self, sentences: Union[str, List[str]], **kwargs) -> Embedding:
+        if self._llm is None:
+            raise RuntimeError("Model is not loaded.")
+
+        q: queue.Queue = queue.Queue()
+        if isinstance(sentences, str):
+            sentences = [sentences]
+
+        def _handle_embedding():
+            data = {"input": sentences}
+            prompt_json = orjson.dumps(data)
+
+            def _error_callback(err):
+                try:
+                    msg = orjson.loads(err)
+                    q.put(_Error(msg))
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            def _ok_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    q.put(res)
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            try:
+                self._llm.handle_embeddings(prompt_json, _error_callback, _ok_callback)
+            except Exception as ex:
+                q.put(_Error(str(ex)))
+            q.put(_Done)
+
+        assert self._executor
+        self._executor.submit(_handle_embedding)
+
+        r = q.get()
+        if type(r) is _Error:
+            raise Exception(f"Failed to create embedding: {r.msg}")
+        r["model_replica"] = self._model_uid
+        return Embedding(**r)  # type: ignore
+
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None
+
+    @classmethod
+    def match_json(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> bool:
+        if model_spec.model_format not in ["ggufv2"]:
+            return False
+        return True