xinference 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/client.py +18 -0
- xinference/constants.py +1 -0
- xinference/core/gradio.py +2 -2
- xinference/core/restful_api.py +31 -5
- xinference/core/supervisor.py +64 -1
- xinference/core/worker.py +22 -0
- xinference/deploy/cmdline.py +39 -13
- xinference/deploy/worker.py +2 -2
- xinference/model/llm/__init__.py +20 -83
- xinference/model/llm/ggml/llamacpp.py +1 -0
- xinference/model/llm/llm_family.json +30 -15
- xinference/model/llm/llm_family.py +152 -7
- xinference/model/llm/pytorch/core.py +63 -40
- xinference/model/llm/pytorch/utils.py +5 -1
- xinference/model/llm/utils.py +6 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/METADATA +133 -29
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/RECORD +22 -22
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/WHEEL +1 -1
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/LICENSE +0 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py CHANGED

@@ -14,12 +14,15 @@

 import logging
 import os
-
+import platform
+from threading import Lock
+from typing import List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, Field
 from typing_extensions import Annotated, Literal

-from
+from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from . import LLM

 logger = logging.getLogger(__name__)

@@ -30,7 +33,8 @@ class GgmlLLMSpecV1(BaseModel):
     quantizations: List[str]
     model_id: str
     model_file_name_template: str
-
+    model_uri: Optional[str]
+    model_revision: Optional[str]


 class PytorchLLMSpecV1(BaseModel):
@@ -38,7 +42,8 @@ class PytorchLLMSpecV1(BaseModel):
     model_size_in_billions: int
     quantizations: List[str]
     model_id: str
-
+    model_uri: Optional[str]
+    model_revision: Optional[str]


 class PromptStyleV1(BaseModel):
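Both spec models gain optional `model_uri` and `model_revision` fields. As an illustration only (not an entry shipped in `llm_family.json`), a pytorch spec using the new fields could look like the sketch below; the repository id and revision are hypothetical placeholders:

```python
# Hypothetical spec entry showing the new optional fields; all values are
# placeholders for illustration, not taken from the package.
pytorch_spec = {
    "model_format": "pytorch",
    "model_size_in_billions": 7,
    "quantizations": ["4-bit", "8-bit", "none"],
    "model_id": "example-org/example-7b",  # hypothetical Hugging Face repo
    "model_revision": "0123abcd",          # hypothetical commit to pin the download
    "model_uri": None,                     # reserved; cache_from_uri is not implemented yet
}
```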
@@ -68,7 +73,14 @@ LLMSpecV1 = Annotated[

 LLMFamilyV1.update_forward_refs()

-
+
+LLM_CLASSES: List[Type[LLM]] = []
+
+BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
+
+UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
+
+UD_LLM_FAMILIES_LOCK = Lock()


 def get_legacy_cache_path(
@@ -96,7 +108,18 @@ def cache(
         logger.debug("Legacy cache path exists: %s", legacy_cache_path)
         return os.path.dirname(legacy_cache_path)
     else:
-
+        if llm_spec.model_uri is not None:
+            return cache_from_uri(llm_family, llm_spec, quantization)
+        else:
+            return cache_from_huggingface(llm_family, llm_spec, quantization)
+
+
+def cache_from_uri(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    raise NotImplementedError


 def cache_from_huggingface(
@@ -110,7 +133,7 @@ def cache_from_huggingface(
     import huggingface_hub

     cache_dir_name = f"{llm_family.model_name}-{llm_spec.model_format}-{llm_spec.model_size_in_billions}b"
-    cache_dir = os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name)
+    cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir, exist_ok=True)

@@ -118,6 +141,7 @@ def cache_from_huggingface(
         assert isinstance(llm_spec, PytorchLLMSpecV1)
         huggingface_hub.snapshot_download(
             llm_spec.model_id,
+            revision=llm_spec.model_revision,
             local_dir=cache_dir,
             local_dir_use_symlinks=True,
         )
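As an aside from the diff itself: for pytorch-format models, the cache step boils down to a revision-pinned `huggingface_hub.snapshot_download` into the local cache directory. A minimal standalone sketch, where the repo id, revision, and target directory are hypothetical placeholders:

```python
# Standalone sketch of the call added above; repo id, revision, and target
# directory are hypothetical placeholders, not values from the package.
import huggingface_hub

cache_dir = "/tmp/xinference/cache/example-7b-pytorch-7b"  # hypothetical
huggingface_hub.snapshot_download(
    "example-org/example-7b",  # llm_spec.model_id (hypothetical)
    revision="0123abcd",       # llm_spec.model_revision; None means the latest revision
    local_dir=cache_dir,
    local_dir_use_symlinks=True,
)
```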
@@ -126,9 +150,130 @@ def cache_from_huggingface(
         file_name = llm_spec.model_file_name_template.format(quantization=quantization)
         huggingface_hub.hf_hub_download(
             llm_spec.model_id,
+            revision=llm_spec.model_revision,
             filename=file_name,
             local_dir=cache_dir,
             local_dir_use_symlinks=True,
         )

     return cache_dir
+
+
+def _is_linux():
+    return platform.system() == "Linux"
+
+
+def _has_cuda_device():
+    # `cuda_count` method already contains the logic for the
+    # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
+    from xorbits._mars.resource import cuda_count
+
+    return cuda_count() > 0
+
+
+def get_user_defined_llm_families():
+    with UD_LLM_FAMILIES_LOCK:
+        return UD_LLM_FAMILIES.copy()
+
+
+def match_llm(
+    model_name: str,
+    model_format: Optional[str] = None,
+    model_size_in_billions: Optional[int] = None,
+    quantization: Optional[str] = None,
+    is_local_deployment: bool = False,
+) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
+    """
+    Find an LLM family, spec, and quantization that satisfy given criteria.
+    """
+    user_defined_llm_families = get_user_defined_llm_families()
+
+    for family in BUILTIN_LLM_FAMILIES + user_defined_llm_families:
+        if model_name != family.model_name:
+            continue
+        for spec in family.model_specs:
+            if (
+                model_format
+                and model_format != spec.model_format
+                or model_size_in_billions
+                and model_size_in_billions != spec.model_size_in_billions
+                or quantization
+                and quantization not in spec.quantizations
+            ):
+                continue
+            if quantization:
+                return family, spec, quantization
+            else:
+                # by default, choose the most coarse-grained quantization.
+                # TODO: too hacky.
+                quantizations = spec.quantizations
+                quantizations.sort()
+                for q in quantizations:
+                    if (
+                        is_local_deployment
+                        and not (_is_linux() and _has_cuda_device())
+                        and q == "4-bit"
+                    ):
+                        logger.warning(
+                            "Skipping %s for non-linux or non-cuda local deployment .",
+                            q,
+                        )
+                        continue
+                    return family, spec, q
+    return None
+
+
+def register_llm(llm_family: LLMFamilyV1, persist: bool):
+    from .utils import is_valid_model_name
+
+    if not is_valid_model_name(llm_family.model_name):
+        raise ValueError(
+            f"Invalid model name {llm_family.model_name}. The model name must start with a letter"
+            f" or a digit, and can only contain letters, digits, underscores, or dashes."
+        )
+
+    with UD_LLM_FAMILIES_LOCK:
+        for family in BUILTIN_LLM_FAMILIES + UD_LLM_FAMILIES:
+            if llm_family.model_name == family.model_name:
+                raise ValueError(
+                    f"Model name conflicts with existing model {family.model_name}"
+                )
+
+        UD_LLM_FAMILIES.append(llm_family)
+
+    if persist:
+        persist_path = os.path.join(
+            XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
+        )
+        os.makedirs(os.path.dirname(persist_path), exist_ok=True)
+        with open(persist_path, mode="w") as fd:
+            fd.write(llm_family.json())
+
+
+def unregister_llm(model_name: str):
+    with UD_LLM_FAMILIES_LOCK:
+        llm_family = None
+        for i, f in enumerate(UD_LLM_FAMILIES):
+            if f.model_name == model_name:
+                llm_family = f
+                break
+        if llm_family:
+            UD_LLM_FAMILIES.remove(llm_family)
+
+            persist_path = os.path.join(
+                XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
+            )
+            if os.path.exists(persist_path):
+                os.remove(persist_path)
+        else:
+            raise ValueError(f"Model {model_name} not found")
+
+
+def match_llm_cls(family: LLMFamilyV1, llm_spec: "LLMSpecV1") -> Optional[Type[LLM]]:
+    """
+    Find an LLM implementation for given LLM family and spec.
+    """
+    for cls in LLM_CLASSES:
+        if cls.match(family, llm_spec):
+            return cls
+    return None
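A hedged usage sketch (not from the package's own tests) of how these helpers fit together: parse a family definition with the pydantic v1 API (the metadata now pins `pydantic <2`), register it, then resolve a family/spec/quantization and an implementation class. The field values mirror the `nsql-2B` template shown in the README section further below.

```python
# Hedged usage sketch: register a user-defined family, then resolve it.
# The dict mirrors the README's nsql-2B template; it is not a builtin entry.
from xinference.model.llm.llm_family import (
    LLMFamilyV1,
    match_llm,
    match_llm_cls,
    register_llm,
)

family = LLMFamilyV1.parse_obj(
    {
        "version": 1,
        "model_name": "nsql-2B",
        "model_lang": ["en"],
        "model_ability": ["generate"],
        "model_specs": [
            {
                "model_format": "pytorch",
                "model_size_in_billions": 2,
                "quantizations": ["4-bit", "8-bit", "none"],
                "model_id": "NumbersStation/nsql-2B",
            }
        ],
        "prompt_style": None,
    }
)
register_llm(family, persist=False)

matched = match_llm("nsql-2B", model_format="pytorch", quantization="none")
if matched is not None:
    llm_family, llm_spec, quantization = matched
    # May be None if no registered backend (e.g. torch) matches this spec.
    llm_cls = match_llm_cls(llm_family, llm_spec)
```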
xinference/model/llm/pytorch/core.py CHANGED

@@ -47,7 +47,7 @@ class PytorchGenerateConfig(TypedDict, total=False):


 class PytorchModelConfig(TypedDict, total=False):
-    revision: str
+    revision: Optional[str]
     device: str
     gpus: Optional[str]
     num_gpus: int
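Since `PytorchModelConfig` is a `TypedDict` with `total=False`, callers can pass a plain dict with only the keys they care about; anything omitted falls back to the defaults set in `_sanitize_model_config` below (notably `device` now defaults to `"auto"`). A minimal sketch, with the class body trimmed to the fields visible in this hunk:

```python
# Minimal sketch; the class is trimmed to the fields visible in this diff hunk.
from typing import Optional
from typing_extensions import TypedDict


class PytorchModelConfig(TypedDict, total=False):
    revision: Optional[str]
    device: str
    gpus: Optional[str]
    num_gpus: int


# Only override what you need, e.g. force MPS and pin a (hypothetical) revision.
config: PytorchModelConfig = {"device": "mps", "revision": "main"}
```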
@@ -79,17 +79,14 @@ class PytorchModel(LLM):
     ) -> PytorchModelConfig:
         if pytorch_model_config is None:
             pytorch_model_config = PytorchModelConfig()
-        pytorch_model_config.setdefault("revision",
+        pytorch_model_config.setdefault("revision", self.model_spec.model_revision)
         pytorch_model_config.setdefault("gpus", None)
         pytorch_model_config.setdefault("num_gpus", 1)
         pytorch_model_config.setdefault("gptq_ckpt", None)
         pytorch_model_config.setdefault("gptq_wbits", 16)
         pytorch_model_config.setdefault("gptq_groupsize", -1)
         pytorch_model_config.setdefault("gptq_act_order", False)
-
-            pytorch_model_config.setdefault("device", "mps")
-        else:
-            pytorch_model_config.setdefault("device", "cuda")
+        pytorch_model_config.setdefault("device", "auto")
         return pytorch_model_config

     def _sanitize_generate_config(
@@ -142,26 +139,35 @@ class PytorchModel(LLM):

         quantization = self.quantization
         num_gpus = self._pytorch_model_config.get("num_gpus", 1)
-
-
-
-        device = self._pytorch_model_config.get("device", "cuda")
+        device = self._pytorch_model_config.get("device", "auto")
+        self._pytorch_model_config["device"] = self._select_device(device)
+        self._device = self._pytorch_model_config["device"]

-        if
+        if self._device == "cpu":
             kwargs = {"torch_dtype": torch.float32}
-        elif
+        elif self._device == "cuda":
             kwargs = {"torch_dtype": torch.float16}
-        elif
+        elif self._device == "mps":
             kwargs = {"torch_dtype": torch.float16}
         else:
-            raise ValueError(f"Device {
-
+            raise ValueError(f"Device {self._device} is not supported in temporary")
+
+        kwargs["revision"] = self._pytorch_model_config.get(
+            "revision", self.model_spec.model_revision
+        )

         if quantization != "none":
-            if
+            if self._device == "cuda" and self._is_linux():
                 kwargs["device_map"] = "auto"
                 if quantization == "4-bit":
                     kwargs["load_in_4bit"] = True
+                    kwargs["bnb_4bit_compute_dtype"] = torch.float16
+                    kwargs["bnb_4bit_use_double_quant"] = True
+                    kwargs["llm_int8_skip_modules"] = [
+                        "lm_head",
+                        "encoder",
+                        "EncDecAttention",
+                    ]
                 elif quantization == "8-bit":
                     kwargs["load_in_8bit"] = True
                 else:
@@ -178,7 +184,7 @@ class PytorchModel(LLM):
            else:
                self._model, self._tokenizer = load_compress_model(
                    model_path=self.model_path,
-                    device=
+                    device=self._device,
                    torch_dtype=kwargs["torch_dtype"],
                    use_fast=self._use_fast_tokenizer,
                    revision=kwargs["revision"],
@@ -189,11 +195,37 @@ class PytorchModel(LLM):
         self._model, self._tokenizer = self._load_model(kwargs)

         if (
-
-        ) or
-            self._model.to(
+            self._device == "cuda" and num_gpus == 1 and quantization == "none"
+        ) or self._device == "mps":
+            self._model.to(self._device)
         logger.debug(f"Model Memory: {self._model.get_memory_footprint()}")

+    def _select_device(self, device: str) -> str:
+        try:
+            import torch
+        except ImportError:
+            raise ImportError(
+                f"Failed to import module 'torch'. Please make sure 'torch' is installed.\n\n"
+            )
+
+        if device == "auto":
+            if torch.cuda.is_available():
+                return "cuda"
+            elif torch.backends.mps.is_available():
+                return "mps"
+            return "cpu"
+        elif device == "cuda":
+            if not torch.cuda.is_available():
+                raise ValueError("cuda is unavailable in your environment")
+        elif device == "mps":
+            if not torch.backends.mps.is_available():
+                raise ValueError("mps is unavailable in your environment")
+        elif device == "cpu":
+            pass
+        else:
+            raise ValueError(f"Device {device} is not supported in temporary")
+        return device
+
     @classmethod
     def match(cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1") -> bool:
         if llm_spec.model_format != "pytorch":
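The new `"auto"` default is resolved to a concrete device at load time. A standalone, simplified rewrite of that decision logic (not the library's own API), which can be handy for checking what a given machine will pick; it assumes torch is installed:

```python
# Simplified, standalone version of the device resolution above; assumes torch
# is installed. Not the library's own API.
import torch


def resolve_device(device: str = "auto") -> str:
    if device == "auto":
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        raise ValueError("cuda is unavailable in your environment")
    if device == "mps" and not torch.backends.mps.is_available():
        raise ValueError("mps is unavailable in your environment")
    if device not in ("cuda", "mps", "cpu"):
        raise ValueError(f"Device {device} is not supported")
    return device


print(resolve_device())  # e.g. "cuda" on a CUDA box, "mps" on Apple Silicon
```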
@@ -222,21 +254,21 @@ class PytorchModel(LLM):
         )

         def generator_wrapper(
-            prompt: str,
+            prompt: str, generate_config: PytorchGenerateConfig
         ) -> Iterator[CompletionChunk]:
             if "falcon" in self.model_family.model_name:
                 for completion_chunk, _ in generate_stream_falcon(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     yield completion_chunk
             elif "chatglm" in self.model_family.model_name:
                 for completion_chunk, _ in generate_stream_chatglm(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     yield completion_chunk
             else:
                 for completion_chunk, _ in generate_stream(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     yield completion_chunk

@@ -250,24 +282,20 @@ class PytorchModel(LLM):
         assert self._tokenizer is not None

         stream = generate_config.get("stream", False)
-        if self._is_darwin_and_apple_silicon():
-            device = self._pytorch_model_config.get("device", "mps")
-        else:
-            device = self._pytorch_model_config.get("device", "cuda")
         if not stream:
             if "falcon" in self.model_family.model_name:
                 for completion_chunk, completion_usage in generate_stream_falcon(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     pass
             elif "chatglm" in self.model_family.model_name:
                 for completion_chunk, completion_usage in generate_stream_chatglm(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     pass
             else:
                 for completion_chunk, completion_usage in generate_stream(
-                    self._model, self._tokenizer, prompt,
+                    self._model, self._tokenizer, prompt, self._device, generate_config
                 ):
                     pass
             completion = Completion(
@@ -280,7 +308,7 @@ class PytorchModel(LLM):
             )
             return completion
         else:
-            return generator_wrapper(prompt,
+            return generator_wrapper(prompt, generate_config)

     def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
         try:
@@ -291,11 +319,6 @@ class PytorchModel(LLM):
                 "Could not import torch. Please install it with `pip install torch`."
             ) from e

-        if self._is_darwin_and_apple_silicon():
-            device = self._pytorch_model_config.get("device", "mps")
-        else:
-            device = self._pytorch_model_config.get("device", "cuda")
-
         if isinstance(input, str):
             inputs = [input]
         else:
@@ -308,8 +331,8 @@ class PytorchModel(LLM):
         encoding = tokenizer.batch_encode_plus(
             inputs, padding=True, return_tensors="pt"
         )
-        input_ids = encoding["input_ids"].to(
-        attention_mask = encoding["attention_mask"].to(
+        input_ids = encoding["input_ids"].to(self._device)
+        attention_mask = encoding["attention_mask"].to(self._device)
         model_output = self._model(
             input_ids, attention_mask, output_hidden_states=True
         )
@@ -342,7 +365,7 @@ class PytorchModel(LLM):
         embedding = []
         token_num = 0
         for index, text in enumerate(inputs):
-            input_ids = tokenizer.encode(text, return_tensors="pt").to(
+            input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
            model_output = self._model(input_ids, output_hidden_states=True)
            if is_chatglm:
                data = (model_output.hidden_states[-1].transpose(0, 1))[0]

xinference/model/llm/pytorch/utils.py CHANGED

@@ -104,7 +104,11 @@ def generate_stream(
         temperature, repetition_penalty, top_p, top_k
     )

-
+    if "qwen" in str(type(model)).lower():
+        # TODO: hacky
+        input_ids = tokenizer(prompt, allowed_special="all").input_ids
+    else:
+        input_ids = tokenizer(prompt).input_ids
     output_ids = list(input_ids)

     if model.config.is_encoder_decoder:
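The Qwen branch exists because Qwen's tiktoken-based tokenizer only accepts special tokens in the prompt when they are explicitly allowed. A simplified standalone rewrite of that branch (not the package's own helper; `model` and `tokenizer` are whatever transformers objects the caller loaded):

```python
# Simplified rewrite of the branch above, not the package's own helper.
def encode_prompt(model, tokenizer, prompt: str):
    if "qwen" in str(type(model)).lower():
        # Qwen's tokenizer needs allowed_special to accept special tokens in the prompt.
        return tokenizer(prompt, allowed_special="all").input_ids
    return tokenizer(prompt).input_ids
```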
xinference/model/llm/utils.py CHANGED

{xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 0.1.1
+Version: 0.1.3
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye
@@ -21,62 +21,62 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: xoscar
 Requires-Dist: xorbits
-Requires-Dist: gradio
+Requires-Dist: gradio >=3.35.0
 Requires-Dist: click
-Requires-Dist: tqdm
+Requires-Dist: tqdm >=4.27
 Requires-Dist: tabulate
 Requires-Dist: requests
-Requires-Dist: pydantic
+Requires-Dist: pydantic <2
 Requires-Dist: fastapi
 Requires-Dist: uvicorn
 Requires-Dist: sse-starlette
-Requires-Dist: huggingface-hub
+Requires-Dist: huggingface-hub <1.0,>=0.14.1
 Requires-Dist: typing-extensions
 Provides-Extra: all
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: transformers (>=4.31.0) ; extra == 'all'
+Requires-Dist: llama-cpp-python >=0.1.77 ; extra == 'all'
+Requires-Dist: transformers >=4.31.0 ; extra == 'all'
 Requires-Dist: torch ; extra == 'all'
-Requires-Dist: accelerate
+Requires-Dist: accelerate >=0.20.3 ; extra == 'all'
 Requires-Dist: sentencepiece ; extra == 'all'
 Requires-Dist: transformers-stream-generator ; extra == 'all'
 Requires-Dist: bitsandbytes ; extra == 'all'
 Requires-Dist: protobuf ; extra == 'all'
 Requires-Dist: einops ; extra == 'all'
+Requires-Dist: tiktoken ; extra == 'all'
 Provides-Extra: benchmark
 Requires-Dist: psutil ; extra == 'benchmark'
 Requires-Dist: pynvml ; extra == 'benchmark'
 Provides-Extra: dev
-Requires-Dist: cython
-Requires-Dist: pytest
-Requires-Dist: pytest-cov
-Requires-Dist: pytest-timeout
-Requires-Dist: pytest-forked
-Requires-Dist: pytest-asyncio
-Requires-Dist: ipython
-Requires-Dist: sphinx
-Requires-Dist: pydata-sphinx-theme
-Requires-Dist: sphinx-intl
-Requires-Dist: jieba
-Requires-Dist: flake8
+Requires-Dist: cython >=0.29 ; extra == 'dev'
+Requires-Dist: pytest >=3.5.0 ; extra == 'dev'
+Requires-Dist: pytest-cov >=2.5.0 ; extra == 'dev'
+Requires-Dist: pytest-timeout >=1.2.0 ; extra == 'dev'
+Requires-Dist: pytest-forked >=1.0 ; extra == 'dev'
+Requires-Dist: pytest-asyncio >=0.14.0 ; extra == 'dev'
+Requires-Dist: ipython >=6.5.0 ; extra == 'dev'
+Requires-Dist: sphinx <5.0.0,>=3.0.0 ; extra == 'dev'
+Requires-Dist: pydata-sphinx-theme >=0.3.0 ; extra == 'dev'
+Requires-Dist: sphinx-intl >=0.9.9 ; extra == 'dev'
+Requires-Dist: jieba >=0.42.0 ; extra == 'dev'
+Requires-Dist: flake8 >=3.8.0 ; extra == 'dev'
 Requires-Dist: black ; extra == 'dev'
 Provides-Extra: doc
-Requires-Dist: ipython
-Requires-Dist: sphinx
-Requires-Dist: pydata-sphinx-theme
-Requires-Dist: sphinx-intl
+Requires-Dist: ipython >=6.5.0 ; extra == 'doc'
+Requires-Dist: sphinx <5.0.0,>=3.0.0 ; extra == 'doc'
+Requires-Dist: pydata-sphinx-theme >=0.3.0 ; extra == 'doc'
+Requires-Dist: sphinx-intl >=0.9.9 ; extra == 'doc'
 Provides-Extra: ggml
-Requires-Dist:
-Requires-Dist: llama-cpp-python (>=0.1.77) ; extra == 'ggml'
+Requires-Dist: llama-cpp-python >=0.1.77 ; extra == 'ggml'
 Provides-Extra: pytorch
-Requires-Dist: transformers
+Requires-Dist: transformers >=4.31.0 ; extra == 'pytorch'
 Requires-Dist: torch ; extra == 'pytorch'
-Requires-Dist: accelerate
+Requires-Dist: accelerate >=0.20.3 ; extra == 'pytorch'
 Requires-Dist: sentencepiece ; extra == 'pytorch'
 Requires-Dist: transformers-stream-generator ; extra == 'pytorch'
 Requires-Dist: bitsandbytes ; extra == 'pytorch'
 Requires-Dist: protobuf ; extra == 'pytorch'
 Requires-Dist: einops ; extra == 'pytorch'
+Requires-Dist: tiktoken ; extra == 'pytorch'

 <div align="center">
 <img src="./assets/xorbits-logo.png" width="180px" alt="xorbits" />
@@ -290,6 +290,110 @@ $ xinference list --all
 - If you want to use Apple Metal GPU for acceleration, please choose the q4_0 and q4_1 quantization methods.
 - `llama-2-chat` 70B ggmlv3 model only supports q4_0 quantization currently.

+## Custom models \[Experimental\]
+Custom models are currently an experimental feature and are expected to be officially released in version v0.2.0.
+
+Define a custom model based on the following template:
+```python
+custom_model = {
+    "version": 1,
+    # model name. must start with a letter or a
+    # digit, and can only contain letters, digits,
+    # underscores, or dashes.
+    "model_name": "nsql-2B",
+    # supported languages
+    "model_lang": [
+        "en"
+    ],
+    # model abilities. could be "embed", "generate"
+    # and "chat".
+    "model_ability": [
+        "generate"
+    ],
+    # model specifications.
+    "model_specs": [
+        {
+            # model format.
+            "model_format": "pytorch",
+            "model_size_in_billions": 2,
+            # quantizations.
+            "quantizations": [
+                "4-bit",
+                "8-bit",
+                "none"
+            ],
+            # hugging face model ID.
+            "model_id": "NumbersStation/nsql-2B"
+        }
+    ],
+    # prompt style, required by chat models.
+    # for more details, see: xinference/model/llm/tests/test_utils.py
+    "prompt_style": None
+}
+```
+
+Register the custom model:
+```python
+import json
+
+from xinference.client import Client
+
+# replace with real xinference endpoint
+endpoint = "http://localhost:9997"
+client = Client(endpoint)
+client.register_model(model_type="LLM", model=json.dumps(custom_model), persist=False)
+```
+
+Load the custom model:
+```python
+uid = client.launch_model(model_name='nsql-2B')
+```
+
+Run the custom model:
+```python
+text = """CREATE TABLE work_orders (
+    ID NUMBER,
+    CREATED_AT TEXT,
+    COST FLOAT,
+    INVOICE_AMOUNT FLOAT,
+    IS_DUE BOOLEAN,
+    IS_OPEN BOOLEAN,
+    IS_OVERDUE BOOLEAN,
+    COUNTRY_NAME TEXT,
+)
+
+-- Using valid SQLite, answer the following questions for the tables provided above.
+
+-- how many work orders are open?
+
+SELECT"""
+
+model = client.get_model(model_uid=uid)
+model.generate(prompt=text)
+```
+
+Result:
+```json
+{
+    "id":"aeb5c87a-352e-11ee-89ad-9af9f16816c5",
+    "object":"text_completion",
+    "created":1691418511,
+    "model":"3b912fc4-352e-11ee-8e66-9af9f16816c5",
+    "choices":[
+        {
+            "text":" COUNT(*) FROM work_orders WHERE IS_OPEN = '1';",
+            "index":0,
+            "logprobs":"None",
+            "finish_reason":"stop"
+        }
+    ],
+    "usage":{
+        "prompt_tokens":117,
+        "completion_tokens":17,
+        "total_tokens":134
+    }
+}
+```

 ## Pytorch Model Best Practices
