xinference 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/client.py +18 -0
- xinference/constants.py +1 -0
- xinference/core/gradio.py +2 -2
- xinference/core/restful_api.py +31 -5
- xinference/core/supervisor.py +64 -1
- xinference/core/worker.py +22 -0
- xinference/deploy/cmdline.py +39 -13
- xinference/deploy/worker.py +2 -2
- xinference/model/llm/__init__.py +20 -83
- xinference/model/llm/ggml/llamacpp.py +1 -0
- xinference/model/llm/llm_family.json +30 -15
- xinference/model/llm/llm_family.py +152 -7
- xinference/model/llm/pytorch/core.py +63 -40
- xinference/model/llm/pytorch/utils.py +5 -1
- xinference/model/llm/utils.py +6 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/METADATA +133 -29
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/RECORD +22 -22
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/WHEEL +1 -1
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/LICENSE +0 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.1.1.dist-info → xinference-0.1.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2023-08-
+ "date": "2023-08-09T18:43:41+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.1.
+ "full-revisionid": "4d2f61cb6591ac94624f035b37259a89002abefd",
+ "version": "0.1.3"
 }
 ''' # END VERSION_JSON

xinference/client.py
CHANGED
@@ -480,6 +480,24 @@ class Client:
         # generate a time-based uuid.
         return str(uuid.uuid1())

+    def register_model(self, model_type: str, model: str, persist: bool):
+        coro = self._supervisor_ref.register_model(model_type, model, persist)
+        self._isolation.call(coro)
+
+    def unregister_model(self, model_type: str, model_name: str):
+        coro = self._supervisor_ref.unregister_model(model_type, model_name)
+        self._isolation.call(coro)
+
+    def list_model_registrations(self, model_type: str) -> List[Dict[str, Any]]:
+        coro = self._supervisor_ref.list_model_registrations(model_type)
+        return self._isolation.call(coro)
+
+    def get_model_registration(
+        self, model_type: str, model_name: str
+    ) -> Dict[str, Any]:
+        coro = self._supervisor_ref.get_model_registration(model_type, model_name)
+        return self._isolation.call(coro)
+
     def launch_model(
         self,
         model_name: str,
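For context, a rough sketch of how these new registration methods might be used. It assumes the client is constructed against a running supervisor address, as elsewhere in client.py, and the model description dict below is hypothetical; only field names that appear elsewhere in this diff are used, and the authoritative schema is `LLMFamilyV1` in llm_family.py:

```python
import json

from xinference.client import Client

# Placeholder supervisor address; adjust to your deployment.
client = Client("127.0.0.1:9999")

# Hypothetical custom model description. Field names mirror the entries in
# llm_family.json shown further down; the full schema is LLMFamilyV1.
custom_llm = {
    "model_name": "my-custom-llm",
    "model_ability": ["generate"],
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 7,
            "quantizations": ["4-bit", "8-bit", "none"],
            "model_id": "my-org/my-custom-llm",
        }
    ],
    "prompt_style": None,
}

# register_model takes the model description as a JSON string.
client.register_model(model_type="LLM", model=json.dumps(custom_llm), persist=True)

# Registrations are reported as dicts like {"model_name": ..., "is_builtin": ...}
# (see list_model_registrations in supervisor.py below).
print(client.list_model_registrations(model_type="LLM"))
print(client.get_model_registration(model_type="LLM", model_name="my-custom-llm"))

client.unregister_model(model_type="LLM", model_name="my-custom-llm")
```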
xinference/constants.py
CHANGED
@@ -17,6 +17,7 @@ from pathlib import Path

 XINFERENCE_HOME = str(Path.home() / ".xinference")
 XINFERENCE_CACHE_DIR = os.path.join(XINFERENCE_HOME, "cache")
+XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model")
 XINFERENCE_LOG_DIR = os.path.join(XINFERENCE_HOME, "logs")

 XINFERENCE_DEFAULT_LOCAL_HOST = "127.0.0.1"
xinference/core/gradio.py
CHANGED
@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional
 import gradio as gr

 from ..locale.utils import Locale
-from ..model.llm import
+from ..model.llm import BUILTIN_LLM_FAMILIES, LLMFamilyV1, match_llm
 from ..model.llm.llm_family import cache
 from .api import SyncSupervisorAPI

@@ -27,7 +27,7 @@ if TYPE_CHECKING:

 MODEL_TO_FAMILIES: Dict[str, LLMFamilyV1] = dict(
     (model_family.model_name, model_family)
-    for model_family in
+    for model_family in BUILTIN_LLM_FAMILIES
     if "chat" in model_family.model_ability
 )

xinference/core/restful_api.py
CHANGED
@@ -480,7 +480,7 @@ class RESTfulAPIActor(xo.Actor):
            (msg["content"] for msg in body.messages if msg["role"] == "system"), None
        )

-        chat_history = body.messages
+        chat_history = body.messages[:-1]  # exclude the prompt

        model_uid = body.model

@@ -494,6 +494,26 @@ class RESTfulAPIActor(xo.Actor):
            logger.error(e, exc_info=True)
            raise HTTPException(status_code=500, detail=str(e))

+        try:
+            desc = await self._supervisor_ref.describe_model(model_uid)
+
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            raise HTTPException(status_code=400, detail=str(ve))
+
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
+        is_chatglm_ggml = desc.get(
+            "model_format"
+        ) == "ggmlv3" and "chatglm" in desc.get("model_name", "")
+
+        if is_chatglm_ggml and system_prompt is not None:
+            raise HTTPException(
+                status_code=400, detail="ChatGLM ggml does not have system prompt"
+            )
+
        if body.stream:
            # create a pair of memory object streams
            send_chan, recv_chan = anyio.create_memory_object_stream(10)

@@ -501,9 +521,12 @@ class RESTfulAPIActor(xo.Actor):
            async def event_publisher(inner_send_chan: MemoryObjectSendStream):
                async with inner_send_chan:
                    try:
-
-                        prompt,
-
+                        if is_chatglm_ggml:
+                            iterator = await model.chat(prompt, chat_history, kwargs)
+                        else:
+                            iterator = await model.chat(
+                                prompt, system_prompt, chat_history, kwargs
+                            )
                        async for chunk in iterator:
                            await inner_send_chan.send(dict(data=json.dumps(chunk)))
                            if await request.is_disconnected():

@@ -525,7 +548,10 @@ class RESTfulAPIActor(xo.Actor):

        else:
            try:
-
+                if is_chatglm_ggml:
+                    return await model.chat(prompt, chat_history, kwargs)
+                else:
+                    return await model.chat(prompt, system_prompt, chat_history, kwargs)
            except Exception as e:
                logger.error(e, exc_info=True)
                raise HTTPException(status_code=500, detail=str(e))
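To make the new message handling concrete, here is a small self-contained sketch of how an OpenAI-style `messages` list is split by this handler. The extraction of `prompt` from the last message is an assumption based on surrounding code not shown in this diff:

```python
# Illustrative OpenAI-style chat request body.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi!"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "Tell me a joke."},
]

# System prompt: the first message with role "system", exactly as in the handler above.
system_prompt = next(
    (msg["content"] for msg in messages if msg["role"] == "system"), None
)

# 0.1.3 change: the history no longer includes the latest (prompt) message.
chat_history = messages[:-1]

# Assumption: the prompt is taken from the last message in the list.
prompt = messages[-1]["content"]

print(system_prompt)      # "You are a helpful assistant."
print(len(chat_history))  # 3
print(prompt)             # "Tell me a joke."
```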
xinference/core/supervisor.py
CHANGED
@@ -16,7 +16,7 @@ import asyncio
 import time
 from dataclasses import dataclass
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import xoscar as xo

@@ -74,6 +74,69 @@ class SupervisorActor(xo.Actor):

         raise RuntimeError("No available worker found")

+    @log_sync(logger=logger)
+    def list_model_registrations(self, model_type: str) -> List[Dict[str, Any]]:
+        if model_type == "LLM":
+            from ..model.llm import BUILTIN_LLM_FAMILIES, get_user_defined_llm_families
+
+            ret = [
+                {"model_name": f.model_name, "is_builtin": True}
+                for f in BUILTIN_LLM_FAMILIES
+            ]
+            user_defined_llm_families = get_user_defined_llm_families()
+            ret.extend(
+                [
+                    {"model_name": f.model_name, "is_builtin": False}
+                    for f in user_defined_llm_families
+                ]
+            )
+
+            return ret
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_sync(logger=logger)
+    def get_model_registration(
+        self, model_type: str, model_name: str
+    ) -> Dict[str, Any]:
+        if model_type == "LLM":
+            from ..model.llm import BUILTIN_LLM_FAMILIES, get_user_defined_llm_families
+
+            for f in BUILTIN_LLM_FAMILIES + get_user_defined_llm_families():
+                if f.model_name == model_name:
+                    return f
+
+            raise ValueError(f"Model {model_name} not found")
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_async(logger=logger)
+    async def register_model(self, model_type: str, model: str, persist: bool):
+        if model_type == "LLM":
+            from ..model.llm import LLMFamilyV1, register_llm
+
+            llm_family = LLMFamilyV1.parse_raw(model)
+            register_llm(llm_family, persist)
+
+            if not self.is_local_deployment:
+                for worker in self._worker_address_to_worker.values():
+                    await worker.register_model(model_type, model, persist)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_async(logger=logger)
+    async def unregister_model(self, model_type: str, model_name: str):
+        if model_type == "LLM":
+            from ..model.llm import unregister_llm
+
+            unregister_llm(model_name)
+
+            if not self.is_local_deployment:
+                for worker in self._worker_address_to_worker.values():
+                    await worker.unregister_model(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
     async def launch_builtin_model(
         self,
         model_uid: str,
xinference/core/worker.py
CHANGED
@@ -108,8 +108,30 @@ class WorkerActor(xo.Actor):
             "model_format": llm_spec.model_format,
             "model_size_in_billions": llm_spec.model_size_in_billions,
             "quantization": quantization,
+            "revision": llm_spec.model_revision,
         }

+    @log_sync(logger=logger)
+    async def register_model(self, model_type: str, model: str, persist: bool):
+        # TODO: centralized model registrations
+        if model_type == "LLM":
+            from ..model.llm import LLMFamilyV1, register_llm
+
+            llm_family = LLMFamilyV1.parse_raw(model)
+            register_llm(llm_family, persist)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_sync(logger=logger)
+    async def unregister_model(self, model_type: str, model_name: str):
+        # TODO: centralized model registrations
+        if model_type == "LLM":
+            from ..model.llm import unregister_llm
+
+            unregister_llm(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
     @log_async(logger=logger)
     async def launch_builtin_model(
         self,
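The `revision` field exposed here comes from the new `model_revision` entries in llm_family.json (shown below). How the revision is actually consumed is defined in llm_family.py, which is not rendered in this diff; as a hedged illustration only, pinning a Hugging Face download to such a revision looks roughly like this with `huggingface_hub` (an assumption, not necessarily what xinference does internally):

```python
from huggingface_hub import snapshot_download

# Values taken from the Baichuan-7B entry in llm_family.json below.
model_id = "baichuan-inc/Baichuan-7B"
model_revision = "c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756"

# Download (or reuse from cache) the model files at exactly this revision.
local_dir = snapshot_download(repo_id=model_id, revision=model_revision)
print(local_dir)
```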
xinference/deploy/cmdline.py
CHANGED
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import configparser
 import logging
 import os
+import sys
 from typing import Optional

 import click

@@ -30,6 +30,32 @@ from ..constants import (
 )


+def get_config_string(log_level: str) -> str:
+    return f"""
+[loggers]
+keys=root
+
+[handlers]
+keys=stream_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level={log_level.upper()}
+handlers=stream_handler
+
+[handler_stream_handler]
+class=StreamHandler
+formatter=formatter
+level={log_level.upper()}
+args=(sys.stderr,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s
+"""
+
+
 def get_endpoint(endpoint: Optional[str]) -> str:
     # user didn't specify the endpoint.
     if endpoint is None:

@@ -57,9 +83,10 @@ def cli(
     if ctx.invoked_subcommand is None:
         from .local import main

-
-
-        logging_conf
+        logging_conf = configparser.RawConfigParser()
+        logger_config_string = get_config_string(log_level)
+        logging_conf.read_string(logger_config_string)
+        logging.config.fileConfig(logging_conf)  # type: ignore

         address = f"{host}:{get_next_port()}"

@@ -102,9 +129,10 @@ def supervisor(
 def worker(log_level: str, endpoint: Optional[str], host: str):
     from ..deploy.worker import main

-
-
-    logging_conf
+    logging_conf = configparser.RawConfigParser()
+    logger_config_string = get_config_string(log_level)
+    logging_conf.read_string(logger_config_string)
+    logging.config.fileConfig(level=logging.getLevelName(log_level.upper()))  # type: ignore

     endpoint = get_endpoint(endpoint)

@@ -146,7 +174,7 @@ def model_launch(
         quantization=quantization,
     )

-    print(f"Model uid: {model_uid}")
+    print(f"Model uid: {model_uid}", file=sys.stderr)


 @cli.command("list")

@@ -157,18 +185,16 @@ def model_launch(
 )
 @click.option("--all", is_flag=True)
 def model_list(endpoint: Optional[str], all: bool):
-    import sys
-
     from tabulate import tabulate

     # TODO: get from the supervisor
-    from ..model.llm import
+    from ..model.llm import BUILTIN_LLM_FAMILIES

     endpoint = get_endpoint(endpoint)

     table = []
     if all:
-        for model_family in
+        for model_family in BUILTIN_LLM_FAMILIES:
             table.append(
                 [
                     model_family.model_name,
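A minimal standalone sketch of the logging setup these hunks introduce: `logging.config.fileConfig` accepts an already-parsed `RawConfigParser` instance as well as a file name, which is what the `cli` branch above relies on (the INI string below simply repeats the layout of the helper):

```python
import configparser
import logging
import logging.config

LOGGING_INI = """
[loggers]
keys=root

[handlers]
keys=stream_handler

[formatters]
keys=formatter

[logger_root]
level=DEBUG
handlers=stream_handler

[handler_stream_handler]
class=StreamHandler
formatter=formatter
level=DEBUG
args=(sys.stderr,)

[formatter_formatter]
format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s
"""

conf = configparser.RawConfigParser()
conf.read_string(LOGGING_INI)
# fileConfig accepts the parser object directly; the handler args are
# evaluated in the logging module's namespace, so sys.stderr resolves.
logging.config.fileConfig(conf)

logging.getLogger("xinference.demo").debug("stderr logging configured")
```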
xinference/deploy/worker.py
CHANGED
@@ -14,7 +14,7 @@

 import asyncio
 import logging
-from typing import Dict, Optional
+from typing import Any, Dict, Optional

 import xoscar as xo

@@ -53,7 +53,7 @@ async def _start_worker(
     await pool.join()


-def main(address: str, supervisor_address: str, logging_conf:
+def main(address: str, supervisor_address: str, logging_conf: Any = None):
     loop = asyncio.get_event_loop()
     task = loop.create_task(_start_worker(address, supervisor_address, logging_conf))

xinference/model/llm/__init__.py
CHANGED
@@ -12,98 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import codecs
 import json
-import logging
 import os
-import platform
-from typing import List, Optional, Tuple, Type

 from .core import LLM
 from .llm_family import (
+    BUILTIN_LLM_FAMILIES,
+    LLM_CLASSES,
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     PromptStyleV1,
     PytorchLLMSpecV1,
+    get_user_defined_llm_families,
+    match_llm,
+    match_llm_cls,
+    register_llm,
+    unregister_llm,
 )

-_LLM_CLASSES: List[Type[LLM]] = []
-
-LLM_FAMILIES: List["LLMFamilyV1"] = []
-
-logger = logging.getLogger(__name__)
-
-
-def _is_linux():
-    return platform.system() == "Linux"
-
-
-def _has_cuda_device():
-    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
-    if cuda_visible_devices:
-        return True
-    else:
-        from xorbits._mars.resource import cuda_count
-
-        return cuda_count() > 0
-
-
-def match_llm(
-    model_name: str,
-    model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
-    quantization: Optional[str] = None,
-    is_local_deployment: bool = False,
-) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
-    """
-    Find an LLM family, spec, and quantization that satisfy given criteria.
-    """
-    for family in LLM_FAMILIES:
-        if model_name != family.model_name:
-            continue
-        for spec in family.model_specs:
-            if (
-                model_format
-                and model_format != spec.model_format
-                or model_size_in_billions
-                and model_size_in_billions != spec.model_size_in_billions
-                or quantization
-                and quantization not in spec.quantizations
-            ):
-                continue
-            if quantization:
-                return family, spec, quantization
-            else:
-                # by default, choose the most coarse-grained quantization.
-                # TODO: too hacky.
-                quantizations = spec.quantizations
-                quantizations.sort()
-                for q in quantizations:
-                    if (
-                        is_local_deployment
-                        and not (_is_linux() and _has_cuda_device())
-                        and q == "4-bit"
-                    ):
-                        logger.warning(
-                            "Skipping %s for non-linux or non-cuda local deployment .",
-                            q,
-                        )
-                        continue
-                    return family, spec, q
-    return None
-
-
-def match_llm_cls(
-    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1"
-) -> Optional[Type[LLM]]:
-    """
-    Find an LLM implementation for given LLM family and spec.
-    """
-    for cls in _LLM_CLASSES:
-        if cls.match(llm_family, llm_spec):
-            return cls
-    return None
-

 def _install():
     from .ggml.chatglm import ChatglmCppChatModel

@@ -114,7 +42,7 @@ def _install():
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
     from .pytorch.vicuna import VicunaPytorchChatModel

-
+    LLM_CLASSES.extend(
         [
             ChatglmCppChatModel,
             LlamaCppModel,

@@ -132,5 +60,14 @@ def _install():
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
     )
-    for json_obj in json.load(open(json_path)):
-
+    for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
+        BUILTIN_LLM_FAMILIES.append(LLMFamilyV1.parse_obj(json_obj))
+
+    from ...constants import XINFERENCE_MODEL_DIR
+
+    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
+    if os.path.isdir(user_defined_llm_dir):
+        for f in os.listdir(user_defined_llm_dir):
+            with codecs.open(f, encoding="utf-8") as fd:
+                user_defined_llm_family = LLMFamilyV1.parse_obj(json.load(fd))
+                register_llm(user_defined_llm_family, persist=False)

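Per the loading logic above, user-defined families are read at import time from JSON files under `XINFERENCE_MODEL_DIR/llm` (i.e. `~/.xinference/model/llm`). A hypothetical sketch of dropping such a file in place as the on-disk equivalent of a persisted registration; field names are borrowed from llm_family.json below and the real schema is `LLMFamilyV1` in llm_family.py:

```python
import json
import os

from xinference.constants import XINFERENCE_MODEL_DIR

# Hypothetical user-defined LLM family; only fields visible elsewhere in this diff.
my_family = {
    "model_name": "my-org-llm",
    "model_ability": ["generate"],
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 13,
            "quantizations": ["none"],
            "model_id": "my-org/my-org-llm-13b",
        }
    ],
    "prompt_style": None,
}

user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
os.makedirs(user_defined_llm_dir, exist_ok=True)
path = os.path.join(user_defined_llm_dir, "my-org-llm.json")
with open(path, "w", encoding="utf-8") as fd:
    json.dump(my_family, fd)

# The loop in _install() above scans this directory the next time
# xinference.model.llm is imported and registers each family it finds.
```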
xinference/model/llm/ggml/llamacpp.py
CHANGED

@@ -139,6 +139,7 @@ class LlamaCppModel(LLM):
             llamacpp_model_config["n_gqa"] = 8

         if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
+            # TODO: platform.processor() is not safe, need to be replaced to other method.
             llamacpp_model_config.setdefault("n_gpu_layers", 1)
         elif self._is_linux() and self._can_apply_cublas():
             llamacpp_model_config.setdefault("n_gpu_layers", self._gpu_layers)
xinference/model/llm/llm_family.json
CHANGED

@@ -41,7 +41,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "baichuan-inc/Baichuan-7B"
+                "model_id": "baichuan-inc/Baichuan-7B",
+                "model_revision": "c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756"
             },
             {
                 "model_format": "pytorch",

@@ -51,7 +52,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "baichuan-inc/Baichuan-13B-Base"
+                "model_id": "baichuan-inc/Baichuan-13B-Base",
+                "model_revision": "0ef0739c7bdd34df954003ef76d80f3dabca2ff9"
             }
         ],
         "prompt_style": null

@@ -98,7 +100,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "baichuan-inc/Baichuan-13B-Chat"
+                "model_id": "baichuan-inc/Baichuan-13B-Chat",
+                "model_revision": "19ef51ba5bad8935b03acd20ff04a269210983bc"
             }
         ],
         "prompt_style": {

@@ -267,7 +270,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "lmsys/vicuna-33b-v1.3"
+                "model_id": "lmsys/vicuna-33b-v1.3",
+                "model_revision": "ef8d6becf883fb3ce52e3706885f761819477ab4"
             },
             {
                 "model_format": "pytorch",

@@ -277,7 +281,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "lmsys/vicuna-13b-v1.3"
+                "model_id": "lmsys/vicuna-13b-v1.3",
+                "model_revision": "6566e9cb1787585d1147dcf4f9bc48f29e1328d2"
             },
             {
                 "model_format": "pytorch",

@@ -287,7 +292,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "lmsys/vicuna-7b-v1.3"
+                "model_id": "lmsys/vicuna-7b-v1.3",
+                "model_revision": "236eeeab96f0dc2e463f2bebb7bb49809279c6d6"
             }
         ],
         "prompt_style": {

@@ -395,7 +401,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "THUDM/chatglm-6b"
+                "model_id": "THUDM/chatglm-6b",
+                "model_revision": "b1502f4f75c71499a3d566b14463edd62620ce9f"
             }
         ],
         "prompt_style": {

@@ -441,7 +448,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "THUDM/chatglm2-6b"
+                "model_id": "THUDM/chatglm2-6b",
+                "model_revision": "b1502f4f75c71499a3d566b14463edd62620ce9f"
             }
         ],
         "prompt_style": {

@@ -474,7 +482,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "THUDM/chatglm2-6b-32k"
+                "model_id": "THUDM/chatglm2-6b-32k",
+                "model_revision": "455746d4706479a1cbbd07179db39eb2741dc692"
             }
         ],
         "prompt_style": {

@@ -643,7 +652,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "facebook/opt-125m"
+                "model_id": "facebook/opt-125m",
+                "model_revision": "3d2b5f275bdf882b8775f902e1bfdb790e2cfc32"
             }
         ],
         "prompt_style": null

@@ -667,7 +677,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "tiiuae/falcon-40b"
+                "model_id": "tiiuae/falcon-40b",
+                "model_revision": "561820f7eef0cc56a31ea38af15ca1acb07fab5d"
             },
             {
                 "model_format": "pytorch",

@@ -677,7 +688,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "tiiuae/falcon-7b"
+                "model_id": "tiiuae/falcon-7b",
+                "model_revision": "378337427557d1df3e742264a2901a49f25d4eb1"
             }
         ],
         "prompt_style": null

@@ -701,7 +713,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "tiiuae/falcon-7b-instruct"
+                "model_id": "tiiuae/falcon-7b-instruct",
+                "model_revision": "eb410fb6ffa9028e97adb801f0d6ec46d02f8b07"
             },
             {
                 "model_format": "pytorch",

@@ -711,7 +724,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "tiiuae/falcon-40b-instruct"
+                "model_id": "tiiuae/falcon-40b-instruct",
+                "model_revision": "ca78eac0ed45bf64445ff0687fabba1598daebf3"
             }
         ],
         "prompt_style": {

@@ -759,7 +773,8 @@
                     "8-bit",
                     "none"
                 ],
-                "model_id": "Qwen/Qwen-7B-Chat"
+                "model_id": "Qwen/Qwen-7B-Chat",
+                "model_revision": "5c611a5cde5769440581f91e8b4bba050f62b1af"
             }
         ],
         "prompt_style": {