xinference 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference has been flagged as potentially problematic; see the registry listing for details.
- {xinference-0.1.1/xinference.egg-info → xinference-0.1.3}/PKG-INFO +105 -1
- {xinference-0.1.1 → xinference-0.1.3}/README.md +104 -0
- {xinference-0.1.1 → xinference-0.1.3}/setup.cfg +2 -2
- {xinference-0.1.1 → xinference-0.1.3}/xinference/_version.py +3 -3
- {xinference-0.1.1 → xinference-0.1.3}/xinference/client.py +18 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/constants.py +1 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/gradio.py +2 -2
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/restful_api.py +31 -5
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/supervisor.py +64 -1
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/worker.py +22 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/cmdline.py +39 -13
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/worker.py +2 -2
- xinference-0.1.3/xinference/model/llm/__init__.py +73 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/ggml/llamacpp.py +1 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/llm_family.json +30 -15
- xinference-0.1.3/xinference/model/llm/llm_family.py +279 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/core.py +63 -40
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/utils.py +5 -1
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/utils.py +6 -0
- {xinference-0.1.1 → xinference-0.1.3/xinference.egg-info}/PKG-INFO +105 -1
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/requires.txt +2 -2
- xinference-0.1.1/xinference/model/llm/__init__.py +0 -136
- xinference-0.1.1/xinference/model/llm/llm_family.py +0 -134
- {xinference-0.1.1 → xinference-0.1.3}/LICENSE +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/MANIFEST.in +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/pyproject.toml +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/setup.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/versioneer.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/api.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/model.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/resource.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/core/utils.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/local.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/supervisor.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/test/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/utils.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/isolation.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/locale/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/locale/utils.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/locale/zh_CN.json +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/core.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/core.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/ggml/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/ggml/chatglm.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/__init__.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/baichuan.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/chatglm.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/compression.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/falcon.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/model/llm/pytorch/vicuna.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference/types.py +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/SOURCES.txt +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/dependency_links.txt +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/entry_points.txt +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/not-zip-safe +0 -0
- {xinference-0.1.1 → xinference-0.1.3}/xinference.egg-info/top_level.txt +0 -0
{xinference-0.1.1/xinference.egg-info → xinference-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 0.1.1
+Version: 0.1.3
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye
@@ -238,6 +238,110 @@ $ xinference list --all
 - If you want to use Apple Metal GPU for acceleration, please choose the q4_0 and q4_1 quantization methods.
 - `llama-2-chat` 70B ggmlv3 model only supports q4_0 quantization currently.
 
+## Custom models \[Experimental\]
+Custom models are currently an experimental feature and are expected to be officially released in version v0.2.0.
+
+Define a custom model based on the following template:
+```python
+custom_model = {
+  "version": 1,
+  # model name. must start with a letter or a
+  # digit, and can only contain letters, digits,
+  # underscores, or dashes.
+  "model_name": "nsql-2B",
+  # supported languages
+  "model_lang": [
+    "en"
+  ],
+  # model abilities. could be "embed", "generate"
+  # and "chat".
+  "model_ability": [
+    "generate"
+  ],
+  # model specifications.
+  "model_specs": [
+    {
+      # model format.
+      "model_format": "pytorch",
+      "model_size_in_billions": 2,
+      # quantizations.
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      # hugging face model ID.
+      "model_id": "NumbersStation/nsql-2B"
+    }
+  ],
+  # prompt style, required by chat models.
+  # for more details, see: xinference/model/llm/tests/test_utils.py
+  "prompt_style": None
+}
+```
+
+Register the custom model:
+```python
+import json
+
+from xinference.client import Client
+
+# replace with real xinference endpoint
+endpoint = "http://localhost:9997"
+client = Client(endpoint)
+client.register_model(model_type="LLM", model=json.dumps(custom_model), persist=False)
+```
+
+Load the custom model:
+```python
+uid = client.launch_model(model_name='nsql-2B')
+```
+
+Run the custom model:
+```python
+text = """CREATE TABLE work_orders (
+    ID NUMBER,
+    CREATED_AT TEXT,
+    COST FLOAT,
+    INVOICE_AMOUNT FLOAT,
+    IS_DUE BOOLEAN,
+    IS_OPEN BOOLEAN,
+    IS_OVERDUE BOOLEAN,
+    COUNTRY_NAME TEXT,
+)
+
+-- Using valid SQLite, answer the following questions for the tables provided above.
+
+-- how many work orders are open?
+
+SELECT"""
+
+model = client.get_model(model_uid=uid)
+model.generate(prompt=text)
+```
+
+Result:
+```json
+{
+  "id":"aeb5c87a-352e-11ee-89ad-9af9f16816c5",
+  "object":"text_completion",
+  "created":1691418511,
+  "model":"3b912fc4-352e-11ee-8e66-9af9f16816c5",
+  "choices":[
+    {
+      "text":" COUNT(*) FROM work_orders WHERE IS_OPEN = '1';",
+      "index":0,
+      "logprobs":"None",
+      "finish_reason":"stop"
+    }
+  ],
+  "usage":{
+    "prompt_tokens":117,
+    "completion_tokens":17,
+    "total_tokens":134
+  }
+}
+```
 
 ## Pytorch Model Best Practices
 
{xinference-0.1.1 → xinference-0.1.3}/README.md
@@ -210,6 +210,110 @@ $ xinference list --all
README.md gains the same "Custom models \[Experimental\]" section shown in the PKG-INFO hunk above (PKG-INFO embeds the README, so the added lines are identical).
{xinference-0.1.1 → xinference-0.1.3}/setup.cfg
@@ -60,7 +60,6 @@ dev =
     flake8>=3.8.0
     black
 all =
-    chatglm-cpp
     llama-cpp-python>=0.1.77
     transformers>=4.31.0
     torch
@@ -70,8 +69,8 @@ all =
     bitsandbytes
     protobuf
     einops
+    tiktoken
 ggml =
-    chatglm-cpp
     llama-cpp-python>=0.1.77
 pytorch =
     transformers>=4.31.0
@@ -82,6 +81,7 @@ pytorch =
     bitsandbytes
     protobuf
     einops
+    tiktoken
 doc =
     ipython>=6.5.0
     sphinx>=3.0.0,<5.0.0
{xinference-0.1.1 → xinference-0.1.3}/xinference/_version.py
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2023-08-
+ "date": "2023-08-09T18:43:41+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "0.1.1"
+ "full-revisionid": "4d2f61cb6591ac94624f035b37259a89002abefd",
+ "version": "0.1.3"
 }
 '''  # END VERSION_JSON
 
{xinference-0.1.1 → xinference-0.1.3}/xinference/client.py
@@ -480,6 +480,24 @@ class Client:
         # generate a time-based uuid.
         return str(uuid.uuid1())
 
+    def register_model(self, model_type: str, model: str, persist: bool):
+        coro = self._supervisor_ref.register_model(model_type, model, persist)
+        self._isolation.call(coro)
+
+    def unregister_model(self, model_type: str, model_name: str):
+        coro = self._supervisor_ref.unregister_model(model_type, model_name)
+        self._isolation.call(coro)
+
+    def list_model_registrations(self, model_type: str) -> List[Dict[str, Any]]:
+        coro = self._supervisor_ref.list_model_registrations(model_type)
+        return self._isolation.call(coro)
+
+    def get_model_registration(
+        self, model_type: str, model_name: str
+    ) -> Dict[str, Any]:
+        coro = self._supervisor_ref.get_model_registration(model_type, model_name)
+        return self._isolation.call(coro)
+
     def launch_model(
         self,
         model_name: str,
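Assuming the nsql-2B definition from the README section above has already been registered with `client.register_model` as shown there, the other new client methods can be exercised roughly as follows. A minimal sketch against a local endpoint; the endpoint URL is just an example, and the returned keys follow the supervisor changes further down in this diff:

```python
from xinference.client import Client

client = Client("http://localhost:9997")  # replace with a real xinference endpoint

# every built-in family plus any user-defined one, flagged accordingly
for reg in client.list_model_registrations(model_type="LLM"):
    print(reg["model_name"], reg["is_builtin"])

# fetch the full family definition for one registration
family = client.get_model_registration(model_type="LLM", model_name="nsql-2B")

# remove the user-defined family again
client.unregister_model(model_type="LLM", model_name="nsql-2B")
```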
{xinference-0.1.1 → xinference-0.1.3}/xinference/constants.py
@@ -17,6 +17,7 @@ from pathlib import Path
 
 XINFERENCE_HOME = str(Path.home() / ".xinference")
 XINFERENCE_CACHE_DIR = os.path.join(XINFERENCE_HOME, "cache")
+XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model")
 XINFERENCE_LOG_DIR = os.path.join(XINFERENCE_HOME, "logs")
 
 XINFERENCE_DEFAULT_LOCAL_HOST = "127.0.0.1"
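The new XINFERENCE_MODEL_DIR is where user-defined model definitions live on disk; the rewritten xinference/model/llm/__init__.py at the end of this diff scans its llm subdirectory at startup. A quick sketch of the paths these constants resolve to:

```python
import os
from pathlib import Path

XINFERENCE_HOME = str(Path.home() / ".xinference")
XINFERENCE_MODEL_DIR = os.path.join(XINFERENCE_HOME, "model")

# directory scanned for user-defined LLM family definitions
user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
print(user_defined_llm_dir)  # e.g. ~/.xinference/model/llm
```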
{xinference-0.1.1 → xinference-0.1.3}/xinference/core/gradio.py
@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional
 import gradio as gr
 
 from ..locale.utils import Locale
-from ..model.llm import
+from ..model.llm import BUILTIN_LLM_FAMILIES, LLMFamilyV1, match_llm
 from ..model.llm.llm_family import cache
 from .api import SyncSupervisorAPI
 
@@ -27,7 +27,7 @@ if TYPE_CHECKING:
 
 MODEL_TO_FAMILIES: Dict[str, LLMFamilyV1] = dict(
     (model_family.model_name, model_family)
-    for model_family in
+    for model_family in BUILTIN_LLM_FAMILIES
     if "chat" in model_family.model_ability
 )
 
{xinference-0.1.1 → xinference-0.1.3}/xinference/core/restful_api.py
@@ -480,7 +480,7 @@ class RESTfulAPIActor(xo.Actor):
             (msg["content"] for msg in body.messages if msg["role"] == "system"), None
         )
 
-        chat_history = body.messages
+        chat_history = body.messages[:-1]  # exclude the prompt
 
         model_uid = body.model
 
@@ -494,6 +494,26 @@ class RESTfulAPIActor(xo.Actor):
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))
 
+        try:
+            desc = await self._supervisor_ref.describe_model(model_uid)
+
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            raise HTTPException(status_code=400, detail=str(ve))
+
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
+        is_chatglm_ggml = desc.get(
+            "model_format"
+        ) == "ggmlv3" and "chatglm" in desc.get("model_name", "")
+
+        if is_chatglm_ggml and system_prompt is not None:
+            raise HTTPException(
+                status_code=400, detail="ChatGLM ggml does not have system prompt"
+            )
+
         if body.stream:
             # create a pair of memory object streams
             send_chan, recv_chan = anyio.create_memory_object_stream(10)
@@ -501,9 +521,12 @@ class RESTfulAPIActor(xo.Actor):
             async def event_publisher(inner_send_chan: MemoryObjectSendStream):
                 async with inner_send_chan:
                     try:
-
-                            prompt,
-
+                        if is_chatglm_ggml:
+                            iterator = await model.chat(prompt, chat_history, kwargs)
+                        else:
+                            iterator = await model.chat(
+                                prompt, system_prompt, chat_history, kwargs
+                            )
                         async for chunk in iterator:
                             await inner_send_chan.send(dict(data=json.dumps(chunk)))
                             if await request.is_disconnected():
@@ -525,7 +548,10 @@ class RESTfulAPIActor(xo.Actor):
 
         else:
            try:
-
+                if is_chatglm_ggml:
+                    return await model.chat(prompt, chat_history, kwargs)
+                else:
+                    return await model.chat(prompt, system_prompt, chat_history, kwargs)
             except Exception as e:
                 logger.error(e, exc_info=True)
                 raise HTTPException(status_code=500, detail=str(e))
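For context on the chat_history change above, a rough sketch of what a chat request to the RESTful API looks like. The exact route path is not visible in this hunk, so the OpenAI-style /v1/chat/completions path and the requests library are assumptions here; what the hunk does show is that everything except the last message becomes chat_history, the last message is the prompt, and a system message is pulled out separately (and rejected for ChatGLM ggml models):

```python
import requests

resp = requests.post(
    "http://localhost:9997/v1/chat/completions",  # assumed endpoint path
    json={
        "model": "<model-uid>",  # uid returned by launch_model
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            # the last message is treated as the prompt; earlier ones become chat_history
            {"role": "user", "content": "What is the capital of France?"},
        ],
    },
)
print(resp.json())
```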
{xinference-0.1.1 → xinference-0.1.3}/xinference/core/supervisor.py
@@ -16,7 +16,7 @@ import asyncio
 import time
 from dataclasses import dataclass
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import xoscar as xo
 
@@ -74,6 +74,69 @@ class SupervisorActor(xo.Actor):
 
         raise RuntimeError("No available worker found")
 
+    @log_sync(logger=logger)
+    def list_model_registrations(self, model_type: str) -> List[Dict[str, Any]]:
+        if model_type == "LLM":
+            from ..model.llm import BUILTIN_LLM_FAMILIES, get_user_defined_llm_families
+
+            ret = [
+                {"model_name": f.model_name, "is_builtin": True}
+                for f in BUILTIN_LLM_FAMILIES
+            ]
+            user_defined_llm_families = get_user_defined_llm_families()
+            ret.extend(
+                [
+                    {"model_name": f.model_name, "is_builtin": False}
+                    for f in user_defined_llm_families
+                ]
+            )
+
+            return ret
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_sync(logger=logger)
+    def get_model_registration(
+        self, model_type: str, model_name: str
+    ) -> Dict[str, Any]:
+        if model_type == "LLM":
+            from ..model.llm import BUILTIN_LLM_FAMILIES, get_user_defined_llm_families
+
+            for f in BUILTIN_LLM_FAMILIES + get_user_defined_llm_families():
+                if f.model_name == model_name:
+                    return f
+
+            raise ValueError(f"Model {model_name} not found")
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_async(logger=logger)
+    async def register_model(self, model_type: str, model: str, persist: bool):
+        if model_type == "LLM":
+            from ..model.llm import LLMFamilyV1, register_llm
+
+            llm_family = LLMFamilyV1.parse_raw(model)
+            register_llm(llm_family, persist)
+
+            if not self.is_local_deployment:
+                for worker in self._worker_address_to_worker.values():
+                    await worker.register_model(model_type, model, persist)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_async(logger=logger)
+    async def unregister_model(self, model_type: str, model_name: str):
+        if model_type == "LLM":
+            from ..model.llm import unregister_llm
+
+            unregister_llm(model_name)
+
+            if not self.is_local_deployment:
+                for worker in self._worker_address_to_worker.values():
+                    await worker.unregister_model(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
     async def launch_builtin_model(
         self,
         model_uid: str,
{xinference-0.1.1 → xinference-0.1.3}/xinference/core/worker.py
@@ -108,8 +108,30 @@ class WorkerActor(xo.Actor):
             "model_format": llm_spec.model_format,
             "model_size_in_billions": llm_spec.model_size_in_billions,
             "quantization": quantization,
+            "revision": llm_spec.model_revision,
         }
 
+    @log_sync(logger=logger)
+    async def register_model(self, model_type: str, model: str, persist: bool):
+        # TODO: centralized model registrations
+        if model_type == "LLM":
+            from ..model.llm import LLMFamilyV1, register_llm
+
+            llm_family = LLMFamilyV1.parse_raw(model)
+            register_llm(llm_family, persist)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+    @log_sync(logger=logger)
+    async def unregister_model(self, model_type: str, model_name: str):
+        # TODO: centralized model registrations
+        if model_type == "LLM":
+            from ..model.llm import unregister_llm
+
+            unregister_llm(model_name)
+        else:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
     @log_async(logger=logger)
     async def launch_builtin_model(
         self,
{xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/cmdline.py
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import configparser
 import logging
 import os
+import sys
 from typing import Optional
 
 import click
@@ -30,6 +30,32 @@ from ..constants import (
 )
 
 
+def get_config_string(log_level: str) -> str:
+    return f"""
+[loggers]
+keys=root
+
+[handlers]
+keys=stream_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level={log_level.upper()}
+handlers=stream_handler
+
+[handler_stream_handler]
+class=StreamHandler
+formatter=formatter
+level={log_level.upper()}
+args=(sys.stderr,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(process)d %(levelname)-8s %(message)s
+"""
+
+
 def get_endpoint(endpoint: Optional[str]) -> str:
     # user didn't specify the endpoint.
     if endpoint is None:
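As a standalone illustration (not part of the package), the config string produced above can be fed straight to the standard logging machinery, which is what the command hunks below do. A minimal sketch; the sample output line is only an approximation of the configured format:

```python
import configparser
import logging
import logging.config

from xinference.deploy.cmdline import get_config_string  # added in 0.1.3

conf = configparser.RawConfigParser()
conf.read_string(get_config_string("debug"))
logging.config.fileConfig(conf)  # fileConfig accepts a RawConfigParser instance

# emits something like:
# 2023-08-09 18:43:41,000 example      12345 DEBUG    hello
logging.getLogger("example").debug("hello")
```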
@@ -57,9 +83,10 @@ def cli(
     if ctx.invoked_subcommand is None:
         from .local import main
 
-
-
-        logging_conf
+        logging_conf = configparser.RawConfigParser()
+        logger_config_string = get_config_string(log_level)
+        logging_conf.read_string(logger_config_string)
+        logging.config.fileConfig(logging_conf)  # type: ignore
 
         address = f"{host}:{get_next_port()}"
 
@@ -102,9 +129,10 @@ def supervisor(
 def worker(log_level: str, endpoint: Optional[str], host: str):
     from ..deploy.worker import main
 
-
-
-    logging_conf
+    logging_conf = configparser.RawConfigParser()
+    logger_config_string = get_config_string(log_level)
+    logging_conf.read_string(logger_config_string)
+    logging.config.fileConfig(level=logging.getLevelName(log_level.upper()))  # type: ignore
 
     endpoint = get_endpoint(endpoint)
 
@@ -146,7 +174,7 @@ def model_launch(
         quantization=quantization,
     )
 
-    print(f"Model uid: {model_uid}")
+    print(f"Model uid: {model_uid}", file=sys.stderr)
 
 
 @cli.command("list")
@@ -157,18 +185,16 @@ def model_launch(
 )
 @click.option("--all", is_flag=True)
 def model_list(endpoint: Optional[str], all: bool):
-    import sys
-
     from tabulate import tabulate
 
     # TODO: get from the supervisor
-    from ..model.llm import
+    from ..model.llm import BUILTIN_LLM_FAMILIES
 
     endpoint = get_endpoint(endpoint)
 
     table = []
     if all:
-        for model_family in
+        for model_family in BUILTIN_LLM_FAMILIES:
             table.append(
                 [
                     model_family.model_name,
{xinference-0.1.1 → xinference-0.1.3}/xinference/deploy/worker.py
@@ -14,7 +14,7 @@
 
 import asyncio
 import logging
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 
 import xoscar as xo
 
@@ -53,7 +53,7 @@ async def _start_worker(
     await pool.join()
 
 
-def main(address: str, supervisor_address: str, logging_conf:
+def main(address: str, supervisor_address: str, logging_conf: Any = None):
     loop = asyncio.get_event_loop()
     task = loop.create_task(_start_worker(address, supervisor_address, logging_conf))
 
xinference-0.1.3/xinference/model/llm/__init__.py (new file)
@@ -0,0 +1,73 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import codecs
+import json
+import os
+
+from .core import LLM
+from .llm_family import (
+    BUILTIN_LLM_FAMILIES,
+    LLM_CLASSES,
+    GgmlLLMSpecV1,
+    LLMFamilyV1,
+    LLMSpecV1,
+    PromptStyleV1,
+    PytorchLLMSpecV1,
+    get_user_defined_llm_families,
+    match_llm,
+    match_llm_cls,
+    register_llm,
+    unregister_llm,
+)
+
+
+def _install():
+    from .ggml.chatglm import ChatglmCppChatModel
+    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .pytorch.baichuan import BaichuanPytorchChatModel
+    from .pytorch.chatglm import ChatglmPytorchChatModel
+    from .pytorch.core import PytorchChatModel, PytorchModel
+    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
+    from .pytorch.vicuna import VicunaPytorchChatModel
+
+    LLM_CLASSES.extend(
+        [
+            ChatglmCppChatModel,
+            LlamaCppModel,
+            LlamaCppChatModel,
+            PytorchModel,
+            PytorchChatModel,
+            BaichuanPytorchChatModel,
+            VicunaPytorchChatModel,
+            FalconPytorchModel,
+            FalconPytorchChatModel,
+            ChatglmPytorchChatModel,
+        ]
+    )
+
+    json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
+    )
+    for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
+        BUILTIN_LLM_FAMILIES.append(LLMFamilyV1.parse_obj(json_obj))
+
+    from ...constants import XINFERENCE_MODEL_DIR
+
+    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
+    if os.path.isdir(user_defined_llm_dir):
+        for f in os.listdir(user_defined_llm_dir):
+            with codecs.open(f, encoding="utf-8") as fd:
+                user_defined_llm_family = LLMFamilyV1.parse_obj(json.load(fd))
+                register_llm(user_defined_llm_family, persist=False)
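The tail of _install() above is what lets user-defined models reappear after a restart: every JSON document found under XINFERENCE_MODEL_DIR/llm is parsed as an LLMFamilyV1 and registered. A rough sketch of persisting the nsql-2B template from the README section by hand; the file name is arbitrary here (the loader simply iterates the directory), and whether register_model(persist=True) writes this exact layout is not shown in this diff:

```python
import json
import os

from xinference.constants import XINFERENCE_MODEL_DIR

# the nsql-2B template from the README section above, serialized as strict JSON
nsql_family = {
    "version": 1,
    "model_name": "nsql-2B",
    "model_lang": ["en"],
    "model_ability": ["generate"],
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 2,
            "quantizations": ["4-bit", "8-bit", "none"],
            "model_id": "NumbersStation/nsql-2B",
        }
    ],
    "prompt_style": None,  # written out as null
}

user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
os.makedirs(user_defined_llm_dir, exist_ok=True)

# illustrative file name; _install() reads every file in this directory
with open(os.path.join(user_defined_llm_dir, "nsql-2B.json"), "w", encoding="utf-8") as fd:
    json.dump(nsql_family, fd)
```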
|