xinference 0.7.4.1__py3-none-any.whl → 0.7.5__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +22 -8
- xinference/client/oscar/actor_client.py +78 -8
- xinference/core/model.py +14 -7
- xinference/core/supervisor.py +12 -0
- xinference/deploy/cmdline.py +16 -0
- xinference/deploy/test/test_cmdline.py +1 -0
- xinference/model/embedding/model_spec.json +40 -0
- xinference/model/llm/__init__.py +14 -1
- xinference/model/llm/llm_family.json +10 -1
- xinference/model/llm/llm_family.py +38 -2
- xinference/model/llm/llm_family_modelscope.json +10 -1
- xinference/model/llm/pytorch/chatglm.py +1 -0
- xinference/model/llm/pytorch/core.py +1 -1
- xinference/model/llm/pytorch/utils.py +50 -18
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +13 -4
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/RECORD +29 -29
- xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
- /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
-    "date": "
+    "date": "2024-01-05T15:29:43+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "0.7.4.1"
+    "full-revisionid": "56b28b3e4149b0a9ab6f5322401b1c3f1fc95c1a",
+    "version": "0.7.5"
 }
 ''' # END VERSION_JSON
xinference/api/restful_api.py
CHANGED
@@ -160,6 +160,9 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/models/prompts", self._get_builtin_prompts, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/models/families", self._get_builtin_families, methods=["GET"]
+        )
         self._router.add_api_route(
             "/v1/cluster/devices", self._get_devices_count, methods=["GET"]
         )
@@ -312,6 +315,17 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

+    async def _get_builtin_families(self) -> JSONResponse:
+        """
+        For internal usage
+        """
+        try:
+            data = await (await self._get_supervisor_ref()).get_builtin_families()
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def _get_devices_count(self) -> JSONResponse:
         """
         For internal usage
@@ -565,7 +579,7 @@ class RESTfulAPI:
             except RuntimeError as re:
                 self.handle_request_limit_error(re)
             async for item in iterator:
-                yield
+                yield item
         except Exception as ex:
             if iterator is not None:
                 await iterator.destroy()
@@ -577,7 +591,7 @@ class RESTfulAPI:
         else:
             try:
                 data = await model.generate(body.prompt, kwargs)
-                return
+                return Response(data, media_type="application/json")
             except Exception as e:
                 logger.error(e, exc_info=True)
                 self.handle_request_limit_error(e)
@@ -634,7 +648,7 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

-    async def create_images(self, request: TextToImageRequest) ->
+    async def create_images(self, request: TextToImageRequest) -> Response:
         model_uid = request.model
         try:
             model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -655,7 +669,7 @@ class RESTfulAPI:
                 response_format=request.response_format,
                 **kwargs,
             )
-            return
+            return Response(content=image_list, media_type="application/json")
         except RuntimeError as re:
             logger.error(re, exc_info=True)
             self.handle_request_limit_error(re)
@@ -674,7 +688,7 @@ class RESTfulAPI:
         response_format: Optional[str] = Form("url"),
         size: Optional[str] = Form("1024*1024"),
         kwargs: Optional[str] = Form(None),
-    ) ->
+    ) -> Response:
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -697,7 +711,7 @@ class RESTfulAPI:
                 response_format=response_format,
                 **kwargs,
             )
-            return
+            return Response(content=image_list, media_type="application/json")
         except RuntimeError as re:
             logger.error(re, exc_info=True)
             raise HTTPException(status_code=400, detail=str(re))
@@ -828,7 +842,7 @@ class RESTfulAPI:
             except RuntimeError as re:
                 self.handle_request_limit_error(re)
             async for item in iterator:
-                yield
+                yield item
         except Exception as ex:
             if iterator is not None:
                 await iterator.destroy()
@@ -843,7 +857,7 @@ class RESTfulAPI:
                 data = await model.chat(prompt, chat_history, kwargs)
             else:
                 data = await model.chat(prompt, system_prompt, chat_history, kwargs)
-            return
+            return Response(content=data, media_type="application/json")
         except Exception as e:
             logger.error(e, exc_info=True)
             self.handle_request_limit_error(e)
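As a quick check of the new route, the family lists can be fetched over plain HTTP. A minimal sketch, assuming a local xinference server on its usual default port 9997; the response shape mirrors the supervisor's get_builtin_families further below.

import requests

resp = requests.get("http://127.0.0.1:9997/v1/models/families")
resp.raise_for_status()
families = resp.json()  # {"chat": [...], "generate": [...]}
print(families["chat"][:5], families["generate"][:5])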
xinference/client/oscar/actor_client.py
CHANGED
@@ -13,11 +13,13 @@
 # limitations under the License.

 import asyncio
+import re
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

+import orjson
 import xoscar as xo

-from ...core.model import ModelActor
+from ...core.model import IteratorWrapper, ModelActor
 from ...core.supervisor import SupervisorActor
 from ...isolation import Isolation
 from ..restful.restful_client import Client
@@ -38,6 +40,52 @@ if TYPE_CHECKING:
 )


+class SSEEvent(object):
+    # https://github.com/btubbs/sseclient/blob/master/sseclient.py
+    sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")
+
+    def __init__(self, data="", event="message", id=None, retry=None):
+        self.data = data
+        self.event = event
+        self.id = id
+        self.retry = retry
+
+    @classmethod
+    def parse(cls, raw):
+        """
+        Given a possibly-multiline string representing an SSE message, parse it
+        and return a Event object.
+        """
+        msg = cls()
+        for line in raw.splitlines():
+            m = cls.sse_line_pattern.match(line)
+            if m is None:
+                # Malformed line. Discard but warn.
+                continue
+
+            name = m.group("name")
+            if name == "":
+                # line began with a ":", so is a comment. Ignore
+                continue
+            value = m.group("value")
+
+            if name == "data":
+                # If we already have some data, then join to it with a newline.
+                # Else this is it.
+                if msg.data:
+                    msg.data = "%s\n%s" % (msg.data, value)
+                else:
+                    msg.data = value
+            elif name == "event":
+                msg.event = value
+            elif name == "id":
+                msg.id = value
+            elif name == "retry":
+                msg.retry = int(value)
+
+        return msg
+
+
 class ModelHandle:
     """
     A sync model interface (for rpc client) which provides type hints that makes it much easier to use xinference
@@ -49,6 +97,19 @@ class ModelHandle:
         self._isolation = isolation


+class ClientIteratorWrapper(IteratorWrapper):
+    async def __anext__(self):
+        r = await super().__anext__()
+        text = r.decode("utf-8")
+        return orjson.loads(SSEEvent.parse(text).data)
+
+    @classmethod
+    def wrap(cls, iterator_wrapper):
+        c = cls.__new__(cls)
+        c.__dict__.update(iterator_wrapper.__dict__)
+        return c
+
+
 class EmbeddingModelHandle(ModelHandle):
     def create_embedding(self, input: Union[str, List[str]]) -> bytes:
         """
@@ -68,7 +129,7 @@ class EmbeddingModelHandle(ModelHandle):
         """

         coro = self._model_ref.create_embedding(input)
-        return self._isolation.call(coro)
+        return orjson.loads(self._isolation.call(coro))


 class RerankModelHandle(ModelHandle):
@@ -104,7 +165,7 @@ class RerankModelHandle(ModelHandle):
         coro = self._model_ref.rerank(
             documents, query, top_n, max_chunks_per_doc, return_documents
         )
-        results = self._isolation.call(coro)
+        results = orjson.loads(self._isolation.call(coro))
         for r in results["results"]:
             r["document"] = documents[r["index"]]
         return results
@@ -140,7 +201,10 @@ class GenerateModelHandle(EmbeddingModelHandle):
         """

         coro = self._model_ref.generate(prompt, generate_config)
-        return self._isolation.call(coro)
+        r = self._isolation.call(coro)
+        if isinstance(r, bytes):
+            return orjson.loads(r)
+        return ClientIteratorWrapper.wrap(r)


 class ChatModelHandle(GenerateModelHandle):
@@ -185,7 +249,10 @@ class ChatModelHandle(GenerateModelHandle):
         coro = self._model_ref.chat(
             prompt, system_prompt, chat_history, generate_config
         )
-        return self._isolation.call(coro)
+        r = self._isolation.call(coro)
+        if isinstance(r, bytes):
+            return orjson.loads(r)
+        return ClientIteratorWrapper.wrap(r)


 class ChatglmCppChatModelHandle(EmbeddingModelHandle):
@@ -217,7 +284,10 @@ class ChatglmCppChatModelHandle(EmbeddingModelHandle):
         """

         coro = self._model_ref.chat(prompt, chat_history, generate_config)
-        return self._isolation.call(coro)
+        r = self._isolation.call(coro)
+        if isinstance(r, bytes):
+            return orjson.loads(r)
+        return ClientIteratorWrapper.wrap(r)


 class ImageModelHandle(ModelHandle):
@@ -249,7 +319,7 @@ class ImageModelHandle(ModelHandle):
         """

         coro = self._model_ref.text_to_image(prompt, n, size, response_format, **kwargs)
-        return self._isolation.call(coro)
+        return orjson.loads(self._isolation.call(coro))

     def image_to_image(
         self,
@@ -294,7 +364,7 @@ class ImageModelHandle(ModelHandle):
         coro = self._model_ref.image_to_image(
             image, prompt, negative_prompt, n, size, response_format, **kwargs
         )
-        return self._isolation.call(coro)
+        return orjson.loads(self._isolation.call(coro))


 class ActorClient:
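The client-side decoding above is easy to exercise standalone. A minimal sketch of what ClientIteratorWrapper.__anext__ does to one SSE frame, using stdlib json in place of orjson; the chunk payload is illustrative.

import json
import re

# Same line pattern SSEEvent uses to split "name: value" fields.
sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")

raw = 'data: {"choices": [{"text": "hello"}]}'  # one frame off the wire

data = ""
for line in raw.splitlines():
    m = sse_line_pattern.match(line)
    if m and m.group("name") == "data":
        data = m.group("value")

chunk = json.loads(data)  # the wrapper uses orjson.loads(SSEEvent.parse(text).data)
print(chunk["choices"][0]["text"])  # -> hello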
xinference/core/model.py
CHANGED
@@ -14,6 +14,7 @@

 import asyncio
 import inspect
+import json
 import os
 import uuid
 from typing import (
@@ -30,6 +31,7 @@ from typing import (
     Union,
 )

+import sse_starlette.sse
 import xoscar as xo

 if TYPE_CHECKING:
@@ -186,7 +188,7 @@ class ModelActor(xo.StatelessActor):
             )
         )

-
+    def _wrap_generator(self, ret: Any):
         if inspect.isgenerator(ret) or inspect.isasyncgen(ret):
             if self._lock is not None and self._generators:
                 raise Exception("Parallel generation is not supported by ggml.")
@@ -199,7 +201,7 @@ class ModelActor(xo.StatelessActor):
                 model_actor_uid=self.uid,
             )
         else:
-            return ret
+            return json_dumps(ret)

     async def _call_wrapper(self, _wrapper: Callable):
         try:
@@ -335,9 +337,10 @@ class ModelActor(xo.StatelessActor):
         )

         def _wrapper():
-            return getattr(self._model, "text_to_image")(
+            r = getattr(self._model, "text_to_image")(
                 prompt, n, size, response_format, *args, **kwargs
             )
+            return json_dumps(r)

         return await self._call_wrapper(_wrapper)
@@ -358,7 +361,7 @@ class ModelActor(xo.StatelessActor):
         )

         def _wrapper():
-            return getattr(self._model, "image_to_image")(
+            r = getattr(self._model, "image_to_image")(
                 image,
                 prompt,
                 negative_prompt,
@@ -368,10 +371,10 @@ class ModelActor(xo.StatelessActor):
                 *args,
                 **kwargs,
             )
+            return json_dumps(r)

         return await self._call_wrapper(_wrapper)

-    @log_async(logger=logger)
     async def next(
         self, generator_uid: str
     ) -> Union["ChatCompletionChunk", "CompletionChunk"]:
@@ -381,14 +384,18 @@ class ModelActor(xo.StatelessActor):

         def _wrapper():
             try:
-
+                v = dict(data=json.dumps(next(gen)))
+                return sse_starlette.sse.ensure_bytes(v, None)
             except StopIteration:
                 return stop

         async def _async_wrapper():
             try:
                 # anext is only available for Python >= 3.10
-
+                v = await gen.__anext__()
+                v = await asyncio.to_thread(json.dumps, v)
+                v = dict(data=v)  # noqa: F821
+                return await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
             except StopAsyncIteration:
                 return stop
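On the server side, ModelActor.next now hands back pre-rendered SSE bytes instead of raw chunks. A rough sketch of that encoding; the exact framing (e.g. the \r\n separators) is produced by sse_starlette.sse.ensure_bytes and is an assumption here.

import json

chunk = {"id": "chunk-0", "choices": [{"text": "hi"}]}  # illustrative chunk

# next() builds dict(data=json.dumps(chunk)) and passes it to ensure_bytes;
# the rendered frame is approximately:
frame = ("data: %s\r\n\r\n" % json.dumps(chunk)).encode("utf-8")
print(frame)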
xinference/core/supervisor.py
CHANGED
@@ -114,6 +114,18 @@ class SupervisorActor(xo.StatelessActor):
             data[k] = v.dict()
         return data

+    @staticmethod
+    async def get_builtin_families() -> Dict[str, List[str]]:
+        from ..model.llm.llm_family import (
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES,
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
+        )
+
+        return {
+            "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
+            "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
+        }
+
     async def get_devices_count(self) -> int:
         from ..utils import cuda_count
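Because get_builtin_families is a @staticmethod coroutine, it can be awaited without an actor reference. A small sketch, assuming xinference is installed; note the two family sets are filled by xinference.model.llm._install(), which the server runs at startup, so the lists may be empty in a bare interpreter.

import asyncio

from xinference.core.supervisor import SupervisorActor

families = asyncio.run(SupervisorActor.get_builtin_families())
print(sorted(families["chat"])[:3], sorted(families["generate"])[:3])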
xinference/deploy/cmdline.py
CHANGED
@@ -402,6 +402,22 @@ def list_model_registrations(
             tabulate(table, headers=["Type", "Name", "Family", "Is-built-in"]),
             file=sys.stderr,
         )
+    elif model_type == "multimodal":
+        for registration in registrations:
+            model_name = registration["model_name"]
+            model_family = client.get_model_registration(model_type, model_name)
+            table.append(
+                [
+                    model_type,
+                    model_family["model_name"],
+                    model_family["model_lang"],
+                    registration["is_builtin"],
+                ]
+            )
+        print(
+            tabulate(table, headers=["Type", "Name", "Language", "Is-built-in"]),
+            file=sys.stderr,
+        )
     else:
         raise NotImplementedError(f"List {model_type} is not implemented.")
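For reference, a sketch of the table the new multimodal branch prints, reusing the same tabulate call; the registration values are illustrative, not a real builtin.

from tabulate import tabulate

model_family = {"model_name": "my-multimodal-model", "model_lang": ["en", "zh"]}
table = [["multimodal", model_family["model_name"], model_family["model_lang"], True]]
print(tabulate(table, headers=["Type", "Name", "Language", "Is-built-in"]))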
xinference/model/embedding/model_spec.json
CHANGED
@@ -142,5 +142,45 @@
     "language": ["en"],
     "model_id": "jinaai/jina-embeddings-v2-base-en",
     "model_revision": "7302ac470bed880590f9344bfeee32ff8722d0e5"
+  },
+  {
+    "model_name": "text2vec-large-chinese",
+    "dimensions": 1024,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "shibing624/text2vec-bge-large-chinese",
+    "model_revision": "f5027ca48ea8316d63ee26d2b9bd27a061de33a3"
+  },
+  {
+    "model_name": "text2vec-base-chinese",
+    "dimensions": 768,
+    "max_tokens": 128,
+    "language": ["zh"],
+    "model_id": "shibing624/text2vec-base-chinese",
+    "model_revision": "8acc1289891d75f6b665ad623359798b55f86adb"
+  },
+  {
+    "model_name": "text2vec-base-chinese-paraphrase",
+    "dimensions": 768,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "shibing624/text2vec-base-chinese-paraphrase",
+    "model_revision": "beaf10481a5d9ca3b0daa9f0df6831ec956bf739"
+  },
+  {
+    "model_name": "text2vec-base-chinese-sentence",
+    "dimensions": 768,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "shibing624/text2vec-base-chinese-sentence",
+    "model_revision": "e73a94e821f22c6163166bfab9408d03933a5525"
+  },
+  {
+    "model_name": "text2vec-base-multilingual",
+    "dimensions": 384,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "shibing624/text2vec-base-multilingual",
+    "model_revision": "f241877385fa56ebcc75f04d1850e1579cfa661d"
   }
 ]
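The new text2vec entries are launchable like any other builtin embedding model. A minimal sketch with the RESTful client; the server address is an assumption, and the 768 matches the "dimensions" field above.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local server
model_uid = client.launch_model(
    model_name="text2vec-base-chinese", model_type="embedding"
)
result = client.get_model(model_uid).create_embedding("今天天气真好")
print(len(result["data"][0]["embedding"]))  # 768, per the spec above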
xinference/model/llm/__init__.py
CHANGED
@@ -19,9 +19,12 @@ import os
 from .core import LLM
 from .llm_family import (
     BUILTIN_LLM_FAMILIES,
+    BUILTIN_LLM_MODEL_CHAT_FAMILIES,
+    BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
+    CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
@@ -94,6 +97,11 @@ def _install():
         # note that the key is the model name,
         # since there are multiple representations of the same prompt style name in json.
         BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)

     modelscope_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_modelscope.json"
@@ -110,6 +118,11 @@ def _install():
             and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
         ):
             BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)

     from ...constants import XINFERENCE_MODEL_DIR

@@ -119,5 +132,5 @@ def _install():
             with codecs.open(
                 os.path.join(user_defined_llm_dir, f), encoding="utf-8"
             ) as fd:
-                user_defined_llm_family =
+                user_defined_llm_family = CustomLLMFamilyV1.parse_obj(json.load(fd))
                 register_llm(user_defined_llm_family, persist=False)
xinference/model/llm/llm_family.json
CHANGED
@@ -557,7 +557,7 @@
         "none"
       ],
       "model_id": "THUDM/chatglm3-6b",
-      "model_revision": "
+      "model_revision": "b098244a71fbe69ce149682d9072a7629f7e908c"
     }
   ],
   "prompt_style": {
@@ -566,6 +566,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop": [
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
xinference/model/llm/llm_family.py
CHANGED
@@ -17,7 +17,7 @@ import os
 import platform
 import shutil
 from threading import Lock
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

 from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper
@@ -41,6 +41,8 @@ logger = logging.getLogger(__name__)

 DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
+BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()


 class GgmlLLMSpecV1(BaseModel):
@@ -105,6 +107,8 @@ class LLMFamilyV1(BaseModel):
     model_lang: List[str]
     model_ability: List[Literal["embed", "generate", "chat"]]
     model_description: Optional[str]
+    # reason for not required str here: legacy registration
+    model_family: Optional[str]
     model_specs: List["LLMSpecV1"]
     prompt_style: Optional["PromptStyleV1"]
@@ -134,7 +138,39 @@ class CustomLLMFamilyV1(LLMFamilyV1):
             )
         except (ValueError, TypeError, UnicodeDecodeError) as e:
             raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
-        llm_spec = cls.parse_obj(obj)
+        llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj)
+
+        # check model_family
+        if llm_spec.model_family is None:
+            raise ValueError(
+                f"You must specify `model_family` when registering custom LLM models."
+            )
+        assert isinstance(llm_spec.model_family, str)
+        if (
+            llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for chat model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
+            )
+        if (
+            llm_spec.model_family != "other"
+            and "chat" not in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for generate model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}"
+            )
+        # set prompt style when it is the builtin model family
+        if (
+            llm_spec.prompt_style is None
+            and llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+        ):
+            llm_spec.prompt_style = llm_spec.model_family

         # handle prompt style when user choose existing style
         if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str):
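The practical effect of the new checks: a custom spec must now carry model_family. A sketch of the failure path; the field values are illustrative and the surrounding pydantic schema may require more fields than shown.

import json

from xinference.model.llm.llm_family import CustomLLMFamilyV1

spec = {
    "version": 1,
    "context_length": 2048,
    "model_name": "my-custom-model",
    "model_lang": ["en"],
    "model_ability": ["generate"],
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 7,
            "quantizations": ["none"],
            "model_id": "my-org/my-custom-model",
        }
    ],
    # "model_family" deliberately omitted
}
try:
    CustomLLMFamilyV1.parse_raw(json.dumps(spec))
except ValueError as e:  # pydantic's ValidationError also subclasses ValueError
    print(e)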
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -331,6 +331,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop": [
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -357,7 +366,7 @@
       ],
       "model_hub": "modelscope",
       "model_id": "ZhipuAI/chatglm3-6b-32k",
-      "model_revision": "
+      "model_revision": "master"
     }
   ],
   "prompt_style": {
xinference/model/llm/pytorch/chatglm.py
CHANGED
@@ -58,6 +58,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
             trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
             revision=kwargs["revision"],
         )
         model = AutoModel.from_pretrained(
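For context on the added kwarg: chatglm3's tokenizer is loaded via remote code, and encode_special_tokens=True makes literal role markers such as "<|user|>" encode to their special ids, which is what lets the stop_token_ids above (64795/64797) fire. A sketch, assuming transformers plus the model files are available and that THUDM's tokenizer accepts the flag as this diff shows.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "THUDM/chatglm3-6b",
    trust_remote_code=True,
    encode_special_tokens=True,  # "<|user|>" becomes one special token
)
print(tok.encode("<|user|>"))  # should contain the special id (64795)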
xinference/model/llm/pytorch/core.py
CHANGED
@@ -409,7 +409,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> PytorchGenerateConfig:
         generate_config = super()._sanitize_generate_config(generate_config)
         if (
-            generate_config.get("stop"
+            (not generate_config.get("stop"))
             and self.model_family.prompt_style
             and self.model_family.prompt_style.stop
         ):
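The fixed condition means an absent or empty user "stop" now falls back to the family's prompt-style stop words. A self-contained sketch of that logic, using the chatglm3 values from the JSON above:

generate_config = {"max_tokens": 512}                # user supplied no "stop"
prompt_style_stop = ["<|user|>", "<|observation|>"]  # chatglm3 prompt_style.stop

# Mirrors PytorchChatModel._sanitize_generate_config after the fix:
if (not generate_config.get("stop")) and prompt_style_stop:
    generate_config["stop"] = list(prompt_style_stop)

print(generate_config["stop"])  # ['<|user|>', '<|observation|>']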