xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
  46. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-08-09T18:03:26+0800",
+ "date": "2024-08-16T18:10:38+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "3e7ed865c3b3de601c92edbd9744f2bff9054051",
- "version": "0.14.1"
+ "full-revisionid": "e4d225774dc7a9a9260396bf833e03a1df8e8a92",
+ "version": "0.14.2"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -1682,18 +1682,9 @@ class RESTfulAPI:
 
         model_family = desc.get("model_family", "")
         function_call_models = (
-            ["chatglm3", "gorilla-openfunctions-v1"]
-            + QWEN_TOOL_CALL_FAMILY
-            + GLM4_TOOL_CALL_FAMILY
+            ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY
         )
 
-        is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family
-
-        if is_qwen and system_prompt is not None:
-            raise HTTPException(
-                status_code=400, detail="Qwen ggml does not have system prompt"
-            )
-
         if model_family not in function_call_models:
             if body.tools:
                 raise HTTPException(
@@ -1724,18 +1715,13 @@
                 iterator = None
                 try:
                     try:
-                        if is_qwen:
-                            iterator = await model.chat(
-                                prompt, chat_history, kwargs, raw_params=raw_kwargs
-                            )
-                        else:
-                            iterator = await model.chat(
-                                prompt,
-                                system_prompt,
-                                chat_history,
-                                kwargs,
-                                raw_params=raw_kwargs,
-                            )
+                        iterator = await model.chat(
+                            prompt,
+                            system_prompt,
+                            chat_history,
+                            kwargs,
+                            raw_params=raw_kwargs,
+                        )
                     except RuntimeError as re:
                         await self._report_error_event(model_uid, str(re))
                         self.handle_request_limit_error(re)
@@ -1763,18 +1749,13 @@
             return EventSourceResponse(stream_results())
         else:
             try:
-                if is_qwen:
-                    data = await model.chat(
-                        prompt, chat_history, kwargs, raw_params=raw_kwargs
-                    )
-                else:
-                    data = await model.chat(
-                        prompt,
-                        system_prompt,
-                        chat_history,
-                        kwargs,
-                        raw_params=raw_kwargs,
-                    )
+                data = await model.chat(
+                    prompt,
+                    system_prompt,
+                    chat_history,
+                    kwargs,
+                    raw_params=raw_kwargs,
+                )
                 return Response(content=data, media_type="application/json")
             except Exception as e:
                 logger.error(e, exc_info=True)
xinference/client/restful/restful_client.py CHANGED
@@ -426,7 +426,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):
             The user's message or user's input.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> Configuration for ggml model
+            "LlamaCppGenerateConfig" -> Configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> Configuration for pytorch model
 
         Returns
@@ -493,7 +493,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             A tool list.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
-            "LlamaCppGenerateConfig" -> configuration for ggml model
+            "LlamaCppGenerateConfig" -> configuration for llama-cpp-python model
             "PytorchGenerateConfig" -> configuration for pytorch model
 
         Returns
xinference/core/chat_interface.py CHANGED
@@ -236,8 +236,8 @@ class GradioInterface:
             bot[-1][1] = history[-1]["content"]
             yield history, bot
 
-        def add_text(history, bot, text, image):
-            logger.debug("Add text, text: %s, image: %s", text, image)
+        def add_text(history, bot, text, image, video):
+            logger.debug("Add text, text: %s, image: %s, video: %s", text, image, video)
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:
@@ -257,16 +257,47 @@
                         },
                     ],
                 }
+            elif video:
+
+                def video_to_base64(video_path):
+                    with open(video_path, "rb") as video_file:
+                        encoded_string = base64.b64encode(video_file.read()).decode(
+                            "utf-8"
+                        )
+                    return encoded_string
+
+                def generate_html_video(video_path):
+                    base64_video = video_to_base64(video_path)
+                    video_format = video_path.split(".")[-1]
+                    html_code = f"""
+                    <video controls>
+                        <source src="data:video/{video_format};base64,{base64_video}" type="video/{video_format}">
+                        Your browser does not support the video tag.
+                    </video>
+                    """
+                    return html_code
+
+                display_content = f"{generate_html_video(video)}\n{text}"
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": text},
+                        {
+                            "type": "video_url",
+                            "video_url": {"url": video},
+                        },
+                    ],
+                }
             else:
                 display_content = text
                 message = {"role": "user", "content": text}
             history = history + [message]
             bot = bot + [[display_content, None]]
-            return history, bot, "", None
+            return history, bot, "", None, None
 
         def clear_history():
             logger.debug("Clear history.")
-            return [], None, "", None
+            return [], None, "", None, None
 
         def update_button(text):
             return gr.update(interactive=bool(text))
@@ -313,6 +344,7 @@
                     )
                 with gr.Column(scale=3):
                     imagebox = gr.Image(type="filepath")
+                    videobox = gr.Video()
                     textbox = gr.Textbox(
                         show_label=False,
                         placeholder="Enter text and press ENTER",
@@ -340,8 +372,8 @@
 
             textbox.submit(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,
@@ -351,8 +383,8 @@
 
             submit_btn.click(
                 add_text,
-                [state, chatbot, textbox, imagebox],
-                [state, chatbot, textbox, imagebox],
+                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox],
                 queue=False,
             ).then(
                 predict,
@@ -361,7 +393,10 @@
             )
 
             clear_btn.click(
-                clear_history, None, [state, chatbot, textbox, imagebox], queue=False
+                clear_history,
+                None,
+                [state, chatbot, textbox, imagebox, videobox],
+                queue=False,
             )
 
             return chat_vl_interface
xinference/core/model.py CHANGED
@@ -132,8 +132,8 @@ class ModelActor(xo.StatelessActor):
 
     async def __pre_destroy__(self):
         from ..model.embedding.core import EmbeddingModel
-        from ..model.llm.pytorch.core import PytorchModel as LLMPytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel as LLMPytorchModel
         from ..model.llm.vllm.core import VLLMModel as LLMVLLMModel
 
         if self.allow_batching():
@@ -177,8 +177,8 @@ class ModelActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
     ):
         super().__init__()
-        from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.sglang.core import SGLANGModel
+        from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
 
         self._worker_address = worker_address
@@ -272,7 +272,7 @@ class ModelActor(xo.StatelessActor):
         return isinstance(self._model, VLLMModel)
 
     def allow_batching(self) -> bool:
-        from ..model.llm.pytorch.core import PytorchModel
+        from ..model.llm.transformers.core import PytorchModel
 
         model_ability = self._model_description.get("model_ability", [])
 
@@ -415,7 +415,7 @@ class ModelActor(xo.StatelessActor):
             ret = await asyncio.to_thread(fn, *args, **kwargs)
 
         if self._lock is not None and self._current_generator():
-            raise Exception("Parallel generation is not supported by ggml.")
+            raise Exception("Parallel generation is not supported by llama-cpp-python.")
 
         if inspect.isgenerator(ret):
             gen = self._to_generator(output_type, ret)
xinference/core/scheduler.py CHANGED
@@ -24,7 +24,6 @@ import xoscar as xo
 
 logger = logging.getLogger(__name__)
 
-XINFERENCE_BATCHING_CLEAN_CACHE_INTERVAL = 5
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
 XINFERENCE_STREAMING_ERROR_FLAG = "<XINFERENCE_STREAMING_ERROR>"
 XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"
@@ -359,7 +358,7 @@ class SchedulerActor(xo.StatelessActor):
 
     @staticmethod
     def _empty_cache():
-        from ..model.llm.pytorch.utils import empty_cache
+        from ..model.llm.transformers.utils import empty_cache
 
         empty_cache()
 
xinference/core/worker.py CHANGED
@@ -830,7 +830,7 @@ class WorkerActor(xo.StatelessActor):
             raise ValueError(
                 f"PEFT adaptors cannot be applied to embedding or rerank models."
             )
-        if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+        if model_type == "LLM" and model_format in ("ggufv2",):
             raise ValueError(
                 f"PEFT adaptors can only be applied to pytorch-like models"
             )
xinference/deploy/cmdline.py CHANGED
@@ -750,7 +750,7 @@ def remove_cache(
     "-f",
     default=None,
     type=str,
-    help="Specify the format of the model, e.g. pytorch, ggmlv3, etc.",
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",
@@ -1516,7 +1516,7 @@ def query_engine_by_model_name(
     "-f",
     type=str,
     required=True,
-    help="Specify the format of the model, e.g. pytorch, ggmlv3, etc.",
+    help="Specify the format of the model, e.g. pytorch, ggufv2, etc.",
 )
 @click.option(
     "--quantization",
xinference/deploy/test/test_cmdline.py CHANGED
@@ -66,10 +66,10 @@ def test_cmdline(setup, stream, model_uid):
     replica = 1
     original_model_uid = model_uid
     model_uid = client.launch_model(
-        model_name="orca",
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=model_uid,
-        model_size_in_billions=3,
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )
@@ -249,10 +249,10 @@ def test_rotate_logs(setup_with_file_logging):
     runner = CliRunner()
     replica = 1 if os.name == "nt" else 2
     model_uid = client.launch_model(
-        model_name="orca",
+        model_name="qwen1.5-chat",
         model_engine="llama.cpp",
         model_uid=None,
-        model_size_in_billions=3,
+        model_size_in_billions="0_5",
         quantization="q4_0",
         replica=replica,
     )
@@ -288,7 +288,7 @@ def test_list_cached_models(setup):
 
     result = runner.invoke(
         list_cached_models,
-        ["--endpoint", endpoint, "--model_name", "orca"],
+        ["--endpoint", endpoint, "--model_name", "qwen1.5-chat"],
     )
     assert "model_name" in result.stdout
     assert "model_format" in result.stdout
@@ -305,9 +305,9 @@ def test_remove_cache(setup):
 
     result = runner.invoke(
         remove_cache,
-        ["--endpoint", endpoint, "--model_version", "orca"],
+        ["--endpoint", endpoint, "--model_version", "qwen1.5-chat"],
         input="y\n",
     )
 
     assert result.exit_code == 0
-    assert "Cache directory orca has been deleted."
+    assert "Cache directory qwen1.5-chat has been deleted."
xinference/model/llm/__init__.py CHANGED
@@ -40,7 +40,7 @@ from .llm_family import (
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
     CustomLLMFamilyV1,
-    GgmlLLMSpecV1,
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,
@@ -55,10 +55,10 @@ from .llm_family import (
 
 
 def check_format_with_engine(model_format, engine):
-    # only llama-cpp-python support and only support ggufv2 and ggmlv3
-    if model_format in ["ggufv2", "ggmlv3"] and engine != "llama.cpp":
+    # only llama-cpp-python support and only support ggufv2
+    if model_format in ["ggufv2"] and engine != "llama.cpp":
         return False
-    if model_format not in ["ggufv2", "ggmlv3"] and engine == "llama.cpp":
+    if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
     return True
 
@@ -112,28 +112,25 @@ def generate_engine_config_by_model_family(model_family):
 
 
 def _install():
-    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
     from .mlx.core import MLXChatModel, MLXModel
-    from .pytorch.baichuan import BaichuanPytorchChatModel
-    from .pytorch.chatglm import ChatglmPytorchChatModel
-    from .pytorch.cogvlm2 import CogVLM2Model
-    from .pytorch.core import PytorchChatModel, PytorchModel
-    from .pytorch.deepseek_vl import DeepSeekVLChatModel
-    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
-    from .pytorch.glm4v import Glm4VModel
-    from .pytorch.intern_vl import InternVLChatModel
-    from .pytorch.internlm2 import Internlm2PytorchChatModel
-    from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
-    from .pytorch.minicpmv25 import MiniCPMV25Model
-    from .pytorch.minicpmv26 import MiniCPMV26Model
-    from .pytorch.qwen_vl import QwenVLChatModel
-    from .pytorch.vicuna import VicunaPytorchChatModel
-    from .pytorch.yi_vl import YiVLChatModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
-    from .vllm.core import VLLMChatModel, VLLMModel
+    from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.core import PytorchChatModel, PytorchModel
+    from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.glm4v import Glm4VModel
+    from .transformers.intern_vl import InternVLChatModel
+    from .transformers.internlm2 import Internlm2PytorchChatModel
+    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .transformers.minicpmv25 import MiniCPMV25Model
+    from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen_vl import QwenVLChatModel
+    from .transformers.yi_vl import YiVLChatModel
+    from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     try:
-        from .pytorch.omnilmm import OmniLMMModel
+        from .transformers.omnilmm import OmniLMMModel
     except ImportError as e:
         # For quite old transformers version,
         # import will generate error
@@ -148,18 +145,14 @@ def _install():
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
             ChatglmPytorchChatModel,
             LlamaPytorchModel,
             LlamaPytorchChatModel,
             PytorchChatModel,
-            FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
             YiVLChatModel,
xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} CHANGED
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import logging
 import os
 import time
@@ -104,35 +103,6 @@ class LlamaCppModel(LLM):
         generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
-    def _convert_ggml_to_gguf(self, model_path: str) -> str:
-        from .tools import convert
-
-        root_dir = os.path.dirname(os.path.dirname(model_path))
-        gguf_dir = os.path.join(
-            root_dir,
-            "{}-ggufv2-{}b".format(
-                self.model_family.model_name, self.model_spec.model_size_in_billions
-            ),
-        )
-        os.makedirs(gguf_dir, exist_ok=True)
-        gguf_path = os.path.join(
-            gguf_dir,
-            "{}.{}.ggufv2".format(self.model_family.model_name, self.quantization),
-        )
-        # trick for validation, use a mark file to make sure the gguf file is converted
-        mark_file = os.path.join(gguf_dir, f"__valid_{self.quantization}")
-        if os.path.exists(mark_file):
-            return gguf_path
-        else:
-            logger.warning(
-                "You are using a model with ggmlv3, "
-                "and it will take some time to convert to ggufv2"
-            )
-            convert(model_path, gguf_path)
-            with open(mark_file, "w") as f:
-                f.write(str(datetime.datetime.now()))
-            return gguf_path
-
     def load(self):
         try:
             import llama_cpp
@@ -167,9 +137,6 @@ class LlamaCppModel(LLM):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
-        if self.model_spec.model_format == "ggmlv3":
-            model_path = self._convert_ggml_to_gguf(model_path)
-
         try:
             self._llm = Llama(
                 model_path=model_path,
@@ -183,7 +150,7 @@ class LlamaCppModel(LLM):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
            return False
         if "qwen" in llm_family.model_name:
             return False
@@ -285,7 +252,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
            return False
         if "chat" not in llm_family.model_ability:
             return False