xinference 0.11.2.post1__py3-none-any.whl → 0.11.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-05-24T19:39:58+0800",
+ "date": "2024-05-31T17:12:13+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "ac8f33439c25e6fb05eba79e7932cbbadd068174",
- "version": "0.11.2.post1"
+ "full-revisionid": "69c09cd068a530cd2fdcac07e4e81f03d48f04f9",
+ "version": "0.11.3"
 }
 ''' # END VERSION_JSON
 
@@ -52,7 +52,7 @@ from xoscar.utils import get_next_port
 
 from .._compat import BaseModel, Field
 from .._version import get_versions
-from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT
+from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT, XINFERENCE_DISABLE_METRICS
 from ..core.event import Event, EventCollectorActor, EventType
 from ..core.supervisor import SupervisorActor
 from ..core.utils import json_dumps
@@ -504,13 +504,19 @@ class RESTfulAPI:
             ),
         )
 
-        # Clear the global Registry for the MetricsMiddleware, or
-        # the MetricsMiddleware will register duplicated metrics if the port
-        # conflict (This serve method run more than once).
-        REGISTRY.clear()
-        self._app.add_middleware(MetricsMiddleware)
-        self._app.include_router(self._router)
-        self._app.add_route("/metrics", metrics)
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Supervisor metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+            self._app.include_router(self._router)
+        else:
+            # Clear the global Registry for the MetricsMiddleware, or
+            # the MetricsMiddleware will register duplicated metrics if the port
+            # conflict (This serve method run more than once).
+            REGISTRY.clear()
+            self._app.add_middleware(MetricsMiddleware)
+            self._app.include_router(self._router)
+            self._app.add_route("/metrics", metrics)
 
         # Check all the routes returns Response.
         # This is to avoid `jsonable_encoder` performance issue:
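
The hunks above (in what appears to be the RESTful API setup of the supervisor) gate the Prometheus metrics middleware and the /metrics route behind the new XINFERENCE_DISABLE_METRICS flag, which is defined in xinference/constants.py just below. A minimal sketch of how a deployment might opt out, assuming the flag is set before the xinference processes start (endpoint and port values are illustrative, not taken from this diff):

    # Sketch: disable metrics collection for supervisor and worker.
    import os

    # Must be set before xinference.constants is imported, since the
    # module reads the environment at import time.
    os.environ["XINFERENCE_DISABLE_METRICS"] = "1"

    # Roughly equivalent from a shell, assuming the local entrypoint:
    #   XINFERENCE_DISABLE_METRICS=1 xinference-local --host 0.0.0.0 --port 9997
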
xinference/constants.py CHANGED
@@ -26,6 +26,7 @@ XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
 XINFERENCE_ENV_ENABLE_SGLANG = "XINFERENCE_ENABLE_SGLANG"
+XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
 
 
 def get_xinference_home() -> str:
@@ -66,3 +67,6 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 )
 XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
 XINFERENCE_ENABLE_SGLANG = bool(int(os.environ.get(XINFERENCE_ENV_ENABLE_SGLANG, 0)))
+XINFERENCE_DISABLE_METRICS = bool(
+    int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
+)
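
Note that, like the other flags in this file, the value is parsed with bool(int(...)): unset or "0" means disabled, "1" means enabled, and a non-integer string such as "true" would raise ValueError at import time. A standalone illustration of that parsing behaviour (the helper name is hypothetical, not part of the package):

    import os

    def _env_flag(name: str) -> bool:
        # Mirrors the constants.py pattern: unset or "0" -> False, "1" -> True.
        return bool(int(os.environ.get(name, 0)))

    os.environ["XINFERENCE_DISABLE_METRICS"] = "1"
    assert _env_flag("XINFERENCE_DISABLE_METRICS") is True
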
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .model import ModelActor
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 
 from logging import getLogger
 from typing import Any, Dict, List, Optional
 
@@ -105,9 +106,29 @@ class CacheTrackerActor(xo.Actor):
         cached_models = []
         for model_name, model_versions in self._model_name_to_version_info.items():
             for version_info in model_versions:
-                if version_info["cache_status"]:
+                cache_status = version_info.get("cache_status", None)
+                if cache_status == True:
                     ret = version_info.copy()
                     ret["model_name"] = model_name
+
+                    re_dict = version_info.get("model_file_location", None)
+                    if re_dict is not None and isinstance(re_dict, dict):
+                        if re_dict:
+                            actor_ip_address, path = next(iter(re_dict.items()))
+                        else:
+                            raise ValueError("The dictionary is empty.")
+                    else:
+                        raise ValueError("re_dict must be a non-empty dictionary.")
+
+                    ret["actor_ip_address"] = actor_ip_address
+                    ret["path"] = path
+                    if os.path.isdir(path):
+                        files = os.listdir(path)
+                        resolved_file = os.path.realpath(os.path.join(path, files[0]))
+                        if resolved_file:
+                            ret["real_path"] = os.path.dirname(resolved_file)
+                    else:
+                        ret["real_path"] = os.path.realpath(path)
                     cached_models.append(ret)
         cached_models = sorted(cached_models, key=lambda x: x["model_name"])
         return cached_models
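
The new bookkeeping in the cache tracker records where a cached model actually lives: path is the cache location reported by the worker, while real_path follows the symlinked files back to their physical location (e.g. a shared Hugging Face cache). A standalone sketch of the same resolution logic, with a hypothetical path:

    import os

    def resolve_real_path(path: str) -> str:
        # Same idea as the hunk above: for a directory of symlinked files,
        # report the directory the first entry really points to; for a
        # single file, resolve the file itself.
        if os.path.isdir(path):
            files = os.listdir(path)
            resolved_file = os.path.realpath(os.path.join(path, files[0]))
            return os.path.dirname(resolved_file)
        return os.path.realpath(path)

    # e.g. resolve_real_path("/root/.xinference/cache/qwen-chat")  # hypothetical path
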
@@ -186,8 +186,7 @@ class GradioInterface:
     def build_chat_vl_interface(
         self,
     ) -> "gr.Blocks":
-        def predict(history, bot):
-            logger.debug("Predict model: %s, history: %s", self.model_uid, history)
+        def predict(history, bot, max_tokens, temperature, stream):
             from ..client import RESTfulClient
 
             client = RESTfulClient(self.endpoint)
@@ -199,10 +198,46 @@ class GradioInterface:
             assert prompt["role"] == "user"
             prompt = prompt["content"]
             # multimodal chat does not support stream.
-            response = model.chat(prompt=prompt, chat_history=history[:-1])
-            history.append(response["choices"][0]["message"])
-            bot[-1][1] = history[-1]["content"]
-            return history, bot
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta:
+                        continue
+                    else:
+                        response_content += delta["content"]
+                        bot[-1][1] = response_content
+                        yield history, bot
+                history.append(
+                    {
+                        "content": response_content,
+                        "role": "assistant",
+                    }
+                )
+                bot[-1][1] = response_content
+                yield history, bot
+            else:
+                response = model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                )
+                history.append(response["choices"][0]["message"])
+                bot[-1][1] = history[-1]["content"]
+                yield history, bot
 
         def add_text(history, bot, text, image):
             logger.debug("Add text, text: %s, image: %s", text, image)
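
The rewritten predict turns the Gradio callback into a generator so partial output can be streamed into the chatbot as it arrives. Outside of Gradio, the same delta-accumulation pattern applies to any streaming chat call made through the RESTful client; a minimal sketch, assuming a locally running server at a placeholder endpoint and an already-launched model UID:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
    model = client.get_model("my-vl-model-uid")      # hypothetical model uid

    content = ""
    for chunk in model.chat(
        prompt="Describe this image.",
        chat_history=[],
        generate_config={"max_tokens": 512, "temperature": 1.0, "stream": True},
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            content += delta["content"]  # accumulate partial tokens
    print(content)
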
@@ -217,14 +252,19 @@ class GradioInterface:
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
-                        {"type": "image_url", "image_url": {"url": image}},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{img_b64_str}"
+                            },
+                        },
                    ],
                }
            else:
                display_content = text
                message = {"role": "user", "content": text}
            history = history + [message]
-            bot = bot + [(display_content, None)]
+            bot = bot + [[display_content, None]]
            return history, bot, "", None
 
        def clear_history():
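
The vision-language message now embeds the uploaded image as a base64 data URI instead of passing a raw file reference. The img_b64_str used above is produced elsewhere in the same function and is not shown in this hunk; a sketch of how such a value could be built, with a hypothetical file name:

    import base64

    def image_to_data_uri(image_path: str) -> str:
        # Encode the image bytes and wrap them in the data URI form that the
        # OpenAI-style image_url content block expects.
        with open(image_path, "rb") as f:
            img_b64_str = base64.b64encode(f.read()).decode("utf-8")
        return f"data:image/png;base64,{img_b64_str}"

    # e.g. image_to_data_uri("example.png")  # hypothetical file
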
@@ -286,6 +326,19 @@ class GradioInterface:
                )
                clear_btn = gr.Button(value="Clear")
 
+            with gr.Accordion("Additional Inputs", open=False):
+                max_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=self.context_length,
+                    value=512,
+                    step=1,
+                    label="Max Tokens",
+                )
+                temperature = gr.Slider(
+                    minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
+                )
+                stream = gr.Checkbox(label="Stream", value=False)
+
            textbox.change(update_button, [textbox], [submit_btn], queue=False)
 
            textbox.submit(
@@ -293,14 +346,22 @@ class GradioInterface:
                [state, chatbot, textbox, imagebox],
                [state, chatbot, textbox, imagebox],
                queue=False,
-            ).then(predict, [state, chatbot], [state, chatbot])
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
            submit_btn.click(
                add_text,
                [state, chatbot, textbox, imagebox],
                [state, chatbot, textbox, imagebox],
                queue=False,
-            ).then(predict, [state, chatbot], [state, chatbot])
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
            clear_btn.click(
                clear_history, None, [state, chatbot, textbox, imagebox], queue=False
@@ -28,7 +28,7 @@ from ..constants import (
    XINFERENCE_HEALTH_CHECK_INTERVAL,
    XINFERENCE_HEALTH_CHECK_TIMEOUT,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
 from ..types import PeftModelConfig
 from .metrics import record_metrics
@@ -993,8 +993,9 @@ class SupervisorActor(xo.StatelessActor):
                "model_size_in_billions", None
            )
            quantizations = model_version.get("quantization", None)
-            re_dict = model_version.get("model_file_location", None)
-            actor_ip_address, path = next(iter(re_dict.items()))
+            actor_ip_address = model_version.get("actor_ip_address", None)
+            path = model_version.get("path", None)
+            real_path = model_version.get("real_path", None)
 
            cache_entry = {
                "model_name": model_name,
@@ -1003,6 +1004,7 @@ class SupervisorActor(xo.StatelessActor):
                "quantizations": quantizations,
                "path": path,
                "Actor IP Address": actor_ip_address,
+                "real_path": real_path,
            }
 
            cached_models.append(cache_entry)
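
With the cache tracker now attaching actor_ip_address, path and real_path to each version entry, the supervisor only copies them into the listing instead of unpacking model_file_location itself. The resulting entry has roughly this shape (values are made up for illustration; fields outside this hunk's context are omitted):

    cache_entry = {
        "model_name": "qwen-chat",
        "quantizations": "4-bit",
        "path": "/root/.xinference/cache/qwen-chat",       # worker cache directory
        "Actor IP Address": "10.0.0.5",
        "real_path": "/data/huggingface-cache/qwen-chat",  # where the symlinks resolve
    }
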
xinference/core/worker.py CHANGED
@@ -30,9 +30,10 @@ from xoscar import MainActorPoolType
 from ..constants import (
    XINFERENCE_CACHE_DIR,
    XINFERENCE_DISABLE_HEALTH_CHECK,
+    XINFERENCE_DISABLE_METRICS,
    XINFERENCE_HEALTH_CHECK_INTERVAL,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, create_model_instance
@@ -83,8 +84,12 @@ class WorkerActor(xo.StatelessActor):
        self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
        self._model_uid_to_launch_args: Dict[str, Dict] = {}
 
-        # metrics export server.
-        if metrics_exporter_host is not None or metrics_exporter_port is not None:
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Worker metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+        elif metrics_exporter_host is not None or metrics_exporter_port is not None:
+            # metrics export server.
            logger.info(
                f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
            )
@@ -113,6 +113,7 @@ def _install():
    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
    from .pytorch.baichuan import BaichuanPytorchChatModel
    from .pytorch.chatglm import ChatglmPytorchChatModel
+    from .pytorch.cogvlm2 import CogVLM2Model
    from .pytorch.core import PytorchChatModel, PytorchModel
    from .pytorch.deepseek_vl import DeepSeekVLChatModel
    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
@@ -159,6 +160,7 @@ def _install():
            DeepSeekVLChatModel,
            InternVLChatModel,
            PytorchModel,
+            CogVLM2Model,
        ]
    )
    if OmniLMMModel: # type: ignore
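
These last two hunks register the new CogVLM2Model PyTorch implementation with the model installer. Launching it would follow the usual client flow; a sketch, assuming the built-in model name is "cogvlm2" and the engine label is "Transformers" (neither is confirmed by this diff), against a local server at a placeholder endpoint:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
    model_uid = client.launch_model(
        model_name="cogvlm2",          # assumed built-in name for the new model
        model_type="LLM",
        model_engine="Transformers",   # assumed engine label
    )
    model = client.get_model(model_uid)
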