xinference 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference was flagged as potentially problematic by the registry diff viewer.
Files changed (56)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/chat_interface.py +10 -4
  6. xinference/core/model.py +2 -2
  7. xinference/core/supervisor.py +30 -2
  8. xinference/core/utils.py +12 -0
  9. xinference/core/worker.py +4 -1
  10. xinference/deploy/cmdline.py +126 -0
  11. xinference/deploy/test/test_cmdline.py +24 -0
  12. xinference/fields.py +3 -1
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/ggml/chatglm.py +98 -13
  15. xinference/model/llm/ggml/llamacpp.py +49 -2
  16. xinference/model/llm/llm_family.json +633 -9
  17. xinference/model/llm/llm_family.py +84 -10
  18. xinference/model/llm/llm_family_modelscope.json +337 -10
  19. xinference/model/llm/memory.py +332 -0
  20. xinference/model/llm/pytorch/chatglm.py +48 -0
  21. xinference/model/llm/pytorch/core.py +25 -6
  22. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  23. xinference/model/llm/pytorch/intern_vl.py +387 -0
  24. xinference/model/llm/pytorch/internlm2.py +32 -1
  25. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  26. xinference/model/llm/pytorch/utils.py +38 -1
  27. xinference/model/llm/pytorch/yi_vl.py +42 -14
  28. xinference/model/llm/sglang/core.py +31 -9
  29. xinference/model/llm/utils.py +38 -5
  30. xinference/model/llm/vllm/core.py +87 -5
  31. xinference/model/rerank/core.py +23 -1
  32. xinference/model/utils.py +17 -7
  33. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  34. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  35. xinference/thirdparty/llava/mm_utils.py +3 -2
  36. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  37. xinference/thirdparty/omnilmm/chat.py +6 -5
  38. xinference/types.py +10 -1
  39. xinference/web/ui/build/asset-manifest.json +3 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  42. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  46. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/METADATA +10 -8
  47. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/RECORD +52 -50
  48. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  49. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  52. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  53. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  54. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  55. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  56. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-05-11T17:30:18+0800",
+ "date": "2024-05-24T16:46:11+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "21be5abd6ff8411015a9b8862cbdb6b070bc2b1c",
- "version": "0.11.0"
+ "full-revisionid": "77e79f863daf9aa0bc6b9e8bf5e6d74e14bd5367",
+ "version": "0.11.2"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -493,6 +493,16 @@ class RESTfulAPI:
                 else None
             ),
         )
+        self._router.add_api_route(
+            "/v1/cached/list_cached_models",
+            self.list_cached_models,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
 
         # Clear the global Registry for the MetricsMiddleware, or
         # the MetricsMiddleware will register duplicated metrics if the port
@@ -688,6 +698,15 @@ class RESTfulAPI:
                 detail="Invalid input. Please specify the `model_engine` field.",
             )
 
+        if isinstance(gpu_idx, int):
+            gpu_idx = [gpu_idx]
+        if gpu_idx:
+            if len(gpu_idx) % replica:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Invalid input. Allocated gpu must be a multiple of replica.",
+                )
+
         if peft_model_config is not None:
             peft_model_config = PeftModelConfig.from_dict(peft_model_config)
         else:
@@ -1470,6 +1489,17 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def list_cached_models(self) -> JSONResponse:
+        try:
+            data = await (await self._get_supervisor_ref()).list_cached_models()
+            return JSONResponse(content=data)
+        except ValueError as re:
+            logger.error(re, exc_info=True)
+            raise HTTPException(status_code=400, detail=str(re))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def get_model_events(self, model_uid: str) -> JSONResponse:
         try:
             event_collector_ref = await self._get_event_collector_ref()
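
The new gpu_idx check in the launch endpoint rejects requests whose explicit GPU list cannot be split evenly across replicas. A minimal sketch of the rule, with hypothetical values:

    # Hypothetical launch parameters: 4 explicit GPUs shared by 2 replicas.
    gpu_idx, replica = [0, 1, 2, 3], 2
    if isinstance(gpu_idx, int):
        gpu_idx = [gpu_idx]
    # Accepted: 4 % 2 == 0. A request like gpu_idx=[0, 1, 2] with replica=2
    # would fail this check and be answered with HTTP 400.
    assert len(gpu_idx) % replica == 0
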
xinference/client/restful/restful_client.py CHANGED
@@ -1102,6 +1102,35 @@ class Client:
         response_data = response.json()
         return response_data
 
+    def list_cached_models(self) -> List[Dict[Any, Any]]:
+        """
+        Get a list of cached models.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        List[Dict[Any, Any]]
+            The collection of cached models on the server.
+
+        Raises
+        ------
+        RuntimeError
+            Raised when the request fails, including the reason for the failure.
+        """
+
+        url = f"{self.base_url}/v1/cached/list_cached_models"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to list cached model, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
+
     def get_model_registration(
         self, model_type: str, model_name: str
     ) -> Dict[str, Any]:
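
A minimal usage sketch of the new client method, assuming a supervisor is already running at the placeholder endpoint below:

    from xinference.client import RESTfulClient

    # Placeholder endpoint; point this at a running Xinference supervisor.
    client = RESTfulClient("http://127.0.0.1:9997")
    for entry in client.list_cached_models():
        # Each entry is a plain dict describing one cached model version.
        print(entry)
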
xinference/core/cache_tracker.py CHANGED
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from logging import getLogger
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import xoscar as xo
 
@@ -100,3 +100,14 @@ class CacheTrackerActor(xo.Actor):
 
     def get_model_version_count(self, model_name: str) -> int:
        return len(self.get_model_versions(model_name))
+
+    def list_cached_models(self) -> List[Dict[Any, Any]]:
+        cached_models = []
+        for model_name, model_versions in self._model_name_to_version_info.items():
+            for version_info in model_versions:
+                if version_info["cache_status"]:
+                    ret = version_info.copy()
+                    ret["model_name"] = model_name
+                    cached_models.append(ret)
+        cached_models = sorted(cached_models, key=lambda x: x["model_name"])
+        return cached_models
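
Each returned dict is a copy of a cached version_info record with the model name added. A purely illustrative entry, assuming only the fields read later by the supervisor (values are made up and additional version fields may be present):

    # Illustrative only; keys mirror those the supervisor reads
    # (model_format, model_size_in_billions, quantization, model_file_location).
    cached_entry = {
        "model_name": "my-model",                # hypothetical
        "model_format": "pytorch",
        "model_size_in_billions": 7,
        "quantization": "none",
        "cache_status": True,
        "model_file_location": {"127.0.0.1": "/path/to/cache/dir"},  # worker ip -> path
    }
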
xinference/core/chat_interface.py CHANGED
@@ -109,6 +109,7 @@ class GradioInterface:
             history: List[List[str]],
             max_tokens: int,
             temperature: float,
+            lora_name: str,
         ) -> Generator:
             from ..client import RESTfulClient
 
@@ -127,6 +128,7 @@
                     "max_tokens": int(max_tokens),
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)
@@ -152,6 +154,7 @@
                 gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
+                gr.Text(label="LoRA Name"),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
@@ -331,7 +334,7 @@
                 history: hist,
             }
 
-        def complete(text, hist, max_tokens, temperature) -> Generator:
+        def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient
 
             client = RESTfulClient(self.endpoint)
@@ -349,6 +352,7 @@
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)
@@ -368,7 +372,7 @@
                 history: hist,
             }
 
-        def retry(text, hist, max_tokens, temperature) -> Generator:
+        def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient
 
             client = RESTfulClient(self.endpoint)
@@ -387,6 +391,7 @@
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)
@@ -470,10 +475,11 @@
             temperature = gr.Slider(
                 minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
             )
+            lora_name = gr.Text(label="LoRA Name")
 
             btn_generate.click(
                 fn=complete,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )
 
@@ -485,7 +491,7 @@
 
             btn_retry.click(
                 fn=retry,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )
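
The same lora_name field the Gradio UI now forwards can also be supplied by API callers in a generate config. A hedged sketch (endpoint, model uid, and adapter name are placeholders; the field only has an effect if the model was launched with that LoRA adapter):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")   # placeholder endpoint
    model = client.get_model("my-model-uid")          # hypothetical model uid
    for chunk in model.chat(
        "Hello",
        generate_config={
            "max_tokens": 512,
            "temperature": 1.0,
            "stream": True,
            "lora_name": "my-lora",                   # hypothetical adapter name
        },
    ):
        print(chunk)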
 
xinference/core/model.py CHANGED
@@ -257,7 +257,7 @@ class ModelActor(xo.StatelessActor):
             for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.pop("usage", None)
+                final_usage = v.get("usage", None)
                 v = dict(data=json.dumps(v))
                 yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:
@@ -289,7 +289,7 @@ class ModelActor(xo.StatelessActor):
             async for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.pop("usage", None)
+                final_usage = v.get("usage", None)
                 v = await asyncio.to_thread(json.dumps, v)
                 v = dict(data=v)  # noqa: F821
                 yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
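
Switching from pop to get means the usage stats are still captured for metrics but are no longer stripped from the chunk that gets serialized and streamed to the client. A one-line illustration of the difference:

    chunk = {"id": "x", "choices": [], "usage": {"total_tokens": 42}}
    chunk.pop("usage", None)   # old behaviour: "usage" removed before json.dumps
    chunk = {"id": "x", "choices": [], "usage": {"total_tokens": 42}}
    chunk.get("usage", None)   # new behaviour: "usage" stays in the streamed payload
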
xinference/core/supervisor.py CHANGED
@@ -34,6 +34,7 @@ from ..types import PeftModelConfig
 from .metrics import record_metrics
 from .resource import GPUStatus, ResourceStatus
 from .utils import (
+    assign_replica_gpu,
     build_replica_model_uid,
     gen_random_string,
     is_valid_model_uid,
@@ -769,7 +770,7 @@ class SupervisorActor(xo.StatelessActor):
                 raise ValueError(
                     f"Model is already in the model list, uid: {_replica_model_uid}"
                 )
-
+            replica_gpu_idx = assign_replica_gpu(_replica_model_uid, gpu_idx)
             nonlocal model_type
             worker_ref = (
                 target_ip_worker_ref
@@ -789,7 +790,7 @@ class SupervisorActor(xo.StatelessActor):
                 n_gpu=n_gpu,
                 request_limits=request_limits,
                 peft_model_config=peft_model_config,
-                gpu_idx=gpu_idx,
+                gpu_idx=replica_gpu_idx,
                 **kwargs,
             )
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
@@ -980,6 +981,33 @@ class SupervisorActor(xo.StatelessActor):
             and list(self._worker_address_to_worker)[0] == self.address
         )
 
+    @log_async(logger=logger)
+    async def list_cached_models(self) -> List[Dict[str, Any]]:
+        cached_models = []
+        for worker in self._worker_address_to_worker.values():
+            ret = await worker.list_cached_models()
+            for model_version in ret:
+                model_name = model_version.get("model_name", None)
+                model_format = model_version.get("model_format", None)
+                model_size_in_billions = model_version.get(
+                    "model_size_in_billions", None
+                )
+                quantizations = model_version.get("quantization", None)
+                re_dict = model_version.get("model_file_location", None)
+                actor_ip_address, path = next(iter(re_dict.items()))
+
+                cache_entry = {
+                    "model_name": model_name,
+                    "model_format": model_format,
+                    "model_size_in_billions": model_size_in_billions,
+                    "quantizations": quantizations,
+                    "path": path,
+                    "Actor IP Address": actor_ip_address,
+                }
+
+                cached_models.append(cache_entry)
+        return cached_models
+
     @log_async(logger=logger)
     async def add_worker(self, worker_address: str):
         from .worker import WorkerActor
xinference/core/utils.py CHANGED
@@ -191,3 +191,15 @@ def get_nvidia_gpu_info() -> Dict:
             nvmlShutdown()
         except:
             pass
+
+
+def assign_replica_gpu(
+    _replica_model_uid: str, gpu_idx: Union[int, List[int]]
+) -> List[int]:
+    model_uid, replica, rep_id = parse_replica_model_uid(_replica_model_uid)
+    rep_id, replica = int(rep_id), int(replica)
+    if isinstance(gpu_idx, int):
+        gpu_idx = [gpu_idx]
+    if isinstance(gpu_idx, list) and gpu_idx:
+        return gpu_idx[rep_id::replica]
+    return gpu_idx
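
The helper stripes an explicit GPU list across replicas with a step slice. A quick check of the behaviour on hypothetical inputs:

    # Hypothetical: 4 GPUs shared by 2 replicas -> each replica gets every second index.
    gpu_idx, replica = [0, 1, 2, 3], 2
    assert gpu_idx[0::replica] == [0, 2]   # replica 0
    assert gpu_idx[1::replica] == [1, 3]   # replica 1
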
xinference/core/worker.py CHANGED
@@ -456,7 +456,7 @@ class WorkerActor(xo.StatelessActor):
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
-        env_name = get_available_device_env_name()
+        env_name = get_available_device_env_name() or "CUDA_VISIBLE_DEVICES"
         if gpu_idx is None:
             if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
                 # Currently, n_gpu=auto means using 1 GPU
@@ -781,6 +781,9 @@ class WorkerActor(xo.StatelessActor):
             except asyncio.CancelledError:  # pragma: no cover
                 break
 
+    async def list_cached_models(self) -> List[Dict[Any, Any]]:
+        return self._cache_tracker_ref.list_cached_models()
+
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
xinference/deploy/cmdline.py CHANGED
@@ -570,6 +570,44 @@ def list_model_registrations(
         raise NotImplementedError(f"List {model_type} is not implemented.")
 
 
+@cli.command("cached", help="List all cached models in Xinference.")
+@click.option(
+    "--endpoint",
+    "-e",
+    type=str,
+    help="Xinference endpoint.",
+)
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def list_cached_models(
+    endpoint: Optional[str],
+    api_key: Optional[str],
+):
+    from tabulate import tabulate
+
+    endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
+
+    cached_models = client.list_cached_models()
+
+    print("cached_model: ")
+    headers = list(cached_models[0].keys())
+    table_data = []
+    for model in cached_models:
+        row_data = [
+            str(value) if value is not None else "-" for value in model.values()
+        ]
+        table_data.append(row_data)
+    print(tabulate(table_data, headers=headers, tablefmt="pretty"))
+
+
 @cli.command(
     "launch",
     help="Launch a model with the Xinference framework with the given parameters.",
@@ -1368,5 +1406,93 @@ def query_engine_by_model_name(
     )
 
 
+@cli.command(
+    "cal-model-mem",
+    help="calculate gpu mem usage with specified model size and context_length",
+)
+@click.option(
+    "--model-name",
+    "-n",
+    type=str,
+    help="The model name is optional.\
+        If provided, fetch model config from huggingface/modelscope;\
+        If not specified, use default model layer to estimate.",
+)
+@click.option(
+    "--size-in-billions",
+    "-s",
+    type=str,
+    required=True,
+    help="Specify the model size in billions of parameters. Format accept 1_8 and 1.8",
+)
+@click.option(
+    "--model-format",
+    "-f",
+    type=str,
+    required=True,
+    help="Specify the format of the model, e.g. pytorch, ggmlv3, etc.",
+)
+@click.option(
+    "--quantization",
+    "-q",
+    type=str,
+    default=None,
+    help="Define the quantization settings for the model.",
+)
+@click.option(
+    "--context-length",
+    "-c",
+    type=int,
+    required=True,
+    help="Specify the context length",
+)
+@click.option(
+    "--kv-cache-dtype",
+    type=int,
+    default=16,
+    help="Specified the kv_cache_dtype, one of: 8, 16, 32",
+)
+def cal_model_mem(
+    model_name: Optional[str],
+    size_in_billions: str,
+    model_format: str,
+    quantization: Optional[str],
+    context_length: int,
+    kv_cache_dtype: int,
+):
+    if kv_cache_dtype not in [8, 16, 32]:
+        print("Invalid kv_cache_dtype:", kv_cache_dtype)
+        os._exit(1)
+
+    import math
+
+    from ..model.llm.llm_family import convert_model_size_to_float
+    from ..model.llm.memory import estimate_llm_gpu_memory
+
+    mem_info = estimate_llm_gpu_memory(
+        model_size_in_billions=size_in_billions,
+        quantization=quantization,
+        context_length=context_length,
+        model_format=model_format,
+        model_name=model_name,
+        kv_cache_dtype=kv_cache_dtype,
+    )
+    if mem_info is None:
+        print("The Specified model parameters is not match: `%s`" % model_name)
+        os._exit(1)
+    total_mem_g = math.ceil(mem_info.total / 1024.0)
+    print("model_name:", model_name)
+    print("kv_cache_dtype:", kv_cache_dtype)
+    print("model size: %.1f B" % (convert_model_size_to_float(size_in_billions)))
+    print("quant: %s" % (quantization))
+    print("context: %d" % (context_length))
+    print("gpu mem usage:")
+    print(" model mem: %d MB" % (mem_info.model_mem))
+    print(" kv_cache: %d MB" % (mem_info.kv_cache_mem))
+    print(" overhead: %d MB" % (mem_info.overhead))
+    print(" active: %d MB" % (mem_info.activation_mem))
+    print(" total: %d MB (%d GB)" % (mem_info.total, total_mem_g))
+
+
 if __name__ == "__main__":
     cli()
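
A hedged sketch of driving the two new commands through click's test runner; endpoint and model parameters are placeholders, and the same flags can be passed on the command line as xinference cached ... and xinference cal-model-mem ...:

    from click.testing import CliRunner

    from xinference.deploy.cmdline import cal_model_mem, list_cached_models

    runner = CliRunner()

    # Roughly: xinference cached -e http://127.0.0.1:9997   (placeholder endpoint)
    print(runner.invoke(list_cached_models, ["--endpoint", "http://127.0.0.1:9997"]).output)

    # Roughly: xinference cal-model-mem -s 7 -f pytorch -c 4096   (placeholder model parameters)
    print(
        runner.invoke(
            cal_model_mem,
            ["--size-in-billions", "7", "--model-format", "pytorch", "--context-length", "4096"],
        ).output
    )
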
xinference/deploy/test/test_cmdline.py CHANGED
@@ -19,6 +19,7 @@ from click.testing import CliRunner
 
 from ...client import Client
 from ..cmdline import (
+    list_cached_models,
     list_model_registrations,
     model_chat,
     model_generate,
@@ -278,3 +279,26 @@ def test_rotate_logs(setup_with_file_logging):
     with open(log_file, "r") as f:
         content = f.read()
     assert len(content) > 0
+
+
+def test_list_cached_models(setup):
+    endpoint, _ = setup
+    runner = CliRunner()
+
+    result = runner.invoke(
+        list_cached_models,
+        [
+            "--endpoint",
+            endpoint,
+        ],
+    )
+    assert result.exit_code == 0
+    assert "cached_model: " in result.stdout
+
+    # check if the output is in tabular format
+    assert "model_name" in result.stdout
+    assert "model_format" in result.stdout
+    assert "model_size_in_billions" in result.stdout
+    assert "quantizations" in result.stdout
+    assert "path" in result.stdout
+    assert "Actor IP Address" in result.stdout
xinference/fields.py CHANGED
@@ -75,7 +75,9 @@ stream_field = Field(
 )
 
 stream_option_field = Field(
-    default={},
+    default={
+        "include_usage": False,
+    },
     description="If set, an additional chunk will be streamed before the `data: [DONE]` message.",
 )
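
This field backs the OpenAI-style stream_options parameter, so usage reporting in streams is now explicitly opt-in rather than unset. A hedged sketch of a streaming request body that opts in (model uid is a placeholder, and the request key name stream_options follows the OpenAI-compatible convention):

    # Assumed shape of an OpenAI-compatible streaming chat request to Xinference;
    # include_usage=True asks for a final usage chunk before the data: [DONE] message.
    request_body = {
        "model": "my-model-uid",                       # hypothetical model uid
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    }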
 
xinference/model/llm/__init__.py CHANGED
@@ -116,6 +116,7 @@ def _install():
     from .pytorch.core import PytorchChatModel, PytorchModel
     from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
+    from .pytorch.intern_vl import InternVLChatModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
     from .pytorch.qwen_vl import QwenVLChatModel
@@ -156,6 +157,7 @@ def _install():
             QwenVLChatModel,
             YiVLChatModel,
             DeepSeekVLChatModel,
+            InternVLChatModel,
             PytorchModel,
         ]
     )