xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/oauth2/__init__.py +13 -0
- xinference/api/oauth2/common.py +14 -0
- xinference/api/oauth2/core.py +93 -0
- xinference/api/oauth2/types.py +36 -0
- xinference/api/oauth2/utils.py +44 -0
- xinference/api/restful_api.py +216 -27
- xinference/client/oscar/actor_client.py +18 -18
- xinference/client/restful/restful_client.py +96 -33
- xinference/conftest.py +63 -1
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +143 -3
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +244 -181
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +134 -13
- xinference/deploy/cmdline.py +142 -16
- xinference/deploy/local.py +39 -7
- xinference/deploy/supervisor.py +2 -0
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/core.py +8 -1
- xinference/model/embedding/core.py +3 -2
- xinference/model/embedding/model_spec_modelscope.json +60 -18
- xinference/model/image/stable_diffusion/core.py +4 -3
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +87 -3
- xinference/model/llm/llm_family.py +15 -5
- xinference/model/llm/llm_family_modelscope.json +92 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/core.py +8 -1
- xinference/model/multimodal/model_spec.json +9 -0
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/multimodal/qwen_vl.py +5 -9
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
- xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +36 -0
- xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
- xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
- xinference/web/ui/node_modules/react-cookie/package.json +55 -0
- xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
- xinference/web/ui/package-lock.json +37 -0
- xinference/web/ui/package.json +3 -2
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
- xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py
CHANGED
@@ -24,13 +24,13 @@ from xoscar.utils import get_next_port

 from .. import __version__
 from ..client import RESTfulClient
-from ..client.oscar.actor_client import ActorClient
 from ..client.restful.restful_client import (
     RESTfulChatglmCppChatModelHandle,
     RESTfulChatModelHandle,
     RESTfulGenerateModelHandle,
 )
 from ..constants import (
+    XINFERENCE_AUTH_DIR,
     XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
     XINFERENCE_DEFAULT_ENDPOINT_PORT,
     XINFERENCE_DEFAULT_LOCAL_HOST,
@@ -62,10 +62,37 @@ def get_endpoint(endpoint: Optional[str]) -> str:
     return endpoint


+def get_hash_endpoint(endpoint: str) -> str:
+    import hashlib
+
+    m = hashlib.sha256()
+    m.update(bytes(endpoint, "utf-8"))
+    return m.hexdigest()
+
+
+def get_stored_token(
+    endpoint: str, client: Optional[RESTfulClient] = None
+) -> Optional[str]:
+    rest_client = RESTfulClient(endpoint) if client is None else client
+    authed = rest_client._cluster_authed
+    if not authed:
+        return None
+
+    token_path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
+    if not os.path.exists(token_path):
+        raise RuntimeError("Cannot find access token, please login first!")
+    with open(token_path, "r") as f:
+        access_token = str(f.read())
+    return access_token
+
+
 def start_local_cluster(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    auth_config_file: Optional[str] = None,
 ):
     from .local import main

@@ -80,7 +107,10 @@ def start_local_cluster(
     main(
         host=host,
         port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
+        auth_config_file=auth_config_file,
     )


@@ -159,12 +189,42 @@ def cli(
     type=int,
     help="Specify the port number for the Xinference server.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=None,
+    type=str,
+    help="Specify the host address for the Xinference metrics exporter server, default is the same as --host.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    "-mp",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter server.",
+)
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
 def local(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
+    auth_config: Optional[str],
 ):
-
+    if metrics_exporter_host is None:
+        metrics_exporter_host = host
+    start_local_cluster(
+        log_level=log_level,
+        host=host,
+        port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
+        auth_config_file=auth_config,
+    )


 @click.command(
@@ -196,7 +256,18 @@ def local(
     type=int,
     help="Specify the port number for the Xinference supervisor.",
 )
-
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
+def supervisor(
+    log_level: str,
+    host: str,
+    port: int,
+    supervisor_port: Optional[int],
+    auth_config: Optional[str],
+):
     from ..deploy.supervisor import main

     dict_config = get_config_dict(
@@ -208,7 +279,11 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     logging.config.dictConfig(dict_config)  # type: ignore

     main(
-        host=host,
+        host=host,
+        port=port,
+        supervisor_port=supervisor_port,
+        logging_conf=dict_config,
+        auth_config_file=auth_config,
     )


@@ -235,8 +310,25 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     type=int,
     help="Specify the port number for the Xinference worker.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
+    type=str,
+    help="Specify the host address for the metrics exporter server.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter worker.",
+)
 def worker(
-    log_level: str,
+    log_level: str,
+    endpoint: Optional[str],
+    host: str,
+    worker_port: Optional[int],
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     from ..deploy.worker import main

@@ -257,6 +349,8 @@ def worker(
     main(
         address=address,
         supervisor_address=supervisor_internal_addr,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
     )

@@ -288,6 +382,7 @@ def register_model(
         model = fd.read()

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,
@@ -316,6 +411,7 @@ def unregister_model(
     endpoint = get_endpoint(endpoint)

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,
@@ -343,8 +439,9 @@ def list_model_registrations(
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     registrations = client.list_model_registrations(model_type=model_type)

     table = []
@@ -518,8 +615,9 @@ def model_launch(
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     model_uid = client.launch_model(
         model_name=model_name,
         model_type=model_type,
@@ -550,6 +648,7 @@ def model_list(endpoint: Optional[str]):

     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []
@@ -626,8 +725,8 @@ def model_terminate(
     model_uid: str,
 ):
     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -657,6 +756,8 @@ def model_generate(
     stream: bool,
 ):
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -669,7 +770,7 @@ def model_generate(
             if prompt == "":
                 break
             print(f"Completion: {prompt}", end="", file=sys.stdout)
-
+            for chunk in model.generate(
                 prompt=prompt,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
             ):
@@ -680,7 +781,6 @@ def model_generate(
                     print(choice["text"], end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -700,8 +800,7 @@ def model_generate(
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulGenerateModelHandle)
         ):
@@ -744,6 +843,9 @@ def model_chat(
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     chat_history: "List[ChatCompletionMessage]" = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
@@ -758,7 +860,7 @@ def model_chat(
                 break
             print("Assistant: ", end="", file=sys.stdout)
             response_content = ""
-
+            for chunk in model.chat(
                 prompt=prompt,
                 chat_history=chat_history,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
@@ -775,7 +877,6 @@ def model_chat(
                 ChatCompletionMessage(role="assistant", content=response_content)
             )

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -795,8 +896,7 @@ def model_chat(
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle)
         ):
@@ -822,5 +922,31 @@ def model_chat(
     )


+@cli.command("login", help="Login when the cluster is authenticated.")
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+@click.option("--username", type=str, required=True, help="Username.")
+@click.option(
+    "--password",
+    type=str,
+    required=True,
+    help="Password.",
+)
+def cluster_login(
+    endpoint: Optional[str],
+    username: str,
+    password: str,
+):
+    endpoint = get_endpoint(endpoint)
+    restful_client = RESTfulClient(base_url=endpoint)
+    if restful_client._cluster_authed:
+        restful_client.login(username, password)
+        access_token = restful_client._get_token()
+        assert access_token is not None
+        os.makedirs(XINFERENCE_AUTH_DIR, exist_ok=True)
+        hashed_ep = get_hash_endpoint(endpoint)
+        with open(os.path.join(XINFERENCE_AUTH_DIR, hashed_ep), "w") as f:
+            f.write(access_token)
+
+
 if __name__ == "__main__":
     cli()
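The helpers added above (`get_hash_endpoint`, `get_stored_token`) and the new `login` command share one convention: the access token for a cluster is cached in a file named after the sha256 digest of the endpoint URL, under `XINFERENCE_AUTH_DIR`. A minimal sketch of that round trip, where the `~/.xinference/auth` path is an illustrative stand-in for the real constant from `xinference.constants`:

```python
# Sketch of the token cache behind `xinference login` and the CLI commands
# above. The auth dir path is an assumption for illustration only.
import hashlib
import os

XINFERENCE_AUTH_DIR = os.path.expanduser("~/.xinference/auth")  # assumed path


def get_hash_endpoint(endpoint: str) -> str:
    # Same hashing as the diff: sha256 over the UTF-8 endpoint string.
    return hashlib.sha256(endpoint.encode("utf-8")).hexdigest()


def store_token(endpoint: str, access_token: str) -> str:
    # What `cluster_login` does after a successful login.
    os.makedirs(XINFERENCE_AUTH_DIR, exist_ok=True)
    path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
    with open(path, "w") as f:
        f.write(access_token)
    return path


def load_token(endpoint: str) -> str:
    # What `get_stored_token` does for an authenticated cluster.
    path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
    if not os.path.exists(path):
        raise RuntimeError("Cannot find access token, please login first!")
    with open(path) as f:
        return f.read()


if __name__ == "__main__":
    p = store_token("http://127.0.0.1:9997", "dummy-token")
    assert load_token("http://127.0.0.1:9997") == "dummy-token"
    print(f"token cached at {p}")
```

Hashing the endpoint yields one token file per cluster and keeps URL characters out of file names, so logins against different endpoints never clobber each other.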
xinference/deploy/local.py
CHANGED
@@ -35,6 +35,8 @@ logger = logging.getLogger(__name__)

 async def _start_local_cluster(
     address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
 ):
     from .utils import create_worker_actor_pool
@@ -50,7 +52,11 @@ async def _start_local_cluster(
             SupervisorActor, address=address, uid=SupervisorActor.uid()
         )
         await start_worker_components(
-            address=address,
+            address=address,
+            supervisor_address=address,
+            main_pool=pool,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
         )
         await pool.join()
     except asyncio.CancelledError:
@@ -58,7 +64,12 @@ async def _start_local_cluster(
         await pool.stop()


-def run(
+def run(
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+):
     def sigterm_handler(signum, frame):
         sys.exit(0)

@@ -66,22 +77,42 @@ def run(address: str, logging_conf: Optional[Dict] = None):

     loop = asyncio.get_event_loop()
     task = loop.create_task(
-        _start_local_cluster(
+        _start_local_cluster(
+            address=address,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
+            logging_conf=logging_conf,
+        )
     )
     loop.run_until_complete(task)


 def run_in_subprocess(
-    address: str,
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
 ) -> multiprocessing.Process:
-    p = multiprocessing.Process(
+    p = multiprocessing.Process(
+        target=run,
+        args=(address, metrics_exporter_host, metrics_exporter_port, logging_conf),
+    )
     p.start()
     return p


-def main(
+def main(
+    host: str,
+    port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
+):
     supervisor_address = f"{host}:{get_next_port()}"
-    local_cluster = run_in_subprocess(
+    local_cluster = run_in_subprocess(
+        supervisor_address, metrics_exporter_host, metrics_exporter_port, logging_conf
+    )

     if not health_check(
         address=supervisor_address,
@@ -98,6 +129,7 @@ def main(host: str, port: int, logging_conf: Optional[Dict] = None):
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
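For context, a hedged sketch of how the reworked `deploy.local.main` entry point is called once the CLI has parsed the new flags; the host and port values below are illustrative, not defaults taken from the code:

```python
# Illustrative call into the updated local-deployment entry point; keyword
# names follow the new signature in the diff, values are made up.
from xinference.deploy.local import main

main(
    host="127.0.0.1",
    port=9997,
    metrics_exporter_host="127.0.0.1",  # new: where the metrics exporter binds
    metrics_exporter_port=9998,         # new: left as None if unused
    logging_conf=None,
    auth_config_file=None,              # new: auth config JSON path, if the cluster is authenticated
)
```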
xinference/deploy/supervisor.py
CHANGED
@@ -75,6 +75,7 @@ def main(
     port: int,
     supervisor_port: Optional[int],
     logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
 ):
     supervisor_address = f"{host}:{supervisor_port or get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
@@ -94,6 +95,7 @@ def main(
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
xinference/deploy/worker.py
CHANGED
@@ -27,7 +27,11 @@ logger = logging.getLogger(__name__)


 async def start_worker_components(
-    address: str,
+    address: str,
+    supervisor_address: str,
+    main_pool: MainActorPoolType,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     cuda_device_indices = []
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
@@ -43,24 +47,48 @@ async def start_worker_components(
         supervisor_address=supervisor_address,
         main_pool=main_pool,
         cuda_devices=cuda_device_indices,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )


 async def _start_worker(
-    address: str,
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Any = None,
 ):
     from .utils import create_worker_actor_pool

     pool = await create_worker_actor_pool(address=address, logging_conf=logging_conf)
     await start_worker_components(
-        address=address,
+        address=address,
+        supervisor_address=supervisor_address,
+        main_pool=pool,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )
     await pool.join()


-def main(
+def main(
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[dict] = None,
+):
     loop = asyncio.get_event_loop()
-    task = loop.create_task(
+    task = loop.create_task(
+        _start_worker(
+            address,
+            supervisor_address,
+            metrics_exporter_host,
+            metrics_exporter_port,
+            logging_conf,
+        )
+    )

     try:
         loop.run_until_complete(task)
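The reshaped `main` keeps the pattern the old code already used: build the `_start_worker` coroutine, wrap it in a task on the event loop, and block until it completes. A self-contained sketch of the same pattern, with `do_work` standing in for `_start_worker`:

```python
import asyncio


async def do_work(address: str, supervisor_address: str) -> None:
    # Stand-in for _start_worker: create the actor pool, start the worker
    # components, then wait on the pool.
    print(f"worker at {address}, reporting to {supervisor_address}")
    await asyncio.sleep(0)


def main(address: str, supervisor_address: str) -> None:
    # The diff uses asyncio.get_event_loop(); a fresh loop is used here to
    # keep the sketch warning-free on modern Python.
    loop = asyncio.new_event_loop()
    task = loop.create_task(do_work(address, supervisor_address))
    try:
        loop.run_until_complete(task)
    finally:
        loop.close()


main("127.0.0.1:30001", "127.0.0.1:9997")
```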
xinference/fields.py
CHANGED
@@ -30,7 +30,10 @@ logprobs_field = Field(
 )

 max_tokens_field = Field(
-    default=
+    default=1024,
+    ge=1,
+    le=32768,
+    description="The maximum number of tokens to generate.",
 )

 temperature_field = Field(
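With the completed definition, out-of-range values are rejected at request-validation time rather than deep in the model code. A small sketch of the bounds in action; `CompletionRequest` is a hypothetical model for illustration, not the actual xinference request type:

```python
from pydantic import BaseModel, Field, ValidationError

max_tokens_field = Field(
    default=1024,
    ge=1,
    le=32768,
    description="The maximum number of tokens to generate.",
)


class CompletionRequest(BaseModel):  # hypothetical request model
    max_tokens: int = max_tokens_field


print(CompletionRequest().max_tokens)                # 1024, the default
print(CompletionRequest(max_tokens=256).max_tokens)  # in range, accepted
try:
    CompletionRequest(max_tokens=0)                  # violates ge=1
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["loc"])
```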
xinference/model/core.py
CHANGED
@@ -78,7 +78,14 @@ def create_model_instance(
     elif model_type == "multimodal":
         kwargs.pop("trust_remote_code", None)
         return create_multimodal_model_instance(
-            subpool_addr,
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            model_format,
+            model_size_in_billions,
+            quantization,
+            **kwargs,
         )
     else:
         raise ValueError(f"Unsupported model type: {model_type}.")
xinference/model/embedding/core.py
CHANGED

@@ -40,7 +40,8 @@ class EmbeddingModelSpec(BaseModel):
     max_tokens: int
     language: List[str]
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
+    model_hub: str = "huggingface"


 class EmbeddingModelDescription(ModelDescription):
@@ -165,7 +166,7 @@ def cache(model_spec: EmbeddingModelSpec):
     if valid_model_revision(meta_path, model_spec.model_revision):
         return cache_dir

-    from_modelscope: bool = model_spec.
+    from_modelscope: bool = model_spec.model_hub == "modelscope"
     if from_modelscope:
         download_dir = retry_download(
             ms_download,