xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +1 -1
- xinference/core/model.py +17 -6
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +7 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +438 -7
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +258 -5
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +115 -33
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +94 -12
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +96 -51
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +54 -20
- xinference/model/llm/vllm/core.py +101 -7
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +11 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.551aa479.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-
+ "date": "2024-05-17T14:10:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
 }
 ''' # END VERSION_JSON

xinference/api/oauth2/auth_service.py
CHANGED

@@ -48,7 +48,7 @@ class AuthService:

     def init_auth_config(self):
         if self._auth_config_file:
-            config: AuthStartupConfig = parse_file_as(
+            config: AuthStartupConfig = parse_file_as(  # type: ignore
                 path=self._auth_config_file, type_=AuthStartupConfig
             )
             all_api_keys = set()
xinference/api/restful_api.py
CHANGED
@@ -275,6 +275,16 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/engines/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",

@@ -347,16 +357,6 @@ class RESTfulAPI:
                 else None
             ),
         )
-        self._router.add_api_route(
-            "/experimental/speculative_llms",
-            self.launch_speculative_llm,
-            methods=["POST"],
-            dependencies=(
-                [Security(self._auth_service, scopes=["models:start"])]
-                if self.is_authenticated()
-                else None
-            ),
-        )
         self._router.add_api_route(
             "/v1/models/{model_uid}",
             self.terminate_model,

@@ -639,57 +639,17 @@ class RESTfulAPI:
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

-    async def launch_speculative_llm(self, request: Request) -> JSONResponse:
-        payload = await request.json()
-        model_uid = payload.get("model_uid")
-        model_name = payload.get("model_name")
-        model_size_in_billions = payload.get("model_size_in_billions")
-        quantization = payload.get("quantization")
-        draft_model_name = payload.get("draft_model_name")
-        draft_model_size_in_billions = payload.get("draft_model_size_in_billions")
-        draft_quantization = payload.get("draft_quantization")
-        n_gpu = payload.get("n_gpu", "auto")
-
-        if not model_name:
-            raise HTTPException(
-                status_code=400,
-                detail="Invalid input. Please specify the model name",
-            )
-
-        try:
-            model_uid = await (await self._get_supervisor_ref()).launch_speculative_llm(
-                model_uid=model_uid,
-                model_name=model_name,
-                model_size_in_billions=model_size_in_billions,
-                quantization=quantization,
-                draft_model_name=draft_model_name,
-                draft_model_size_in_billions=draft_model_size_in_billions,
-                draft_quantization=draft_quantization,
-                n_gpu=n_gpu,
-            )
-
-        except ValueError as ve:
-            logger.error(str(ve), exc_info=True)
-            raise HTTPException(status_code=400, detail=str(ve))
-        except RuntimeError as re:
-            logger.error(str(re), exc_info=True)
-            raise HTTPException(status_code=503, detail=str(re))
-        except Exception as e:
-            logger.error(str(e), exc_info=True)
-            raise HTTPException(status_code=500, detail=str(e))
-
-        return JSONResponse(content={"model_uid": model_uid})
-
     async def launch_model(
         self, request: Request, wait_ready: bool = Query(True)
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
         model_name = payload.get("model_name")
+        model_engine = payload.get("model_engine")
         model_size_in_billions = payload.get("model_size_in_billions")
         model_format = payload.get("model_format")
         quantization = payload.get("quantization")
-        model_type = payload.get("model_type")
+        model_type = payload.get("model_type", "LLM")
         replica = payload.get("replica", 1)
         n_gpu = payload.get("n_gpu", "auto")
         request_limits = payload.get("request_limits", None)

@@ -700,6 +660,7 @@ class RESTfulAPI:
         exclude_keys = {
             "model_uid",
             "model_name",
+            "model_engine",
             "model_size_in_billions",
             "model_format",
             "quantization",

@@ -719,7 +680,12 @@ class RESTfulAPI:
         if not model_name:
             raise HTTPException(
                 status_code=400,
-                detail="Invalid input. Please specify the
+                detail="Invalid input. Please specify the `model_name` field.",
+            )
+        if not model_engine and model_type == "LLM":
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid input. Please specify the `model_engine` field.",
             )

         if peft_model_config is not None:

@@ -731,6 +697,7 @@ class RESTfulAPI:
             model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
                 model_uid=model_uid,
                 model_name=model_name,
+                model_engine=model_engine,
                 model_size_in_billions=model_size_in_billions,
                 model_format=model_format,
                 quantization=quantization,

@@ -776,6 +743,7 @@ class RESTfulAPI:
     ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
+        model_engine = payload.get("model_engine")
         model_type = payload.get("model_type")
         model_version = payload.get("model_version")
         replica = payload.get("replica", 1)

@@ -786,6 +754,7 @@ class RESTfulAPI:
                 await self._get_supervisor_ref()
             ).launch_model_by_version(
                 model_uid=model_uid,
+                model_engine=model_engine,
                 model_type=model_type,
                 model_version=model_version,
                 replica=replica,

@@ -1085,6 +1054,7 @@ class RESTfulAPI:

     async def create_transcriptions(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
         language: Optional[str] = Form(None),

@@ -1093,6 +1063,10 @@ class RESTfulAPI:
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)

@@ -1116,6 +1090,7 @@ class RESTfulAPI:
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=transcription, media_type="application/json")

@@ -1130,13 +1105,19 @@ class RESTfulAPI:

     async def create_translations(
         self,
+        request: Request,
         model: str = Form(...),
         file: UploadFile = File(media_type="application/octet-stream"),
+        language: Optional[str] = Form(None),
         prompt: Optional[str] = Form(None),
         response_format: Optional[str] = Form("json"),
         temperature: Optional[float] = Form(0),
         kwargs: Optional[str] = Form(None),
     ) -> Response:
+        form = await request.form()
+        timestamp_granularities = form.get("timestamp_granularities[]")
+        if timestamp_granularities:
+            timestamp_granularities = [timestamp_granularities]
         model_uid = model
         try:
             model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)

@@ -1156,9 +1137,11 @@ class RESTfulAPI:
             parsed_kwargs = {}
             translation = await model_ref.translations(
                 audio=await file.read(),
+                language=language,
                 prompt=prompt,
                 response_format=response_format,
                 temperature=temperature,
+                timestamp_granularities=timestamp_granularities,
                 **parsed_kwargs,
             )
             return Response(content=translation, media_type="application/json")

@@ -1274,11 +1257,7 @@ class RESTfulAPI:

         messages = body.messages and list(body.messages) or None

-        if (
-            not messages
-            or messages[-1].get("role") not in ["user", "system", "tool"]
-            or not messages[-1].get("content")
-        ):
+        if not messages or messages[-1].get("role") not in ["user", "system", "tool"]:
             raise HTTPException(
                 status_code=400, detail="Invalid input. Please specify the prompt."
             )

@@ -1298,15 +1277,15 @@ class RESTfulAPI:
                 {"role": "system", "content": ". ".join(system_messages_contents)}
             )

-        assert non_system_messages
-
         has_tool_message = messages[-1].get("role") == "tool"
         if has_tool_message:
             prompt = SPECIAL_TOOL_PROMPT
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages  # exclude the prompt
         else:
-            prompt =
+            prompt = None
+            if non_system_messages:
+                prompt = non_system_messages[-1]["content"]
             system_prompt = system_messages[0]["content"] if system_messages else None
             chat_history = non_system_messages[:-1]  # exclude the prompt

@@ -1418,6 +1397,19 @@ class RESTfulAPI:
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))

+    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
+        try:
+            content = await (
+                await self._get_supervisor_ref()
+            ).query_engines_by_model_name(model_name)
+            return JSONResponse(content=content)
+        except ValueError as re:
+            logger.error(re, exc_info=True)
+            raise HTTPException(status_code=400, detail=str(re))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def register_model(self, model_type: str, request: Request) -> JSONResponse:
         body = RegisterModelRequest.parse_obj(await request.json())
         model = body.model
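Taken together, the restful_api.py changes above drop the experimental speculative-decoding route, add a GET `/v1/engines/{model_name}` endpoint, and make `model_engine` mandatory when launching an LLM. A minimal sketch of exercising the new routes over plain HTTP; the server address, the model name, and the POST `/v1/models` launch path are illustrative assumptions and not taken from this diff:

```python
# Sketch only: hitting the 0.11.x routes changed above.
# Assumptions (not part of this diff): server at 127.0.0.1:9997,
# model name "qwen1.5-chat", and POST /v1/models as the launch path.
import requests

base = "http://127.0.0.1:9997"
model_name = "qwen1.5-chat"  # illustrative

# New GET /v1/engines/{model_name}: engines supported for a registered model.
engines = requests.get(f"{base}/v1/engines/{model_name}").json()
engine = next(iter(engines))  # pick any engine reported for this model

# Launching an LLM now requires `model_engine`; omitting it returns HTTP 400
# ("Invalid input. Please specify the `model_engine` field.").
resp = requests.post(
    f"{base}/v1/models",
    json={"model_name": model_name, "model_engine": engine},
)
resp.raise_for_status()
print(resp.json())
```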
xinference/client/restful/restful_client.py
CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 import json
 import typing
-import warnings
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

 import requests

@@ -566,6 +565,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Transcribes audio into the input language.

@@ -589,6 +589,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------

@@ -601,12 +606,13 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to transcribe the audio, detail: {_get_error_string(response)}"

@@ -618,9 +624,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
     def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         """
         Translates audio into English.

@@ -631,6 +639,9 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         audio: bytes
             The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg,
             mpga, m4a, ogg, wav, or webm.
+        language: Optional[str]
+            The language of the input audio. Supplying the input language in ISO-639-1
+            (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes) format will improve accuracy and latency.
         prompt: Optional[str]
             An optional text to guide the model's style or continue a previous audio segment.
             The prompt should match the audio language.

@@ -641,6 +652,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             while lower values like 0.2 will make it more focused and deterministic.
             If set to 0, the model will use log probability to automatically increase the temperature
             until certain thresholds are hit.
+        timestamp_granularities: Optional[List[str]], default is None.
+            The timestamp granularities to populate for this transcription. response_format must be set verbose_json
+            to use timestamp granularities. Either or both of these options are supported: word, or segment.
+            Note: There is no additional latency for segment timestamps, but generating word timestamps incurs
+            additional latency.

         Returns
         -------

@@ -649,15 +665,17 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         url = f"{self._base_url}/v1/audio/translations"
         params = {
             "model": self._model_uid,
+            "language": language,
             "prompt": prompt,
             "response_format": response_format,
             "temperature": temperature,
+            "timestamp_granularities[]": timestamp_granularities,
         }
         files: List[Any] = []
-        for key, value in params.items():
-            files.append((key, (None, value)))
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = requests.post(
+            url, data=params, files=files, headers=self.auth_headers
+        )
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to translate the audio, detail: {_get_error_string(response)}"

@@ -754,60 +772,11 @@ class Client:
         model_list = response_data["data"]
         return {item["id"]: item for item in model_list}

-    def launch_speculative_llm(
-        self,
-        model_name: str,
-        model_size_in_billions: Optional[Union[int, str, float]],
-        quantization: Optional[str],
-        draft_model_name: str,
-        draft_model_size_in_billions: Optional[int],
-        draft_quantization: Optional[str],
-        n_gpu: Optional[Union[int, str]] = "auto",
-    ):
-        """
-        Launch the LLM along with a draft model based on the parameters on the server via RESTful APIs. This is an
-        experimental feature and the API may change in the future.
-
-        Returns
-        -------
-        str
-            The unique model_uid for the launched model.
-
-        """
-        warnings.warn(
-            "`launch_speculative_llm` is an experimental feature and the API may change in the future."
-        )
-
-        # convert float to int or string since the RESTful API does not accept float.
-        if isinstance(model_size_in_billions, float):
-            model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
-
-        payload = {
-            "model_uid": None,
-            "model_name": model_name,
-            "model_size_in_billions": model_size_in_billions,
-            "quantization": quantization,
-            "draft_model_name": draft_model_name,
-            "draft_model_size_in_billions": draft_model_size_in_billions,
-            "draft_quantization": draft_quantization,
-            "n_gpu": n_gpu,
-        }
-
-        url = f"{self.base_url}/experimental/speculative_llms"
-        response = requests.post(url, json=payload, headers=self._headers)
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Failed to launch model, detail: {_get_error_string(response)}"
-            )
-
-        response_data = response.json()
-        model_uid = response_data["model_uid"]
-        return model_uid
-
     def launch_model(
         self,
         model_name: str,
         model_type: str = "LLM",
+        model_engine: Optional[str] = None,
         model_uid: Optional[str] = None,
         model_size_in_billions: Optional[Union[int, str, float]] = None,
         model_format: Optional[str] = None,

@@ -829,6 +798,8 @@ class Client:
             The name of model.
         model_type: str
             type of model.
+        model_engine: Optional[str]
+            Specify the inference engine of the model when launching LLM.
         model_uid: str
             UID of model, auto generate a UUID if is None.
         model_size_in_billions: Optional[Union[int, str, float]]

@@ -872,6 +843,7 @@ class Client:
         payload = {
             "model_uid": model_uid,
             "model_name": model_name,
+            "model_engine": model_engine,
             "peft_model_config": peft_model_config,
             "model_type": model_type,
             "model_size_in_billions": model_size_in_billions,

@@ -1157,3 +1129,26 @@ class Client:

         response_data = response.json()
         return response_data
+
+    def query_engine_by_model_name(self, model_name: str):
+        """
+        Get the engine parameters with the model name registered on the server.
+
+        Parameters
+        ----------
+        model_name: str
+            The name of the model.
+        Returns
+        -------
+        Dict[str, List[Dict[str, Any]]]
+            The supported engine parameters of registered models on the server.
+        """
+        url = f"{self.base_url}/v1/engines/{model_name}"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to query engine parameters by model name, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
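Correspondingly, the Python client above drops `launch_speculative_llm`, adds `model_engine` to `launch_model`, gains `query_engine_by_model_name`, and extends the audio handle with `language` and `timestamp_granularities`. A rough usage sketch based on the signatures in this diff; the endpoint, model name, and audio-model UID are illustrative assumptions:

```python
# Sketch of the updated client surface shown in this diff.
# Assumptions: server at 127.0.0.1:9997, an LLM named "qwen1.5-chat",
# and a previously launched whisper-style audio model (UID is a placeholder).
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# New: inspect which engines (and formats/quantizations) a model supports.
engine_params = client.query_engine_by_model_name("qwen1.5-chat")

# `model_engine` must now be supplied when launching an LLM.
model_uid = client.launch_model(
    model_name="qwen1.5-chat",
    model_engine=next(iter(engine_params)),
)

# Audio handles gained `timestamp_granularities` (and `language` on translations).
audio_model = client.get_model("<audio-model-uid>")  # placeholder UID
with open("speech.wav", "rb") as f:
    result = audio_model.transcriptions(
        f.read(),
        response_format="verbose_json",
        timestamp_granularities=["segment"],
    )
```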
xinference/conftest.py
CHANGED
@@ -237,7 +237,7 @@ def setup_with_file_logging():
         logging_conf=TEST_FILE_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")

     try:
xinference/core/cache_tracker.py
CHANGED
@@ -22,7 +22,7 @@ logger = getLogger(__name__)
 class CacheTrackerActor(xo.Actor):
     def __init__(self):
         super().__init__()
-        self._model_name_to_version_info: Dict[str, List[Dict]] = {}
+        self._model_name_to_version_info: Dict[str, List[Dict]] = {}  # type: ignore

     @classmethod
     def uid(cls) -> str:

xinference/core/chat_interface.py
CHANGED

@@ -109,6 +109,7 @@ class GradioInterface:
         history: List[List[str]],
         max_tokens: int,
         temperature: float,
+        lora_name: str,
     ) -> Generator:
         from ..client import RESTfulClient

@@ -127,6 +128,7 @@ class GradioInterface:
                 "max_tokens": int(max_tokens),
                 "temperature": temperature,
                 "stream": True,
+                "lora_name": lora_name,
             },
         ):
             assert isinstance(chunk, dict)

@@ -152,6 +154,7 @@ class GradioInterface:
                 gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
+                gr.Text(label="LoRA Name"),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""

@@ -331,7 +334,7 @@ class GradioInterface:
                 history: hist,
             }

-        def complete(text, hist, max_tokens, temperature) -> Generator:
+        def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)

@@ -349,6 +352,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)

@@ -368,7 +372,7 @@ class GradioInterface:
                 history: hist,
             }

-        def retry(text, hist, max_tokens, temperature) -> Generator:
+        def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
             from ..client import RESTfulClient

             client = RESTfulClient(self.endpoint)

@@ -387,6 +391,7 @@ class GradioInterface:
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "stream": True,
+                    "lora_name": lora_name,
                 },
             ):
                 assert isinstance(chunk, dict)

@@ -470,10 +475,11 @@ class GradioInterface:
                 temperature = gr.Slider(
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 )
+                lora_name = gr.Text(label="LoRA Name")

             btn_generate.click(
                 fn=complete,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )

@@ -485,7 +491,7 @@ class GradioInterface:

             btn_retry.click(
                 fn=retry,
-                inputs=[textbox, history, length, temperature],
+                inputs=[textbox, history, length, temperature, lora_name],
                 outputs=[textbox, history],
             )
xinference/core/event.py
CHANGED
@@ -37,7 +37,7 @@ class Event(TypedDict):
 class EventCollectorActor(xo.StatelessActor):
     def __init__(self):
         super().__init__()
-        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(
+        self._model_uid_to_events: Dict[str, queue.Queue] = defaultdict(  # type: ignore
             lambda: queue.Queue(maxsize=MAX_EVENT_COUNT_PER_MODEL)
         )
xinference/core/model.py
CHANGED
@@ -25,6 +25,7 @@ from typing import (
     AsyncGenerator,
     Callable,
     Dict,
+    Generator,
     Iterator,
     List,
     Optional,

@@ -153,7 +154,6 @@ class ModelActor(xo.StatelessActor):
     ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
-        from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel

         self._worker_address = worker_address

@@ -167,7 +167,7 @@
         self._current_generator = lambda: None
         self._lock = (
             None
-            if isinstance(self._model, (PytorchModel,
+            if isinstance(self._model, (PytorchModel, VLLMModel))
             else asyncio.locks.Lock()
         )
         self._worker_ref = None

@@ -257,7 +257,7 @@
             for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.
+                final_usage = v.get("usage", None)
                 v = dict(data=json.dumps(v))
                 yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:

@@ -289,7 +289,7 @@
             async for v in gen:
                 if time_to_first_token is None:
                     time_to_first_token = (time.time() - start_time) * 1000
-                final_usage = v.
+                final_usage = v.get("usage", None)
                 v = await asyncio.to_thread(json.dumps, v)
                 v = dict(data=v)  # noqa: F821
                 yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)

@@ -379,8 +379,13 @@
             raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
         finally:
             # For the non stream result.
-
-
+            record = None
+            if isinstance(response, Generator) or isinstance(response, AsyncGenerator):
+                record = response
+            elif isinstance(response, bytes):
+                record = json.loads(response)
+            if record and isinstance(record, dict):
+                usage = record["usage"]
                 # Some backends may not have a valid usage, we just skip them.
                 completion_tokens = usage["completion_tokens"]
                 prompt_tokens = usage["prompt_tokens"]

@@ -436,6 +441,7 @@
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "transcriptions"):
             return await self._call_wrapper(

@@ -445,6 +451,7 @@
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."

@@ -455,17 +462,21 @@
     async def translations(
         self,
         audio: bytes,
+        language: Optional[str] = None,
         prompt: Optional[str] = None,
         response_format: str = "json",
         temperature: float = 0,
+        timestamp_granularities: Optional[List[str]] = None,
     ):
         if hasattr(self._model, "translations"):
             return await self._call_wrapper(
                 self._model.translations,
                 audio,
+                language,
                 prompt,
                 response_format,
                 temperature,
+                timestamp_granularities,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating translations."