xinference 1.6.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.


Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +64 -2
  4. xinference/core/media_interface.py +123 -0
  5. xinference/core/model.py +31 -0
  6. xinference/core/supervisor.py +8 -17
  7. xinference/core/worker.py +5 -17
  8. xinference/deploy/cmdline.py +6 -2
  9. xinference/model/audio/chattts.py +24 -39
  10. xinference/model/audio/cosyvoice.py +18 -30
  11. xinference/model/audio/funasr.py +42 -0
  12. xinference/model/audio/model_spec.json +18 -0
  13. xinference/model/audio/model_spec_modelscope.json +19 -1
  14. xinference/model/audio/utils.py +75 -0
  15. xinference/model/core.py +1 -0
  16. xinference/model/embedding/__init__.py +74 -18
  17. xinference/model/embedding/core.py +98 -597
  18. xinference/model/embedding/embed_family.py +133 -0
  19. xinference/model/embedding/flag/__init__.py +13 -0
  20. xinference/model/embedding/flag/core.py +282 -0
  21. xinference/model/embedding/model_spec.json +24 -0
  22. xinference/model/embedding/model_spec_modelscope.json +24 -0
  23. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  24. xinference/model/embedding/sentence_transformers/core.py +399 -0
  25. xinference/model/embedding/vllm/__init__.py +0 -0
  26. xinference/model/embedding/vllm/core.py +95 -0
  27. xinference/model/image/model_spec.json +20 -2
  28. xinference/model/image/model_spec_modelscope.json +21 -2
  29. xinference/model/image/stable_diffusion/core.py +144 -53
  30. xinference/model/llm/llama_cpp/memory.py +4 -2
  31. xinference/model/llm/llm_family.json +57 -0
  32. xinference/model/llm/llm_family_modelscope.json +61 -0
  33. xinference/model/llm/sglang/core.py +4 -0
  34. xinference/model/llm/utils.py +11 -0
  35. xinference/model/llm/vllm/core.py +3 -0
  36. xinference/model/rerank/core.py +86 -4
  37. xinference/model/rerank/model_spec.json +24 -0
  38. xinference/model/rerank/model_spec_modelscope.json +24 -0
  39. xinference/model/rerank/utils.py +4 -3
  40. xinference/model/utils.py +38 -1
  41. xinference/model/video/diffusers.py +65 -3
  42. xinference/model/video/model_spec.json +31 -4
  43. xinference/model/video/model_spec_modelscope.json +32 -4
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  47. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  49. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  56. xinference/web/ui/src/locales/en.json +18 -7
  57. xinference/web/ui/src/locales/ja.json +224 -0
  58. xinference/web/ui/src/locales/ko.json +224 -0
  59. xinference/web/ui/src/locales/zh.json +18 -7
  60. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/METADATA +9 -8
  61. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/RECORD +66 -57
  62. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  63. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  64. xinference/web/ui/build/static/js/main.ddf9eaee.js +0 -3
  65. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.ddf9eaee.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  73. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/WHEEL +0 -0
  74. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  76. {xinference-1.6.1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-05-30T19:36:43+0800",
+ "date": "2025-06-13T18:51:07+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "72cc5e39040bdc49981b240c2b59af998554a75f",
- "version": "1.6.1"
+ "full-revisionid": "a362dba7334ef08c758bbc4a3d4904fe53cefe78",
+ "version": "1.7.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -387,6 +387,7 @@ class RESTfulAPI(CancelMixin):
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        # just for compatibility, LLM only
         self._router.add_api_route(
             "/v1/engines/{model_name}",
             self.query_engines_by_model_name,
@@ -397,6 +398,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        # engines for all model types
+        self._router.add_api_route(
+            "/v1/engines/{model_type}/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -708,6 +720,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        self._router.add_api_route(
+            "/v1/video/generations/flf",
+            self.create_videos_from_first_last_frame,
+            methods=["POST"],
+            response_model=VideoList,
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:read"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         self._router.add_api_route(
             "/v1/chat/completions",
             self.create_chat_completion,
@@ -2084,6 +2107,57 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def create_videos_from_first_last_frame(
+        self,
+        model: str = Form(...),
+        first_frame: UploadFile = File(media_type="application/octet-stream"),
+        last_frame: UploadFile = File(media_type="application/octet-stream"),
+        prompt: Optional[Union[str, List[str]]] = Form(None),
+        negative_prompt: Optional[Union[str, List[str]]] = Form(None),
+        n: Optional[int] = Form(1),
+        kwargs: Optional[str] = Form(None),
+    ) -> Response:
+        model_uid = model
+        try:
+            model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise HTTPException(status_code=400, detail=str(ve))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            raise HTTPException(status_code=500, detail=str(e))
+
+        request_id = None
+        try:
+            if kwargs is not None:
+                parsed_kwargs = json.loads(kwargs)
+            else:
+                parsed_kwargs = {}
+            request_id = parsed_kwargs.get("request_id")
+            self._add_running_task(request_id)
+            video_list = await model_ref.flf_to_video(
+                first_frame=Image.open(first_frame.file),
+                last_frame=Image.open(last_frame.file),
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                n=n,
+                **parsed_kwargs,
+            )
+            return Response(content=video_list, media_type="application/json")
+        except asyncio.CancelledError:
+            err_str = f"The request has been cancelled: {request_id}"
+            logger.error(err_str)
+            await self._report_error_event(model_uid, err_str)
+            raise HTTPException(status_code=409, detail=err_str)
+        except Exception as e:
+            e = await self._get_model_last_error(model_ref.uid, e)
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            self.handle_request_limit_error(e)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def create_chat_completion(self, request: Request) -> Response:
         raw_body = await request.json()
         body = CreateChatCompletion.parse_obj(raw_body)
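Because the handler above reads multipart form fields, the endpoint can be exercised directly over HTTP; a minimal sketch (the endpoint address, model uid, and image paths are placeholders, not taken from this diff):

    import json
    import requests

    base_url = "http://127.0.0.1:9997"  # assumed local Xinference endpoint
    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        resp = requests.post(
            f"{base_url}/v1/video/generations/flf",
            files={"first_frame": f1, "last_frame": f2},
            data={
                "model": "my-video-model",  # placeholder model uid
                "prompt": "a sailboat drifting toward the sunset",
                "n": 1,
                "kwargs": json.dumps({"num_frames": 16}),
            },
        )
    print(resp.json())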
@@ -2234,11 +2308,14 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
+    async def query_engines_by_model_name(
+        self, request: Request, model_name: str, model_type: Optional[str] = None
+    ) -> JSONResponse:
         try:
+            model_type = model_type or request.path_params.get("model_type", "LLM")
             content = await (
                 await self._get_supervisor_ref()
-            ).query_engines_by_model_name(model_name)
+            ).query_engines_by_model_name(model_name, model_type=model_type)
             return JSONResponse(content=content)
         except ValueError as re:
             logger.error(re, exc_info=True)
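With this handler in place, the old and new engine-query routes compare as follows from a client's point of view; a minimal sketch (the endpoint address and model names are placeholders, not taken from this diff):

    import requests

    base_url = "http://127.0.0.1:9997"  # assumed local Xinference endpoint

    # Legacy LLM-only route, kept for compatibility:
    resp = requests.get(f"{base_url}/v1/engines/qwen2.5-instruct")

    # New route, parameterized by model type (e.g. "LLM", "embedding", "rerank"):
    resp = requests.get(f"{base_url}/v1/engines/embedding/bge-m3")
    print(resp.json())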
xinference/client/restful/restful_client.py CHANGED
@@ -510,6 +510,59 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
         response_data = response.json()
         return response_data
 
+    def flf_to_video(
+        self,
+        first_frame: Union[str, bytes],
+        last_frame: Union[str, bytes],
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        **kwargs,
+    ) -> "VideoList":
+        """
+        Creates a video from the first frame, the last frame, and a text prompt.
+
+        Parameters
+        ----------
+        first_frame: `Union[str, bytes]`
+            The first frame to condition the generation on.
+        last_frame: `Union[str, bytes]`
+            The last frame to condition the generation on.
+        prompt: `str` or `List[str]`
+            The prompt or prompts to guide video generation.
+        negative_prompt: `str` or `List[str]`, *optional*
+            The prompt or prompts not to guide the video generation.
+        n: `int`, defaults to 1
+            The number of videos to generate per prompt. Must be between 1 and 10.
+        Returns
+        -------
+        VideoList
+            A list of video objects.
+        """
+        url = f"{self._base_url}/v1/video/generations/flf"
+        params = {
+            "model": self._model_uid,
+            "prompt": prompt,
+            "negative_prompt": negative_prompt,
+            "n": n,
+            "kwargs": json.dumps(kwargs),
+        }
+        files: List[Any] = []
+        for key, value in params.items():
+            files.append((key, (None, value)))
+        files.append(
+            ("first_frame", ("image", first_frame, "application/octet-stream"))
+        )
+        files.append(("last_frame", ("image", last_frame, "application/octet-stream")))
+        response = requests.post(url, files=files, headers=self.auth_headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to create the video from the first/last frames, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
+
 
 class RESTfulGenerateModelHandle(RESTfulModelHandle):
     def generate(
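Typical usage through this handle might look as follows (the endpoint, model uid, and file names are placeholders):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
    model = client.get_model("my-video-model")  # a model with the firstlastframe2video ability

    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        video_list = model.flf_to_video(
            first_frame=f1.read(),
            last_frame=f2.read(),
            prompt="a sailboat drifting toward the sunset",
            n=1,
        )
    print(video_list)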
@@ -637,6 +690,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
         timestamp_granularities: Optional[List[str]] = None,
+        **kwargs,
     ):
         """
         Transcribes audio into the input language.
@@ -678,6 +732,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "temperature": temperature,
             "timestamp_granularities[]": timestamp_granularities,
+            "kwargs": json.dumps(kwargs),
         }
         files: List[Any] = []
         files.append(("file", ("file", audio, "application/octet-stream")))
@@ -1502,7 +1557,9 @@ class Client:
         response_data = response.json()
         return response_data
 
-    def query_engine_by_model_name(self, model_name: str):
+    def query_engine_by_model_name(
+        self, model_name: str, model_type: Optional[str] = "LLM"
+    ):
         """
         Get the engine parameters for a model name registered on the server.
 
@@ -1510,12 +1567,17 @@ class Client:
         ----------
         model_name: str
             The name of the model.
+        model_type: str
+            The model type, "LLM" by default.
         Returns
         -------
         Dict[str, List[Dict[str, Any]]]
             The supported engine parameters of registered models on the server.
         """
-        url = f"{self.base_url}/v1/engines/{model_name}"
+        if not model_type:
+            url = f"{self.base_url}/v1/engines/{model_name}"
+        else:
+            url = f"{self.base_url}/v1/engines/{model_type}/{model_name}"
         response = requests.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
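A usage sketch for the extended client method (the endpoint and model names are placeholders):

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed endpoint

    # Defaults to model_type="LLM", matching the old behavior:
    llm_engines = client.query_engine_by_model_name("qwen2.5-instruct")

    # An explicit type goes through the new /v1/engines/{model_type}/{model_name} route:
    emb_engines = client.query_engine_by_model_name("bge-m3", model_type="embedding")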
xinference/core/media_interface.py CHANGED
@@ -577,6 +577,126 @@ class MediaInterface:
 
         return image2video_ui
 
+    def flf2video_interface(self) -> "gr.Blocks":
+        def generate_video_from_flf(
+            first_frame: "PIL.Image.Image",
+            last_frame: "PIL.Image.Image",
+            prompt: str,
+            negative_prompt: str,
+            num_frames: int,
+            fps: int,
+            num_inference_steps: int,
+            guidance_scale: float,
+            width: int,
+            height: int,
+            progress=gr.Progress(),
+        ) -> List[Tuple[str, str]]:
+            from ..client import RESTfulClient
+
+            client = RESTfulClient(self.endpoint)
+            client._set_token(self.access_token)
+            model = client.get_model(self.model_uid)
+            assert hasattr(model, "flf_to_video")
+
+            request_id = str(uuid.uuid4())
+            response = None
+            exc = None
+
+            buffer_first = io.BytesIO()
+            buffer_last = io.BytesIO()
+            first_frame.save(buffer_first, format="PNG")
+            last_frame.save(buffer_last, format="PNG")
+
+            def run_in_thread():
+                nonlocal exc, response
+                try:
+                    response = model.flf_to_video(
+                        first_frame=buffer_first.getvalue(),
+                        last_frame=buffer_last.getvalue(),
+                        prompt=prompt,
+                        negative_prompt=negative_prompt,
+                        n=1,
+                        num_frames=num_frames,
+                        fps=fps,
+                        num_inference_steps=num_inference_steps,
+                        guidance_scale=guidance_scale,
+                        width=width,
+                        height=height,
+                        response_format="b64_json",
+                        request_id=request_id,
+                    )
+                except Exception as e:
+                    exc = e
+
+            t = threading.Thread(target=run_in_thread)
+            t.start()
+
+            while t.is_alive():
+                try:
+                    cur_progress = client.get_progress(request_id)["progress"]
+                except Exception:
+                    cur_progress = 0.0
+                progress(cur_progress, desc="Generating video from first/last frames")
+                time.sleep(1)
+
+            if exc:
+                raise exc
+
+            videos = []
+            for video_dict in response["data"]:  # type: ignore
+                video_data = base64.b64decode(video_dict["b64_json"])
+                video_path = f"/tmp/{uuid.uuid4()}.mp4"
+                with open(video_path, "wb") as f:
+                    f.write(video_data)
+                videos.append((video_path, "Generated Video"))
+
+            return videos
+
+        # Gradio UI
+        with gr.Blocks() as flf2video_ui:
+            with gr.Row():
+                first_frame = gr.Image(label="First Frame", type="pil")
+                last_frame = gr.Image(label="Last Frame", type="pil")
+
+            prompt = gr.Textbox(label="Prompt", placeholder="Enter video prompt")
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt", placeholder="Enter negative prompt"
+            )
+
+            with gr.Row():
+                with gr.Column():
+                    width = gr.Number(label="Width", value=512)
+                    num_frames = gr.Number(label="Frames", value=16)
+                    steps = gr.Number(label="Inference Steps", value=25)
+                with gr.Column():
+                    height = gr.Number(label="Height", value=512)
+                    fps = gr.Number(label="FPS", value=8)
+                    guidance_scale = gr.Slider(
+                        label="Guidance Scale", minimum=1, maximum=20, value=7.5
+                    )
+
+            generate = gr.Button("Generate")
+            gallery = gr.Gallery(label="Generated Videos", columns=2)
+
+            generate.click(
+                fn=generate_video_from_flf,
+                inputs=[
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    negative_prompt,
+                    num_frames,
+                    fps,
+                    steps,
+                    guidance_scale,
+                    width,
+                    height,
+                ],
+                outputs=gallery,
+            )
+
+        return flf2video_ui
+
     def audio2text_interface(self) -> "gr.Blocks":
         def transcribe_audio(
             audio_path: str,
@@ -750,6 +870,9 @@ class MediaInterface:
         if "image2video" in self.model_ability:
             with gr.Tab("Image to Video"):
                 self.image2video_interface()
+        if "firstlastframe2video" in self.model_ability:
+            with gr.Tab("FirstLastFrame to Video"):
+                self.flf2video_interface()
         if "audio2text" in self.model_ability:
             with gr.Tab("Audio to Text"):
                 self.audio2text_interface()
xinference/core/model.py CHANGED
@@ -1289,6 +1289,37 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                 f"Model {self._model.model_spec} is not for creating video from image."
             )
 
+    @request_limit
+    @log_async(logger=logger)
+    async def flf_to_video(
+        self,
+        first_frame: "PIL.Image.Image",
+        last_frame: "PIL.Image.Image",
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        *args,
+        **kwargs,
+    ):
+        kwargs["negative_prompt"] = negative_prompt
+        progressor = kwargs["progressor"] = await self._get_progressor(
+            kwargs.pop("request_id", None)
+        )
+        with progressor:
+            if hasattr(self._model, "firstlastframe_to_video"):
+                return await self._call_wrapper_json(
+                    self._model.firstlastframe_to_video,
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    n,
+                    *args,
+                    **kwargs,
+                )
+            raise AttributeError(
+                f"Model {self._model.model_spec} is not for creating video from first-last-frame."
+            )
+
     async def record_metrics(self, name, op, kwargs):
         worker_ref = await self._get_worker_ref()
         await worker_ref.record_metrics(name, op, kwargs)
xinference/core/supervisor.py CHANGED
@@ -45,6 +45,7 @@ from ..constants import (
 )
 from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
+from ..model.utils import get_engine_params_by_name
 from ..types import PeftModelConfig
 from .metrics import record_metrics
 from .resource import GPUStatus, ResourceStatus
@@ -780,29 +781,19 @@ class SupervisorActor(xo.StatelessActor):
             raise ValueError(f"Unsupported model type: {model_type}")
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
         # search in worker first
         workers = list(self._worker_address_to_worker.values())
         for worker in workers:
-            res = await worker.query_engines_by_model_name(model_name)
+            res = await worker.query_engines_by_model_name(
+                model_name, model_type=model_type
+            )
             if res is not None:
                 return res
 
-        if model_name not in LLM_ENGINES:
-            raise ValueError(f"Model {model_name} not found")
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+        return get_engine_params_by_name(model_type, model_name)
 
     @log_async(logger=logger)
     async def register_model(
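get_engine_params_by_name itself is not shown in this diff; it lives in xinference/model/utils.py (+38 lines per the file list). A plausible sketch of its shape, inferred from the LLM-only code it replaces here and in worker.py below; the EMBEDDING_ENGINES registry name and the whole embedding branch are assumptions, not confirmed by this diff:

    from copy import deepcopy
    from typing import Any, Dict, List, Optional


    def get_engine_params_by_name(
        model_type: Optional[str], model_name: str
    ) -> Optional[Dict[str, List[Dict[str, Any]]]]:
        # "LLM" (the default) preserves the old behavior: look up LLM_ENGINES
        # and strip the non-serializable class objects before returning.
        if model_type is None or model_type == "LLM":
            from .llm.llm_family import LLM_ENGINES

            if model_name not in LLM_ENGINES:
                return None
            engine_params = deepcopy(LLM_ENGINES[model_name])
            for params in engine_params.values():
                for param in params:
                    param.pop("llm_class", None)
            return engine_params
        if model_type == "embedding":
            # Hypothetical: the new embed_family module would expose a similar registry.
            from .embedding.embed_family import EMBEDDING_ENGINES

            if model_name not in EMBEDDING_ENGINES:
                return None
            engine_params = deepcopy(EMBEDDING_ENGINES[model_name])
            for params in engine_params.values():
                for param in params:
                    param.pop("embedding_class", None)
            return engine_params
        return None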
xinference/core/worker.py CHANGED
@@ -53,7 +53,7 @@ from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, VirtualEnvSettings, create_model_instance
-from ..model.utils import CancellableDownloader
+from ..model.utils import CancellableDownloader, get_engine_params_by_name
 from ..types import PeftModelConfig
 from ..utils import get_pip_config_args, get_real_path
 from .cache_tracker import CacheTrackerActor
@@ -747,22 +747,10 @@ class WorkerActor(xo.StatelessActor):
         return None
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
-        if model_name not in LLM_ENGINES:
-            return None
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
+        return get_engine_params_by_name(model_type, model_name)
 
     async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
         from ..model.llm.core import LLM
xinference/deploy/cmdline.py CHANGED
@@ -1315,8 +1315,12 @@ def model_chat(
             if "content" not in delta:
                 continue
             else:
-                response_content += delta["content"]
-                print(delta["content"], end="", flush=True, file=sys.stdout)
+                # The first chunk of stream output may have no content (None). Related PRs:
+                # https://github.com/ggml-org/llama.cpp/pull/13634
+                # https://github.com/ggml-org/llama.cpp/pull/12379
+                content = delta["content"] or ""
+                response_content += content
+                print(content, end="", flush=True, file=sys.stdout)
     print("", file=sys.stdout)
     messages.append(dict(role="assistant", content=response_content))
 
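The guard matters because the first delta of an OpenAI-style stream may carry only the role with "content": None, and += would then raise a TypeError; a toy illustration:

    response_content = ""
    delta = {"role": "assistant", "content": None}  # typical first stream chunk
    content = delta["content"] or ""  # None becomes "" instead of blowing up
    response_content += content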
xinference/model/audio/chattts.py CHANGED
@@ -71,9 +71,10 @@ class ChatTTSModel:
         import ChatTTS
         import numpy as np
         import torch
-        import torchaudio
         import xxhash
 
+        from .utils import audio_stream_generator, audio_to_bytes
+
         rnd_spk_emb = None
 
         if len(voice) > 400:
@@ -105,44 +106,28 @@ class ChatTTSModel:
             )
 
         assert self._model is not None
+
+        output = self._model.infer(
+            [input], params_infer_code=params_infer_code, stream=stream
+        )
         if stream:
-            iter = self._model.infer(
-                [input], params_infer_code=params_infer_code, stream=True
-            )
 
-            def _generator():
-                with BytesIO() as out:
-                    writer = torchaudio.io.StreamWriter(out, format=response_format)
-                    writer.add_audio_stream(sample_rate=24000, num_channels=1)
-                    i = 0
-                    last_pos = 0
-                    with writer.open():
-                        for it in iter:
-                            for chunk in it:
-                                chunk = np.array([chunk]).transpose()
-                                writer.write_audio_chunk(i, torch.from_numpy(chunk))
-                                new_last_pos = out.tell()
-                                if new_last_pos != last_pos:
-                                    out.seek(last_pos)
-                                    encoded_bytes = out.read()
-                                    yield encoded_bytes
-                                    last_pos = new_last_pos
-
-            return _generator()
+            def _gen_chunk():
+                for it in output:
+                    for chunk in it:
+                        yield chunk
+
+            return audio_stream_generator(
+                response_format=response_format,
+                sample_rate=24000,
+                output_generator=_gen_chunk(),
+                output_chunk_transformer=lambda c: torch.from_numpy(
+                    np.array([c]).transpose()
+                ),
+            )
         else:
-            wavs = self._model.infer([input], params_infer_code=params_infer_code)
-
-            # Save the generated audio
-            with BytesIO() as out:
-                try:
-                    torchaudio.save(
-                        out,
-                        torch.from_numpy(wavs[0]).unsqueeze(0),
-                        24000,
-                        format=response_format,
-                    )
-                except:
-                    torchaudio.save(
-                        out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                    )
-            return out.getvalue()
+            return audio_to_bytes(
+                response_format=response_format,
+                sample_rate=24000,
+                tensor=torch.from_numpy(output[0]).unsqueeze(0),
+            )
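chattts.py above and cosyvoice.py below now share the streaming and encoding logic through xinference/model/audio/utils.py (+75 lines per the file list). The helpers' bodies are not part of this diff; a plausible sketch, reconstructed from the torchaudio code they replace and the keyword arguments used at the call sites:

    from io import BytesIO
    from typing import Any, Callable, Generator, Iterable


    def audio_stream_generator(
        response_format: str,
        sample_rate: int,
        output_generator: Iterable[Any],
        output_chunk_transformer: Callable[[Any], Any],
    ) -> Generator[bytes, None, None]:
        # Incrementally encode chunks and yield whatever new bytes the writer produced.
        import torchaudio

        with BytesIO() as out:
            writer = torchaudio.io.StreamWriter(out, format=response_format)
            writer.add_audio_stream(sample_rate=sample_rate, num_channels=1)
            last_pos = 0
            with writer.open():
                for chunk in output_generator:
                    writer.write_audio_chunk(0, output_chunk_transformer(chunk))
                    new_last_pos = out.tell()
                    if new_last_pos != last_pos:
                        out.seek(last_pos)
                        yield out.read()
                        last_pos = new_last_pos


    def audio_to_bytes(response_format: str, sample_rate: int, tensor: Any) -> bytes:
        # One-shot encode of a complete (channels, samples) tensor.
        import torchaudio

        with BytesIO() as out:
            torchaudio.save(out, tensor, sample_rate, format=response_format)
            return out.getvalue()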
xinference/model/audio/cosyvoice.py CHANGED
@@ -13,7 +13,6 @@
 # limitations under the License.
 import io
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, Optional
 
 from ..utils import set_all_random_seed
@@ -132,36 +131,25 @@ class CosyVoiceModel:
         output = self._model.inference_sft(input, voice, stream=stream)
 
         import torch
-        import torchaudio
 
-        def _generator_stream():
-            with BytesIO() as out:
-                writer = torchaudio.io.StreamWriter(out, format=response_format)
-                writer.add_audio_stream(
-                    sample_rate=self._model.sample_rate, num_channels=1
-                )
-                i = 0
-                last_pos = 0
-                with writer.open():
-                    for chunk in output:
-                        chunk = chunk["tts_speech"]
-                        trans_chunk = torch.transpose(chunk, 0, 1)
-                        writer.write_audio_chunk(i, trans_chunk)
-                        new_last_pos = out.tell()
-                        if new_last_pos != last_pos:
-                            out.seek(last_pos)
-                            encoded_bytes = out.read()
-                            yield encoded_bytes
-                            last_pos = new_last_pos
-
-        def _generator_block():
-            chunks = [o["tts_speech"] for o in output]
-            t = torch.cat(chunks, dim=1)
-            with BytesIO() as out:
-                torchaudio.save(out, t, self._model.sample_rate, format=response_format)
-                return out.getvalue()
-
-        return _generator_stream() if stream else _generator_block()
+        from .utils import audio_stream_generator, audio_to_bytes
+
+        return (
+            audio_stream_generator(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                output_generator=output,
+                output_chunk_transformer=lambda c: torch.transpose(
+                    c["tts_speech"], 0, 1
+                ),
+            )
+            if stream
+            else audio_to_bytes(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                tensor=torch.cat([o["tts_speech"] for o in output], dim=1),
+            )
+        )
 
     def speech(
         self,