xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +65 -3
  4. xinference/conftest.py +0 -7
  5. xinference/core/media_interface.py +132 -8
  6. xinference/core/model.py +44 -6
  7. xinference/core/scheduler.py +1 -10
  8. xinference/core/supervisor.py +8 -17
  9. xinference/core/worker.py +5 -27
  10. xinference/deploy/cmdline.py +6 -2
  11. xinference/model/audio/chattts.py +24 -39
  12. xinference/model/audio/cosyvoice.py +18 -30
  13. xinference/model/audio/funasr.py +42 -0
  14. xinference/model/audio/model_spec.json +71 -1
  15. xinference/model/audio/model_spec_modelscope.json +76 -2
  16. xinference/model/audio/utils.py +75 -0
  17. xinference/model/core.py +1 -0
  18. xinference/model/embedding/__init__.py +74 -18
  19. xinference/model/embedding/core.py +98 -589
  20. xinference/model/embedding/embed_family.py +133 -0
  21. xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
  22. xinference/model/embedding/flag/core.py +282 -0
  23. xinference/model/embedding/model_spec.json +24 -0
  24. xinference/model/embedding/model_spec_modelscope.json +24 -0
  25. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  26. xinference/model/embedding/sentence_transformers/core.py +399 -0
  27. xinference/model/embedding/vllm/core.py +95 -0
  28. xinference/model/image/model_spec.json +30 -3
  29. xinference/model/image/model_spec_modelscope.json +41 -2
  30. xinference/model/image/stable_diffusion/core.py +144 -53
  31. xinference/model/llm/__init__.py +6 -54
  32. xinference/model/llm/core.py +19 -5
  33. xinference/model/llm/llama_cpp/core.py +59 -3
  34. xinference/model/llm/llama_cpp/memory.py +457 -0
  35. xinference/model/llm/llm_family.json +247 -402
  36. xinference/model/llm/llm_family.py +88 -16
  37. xinference/model/llm/llm_family_modelscope.json +260 -421
  38. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  39. xinference/model/llm/sglang/core.py +8 -0
  40. xinference/model/llm/transformers/__init__.py +27 -6
  41. xinference/model/llm/transformers/chatglm.py +4 -2
  42. xinference/model/llm/transformers/core.py +49 -28
  43. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  44. xinference/model/llm/transformers/gemma3.py +119 -164
  45. xinference/model/llm/transformers/multimodal/__init__.py +13 -0
  46. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  47. xinference/model/llm/transformers/multimodal/core.py +205 -0
  48. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  49. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  50. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  51. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  52. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  53. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  54. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  55. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  56. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  57. xinference/model/llm/transformers/opt.py +4 -2
  58. xinference/model/llm/transformers/utils.py +6 -37
  59. xinference/model/llm/utils.py +11 -0
  60. xinference/model/llm/vllm/core.py +7 -0
  61. xinference/model/rerank/core.py +91 -3
  62. xinference/model/rerank/model_spec.json +24 -0
  63. xinference/model/rerank/model_spec_modelscope.json +24 -0
  64. xinference/model/rerank/utils.py +20 -2
  65. xinference/model/utils.py +38 -1
  66. xinference/model/video/diffusers.py +65 -3
  67. xinference/model/video/model_spec.json +31 -4
  68. xinference/model/video/model_spec_modelscope.json +32 -4
  69. xinference/web/ui/build/asset-manifest.json +6 -6
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  72. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  82. xinference/web/ui/src/locales/en.json +21 -8
  83. xinference/web/ui/src/locales/ja.json +224 -0
  84. xinference/web/ui/src/locales/ko.json +224 -0
  85. xinference/web/ui/src/locales/zh.json +21 -8
  86. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
  87. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
  88. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
  89. xinference/model/llm/transformers/cogvlm2.py +0 -442
  90. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  91. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  92. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  93. xinference/model/llm/transformers/intern_vl.py +0 -526
  94. xinference/model/llm/transformers/internlm2.py +0 -94
  95. xinference/model/llm/transformers/minicpmv25.py +0 -193
  96. xinference/model/llm/transformers/omnilmm.py +0 -132
  97. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  98. xinference/model/llm/transformers/qwen_vl.py +0 -360
  99. xinference/thirdparty/omnilmm/LICENSE +0 -201
  100. xinference/thirdparty/omnilmm/chat.py +0 -218
  101. xinference/thirdparty/omnilmm/constants.py +0 -4
  102. xinference/thirdparty/omnilmm/conversation.py +0 -332
  103. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  104. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  105. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  106. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  107. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  108. xinference/thirdparty/omnilmm/utils.py +0 -134
  109. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  110. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  111. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  112. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  120. /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
  121. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  122. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  123. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  124. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-05-17T15:09:06+0800",
+ "date": "2025-06-13T18:51:07+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "1adc5d3e5cffb2752cd3e05ca782c4cfe3c0ce57",
- "version": "1.6.0.post1"
+ "full-revisionid": "a362dba7334ef08c758bbc4a3d4904fe53cefe78",
+ "version": "1.7.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -387,6 +387,7 @@ class RESTfulAPI(CancelMixin):
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        # just for compatibility, LLM only
         self._router.add_api_route(
             "/v1/engines/{model_name}",
             self.query_engines_by_model_name,
@@ -397,6 +398,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        # engines for all model types
+        self._router.add_api_route(
+            "/v1/engines/{model_type}/{model_name}",
+            self.query_engines_by_model_name,
+            methods=["GET"],
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:list"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         # running instances
         self._router.add_api_route(
             "/v1/models/instances",
@@ -708,6 +720,17 @@ class RESTfulAPI(CancelMixin):
                 else None
             ),
         )
+        self._router.add_api_route(
+            "/v1/video/generations/flf",
+            self.create_videos_from_first_last_frame,
+            methods=["POST"],
+            response_model=VideoList,
+            dependencies=(
+                [Security(self._auth_service, scopes=["models:read"])]
+                if self.is_authenticated()
+                else None
+            ),
+        )
         self._router.add_api_route(
             "/v1/chat/completions",
             self.create_chat_completion,
@@ -2084,6 +2107,57 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
+    async def create_videos_from_first_last_frame(
+        self,
+        model: str = Form(...),
+        first_frame: UploadFile = File(media_type="application/octet-stream"),
+        last_frame: UploadFile = File(media_type="application/octet-stream"),
+        prompt: Optional[Union[str, List[str]]] = Form(None),
+        negative_prompt: Optional[Union[str, List[str]]] = Form(None),
+        n: Optional[int] = Form(1),
+        kwargs: Optional[str] = Form(None),
+    ) -> Response:
+        model_uid = model
+        try:
+            model_ref = await (await self._get_supervisor_ref()).get_model(model_uid)
+        except ValueError as ve:
+            logger.error(str(ve), exc_info=True)
+            await self._report_error_event(model_uid, str(ve))
+            raise HTTPException(status_code=400, detail=str(ve))
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            raise HTTPException(status_code=500, detail=str(e))
+
+        request_id = None
+        try:
+            if kwargs is not None:
+                parsed_kwargs = json.loads(kwargs)
+            else:
+                parsed_kwargs = {}
+            request_id = parsed_kwargs.get("request_id")
+            self._add_running_task(request_id)
+            video_list = await model_ref.flf_to_video(
+                first_frame=Image.open(first_frame.file),
+                last_frame=Image.open(last_frame.file),
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                n=n,
+                **parsed_kwargs,
+            )
+            return Response(content=video_list, media_type="application/json")
+        except asyncio.CancelledError:
+            err_str = f"The request has been cancelled: {request_id}"
+            logger.error(err_str)
+            await self._report_error_event(model_uid, err_str)
+            raise HTTPException(status_code=409, detail=err_str)
+        except Exception as e:
+            e = await self._get_model_last_error(model_ref.uid, e)
+            logger.error(e, exc_info=True)
+            await self._report_error_event(model_uid, str(e))
+            self.handle_request_limit_error(e)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def create_chat_completion(self, request: Request) -> Response:
         raw_body = await request.json()
         body = CreateChatCompletion.parse_obj(raw_body)
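
Note: a minimal sketch of exercising the new /v1/video/generations/flf route directly over HTTP. The server address and model uid are illustrative assumptions; the multipart field layout mirrors what the new RESTful client method (shown later in this diff) sends.

    import json
    import requests

    # Hypothetical endpoint and model uid; adjust for your deployment.
    url = "http://127.0.0.1:9997/v1/video/generations/flf"
    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        files = [
            ("model", (None, "my-flf2v-model")),
            ("prompt", (None, "sunrise over the sea")),
            ("n", (None, "1")),
            ("kwargs", (None, json.dumps({"response_format": "b64_json"}))),
            ("first_frame", ("image", f1, "application/octet-stream")),
            ("last_frame", ("image", f2, "application/octet-stream")),
        ]
        resp = requests.post(url, files=files)
    resp.raise_for_status()
    # Response follows the VideoList shape: {"created": ..., "data": [...]}
    print(resp.json()["data"][0].keys())
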
@@ -2234,11 +2308,14 @@ class RESTfulAPI(CancelMixin):
             self.handle_request_limit_error(e)
             raise HTTPException(status_code=500, detail=str(e))
 
-    async def query_engines_by_model_name(self, model_name: str) -> JSONResponse:
+    async def query_engines_by_model_name(
+        self, request: Request, model_name: str, model_type: Optional[str] = None
+    ) -> JSONResponse:
         try:
+            model_type = model_type or request.path_params.get("model_type", "LLM")
             content = await (
                 await self._get_supervisor_ref()
-            ).query_engines_by_model_name(model_name)
+            ).query_engines_by_model_name(model_name, model_type=model_type)
             return JSONResponse(content=content)
         except ValueError as re:
             logger.error(re, exc_info=True)
xinference/client/restful/restful_client.py CHANGED
@@ -510,6 +510,59 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
         response_data = response.json()
         return response_data
 
+    def flf_to_video(
+        self,
+        first_frame: Union[str, bytes],
+        last_frame: Union[str, bytes],
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        **kwargs,
+    ) -> "VideoList":
+        """
+        Creates a video by the first frame, last frame and text.
+
+        Parameters
+        ----------
+        first_frame: `Union[str, bytes]`
+            The first frame to condition the generation on.
+        last_frame: `Union[str, bytes]`
+            The last frame to condition the generation on.
+        prompt: `str` or `List[str]`
+            The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`.
+        negative_prompt (`str` or `List[str]`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        n: `int`, defaults to 1
+            The number of videos to generate per prompt. Must be between 1 and 10.
+        Returns
+        -------
+        VideoList
+            A list of video objects.
+        """
+        url = f"{self._base_url}/v1/video/generations/flf"
+        params = {
+            "model": self._model_uid,
+            "prompt": prompt,
+            "negative_prompt": negative_prompt,
+            "n": n,
+            "kwargs": json.dumps(kwargs),
+        }
+        files: List[Any] = []
+        for key, value in params.items():
+            files.append((key, (None, value)))
+        files.append(
+            ("first_frame", ("image", first_frame, "application/octet-stream"))
+        )
+        files.append(("last_frame", ("image", last_frame, "application/octet-stream")))
+        response = requests.post(url, files=files, headers=self.auth_headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to create the video from image, detail: {_get_error_string(response)}"
+            )
+
+        response_data = response.json()
+        return response_data
+
 
 class RESTfulGenerateModelHandle(RESTfulModelHandle):
     def generate(
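
Note: a short sketch of driving the new handle method from the Python client. The endpoint and model uid are illustrative assumptions; response_format is forwarded through **kwargs, matching how the Gradio interface added later in this diff calls it.

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model = client.get_model("my-flf2v-model")       # hypothetical model uid

    with open("first.png", "rb") as f1, open("last.png", "rb") as f2:
        result = model.flf_to_video(
            first_frame=f1.read(),
            last_frame=f2.read(),
            prompt="a timelapse of clouds rolling over a mountain",
            n=1,
            response_format="b64_json",  # forwarded via **kwargs
        )
    print(len(result["data"]))
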
@@ -637,6 +690,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         response_format: Optional[str] = "json",
         temperature: Optional[float] = 0,
         timestamp_granularities: Optional[List[str]] = None,
+        **kwargs,
     ):
         """
         Transcribes audio into the input language.
@@ -678,6 +732,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "temperature": temperature,
             "timestamp_granularities[]": timestamp_granularities,
+            "kwargs": json.dumps(kwargs),
         }
         files: List[Any] = []
         files.append(("file", ("file", audio, "application/octet-stream")))
@@ -1017,7 +1072,7 @@
         model_path: Optional[str]
             Model path, if gguf format, should be the file path, otherwise, should be directory of the model.
         **kwargs:
-            Any other parameters been specified.
+            Any other parameters been specified. e.g. multimodal_projector for multimodal inference with the llama.cpp backend.
 
         Returns
         -------
@@ -1502,7 +1557,9 @@
         response_data = response.json()
         return response_data
 
-    def query_engine_by_model_name(self, model_name: str):
+    def query_engine_by_model_name(
+        self, model_name: str, model_type: Optional[str] = "LLM"
+    ):
         """
         Get the engine parameters with the model name registered on the server.
 
@@ -1510,12 +1567,17 @@
         ----------
         model_name: str
             The name of the model.
+        model_type: str
+            Model type, LLM by default.
         Returns
         -------
         Dict[str, List[Dict[str, Any]]]
             The supported engine parameters of registered models on the server.
         """
-        url = f"{self.base_url}/v1/engines/{model_name}"
+        if not model_type:
+            url = f"{self.base_url}/v1/engines/{model_name}"
+        else:
+            url = f"{self.base_url}/v1/engines/{model_type}/{model_name}"
         response = requests.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
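
Note: a short sketch of the extended engine query from the Python client. The model names are illustrative only, and the non-LLM call assumes an embedding model with that name is registered on the server.

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # hypothetical endpoint

    # LLM engines (default model_type), equivalent to the old behaviour.
    llm_engines = client.query_engine_by_model_name("qwen2.5-instruct")

    # Engines for a non-LLM model type, served by the new
    # /v1/engines/{model_type}/{model_name} route.
    emb_engines = client.query_engine_by_model_name("bge-m3", model_type="embedding")
    print(llm_engines.keys(), emb_engines.keys())
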
xinference/conftest.py CHANGED
@@ -304,10 +304,3 @@ def setup_with_auth():
             os.remove(auth_file)
         except:
             pass
-
-
-@pytest.fixture
-def set_use_xllamacpp():
-    os.environ["USE_XLLAMACPP"] = "1"
-    yield
-    del os.environ["USE_XLLAMACPP"]
xinference/core/media_interface.py CHANGED
@@ -19,7 +19,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gradio as gr
 import PIL.Image
@@ -463,7 +463,7 @@ class MediaInterface:
 
     def image2video_interface(self) -> "gr.Blocks":
         def image_generate_video(
-            image: "PIL.Image",
+            image: "PIL.Image.Image",
             prompt: str,
             negative_prompt: str,
             num_frames: int,
@@ -577,6 +577,126 @@ class MediaInterface:
 
         return image2video_ui
 
+    def flf2video_interface(self) -> "gr.Blocks":
+        def generate_video_from_flf(
+            first_frame: "PIL.Image.Image",
+            last_frame: "PIL.Image.Image",
+            prompt: str,
+            negative_prompt: str,
+            num_frames: int,
+            fps: int,
+            num_inference_steps: int,
+            guidance_scale: float,
+            width: int,
+            height: int,
+            progress=gr.Progress(),
+        ) -> List[Tuple[str, str]]:
+            from ..client import RESTfulClient
+
+            client = RESTfulClient(self.endpoint)
+            client._set_token(self.access_token)
+            model = client.get_model(self.model_uid)
+            assert hasattr(model, "flf_to_video")
+
+            request_id = str(uuid.uuid4())
+            response = None
+            exc = None
+
+            buffer_first = io.BytesIO()
+            buffer_last = io.BytesIO()
+            first_frame.save(buffer_first, format="PNG")
+            last_frame.save(buffer_last, format="PNG")
+
+            def run_in_thread():
+                nonlocal exc, response
+                try:
+                    response = model.flf_to_video(
+                        first_frame=buffer_first.getvalue(),
+                        last_frame=buffer_last.getvalue(),
+                        prompt=prompt,
+                        negative_prompt=negative_prompt,
+                        n=1,
+                        num_frames=num_frames,
+                        fps=fps,
+                        num_inference_steps=num_inference_steps,
+                        guidance_scale=guidance_scale,
+                        width=width,
+                        height=height,
+                        response_format="b64_json",
+                        request_id=request_id,
+                    )
+                except Exception as e:
+                    exc = e
+
+            t = threading.Thread(target=run_in_thread)
+            t.start()
+
+            while t.is_alive():
+                try:
+                    cur_progress = client.get_progress(request_id)["progress"]
+                except Exception:
+                    cur_progress = 0.0
+                progress(cur_progress, desc="Generating video from first/last frames")
+                time.sleep(1)
+
+            if exc:
+                raise exc
+
+            videos = []
+            for video_dict in response["data"]:  # type: ignore
+                video_data = base64.b64decode(video_dict["b64_json"])
+                video_path = f"/tmp/{uuid.uuid4()}.mp4"
+                with open(video_path, "wb") as f:
+                    f.write(video_data)
+                videos.append((video_path, "Generated Video"))
+
+            return videos
+
+        # Gradio UI
+        with gr.Blocks() as flf2video_ui:
+            with gr.Row():
+                first_frame = gr.Image(label="First Frame", type="pil")
+                last_frame = gr.Image(label="Last Frame", type="pil")
+
+            prompt = gr.Textbox(label="Prompt", placeholder="Enter video prompt")
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt", placeholder="Enter negative prompt"
+            )
+
+            with gr.Row():
+                with gr.Column():
+                    width = gr.Number(label="Width", value=512)
+                    num_frames = gr.Number(label="Frames", value=16)
+                    steps = gr.Number(label="Inference Steps", value=25)
+                with gr.Column():
+                    height = gr.Number(label="Height", value=512)
+                    fps = gr.Number(label="FPS", value=8)
+                    guidance_scale = gr.Slider(
+                        label="Guidance Scale", minimum=1, maximum=20, value=7.5
+                    )
+
+            generate = gr.Button("Generate")
+            gallery = gr.Gallery(label="Generated Videos", columns=2)
+
+            generate.click(
+                fn=generate_video_from_flf,
+                inputs=[
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    negative_prompt,
+                    num_frames,
+                    fps,
+                    steps,
+                    guidance_scale,
+                    width,
+                    height,
+                ],
+                outputs=gallery,
+            )
+
+        return flf2video_ui
+
     def audio2text_interface(self) -> "gr.Blocks":
         def transcribe_audio(
             audio_path: str,
@@ -653,13 +773,14 @@ class MediaInterface:
             with open(prompt_speech_file, "rb") as f:
                 prompt_speech_bytes = f.read()
 
+            kw: Dict[str, Any] = {}
+            if prompt_speech_bytes:
+                kw["prompt_speech"] = prompt_speech_bytes
+            if prompt_text:
+                kw["prompt_text"] = prompt_text
+
             response = model.speech(
-                input=input_text,
-                voice=voice,
-                speed=speed,
-                response_format="mp3",
-                prompt_speech=prompt_speech_bytes,
-                prompt_text=prompt_text,
+                input=input_text, voice=voice, speed=speed, response_format="mp3", **kw
             )
 
             # Write to a temp .mp3 file and return its path
@@ -749,6 +870,9 @@ class MediaInterface:
         if "image2video" in self.model_ability:
             with gr.Tab("Image to Video"):
                 self.image2video_interface()
+        if "firstlastframe2video" in self.model_ability:
+            with gr.Tab("FirstLastFrame to Video"):
+                self.flf2video_interface()
         if "audio2text" in self.model_ability:
            with gr.Tab("Audio to Text"):
                self.audio2text_interface()
xinference/core/model.py CHANGED
@@ -71,12 +71,8 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
-XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
-    "qwen-vl-chat",
-    "cogvlm2",
-    "glm-4v",
-    "MiniCPM-V-2.6",
-]
+# !!!!! DO NOT add model_name to this list, using `register_batching_multimodal_models` below instead.
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = []
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
 XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
@@ -84,6 +80,16 @@ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
 )
 
 
+def register_batching_multimodal_models(*model_names: str):
+    def decorator(cls):
+        for name in model_names:
+            if name not in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS:
+                XINFERENCE_BATCHING_ALLOWED_VISION_MODELS.append(name)
+        return cls
+
+    return decorator
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
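
Note: a brief sketch of how the new decorator is meant to be applied when declaring a batching-capable multimodal model class; the class and model name below are illustrative, not taken from this diff.

    from xinference.core.model import register_batching_multimodal_models

    @register_batching_multimodal_models("my-vision-model")  # hypothetical model name
    class MyVisionChatModel:
        # Model classes register themselves instead of editing the
        # XINFERENCE_BATCHING_ALLOWED_VISION_MODELS list by hand.
        ...
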
@@ -977,6 +983,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                 response_format,
                 temperature,
                 timestamp_granularities,
+                **kwargs,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."
@@ -1282,6 +1289,37 @@ class ModelActor(xo.StatelessActor, CancelMixin):
             f"Model {self._model.model_spec} is not for creating video from image."
         )
 
+    @request_limit
+    @log_async(logger=logger)
+    async def flf_to_video(
+        self,
+        first_frame: "PIL.Image.Image",
+        last_frame: "PIL.Image.Image",
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        n: int = 1,
+        *args,
+        **kwargs,
+    ):
+        kwargs["negative_prompt"] = negative_prompt
+        progressor = kwargs["progressor"] = await self._get_progressor(
+            kwargs.pop("request_id", None)
+        )
+        with progressor:
+            if hasattr(self._model, "firstlastframe_to_video"):
+                return await self._call_wrapper_json(
+                    self._model.firstlastframe_to_video,
+                    first_frame,
+                    last_frame,
+                    prompt,
+                    n,
+                    *args,
+                    **kwargs,
+                )
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating video from first-last-frame."
+        )
+
     async def record_metrics(self, name, op, kwargs):
         worker_ref = await self._get_worker_ref()
         await worker_ref.record_metrics(name, op, kwargs)
xinference/core/scheduler.py CHANGED
@@ -272,15 +272,6 @@ class InferenceRequest:
         )
 
 
-def _get_valid_batch_kv_cache(cache, skipped_indexes: Set[int]):
-    batch_size = cache.key_cache[0].shape[0]
-    batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
-    for idx in range(len(cache)):
-        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
-        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
-    return cache
-
-
 class SchedulerActor(xo.StatelessActor):
     @classmethod
     def gen_uid(cls, model_uid: str, replica_id: str):
@@ -409,7 +400,7 @@ class SchedulerActor(xo.StatelessActor):
         # Some requests have been completed. Batch size needs to be reduced for kv cache.
         if stopped_batch_indexes and len(self._running_queue) > 0:
             kv_cache = self._running_queue[0].kv_cache
-            reduced_kv_cache = _get_valid_batch_kv_cache(
+            reduced_kv_cache = self._model.build_reduced_kv_cache(
                 kv_cache, stopped_batch_indexes
            )
            for r in self._running_queue:
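
Note: the scheduler now delegates KV-cache shrinking to the model class. The actual build_reduced_kv_cache lives in the transformers backend and is not shown in this excerpt; below is a minimal sketch of what such a method could look like, mirroring the helper deleted above and assuming a transformers DynamicCache-style object.

    from typing import Set

    class _TransformersModelSketch:
        def build_reduced_kv_cache(self, cache, skipped_indexes: Set[int]):
            # Keep only the batch rows of requests that are still running,
            # mirroring the removed _get_valid_batch_kv_cache helper.
            batch_size = cache.key_cache[0].shape[0]
            batch_slices = [i for i in range(batch_size) if i not in skipped_indexes]
            for idx in range(len(cache)):
                cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
                cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
            return cache
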
xinference/core/supervisor.py CHANGED
@@ -45,6 +45,7 @@ from ..constants import (
 )
 from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
+from ..model.utils import get_engine_params_by_name
 from ..types import PeftModelConfig
 from .metrics import record_metrics
 from .resource import GPUStatus, ResourceStatus
@@ -780,29 +781,19 @@
             raise ValueError(f"Unsupported model type: {model_type}")
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
         # search in worker first
         workers = list(self._worker_address_to_worker.values())
         for worker in workers:
-            res = await worker.query_engines_by_model_name(model_name)
+            res = await worker.query_engines_by_model_name(
+                model_name, model_type=model_type
+            )
             if res is not None:
                 return res
 
-        if model_name not in LLM_ENGINES:
-            raise ValueError(f"Model {model_name} not found")
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+        return get_engine_params_by_name(model_type, model_name)
 
     @log_async(logger=logger)
     async def register_model(
xinference/core/worker.py CHANGED
@@ -53,7 +53,7 @@ from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, VirtualEnvSettings, create_model_instance
-from ..model.utils import CancellableDownloader
+from ..model.utils import CancellableDownloader, get_engine_params_by_name
 from ..types import PeftModelConfig
 from ..utils import get_pip_config_args, get_real_path
 from .cache_tracker import CacheTrackerActor
@@ -533,16 +533,6 @@ class WorkerActor(xo.StatelessActor):
                 existing_model_uids.append(rep_uid)
             if idx in self._gpu_to_embedding_model_uids:
                 existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
-            # If user has run the vLLM model on the GPU that was forced to be specified,
-            # it is not possible to force this GPU to be allocated again
-            if idx in self._user_specified_gpu_to_model_uids:
-                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
-                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
-                    if is_vllm_model:
-                        raise RuntimeError(
-                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
-                            f"therefore cannot allocate GPU memory for a new model."
-                        )
 
         if existing_model_uids:
             logger.warning(
@@ -757,22 +747,10 @@ class WorkerActor(xo.StatelessActor):
         return None
 
     @log_async(logger=logger)
-    async def query_engines_by_model_name(self, model_name: str):
-        from copy import deepcopy
-
-        from ..model.llm.llm_family import LLM_ENGINES
-
-        if model_name not in LLM_ENGINES:
-            return None
-
-        # filter llm_class
-        engine_params = deepcopy(LLM_ENGINES[model_name])
-        for engine in engine_params:
-            params = engine_params[engine]
-            for param in params:
-                del param["llm_class"]
-
-        return engine_params
+    async def query_engines_by_model_name(
+        self, model_name: str, model_type: Optional[str] = None
+    ):
+        return get_engine_params_by_name(model_type, model_name)
 
     async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
         from ..model.llm.core import LLM
xinference/deploy/cmdline.py CHANGED
@@ -1315,8 +1315,12 @@ def model_chat(
             if "content" not in delta:
                 continue
             else:
-                response_content += delta["content"]
-                print(delta["content"], end="", flush=True, file=sys.stdout)
+                # The first chunk of stream output may have no content (None). Related PRs:
+                # https://github.com/ggml-org/llama.cpp/pull/13634
+                # https://github.com/ggml-org/llama.cpp/pull/12379
+                content = delta["content"] or ""
+                response_content += content
+                print(content, end="", flush=True, file=sys.stdout)
         print("", file=sys.stdout)
         messages.append(dict(role="assistant", content=response_content))
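
Note: the same defensive pattern applies to any consumer of the streaming chat API, since newer llama.cpp servers may emit a first delta whose content is None. A minimal sketch (endpoint and model uid are illustrative):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model = client.get_model("my-chat-model")        # hypothetical model uid

    for chunk in model.chat(
        messages=[{"role": "user", "content": "hello"}],
        generate_config={"stream": True},
    ):
        delta = chunk["choices"][0]["delta"]
        # Coalesce missing or None content to an empty string before printing.
        print(delta.get("content") or "", end="", flush=True)
    print()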