xinference 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.
Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +1 -1
  3. xinference/core/image_interface.py +9 -0
  4. xinference/core/model.py +4 -1
  5. xinference/core/worker.py +48 -41
  6. xinference/model/audio/chattts.py +24 -9
  7. xinference/model/audio/core.py +8 -2
  8. xinference/model/audio/fish_speech.py +228 -0
  9. xinference/model/audio/model_spec.json +8 -0
  10. xinference/model/embedding/core.py +23 -1
  11. xinference/model/image/model_spec.json +2 -1
  12. xinference/model/image/model_spec_modelscope.json +2 -1
  13. xinference/model/image/stable_diffusion/core.py +49 -1
  14. xinference/model/llm/__init__.py +6 -0
  15. xinference/model/llm/llm_family.json +54 -9
  16. xinference/model/llm/llm_family.py +2 -0
  17. xinference/model/llm/llm_family_modelscope.json +56 -10
  18. xinference/model/llm/lmdeploy/__init__.py +0 -0
  19. xinference/model/llm/lmdeploy/core.py +557 -0
  20. xinference/model/llm/transformers/cogvlm2.py +4 -45
  21. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  22. xinference/model/llm/transformers/core.py +1 -0
  23. xinference/model/llm/transformers/glm4v.py +2 -23
  24. xinference/model/llm/transformers/intern_vl.py +94 -11
  25. xinference/model/llm/transformers/minicpmv25.py +2 -23
  26. xinference/model/llm/transformers/minicpmv26.py +2 -22
  27. xinference/model/llm/transformers/yi_vl.py +2 -24
  28. xinference/model/llm/utils.py +10 -1
  29. xinference/model/llm/vllm/core.py +1 -1
  30. xinference/thirdparty/fish_speech/__init__.py +0 -0
  31. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  32. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  33. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  34. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  35. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  36. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  37. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  38. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  39. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  40. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  41. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  42. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  43. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  44. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  46. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  48. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  49. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  50. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  51. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  52. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  53. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  54. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  55. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  56. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  57. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  58. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  59. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  60. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  61. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  62. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  63. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  64. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  67. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  68. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  69. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  70. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  71. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  72. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  73. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  74. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  75. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  76. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  77. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  78. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  79. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  83. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  84. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  85. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  86. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  87. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  88. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  89. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  90. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  91. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  92. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  93. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  94. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  95. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  96. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  97. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  98. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  99. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  100. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  101. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  102. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  103. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  104. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  105. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  106. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  107. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  108. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  109. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  110. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  111. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  112. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  113. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  114. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  115. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  116. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  117. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  118. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  119. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  120. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  121. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  122. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  123. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  124. xinference/web/ui/build/asset-manifest.json +3 -3
  125. xinference/web/ui/build/index.html +1 -1
  126. xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
  127. xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  129. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/METADATA +18 -6
  130. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/RECORD +135 -37
  131. xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  133. /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  134. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  135. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  136. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  137. {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0

xinference/model/llm/transformers/intern_vl.py
@@ -42,27 +42,38 @@ def _message_content_to_intern(content, image_cnt):
     if not isinstance(content, str):
         texts = []
         image_urls = []
+        video_urls = []
         for c in content:
            c_type = c.get("type")
            if c_type == "text":
                texts.append(c["text"])
            elif c_type == "image_url":
                image_urls.append(c["image_url"]["url"])
+           elif c_type == "video_url":
+               video_urls.append(c["video_url"]["url"])
+        if len(video_urls) > 1:
+            raise RuntimeError("Only one video per message is supported")
         image_futures = []
         with ThreadPoolExecutor() as executor:
            for image_url in image_urls:
                fut = executor.submit(_decode_image, image_url)
                image_futures.append(fut)
         images = [fut.result() for fut in image_futures]
+        videos = []
+        for vid_url in video_urls:
+            videos.append(_load_video(vid_url, num_segments=8, max_num=1))
         prefix = ""
         for i, _ in enumerate(images):
            prefix += f"Image-{image_cnt + i + 1}: <image>\n\n"
+
+        if len(videos) > 0:
+            prefix = "".join(
+                [f"Frame{i+1}: <image>\n" for i in range(len(videos[0][1]))]
+            )
+
         text = prefix + " ".join(texts)
-        if len(images) == 0:
-            return text, []
-        else:
-            return text, images
-    return content, []
+        return text, images, videos
+    return content, [], []
 
 
 def _get_prompt_and_chat_history(
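
Note (not part of the diff): a minimal sketch of the message shape the new video_url branch parses; it mirrors the existing OpenAI-style image_url content parts, and the file path is a placeholder.

# Hypothetical client payload; the new branch reads c["video_url"]["url"]
# and raises if more than one video appears in a single message.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe what happens in this clip."},
        {"type": "video_url", "video_url": {"url": "/path/to/clip.mp4"}},
    ],
}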
@@ -71,18 +82,21 @@ def _get_prompt_and_chat_history(
 ):
     # Convert openai history to intern vl history
     images = []
+    videos = []
     history = []
     image_cnt = 0
     for h1, h2 in zip(*[iter(chat_history or [])] * 2):
-        content1, img = _message_content_to_intern(h1["content"], image_cnt)
-        content2, _ = _message_content_to_intern(h2["content"], image_cnt)
+        content1, img, vid = _message_content_to_intern(h1["content"], image_cnt)
+        content2, _, _ = _message_content_to_intern(h2["content"], image_cnt)
         history.append([content1, content2])
         images.extend(img)
         image_cnt += len(img)
+        videos.extend(vid)
 
-    question, img = _message_content_to_intern(prompt, image_cnt)
+    question, img, vid = _message_content_to_intern(prompt, image_cnt)
     images.extend(img)
-    return question, history, images
+    videos.extend(vid)
+    return question, history, images, videos
 
 
 def _build_transform(input_size=448):
@@ -174,6 +188,53 @@ def _load_image(image_file, input_size=448, max_num=12):
     return pixel_values
 
 
+# video multi-round conversation
+def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    import numpy as np
+
+    if bound:
+        start, end = bound[0], bound[1]
+    else:
+        start, end = -100000, 100000
+    start_idx = max(first_idx, round(start * fps))
+    end_idx = min(round(end * fps), max_frame)
+    seg_size = float(end_idx - start_idx) / num_segments
+    frame_indices = np.array(
+        [
+            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+            for idx in range(num_segments)
+        ]
+    )
+    return frame_indices
+
+
+def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+    from decord import VideoReader, cpu
+    from PIL import Image
+
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    fps = float(vr.get_avg_fps())
+
+    pixel_values_list, num_patches_list = [], []
+    transform = _build_transform(input_size=input_size)
+    frame_indices = _get_index(
+        bound, fps, max_frame, first_idx=0, num_segments=num_segments
+    )
+    for frame_index in frame_indices:
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+        img = _dynamic_preprocess(
+            img, image_size=input_size, use_thumbnail=True, max_num=max_num
+        )
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(torch.bfloat16).cuda()
+        num_patches_list.append(pixel_values.shape[0])
+        pixel_values_list.append(pixel_values)
+    pixel_values = torch.cat(pixel_values_list)
+    return pixel_values, num_patches_list
+
+
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
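
Note (not part of the diff): a rough sketch of calling the new helper directly, assuming decord is installed and a CUDA device is available (frames are moved with .cuda() inside _load_video); the file name is a placeholder.

# Returns stacked frame tiles plus per-frame patch counts, which the chat
# path later uses to build the "Frame{i}: <image>" prefix.
pixel_values, num_patches_list = _load_video("clip.mp4", num_segments=8, max_num=1)
print(pixel_values.shape, num_patches_list)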
@@ -305,7 +366,9 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )
 
-        content, history, images = _get_prompt_and_chat_history(prompt, chat_history)
+        content, history, images, videos = _get_prompt_and_chat_history(
+            prompt, chat_history
+        )
 
         num_patches_list = []
         if len(images) == 1:
@@ -327,6 +390,10 @@ class InternVLChatModel(PytorchChatModel):
         else:
             pixel_values = None
 
+        if len(videos) > 0:
+            pixel_values = videos[0][0]
+            num_patches_list = videos[0][1]
+
         assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
 
         img_context_token_id = self._tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
@@ -440,7 +507,23 @@ class InternVLChatModel(PytorchChatModel):
             )
             chunk["usage"] = completion_usage
             yield chunk
-
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
         if include_usage:
             chunk = CompletionChunk(
                 id=completion_id,

xinference/model/llm/transformers/minicpmv25.py
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import json
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
-from PIL import Image
 
 from ....types import (
     ChatCompletion,
@@ -35,6 +31,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -102,24 +99,6 @@ class MiniCPMV25Model(PytorchChatModel):
         self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-                return Image.open(BytesIO(data)).convert("RGB")
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
-
         if not isinstance(content, str):
             texts = []
             image_urls = []
@@ -132,7 +111,7 @@ class MiniCPMV25Model(PytorchChatModel):
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
             text = " ".join(texts)

xinference/model/llm/transformers/minicpmv26.py
@@ -11,15 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
 from PIL import Image
 
@@ -34,6 +31,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -105,24 +103,6 @@ class MiniCPMV26Model(PytorchChatModel):
         self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-                return Image.open(BytesIO(data)).convert("RGB")
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
-
         MAX_NUM_FRAMES = 64
 
         def encode_video(video_path):
@@ -166,7 +146,7 @@ class MiniCPMV26Model(PytorchChatModel):
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
             frames = []

xinference/model/llm/transformers/yi_vl.py
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from threading import Thread
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
-from PIL import Image
 
 from ....model.utils import select_device
 from ....types import (
@@ -35,6 +31,7 @@ from ....types import (
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,25 +75,6 @@ class YiVLChatModel(PytorchChatModel):
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-
-                return Image.open(BytesIO(data))
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url)
-                else:
-                    return Image.open(BytesIO(response.content))
-
         if not isinstance(content, str):
             from ....thirdparty.llava.model.constants import DEFAULT_IMAGE_TOKEN
 
@@ -111,7 +89,7 @@ class YiVLChatModel(PytorchChatModel):
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
             text = " ".join(texts)

xinference/model/llm/utils.py
@@ -459,7 +459,16 @@ Begin!"""
             role = get_role(message["role"])
             content = message["content"]
             if isinstance(content, str):
-                ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+                if content:
+                    ret += (
+                        role
+                        + "\n"
+                        + content
+                        + prompt_style.intra_message_sep
+                        + "\n"
+                    )
+                else:
+                    ret += role + "\n"
             elif isinstance(content, list):
                 text = ""
                 image_urls = []

xinference/model/llm/vllm/core.py
@@ -721,7 +721,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         prompt_style = self.model_family.prompt_style.copy()
         chat_history = chat_history or []
         prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
-        logger.info(f"messages:{prompt}")
+
         if len(images) == 0:
             inputs = {
                 "prompt": prompt,

xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py
@@ -0,0 +1,3 @@
+from .grad_norm import GradNormMonitor
+
+__all__ = ["GradNormMonitor"]

xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py
@@ -0,0 +1,113 @@
+from typing import Optional, Union
+
+import lightning.pytorch as pl
+import torch
+from lightning import LightningModule, Trainer
+from lightning.pytorch.callbacks import Callback
+from torch import Tensor, nn
+from torch.utils._foreach_utils import (
+    _group_tensors_by_device_and_dtype,
+    _has_foreach_support,
+)
+
+
+@torch.no_grad()
+def grad_norm(
+    parameters: Union[Tensor, list[Tensor]],
+    norm_type: float = 2.0,
+) -> float:
+    """
+    Returns the norm of the gradients of the given parameters.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized
+        norm_type (float): type of the used p-norm.
+
+    Returns:
+        Total norm of the parameter gradients (viewed as a single vector).
+    """  # noqa: E501
+
+    if isinstance(parameters, Tensor):
+        parameters = [parameters]
+
+    grads = [p.grad for p in parameters if p.grad is not None]
+    if len(grads) == 0:
+        return None
+
+    first_device = grads[0].device
+    grouped_grads: dict[
+        tuple[torch.device, torch.dtype], list[list[Tensor]]
+    ] = _group_tensors_by_device_and_dtype(
+        [[g.detach() for g in grads]]
+    )  # type: ignore[assignment]
+
+    norms = []
+    for (device, _), ([grads], _) in grouped_grads.items():
+        if _has_foreach_support(grads, device=device):
+            norms.extend(torch._foreach_norm(grads, norm_type))
+        else:
+            norms.extend([torch.norm(g, norm_type) for g in grads])
+
+    return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
+
+
+class GradNormMonitor(Callback):
+    """
+    Callback that computes the gradient norm of the model parameters.
+    """
+
+    def __init__(
+        self,
+        norm_type: float = 2.0,
+        logging_interval: str = "step",
+        sub_module: Optional[Union[str, list[str]]] = None,
+    ) -> None:
+        """
+        Args:
+            norm_type (float): type of the used p-norm.
+            logging_interval (str): "step" or "epoch".
+        """
+        super().__init__()
+
+        self.norm_type = norm_type
+        self.logging_interval = logging_interval
+        self.sub_module = sub_module
+
+    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
+        """
+        Computes the gradient norm of the model parameters and logs it to the logger.
+
+        Args:
+            trainer (Trainer): The trainer object
+            model (LightningModule): The current lightningModule
+        """
+
+        lightning_model = model
+
+        if self.sub_module is None:
+            return self.log_sub_module_grad_norm(lightning_model, model, "")
+
+        sub_modules = self.sub_module
+        if isinstance(sub_modules, str):
+            sub_modules = [sub_modules]
+
+        for sub_module in sub_modules:
+            self.log_sub_module_grad_norm(
+                lightning_model, getattr(model, sub_module), f"/{sub_module}"
+            )
+
+    def log_sub_module_grad_norm(
+        self, lightning_model: LightningModule, model: nn.Module, path: str
+    ) -> None:
+        grad_norm_val = grad_norm(model.parameters(), self.norm_type)
+        if grad_norm_val is None:
+            return
+
+        on_step = self.logging_interval == "step"
+        lightning_model.log(
+            f"train{path}/grad_norm",
+            grad_norm_val,
+            on_step=on_step,
+            on_epoch=not on_step,
+        )
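
Note (not part of the diff): a minimal sketch of how GradNormMonitor would be attached to a Lightning trainer; MyLitModule and train_loader are placeholders, and the import path assumes the vendored fish_speech package.

import lightning.pytorch as pl

from xinference.thirdparty.fish_speech.fish_speech.callbacks import GradNormMonitor

# Logs train/grad_norm every step; pass sub_module="generator" (for example)
# to monitor a single child module instead of the whole model.
trainer = pl.Trainer(
    max_epochs=1,
    callbacks=[GradNormMonitor(norm_type=2.0, logging_interval="step")],
)
# trainer.fit(MyLitModule(), train_loader)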

xinference/thirdparty/fish_speech/fish_speech/conversation.py
@@ -0,0 +1,2 @@
+SEMANTIC_TOKEN = "<|semantic|>"
+CODEBOOK_PAD_TOKEN_ID = 0

xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py
@@ -0,0 +1,53 @@
+import bisect
+import random
+from typing import Iterable
+
+from torch.utils.data import Dataset, IterableDataset
+
+
+class ConcatRepeatDataset(Dataset):
+    datasets: list[Dataset]
+    cumulative_sizes: list[int]
+    repeats: list[int]
+
+    @staticmethod
+    def cumsum(sequence, repeats):
+        r, s = [], 0
+        for dataset, repeat in zip(sequence, repeats):
+            l = len(dataset) * repeat
+            r.append(l + s)
+            s += l
+        return r
+
+    def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
+        super().__init__()
+
+        self.datasets = list(datasets)
+        self.repeats = repeats
+
+        assert len(self.datasets) > 0, "datasets should not be an empty iterable"
+        assert len(self.datasets) == len(
+            repeats
+        ), "datasets and repeats should have the same length"
+
+        for d in self.datasets:
+            assert not isinstance(
+                d, IterableDataset
+            ), "ConcatRepeatDataset does not support IterableDataset"
+
+        self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+
+        dataset = self.datasets[dataset_idx]
+
+        return dataset[sample_idx % len(dataset)]
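
Note (not part of the diff): a small usage sketch of ConcatRepeatDataset with two toy TensorDatasets; the repeat counts are arbitrary.

import torch
from torch.utils.data import TensorDataset

a = TensorDataset(torch.arange(3))         # 3 samples, repeated twice
b = TensorDataset(torch.arange(100, 102))  # 2 samples, repeated once
combined = ConcatRepeatDataset([a, b], repeats=[2, 1])
assert len(combined) == 2 * 3 + 2  # 8
print(combined[4])  # index 4 wraps back into `a` via sample_idx % len(dataset)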

xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: text-data.proto
+# Protobuf Python Version: 4.25.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
+    b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
+)
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+    DESCRIPTOR._options = None
+    _globals["_SEMANTICS"]._serialized_start = 30
+    _globals["_SEMANTICS"]._serialized_end = 57
+    _globals["_SENTENCE"]._serialized_start = 59
+    _globals["_SENTENCE"]._serialized_end = 125
+    _globals["_TEXTDATA"]._serialized_start = 127
+    _globals["_TEXTDATA"]._serialized_end = 207
+    _globals["_SAMPLEDDATA"]._serialized_start = 209
+    _globals["_SAMPLEDDATA"]._serialized_end = 290
+# @@protoc_insertion_point(module_scope)

xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py
@@ -0,0 +1,36 @@
+import struct
+
+from .text_data_pb2 import TextData
+
+
+def read_pb_stream(f):
+    while True:
+        buf = f.read(4)
+        if len(buf) == 0:
+            break
+        size = struct.unpack("I", buf)[0]
+        buf = f.read(size)
+        text_data = TextData()
+        text_data.ParseFromString(buf)
+        yield text_data
+
+
+def write_pb_stream(f, text_data):
+    buf = text_data.SerializeToString()
+    f.write(struct.pack("I", len(buf)))
+    f.write(buf)
+
+
+def pack_pb_stream(text_data):
+    buf = text_data.SerializeToString()
+    return struct.pack("I", len(buf)) + buf
+
+
+def split_pb_stream(f):
+    while True:
+        head = f.read(4)
+        if len(head) == 0:
+            break
+        size = struct.unpack("I", head)[0]
+        buf = f.read(size)
+        yield head + buf
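
Note (not part of the diff): a round-trip sketch over an in-memory buffer, assuming the TextData fields (source, name) from the serialized text-data.proto above.

import io

buf = io.BytesIO()
write_pb_stream(buf, TextData(source="demo", name="sample-0"))
buf.seek(0)
for record in read_pb_stream(buf):
    print(record.source, record.name)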