xinference 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +48 -0
- xinference/client/restful/restful_client.py +19 -0
- xinference/core/chat_interface.py +5 -1
- xinference/core/image_interface.py +5 -1
- xinference/core/model.py +106 -16
- xinference/core/scheduler.py +1 -1
- xinference/deploy/supervisor.py +0 -4
- xinference/model/audio/chattts.py +25 -14
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/audio/model_spec_modelscope.json +1 -1
- xinference/model/embedding/model_spec.json +1 -1
- xinference/model/image/core.py +59 -4
- xinference/model/image/model_spec.json +24 -3
- xinference/model/image/model_spec_modelscope.json +25 -3
- xinference/model/image/ocr/__init__.py +13 -0
- xinference/model/image/ocr/got_ocr2.py +76 -0
- xinference/model/image/scheduler/flux.py +1 -1
- xinference/model/image/stable_diffusion/core.py +2 -3
- xinference/model/image/stable_diffusion/mlx.py +221 -0
- xinference/model/llm/llm_family.json +9 -0
- xinference/model/llm/llm_family_modelscope.json +11 -0
- xinference/thirdparty/mlx/__init__.py +13 -0
- xinference/thirdparty/mlx/flux/__init__.py +15 -0
- xinference/thirdparty/mlx/flux/autoencoder.py +357 -0
- xinference/thirdparty/mlx/flux/clip.py +154 -0
- xinference/thirdparty/mlx/flux/datasets.py +75 -0
- xinference/thirdparty/mlx/flux/flux.py +247 -0
- xinference/thirdparty/mlx/flux/layers.py +302 -0
- xinference/thirdparty/mlx/flux/lora.py +76 -0
- xinference/thirdparty/mlx/flux/model.py +134 -0
- xinference/thirdparty/mlx/flux/sampler.py +56 -0
- xinference/thirdparty/mlx/flux/t5.py +244 -0
- xinference/thirdparty/mlx/flux/tokenizers.py +185 -0
- xinference/thirdparty/mlx/flux/trainer.py +98 -0
- xinference/thirdparty/mlx/flux/utils.py +179 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.f7da0140.js → main.b76aeeb7.js} +3 -3
- xinference/web/ui/build/static/js/main.b76aeeb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/32ea2c04cf0bba2761b4883d2c40cc259952c94d2d6bb774e510963ca37aac0a.json +1 -0
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/METADATA +15 -8
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/RECORD +48 -31
- xinference/web/ui/build/static/js/main.f7da0140.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +0 -1
- /xinference/web/ui/build/static/js/{main.f7da0140.js.LICENSE.txt → main.b76aeeb7.js.LICENSE.txt} +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/LICENSE +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/WHEEL +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/top_level.txt +0 -0
xinference/model/image/ocr/got_ocr2.py
@@ -0,0 +1,76 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+import PIL.Image
+
+if TYPE_CHECKING:
+    from ..core import ImageModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class GotOCR2Model:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._tokenizer = None
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    def load(self):
+        from transformers import AutoModel, AutoTokenizer
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self._model_path, trust_remote_code=True
+        )
+        model = AutoModel.from_pretrained(
+            self._model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            device_map="cuda",
+            use_safetensors=True,
+            pad_token_id=self._tokenizer.eos_token_id,
+        )
+        self._model = model.eval().cuda()
+
+    def ocr(
+        self,
+        image: PIL.Image,
+        **kwargs,
+    ):
+        logger.info("Got OCR 2.0 kwargs: %s", kwargs)
+        if "ocr_type" not in kwargs:
+            kwargs["ocr_type"] = "ocr"
+        assert self._model is not None
+        # This chat API limits the max new tokens inside.
+        return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
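For orientation, the new GotOCR2Model is a thin wrapper around the GOT-OCR2 checkpoint: load() pulls the tokenizer and model with trust_remote_code onto CUDA, and ocr() delegates to the model's own chat API with gradio_input=True so a PIL image can be passed directly. Below is a minimal stand-alone sketch, hedged: inside xinference the model_spec is supplied by the image-model factory, so the SimpleNamespace is only a stand-in, and the checkpoint path is a placeholder.

from types import SimpleNamespace

from PIL import Image

from xinference.model.image.ocr.got_ocr2 import GotOCR2Model

# Stand-in spec: the constructor only reads model_spec.model_ability.
spec = SimpleNamespace(model_ability=["ocr"])

model = GotOCR2Model(
    model_uid="got-ocr2-demo",
    model_path="/path/to/GOT-OCR2_0",  # placeholder: locally downloaded checkpoint
    model_spec=spec,
)
model.load()  # requires a CUDA device
text = model.ocr(Image.open("page.png"))  # ocr_type defaults to "ocr"
print(text)

Through the server, the same call is presumably what the new plumbing in restful_api.py and restful_client.py exposes in this release.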
xinference/model/image/scheduler/flux.py
@@ -124,7 +124,7 @@ class FluxBatchSchedulerActor(xo.StatelessActor):
         self._running_queue: deque[Text2ImageRequest] = deque()  # type: ignore
         self._model = None
         self._available_device = get_available_device()
-        self._id_to_req: Dict[str, Text2ImageRequest] = {}
+        self._id_to_req: Dict[str, Text2ImageRequest] = {}  # type: ignore
 
     def set_model(self, model):
         """
xinference/model/image/stable_diffusion/core.py
@@ -283,9 +283,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             model.enable_sequential_cpu_offload()
         elif not self._kwargs.get("device_map"):
             logger.debug("Loading model to available device")
-            model = move_model_to_available_device(
-
-        if self._kwargs.get("attention_slicing", True):
+            model = move_model_to_available_device(model)
+        if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
             model.enable_vae_tiling()
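Note the behavioural change in this hunk: the unterminated move_model_to_available_device( call from 0.16.0 is repaired, and attention slicing becomes opt-in (default False) instead of on by default. A hedged sketch of restoring the old behaviour at launch time, assuming extra launch kwargs are forwarded into DiffusionModel's self._kwargs as for the other image options:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local xinference endpoint

# model_name is illustrative; any built-in diffusers image model should work.
model_uid = client.launch_model(
    model_name="sd3-medium",
    model_type="image",
    attention_slicing=True,  # re-enable the pre-0.16.1 default explicitly
)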
xinference/model/image/stable_diffusion/mlx.py
@@ -0,0 +1,221 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import gc
+import logging
+import re
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import numpy as np
+from PIL import Image
+from xoscar.utils import classproperty
+
+from ....types import LoRA
+from ..sdapi import SDAPIDiffusionModelMixin
+from ..utils import handle_image_result
+
+if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
+    from ..core import ImageModelFamilyV1
+
+
+logger = logging.getLogger(__name__)
+
+
+def quantization_predicate(name: str, m) -> bool:
+    return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0
+
+
+def to_latent_size(image_size: Tuple[int, int]):
+    h, w = image_size
+    h = ((h + 15) // 16) * 16
+    w = ((w + 15) // 16) * 16
+
+    if (h, w) != image_size:
+        print(
+            "Warning: The image dimensions need to be divisible by 16px. "
+            f"Changing size to {h}x{w}."
+        )
+
+    return (h // 8, w // 8)
+
+
+class MLXDiffusionModel(SDAPIDiffusionModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        lora_model: Optional[List[LoRA]] = None,
+        lora_load_kwargs: Optional[Dict] = None,
+        lora_fuse_kwargs: Optional[Dict] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._lora_model = lora_model
+        self._lora_load_kwargs = lora_load_kwargs or {}
+        self._lora_fuse_kwargs = lora_fuse_kwargs or {}
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    @classproperty
+    def supported_models(self):
+        return ["FLUX.1-schnell", "FLUX.1-dev"]
+
+    def load(self):
+        try:
+            import mlx.nn as nn
+        except ImportError:
+            error_message = "Failed to import module 'mlx'"
+            installation_guide = [
+                "Please make sure 'mlx' is installed. ",
+                "You can install it by `pip install mlx`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        from ....thirdparty.mlx.flux import FluxPipeline
+
+        logger.debug(
+            "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
+        )
+        flux = self._model = FluxPipeline(
+            "flux-" + self._model_spec.model_name.split("-")[1],
+            model_path=self._model_path,
+            t5_padding=self._kwargs.get("t5_padding", True),
+        )
+        self._apply_lora()
+
+        quantize = self._kwargs.get("quantize", True)
+        if quantize:
+            nn.quantize(flux.flow, class_predicate=quantization_predicate)
+            nn.quantize(flux.t5, class_predicate=quantization_predicate)
+            nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
+    def _apply_lora(self):
+        if self._lora_model is not None:
+            import mlx.core as mx
+
+            for lora_model in self._lora_model:
+                weights, lora_config = mx.load(
+                    lora_model.local_path, return_metadata=True
+                )
+                rank = int(lora_config.get("lora_rank", 8))
+                num_blocks = int(lora_config.get("lora_blocks", -1))
+                flux = self._model
+                flux.linear_to_lora_layers(rank, num_blocks)
+                flux.flow.load_weights(list(weights.items()), strict=False)
+                flux.fuse_lora_layers()
+            logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
+    @staticmethod
+    @contextlib.contextmanager
+    def _release_after():
+        import mlx.core as mx
+
+        try:
+            yield
+        finally:
+            gc.collect()
+            mx.metal.clear_cache()
+
+    def text_to_image(
+        self,
+        prompt: str,
+        n: int = 1,
+        size: str = "1024*1024",
+        response_format: str = "url",
+        **kwargs,
+    ):
+        import mlx.core as mx
+
+        flux = self._model
+        width, height = map(int, re.split(r"[^\d]+", size))
+
+        # Make the generator
+        latent_size = to_latent_size((height, width))
+        gen_latent_kwargs = {}
+        if (num_steps := kwargs.get("num_inference_steps")) is None:
+            num_steps = 50 if "dev" in self._model_spec.model_name else 2  # type: ignore
+        gen_latent_kwargs["num_steps"] = num_steps
+        if guidance := kwargs.get("guidance_scale"):
+            gen_latent_kwargs["guidance"] = guidance
+        if seed := kwargs.get("seed"):
+            gen_latent_kwargs["seed"] = seed
+
+        with self._release_after():
+            latents = flux.generate_latents(  # type: ignore
+                prompt, n_images=n, latent_size=latent_size, **gen_latent_kwargs
+            )
+
+            # First we get and eval the conditioning
+            conditioning = next(latents)
+            mx.eval(conditioning)
+            peak_mem_conditioning = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            progressor: Progressor = kwargs.pop("progressor", None)
+            # Actual denoising loop
+            for i, x_t in enumerate(latents):
+                mx.eval(x_t)
+                progressor.set_progress((i + 1) / num_steps)
+
+            peak_mem_generation = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            # Decode them into images
+            decoded = []
+            for i in range(n):
+                decoded.append(flux.decode(x_t[i : i + 1], latent_size))  # type: ignore
+                mx.eval(decoded[-1])
+            peak_mem_decoding = mx.metal.get_peak_memory() / 1024**3
+            peak_mem_overall = max(
+                peak_mem_conditioning, peak_mem_generation, peak_mem_decoding
+            )
+
+            images = []
+            x = mx.concatenate(decoded, axis=0)
+            x = (x * 255).astype(mx.uint8)
+            for i in range(len(x)):
+                im = Image.fromarray(np.array(x[i]))
+                images.append(im)
+
+        logger.debug(
+            f"Peak memory used for the text: {peak_mem_conditioning:.3f}GB"
+        )
+        logger.debug(
+            f"Peak memory used for the generation: {peak_mem_generation:.3f}GB"
+        )
+        logger.debug(f"Peak memory used for the decoding: {peak_mem_decoding:.3f}GB")
+        logger.debug(f"Peak memory used overall: {peak_mem_overall:.3f}GB")
+
+        return handle_image_result(response_format, images)
+
+    def image_to_image(self, **kwargs):
+        raise NotImplementedError
+
+    def inpainting(self, **kwargs):
+        raise NotImplementedError
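The MLX path is Apple-silicon only (it relies on mx.metal for cache and peak-memory bookkeeping) and currently covers FLUX.1-schnell and FLUX.1-dev for text_to_image, with image_to_image and inpainting unimplemented. A rough stand-alone sketch follows, with the same caveats as before: the spec and progressor are supplied by the framework in practice, so stand-ins are used here, and the weights path is a placeholder.

from types import SimpleNamespace

from xinference.model.image.stable_diffusion.mlx import MLXDiffusionModel

spec = SimpleNamespace(model_name="FLUX.1-schnell", model_ability=["text2image"])

model = MLXDiffusionModel(
    model_uid="flux-schnell-mlx",
    model_path="/path/to/FLUX.1-schnell",  # placeholder local weights
    model_spec=spec,                       # stand-in for ImageModelFamilyV1
    quantize=True,                         # quantizes flow/T5/CLIP during load()
)
model.load()

result = model.text_to_image(
    "an astronaut riding a horse on mars",
    n=1,
    size="768*768",             # rounded to multiples of 16; latent size is /8
    response_format="b64_json",
    num_inference_steps=4,      # schnell falls back to 2 steps when omitted
    seed=42,
    # text_to_image pops a progressor from kwargs; a no-op stand-in avoids
    # depending on xinference's progress tracker in this sketch.
    progressor=SimpleNamespace(set_progress=lambda p: None),
)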
xinference/model/llm/llm_family.json
@@ -8176,6 +8176,15 @@
       ],
       "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": "7",
+      "quantizations": [
+        "Int4",
+        "Int8"
+      ],
+      "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": "1_5",
xinference/model/llm/llm_family_modelscope.json
@@ -5880,6 +5880,17 @@
       "model_revision": "master",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "Int4",
+        "Int8"
+      ],
+      "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}",
+      "model_revision": "master",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": "1_5",
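Both JSON hunks register GPTQ Int4/Int8 builds of Qwen2.5-Coder-7B-Instruct, resolved from Hugging Face and ModelScope respectively. A hedged launch sketch, assuming the family is registered as qwen2.5-coder-instruct and that a GPTQ-capable engine such as transformers (with auto-gptq) is installed:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local xinference endpoint

uid = client.launch_model(
    model_name="qwen2.5-coder-instruct",  # assumed registered family name
    model_engine="transformers",          # vllm can also serve GPTQ weights
    model_format="gptq",
    model_size_in_billions=7,
    quantization="Int4",                  # "Int8" is also published
)
model = client.get_model(uid)  # chat handle backed by the quantized build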
xinference/thirdparty/mlx/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
xinference/thirdparty/mlx/flux/__init__.py
@@ -0,0 +1,15 @@
+# Copyright © 2024 Apple Inc.
+
+from .datasets import Dataset, load_dataset
+from .flux import FluxPipeline
+from .lora import LoRALinear
+from .sampler import FluxSampler
+from .trainer import Trainer
+from .utils import (
+    load_ae,
+    load_clip,
+    load_clip_tokenizer,
+    load_flow_model,
+    load_t5,
+    load_t5_tokenizer,
+)
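The vendored flux package re-exports exactly the pieces MLXDiffusionModel consumes: FluxPipeline plus the loader, sampler, LoRA, and trainer helpers carried over from Apple's MLX flux implementation. A compressed sketch of the same generate/decode flow using FluxPipeline directly, under the usual assumptions (Apple silicon, mlx installed, schnell weights at a placeholder path):

import mlx.core as mx
import numpy as np
from PIL import Image

from xinference.thirdparty.mlx.flux import FluxPipeline

flux = FluxPipeline("flux-schnell", model_path="/path/to/FLUX.1-schnell")

# 512x512 output -> latent size (64, 64), i.e. height/8 by width/8.
latents = flux.generate_latents(
    "a watercolor lighthouse at dusk", n_images=1, latent_size=(64, 64), num_steps=2
)
mx.eval(next(latents))   # conditioning is yielded first, as in MLXDiffusionModel
for x_t in latents:      # then one latent per denoising step
    mx.eval(x_t)

decoded = flux.decode(x_t[0:1], (64, 64))
mx.eval(decoded)
img = Image.fromarray(np.array((decoded[0] * 255).astype(mx.uint8)))
img.save("flux_schnell.png")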