xinference 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (50)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +48 -0
  3. xinference/client/restful/restful_client.py +19 -0
  4. xinference/core/chat_interface.py +5 -1
  5. xinference/core/image_interface.py +5 -1
  6. xinference/core/model.py +106 -16
  7. xinference/core/scheduler.py +1 -1
  8. xinference/deploy/supervisor.py +0 -4
  9. xinference/model/audio/chattts.py +25 -14
  10. xinference/model/audio/model_spec.json +1 -1
  11. xinference/model/audio/model_spec_modelscope.json +1 -1
  12. xinference/model/embedding/model_spec.json +1 -1
  13. xinference/model/image/core.py +59 -4
  14. xinference/model/image/model_spec.json +24 -3
  15. xinference/model/image/model_spec_modelscope.json +25 -3
  16. xinference/model/image/ocr/__init__.py +13 -0
  17. xinference/model/image/ocr/got_ocr2.py +76 -0
  18. xinference/model/image/scheduler/flux.py +1 -1
  19. xinference/model/image/stable_diffusion/core.py +2 -3
  20. xinference/model/image/stable_diffusion/mlx.py +221 -0
  21. xinference/model/llm/llm_family.json +9 -0
  22. xinference/model/llm/llm_family_modelscope.json +11 -0
  23. xinference/thirdparty/mlx/__init__.py +13 -0
  24. xinference/thirdparty/mlx/flux/__init__.py +15 -0
  25. xinference/thirdparty/mlx/flux/autoencoder.py +357 -0
  26. xinference/thirdparty/mlx/flux/clip.py +154 -0
  27. xinference/thirdparty/mlx/flux/datasets.py +75 -0
  28. xinference/thirdparty/mlx/flux/flux.py +247 -0
  29. xinference/thirdparty/mlx/flux/layers.py +302 -0
  30. xinference/thirdparty/mlx/flux/lora.py +76 -0
  31. xinference/thirdparty/mlx/flux/model.py +134 -0
  32. xinference/thirdparty/mlx/flux/sampler.py +56 -0
  33. xinference/thirdparty/mlx/flux/t5.py +244 -0
  34. xinference/thirdparty/mlx/flux/tokenizers.py +185 -0
  35. xinference/thirdparty/mlx/flux/trainer.py +98 -0
  36. xinference/thirdparty/mlx/flux/utils.py +179 -0
  37. xinference/web/ui/build/asset-manifest.json +3 -3
  38. xinference/web/ui/build/index.html +1 -1
  39. xinference/web/ui/build/static/js/{main.f7da0140.js → main.b76aeeb7.js} +3 -3
  40. xinference/web/ui/build/static/js/main.b76aeeb7.js.map +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/32ea2c04cf0bba2761b4883d2c40cc259952c94d2d6bb774e510963ca37aac0a.json +1 -0
  42. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/METADATA +15 -8
  43. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/RECORD +48 -31
  44. xinference/web/ui/build/static/js/main.f7da0140.js.map +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +0 -1
  46. /xinference/web/ui/build/static/js/{main.f7da0140.js.LICENSE.txt → main.b76aeeb7.js.LICENSE.txt} +0 -0
  47. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/LICENSE +0 -0
  48. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/WHEEL +0 -0
  49. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/entry_points.txt +0 -0
  50. {xinference-0.16.0.dist-info → xinference-0.16.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,76 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ from typing import TYPE_CHECKING, Optional
+
+ import PIL.Image
+
+ if TYPE_CHECKING:
+     from ..core import ImageModelFamilyV1
+
+ logger = logging.getLogger(__name__)
+
+
+ class GotOCR2Model:
+     def __init__(
+         self,
+         model_uid: str,
+         model_path: Optional[str] = None,
+         device: Optional[str] = None,
+         model_spec: Optional["ImageModelFamilyV1"] = None,
+         **kwargs,
+     ):
+         self._model_uid = model_uid
+         self._model_path = model_path
+         self._device = device
+         # model info when loading
+         self._model = None
+         self._tokenizer = None
+         # info
+         self._model_spec = model_spec
+         self._abilities = model_spec.model_ability or []  # type: ignore
+         self._kwargs = kwargs
+
+     @property
+     def model_ability(self):
+         return self._abilities
+
+     def load(self):
+         from transformers import AutoModel, AutoTokenizer
+
+         self._tokenizer = AutoTokenizer.from_pretrained(
+             self._model_path, trust_remote_code=True
+         )
+         model = AutoModel.from_pretrained(
+             self._model_path,
+             trust_remote_code=True,
+             low_cpu_mem_usage=True,
+             device_map="cuda",
+             use_safetensors=True,
+             pad_token_id=self._tokenizer.eos_token_id,
+         )
+         self._model = model.eval().cuda()
+
+     def ocr(
+         self,
+         image: PIL.Image,
+         **kwargs,
+     ):
+         logger.info("Got OCR 2.0 kwargs: %s", kwargs)
+         if "ocr_type" not in kwargs:
+             kwargs["ocr_type"] = "ocr"
+         assert self._model is not None
+         # This chat API limits the max new tokens inside.
+         return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
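
Taken together with the restful_api.py (+48) and restful_client.py (+19) changes above, this presumably wires OCR into the normal client flow. A minimal sketch, assuming the new image-model handle exposes an `ocr` method and that this spec is registered under the name `GOT-OCR2_0` (both assumptions, not verified against this diff). Note that `load()` hard-codes `device_map="cuda"`, so this path requires an NVIDIA GPU:

```python
# Sketch only: assumes the +19 lines in restful_client.py add an `ocr`
# method to the image model handle, and that the spec name is "GOT-OCR2_0".
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(model_name="GOT-OCR2_0", model_type="image")
model = client.get_model(model_uid)

with open("receipt.png", "rb") as f:
    # ocr_type defaults to "ocr" inside GotOCR2Model.ocr (see above)
    print(model.ocr(f.read()))
```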
@@ -124,7 +124,7 @@ class FluxBatchSchedulerActor(xo.StatelessActor):
          self._running_queue: deque[Text2ImageRequest] = deque()  # type: ignore
          self._model = None
          self._available_device = get_available_device()
-         self._id_to_req: Dict[str, Text2ImageRequest] = {}
+         self._id_to_req: Dict[str, Text2ImageRequest] = {}  # type: ignore

      def set_model(self, model):
          """
@@ -283,9 +283,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
              model.enable_sequential_cpu_offload()
          elif not self._kwargs.get("device_map"):
              logger.debug("Loading model to available device")
-             model = move_model_to_available_device(self._model)
-         # Recommended if your computer has < 64 GB of RAM
-         if self._kwargs.get("attention_slicing", True):
+             model = move_model_to_available_device(model)
+         if self._kwargs.get("attention_slicing", False):
              model.enable_attention_slicing()
          if self._kwargs.get("vae_tiling", False):
              model.enable_vae_tiling()
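
Two things change in this hunk: the load path now moves the freshly created `model` (it previously referenced `self._model` before assignment), and `attention_slicing` flips from opt-out to opt-in. Memory-constrained users now have to request slicing explicitly; a hedged sketch of doing so at launch time (the model name is a placeholder):

```python
# attention_slicing is now opt-in; extra launch kwargs flow into
# self._kwargs of DiffusionModel. Model name below is a placeholder.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="stable-diffusion-v1.5",  # placeholder image model
    model_type="image",
    attention_slicing=True,              # previously enabled by default
)
```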
@@ -0,0 +1,221 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import contextlib
+ import gc
+ import logging
+ import re
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+ import numpy as np
+ from PIL import Image
+ from xoscar.utils import classproperty
+
+ from ....types import LoRA
+ from ..sdapi import SDAPIDiffusionModelMixin
+ from ..utils import handle_image_result
+
+ if TYPE_CHECKING:
+     from ....core.progress_tracker import Progressor
+     from ..core import ImageModelFamilyV1
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def quantization_predicate(name: str, m) -> bool:
+     return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0
+
+
+ def to_latent_size(image_size: Tuple[int, int]):
+     h, w = image_size
+     h = ((h + 15) // 16) * 16
+     w = ((w + 15) // 16) * 16
+
+     if (h, w) != image_size:
+         print(
+             "Warning: The image dimensions need to be divisible by 16px. "
+             f"Changing size to {h}x{w}."
+         )
+
+     return (h // 8, w // 8)
+
+
+ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
+     def __init__(
+         self,
+         model_uid: str,
+         model_path: Optional[str] = None,
+         device: Optional[str] = None,
+         lora_model: Optional[List[LoRA]] = None,
+         lora_load_kwargs: Optional[Dict] = None,
+         lora_fuse_kwargs: Optional[Dict] = None,
+         model_spec: Optional["ImageModelFamilyV1"] = None,
+         **kwargs,
+     ):
+         self._model_uid = model_uid
+         self._model_path = model_path
+         self._device = device
+         # model info when loading
+         self._model = None
+         self._lora_model = lora_model
+         self._lora_load_kwargs = lora_load_kwargs or {}
+         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
+         # info
+         self._model_spec = model_spec
+         self._abilities = model_spec.model_ability or []  # type: ignore
+         self._kwargs = kwargs
+
+     @property
+     def model_ability(self):
+         return self._abilities
+
+     @classproperty
+     def supported_models(self):
+         return ["FLUX.1-schnell", "FLUX.1-dev"]
+
+     def load(self):
+         try:
+             import mlx.nn as nn
+         except ImportError:
+             error_message = "Failed to import module 'mlx'"
+             installation_guide = [
+                 "Please make sure 'mlx' is installed. ",
+                 "You can install it by `pip install mlx`\n",
+             ]
+
+             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+         from ....thirdparty.mlx.flux import FluxPipeline
+
+         logger.debug(
+             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
+         )
+         flux = self._model = FluxPipeline(
+             "flux-" + self._model_spec.model_name.split("-")[1],
+             model_path=self._model_path,
+             t5_padding=self._kwargs.get("t5_padding", True),
+         )
+         self._apply_lora()
+
+         quantize = self._kwargs.get("quantize", True)
+         if quantize:
+             nn.quantize(flux.flow, class_predicate=quantization_predicate)
+             nn.quantize(flux.t5, class_predicate=quantization_predicate)
+             nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
+     def _apply_lora(self):
+         if self._lora_model is not None:
+             import mlx.core as mx
+
+             for lora_model in self._lora_model:
+                 weights, lora_config = mx.load(
+                     lora_model.local_path, return_metadata=True
+                 )
+                 rank = int(lora_config.get("lora_rank", 8))
+                 num_blocks = int(lora_config.get("lora_blocks", -1))
+                 flux = self._model
+                 flux.linear_to_lora_layers(rank, num_blocks)
+                 flux.flow.load_weights(list(weights.items()), strict=False)
+                 flux.fuse_lora_layers()
+             logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
+     @staticmethod
+     @contextlib.contextmanager
+     def _release_after():
+         import mlx.core as mx
+
+         try:
+             yield
+         finally:
+             gc.collect()
+             mx.metal.clear_cache()
+
+     def text_to_image(
+         self,
+         prompt: str,
+         n: int = 1,
+         size: str = "1024*1024",
+         response_format: str = "url",
+         **kwargs,
+     ):
+         import mlx.core as mx
+
+         flux = self._model
+         width, height = map(int, re.split(r"[^\d]+", size))
+
+         # Make the generator
+         latent_size = to_latent_size((height, width))
+         gen_latent_kwargs = {}
+         if (num_steps := kwargs.get("num_inference_steps")) is None:
+             num_steps = 50 if "dev" in self._model_spec.model_name else 2  # type: ignore
+         gen_latent_kwargs["num_steps"] = num_steps
+         if guidance := kwargs.get("guidance_scale"):
+             gen_latent_kwargs["guidance"] = guidance
+         if seed := kwargs.get("seed"):
+             gen_latent_kwargs["seed"] = seed
+
+         with self._release_after():
+             latents = flux.generate_latents(  # type: ignore
+                 prompt, n_images=n, latent_size=latent_size, **gen_latent_kwargs
+             )
+
+             # First we get and eval the conditioning
+             conditioning = next(latents)
+             mx.eval(conditioning)
+             peak_mem_conditioning = mx.metal.get_peak_memory() / 1024**3
+             mx.metal.reset_peak_memory()
+
+             progressor: Progressor = kwargs.pop("progressor", None)
+             # Actual denoising loop
+             for i, x_t in enumerate(latents):
+                 mx.eval(x_t)
+                 progressor.set_progress((i + 1) / num_steps)
+
+             peak_mem_generation = mx.metal.get_peak_memory() / 1024**3
+             mx.metal.reset_peak_memory()
+
+             # Decode them into images
+             decoded = []
+             for i in range(n):
+                 decoded.append(flux.decode(x_t[i : i + 1], latent_size))  # type: ignore
+                 mx.eval(decoded[-1])
+             peak_mem_decoding = mx.metal.get_peak_memory() / 1024**3
+             peak_mem_overall = max(
+                 peak_mem_conditioning, peak_mem_generation, peak_mem_decoding
+             )
+
+             images = []
+             x = mx.concatenate(decoded, axis=0)
+             x = (x * 255).astype(mx.uint8)
+             for i in range(len(x)):
+                 im = Image.fromarray(np.array(x[i]))
+                 images.append(im)
+
+         logger.debug(
+             f"Peak memory used for the text: {peak_mem_conditioning:.3f}GB"
+         )
+         logger.debug(
+             f"Peak memory used for the generation: {peak_mem_generation:.3f}GB"
+         )
+         logger.debug(f"Peak memory used for the decoding: {peak_mem_decoding:.3f}GB")
+         logger.debug(f"Peak memory used overall: {peak_mem_overall:.3f}GB")
+
+         return handle_image_result(response_format, images)
+
+     def image_to_image(self, **kwargs):
+         raise NotImplementedError
+
+     def inpainting(self, **kwargs):
+         raise NotImplementedError
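
For orientation, a hedged sketch of driving this new MLX backend end to end, assuming launch-time kwargs are forwarded into `MLXDiffusionModel.__init__` (the model names come from `supported_models` above; requires Apple silicon and `pip install mlx`):

```python
# Sketch under the assumption that launch kwargs land in self._kwargs.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="FLUX.1-schnell",
    model_type="image",
    quantize=True,  # matches self._kwargs.get("quantize", True) above
)
model = client.get_model(model_uid)
result = model.text_to_image(
    "an astronaut riding a horse on the moon",
    size="1024*1024",           # parsed by re.split(r"[^\d]+", size)
    num_inference_steps=2,      # schnell default in the code above
    response_format="b64_json",
)
```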
@@ -8176,6 +8176,15 @@
        ],
        "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct"
      },
+     {
+       "model_format": "gptq",
+       "model_size_in_billions": "7",
+       "quantizations": [
+         "Int4",
+         "Int8"
+       ],
+       "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}"
+     },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": "1_5",
@@ -5880,6 +5880,17 @@
        "model_revision": "master",
        "model_hub": "modelscope"
      },
+     {
+       "model_format": "gptq",
+       "model_size_in_billions": 7,
+       "quantizations": [
+         "Int4",
+         "Int8"
+       ],
+       "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}",
+       "model_revision": "master",
+       "model_hub": "modelscope"
+     },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": "1_5",
@@ -0,0 +1,13 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
@@ -0,0 +1,15 @@
+ # Copyright © 2024 Apple Inc.
+
+ from .datasets import Dataset, load_dataset
+ from .flux import FluxPipeline
+ from .lora import LoRALinear
+ from .sampler import FluxSampler
+ from .trainer import Trainer
+ from .utils import (
+     load_ae,
+     load_clip,
+     load_clip_tokenizer,
+     load_flow_model,
+     load_t5,
+     load_t5_tokenizer,
+ )
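
Since this `__init__.py` re-exports the package's public surface, the vendored pipeline components can presumably be imported directly as well:

```python
# Direct import of the vendored MLX FLUX components, per the re-exports above.
from xinference.thirdparty.mlx.flux import FluxPipeline, FluxSampler, load_t5
```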