xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (191)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +1 -1
  3. xinference/core/image_interface.py +9 -0
  4. xinference/core/model.py +4 -1
  5. xinference/core/worker.py +60 -44
  6. xinference/model/audio/chattts.py +25 -9
  7. xinference/model/audio/core.py +8 -2
  8. xinference/model/audio/cosyvoice.py +4 -3
  9. xinference/model/audio/custom.py +4 -5
  10. xinference/model/audio/fish_speech.py +228 -0
  11. xinference/model/audio/model_spec.json +8 -0
  12. xinference/model/embedding/core.py +25 -1
  13. xinference/model/embedding/custom.py +4 -5
  14. xinference/model/flexible/core.py +5 -1
  15. xinference/model/image/custom.py +4 -5
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +66 -3
  19. xinference/model/llm/__init__.py +6 -0
  20. xinference/model/llm/llm_family.json +54 -9
  21. xinference/model/llm/llm_family.py +7 -6
  22. xinference/model/llm/llm_family_modelscope.json +56 -10
  23. xinference/model/llm/lmdeploy/__init__.py +0 -0
  24. xinference/model/llm/lmdeploy/core.py +557 -0
  25. xinference/model/llm/sglang/core.py +7 -1
  26. xinference/model/llm/transformers/cogvlm2.py +4 -45
  27. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  28. xinference/model/llm/transformers/core.py +3 -0
  29. xinference/model/llm/transformers/glm4v.py +2 -23
  30. xinference/model/llm/transformers/intern_vl.py +94 -11
  31. xinference/model/llm/transformers/minicpmv25.py +2 -23
  32. xinference/model/llm/transformers/minicpmv26.py +2 -22
  33. xinference/model/llm/transformers/yi_vl.py +2 -24
  34. xinference/model/llm/utils.py +13 -1
  35. xinference/model/llm/vllm/core.py +1 -34
  36. xinference/model/rerank/custom.py +4 -5
  37. xinference/model/utils.py +41 -1
  38. xinference/model/video/core.py +3 -1
  39. xinference/model/video/diffusers.py +41 -38
  40. xinference/model/video/model_spec.json +24 -1
  41. xinference/model/video/model_spec_modelscope.json +25 -1
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/matcha/__init__.py +0 -0
  137. xinference/thirdparty/matcha/app.py +357 -0
  138. xinference/thirdparty/matcha/cli.py +419 -0
  139. xinference/thirdparty/matcha/data/__init__.py +0 -0
  140. xinference/thirdparty/matcha/data/components/__init__.py +0 -0
  141. xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
  142. xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
  143. xinference/thirdparty/matcha/hifigan/config.py +28 -0
  144. xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
  145. xinference/thirdparty/matcha/hifigan/env.py +17 -0
  146. xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
  147. xinference/thirdparty/matcha/hifigan/models.py +368 -0
  148. xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
  149. xinference/thirdparty/matcha/models/__init__.py +0 -0
  150. xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
  151. xinference/thirdparty/matcha/models/components/__init__.py +0 -0
  152. xinference/thirdparty/matcha/models/components/decoder.py +443 -0
  153. xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
  154. xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
  155. xinference/thirdparty/matcha/models/components/transformer.py +316 -0
  156. xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
  157. xinference/thirdparty/matcha/onnx/__init__.py +0 -0
  158. xinference/thirdparty/matcha/onnx/export.py +181 -0
  159. xinference/thirdparty/matcha/onnx/infer.py +168 -0
  160. xinference/thirdparty/matcha/text/__init__.py +53 -0
  161. xinference/thirdparty/matcha/text/cleaners.py +121 -0
  162. xinference/thirdparty/matcha/text/numbers.py +71 -0
  163. xinference/thirdparty/matcha/text/symbols.py +17 -0
  164. xinference/thirdparty/matcha/train.py +122 -0
  165. xinference/thirdparty/matcha/utils/__init__.py +5 -0
  166. xinference/thirdparty/matcha/utils/audio.py +82 -0
  167. xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
  168. xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
  169. xinference/thirdparty/matcha/utils/instantiators.py +56 -0
  170. xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
  171. xinference/thirdparty/matcha/utils/model.py +90 -0
  172. xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
  173. xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
  174. xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
  175. xinference/thirdparty/matcha/utils/pylogger.py +21 -0
  176. xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
  177. xinference/thirdparty/matcha/utils/utils.py +259 -0
  178. xinference/web/ui/build/asset-manifest.json +3 -3
  179. xinference/web/ui/build/index.html +1 -1
  180. xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
  181. xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
  182. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  183. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/METADATA +31 -11
  184. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/RECORD +189 -49
  185. xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
  186. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  187. /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  188. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
  189. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
  190. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
  191. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
xinference/model/audio/model_spec.json
@@ -146,5 +146,13 @@
     "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "FishSpeech-1.2-SFT",
+    "model_family": "FishAudio",
+    "model_id": "fishaudio/fish-speech-1.2-sft",
+    "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
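For context, the new entry can be exercised through the usual xinference audio flow. The snippet below is a hedged sketch only: it assumes a locally running server on the default port and the documented client calls (launch_model with model_type="audio", then speech on the returned handle); exact parameters may differ.

from xinference.client import Client

# Hedged sketch: assumes a running xinference server and the RESTful client's
# audio-model handle; adjust host/port and output handling as needed.
client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(model_name="FishSpeech-1.2-SFT", model_type="audio")
model = client.get_model(model_uid)
audio_bytes = model.speech("Hello from Fish Speech.")  # returns encoded audio bytes
with open("fish_speech_demo.mp3", "wb") as f:
    f.write(audio_bytes)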
xinference/model/embedding/core.py
@@ -124,6 +124,7 @@ class EmbeddingModel:
         model_path: str,
         model_spec: EmbeddingModelSpec,
         device: Optional[str] = None,
+        **kwargs,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
@@ -131,6 +132,7 @@ class EmbeddingModel:
         self._model = None
         self._counter = 0
         self._model_spec = model_spec
+        self._kwargs = kwargs

     def load(self):
         try:
@@ -154,10 +156,32 @@ class EmbeddingModel:
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
         ):
+            import torch
+
+            torch_dtype_str = self._kwargs.get("torch_dtype")
+            if torch_dtype_str is not None:
+                try:
+                    torch_dtype = getattr(torch, torch_dtype_str)
+                    if torch_dtype not in [
+                        torch.float16,
+                        torch.float32,
+                        torch.bfloat16,
+                    ]:
+                        logger.warning(
+                            f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                        )
+                        torch_dtype = torch.float32
+                except AttributeError:
+                    logger.warning(
+                        f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                    )
+                    torch_dtype = torch.float32
+            else:
+                torch_dtype = "auto"
             self._model = XSentenceTransformer(
                 self._model_path,
                 device=self._device,
-                model_kwargs={"device_map": "auto"},
+                model_kwargs={"device_map": "auto", "torch_dtype": torch_dtype},
             )
         else:
             self._model = SentenceTransformer(self._model_path, device=self._device)
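In effect, the new torch_dtype kwarg is resolved as follows: a known floating-point dtype name maps to the corresponding torch dtype, anything unknown or unsupported falls back to float32, and leaving the kwarg unset keeps the "auto" behavior. A minimal standalone sketch of that resolution logic (hypothetical helper name, not part of xinference):

def resolve_torch_dtype(torch_dtype_str):
    # Hypothetical helper mirroring the diff's logic: map a dtype string to a
    # torch dtype, falling back to float32 for unknown/unsupported values.
    import torch

    if torch_dtype_str is None:
        return "auto"  # let the loader pick the dtype
    dtype = getattr(torch, torch_dtype_str, None)
    if dtype not in (torch.float16, torch.float32, torch.bfloat16):
        return torch.float32
    return dtype

# e.g. resolve_torch_dtype("bfloat16") -> torch.bfloat16
#      resolve_torch_dtype("int8")     -> torch.float32 (unsupported)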
xinference/model/embedding/custom.py
@@ -47,6 +47,10 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")

+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}.")
+
     with UD_EMBEDDING_LOCK:
         for model_name in (
             list(BUILTIN_EMBEDDING_MODELS.keys())
@@ -61,11 +65,6 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
         UD_EMBEDDINGS.append(model_spec)

     if persist:
-        # We only validate model URL when persist is True.
-        model_uri = model_spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}.")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "embedding", f"{model_spec.model_name}.json"
         )
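The same reordering recurs in the flexible, image, and LLM registries below: the model URI is now validated up front for every registration, not only when persist is True. A hedged, self-contained sketch of the new control flow (all names below are illustrative stand-ins, not xinference's actual validators):

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Spec:
    model_name: str
    model_uri: Optional[str] = None

REGISTRY: List[Spec] = []

def is_valid_model_name(name: str) -> bool:
    return bool(name) and "/" not in name  # stub validator

def is_valid_model_uri(uri: str) -> bool:
    return uri.startswith(("file://", "/"))  # stub validator

def register(spec: Spec, persist: bool) -> None:
    # URI is rejected before any registry state changes, even when
    # persist is False; only the on-disk write remains gated on persist.
    if not is_valid_model_name(spec.model_name):
        raise ValueError(f"Invalid model name {spec.model_name}.")
    if spec.model_uri and not is_valid_model_uri(spec.model_uri):
        raise ValueError(f"Invalid model URI {spec.model_uri}.")
    REGISTRY.append(spec)
    if persist:
        pass  # write the spec JSON under the model directory

register(Spec("my-embedder", "/data/models/my-embedder"), persist=False)   # ok
# register(Spec("my-embedder", "not a uri"), persist=False)                # now raises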
xinference/model/flexible/core.py
@@ -99,11 +99,15 @@ def get_flexible_model_descriptions():


 def register_flexible_model(model_spec: FlexibleModelSpec, persist: bool):
-    from ..utils import is_valid_model_name
+    from ..utils import is_valid_model_name, is_valid_model_uri

     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")

+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}.")
+
     if model_spec.launcher_args:
         try:
             model_spec.parser_args()
xinference/model/image/custom.py
@@ -47,6 +47,10 @@ def register_image(model_spec: CustomImageModelFamilyV1, persist: bool):
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")

+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}")
+
     with UD_IMAGE_LOCK:
         for model_name in (
             list(BUILTIN_IMAGE_MODELS.keys())
@@ -60,11 +64,6 @@ def register_image(model_spec: CustomImageModelFamilyV1, persist: bool):
         UD_IMAGES.append(model_spec)

     if persist:
-        # We only validate model URL when persist is True.
-        model_uri = model_spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "image", f"{model_spec.model_name}.json"
         )
xinference/model/image/model_spec.json
@@ -24,7 +24,8 @@
     "model_revision": "ea42f8cef0f178587cf766dc8129abd379c90671",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {
xinference/model/image/model_spec_modelscope.json
@@ -27,7 +27,8 @@
     "model_revision": "master",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {
xinference/model/image/stable_diffusion/core.py
@@ -24,6 +24,9 @@ from functools import partial
 from io import BytesIO
 from typing import Dict, List, Optional, Union

+import PIL.Image
+from PIL import ImageOps
+
 from ....constants import XINFERENCE_IMAGE_DIR
 from ....device_utils import move_model_to_available_device
 from ....types import Image, ImageList, LoRA
@@ -46,8 +49,13 @@ class DiffusionModel:
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
+        # when a model has text2image ability,
+        # it will be loaded as AutoPipelineForText2Image
+        # for image2image and inpainting,
+        # we convert to the corresponding model
         self._model = None
         self._i2i_model = None  # image to image model
+        self._inpainting_model = None  # inpainting model
         self._lora_model = lora_model
         self._lora_load_kwargs = lora_load_kwargs or {}
         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
@@ -152,6 +160,10 @@
         model=None,
         **kwargs,
     ):
+        import gc
+
+        from ....device_utils import empty_cache
+
         logger.debug(
             "stable diffusion args: %s",
             kwargs,
@@ -159,6 +171,11 @@
         model = model if model is not None else self._model
         assert callable(model)
         images = model(**kwargs).images
+
+        # clean cache
+        gc.collect()
+        empty_cache()
+
         if response_format == "url":
             os.makedirs(XINFERENCE_IMAGE_DIR, exist_ok=True)
             image_list = []
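The gc.collect()/empty_cache() pair frees intermediate tensors after each generation; empty_cache in xinference's device_utils presumably dispatches to the backend-specific cache flush. A hedged standalone equivalent of that cleanup step (CUDA shown as an example backend):

import gc
import torch

def free_generation_memory() -> None:
    # Hedged sketch: drop Python references first, then hand cached device
    # memory back to the allocator (other backends have analogous calls).
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()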
@@ -209,9 +226,17 @@
             **kwargs,
         )

+    @staticmethod
+    def pad_to_multiple(image, multiple=8):
+        x, y = image.size
+        padding_x = (multiple - x % multiple) % multiple
+        padding_y = (multiple - y % multiple) % multiple
+        padding = (0, 0, padding_x, padding_y)
+        return ImageOps.expand(image, padding)
+
     def image_to_image(
         self,
-        image: bytes,
+        image: PIL.Image,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
@@ -232,10 +257,19 @@
             self._i2i_model = model = AutoPipelineForImage2Image.from_pipe(
                 self._model
             )
+
+        if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
+            # Model like SD3 image to image requires image's height and width is times of 16
+            # padding the image if specified
+            image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
+
         if size:
             width, height = map(int, re.split(r"[^\d]+", size))
+            if padding_image_to_multiple:
+                width, height = image.size
             kwargs["width"] = width
             kwargs["height"] = height
+
         self._filter_kwargs(kwargs)
         return self._call_model(
             image=image,
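pad_to_multiple rounds each dimension up to the next multiple by padding only the right and bottom edges, so padding_image_to_multiple=16 turns, say, a 1022x767 input into 1024x768 before it reaches the pipeline. A small hedged sketch of the same arithmetic as a standalone function:

from PIL import Image, ImageOps

def pad_to_multiple(image: Image.Image, multiple: int = 8) -> Image.Image:
    # Same arithmetic as the new DiffusionModel.pad_to_multiple helper:
    # pad right/bottom so each side becomes a multiple of `multiple`.
    x, y = image.size
    padding_x = (multiple - x % multiple) % multiple
    padding_y = (multiple - y % multiple) % multiple
    return ImageOps.expand(image, (0, 0, padding_x, padding_y))

img = Image.new("RGB", (1022, 767))
print(pad_to_multiple(img, multiple=16).size)  # (1024, 768)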
@@ -249,8 +283,8 @@

     def inpainting(
         self,
-        image: bytes,
-        mask_image: bytes,
+        image: PIL.Image,
+        mask_image: PIL.Image,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
@@ -258,7 +292,35 @@
         response_format: str = "url",
         **kwargs,
     ):
+        if "inpainting" not in self._abilities:
+            raise RuntimeError(f"{self._model_uid} does not support inpainting")
+
+        if (
+            "text2image" in self._abilities or "image2image" in self._abilities
+        ) and self._model is not None:
+            from diffusers import AutoPipelineForInpainting
+
+            if self._inpainting_model is not None:
+                model = self._inpainting_model
+            else:
+                model = self._inpainting_model = AutoPipelineForInpainting.from_pipe(
+                    self._model
+                )
+        else:
+            model = self._model
+
         width, height = map(int, re.split(r"[^\d]+", size))
+
+        if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
+            # Model like SD3 inpainting requires image's height and width is times of 16
+            # padding the image if specified
+            image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
+            mask_image = self.pad_to_multiple(
+                mask_image, multiple=int(padding_image_to_multiple)
+            )
+            # calculate actual image size after padding
+            width, height = image.size
+
         return self._call_model(
             image=image,
             mask_image=mask_image,
@@ -268,5 +330,6 @@
             width=width,
             num_images_per_prompt=n,
             response_format=response_format,
+            model=model,
             **kwargs,
         )
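The inpainting path reuses the already-loaded text2image pipeline instead of loading a second copy of the weights: diffusers' AutoPipelineForInpainting.from_pipe shares the underlying components, and the result is cached in self._inpainting_model. A hedged standalone sketch of that pattern (the model id is chosen only for illustration):

from diffusers import AutoPipelineForInpainting, AutoPipelineForText2Image

# Load the base text2image pipeline once ...
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-2-1"  # illustrative model id
)
# ... then derive an inpainting pipeline that shares the same weights in
# memory, mirroring what DiffusionModel now keeps in self._inpainting_model.
inpaint_pipe = AutoPipelineForInpainting.from_pipe(pipe)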
xinference/model/llm/__init__.py
@@ -34,6 +34,7 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    LMDEPLOY_CLASSES,
     MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
@@ -113,10 +114,12 @@ def generate_engine_config_by_model_family(model_family):


 def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
     from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
     from .transformers.deepseek_vl import DeepSeekVLChatModel
     from .transformers.glm4v import Glm4VModel
@@ -147,6 +150,7 @@ def _install():
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel])
+    LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             ChatglmPytorchChatModel,
@@ -160,6 +164,7 @@ def _install():
             InternVLChatModel,
             PytorchModel,
             CogVLM2Model,
+            CogVLM2VideoModel,
             MiniCPMV25Model,
             MiniCPMV26Model,
             Glm4VModel,
@@ -174,6 +179,7 @@ def _install():
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
     SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
+    SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
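SUPPORTED_ENGINES is a plain name-to-classes registry: each engine contributes the model classes it can serve, and launch-time code walks that mapping to find a class suited to the requested model. A hedged, self-contained sketch of the pattern (class and helper names are illustrative, not xinference's actual API):

from typing import Dict, List, Type

class BaseModel:
    @classmethod
    def match(cls, model_format: str) -> bool:
        return False

class LMDeployChatModelSketch(BaseModel):
    @classmethod
    def match(cls, model_format: str) -> bool:
        # Illustrative: real matching also considers family, quantization, etc.
        return model_format in ("awq", "pytorch")

SUPPORTED_ENGINES: Dict[str, List[Type[BaseModel]]] = {
    "LMDEPLOY": [LMDeployChatModelSketch],
}

def pick_model_class(engine: str, model_format: str) -> Type[BaseModel]:
    # Walk the registry for the requested engine and return the first match.
    for cls in SUPPORTED_ENGINES.get(engine, []):
        if cls.match(model_format):
            return cls
    raise ValueError(f"No {engine} class can serve format {model_format!r}")

print(pick_model_class("LMDEPLOY", "awq").__name__)  # LMDeployChatModelSketch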
xinference/model/llm/llm_family.json
@@ -7189,15 +7189,6 @@
       "model_id": "OpenGVLab/InternVL2-4B",
       "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
     },
-    {
-      "model_format": "awq",
-      "model_size_in_billions": 4,
-      "quantizations": [
-        "Int4"
-      ],
-      "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-      "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
-    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 8,
@@ -7209,6 +7200,15 @@
       "model_id": "OpenGVLab/InternVL2-8B",
       "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
     },
+    {
+      "model_format": "awq",
+      "model_size_in_billions": 8,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+      "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 26,
@@ -7342,6 +7342,51 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "cogvlm2-video-llama3-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "CogVLM2-Video achieves state-of-the-art performance on multiple video question answering tasks.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/cogvlm2-video-llama3-chat",
+        "model_revision": "f375ead7d8202ebe2c3d09f1068abdddeb2929fa"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
xinference/model/llm/llm_family.py
@@ -271,6 +271,8 @@ VLLM_CLASSES: List[Type[LLM]] = []

 MLX_CLASSES: List[Type[LLM]] = []

+LMDEPLOY_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}

@@ -1002,6 +1004,11 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")

+    for spec in llm_family.model_specs:
+        model_uri = spec.model_uri
+        if model_uri and not is_valid_model_uri(model_uri):
+            raise ValueError(f"Invalid model URI {model_uri}.")
+
     with UD_LLM_FAMILIES_LOCK:
         for family in BUILTIN_LLM_FAMILIES + UD_LLM_FAMILIES:
             if llm_family.model_name == family.model_name:
@@ -1013,12 +1020,6 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         generate_engine_config_by_model_family(llm_family)

     if persist:
-        # We only validate model URL when persist is True.
-        for spec in llm_family.model_specs:
-            model_uri = spec.model_uri
-            if model_uri and not is_valid_model_uri(model_uri):
-                raise ValueError(f"Invalid model URI {model_uri}.")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
         )
xinference/model/llm/llm_family_modelscope.json
@@ -4778,10 +4778,10 @@
       "model_revision": "master"
     },
     {
-      "model_format": "pytorch",
+      "model_format": "awq",
       "model_size_in_billions": 2,
       "quantizations": [
-        "none"
+        "Int4"
       ],
       "model_hub": "modelscope",
       "model_id": "OpenGVLab/InternVL2-2B-AWQ",
@@ -4812,10 +4812,10 @@
       "model_revision": "master"
     },
     {
-      "model_format": "pytorch",
+      "model_format": "awq",
       "model_size_in_billions": 8,
       "quantizations": [
-        "none"
+        "Int4"
       ],
       "model_hub": "modelscope",
       "model_id": "OpenGVLab/InternVL2-8B-AWQ",
@@ -4834,10 +4834,10 @@
       "model_revision": "master"
     },
     {
-      "model_format": "pytorch",
+      "model_format": "awq",
       "model_size_in_billions": 26,
       "quantizations": [
-        "none"
+        "Int4"
       ],
       "model_hub": "modelscope",
       "model_id": "OpenGVLab/InternVL2-26B-AWQ",
@@ -4856,10 +4856,10 @@
       "model_revision": "master"
     },
     {
-      "model_format": "pytorch",
+      "model_format": "awq",
       "model_size_in_billions": 40,
       "quantizations": [
-        "none"
+        "Int4"
       ],
       "model_hub": "modelscope",
       "model_id": "OpenGVLab/InternVL2-40B-AWQ",
@@ -4878,10 +4878,10 @@
       "model_revision": "master"
     },
     {
-      "model_format": "pytorch",
+      "model_format": "awq",
       "model_size_in_billions": 76,
       "quantizations": [
-        "none"
+        "Int4"
       ],
       "model_hub": "modelscope",
       "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
@@ -4962,6 +4962,52 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "cogvlm2-video-llama3-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "CogVLM2-Video achieves state-of-the-art performance on multiple video question answering tasks.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/cogvlm2-video-llama3-chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,