xinference-1.9.0-py3-none-any.whl → xinference-1.9.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/core/model.py +3 -4
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +15 -0
- xinference/model/embedding/vllm/core.py +33 -7
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +114 -6
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +395 -3
- xinference/model/llm/transformers/core.py +5 -11
- xinference/model/llm/utils.py +1 -1
- xinference/model/llm/vllm/core.py +6 -0
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/METADATA +10 -10
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/RECORD +68 -67
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec.json

@@ -169,7 +169,9 @@
     },
     "virtualenv": {
       "packages": [
-        "
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
         "#system_numpy#"
       ],
       "no_build_isolation": true
@@ -180,7 +182,9 @@
     "model_name": "Qwen-Image",
     "model_family": "stable_diffusion",
     "model_ability": [
-      "text2image"
+      "text2image",
+      "image2image",
+      "inpainting"
     ],
     "model_src": {
       "huggingface": {
@@ -202,7 +206,16 @@
           "Q6_K",
           "Q8_0"
         ],
-        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
       },
       "modelscope": {
         "model_id": "Qwen/Qwen-Image",
@@ -223,7 +236,102 @@
           "Q6_K",
           "Q8_0"
         ],
-        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0",
+          "8steps-V1.1-bf16",
+          "8steps-V1.1"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "guidance_scale": 1.0,
+      "true_cfg_scale": 1.0
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "peft>=0.17.0",
+        "#system_torch#",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image-Edit",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "image2image"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "0b71959872ea3bf4d106c578b7c480ebb133dba7",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image-Edit",
+        "model_revision": "master",
+        "gguf_model_id": "QuantStack/Qwen-Image-Edit-GGUF",
+        "gguf_quantizations": [
+          "Q2_K",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "Qwen_Image_Edit-{quantization}.gguf",
+        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
+        "lightning_versions": [
+          "4steps-V1.0-bf16",
+          "4steps-V1.0",
+          "8steps-V1.0-bf16",
+          "8steps-V1.0"
+        ],
+        "lightning_model_file_name_template": "Qwen-Image-Edit-Lightning-{lightning_version}.safetensors"
       }
     },
     "default_model_config": {
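For orientation, the new lightning_* fields above pair a LoRA repository id with a list of published step/precision variants and a file-name template. Below is a minimal, illustrative Python sketch of how a concrete file name could be derived from those fields; the chosen version string is just one of the listed values, and the snippet is not xinference's actual download code.

    # Illustrative only: resolving a lightning LoRA file name from the spec fields above.
    spec = {
        "lightning_model_id": "lightx2v/Qwen-Image-Lightning",
        "lightning_versions": [
            "4steps-V1.0-bf16", "4steps-V1.0", "8steps-V1.0", "8steps-V1.1-bf16", "8steps-V1.1"
        ],
        "lightning_model_file_name_template": "Qwen-Image-Lightning-{lightning_version}.safetensors",
    }
    chosen = spec["lightning_versions"][-1]  # e.g. "8steps-V1.1"
    file_name = spec["lightning_model_file_name_template"].format(lightning_version=chosen)
    print(spec["lightning_model_id"], file_name)
    # lightx2v/Qwen-Image-Lightning Qwen-Image-Lightning-8steps-V1.1.safetensors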
@@ -232,11 +340,11 @@
       "torch_dtype": "bfloat16"
     },
     "default_generate_config": {
-      "
+      "true_cfg_scale": 4.0
     },
     "virtualenv": {
       "packages": [
-        "
+        "diffusers==0.35.1",
         "peft>=0.17.0",
         "#system_torch#",
         "#system_numpy#"
xinference/model/image/stable_diffusion/core.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import asyncio
 import contextlib
 import gc
 import importlib
@@ -19,6 +20,7 @@ import inspect
 import itertools
 import json
 import logging
+import math
 import os
 import re
 import sys
@@ -30,7 +32,11 @@ import PIL.Image
 import torch
 from PIL import ImageOps
 
-from ....device_utils import
+from ....device_utils import (
+    get_available_device,
+    gpu_count,
+    move_model_to_available_device,
+)
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
 from ..utils import handle_image_result
@@ -89,6 +95,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
+        lightning_model_path: Optional[str] = None,
         **kwargs,
     ):
         self.model_family = model_spec
@@ -115,6 +122,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._kwargs = kwargs
         # gguf
         self._gguf_model_path = gguf_model_path
+        # lightning
+        self._lightning_model_path = lightning_model_path
 
     @property
     def model_ability(self):
@@ -171,7 +180,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             )
             model = model_type.from_pipe(self._model, controlnet=controlnet)
         else:
-
+            try:
+                from diffusers import (
+                    QwenImageImg2ImgPipeline,
+                    QwenImageInpaintPipeline,
+                    QwenImagePipeline,
+                )
+            except ImportError:
+                QwenImagePipeline = None
+                QwenImageImg2ImgPipeline = None
+                QwenImageInpaintPipeline = None
+
+            if QwenImagePipeline is not None and isinstance(
+                self._model, QwenImagePipeline
+            ):
+                # special process for Qwen-image
+                if ability == "image2image":
+                    model = QwenImageImg2ImgPipeline.from_pipe(
+                        self._model, torch_dtype=None
+                    )
+                else:
+                    assert ability == "inpainting"
+                    model = QwenImageInpaintPipeline.from_pipe(
+                        self._model, torch_dtype=None
+                    )
+            else:
+                model = model_type.from_pipe(self._model)
         self._load_to_device(model)
 
         self._ability_to_models[ability, controlnet_name] = model
@@ -237,35 +271,42 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             self._quantize_transformer()
 
+        if (device_count := gpu_count()) > 1 and "device_map" not in self._kwargs:
+            logger.debug(
+                "Device count (%d) > 1, force to set device_map=balanced", device_count
+            )
+            self._kwargs["device_map"] = "balanced"
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-
-
-            self.
-
-
-        except ValueError:
-            if "kontext" in self._model_spec.model_name.lower():
-                # TODO: remove this branch when auto pipeline supports
-                # flux.1-kontext-dev
-                from diffusers import FluxKontextPipeline
-
-                self._model = FluxKontextPipeline.from_pretrained(
-                    self._model_path, **self._kwargs
+        with self._process_lightning(self._kwargs):
+            try:
+                self._model = AutoPipelineModel.from_pretrained(
+                    self._model_path,
+                    **self._kwargs,
                 )
-
-
-
-
-
-
-        self.
-
-
-
-
-
+            except ValueError:
+                if "kontext" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # flux.1-kontext-dev
+                    from diffusers import FluxKontextPipeline
+
+                    self._model = FluxKontextPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                elif "qwen" in self._model_spec.model_name.lower():
+                    # TODO: remove this branch when auto pipeline supports
+                    # Qwen-Image
+                    from diffusers import DiffusionPipeline
+
+                    self._model = DiffusionPipeline.from_pretrained(
+                        self._model_path, **self._kwargs
+                    )
+                else:
+                    raise
+            self._load_to_device(self._model)
+            self._apply_lora()
 
         if self._kwargs.get("deepcache", False):
             try:
@@ -440,6 +481,44 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             config=os.path.join(self._model_path, "transformer"),
         )
 
+    @contextlib.contextmanager
+    def _process_lightning(self, kwargs):
+        lightning_model_path = self._lightning_model_path
+        if not lightning_model_path:
+            yield
+            return
+
+        from diffusers import FlowMatchEulerDiscreteScheduler
+
+        if "qwen" in self._model_spec.model_name.lower():
+            scheduler_config = {
+                "base_image_seq_len": 256,
+                "base_shift": math.log(3),  # We use shift=3 in distillation
+                "invert_sigmas": False,
+                "max_image_seq_len": 8192,
+                "max_shift": math.log(3),  # We use shift=3 in distillation
+                "num_train_timesteps": 1000,
+                "shift": 1.0,
+                "shift_terminal": None,  # set shift_terminal to None
+                "stochastic_sampling": False,
+                "time_shift_type": "exponential",
+                "use_beta_sigmas": False,
+                "use_dynamic_shifting": True,
+                "use_exponential_sigmas": False,
+                "use_karras_sigmas": False,
+            }
+            scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+            kwargs["scheduler"] = scheduler
+
+            yield
+
+            model = self._model
+            logger.debug("Loading lightning lora: %s", self._lightning_model_path)
+            model.load_lora_weights(self._lightning_model_path)
+        else:
+            logger.debug("No lightning applied")
+            yield
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
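The _process_lightning helper added above follows the usual contextlib pattern: mutate the load kwargs before yield, then run post-load work (here, attaching the lightning LoRA) once the pipeline exists. A self-contained sketch of that pattern with purely illustrative names, not xinference's actual API:

    import contextlib

    @contextlib.contextmanager
    def prepare_and_finalize(load_kwargs):
        # before the body runs: inject extra load-time options
        load_kwargs["scheduler"] = "flow-match-euler"  # stand-in for a real scheduler object
        yield
        # after the body runs: the pipeline exists, so post-processing (e.g. LoRA loading) can happen
        print("pipeline loaded, applying extra weights")

    kwargs = {}
    with prepare_and_finalize(kwargs):
        pipeline = dict(kwargs)  # stand-in for AutoPipelineModel.from_pretrained(path, **kwargs)
    print(pipeline)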
@@ -687,7 +766,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             await self._image_batch_scheduler.add_request(
                 prompt, future, n, size, response_format, **kwargs
             )
-            import asyncio
 
             fut = asyncio.wrap_future(future)
             return await fut
@@ -702,6 +780,18 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._image_batch_scheduler and not self._image_batch_scheduler._running:
             await self._image_batch_scheduler.start()
 
+    def _gen_config_for_lightning(self, kwargs):
+        if (
+            not kwargs.get("num_inference_steps")
+            and self._lightning_model_path is not None
+        ):
+            is_4_steps = "4steps" in self._lightning_model_path
+            if is_4_steps:
+                kwargs["num_inference_steps"] = 4
+            else:
+                assert "8steps" in self._lightning_model_path
+                kwargs["num_inference_steps"] = 8
+
     async def _direct_text_to_image(
         self,
         prompt: str,
@@ -714,14 +804,28 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         generate_kwargs = self._model_spec.default_generate_config.copy()  # type: ignore
         generate_kwargs.update({k: v for k, v in kwargs.items() if v is not None})
         generate_kwargs["width"], generate_kwargs["height"] = width, height
+        self._gen_config_for_lightning(generate_kwargs)
 
-        return
-
-
+        return await asyncio.to_thread(
+            self._call_model,
+            prompt=prompt,  # type: ignore
+            num_images_per_prompt=n,  # type: ignore
             response_format=response_format,
             **generate_kwargs,
         )
 
+    async def abort_request(self, request_id: str) -> str:
+        """Abort a running request."""
+        from ....model.scheduler.core import AbortRequestMessage
+
+        # Check if we have a cancel callback for this request
+        if hasattr(self, "_cancel_callbacks") and request_id in self._cancel_callbacks:
+            cancel_callback = self._cancel_callbacks.pop(request_id)
+            cancel_callback()
+            return AbortRequestMessage.DONE.name
+
+        return AbortRequestMessage.NO_OP.name
+
     @staticmethod
     def pad_to_multiple(image, multiple=8):
         x, y = image.size
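The rewritten _direct_text_to_image wraps the blocking pipeline call in asyncio.to_thread so the event loop stays responsive while images are generated. A small standalone sketch of that standard-library pattern; the generate function below is a placeholder for the blocking call, not xinference code:

    import asyncio
    import time

    def blocking_generate(prompt: str) -> str:
        # stand-in for a slow, synchronous diffusion call
        time.sleep(0.1)
        return f"image for {prompt!r}"

    async def main() -> None:
        # run the blocking function in a worker thread and await its result
        result = await asyncio.to_thread(blocking_generate, "a cat wearing a hat")
        print(result)

    asyncio.run(main())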
@@ -769,6 +873,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if allow_width_height:
             kwargs["width"], kwargs["height"] = image.size
 
+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             prompt=prompt,
@@ -819,6 +926,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         # calculate actual image size after padding
         kwargs["width"], kwargs["height"] = image.size
 
+        # generate config for lightning
+        self._gen_config_for_lightning(kwargs)
+
         return self._call_model(
             image=image,
             mask_image=mask_image,
xinference/model/llm/llama_cpp/core.py

@@ -19,11 +19,11 @@ import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
-import
+from packaging import version
 
 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
 
@@ -98,10 +98,19 @@ class XllamaCppModel(LLM, ChatModelMixin):
             from xllamacpp import (
                 CommonParams,
                 Server,
+                __version__,
                 estimate_gpu_layers,
                 get_device_info,
                 ggml_backend_dev_type,
             )
+
+            try:
+                if version.parse(__version__) < version.parse("0.2.0"):
+                    raise RuntimeError(
+                        "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
+                    )
+            except version.InvalidVersion:
+                pass  # If the version parse failed, we just skip the version check.
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
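The new guard compares the installed xllamacpp version using packaging's version parser before continuing. A standalone sketch of the same check with a hypothetical installed version string (the value assigned to installed is made up for illustration):

    from packaging import version

    installed = "0.1.14"  # hypothetical value of xllamacpp.__version__
    try:
        if version.parse(installed) < version.parse("0.2.0"):
            raise RuntimeError(
                "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
            )
    except version.InvalidVersion:
        pass  # skip the check if the version string cannot be parsed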
@@ -160,6 +169,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
+            params.use_jinja = True
         # This is the default value, could be overwritten by _llamacpp_model_config
         params.n_parallel = min(8, os.cpu_count() or 1)
         for k, v in self._llamacpp_model_config.items():
@@ -208,7 +218,8 @@
                 )
                 logger.info("Estimate num gpu layers: %s", estimate)
                 if estimate.tensor_split:
-
+                    for i in range(len(estimate.tensor_split)):
+                        params.tensor_split[i] = estimate.tensor_split[i]
                 else:
                     params.n_gpu_layers = estimate.layers
             except Exception as e:
@@ -242,28 +253,18 @@
                 {
                     "prompt": prompt,
                     "stream": stream,
+                    "model": self.model_uid,
                 }
             )
-
-
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-            except Exception as e:
-                logger.exception("handle_completions callback failed: %s", e)
-                q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
+            self._llm.handle_completions(data, _callback)
         except Exception as ex:
             logger.exception("handle_completions failed: %s", ex)
             q.put(_Error(str(ex)))
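Here, and in the chat-completions hunk that follows, the server callback payloads are funnelled through a single _callback into a queue, with any payload carrying a "code" field treated as an error. A minimal sketch of that callback-to-queue bridging pattern with stand-in names; the sentinel and the fake server are illustrative and not part of xllamacpp:

    import queue
    import threading

    q: queue.Queue = queue.Queue()
    _DONE = object()  # illustrative end-of-stream sentinel

    def _callback(res: dict) -> None:
        # invoked once per chunk; error payloads carry a "code" field
        if res.get("code"):
            q.put(RuntimeError(str(res)))
        else:
            q.put(res)

    def fake_server(cb) -> None:
        # stand-in for the server driving the callback from another thread
        for i in range(3):
            cb({"choices": [{"delta": {"content": f"chunk {i}"}}]})
        q.put(_DONE)

    threading.Thread(target=fake_server, args=(_callback,)).start()
    while (item := q.get()) is not _DONE:
        if isinstance(item, Exception):
            raise item
        print(item)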
@@ -296,6 +297,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()
 
@@ -310,30 +320,21 @@
                     "messages": messages,
                     "stream": stream,
                     "tools": tools,
+                    "model": self.model_uid,
                 }
             )
-
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
 
-
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-
-                logger.exception("handle_chat_completions callback failed: %s", e)
-                q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_chat_completions(
-                prompt_json, _error_callback, _ok_callback
-            )
+            self._llm.handle_chat_completions(data, _callback)
         except Exception as ex:
             logger.exception("handle_chat_completions failed: %s", ex)
             q.put(_Error(str(ex)))