xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +107 -11
- xinference/client/restful/restful_client.py +51 -11
- xinference/constants.py +5 -1
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/supervisor.py +1 -1
- xinference/core/utils.py +1 -1
- xinference/core/worker.py +33 -39
- xinference/deploy/cmdline.py +17 -0
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +2 -1
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +63 -46
- xinference/model/audio/model_spec_modelscope.json +31 -14
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +40 -115
- xinference/model/llm/core.py +29 -6
- xinference/model/llm/llama_cpp/core.py +30 -347
- xinference/model/llm/llm_family.json +1674 -2203
- xinference/model/llm/llm_family.py +71 -7
- xinference/model/llm/llm_family_csghub.json +0 -32
- xinference/model/llm/llm_family_modelscope.json +1838 -2016
- xinference/model/llm/llm_family_openmind_hub.json +19 -325
- xinference/model/llm/lmdeploy/core.py +7 -2
- xinference/model/llm/mlx/core.py +23 -7
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +39 -11
- xinference/model/llm/transformers/chatglm.py +9 -2
- xinference/model/llm/transformers/cogagent.py +10 -12
- xinference/model/llm/transformers/cogvlm2.py +6 -3
- xinference/model/llm/transformers/cogvlm2_video.py +3 -6
- xinference/model/llm/transformers/core.py +58 -60
- xinference/model/llm/transformers/deepseek_v2.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +10 -4
- xinference/model/llm/transformers/deepseek_vl2.py +9 -4
- xinference/model/llm/transformers/gemma3.py +4 -5
- xinference/model/llm/transformers/glm4v.py +3 -21
- xinference/model/llm/transformers/glm_edge_v.py +3 -20
- xinference/model/llm/transformers/intern_vl.py +3 -6
- xinference/model/llm/transformers/internlm2.py +1 -1
- xinference/model/llm/transformers/minicpmv25.py +4 -2
- xinference/model/llm/transformers/minicpmv26.py +5 -3
- xinference/model/llm/transformers/omnilmm.py +1 -1
- xinference/model/llm/transformers/opt.py +1 -1
- xinference/model/llm/transformers/ovis2.py +302 -0
- xinference/model/llm/transformers/qwen-omni.py +8 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +5 -1
- xinference/model/llm/transformers/qwen_vl.py +5 -2
- xinference/model/llm/utils.py +96 -45
- xinference/model/llm/vllm/core.py +108 -24
- xinference/model/llm/vllm/distributed_executor.py +8 -7
- xinference/model/llm/vllm/xavier/allocator.py +1 -1
- xinference/model/llm/vllm/xavier/block_manager.py +1 -1
- xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
- xinference/model/llm/vllm/xavier/executor.py +1 -1
- xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +157 -13
- xinference/model/video/model_spec.json +100 -0
- xinference/model/video/model_spec_modelscope.json +104 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +2 -71
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
- xinference/web/ui/src/locales/en.json +7 -4
- xinference/web/ui/src/locales/zh.json +7 -4
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/model/llm/transformers/compression.py +0 -258
- xinference/model/llm/transformers/yi_vl.py +0 -239
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
- xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
- xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
- /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/rerank/__init__.py
CHANGED
@@ -56,29 +56,8 @@ def register_custom_model():
 
 
 def _install():
-    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
-    _model_spec_modelscope_json = os.path.join(
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)
 
     # register model description after recording model revision
     for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:
@@ -94,5 +73,15 @@ def _install():
     for ud_rerank in get_user_defined_reranks():
         RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))
 
+
+def load_model_family_from_json(json_filename, target_families):
+    _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], RerankModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
     del _model_spec_json
-    del _model_spec_modelscope_json
xinference/model/video/__init__.py
CHANGED
@@ -30,29 +30,8 @@ from .core import (
 
 
 def _install():
-    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
-    _model_spec_modelscope_json = os.path.join(
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_VIDEO_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_VIDEO_MODELS)
 
     # register model description
     for model_name, model_spec in chain(
@@ -60,5 +39,16 @@ def _install():
     ):
         VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec))
 
-    del _model_spec_json
-    del _model_spec_modelscope_json
+
+def load_model_family_from_json(json_filename, target_families):
+    json_path = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], VideoModelFamilyV1(**spec))
+            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    del json_path
xinference/model/video/core.py
CHANGED
@@ -19,7 +19,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import valid_model_revision
-from .diffusers import DiffUsersVideoModel
+from .diffusers import DiffusersVideoModel
 
 logger = logging.getLogger(__name__)
 
@@ -169,13 +169,13 @@ def create_video_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[DiffUsersVideoModel, VideoModelDescription]:
+) -> Tuple[DiffusersVideoModel, VideoModelDescription]:
     model_spec = match_diffusion(model_name, download_hub)
     if not model_path:
         model_path = cache(model_spec)
     assert model_path is not None
 
-    model = DiffUsersVideoModel(
+    model = DiffusersVideoModel(
         model_uid,
         model_path,
         model_spec,
xinference/model/video/diffusers.py
CHANGED
@@ -14,12 +14,13 @@
 
 import base64
 import logging
+import operator
 import os
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
-from typing import TYPE_CHECKING, List, Union
+from functools import partial, reduce
+from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -29,6 +30,7 @@ from ...device_utils import gpu_count, move_model_to_available_device
 from ...types import Video, VideoList
 
 if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
     from .core import VideoModelFamilyV1
 
 
@@ -53,7 +55,7 @@ def export_to_video_imageio(
     return output_video_path
 
 
-class DiffUsersVideoModel:
+class DiffusersVideoModel:
     def __init__(
         self,
         model_uid: str,
@@ -64,6 +66,7 @@ class DiffUsersVideoModel:
         self._model_uid = model_uid
         self._model_path = model_path
         self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
         self._model = None
         self._kwargs = kwargs
 
@@ -71,6 +74,10 @@ class DiffUsersVideoModel:
     def model_spec(self):
         return self._model_spec
 
+    @property
+    def model_ability(self):
+        return self._abilities
+
     def load(self):
         import torch
 
@@ -105,6 +112,28 @@ class DiffUsersVideoModel:
             pipeline = self._model = HunyuanVideoPipeline.from_pretrained(
                 self._model_path, transformer=transformer, **kwargs
             )
+        elif self.model_spec.model_family == "Wan":
+            from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanPipeline
+            from transformers import CLIPVisionModel
+
+            if "text2video" in self.model_spec.model_ability:
+                pipeline = self._model = WanPipeline.from_pretrained(
+                    self._model_path, **kwargs
+                )
+            else:
+                assert "image2video" in self.model_spec.model_ability
+
+                image_encoder = CLIPVisionModel.from_pretrained(
+                    self._model_path,
+                    subfolder="image_encoder",
+                    torch_dtype=torch.float32,
+                )
+                vae = AutoencoderKLWan.from_pretrained(
+                    self._model_path, subfolder="vae", torch_dtype=torch.float32
+                )
+                pipeline = self._model = WanImageToVideoPipeline.from_pretrained(
+                    self._model_path, vae=vae, image_encoder=image_encoder, **kwargs
+                )
         else:
             raise Exception(
                 f"Unsupported model family: {self._model_spec.model_family}"
@@ -119,13 +148,53 @@ class DiffUsersVideoModel:
             pipeline.transformer = torch.compile(
                 pipeline.transformer, mode="max-autotune", fullgraph=True
             )
+        if kwargs.get("layerwise_cast", False):
+            compute_dtype = pipeline.transformer.dtype
+            pipeline.transformer.enable_layerwise_casting(
+                storage_dtype=torch.float8_e4m3fn, compute_dtype=compute_dtype
+            )
         if kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
             pipeline.enable_model_cpu_offload()
             if kwargs.get("sequential_cpu_offload", True):
                 pipeline.enable_sequential_cpu_offload()
-            pipeline.vae.enable_slicing()
-            pipeline.vae.enable_tiling()
+            try:
+                pipeline.vae.enable_slicing()
+            except AttributeError:
+                # model does not support slicing
+                pass
+            try:
+                pipeline.vae.enable_tiling()
+            except AttributeError:
+                # model does support tiling
+                pass
+        elif kwargs.get("group_offload", False):
+            from diffusers.hooks.group_offloading import apply_group_offloading
+
+            onload_device = torch.device("cuda")
+            offload_device = torch.device("cpu")
+
+            apply_group_offloading(
+                pipeline.text_encoder,
+                onload_device=onload_device,
+                offload_device=offload_device,
+                offload_type="block_level",
+                num_blocks_per_group=4,
+            )
+            group_offload_kwargs = {}
+            if kwargs.get("use_stream", False):
+                group_offload_kwargs["offload_type"] = "block_level"
+                group_offload_kwargs["num_blocks_per_group"] = 4
+            else:
+                group_offload_kwargs["offload_type"] = "leaf_level"
+                group_offload_kwargs["use_stream"] = True
+            pipeline.transformer.enable_group_offload(
+                onload_device=onload_device,
+                offload_device=offload_device,
+                **group_offload_kwargs,
+            )
+            # Since we've offloaded the larger models already, we can move the rest of the model components to GPU
+            pipeline = move_model_to_available_device(pipeline)
         elif not kwargs.get("device_map"):
             logger.debug("Loading model to available device")
             if gpu_count() > 1:
@@ -135,6 +204,26 @@ class DiffUsersVideoModel:
             # Recommended if your computer has < 64 GB of RAM
             pipeline.enable_attention_slicing()
 
+    @staticmethod
+    def _process_progressor(kwargs: dict):
+        import diffusers
+
+        progressor: Progressor = kwargs.pop("progressor", None)
+
+        def report_status_callback(
+            pipe: diffusers.DiffusionPipeline,
+            step: int,
+            timestep: int,
+            callback_kwargs: dict,
+        ):
+            num_steps = pipe.num_timesteps
+            progressor.set_progress((step + 1) / num_steps)
+
+            return callback_kwargs
+
+        if progressor and progressor.request_id:
+            kwargs["callback_on_step_end"] = report_status_callback
+
     def text_to_video(
         self,
         prompt: str,
@@ -143,27 +232,77 @@ class DiffUsersVideoModel:
         response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
-        import gc
-
-        # cv2 bug will cause the video cannot be normally displayed
-        # thus we use the imageio one
-        # from diffusers.utils import export_to_video
-        from ...device_utils import empty_cache
-
         assert self._model is not None
         assert callable(self._model)
         generate_kwargs = self._model_spec.default_generate_config.copy()
         generate_kwargs.update(kwargs)
         generate_kwargs["num_videos_per_prompt"] = n
+        fps = generate_kwargs.pop("fps", 10)
         logger.debug(
             "diffusers text_to_video args: %s",
             generate_kwargs,
         )
+        self._process_progressor(generate_kwargs)
         output = self._model(
             prompt=prompt,
             num_inference_steps=num_inference_steps,
             **generate_kwargs,
         )
+        return self._output_to_video(output, fps, response_format)
+
+    def image_to_video(
+        self,
+        image: PIL.Image,
+        prompt: str,
+        n: int = 1,
+        num_inference_steps: Optional[int] = None,
+        response_format: str = "b64_json",
+        **kwargs,
+    ):
+        assert self._model is not None
+        assert callable(self._model)
+        generate_kwargs = self._model_spec.default_generate_config.copy()
+        generate_kwargs.update(kwargs)
+        generate_kwargs["num_videos_per_prompt"] = n
+        if num_inference_steps:
+            generate_kwargs["num_inference_steps"] = num_inference_steps
+        fps = generate_kwargs.pop("fps", 10)
+
+        # process image
+        max_area = generate_kwargs.pop("max_area")
+        if isinstance(max_area, str):
+            max_area = [int(v) for v in max_area.split("*")]
+        max_area = reduce(operator.mul, max_area, 1)
+        image = self._process_image(image, max_area)
+
+        height, width = image.height, image.width
+        generate_kwargs.pop("width", None)
+        generate_kwargs.pop("height", None)
+        self._process_progressor(generate_kwargs)
+        output = self._model(
+            image=image, prompt=prompt, height=height, width=width, **generate_kwargs
+        )
+        return self._output_to_video(output, fps, response_format)
+
+    def _process_image(self, image: PIL.Image, max_area: int) -> PIL.Image:
+        assert self._model is not None
+        aspect_ratio = image.height / image.width
+        mod_value = (
+            self._model.vae_scale_factor_spatial
+            * self._model.transformer.config.patch_size[1]
+        )
+        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+        return image.resize((width, height))
+
+    def _output_to_video(self, output: Any, fps: int, response_format: str):
+        import gc
+
+        # cv2 bug will cause the video cannot be normally displayed
+        # thus we use the imageio one
+        from diffusers.utils import export_to_video
+
+        from ...device_utils import empty_cache
 
         # clean cache
         gc.collect()
@@ -173,7 +312,12 @@ class DiffUsersVideoModel:
         urls = []
         for f in output.frames:
             path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4")
-            p = export_to_video_imageio(f, path, fps=10)
+            export = (
+                export_to_video
+                if self.model_spec.model_family != "CogVideoX"
+                else export_to_video_imageio
+            )
+            p = export(f, path, fps=fps)
             urls.append(p)
         if response_format == "url":
             return VideoList(
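The new _process_image helper above snaps the input frame to the largest size under the spec's max_area budget whose sides are divisible by the transformer patch grid. A worked sketch of that arithmetic (not part of the diff), assuming the typical Wan-Diffusers values vae_scale_factor_spatial = 8 and patch_size[1] = 2, so mod_value = 16, and the 480p budget max_area = 480 * 832:

import numpy as np

max_area = 480 * 832       # 399,360 pixels, from the new "max_area": [480, 832] spec entry
aspect_ratio = 720 / 1280  # e.g. a 1280x720 input frame
mod_value = 8 * 2          # vae_scale_factor_spatial * patch_size[1], assumed values

# same expressions as _process_image above
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
print(height, width)       # 464 832 -> the frame would be resized to 832x464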
xinference/model/video/model_spec.json
CHANGED
@@ -45,5 +45,105 @@
     },
     "default_generate_config": {
     }
+  },
+  {
+    "model_name": "Wan2.1-1.3B",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    "model_revision": "0fad780a534b6463e45facd96134c9f345acfa5b",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-14B",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "model_revision": "38ec498cb3208fb688890f8cc7e94ede2cbd7f68",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-480p",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "model_revision": "b184e23a8a16b20f108f727c902e769e873ffc73",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        480,
+        832
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-720p",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+    "model_revision": "eb849f76dfa246545b65774a9e25943ee69b3fa3",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        720,
+        1280
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
   }
 ]
xinference/model/video/model_spec_modelscope.json
CHANGED
@@ -48,5 +48,109 @@
     },
     "default_generate_config": {
     }
+  },
+  {
+    "model_name": "Wan2.1-1.3B",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-14B",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-480p",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        480,
+        832
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-720p",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        720,
+        1280
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
   }
 ]
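The spec entries above are what the Python client resolves by model name. A minimal usage sketch, assuming a local server at the default endpoint; launch_model, get_model and text_to_video follow the existing client API, while the exact client-side handle for the new image2video ability (touched in restful_client.py above) is an assumption and not shown:

from xinference.client import Client

client = Client("http://localhost:9997")

# launch one of the newly registered Wan text2video specs by name
uid = client.launch_model(model_name="Wan2.1-1.3B", model_type="video")
video_model = client.get_model(uid)

# mirrors DiffusersVideoModel.text_to_video; unspecified kwargs fall back to the
# spec's default_generate_config
result = video_model.text_to_video(prompt="a timelapse of clouds over snowy mountains")
print(result["data"][0])  # video entry with a url or b64_json payload

# For the i2v specs (Wan2.1-i2v-14B-480p/720p) an image-to-video call with an input
# frame would be used instead; its client signature is assumed, not confirmed here.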
xinference/thirdparty/cosyvoice/bin/average_model.py
CHANGED
@@ -75,10 +75,11 @@ def main():
         print('Processing {}'.format(path))
         states = torch.load(path, map_location=torch.device('cpu'))
         for k in states.keys():
-            if k not in avg.keys():
-                avg[k] = states[k].clone()
-            else:
-                avg[k] += states[k]
+            if k not in ['step', 'epoch']:
+                if k not in avg.keys():
+                    avg[k] = states[k].clone()
+                else:
+                    avg[k] += states[k]
     # average
     for k in avg.keys():
         if avg[k] is not None: