xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/rerank/__init__.py

@@ -56,29 +56,8 @@ def register_custom_model():
 
 
 def _install():
-    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
-    _model_spec_modelscope_json = os.path.join(
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_RERANK_MODELS.update(
-        dict(
-            (spec["model_name"], RerankModelSpec(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)
 
     # register model description after recording model revision
     for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:
@@ -94,5 +73,15 @@ def _install():
     for ud_rerank in get_user_defined_reranks():
         RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))
 
+
+def load_model_family_from_json(json_filename, target_families):
+    _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], RerankModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
     del _model_spec_json
-    del _model_spec_modelscope_json
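Both hunks above fold the duplicated HuggingFace/ModelScope spec-loading loops into one helper. The following standalone sketch (not part of the diff) shows that consolidated pattern in isolation; it uses plain dicts in place of RerankModelSpec, takes a full path instead of a package-relative filename, and feeds a throwaway spec file, so every name and value here is illustrative only.

import codecs
import json
import os
import tempfile
from collections import defaultdict

# Simplified stand-ins for the registries touched above; the real code stores
# RerankModelSpec objects and resolves the JSON path relative to the package.
BUILTIN_RERANK_MODELS: dict = {}
MODEL_NAME_TO_REVISION = defaultdict(list)

def load_model_family_from_json(json_path, target_families):
    # Load every spec entry and record its revision, as in the hunk above.
    with codecs.open(json_path, "r", encoding="utf-8") as f:
        specs = json.load(f)
    target_families.update((spec["model_name"], spec) for spec in specs)
    for model_name, model_spec in target_families.items():
        MODEL_NAME_TO_REVISION[model_name].append(model_spec.get("model_revision"))

if __name__ == "__main__":
    # Feed the helper a throwaway spec file to show the registries being filled.
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "model_spec.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump([{"model_name": "demo-rerank", "model_revision": "rev0"}], f)
        load_model_family_from_json(path, BUILTIN_RERANK_MODELS)
    print(BUILTIN_RERANK_MODELS)         # {'demo-rerank': {...}}
    print(dict(MODEL_NAME_TO_REVISION))  # {'demo-rerank': ['rev0']}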
xinference/model/video/__init__.py

@@ -30,29 +30,8 @@ from .core import (
 
 
 def _install():
-    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
-    _model_spec_modelscope_json = os.path.join(
-        os.path.dirname(__file__), "model_spec_modelscope.json"
-    )
-    BUILTIN_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in BUILTIN_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
-    MODELSCOPE_VIDEO_MODELS.update(
-        dict(
-            (spec["model_name"], VideoModelFamilyV1(**spec))
-            for spec in json.load(
-                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
-            )
-        )
-    )
-    for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    load_model_family_from_json("model_spec.json", BUILTIN_VIDEO_MODELS)
+    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_VIDEO_MODELS)
 
     # register model description
     for model_name, model_spec in chain(
@@ -60,5 +39,16 @@ def _install():
     ):
         VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec))
 
-    del _model_spec_json
-    del _model_spec_modelscope_json
+
+def load_model_family_from_json(json_filename, target_families):
+    json_path = os.path.join(os.path.dirname(__file__), json_filename)
+    target_families.update(
+        dict(
+            (spec["model_name"], VideoModelFamilyV1(**spec))
+            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in target_families.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    del json_path
xinference/model/video/core.py

@@ -19,7 +19,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import valid_model_revision
-from .diffusers import DiffUsersVideoModel
+from .diffusers import DiffusersVideoModel
 
 logger = logging.getLogger(__name__)
 
@@ -169,13 +169,13 @@ def create_video_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[DiffUsersVideoModel, VideoModelDescription]:
+) -> Tuple[DiffusersVideoModel, VideoModelDescription]:
     model_spec = match_diffusion(model_name, download_hub)
     if not model_path:
        model_path = cache(model_spec)
     assert model_path is not None
 
-    model = DiffUsersVideoModel(
+    model = DiffusersVideoModel(
         model_uid,
         model_path,
         model_spec,
xinference/model/video/diffusers.py

@@ -14,12 +14,13 @@
 
 import base64
 import logging
+import operator
 import os
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
-from typing import TYPE_CHECKING, List, Union
+from functools import partial, reduce
+from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -29,6 +30,7 @@ from ...device_utils import gpu_count, move_model_to_available_device
 from ...types import Video, VideoList
 
 if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
     from .core import VideoModelFamilyV1
 
 
@@ -53,7 +55,7 @@ def export_to_video_imageio(
     return output_video_path
 
 
-class DiffUsersVideoModel:
+class DiffusersVideoModel:
     def __init__(
         self,
         model_uid: str,
@@ -64,6 +66,7 @@
         self._model_uid = model_uid
         self._model_path = model_path
         self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
         self._model = None
         self._kwargs = kwargs
 
@@ -71,6 +74,10 @@
     def model_spec(self):
         return self._model_spec
 
+    @property
+    def model_ability(self):
+        return self._abilities
+
     def load(self):
         import torch
 
@@ -105,6 +112,28 @@
             pipeline = self._model = HunyuanVideoPipeline.from_pretrained(
                 self._model_path, transformer=transformer, **kwargs
             )
+        elif self.model_spec.model_family == "Wan":
+            from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanPipeline
+            from transformers import CLIPVisionModel
+
+            if "text2video" in self.model_spec.model_ability:
+                pipeline = self._model = WanPipeline.from_pretrained(
+                    self._model_path, **kwargs
+                )
+            else:
+                assert "image2video" in self.model_spec.model_ability
+
+                image_encoder = CLIPVisionModel.from_pretrained(
+                    self._model_path,
+                    subfolder="image_encoder",
+                    torch_dtype=torch.float32,
+                )
+                vae = AutoencoderKLWan.from_pretrained(
+                    self._model_path, subfolder="vae", torch_dtype=torch.float32
+                )
+                pipeline = self._model = WanImageToVideoPipeline.from_pretrained(
+                    self._model_path, vae=vae, image_encoder=image_encoder, **kwargs
+                )
         else:
             raise Exception(
                 f"Unsupported model family: {self._model_spec.model_family}"
@@ -119,13 +148,53 @@
             pipeline.transformer = torch.compile(
                 pipeline.transformer, mode="max-autotune", fullgraph=True
             )
+        if kwargs.get("layerwise_cast", False):
+            compute_dtype = pipeline.transformer.dtype
+            pipeline.transformer.enable_layerwise_casting(
+                storage_dtype=torch.float8_e4m3fn, compute_dtype=compute_dtype
+            )
         if kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
             pipeline.enable_model_cpu_offload()
             if kwargs.get("sequential_cpu_offload", True):
                 pipeline.enable_sequential_cpu_offload()
-            pipeline.vae.enable_slicing()
-            pipeline.vae.enable_tiling()
+            try:
+                pipeline.vae.enable_slicing()
+            except AttributeError:
+                # model does not support slicing
+                pass
+            try:
+                pipeline.vae.enable_tiling()
+            except AttributeError:
+                # model does support tiling
+                pass
+        elif kwargs.get("group_offload", False):
+            from diffusers.hooks.group_offloading import apply_group_offloading
+
+            onload_device = torch.device("cuda")
+            offload_device = torch.device("cpu")
+
+            apply_group_offloading(
+                pipeline.text_encoder,
+                onload_device=onload_device,
+                offload_device=offload_device,
+                offload_type="block_level",
+                num_blocks_per_group=4,
+            )
+            group_offload_kwargs = {}
+            if kwargs.get("use_stream", False):
+                group_offload_kwargs["offload_type"] = "block_level"
+                group_offload_kwargs["num_blocks_per_group"] = 4
+            else:
+                group_offload_kwargs["offload_type"] = "leaf_level"
+                group_offload_kwargs["use_stream"] = True
+            pipeline.transformer.enable_group_offload(
+                onload_device=onload_device,
+                offload_device=offload_device,
+                **group_offload_kwargs,
+            )
+            # Since we've offloaded the larger models already, we can move the rest of the model components to GPU
+            pipeline = move_model_to_available_device(pipeline)
         elif not kwargs.get("device_map"):
             logger.debug("Loading model to available device")
             if gpu_count() > 1:
@@ -135,6 +204,26 @@
         # Recommended if your computer has < 64 GB of RAM
         pipeline.enable_attention_slicing()
 
+    @staticmethod
+    def _process_progressor(kwargs: dict):
+        import diffusers
+
+        progressor: Progressor = kwargs.pop("progressor", None)
+
+        def report_status_callback(
+            pipe: diffusers.DiffusionPipeline,
+            step: int,
+            timestep: int,
+            callback_kwargs: dict,
+        ):
+            num_steps = pipe.num_timesteps
+            progressor.set_progress((step + 1) / num_steps)
+
+            return callback_kwargs
+
+        if progressor and progressor.request_id:
+            kwargs["callback_on_step_end"] = report_status_callback
+
     def text_to_video(
         self,
         prompt: str,
@@ -143,27 +232,77 @@
         response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
-        import gc
-
-        # cv2 bug will cause the video cannot be normally displayed
-        # thus we use the imageio one
-        # from diffusers.utils import export_to_video
-        from ...device_utils import empty_cache
-
         assert self._model is not None
         assert callable(self._model)
         generate_kwargs = self._model_spec.default_generate_config.copy()
         generate_kwargs.update(kwargs)
         generate_kwargs["num_videos_per_prompt"] = n
+        fps = generate_kwargs.pop("fps", 10)
         logger.debug(
             "diffusers text_to_video args: %s",
             generate_kwargs,
         )
+        self._process_progressor(generate_kwargs)
         output = self._model(
             prompt=prompt,
             num_inference_steps=num_inference_steps,
             **generate_kwargs,
         )
+        return self._output_to_video(output, fps, response_format)
+
+    def image_to_video(
+        self,
+        image: PIL.Image,
+        prompt: str,
+        n: int = 1,
+        num_inference_steps: Optional[int] = None,
+        response_format: str = "b64_json",
+        **kwargs,
+    ):
+        assert self._model is not None
+        assert callable(self._model)
+        generate_kwargs = self._model_spec.default_generate_config.copy()
+        generate_kwargs.update(kwargs)
+        generate_kwargs["num_videos_per_prompt"] = n
+        if num_inference_steps:
+            generate_kwargs["num_inference_steps"] = num_inference_steps
+        fps = generate_kwargs.pop("fps", 10)
+
+        # process image
+        max_area = generate_kwargs.pop("max_area")
+        if isinstance(max_area, str):
+            max_area = [int(v) for v in max_area.split("*")]
+        max_area = reduce(operator.mul, max_area, 1)
+        image = self._process_image(image, max_area)
+
+        height, width = image.height, image.width
+        generate_kwargs.pop("width", None)
+        generate_kwargs.pop("height", None)
+        self._process_progressor(generate_kwargs)
+        output = self._model(
+            image=image, prompt=prompt, height=height, width=width, **generate_kwargs
+        )
+        return self._output_to_video(output, fps, response_format)
+
+    def _process_image(self, image: PIL.Image, max_area: int) -> PIL.Image:
+        assert self._model is not None
+        aspect_ratio = image.height / image.width
+        mod_value = (
+            self._model.vae_scale_factor_spatial
+            * self._model.transformer.config.patch_size[1]
+        )
+        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+        return image.resize((width, height))
+
+    def _output_to_video(self, output: Any, fps: int, response_format: str):
+        import gc
+
+        # cv2 bug will cause the video cannot be normally displayed
+        # thus we use the imageio one
+        from diffusers.utils import export_to_video
+
+        from ...device_utils import empty_cache
 
         # clean cache
         gc.collect()
@@ -173,7 +312,12 @@
         urls = []
         for f in output.frames:
             path = os.path.join(XINFERENCE_VIDEO_DIR, uuid.uuid4().hex + ".mp4")
-            p = export_to_video_imageio(f, path, fps=8)
+            export = (
+                export_to_video
+                if self.model_spec.model_family != "CogVideoX"
+                else export_to_video_imageio
+            )
+            p = export(f, path, fps=fps)
             urls.append(p)
         if response_format == "url":
             return VideoList(
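The new _process_image above snaps the image-to-video input to the model's spatial grid while staying under the spec's pixel budget. A standalone worked example of that rounding follows (not part of the diff): the 1280x720 input and the mod value of 16 are assumed purely for illustration, since the real value comes from vae_scale_factor_spatial times the transformer patch size, and math.sqrt stands in for the np.sqrt used in the hunk.

import math

# Assumed inputs for illustration only.
max_area = 480 * 832        # 399360 pixels, the 480p spec's "max_area" collapsed to a budget
aspect_ratio = 720 / 1280   # input height / width
mod_value = 16              # stand-in for vae_scale_factor_spatial * patch_size[1]

# Same rounding as _process_image: largest grid-aligned size near the budget.
height = round(math.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(math.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
print(height, width)  # 464 832 -> both divisible by 16, and 464 * 832 = 386048 <= 399360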
xinference/model/video/model_spec.json

@@ -45,5 +45,105 @@
     },
     "default_generate_config": {
     }
+  },
+  {
+    "model_name": "Wan2.1-1.3B",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    "model_revision": "0fad780a534b6463e45facd96134c9f345acfa5b",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-14B",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "model_revision": "38ec498cb3208fb688890f8cc7e94ede2cbd7f68",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-480p",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "model_revision": "b184e23a8a16b20f108f727c902e769e873ffc73",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        480,
+        832
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-720p",
+    "model_family": "Wan",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+    "model_revision": "eb849f76dfa246545b65774a9e25943ee69b3fa3",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        720,
+        1280
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
   }
 ]
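Each max_area entry above is stored as a pair of dimensions; image_to_video (shown earlier in this diff) collapses the pair into a single pixel budget with reduce, and also accepts a string form. A standalone sketch of that collapse follows (not part of the diff; the "480*832" string is an assumed caller-side override).

import operator
from functools import reduce

# List form, as written in the spec entries above.
max_area = [480, 832]
print(reduce(operator.mul, max_area, 1))  # 399360

# String form, split on "*" before reducing, as handled by image_to_video.
override = "480*832"
print(reduce(operator.mul, [int(v) for v in override.split("*")], 1))  # 399360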
xinference/model/video/model_spec_modelscope.json

@@ -48,5 +48,109 @@
     },
     "default_generate_config": {
     }
+  },
+  {
+    "model_name": "Wan2.1-1.3B",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-14B",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-480p",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        480,
+        832
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
+  },
+  {
+    "model_name": "Wan2.1-i2v-14B-720p",
+    "model_family": "Wan",
+    "model_hub": "modelscope",
+    "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "max_area": [
+        720,
+        1280
+      ]
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.33.0",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "numpy==1.26.4"
+      ]
+    }
   }
 ]
xinference/thirdparty/cosyvoice/bin/average_model.py

@@ -75,10 +75,11 @@ def main():
         print('Processing {}'.format(path))
         states = torch.load(path, map_location=torch.device('cpu'))
         for k in states.keys():
-            if k not in avg.keys():
-                avg[k] = states[k].clone()
-            else:
-                avg[k] += states[k]
+            if k not in ['step', 'epoch']:
+                if k not in avg.keys():
+                    avg[k] = states[k].clone()
+                else:
+                    avg[k] += states[k]
     # average
     for k in avg.keys():
         if avg[k] is not None:
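The added guard keeps the integer bookkeeping keys 'step' and 'epoch' out of the running sum, so only weight tensors are averaged. A standalone toy illustration follows (not part of the diff; the state dicts are made up and the final division stands in for the script's own averaging step).

import torch

# Two toy checkpoints: real weights plus the bookkeeping keys the new guard skips.
ckpts = [
    {"w": torch.tensor([1.0, 3.0]), "step": 1000, "epoch": 1},
    {"w": torch.tensor([3.0, 5.0]), "step": 2000, "epoch": 2},
]

avg = {}
for states in ckpts:
    for k in states.keys():
        if k not in ['step', 'epoch']:  # the guard added above
            if k not in avg.keys():
                avg[k] = states[k].clone()
            else:
                avg[k] += states[k]

# Average the accumulated tensors.
for k in avg.keys():
    avg[k] = avg[k] / len(ckpts)
print(avg)  # {'w': tensor([2., 4.])}; 'step' and 'epoch' never enter the average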