diffsynth 1.1.1__tar.gz → 1.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. {diffsynth-1.1.1 → diffsynth-1.1.3}/PKG-INFO +1 -1
  2. {diffsynth-1.1.1 → diffsynth-1.1.3}/README.md +34 -9
  3. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/configs/model_config.py +65 -1
  4. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/controlnets/processors.py +7 -6
  5. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/data/video.py +2 -2
  6. diffsynth-1.1.3/diffsynth/distributed/xdit_context_parallel.py +129 -0
  7. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/BLIP/__init__.py +1 -0
  8. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/BLIP/blip.py +77 -0
  9. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/BLIP/blip_pretrain.py +44 -0
  10. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/BLIP/med.py +947 -0
  11. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/BLIP/vit.py +301 -0
  12. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/__init__.py +148 -0
  13. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/aesthetic.py +148 -0
  14. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/clip.py +97 -0
  15. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/config.py +23 -0
  16. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/hps.py +118 -0
  17. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/imagereward.py +212 -0
  18. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/mps.py +129 -0
  19. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/__init__.py +14 -0
  20. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py +458 -0
  21. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/constants.py +2 -0
  22. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py +433 -0
  23. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/hf_configs.py +45 -0
  24. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py +176 -0
  25. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/loss.py +270 -0
  26. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/model.py +461 -0
  27. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/modified_resnet.py +181 -0
  28. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/openai.py +144 -0
  29. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py +376 -0
  30. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/push_to_hf_hub.py +243 -0
  31. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py +127 -0
  32. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py +211 -0
  33. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/transform.py +216 -0
  34. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py +727 -0
  35. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/utils.py +60 -0
  36. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/version.py +1 -0
  37. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/pickscore.py +112 -0
  38. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/trainer/__init__.py +1 -0
  39. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/trainer/models/__init__.py +3 -0
  40. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/trainer/models/base_model.py +7 -0
  41. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/trainer/models/clip_model.py +146 -0
  42. diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/trainer/models/cross_modeling.py +292 -0
  43. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/flux_controlnet.py +2 -0
  44. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/flux_dit.py +13 -10
  45. diffsynth-1.1.3/diffsynth/models/flux_infiniteyou.py +128 -0
  46. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_video_dit.py +81 -46
  47. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_video_text_encoder.py +23 -10
  48. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/kolors_text_encoder.py +1 -2
  49. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/lora.py +69 -50
  50. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/model_manager.py +20 -7
  51. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd3_text_encoder.py +2 -1
  52. diffsynth-1.1.3/diffsynth/models/stepvideo_dit.py +940 -0
  53. diffsynth-1.1.3/diffsynth/models/stepvideo_text_encoder.py +553 -0
  54. diffsynth-1.1.3/diffsynth/models/stepvideo_vae.py +1132 -0
  55. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/tiler.py +1 -1
  56. diffsynth-1.1.3/diffsynth/models/wan_video_dit.py +498 -0
  57. diffsynth-1.1.3/diffsynth/models/wan_video_image_encoder.py +902 -0
  58. diffsynth-1.1.3/diffsynth/models/wan_video_text_encoder.py +269 -0
  59. diffsynth-1.1.3/diffsynth/models/wan_video_vae.py +807 -0
  60. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/__init__.py +2 -0
  61. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/base.py +12 -2
  62. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/flux_image.py +182 -4
  63. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/hunyuan_video.py +147 -17
  64. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/omnigen_image.py +1 -1
  65. diffsynth-1.1.3/diffsynth/pipelines/step_video.py +209 -0
  66. diffsynth-1.1.3/diffsynth/pipelines/wan_video.py +416 -0
  67. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/__init__.py +2 -0
  68. diffsynth-1.1.3/diffsynth/prompters/hunyuan_video_prompter.py +275 -0
  69. diffsynth-1.1.3/diffsynth/prompters/stepvideo_prompter.py +56 -0
  70. diffsynth-1.1.3/diffsynth/prompters/wan_prompter.py +109 -0
  71. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/schedulers/flow_match.py +9 -4
  72. diffsynth-1.1.3/diffsynth/tokenizer_configs/__init__.py +0 -0
  73. diffsynth-1.1.3/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/preprocessor_config.json +45 -0
  74. diffsynth-1.1.3/diffsynth/trainers/__init__.py +0 -0
  75. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/trainers/text_to_image.py +27 -2
  76. diffsynth-1.1.3/diffsynth/vram_management/__init__.py +1 -0
  77. diffsynth-1.1.3/diffsynth/vram_management/layers.py +95 -0
  78. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth.egg-info/PKG-INFO +1 -1
  79. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth.egg-info/SOURCES.txt +55 -1
  80. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth.egg-info/requires.txt +1 -0
  81. {diffsynth-1.1.1 → diffsynth-1.1.3}/setup.py +1 -1
  82. diffsynth-1.1.1/diffsynth/prompters/hunyuan_video_prompter.py +0 -143
  83. {diffsynth-1.1.1 → diffsynth-1.1.3}/LICENSE +0 -0
  84. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/__init__.py +0 -0
  85. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/configs/__init__.py +0 -0
  86. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/controlnets/__init__.py +0 -0
  87. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/controlnets/controlnet_unit.py +0 -0
  88. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/data/__init__.py +0 -0
  89. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/data/simple_text_image.py +0 -0
  90. {diffsynth-1.1.1/diffsynth/extensions → diffsynth-1.1.3/diffsynth/distributed}/__init__.py +0 -0
  91. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/ESRGAN/__init__.py +0 -0
  92. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/__init__.py +0 -0
  93. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/api.py +0 -0
  94. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/cupy_kernels.py +0 -0
  95. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/data.py +0 -0
  96. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/patch_match.py +0 -0
  97. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/runners/__init__.py +0 -0
  98. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/runners/accurate.py +0 -0
  99. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/runners/balanced.py +0 -0
  100. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/runners/fast.py +0 -0
  101. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/FastBlend/runners/interpolation.py +0 -0
  102. /diffsynth-1.1.1/diffsynth/processors/__init__.py → /diffsynth-1.1.3/diffsynth/extensions/ImageQualityMetric/open_clip/generation_utils.py +0 -0
  103. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/extensions/RIFE/__init__.py +0 -0
  104. {diffsynth-1.1.1/diffsynth/tokenizer_configs → diffsynth-1.1.3/diffsynth/extensions}/__init__.py +0 -0
  105. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/__init__.py +0 -0
  106. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/attention.py +0 -0
  107. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/cog_dit.py +0 -0
  108. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/cog_vae.py +0 -0
  109. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/downloader.py +0 -0
  110. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/flux_ipadapter.py +0 -0
  111. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/flux_text_encoder.py +0 -0
  112. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/flux_vae.py +0 -0
  113. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_dit.py +0 -0
  114. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_dit_text_encoder.py +0 -0
  115. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_video_vae_decoder.py +0 -0
  116. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/hunyuan_video_vae_encoder.py +0 -0
  117. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/omnigen.py +0 -0
  118. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd3_dit.py +0 -0
  119. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd3_vae_decoder.py +0 -0
  120. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd3_vae_encoder.py +0 -0
  121. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_controlnet.py +0 -0
  122. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_ipadapter.py +0 -0
  123. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_motion.py +0 -0
  124. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_text_encoder.py +0 -0
  125. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_unet.py +0 -0
  126. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_vae_decoder.py +0 -0
  127. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sd_vae_encoder.py +0 -0
  128. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_controlnet.py +0 -0
  129. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_ipadapter.py +0 -0
  130. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_motion.py +0 -0
  131. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_text_encoder.py +0 -0
  132. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_unet.py +0 -0
  133. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_vae_decoder.py +0 -0
  134. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/sdxl_vae_encoder.py +0 -0
  135. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/svd_image_encoder.py +0 -0
  136. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/svd_unet.py +0 -0
  137. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/svd_vae_decoder.py +0 -0
  138. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/svd_vae_encoder.py +0 -0
  139. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/models/utils.py +0 -0
  140. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/cog_video.py +0 -0
  141. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/dancer.py +0 -0
  142. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/hunyuan_image.py +0 -0
  143. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/pipeline_runner.py +0 -0
  144. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/sd3_image.py +0 -0
  145. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/sd_image.py +0 -0
  146. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/sd_video.py +0 -0
  147. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/sdxl_image.py +0 -0
  148. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/sdxl_video.py +0 -0
  149. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/pipelines/svd_video.py +0 -0
  150. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/processors/FastBlend.py +0 -0
  151. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/processors/PILEditor.py +0 -0
  152. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/processors/RIFE.py +0 -0
  153. {diffsynth-1.1.1/diffsynth/trainers → diffsynth-1.1.3/diffsynth/processors}/__init__.py +0 -0
  154. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/processors/base.py +0 -0
  155. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/processors/sequencial_processor.py +0 -0
  156. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/base_prompter.py +0 -0
  157. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/cog_prompter.py +0 -0
  158. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/flux_prompter.py +0 -0
  159. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/hunyuan_dit_prompter.py +0 -0
  160. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/kolors_prompter.py +0 -0
  161. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/omnigen_prompter.py +0 -0
  162. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/omost.py +0 -0
  163. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/prompt_refiners.py +0 -0
  164. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/sd3_prompter.py +0 -0
  165. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/sd_prompter.py +0 -0
  166. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/prompters/sdxl_prompter.py +0 -0
  167. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/schedulers/__init__.py +0 -0
  168. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/schedulers/continuous_ode.py +0 -0
  169. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/schedulers/ddim.py +0 -0
  170. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/cog/tokenizer/added_tokens.json +0 -0
  171. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/cog/tokenizer/special_tokens_map.json +0 -0
  172. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/cog/tokenizer/spiece.model +0 -0
  173. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/cog/tokenizer/tokenizer_config.json +0 -0
  174. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_1/merges.txt +0 -0
  175. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_1/special_tokens_map.json +0 -0
  176. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_1/tokenizer_config.json +0 -0
  177. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_1/vocab.json +0 -0
  178. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_2/special_tokens_map.json +0 -0
  179. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_2/spiece.model +0 -0
  180. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer.json +0 -0
  181. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer_config.json +0 -0
  182. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/special_tokens_map.json +0 -0
  183. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/tokenizer_config.json +0 -0
  184. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab.txt +0 -0
  185. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab_org.txt +0 -0
  186. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/config.json +0 -0
  187. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/special_tokens_map.json +0 -0
  188. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model +0 -0
  189. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/tokenizer_config.json +0 -0
  190. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt +0 -0
  191. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/special_tokens_map.json +0 -0
  192. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/tokenizer_config.json +0 -0
  193. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json +0 -0
  194. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/special_tokens_map.json +0 -0
  195. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json +0 -0
  196. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer_config.json +0 -0
  197. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer.model +0 -0
  198. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer_config.json +0 -0
  199. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt +0 -0
  200. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/merges.txt +0 -0
  201. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/special_tokens_map.json +0 -0
  202. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/tokenizer_config.json +0 -0
  203. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/vocab.json +0 -0
  204. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/merges.txt +0 -0
  205. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/special_tokens_map.json +0 -0
  206. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/tokenizer_config.json +0 -0
  207. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/vocab.json +0 -0
  208. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/merges.txt +0 -0
  209. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/special_tokens_map.json +0 -0
  210. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/tokenizer_config.json +0 -0
  211. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/vocab.json +0 -0
  212. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/special_tokens_map.json +0 -0
  213. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/spiece.model +0 -0
  214. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer.json +0 -0
  215. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer_config.json +0 -0
  216. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/merges.txt +0 -0
  217. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/special_tokens_map.json +0 -0
  218. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/tokenizer_config.json +0 -0
  219. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/vocab.json +0 -0
  220. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth.egg-info/dependency_links.txt +0 -0
  221. {diffsynth-1.1.1 → diffsynth-1.1.3}/diffsynth.egg-info/top_level.txt +0 -0
  222. {diffsynth-1.1.1 → diffsynth-1.1.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: diffsynth
3
- Version: 1.1.1
3
+ Version: 1.1.3
4
4
  Summary: Enjoy the magic of Diffusion models!
5
5
  Author: Artiprocher
6
6
  Classifier: Programming Language :: Python :: 3
@@ -13,11 +13,19 @@ Document: https://diffsynth-studio.readthedocs.io/zh-cn/latest/index.html
13
13
 
14
14
  ## Introduction
15
15
 
16
- DiffSynth Studio is a Diffusion engine. We have restructured architectures including Text Encoder, UNet, VAE, among others, maintaining compatibility with models from the open-source community while enhancing computational performance. We provide many interesting features. Enjoy the magic of Diffusion models!
16
+ Welcome to the magic world of Diffusion models!
17
17
 
18
- Until now, DiffSynth Studio has supported the following models:
18
+ DiffSynth consists of two open-source projects:
19
+ * [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): Focused on aggressive technological exploration. Targeted at academia. Provides more cutting-edge technical support and novel inference capabilities.
20
+ * [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine): Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality.
19
21
 
20
- * [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
22
+ DiffSynth-Studio is an open-source project aimed at exploring innovations in AIGC technology. We have integrated numerous open-source Diffusion models, including FLUX and Wan, among others. Through this open-source project, we hope to connect models within the open-source community and explore new technologies based on diffusion models.
23
+
24
+ Until now, DiffSynth-Studio has supported the following models:
25
+
26
+ * [Wan-Video](https://github.com/Wan-Video/Wan2.1)
27
+ * [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
28
+ * [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), [HunyuanVideo-I2V](https://github.com/Tencent/HunyuanVideo-I2V)
21
29
  * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
22
30
  * [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
23
31
  * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
@@ -34,11 +42,21 @@ Until now, DiffSynth Studio has supported the following models:
34
42
  * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
35
43
 
36
44
  ## News
45
+ - **March 31, 2025** We support InfiniteYou, an identity preserving method for FLUX. Please refer to [./examples/InfiniteYou/](./examples/InfiniteYou/) for more details.
46
+
47
+ - **March 25, 2025** 🔥🔥🔥 Our new project, [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine), is now open-source! It focuses on stable model deployment and is geared towards industry, offering better engineering support, higher computational performance, and more stable functionality.
48
+
49
+ - **March 13, 2025** We support HunyuanVideo-I2V, the image-to-video generation version of HunyuanVideo open-sourced by Tencent. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details.
50
+
51
+ - **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
52
+
53
+ - **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)! State-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
54
+
37
55
  - **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
38
- * Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
39
- * Github: [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
40
- * Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
41
- * Training dataset: Coming soon
56
+ - Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
57
+ - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
58
+ - Online Demo: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
59
+ - Training Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
42
60
 
43
61
  - **December 19, 2024** We implement advanced VRAM management for HunyuanVideo, making it possible to generate videos at a resolution of 129x720x1280 using 24GB of VRAM, or at 129x512x384 resolution with just 6GB of VRAM. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details.
44
62
 
@@ -65,7 +83,7 @@ Until now, DiffSynth Studio has supported the following models:
65
83
  - Enable CFG and highres-fix to improve visual quality. See [here](/examples/image_synthesis/README.md)
66
84
  - LoRA, ControlNet, and additional models will be available soon.
67
85
 
68
- - **June 21, 2024.** 🔥🔥🔥 We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
86
+ - **June 21, 2024.** We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
69
87
  - [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
70
88
  - Source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/).
71
89
  - Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).
@@ -114,12 +132,19 @@ cd DiffSynth-Studio
114
132
  pip install -e .
115
133
  ```
116
134
 
117
- Or install from pypi:
135
+ Or install from PyPI (releases there lag behind the repository; if you want the latest features, install from source as shown above instead):
118
136
 
119
137
  ```
120
138
  pip install diffsynth
121
139
  ```
122
140
 
141
+ If you encounter issues during installation, they may be caused by the packages we depend on. Please refer to the documentation of the package that caused the problem.
142
+
143
+ * [torch](https://pytorch.org/get-started/locally/)
144
+ * [sentencepiece](https://github.com/google/sentencepiece)
145
+ * [cmake](https://cmake.org)
146
+ * [cupy](https://docs.cupy.dev/en/stable/install.html)
147
+
123
148
  ## Usage (in Python code)
124
149
 
125
150
  The Python examples are in [`examples`](./examples/). We provide an overview here.
@@ -37,6 +37,7 @@ from ..models.flux_text_encoder import FluxTextEncoder2
37
37
  from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
38
38
  from ..models.flux_controlnet import FluxControlNet
39
39
  from ..models.flux_ipadapter import FluxIpAdapter
40
+ from ..models.flux_infiniteyou import InfiniteYouImageProjector
40
41
 
41
42
  from ..models.cog_vae import CogVAEEncoder, CogVAEDecoder
42
43
  from ..models.cog_dit import CogDiT
@@ -51,6 +52,14 @@ from ..extensions.ESRGAN import RRDBNet
51
52
 
52
53
  from ..models.hunyuan_video_dit import HunyuanVideoDiT
53
54
 
55
+ from ..models.stepvideo_vae import StepVideoVAE
56
+ from ..models.stepvideo_dit import StepVideoModel
57
+
58
+ from ..models.wan_video_dit import WanModel
59
+ from ..models.wan_video_text_encoder import WanTextEncoder
60
+ from ..models.wan_video_image_encoder import WanImageEncoder
61
+ from ..models.wan_video_vae import WanVideoVAE
62
+
54
63
 
55
64
  model_loader_configs = [
56
65
  # These configs are provided for detecting model type automatically.
@@ -87,6 +96,7 @@ model_loader_configs = [
87
96
  (None, "57b02550baab820169365b3ee3afa2c9", ["flux_dit"], [FluxDiT], "civitai"),
88
97
  (None, "3394f306c4cbf04334b712bf5aaed95f", ["flux_dit"], [FluxDiT], "civitai"),
89
98
  (None, "023f054d918a84ccf503481fd1e3379e", ["flux_dit"], [FluxDiT], "civitai"),
99
+ (None, "605c56eab23e9e2af863ad8f0813a25d", ["flux_dit"], [FluxDiT], "diffusers"),
90
100
  (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
91
101
  (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
92
102
  (None, "6b7116078c4170bfbeaedc8fe71f6649", ["esrgan"], [RRDBNet], "civitai"),
@@ -95,6 +105,8 @@ model_loader_configs = [
95
105
  (None, "b001c89139b5f053c715fe772362dd2a", ["flux_controlnet"], [FluxControlNet], "diffusers"),
96
106
  (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"),
97
107
  (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"),
108
+ (None, "7f9583eb8ba86642abb9a21a4b2c9e16", ["flux_controlnet"], [FluxControlNet], "diffusers"),
109
+ (None, "c07c0f04f5ff55e86b4e937c7a40d481", ["infiniteyou_image_projector"], [InfiniteYouImageProjector], "diffusers"),
98
110
  (None, "4daaa66cc656a8fe369908693dad0a35", ["flux_ipadapter"], [FluxIpAdapter], "diffusers"),
99
111
  (None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
100
112
  (None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
@@ -103,6 +115,16 @@ model_loader_configs = [
103
115
  (None, "aeb82dce778a03dcb4d726cb03f3c43f", ["hunyuan_video_vae_decoder", "hunyuan_video_vae_encoder"], [HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder], "diffusers"),
104
116
  (None, "b9588f02e78f5ccafc9d7c0294e46308", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
105
117
  (None, "84ef4bd4757f60e906b54aa6a7815dc6", ["hunyuan_video_dit"], [HunyuanVideoDiT], "civitai"),
118
+ (None, "68beaf8429b7c11aa8ca05b1bd0058bd", ["stepvideo_vae"], [StepVideoVAE], "civitai"),
119
+ (None, "5c0216a2132b082c10cb7a0e0377e681", ["stepvideo_dit"], [StepVideoModel], "civitai"),
120
+ (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
121
+ (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
122
+ (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
123
+ (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
124
+ (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
125
+ (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
126
+ (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
127
+ (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
106
128
  ]
107
129
  huggingface_model_loader_configs = [
108
130
  # These configs are provided for detecting model type automatically.
@@ -115,7 +137,9 @@ huggingface_model_loader_configs = [
115
137
  ("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
116
138
  ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
117
139
  ("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
118
- ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder")
140
+ ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
141
+ ("LlavaForConditionalGeneration", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoMLLMEncoder"),
142
+ ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
119
143
  ]
120
144
  patch_model_loader_configs = [
121
145
  # These configs are provided for detecting model type automatically.
@@ -577,6 +601,25 @@ preset_models_on_modelscope = {
577
601
  "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
578
602
  ],
579
603
  },
604
+ "InfiniteYou":{
605
+ "file_list":[
606
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
607
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
608
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/image_proj_model.bin", "models/InfiniteYou"),
609
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/1k3d68.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
610
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/2d106det.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
611
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/genderage.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
612
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/glintr100.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
613
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
614
+ ],
615
+ "load_path":[
616
+ [
617
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
618
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors"
619
+ ],
620
+ "models/InfiniteYou/image_proj_model.bin",
621
+ ],
622
+ },
580
623
  # ESRGAN
581
624
  "ESRGAN_x4": [
582
625
  ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
@@ -657,6 +700,25 @@ preset_models_on_modelscope = {
657
700
  "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
658
701
  ],
659
702
  },
703
+ "HunyuanVideoI2V":{
704
+ "file_list": [
705
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideoI2V/text_encoder"),
706
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00001-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
707
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00002-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
708
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00003-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
709
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00004-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
710
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "config.json", "models/HunyuanVideoI2V/text_encoder_2"),
711
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model.safetensors.index.json", "models/HunyuanVideoI2V/text_encoder_2"),
712
+ ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/vae/pytorch_model.pt", "models/HunyuanVideoI2V/vae"),
713
+ ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideoI2V/transformers")
714
+ ],
715
+ "load_path": [
716
+ "models/HunyuanVideoI2V/text_encoder/model.safetensors",
717
+ "models/HunyuanVideoI2V/text_encoder_2",
718
+ "models/HunyuanVideoI2V/vae/pytorch_model.pt",
719
+ "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
720
+ ],
721
+ },
660
722
  "HunyuanVideo-fp8":{
661
723
  "file_list": [
662
724
  ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
@@ -717,6 +779,7 @@ Preset_model_id: TypeAlias = Literal[
717
779
  "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
718
780
  "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
719
781
  "InstantX/FLUX.1-dev-IP-Adapter",
782
+ "InfiniteYou",
720
783
  "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
721
784
  "QwenPrompt",
722
785
  "OmostPrompt",
@@ -733,4 +796,5 @@ Preset_model_id: TypeAlias = Literal[
733
796
  "StableDiffusion3.5-medium",
734
797
  "HunyuanVideo",
735
798
  "HunyuanVideo-fp8",
799
+ "HunyuanVideoI2V",
736
800
  ]
@@ -1,10 +1,4 @@
1
1
  from typing_extensions import Literal, TypeAlias
2
- import warnings
3
- with warnings.catch_warnings():
4
- warnings.simplefilter("ignore")
5
- from controlnet_aux.processor import (
6
- CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector, NormalBaeDetector
7
- )
8
2
 
9
3
 
10
4
  Processor_id: TypeAlias = Literal[
@@ -15,18 +9,25 @@ class Annotator:
15
9
  def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda', skip_processor=False):
16
10
  if not skip_processor:
17
11
  if processor_id == "canny":
12
+ from controlnet_aux.processor import CannyDetector
18
13
  self.processor = CannyDetector()
19
14
  elif processor_id == "depth":
15
+ from controlnet_aux.processor import MidasDetector
20
16
  self.processor = MidasDetector.from_pretrained(model_path).to(device)
21
17
  elif processor_id == "softedge":
18
+ from controlnet_aux.processor import HEDdetector
22
19
  self.processor = HEDdetector.from_pretrained(model_path).to(device)
23
20
  elif processor_id == "lineart":
21
+ from controlnet_aux.processor import LineartDetector
24
22
  self.processor = LineartDetector.from_pretrained(model_path).to(device)
25
23
  elif processor_id == "lineart_anime":
24
+ from controlnet_aux.processor import LineartAnimeDetector
26
25
  self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
27
26
  elif processor_id == "openpose":
27
+ from controlnet_aux.processor import OpenposeDetector
28
28
  self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
29
29
  elif processor_id == "normal":
30
+ from controlnet_aux.processor import NormalBaeDetector
30
31
  self.processor = NormalBaeDetector.from_pretrained(model_path).to(device)
31
32
  elif processor_id == "tile" or processor_id == "none" or processor_id == "inpaint":
32
33
  self.processor = None
@@ -135,8 +135,8 @@ class VideoData:
135
135
  frame.save(os.path.join(folder, f"{i}.png"))
136
136
 
137
137
 
138
- def save_video(frames, save_path, fps, quality=9):
139
- writer = imageio.get_writer(save_path, fps=fps, quality=quality)
138
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
139
+ writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params)
140
140
  for frame in tqdm(frames, desc="Saving video"):
141
141
  frame = np.array(frame)
142
142
  writer.append_data(frame)
@@ -0,0 +1,129 @@
1
+ import torch
2
+ from typing import Optional
3
+ from einops import rearrange
4
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
5
+ get_sequence_parallel_world_size,
6
+ get_sp_group)
7
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
8
+
9
def sinusoidal_embedding_1d(dim, position):
    """Return a sinusoidal embedding for every entry of ``position``.

    Args:
        dim: Requested embedding width. ``dim // 2`` frequencies are used, so
            the result has ``2 * (dim // 2)`` columns.
        position: 1-D tensor of positions (it is passed to ``torch.outer``).

    Returns:
        Tensor with one row per position: cosine terms first, sine terms
        second, cast back to ``position.dtype``.
    """
    half = dim // 2
    # The angles are computed in float64 and only cast back at the very end.
    exponents = -torch.arange(half, dtype=torch.float64, device=position.device).div(half)
    inv_freq = torch.pow(10000, exponents)
    angles = torch.outer(position.type(torch.float64), inv_freq)
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
    return embedding.to(position.dtype)
14
+
15
def pad_freqs(original_tensor, target_len):
    """Extend a 3-D tensor along dim 0 to ``target_len`` rows, filling with ones.

    Args:
        original_tensor: Tensor of shape ``(seq_len, s1, s2)``.
        target_len: Desired size of the first dimension.

    Returns:
        ``original_tensor`` followed by ``target_len - seq_len`` rows of ones
        with the same dtype and device.
    """
    seq_len, s1, s2 = original_tensor.shape
    # new_ones inherits dtype and device from the source tensor.
    filler = original_tensor.new_ones((target_len - seq_len, s1, s2))
    return torch.cat([original_tensor, filler], dim=0)
26
+
27
def rope_apply(x, freqs, num_heads):
    """Multiply this rank's slice of ``x`` by its slice of ``freqs``.

    Args:
        x: Tensor laid out as ``b s (n d)``; ``s`` is the sequence length held
            by the current sequence-parallel rank.
        freqs: 3-D tensor indexed by global sequence position along dim 0
            (presumably complex rotary factors -- confirm against the caller).
        num_heads: Number of attention heads ``n``.

    Returns:
        Tensor in ``b s (n d)`` layout with the dtype of ``x``.
    """
    heads = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
    batch, local_len, n_heads = heads.shape[:3]

    # Pair up adjacent channels and view each pair as one complex number.
    pairs = heads.to(torch.float64).reshape(batch, local_len, n_heads, -1, 2)
    as_complex = torch.view_as_complex(pairs)

    world_size = get_sequence_parallel_world_size()
    rank = get_sequence_parallel_rank()
    # Pad to the full (local_len * world_size) length so every rank can take
    # an equally sized window, then keep only this rank's window.
    padded = pad_freqs(freqs, local_len * world_size)
    start = rank * local_len
    local_freqs = padded[start:start + local_len, :, :]

    rotated = torch.view_as_real(as_complex * local_freqs).flatten(2)
    return rotated.to(heads.dtype)
41
+
42
def usp_dit_forward(self,
                    x: torch.Tensor,
                    timestep: torch.Tensor,
                    context: torch.Tensor,
                    clip_feature: Optional[torch.Tensor] = None,
                    y: Optional[torch.Tensor] = None,
                    use_gradient_checkpointing: bool = False,
                    use_gradient_checkpointing_offload: bool = False,
                    **kwargs,
                    ):
    """Forward pass that splits the token sequence across sequence-parallel ranks.

    Args:
        x: Latent input; it is patchified into a token sequence.
        timestep: Timestep tensor fed to the sinusoidal embedding.
        context: Text conditioning, projected by ``self.text_embedding``.
        clip_feature: Image feature passed to ``self.img_emb``; only read when
            ``self.has_image_input`` is true.
        y: Extra latent concatenated to ``x`` on dim 1; only read when
            ``self.has_image_input`` is true.
        use_gradient_checkpointing: Checkpoint every block while training.
        use_gradient_checkpointing_offload: Additionally keep the saved
            tensors on CPU while checkpointing.
        **kwargs: Accepted and ignored.

    Returns:
        The unpatchified output gathered from all sequence-parallel ranks.
    """
    t = self.time_embedding(
        sinusoidal_embedding_1d(self.freq_dim, timestep))
    t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
    context = self.text_embedding(context)

    if self.has_image_input:
        x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
        image_context = self.img_emb(clip_feature)
        context = torch.cat([image_context, context], dim=1)

    x, (f, h, w) = self.patchify(x)

    # One frequency table per grid axis, broadcast over the (f, h, w) grid and
    # concatenated on the last dim, then flattened to one row per token.
    freqs_f = self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1)
    freqs_h = self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1)
    freqs_w = self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
    freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1)
    freqs = freqs.reshape(f * h * w, 1, -1).to(x.device)

    def run_checkpointed(module, hidden):
        # Wrap the module in a plain function before handing it to checkpoint.
        def custom_forward(*inputs):
            return module(*inputs)
        return torch.utils.checkpoint.checkpoint(
            custom_forward,
            hidden, context, t_mod, freqs,
            use_reentrant=False,
        )

    # Context Parallel: keep only this rank's chunk of the token sequence.
    sp_world_size = get_sequence_parallel_world_size()
    sp_rank = get_sequence_parallel_rank()
    x = torch.chunk(x, sp_world_size, dim=1)[sp_rank]

    checkpointing = self.training and use_gradient_checkpointing
    for block in self.blocks:
        if not checkpointing:
            x = block(x, context, t_mod, freqs)
        elif use_gradient_checkpointing_offload:
            with torch.autograd.graph.save_on_cpu():
                x = run_checkpointed(block, x)
        else:
            x = run_checkpointed(block, x)

    x = self.head(x, t)

    # Context Parallel: reassemble the full sequence from all ranks.
    x = get_sp_group().all_gather(x, dim=1)

    # unpatchify
    return self.unpatchify(x, (f, h, w))
106
+
107
+
108
def usp_attn_forward(self, x, freqs):
    """Self-attention forward that runs through xFuser long-context attention.

    Args:
        x: Input in ``b s (n d)`` layout.
        freqs: Frequency tensor forwarded to ``rope_apply`` for q and k.

    Returns:
        ``self.o`` applied to the attention output flattened back to
        ``b s (n d)``.
    """
    heads = self.num_heads
    split_heads = "b s (n d) -> b s n d"

    q = self.norm_q(self.q(x))
    k = self.norm_k(self.k(x))
    v = self.v(x)

    # Only queries and keys get the rotary treatment.
    q = rope_apply(q, freqs, heads)
    k = rope_apply(k, freqs, heads)
    q, k, v = (rearrange(tensor, split_heads, n=heads) for tensor in (q, k, v))

    attention = xFuserLongContextAttention()
    x = attention(
        None,
        query=q,
        key=k,
        value=v,
    ).flatten(2)

    # Drop the projections and release cached CUDA blocks before the output
    # projection runs.
    del q, k, v
    torch.cuda.empty_cache()
    return self.o(x)
@@ -0,0 +1 @@
1
+ from .blip_pretrain import *
@@ -0,0 +1,77 @@
1
+ '''
2
+ * Adapted from BLIP (https://github.com/salesforce/BLIP)
3
+ '''
4
+
5
+ import warnings
6
+ warnings.filterwarnings("ignore")
7
+
8
+ import torch
9
+ import os
10
+ from urllib.parse import urlparse
11
+ from timm.models.hub import download_cached_file
12
+ from transformers import BertTokenizer
13
+ from .vit import VisionTransformer, interpolate_pos_embed
14
+
15
+
16
def default_bert():
    """Return the default ``bert-base-uncased`` directory path.

    The path is ``<root>/models/QualityMetric/bert-base-uncased``, where
    ``<root>`` is four directory levels above the directory of this file.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.abspath(os.path.join(here, '../../../../'))
    return os.path.join(root, 'models', 'QualityMetric', "bert-base-uncased")
21
+
22
+
23
def init_tokenizer(bert_model_path):
    """Load a ``BertTokenizer`` and register the ``[DEC]`` / ``[ENC]`` tokens.

    Args:
        bert_model_path: Value passed to ``BertTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with ``enc_token_id`` set to the id of the first
        additional special token.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)
    # Registered in two separate calls, in this order.
    special_token_groups = (
        {'bos_token':'[DEC]'},
        {'additional_special_tokens':['[ENC]']},
    )
    for group in special_token_groups:
        bert_tokenizer.add_special_tokens(group)
    bert_tokenizer.enc_token_id = bert_tokenizer.additional_special_tokens_ids[0]
    return bert_tokenizer
29
+
30
+
31
def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
    """Build the vision transformer used as the image encoder.

    Args:
        vit: Model size, either ``'base'`` or ``'large'``.
        image_size: Input image size forwarded as ``img_size``.
        use_grad_checkpointing: Forwarded to ``VisionTransformer``.
        ckpt_layer: Forwarded to ``VisionTransformer``.
        drop_path_rate: Stochastic-depth rate. A falsy value (the default
            ``0``) selects the variant default: ``0`` for base, ``0.1`` for
            large.

    Returns:
        Tuple ``(visual_encoder, vision_width)``.

    Raises:
        AssertionError: If ``vit`` is not ``'base'`` or ``'large'``.
    """
    assert vit in ['base', 'large'], "vit parameter must be base or large"

    # (embed_dim, depth, num_heads, default drop-path rate) per variant.
    variants = {
        'base': (768, 12, 12, 0),
        'large': (1024, 24, 16, 0.1),
    }
    vision_width, depth, num_heads, default_drop_path = variants[vit]

    visual_encoder = VisionTransformer(
        img_size=image_size, patch_size=16, embed_dim=vision_width, depth=depth,
        num_heads=num_heads, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
        # The previous `0.1 or drop_path_rate` always evaluated to 0.1 and
        # ignored the argument. Putting the argument first honours an explicit
        # rate and keeps the old result for the default of 0.
        drop_path_rate=drop_path_rate or default_drop_path,
    )
    return visual_encoder, vision_width
47
+
48
+
49
def is_url(url_or_filename):
    """Return True when the argument parses with an ``http`` or ``https`` scheme."""
    return urlparse(url_or_filename).scheme in ("http", "https")
52
+
53
def load_checkpoint(model,url_or_filename):
    """Load a checkpoint into ``model``, tolerating shape mismatches.

    Args:
        model: Module exposing ``visual_encoder`` (and optionally
            ``visual_encoder_m``).
        url_or_filename: http(s) URL or local file path of the checkpoint.

    Returns:
        Tuple ``(model, msg)`` where ``msg`` is the result of
        ``model.load_state_dict(..., strict=False)``.

    Raises:
        RuntimeError: If the argument is neither an http(s) URL nor an
            existing file.
    """
    if is_url(url_or_filename):
        checkpoint_path = download_cached_file(url_or_filename, check_hash=False, progress=True)
    elif os.path.isfile(url_or_filename):
        checkpoint_path = url_or_filename
    else:
        raise RuntimeError('checkpoint url or path is invalid')

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from sources you trust.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint['model']

    # Snapshot once: every model.state_dict() call rebuilds the whole mapping.
    model_state = model.state_dict()

    state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(
        state_dict['visual_encoder.pos_embed'], model.visual_encoder)
    momentum_key = 'visual_encoder_m.pos_embed'
    # Only resize the momentum encoder's embedding when both the model and the
    # checkpoint carry it; a checkpoint without it is left to strict=False.
    if momentum_key in model_state and momentum_key in state_dict:
        state_dict[momentum_key] = interpolate_pos_embed(
            state_dict[momentum_key], model.visual_encoder_m)

    # Drop checkpoint entries whose shape disagrees with the model so that
    # load_state_dict does not fail on them.
    for key, target in model_state.items():
        if key in state_dict and state_dict[key].shape != target.shape:
            print(key, ": ", state_dict[key].shape, ', ', target.shape)
            del state_dict[key]

    msg = model.load_state_dict(state_dict,strict=False)
    print('load checkpoint from %s'%url_or_filename)
    return model,msg
77
+
@@ -0,0 +1,44 @@
1
+ '''
2
+ * Adapted from BLIP (https://github.com/salesforce/BLIP)
3
+ '''
4
+
5
+ import transformers
6
+ transformers.logging.set_verbosity_error()
7
+
8
+ from torch import nn
9
+ import os
10
+ from .med import BertConfig, BertModel
11
+ from .blip import create_vit, init_tokenizer
12
+
13
class BLIP_Pretrain(nn.Module):
    """Container for the BLIP vision encoder, text encoder and projections."""

    def __init__(
        self,
        med_config="med_config.json",
        image_size=224,
        vit='base',
        vit_grad_ckpt=False,
        vit_ckpt_layer=0,
        embed_dim=256,
        queue_size=57600,
        momentum=0.995,
        bert_model_path="",
    ):
        """
        Args:
            med_config (str): path for the mixture of encoder-decoder model's configuration file
            image_size (int): input image size
            vit (str): model size of vision transformer
            vit_grad_ckpt (bool): forwarded to ``create_vit``
            vit_ckpt_layer (int): forwarded to ``create_vit``
            embed_dim (int): output width of both projection layers
            queue_size (int): accepted but not used by this constructor
            momentum (float): accepted but not used by this constructor
            bert_model_path (str): passed to ``init_tokenizer``
        """
        super().__init__()

        # Submodules are created in the same order as before so parameter
        # registration order is unchanged.
        encoder, vision_width = create_vit(
            vit,
            image_size,
            use_grad_checkpointing=vit_grad_ckpt,
            ckpt_layer=vit_ckpt_layer,
            drop_path_rate=0,
        )
        self.visual_encoder = encoder

        self.tokenizer = init_tokenizer(bert_model_path)

        bert_config = BertConfig.from_json_file(med_config)
        bert_config.encoder_width = vision_width
        self.text_encoder = BertModel(config=bert_config, add_pooling_layer=False)

        self.vision_proj = nn.Linear(vision_width, embed_dim)
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size, embed_dim)
44
+