diffsynth 1.1.2__tar.gz → 1.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. {diffsynth-1.1.2 → diffsynth-1.1.7}/PKG-INFO +1 -1
  2. {diffsynth-1.1.2 → diffsynth-1.1.7}/README.md +16 -5
  3. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/model_config.py +52 -0
  4. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/processors.py +7 -6
  5. diffsynth-1.1.7/diffsynth/distributed/xdit_context_parallel.py +129 -0
  6. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py +1 -1
  7. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_controlnet.py +2 -0
  8. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_dit.py +13 -10
  9. diffsynth-1.1.7/diffsynth/models/flux_infiniteyou.py +128 -0
  10. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_dit.py +81 -46
  11. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_text_encoder.py +23 -10
  12. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/lora.py +67 -49
  13. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/model_manager.py +4 -0
  14. diffsynth-1.1.7/diffsynth/models/wan_video_dit.py +554 -0
  15. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_image_encoder.py +8 -10
  16. diffsynth-1.1.7/diffsynth/models/wan_video_motion_controller.py +44 -0
  17. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_vae.py +3 -4
  18. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/flux_image.py +78 -2
  19. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/hunyuan_video.py +147 -17
  20. diffsynth-1.1.7/diffsynth/pipelines/wan_video.py +493 -0
  21. diffsynth-1.1.7/diffsynth/prompters/hunyuan_video_prompter.py +275 -0
  22. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/wan_prompter.py +2 -1
  23. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/flow_match.py +1 -1
  24. diffsynth-1.1.7/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/preprocessor_config.json +45 -0
  25. diffsynth-1.1.7/diffsynth/trainers/__init__.py +0 -0
  26. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/trainers/text_to_image.py +1 -1
  27. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/PKG-INFO +1 -1
  28. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/SOURCES.txt +5 -0
  29. {diffsynth-1.1.2 → diffsynth-1.1.7}/setup.py +1 -1
  30. diffsynth-1.1.2/diffsynth/models/wan_video_dit.py +0 -799
  31. diffsynth-1.1.2/diffsynth/pipelines/wan_video.py +0 -276
  32. diffsynth-1.1.2/diffsynth/prompters/hunyuan_video_prompter.py +0 -143
  33. {diffsynth-1.1.2 → diffsynth-1.1.7}/LICENSE +0 -0
  34. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/__init__.py +0 -0
  35. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/configs/__init__.py +0 -0
  36. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/__init__.py +0 -0
  37. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/controlnets/controlnet_unit.py +0 -0
  38. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/__init__.py +0 -0
  39. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/simple_text_image.py +0 -0
  40. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/data/video.py +0 -0
  41. {diffsynth-1.1.2/diffsynth/extensions → diffsynth-1.1.7/diffsynth/distributed}/__init__.py +0 -0
  42. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ESRGAN/__init__.py +0 -0
  43. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/__init__.py +0 -0
  44. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/api.py +0 -0
  45. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/cupy_kernels.py +0 -0
  46. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/data.py +0 -0
  47. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/patch_match.py +0 -0
  48. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/__init__.py +0 -0
  49. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/accurate.py +0 -0
  50. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/balanced.py +0 -0
  51. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/fast.py +0 -0
  52. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/FastBlend/runners/interpolation.py +0 -0
  53. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/__init__.py +0 -0
  54. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/blip.py +0 -0
  55. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/blip_pretrain.py +0 -0
  56. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/med.py +0 -0
  57. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/BLIP/vit.py +0 -0
  58. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/__init__.py +0 -0
  59. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/aesthetic.py +0 -0
  60. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/clip.py +0 -0
  61. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/config.py +0 -0
  62. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/hps.py +0 -0
  63. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/imagereward.py +0 -0
  64. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/mps.py +0 -0
  65. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/__init__.py +0 -0
  66. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/coca_model.py +0 -0
  67. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/constants.py +0 -0
  68. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/generation_utils.py +0 -0
  69. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/hf_configs.py +0 -0
  70. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/hf_model.py +0 -0
  71. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/loss.py +0 -0
  72. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/model.py +0 -0
  73. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/modified_resnet.py +0 -0
  74. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/openai.py +0 -0
  75. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/pretrained.py +0 -0
  76. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/push_to_hf_hub.py +0 -0
  77. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/timm_model.py +0 -0
  78. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/tokenizer.py +0 -0
  79. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/transform.py +0 -0
  80. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/transformer.py +0 -0
  81. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/utils.py +0 -0
  82. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/open_clip/version.py +0 -0
  83. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/pickscore.py +0 -0
  84. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/__init__.py +0 -0
  85. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/__init__.py +0 -0
  86. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/base_model.py +0 -0
  87. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/clip_model.py +0 -0
  88. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/ImageQualityMetric/trainer/models/cross_modeling.py +0 -0
  89. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/extensions/RIFE/__init__.py +0 -0
  90. {diffsynth-1.1.2/diffsynth/processors → diffsynth-1.1.7/diffsynth/extensions}/__init__.py +0 -0
  91. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/__init__.py +0 -0
  92. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/attention.py +0 -0
  93. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/cog_dit.py +0 -0
  94. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/cog_vae.py +0 -0
  95. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/downloader.py +0 -0
  96. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_ipadapter.py +0 -0
  97. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_text_encoder.py +0 -0
  98. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/flux_vae.py +0 -0
  99. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_dit.py +0 -0
  100. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_dit_text_encoder.py +0 -0
  101. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_vae_decoder.py +0 -0
  102. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/hunyuan_video_vae_encoder.py +0 -0
  103. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/kolors_text_encoder.py +0 -0
  104. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/omnigen.py +0 -0
  105. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_dit.py +0 -0
  106. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_text_encoder.py +0 -0
  107. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_vae_decoder.py +0 -0
  108. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd3_vae_encoder.py +0 -0
  109. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_controlnet.py +0 -0
  110. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_ipadapter.py +0 -0
  111. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_motion.py +0 -0
  112. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_text_encoder.py +0 -0
  113. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_unet.py +0 -0
  114. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_vae_decoder.py +0 -0
  115. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sd_vae_encoder.py +0 -0
  116. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_controlnet.py +0 -0
  117. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_ipadapter.py +0 -0
  118. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_motion.py +0 -0
  119. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_text_encoder.py +0 -0
  120. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_unet.py +0 -0
  121. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_vae_decoder.py +0 -0
  122. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/sdxl_vae_encoder.py +0 -0
  123. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_dit.py +0 -0
  124. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_text_encoder.py +0 -0
  125. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/stepvideo_vae.py +0 -0
  126. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_image_encoder.py +0 -0
  127. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_unet.py +0 -0
  128. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_vae_decoder.py +0 -0
  129. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/svd_vae_encoder.py +0 -0
  130. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/tiler.py +0 -0
  131. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/utils.py +0 -0
  132. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/models/wan_video_text_encoder.py +0 -0
  133. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/__init__.py +0 -0
  134. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/base.py +0 -0
  135. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/cog_video.py +0 -0
  136. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/dancer.py +0 -0
  137. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/hunyuan_image.py +0 -0
  138. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/omnigen_image.py +0 -0
  139. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/pipeline_runner.py +0 -0
  140. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd3_image.py +0 -0
  141. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd_image.py +0 -0
  142. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sd_video.py +0 -0
  143. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sdxl_image.py +0 -0
  144. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/sdxl_video.py +0 -0
  145. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/step_video.py +0 -0
  146. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/pipelines/svd_video.py +0 -0
  147. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/FastBlend.py +0 -0
  148. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/PILEditor.py +0 -0
  149. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/RIFE.py +0 -0
  150. {diffsynth-1.1.2/diffsynth/tokenizer_configs → diffsynth-1.1.7/diffsynth/processors}/__init__.py +0 -0
  151. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/base.py +0 -0
  152. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/processors/sequencial_processor.py +0 -0
  153. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/__init__.py +0 -0
  154. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/base_prompter.py +0 -0
  155. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/cog_prompter.py +0 -0
  156. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/flux_prompter.py +0 -0
  157. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/hunyuan_dit_prompter.py +0 -0
  158. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/kolors_prompter.py +0 -0
  159. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/omnigen_prompter.py +0 -0
  160. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/omost.py +0 -0
  161. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/prompt_refiners.py +0 -0
  162. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sd3_prompter.py +0 -0
  163. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sd_prompter.py +0 -0
  164. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/sdxl_prompter.py +0 -0
  165. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/prompters/stepvideo_prompter.py +0 -0
  166. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/__init__.py +0 -0
  167. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/continuous_ode.py +0 -0
  168. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/schedulers/ddim.py +0 -0
  169. {diffsynth-1.1.2/diffsynth/trainers → diffsynth-1.1.7/diffsynth/tokenizer_configs}/__init__.py +0 -0
  170. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/added_tokens.json +0 -0
  171. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/special_tokens_map.json +0 -0
  172. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/spiece.model +0 -0
  173. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/cog/tokenizer/tokenizer_config.json +0 -0
  174. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/merges.txt +0 -0
  175. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/special_tokens_map.json +0 -0
  176. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/tokenizer_config.json +0 -0
  177. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_1/vocab.json +0 -0
  178. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/special_tokens_map.json +0 -0
  179. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/spiece.model +0 -0
  180. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer.json +0 -0
  181. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/flux/tokenizer_2/tokenizer_config.json +0 -0
  182. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/special_tokens_map.json +0 -0
  183. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/tokenizer_config.json +0 -0
  184. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab.txt +0 -0
  185. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer/vocab_org.txt +0 -0
  186. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/config.json +0 -0
  187. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/special_tokens_map.json +0 -0
  188. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/spiece.model +0 -0
  189. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_dit/tokenizer_t5/tokenizer_config.json +0 -0
  190. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/merges.txt +0 -0
  191. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/special_tokens_map.json +0 -0
  192. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/tokenizer_config.json +0 -0
  193. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_1/vocab.json +0 -0
  194. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/special_tokens_map.json +0 -0
  195. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer.json +0 -0
  196. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/hunyuan_video/tokenizer_2/tokenizer_config.json +0 -0
  197. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer.model +0 -0
  198. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/tokenizer_config.json +0 -0
  199. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt +0 -0
  200. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/merges.txt +0 -0
  201. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/special_tokens_map.json +0 -0
  202. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/tokenizer_config.json +0 -0
  203. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion/tokenizer/vocab.json +0 -0
  204. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/merges.txt +0 -0
  205. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/special_tokens_map.json +0 -0
  206. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/tokenizer_config.json +0 -0
  207. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_1/vocab.json +0 -0
  208. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/merges.txt +0 -0
  209. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/special_tokens_map.json +0 -0
  210. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/tokenizer_config.json +0 -0
  211. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_2/vocab.json +0 -0
  212. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/special_tokens_map.json +0 -0
  213. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/spiece.model +0 -0
  214. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer.json +0 -0
  215. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_3/tokenizer_3/tokenizer_config.json +0 -0
  216. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/merges.txt +0 -0
  217. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/special_tokens_map.json +0 -0
  218. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/tokenizer_config.json +0 -0
  219. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/tokenizer_configs/stable_diffusion_xl/tokenizer_2/vocab.json +0 -0
  220. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/vram_management/__init__.py +0 -0
  221. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth/vram_management/layers.py +0 -0
  222. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/dependency_links.txt +0 -0
  223. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/requires.txt +0 -0
  224. {diffsynth-1.1.2 → diffsynth-1.1.7}/diffsynth.egg-info/top_level.txt +0 -0
  225. {diffsynth-1.1.2 → diffsynth-1.1.7}/setup.cfg +0 -0

--- diffsynth-1.1.2/PKG-INFO
+++ diffsynth-1.1.7/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: diffsynth
-Version: 1.1.2
+Version: 1.1.7
 Summary: Enjoy the magic of Diffusion models!
 Author: Artiprocher
 Classifier: Programming Language :: Python :: 3

--- diffsynth-1.1.2/README.md
+++ diffsynth-1.1.7/README.md
@@ -13,13 +13,19 @@ Document: https://diffsynth-studio.readthedocs.io/zh-cn/latest/index.html
 
 ## Introduction
 
-DiffSynth Studio is a Diffusion engine. We have restructured architectures including Text Encoder, UNet, VAE, among others, maintaining compatibility with models from the open-source community while enhancing computational performance. We provide many interesting features. Enjoy the magic of Diffusion models!
+Welcome to the magic world of Diffusion models!
 
-Until now, DiffSynth Studio has supported the following models:
+DiffSynth consists of two open-source projects:
+* [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio): Focused on aggressive technological exploration. Targeted at academia. Provides more cutting-edge technical support and novel inference capabilities.
+* [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine): Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality.
+
+DiffSynth-Studio is an open-source project aimed at exploring innovations in AIGC technology. We have integrated numerous open-source Diffusion models, including FLUX and Wan, among others. Through this open-source project, we hope to connect models within the open-source community and explore new technologies based on diffusion models.
+
+Until now, DiffSynth-Studio has supported the following models:
 
 * [Wan-Video](https://github.com/Wan-Video/Wan2.1)
 * [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
-* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
+* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), [HunyuanVideo-I2V]()
 * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
 * [FLUX](https://huggingface.co/black-forest-labs/FLUX.1-dev)
 * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
@@ -36,6 +42,11 @@ Until now, DiffSynth Studio has supported the following models:
 * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
 
 ## News
+- **March 31, 2025** We support InfiniteYou, an identity preserving method for FLUX. Please refer to [./examples/InfiniteYou/](./examples/InfiniteYou/) for more details.
+
+- **March 25, 2025** 🔥🔥🔥 Our new open-source project, [DiffSynth-Engine](https://github.com/modelscope/DiffSynth-Engine), is now open-sourced! Focused on stable model deployment. Geared towards industry. Offers better engineering support, higher computational performance, and more stable functionality.
+
+- **March 13, 2025** We support HunyuanVideo-I2V, the image-to-video generation version of HunyuanVideo open-sourced by Tencent. Please refer to [./examples/HunyuanVideo/](./examples/HunyuanVideo/) for more details.
 
 - **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
 
@@ -43,7 +54,7 @@ Until now, DiffSynth Studio has supported the following models:
 
 - **December 31, 2024** We propose EliGen, a novel framework for precise entity-level controlled text-to-image generation, complemented by an inpainting fusion pipeline to extend its capabilities to image inpainting tasks. EliGen seamlessly integrates with existing community models, such as IP-Adapter and In-Context LoRA, enhancing its versatility. For more details, see [./examples/EntityControl](./examples/EntityControl/).
   - Paper: [EliGen: Entity-Level Controlled Image Generation with Regional Attention](https://arxiv.org/abs/2501.01097)
-  - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen)
+  - Model: [ModelScope](https://www.modelscope.cn/models/DiffSynth-Studio/Eligen), [HuggingFace](https://huggingface.co/modelscope/EliGen)
   - Online Demo: [ModelScope EliGen Studio](https://www.modelscope.cn/studios/DiffSynth-Studio/EliGen)
   - Training Dataset: [EliGen Train Set](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet)
 
@@ -72,7 +83,7 @@ Until now, DiffSynth Studio has supported the following models:
   - Enable CFG and highres-fix to improve visual quality. See [here](/examples/image_synthesis/README.md)
   - LoRA, ControlNet, and additional models will be available soon.
 
-- **June 21, 2024.** 🔥🔥🔥 We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
+- **June 21, 2024.** We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
   - [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
   - Source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/).
   - Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).

--- diffsynth-1.1.2/diffsynth/configs/model_config.py
+++ diffsynth-1.1.7/diffsynth/configs/model_config.py
@@ -37,6 +37,7 @@ from ..models.flux_text_encoder import FluxTextEncoder2
 from ..models.flux_vae import FluxVAEEncoder, FluxVAEDecoder
 from ..models.flux_controlnet import FluxControlNet
 from ..models.flux_ipadapter import FluxIpAdapter
+from ..models.flux_infiniteyou import InfiniteYouImageProjector
 
 from ..models.cog_vae import CogVAEEncoder, CogVAEDecoder
 from ..models.cog_dit import CogDiT
@@ -58,6 +59,7 @@ from ..models.wan_video_dit import WanModel
 from ..models.wan_video_text_encoder import WanTextEncoder
 from ..models.wan_video_image_encoder import WanImageEncoder
 from ..models.wan_video_vae import WanVideoVAE
+from ..models.wan_video_motion_controller import WanMotionControllerModel
 
 
 model_loader_configs = [
@@ -95,6 +97,7 @@ model_loader_configs = [
     (None, "57b02550baab820169365b3ee3afa2c9", ["flux_dit"], [FluxDiT], "civitai"),
     (None, "3394f306c4cbf04334b712bf5aaed95f", ["flux_dit"], [FluxDiT], "civitai"),
     (None, "023f054d918a84ccf503481fd1e3379e", ["flux_dit"], [FluxDiT], "civitai"),
+    (None, "605c56eab23e9e2af863ad8f0813a25d", ["flux_dit"], [FluxDiT], "diffusers"),
     (None, "280189ee084bca10f70907bf6ce1649d", ["cog_vae_encoder", "cog_vae_decoder"], [CogVAEEncoder, CogVAEDecoder], "diffusers"),
     (None, "9b9313d104ac4df27991352fec013fd4", ["rife"], [IFNet], "civitai"),
     (None, "6b7116078c4170bfbeaedc8fe71f6649", ["esrgan"], [RRDBNet], "civitai"),
@@ -103,6 +106,8 @@ model_loader_configs = [
     (None, "b001c89139b5f053c715fe772362dd2a", ["flux_controlnet"], [FluxControlNet], "diffusers"),
     (None, "52357cb26250681367488a8954c271e8", ["flux_controlnet"], [FluxControlNet], "diffusers"),
     (None, "0cfd1740758423a2a854d67c136d1e8c", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "7f9583eb8ba86642abb9a21a4b2c9e16", ["flux_controlnet"], [FluxControlNet], "diffusers"),
+    (None, "c07c0f04f5ff55e86b4e937c7a40d481", ["infiniteyou_image_projector"], [InfiniteYouImageProjector], "diffusers"),
     (None, "4daaa66cc656a8fe369908693dad0a35", ["flux_ipadapter"], [FluxIpAdapter], "diffusers"),
     (None, "51aed3d27d482fceb5e0739b03060e8f", ["sd3_dit", "sd3_vae_encoder", "sd3_vae_decoder"], [SD3DiT, SD3VAEEncoder, SD3VAEDecoder], "civitai"),
     (None, "98cc34ccc5b54ae0e56bdea8688dcd5a", ["sd3_text_encoder_2"], [SD3TextEncoder2], "civitai"),
@@ -116,10 +121,16 @@ model_loader_configs = [
     (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
     (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6d6ccde6845b95ad9114ab993d917893", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "349723183fc063b2bfc10bb2835cf677", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "efa44cddf936c70abd0ea28b6cbe946c", ["wan_video_dit"], [WanModel], "civitai"),
+    (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
     (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
     (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
     (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
     (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
+    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
 ]
 huggingface_model_loader_configs = [
     # These configs are provided for detecting model type automatically.
@@ -133,6 +144,7 @@ huggingface_model_loader_configs = [
     ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
     ("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
     ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
+    ("LlavaForConditionalGeneration", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoMLLMEncoder"),
     ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
 ]
 patch_model_loader_configs = [
@@ -595,6 +607,25 @@ preset_models_on_modelscope = {
             "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
         ],
     },
+    "InfiniteYou":{
+        "file_list":[
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
+            ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/image_proj_model.bin", "models/InfiniteYou"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/1k3d68.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/2d106det.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/genderage.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/glintr100.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+            ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
+        ],
+        "load_path":[
+            [
+                "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
+                "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors"
+            ],
+            "models/InfiniteYou/image_proj_model.bin",
+        ],
+    },
     # ESRGAN
     "ESRGAN_x4": [
         ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
@@ -675,6 +706,25 @@ preset_models_on_modelscope = {
             "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
         ],
    },
+    "HunyuanVideoI2V":{
+        "file_list": [
+            ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideoI2V/text_encoder"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00001-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00002-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00003-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00004-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "config.json", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model.safetensors.index.json", "models/HunyuanVideoI2V/text_encoder_2"),
+            ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/vae/pytorch_model.pt", "models/HunyuanVideoI2V/vae"),
+            ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideoI2V/transformers")
+        ],
+        "load_path": [
+            "models/HunyuanVideoI2V/text_encoder/model.safetensors",
+            "models/HunyuanVideoI2V/text_encoder_2",
+            "models/HunyuanVideoI2V/vae/pytorch_model.pt",
+            "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
+        ],
+    },
     "HunyuanVideo-fp8":{
         "file_list": [
             ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
@@ -735,6 +785,7 @@ Preset_model_id: TypeAlias = Literal[
     "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
     "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
     "InstantX/FLUX.1-dev-IP-Adapter",
+    "InfiniteYou",
     "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
     "QwenPrompt",
     "OmostPrompt",
@@ -751,4 +802,5 @@ Preset_model_id: TypeAlias = Literal[
     "StableDiffusion3.5-medium",
     "HunyuanVideo",
     "HunyuanVideo-fp8",
+    "HunyuanVideoI2V",
 ]
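
The new "InfiniteYou" preset above maps a ModelScope file_list to the local load_path layout. A minimal usage sketch (not part of this diff; assumes DiffSynth-Studio's download_models and ModelManager APIs with their default arguments):

from diffsynth import ModelManager, download_models

download_models(["InfiniteYou"])  # fetches the InfuseNet shards, image_proj_model.bin, and insightface ONNX files listed above
model_manager = ModelManager()
model_manager.load_models([
    [
        "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
        "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors",
    ],
    "models/InfiniteYou/image_proj_model.bin",
])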

--- diffsynth-1.1.2/diffsynth/controlnets/processors.py
+++ diffsynth-1.1.7/diffsynth/controlnets/processors.py
@@ -1,10 +1,4 @@
 from typing_extensions import Literal, TypeAlias
-import warnings
-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    from controlnet_aux.processor import (
-        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector, NormalBaeDetector
-    )
 
 
 Processor_id: TypeAlias = Literal[
@@ -15,18 +9,25 @@ class Annotator:
     def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda', skip_processor=False):
         if not skip_processor:
             if processor_id == "canny":
+                from controlnet_aux.processor import CannyDetector
                 self.processor = CannyDetector()
             elif processor_id == "depth":
+                from controlnet_aux.processor import MidasDetector
                 self.processor = MidasDetector.from_pretrained(model_path).to(device)
             elif processor_id == "softedge":
+                from controlnet_aux.processor import HEDdetector
                 self.processor = HEDdetector.from_pretrained(model_path).to(device)
             elif processor_id == "lineart":
+                from controlnet_aux.processor import LineartDetector
                 self.processor = LineartDetector.from_pretrained(model_path).to(device)
             elif processor_id == "lineart_anime":
+                from controlnet_aux.processor import LineartAnimeDetector
                 self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "openpose":
+                from controlnet_aux.processor import OpenposeDetector
                 self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "normal":
+                from controlnet_aux.processor import NormalBaeDetector
                 self.processor = NormalBaeDetector.from_pretrained(model_path).to(device)
             elif processor_id == "tile" or processor_id == "none" or processor_id == "inpaint":
                 self.processor = None
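
The refactor above moves each controlnet_aux import into the branch that uses it, so importing diffsynth.controlnets no longer pays for (or fails on) detectors that are never used. A hedged usage sketch (input file name hypothetical):

from diffsynth.controlnets.processors import Annotator
from PIL import Image

annotator = Annotator("canny")  # only triggers `from controlnet_aux.processor import CannyDetector`
edges = annotator.processor(Image.open("input.png"))  # controlnet_aux detectors are callable on PIL images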

--- /dev/null
+++ diffsynth-1.1.7/diffsynth/distributed/xdit_context_parallel.py
@@ -0,0 +1,129 @@
+import torch
+from typing import Optional
+from einops import rearrange
+from xfuser.core.distributed import (get_sequence_parallel_rank,
+                                     get_sequence_parallel_world_size,
+                                     get_sp_group)
+from xfuser.core.long_ctx_attention import xFuserLongContextAttention
+
+def sinusoidal_embedding_1d(dim, position):
+    sinusoid = torch.outer(position.type(torch.float64), torch.pow(
+        10000, -torch.arange(dim//2, dtype=torch.float64, device=position.device).div(dim//2)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    return x.to(position.dtype)
+
+def pad_freqs(original_tensor, target_len):
+    seq_len, s1, s2 = original_tensor.shape
+    pad_size = target_len - seq_len
+    padding_tensor = torch.ones(
+        pad_size,
+        s1,
+        s2,
+        dtype=original_tensor.dtype,
+        device=original_tensor.device)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    return padded_tensor
+
+def rope_apply(x, freqs, num_heads):
+    x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
+    s_per_rank = x.shape[1]
+
+    x_out = torch.view_as_complex(x.to(torch.float64).reshape(
+        x.shape[0], x.shape[1], x.shape[2], -1, 2))
+
+    sp_size = get_sequence_parallel_world_size()
+    sp_rank = get_sequence_parallel_rank()
+    freqs = pad_freqs(freqs, s_per_rank * sp_size)
+    freqs_rank = freqs[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]
+
+    x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
+    return x_out.to(x.dtype)
+
+def usp_dit_forward(self,
+                    x: torch.Tensor,
+                    timestep: torch.Tensor,
+                    context: torch.Tensor,
+                    clip_feature: Optional[torch.Tensor] = None,
+                    y: Optional[torch.Tensor] = None,
+                    use_gradient_checkpointing: bool = False,
+                    use_gradient_checkpointing_offload: bool = False,
+                    **kwargs,
+                    ):
+    t = self.time_embedding(
+        sinusoidal_embedding_1d(self.freq_dim, timestep))
+    t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
+    context = self.text_embedding(context)
+
+    if self.has_image_input:
+        x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
+        clip_embdding = self.img_emb(clip_feature)
+        context = torch.cat([clip_embdding, context], dim=1)
+
+    x, (f, h, w) = self.patchify(x)
+
+    freqs = torch.cat([
+        self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+        self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+        self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
+    ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)
+
+    def create_custom_forward(module):
+        def custom_forward(*inputs):
+            return module(*inputs)
+        return custom_forward
+
+    # Context Parallel
+    x = torch.chunk(
+        x, get_sequence_parallel_world_size(),
+        dim=1)[get_sequence_parallel_rank()]
+
+    for block in self.blocks:
+        if self.training and use_gradient_checkpointing:
+            if use_gradient_checkpointing_offload:
+                with torch.autograd.graph.save_on_cpu():
+                    x = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        x, context, t_mod, freqs,
+                        use_reentrant=False,
+                    )
+            else:
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    x, context, t_mod, freqs,
+                    use_reentrant=False,
+                )
+        else:
+            x = block(x, context, t_mod, freqs)
+
+    x = self.head(x, t)
+
+    # Context Parallel
+    x = get_sp_group().all_gather(x, dim=1)
+
+    # unpatchify
+    x = self.unpatchify(x, (f, h, w))
+    return x
+
+
+def usp_attn_forward(self, x, freqs):
+    q = self.norm_q(self.q(x))
+    k = self.norm_k(self.k(x))
+    v = self.v(x)
+
+    q = rope_apply(q, freqs, self.num_heads)
+    k = rope_apply(k, freqs, self.num_heads)
+    q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
+    k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
+    v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
+
+    x = xFuserLongContextAttention()(
+        None,
+        query=q,
+        key=k,
+        value=v,
+    )
+    x = x.flatten(2)
+
+    del q, k, v
+    torch.cuda.empty_cache()
+    return self.o(x)
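
This new file implements xDiT/USP context parallelism for the Wan DiT: after patchify, each rank keeps one chunk of the token sequence, rope_apply slices the (padded) RoPE table to that rank's chunk, and the head outputs are all-gathered back into the full sequence. A single-process sketch of that split/slice/gather arithmetic (pure torch; world size and shapes hypothetical):

import torch

sp_size, s, d = 4, 16, 8                      # hypothetical world size, sequence length, feature dim
x = torch.randn(1, s, d)
freqs = torch.randn(s, 1, d // 2)             # stand-in for the full-sequence RoPE table

chunks = torch.chunk(x, sp_size, dim=1)       # what each rank keeps after patchify
s_per_rank = chunks[0].shape[1]
outputs = []
for rank, x_rank in enumerate(chunks):
    # the per-rank slice rope_apply takes from the padded frequency table
    freqs_rank = freqs[rank * s_per_rank:(rank + 1) * s_per_rank]
    assert freqs_rank.shape[0] == x_rank.shape[1]
    outputs.append(x_rank)                    # stand-in for this rank's transformer blocks + head
full = torch.cat(outputs, dim=1)              # stand-in for get_sp_group().all_gather(x, dim=1)
assert torch.equal(full, x)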

--- diffsynth-1.1.2/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py
+++ diffsynth-1.1.7/diffsynth/extensions/ImageQualityMetric/open_clip/factory.py
@@ -5,7 +5,7 @@ import pathlib
 import re
 from copy import deepcopy
 from pathlib import Path
-from turtle import forward
+# from turtle import forward
 from typing import Any, Dict, Optional, Tuple, Union
 
 import torch

--- diffsynth-1.1.2/diffsynth/models/flux_controlnet.py
+++ diffsynth-1.1.7/diffsynth/models/flux_controlnet.py
@@ -318,6 +318,8 @@ class FluxControlNetStateDictConverter:
             extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4}
         elif hash_value == "0cfd1740758423a2a854d67c136d1e8c":
             extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
+        elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16":
+            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10}
         else:
             extra_kwargs = {}
         return state_dict_, extra_kwargs
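
For context: DiffSynth dispatches these extra_kwargs on a hash of the checkpoint's state-dict layout, so supporting a new ControlNet variant amounts to one hash-to-architecture entry like the pair added here. An illustrative sketch (the hashing helper below is hypothetical, not DiffSynth's actual utility):

import hashlib

def layout_hash(state_dict):
    # hypothetical stand-in for DiffSynth's state-dict hashing
    return hashlib.md5(",".join(sorted(state_dict.keys())).encode()).hexdigest()

EXTRA_KWARGS = {
    "0cfd1740758423a2a854d67c136d1e8c": {"num_joint_blocks": 4, "num_single_blocks": 1},
    "7f9583eb8ba86642abb9a21a4b2c9e16": {"num_joint_blocks": 4, "num_single_blocks": 10},  # added in 1.1.7
}

def resolve_extra_kwargs(state_dict):
    return EXTRA_KWARGS.get(layout_hash(state_dict), {})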

--- diffsynth-1.1.2/diffsynth/models/flux_dit.py
+++ diffsynth-1.1.7/diffsynth/models/flux_dit.py
@@ -628,19 +628,22 @@ class FluxDiTStateDictConverter:
         else:
             pass
         for name in list(state_dict_.keys()):
-            if ".proj_in_besides_attn." in name:
-                name_ = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
+            if "single_blocks." in name and ".a_to_q." in name:
+                mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
+                if mlp is None:
+                    mlp = torch.zeros(4 * state_dict_[name].shape[0],
+                                      *state_dict_[name].shape[1:],
+                                      dtype=state_dict_[name].dtype)
+                else:
+                    state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
                 param = torch.concat([
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_q.")],
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_k.")],
-                    state_dict_[name.replace(".proj_in_besides_attn.", f".a_to_v.")],
-                    state_dict_[name],
+                    state_dict_.pop(name),
+                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
+                    state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
+                    mlp,
                 ], dim=0)
+                name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
                 state_dict_[name_] = param
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_q."))
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_k."))
-                state_dict_.pop(name.replace(".proj_in_besides_attn.", f".a_to_v."))
-                state_dict_.pop(name)
         for name in list(state_dict_.keys()):
             for component in ["a", "b"]:
                 if f".{component}_to_q." in name:
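
The rewritten loop keys on the separate a_to_q/k/v tensors instead of requiring proj_in_besides_attn, zero-filling the MLP slice when a checkpoint lacks it. A toy sketch of the packing it performs (dimensions hypothetical):

import torch

d = 8                                         # hypothetical hidden size
w_q, w_k, w_v = (torch.randn(d, d) for _ in range(3))
w_mlp = None                                  # a checkpoint without the fused-MLP weight
if w_mlp is None:
    w_mlp = torch.zeros(4 * d, d, dtype=w_q.dtype)  # same zero-fill as the converter
to_qkv_mlp = torch.concat([w_q, w_k, w_v, w_mlp], dim=0)
assert to_qkv_mlp.shape == (7 * d, d)         # 3*d for q/k/v plus 4*d for the MLP input projection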

--- /dev/null
+++ diffsynth-1.1.7/diffsynth/models/flux_infiniteyou.py
@@ -0,0 +1,128 @@
+import math
+import torch
+import torch.nn as nn
+
+
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class InfiniteYouImageProjector(nn.Module):
+
+    def __init__(
+        self,
+        dim=1280,
+        depth=4,
+        dim_head=64,
+        heads=20,
+        num_queries=8,
+        embedding_dim=512,
+        output_dim=4096,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList([
+                    PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                    FeedForward(dim=dim, mult=ff_mult),
+                ]))
+
+    def forward(self, x):
+
+        latents = self.latents.repeat(x.size(0), 1, 1)
+
+        x = self.proj_in(x)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxInfiniteYouImageProjectorStateDictConverter()
+
+
+class FluxInfiniteYouImageProjectorStateDictConverter:
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        return state_dict['image_proj']
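
A quick shape check for the projector added above (a sketch using the constructor defaults): it resamples a batch of 512-dim identity embeddings into num_queries learned tokens of width 4096, the width of FLUX's text embeddings.

import torch
from diffsynth.models.flux_infiniteyou import InfiniteYouImageProjector

projector = InfiniteYouImageProjector()  # defaults: dim=1280, depth=4, num_queries=8, embedding_dim=512, output_dim=4096
id_embeds = torch.randn(2, 1, 512)       # hypothetical (batch, tokens, embedding_dim) face-identity features
tokens = projector(id_embeds)
print(tokens.shape)                      # torch.Size([2, 8, 4096])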