hcpdiff 0.9.1__py3-none-any.whl → 2.2__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- hcpdiff/__init__.py +4 -4
- hcpdiff/ckpt_manager/__init__.py +4 -5
- hcpdiff/ckpt_manager/ckpt.py +24 -0
- hcpdiff/ckpt_manager/format/__init__.py +4 -0
- hcpdiff/ckpt_manager/format/diffusers.py +59 -0
- hcpdiff/ckpt_manager/format/emb.py +21 -0
- hcpdiff/ckpt_manager/format/lora_webui.py +252 -0
- hcpdiff/ckpt_manager/format/sd_single.py +41 -0
- hcpdiff/ckpt_manager/loader.py +64 -0
- hcpdiff/data/__init__.py +4 -28
- hcpdiff/data/cache/__init__.py +1 -0
- hcpdiff/data/cache/vae.py +102 -0
- hcpdiff/data/dataset.py +20 -0
- hcpdiff/data/handler/__init__.py +3 -0
- hcpdiff/data/handler/controlnet.py +18 -0
- hcpdiff/data/handler/diffusion.py +90 -0
- hcpdiff/data/handler/text.py +111 -0
- hcpdiff/data/source/__init__.py +3 -3
- hcpdiff/data/source/folder_class.py +12 -29
- hcpdiff/data/source/text.py +40 -0
- hcpdiff/data/source/text2img.py +36 -74
- hcpdiff/data/source/text2img_cond.py +9 -15
- hcpdiff/diffusion/__init__.py +0 -0
- hcpdiff/diffusion/noise/__init__.py +2 -0
- hcpdiff/diffusion/noise/pyramid_noise.py +42 -0
- hcpdiff/diffusion/noise/zero_terminal.py +39 -0
- hcpdiff/diffusion/sampler/__init__.py +5 -0
- hcpdiff/diffusion/sampler/base.py +72 -0
- hcpdiff/diffusion/sampler/ddpm.py +20 -0
- hcpdiff/diffusion/sampler/diffusers.py +66 -0
- hcpdiff/diffusion/sampler/edm.py +22 -0
- hcpdiff/diffusion/sampler/sigma_scheduler/__init__.py +3 -0
- hcpdiff/diffusion/sampler/sigma_scheduler/base.py +14 -0
- hcpdiff/diffusion/sampler/sigma_scheduler/ddpm.py +197 -0
- hcpdiff/diffusion/sampler/sigma_scheduler/edm.py +48 -0
- hcpdiff/easy/__init__.py +2 -0
- hcpdiff/easy/cfg/__init__.py +3 -0
- hcpdiff/easy/cfg/sd15_train.py +207 -0
- hcpdiff/easy/cfg/sdxl_train.py +147 -0
- hcpdiff/easy/cfg/t2i.py +228 -0
- hcpdiff/easy/model/__init__.py +2 -0
- hcpdiff/easy/model/cnet.py +31 -0
- hcpdiff/easy/model/loader.py +79 -0
- hcpdiff/easy/sampler.py +46 -0
- hcpdiff/evaluate/__init__.py +1 -0
- hcpdiff/evaluate/previewer.py +60 -0
- hcpdiff/loss/__init__.py +4 -1
- hcpdiff/loss/base.py +41 -0
- hcpdiff/loss/gw.py +35 -0
- hcpdiff/loss/ssim.py +37 -0
- hcpdiff/loss/vlb.py +79 -0
- hcpdiff/loss/weighting.py +66 -0
- hcpdiff/models/__init__.py +2 -2
- hcpdiff/models/cfg_context.py +17 -14
- hcpdiff/models/compose/compose_hook.py +44 -23
- hcpdiff/models/compose/compose_tokenizer.py +21 -8
- hcpdiff/models/compose/sdxl_composer.py +4 -4
- hcpdiff/models/controlnet.py +16 -16
- hcpdiff/models/lora_base_patch.py +14 -25
- hcpdiff/models/lora_layers.py +3 -9
- hcpdiff/models/lora_layers_patch.py +14 -24
- hcpdiff/models/text_emb_ex.py +84 -6
- hcpdiff/models/textencoder_ex.py +54 -18
- hcpdiff/models/wrapper/__init__.py +3 -0
- hcpdiff/models/wrapper/pixart.py +19 -0
- hcpdiff/models/wrapper/sd.py +218 -0
- hcpdiff/models/wrapper/utils.py +20 -0
- hcpdiff/parser/__init__.py +1 -0
- hcpdiff/parser/embpt.py +32 -0
- hcpdiff/tools/convert_caption_txt2json.py +1 -1
- hcpdiff/tools/dataset_generator.py +94 -0
- hcpdiff/tools/download_hf_model.py +24 -0
- hcpdiff/tools/init_proj.py +3 -21
- hcpdiff/tools/lora_convert.py +18 -17
- hcpdiff/tools/save_model.py +12 -0
- hcpdiff/tools/sd2diffusers.py +1 -1
- hcpdiff/train_colo.py +1 -1
- hcpdiff/train_deepspeed.py +1 -1
- hcpdiff/trainer_ac.py +79 -0
- hcpdiff/trainer_ac_single.py +31 -0
- hcpdiff/utils/__init__.py +0 -2
- hcpdiff/utils/inpaint_pipe.py +7 -2
- hcpdiff/utils/net_utils.py +29 -6
- hcpdiff/utils/pipe_hook.py +24 -7
- hcpdiff/utils/utils.py +21 -4
- hcpdiff/workflow/__init__.py +15 -10
- hcpdiff/workflow/daam/__init__.py +1 -0
- hcpdiff/workflow/daam/act.py +66 -0
- hcpdiff/workflow/daam/hook.py +109 -0
- hcpdiff/workflow/diffusion.py +118 -128
- hcpdiff/workflow/fast.py +31 -0
- hcpdiff/workflow/flow.py +67 -0
- hcpdiff/workflow/io.py +36 -130
- hcpdiff/workflow/model.py +46 -43
- hcpdiff/workflow/text.py +60 -47
- hcpdiff/workflow/utils.py +32 -12
- hcpdiff/workflow/vae.py +37 -38
- hcpdiff-2.2.dist-info/METADATA +299 -0
- hcpdiff-2.2.dist-info/RECORD +115 -0
- {hcpdiff-0.9.1.dist-info → hcpdiff-2.2.dist-info}/WHEEL +1 -1
- hcpdiff-2.2.dist-info/entry_points.txt +5 -0
- hcpdiff/ckpt_manager/base.py +0 -16
- hcpdiff/ckpt_manager/ckpt_diffusers.py +0 -45
- hcpdiff/ckpt_manager/ckpt_pkl.py +0 -138
- hcpdiff/ckpt_manager/ckpt_safetensor.py +0 -64
- hcpdiff/ckpt_manager/ckpt_webui.py +0 -54
- hcpdiff/data/bucket.py +0 -358
- hcpdiff/data/caption_loader.py +0 -80
- hcpdiff/data/cond_dataset.py +0 -40
- hcpdiff/data/crop_info_dataset.py +0 -40
- hcpdiff/data/data_processor.py +0 -33
- hcpdiff/data/pair_dataset.py +0 -146
- hcpdiff/data/sampler.py +0 -54
- hcpdiff/data/source/base.py +0 -30
- hcpdiff/data/utils.py +0 -80
- hcpdiff/deprecated/__init__.py +0 -1
- hcpdiff/deprecated/cfg_converter.py +0 -81
- hcpdiff/deprecated/lora_convert.py +0 -31
- hcpdiff/infer_workflow.py +0 -57
- hcpdiff/loggers/__init__.py +0 -13
- hcpdiff/loggers/base_logger.py +0 -76
- hcpdiff/loggers/cli_logger.py +0 -40
- hcpdiff/loggers/preview/__init__.py +0 -1
- hcpdiff/loggers/preview/image_previewer.py +0 -149
- hcpdiff/loggers/tensorboard_logger.py +0 -30
- hcpdiff/loggers/wandb_logger.py +0 -31
- hcpdiff/loggers/webui_logger.py +0 -9
- hcpdiff/loss/min_snr_loss.py +0 -52
- hcpdiff/models/layers.py +0 -81
- hcpdiff/models/plugin.py +0 -348
- hcpdiff/models/wrapper.py +0 -75
- hcpdiff/noise/__init__.py +0 -3
- hcpdiff/noise/noise_base.py +0 -16
- hcpdiff/noise/pyramid_noise.py +0 -50
- hcpdiff/noise/zero_terminal.py +0 -44
- hcpdiff/train_ac.py +0 -566
- hcpdiff/train_ac_single.py +0 -39
- hcpdiff/utils/caption_tools.py +0 -105
- hcpdiff/utils/cfg_net_tools.py +0 -321
- hcpdiff/utils/cfg_resolvers.py +0 -16
- hcpdiff/utils/ema.py +0 -52
- hcpdiff/utils/img_size_tool.py +0 -248
- hcpdiff/vis/__init__.py +0 -3
- hcpdiff/vis/base_interface.py +0 -12
- hcpdiff/vis/disk_interface.py +0 -48
- hcpdiff/vis/webui_interface.py +0 -17
- hcpdiff/viser_fast.py +0 -138
- hcpdiff/visualizer.py +0 -265
- hcpdiff/visualizer_reloadable.py +0 -237
- hcpdiff/workflow/base.py +0 -59
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/anime/text2img_anime.yaml +0 -21
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/anime/text2img_anime_lora.yaml +0 -58
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/change_vae.yaml +0 -6
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/euler_a.yaml +0 -8
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/img2img.yaml +0 -10
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/img2img_controlnet.yaml +0 -19
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/inpaint.yaml +0 -11
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/load_lora.yaml +0 -26
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/load_unet_part.yaml +0 -18
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/offload_2GB.yaml +0 -6
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/save_model.yaml +0 -44
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/text2img.yaml +0 -53
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/text2img_DA++.yaml +0 -34
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/infer/text2img_sdxl.yaml +0 -9
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/plugins/plugin_controlnet.yaml +0 -17
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/te_struct.txt +0 -193
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/dataset/base_dataset.yaml +0 -29
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/dataset/regularization_dataset.yaml +0 -31
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/CustomDiffusion.yaml +0 -74
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/DreamArtist++.yaml +0 -135
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/DreamArtist.yaml +0 -45
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/DreamBooth.yaml +0 -62
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/FT_sdxl.yaml +0 -33
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/Lion_optimizer.yaml +0 -17
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/TextualInversion.yaml +0 -41
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/add_logger_tensorboard_wandb.yaml +0 -15
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/controlnet.yaml +0 -53
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/ema.yaml +0 -10
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/fine-tuning.yaml +0 -53
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/locon.yaml +0 -24
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/lora_anime_character.yaml +0 -77
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/lora_conventional.yaml +0 -56
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/lora_sdxl.yaml +0 -41
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/min_snr.yaml +0 -7
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples/preview_in_training.yaml +0 -6
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples_noob/DreamBooth.yaml +0 -70
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples_noob/TextualInversion.yaml +0 -45
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples_noob/fine-tuning.yaml +0 -45
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/examples_noob/lora.yaml +0 -63
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/train_base.yaml +0 -81
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/train/tuning_base.yaml +0 -42
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/unet_struct.txt +0 -932
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/workflow/highres_fix_latent.yaml +0 -86
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/workflow/highres_fix_pixel.yaml +0 -99
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/workflow/text2img.yaml +0 -59
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/workflow/text2img_lora.yaml +0 -70
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/zero2.json +0 -32
- hcpdiff-0.9.1.data/data/hcpdiff/cfgs/zero3.json +0 -39
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/caption.txt +0 -1
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/name.txt +0 -1
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/name_2pt_caption.txt +0 -1
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/name_caption.txt +0 -1
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/object.txt +0 -27
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/object_caption.txt +0 -27
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/style.txt +0 -19
- hcpdiff-0.9.1.data/data/hcpdiff/prompt_tuning_template/style_caption.txt +0 -19
- hcpdiff-0.9.1.dist-info/METADATA +0 -199
- hcpdiff-0.9.1.dist-info/RECORD +0 -160
- hcpdiff-0.9.1.dist-info/entry_points.txt +0 -2
- {hcpdiff-0.9.1.dist-info → hcpdiff-2.2.dist-info/licenses}/LICENSE +0 -0
- {hcpdiff-0.9.1.dist-info → hcpdiff-2.2.dist-info}/top_level.txt +0 -0
hcpdiff/models/lora_layers_patch.py
CHANGED
@@ -8,19 +8,18 @@ lora_layers.py
 :Licence: Apache-2.0
 """
 
+import math
+
 import torch
-from einops import einsum
+from einops import einsum
 from torch import nn
 from torch.nn import functional as F
 
 from .lora_base_patch import LoraBlock, PatchPluginContainer
-from .layers import GroupLinear
-import math
-from typing import Union, List
 
 class LoraLayer(LoraBlock):
-    def __init__(self,
-        super().__init__(
+    def __init__(self, name: str, host, rank=1, dropout=0.0, alpha=1.0, bias=False, alpha_auto_scale=True, **kwargs):
+        super().__init__(name, host, rank, dropout, alpha=alpha, bias=bias, alpha_auto_scale=alpha_auto_scale, **kwargs)
 
     class LinearLayer(LoraBlock.LinearLayer):
        def __init__(self, host:nn.Linear, rank, bias, block):
@@ -99,6 +98,11 @@ class LoraLayer(LoraBlock):
            b = self.bias.data if self.bias else None
            return w, b
 
+def none_add(a, b):
+    if a is None:
+        return b
+    return a+b
+
 class DAPPPatchContainer(PatchPluginContainer):
     def forward(self, x, *args, **kwargs):
         weight_p = None
@@ -107,25 +111,11 @@ class DAPPPatchContainer(PatchPluginContainer):
         bias_n = None
         for name in self.plugin_names:
             if self[name].branch=='p':
-                if weight_p is None:
-                    weight_p = self[name].get_weight()
-                else:
-                    weight_p = weight_p + self[name].get_weight()
-
-                if bias_p is None:
-                    bias_p = self[name].get_bias()
-                else:
-                    bias_p = bias_p+self[name].get_bias()
+                weight_p = none_add(weight_p, self[name].get_weight())
+                bias_p = none_add(bias_p, self[name].get_bias())
             elif self[name].branch=='n':
-                if weight_n is None:
-                    weight_n = self[name].get_weight()
-                else:
-                    weight_n = weight_n + self[name].get_weight()
-
-                if bias_n is None:
-                    bias_n = self[name].get_bias()
-                else:
-                    bias_n = bias_n+self[name].get_bias()
+                weight_n = none_add(weight_n, self[name].get_weight())
+                bias_n = none_add(bias_n, self[name].get_bias())
 
         B = x.shape[0]//2
         x_p = self[name].post_forward(x[B:], self._host.weight, weight_p, self._host.bias, bias_p)
hcpdiff/models/text_emb_ex.py
CHANGED
@@ -7,16 +7,17 @@ text_emb_ex.py
 :Created: 10/03/2023
 :Licence: Apache-2.0
 """
-from typing import Tuple
+from typing import Tuple, Dict, Any
 
 import torch
 from torch import nn
 import os
-from
+from rainbowneko import _share
 from einops import rearrange, repeat
+import torch.nn.functional as F
 
 from ..utils.net_utils import load_emb
-from .plugin import SinglePluginBlock
+from rainbowneko.models.plugin import SinglePluginBlock
 
 class EmbeddingPTHook(SinglePluginBlock):
     def __init__(self, token_embedding:nn.Embedding, N_word=75, N_repeats=3):
@@ -37,6 +38,84 @@ class EmbeddingPTHook(SinglePluginBlock):
         self.input_ids = rearrange(input_ids[0], '(b r) w -> b (r w)', r=self.N_repeats) # compatible with attention mask
         return self.input_ids.clip(0, self.num_embeddings-1)
 
+    def forward(self, inputs_embeds:torch.Tensor, *args: Tuple[Any, ...], **kwargs: Dict[str, Any]):
+        '''
+        :param input_ids: [B, N_ids]
+        :param inputs_embeds: [B, N_repeat*(N_word+2), N_emb]
+        :return: [B, N_repeat, N_word+2, N_emb]
+        '''
+        rep_idxs_B = self.input_ids >= self.num_embeddings
+        BOS = repeat(inputs_embeds[:,0,:], 'b e -> b r 1 e', r=self.N_repeats)
+        EOS = repeat(inputs_embeds[:,-1,:], 'b e -> b r 1 e', r=self.N_repeats)
+
+        replaced_embeds = []
+        for i, (item, rep_idxs, ids_raw) in enumerate(zip(inputs_embeds, rep_idxs_B, self.input_ids)):
+            # insert pt to embeddings
+            rep_idxs=torch.where(rep_idxs)[0]
+            item_new=[]
+            rep_idx_last=0
+            for rep_idx in rep_idxs:
+                rep_idx=rep_idx.item()
+                item_new.append(item[rep_idx_last:rep_idx, :])
+                item_new.append(self.emb[ids_raw[rep_idx].item()].to(dtype=item.dtype))
+                rep_idx_last=rep_idx+1
+            item_new.append(item[rep_idx_last:, :])
+
+            # split to N_repeat sentence
+            replaced_item = torch.cat(item_new, dim=0)[1:self.N_word*self.N_repeats+1, :]
+            replaced_item = rearrange(replaced_item, '(r w) e -> r w e', r=self.N_repeats, w=self.N_word)
+            replaced_item = torch.cat([BOS[i], replaced_item, EOS[i]], dim=1) # [N_repeat, N_word+2, N_emb]
+
+            replaced_embeds.append(replaced_item)
+        return torch.cat(replaced_embeds, dim=0) # [B*N_repeat, N_word+2, N_emb]
+
+    def remove(self):
+        super(EmbeddingPTHook, self).remove()
+        self.handle_pre.remove()
+
+    @classmethod
+    def hook(cls, ex_words_emb, tokenizer, text_encoder, **kwargs):
+        word_list = list(ex_words_emb.keys())
+        tokenizer.add_tokens(word_list)
+        token_ids = tokenizer(' '.join(word_list)).input_ids[1:-1]
+
+        embedding_hook = cls(text_encoder.get_input_embeddings(), N_word=tokenizer.model_max_length-2, **kwargs)
+        #text_encoder.text_model.embeddings.token_embedding = embedding_hook
+        for tid, word in zip(token_ids, word_list):
+            embedding_hook.add_emb(ex_words_emb[word], tid)
+            _share.loggers.info(f'hook: {word}, len: {ex_words_emb[word].shape[0]}, id: {tid}')
+        return embedding_hook
+
+    @classmethod
+    def hook_from_dir(cls, emb_dir, tokenizer, text_encoder, device='cuda:0', **kwargs):
+        ex_words_emb = {file[:-3]: nn.Parameter(load_emb(os.path.join(emb_dir, file)).to(device), requires_grad=False)
+                        for file in os.listdir(emb_dir) if file.endswith('.pt')}
+        return cls.hook(ex_words_emb, tokenizer, text_encoder, **kwargs), ex_words_emb
+
+class EmbeddingPTInterpHook(SinglePluginBlock):
+    def __init__(self, token_embedding:nn.Embedding, N_word=75, N_repeats=3):
+        super().__init__('emb_ex', token_embedding)
+        self.handle_pre = token_embedding.register_forward_pre_hook(self.pre_hook)
+
+        new_len = int(token_embedding.num_embeddings*N_repeats)
+        original_weights = token_embedding.weight.data.unsqueeze(1)
+        token_embedding.weight.data = F.interpolate(original_weights, size=new_len, mode='linear', align_corners=False).squeeze(1)
+        token_embedding.num_embeddings = new_len
+
+        self.N_word=N_word
+        self.N_repeats=N_repeats
+        self.num_embeddings=token_embedding.num_embeddings
+        self.embedding_dim=token_embedding.embedding_dim
+        self.emb={}
+        self.emb_train=nn.ParameterList()
+
+    def add_emb(self, emb:nn.Parameter, token_id:int):
+        self.emb[token_id]=emb
+
+    def pre_hook(self, host, input_ids: Tuple[torch.Tensor]):
+        self.input_ids = rearrange(input_ids[0], '(b r) w -> b (r w)', r=self.N_repeats) # compatible with attention mask
+        return self.input_ids.clip(0, self.num_embeddings-1)
+
     def forward(self, fea_in:Tuple[torch.Tensor], inputs_embeds:torch.Tensor):
         '''
         :param input_ids: [B, N_ids]
@@ -83,12 +162,11 @@ class EmbeddingPTHook(SinglePluginBlock):
         for tid, word in zip(token_ids, word_list):
             embedding_hook.add_emb(ex_words_emb[word], tid)
             if log:
-                logger.info(f'hook: {word}, len: {ex_words_emb[word].shape[0]}, id: {tid}')
+                _share.logger.info(f'hook: {word}, len: {ex_words_emb[word].shape[0]}, id: {tid}')
         return embedding_hook
 
     @classmethod
     def hook_from_dir(cls, emb_dir, tokenizer, text_encoder, log=True, device='cuda:0', **kwargs):
         ex_words_emb = {file[:-3]: nn.Parameter(load_emb(os.path.join(emb_dir, file)).to(device), requires_grad=False)
                         for file in os.listdir(emb_dir) if file.endswith('.pt')}
-        return cls.hook(ex_words_emb, tokenizer, text_encoder, log, **kwargs), ex_words_emb
-
+        return cls.hook(ex_words_emb, tokenizer, text_encoder, log, **kwargs), ex_words_emb
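
A hedged usage sketch for the reworked hook: the model id and embedding directory below are assumptions for illustration. hook_from_dir loads every *.pt file, registers the file names as new tokens, and resolves token ids beyond the original vocabulary from the loaded embeddings inside forward:

from transformers import CLIPTokenizer, CLIPTextModel
from hcpdiff.models.text_emb_ex import EmbeddingPTHook

base = 'runwayml/stable-diffusion-v1-5'   # assumed model id
tokenizer = CLIPTokenizer.from_pretrained(base, subfolder='tokenizer')
text_encoder = CLIPTextModel.from_pretrained(base, subfolder='text_encoder')

# Returns the hook plus {word: embedding}; each embedding is an nn.Parameter
# loaded from '<emb_dir>/<word>.pt' via load_emb.
hook, ex_words_emb = EmbeddingPTHook.hook_from_dir('embs/', tokenizer, text_encoder, device='cpu')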
hcpdiff/models/textencoder_ex.py
CHANGED
@@ -8,29 +8,53 @@ textencoder_ex.py
 :Licence: Apache-2.0
 """
 
-from typing import Tuple, Optional
+from typing import Tuple, Optional
 
 import torch
 from einops import repeat, rearrange
 from einops.layers.torch import Rearrange
+from loguru import logger
 from torch import nn
+from transformers import CLIPTextModelWithProjection, T5EncoderModel
 from transformers.models.clip.modeling_clip import CLIPAttention
 
 class TEEXHook:
-    def __init__(self, text_enc: nn.Module, tokenizer, N_repeats=
+    def __init__(self, text_enc: nn.Module, tokenizer, N_repeats=1, clip_skip=0, clip_final_norm=True, use_attention_mask=False):
         self.text_enc = text_enc
         self.tokenizer = tokenizer
 
         self.N_repeats = N_repeats
         self.clip_skip = clip_skip
         self.clip_final_norm = clip_final_norm
-        self.device = device
-        self.attn_mult = None
         self.use_attention_mask = use_attention_mask
 
         text_enc.register_forward_hook(self.forward_hook)
         text_enc.register_forward_pre_hook(self.forward_hook_input)
 
+    def find_final_norm(self, text_enc: nn.Module):
+        for module in text_enc.modules():
+            if 'final_layer_norm' in module._modules:
+                logger.info(f'find final_layer_norm in {type(module)}')
+                return module.final_layer_norm
+
+        logger.info(f'final_layer_norm not found in {type(text_enc)}')
+        return None
+
+    @property
+    def clip_final_norm(self):
+        return self.final_layer_norm is not None
+
+    @clip_final_norm.setter
+    def clip_final_norm(self, value: bool):
+        if value:
+            self.final_layer_norm = self.find_final_norm(self.text_enc)
+        else:
+            self.final_layer_norm = None
+
+    @property
+    def device(self):
+        return self.text_enc.device
+
     def encode_prompt_to_emb(self, prompt):
         text_inputs = self.tokenizer(
             prompt,
@@ -50,12 +74,23 @@ class TEEXHook:
         if position_ids is not None:
             position_ids = position_ids.to(self.device)
 
-
-
-
-
-
-
+        # align with sd-webui
+        if isinstance(self.text_enc, CLIPTextModelWithProjection):
+            self.text_enc.text_projection.weight.data = self.text_enc.text_projection.weight.data.t()
+
+        if isinstance(self.text_enc, T5EncoderModel):
+            prompt_embeds, pooled_output = self.text_enc(
+                text_input_ids.to(self.device),
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+            )
+        else:
+            prompt_embeds, pooled_output = self.text_enc(
+                text_input_ids.to(self.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                output_hidden_states=True,
+            )
         return prompt_embeds, pooled_output, attention_mask
 
     def forward_hook_input(self, host, feat_in):
@@ -64,13 +99,12 @@ class TEEXHook:
 
     def forward_hook(self, host, feat_in: Tuple[torch.Tensor], feat_out):
         encoder_hidden_states = feat_out['hidden_states'][-self.clip_skip-1]
-        if self.clip_final_norm:
-            encoder_hidden_states = self.
+        if self.clip_final_norm and self.final_layer_norm is not None:
+            encoder_hidden_states = self.final_layer_norm(encoder_hidden_states)
         if self.text_enc.training and self.clip_skip>0:
             encoder_hidden_states = encoder_hidden_states+0*feat_out['last_hidden_state'].mean() # avoid unused parameters, make gradient checkpointing happy
-
         encoder_hidden_states = rearrange(encoder_hidden_states, '(b r) ... -> b r ...', r=self.N_repeats) # [B, N_repeat, N_word+2, N_emb]
-        pooled_output = feat_out.pooler_output
+        pooled_output = feat_out.get('pooler_output', feat_out.get('text_embeds', None))
         # TODO: may have better fusion method
         if pooled_output is not None:
             pooled_output = rearrange(pooled_output, '(b r) ... -> b r ...', r=self.N_repeats).mean(dim=1)
@@ -81,7 +115,7 @@ class TEEXHook:
         return encoder_hidden_states, pooled_output
 
     def pool_hidden_states(self, encoder_hidden_states, input_ids):
-        pooled_output = encoder_hidden_states[:, :, -1, :].mean(dim=1)
+        pooled_output = encoder_hidden_states[:, :, -1, :].mean(dim=1) # [B, N_emb]
         return pooled_output
 
     @staticmethod
@@ -147,9 +181,11 @@ class TEEXHook:
         layer.forward = forward
 
     @classmethod
-    def hook(cls, text_enc: nn.Module, tokenizer, N_repeats=3, clip_skip=0, clip_final_norm=True,
-        return cls(text_enc, tokenizer, N_repeats=N_repeats, clip_skip=clip_skip, clip_final_norm=clip_final_norm,
+    def hook(cls, text_enc: nn.Module, tokenizer, N_repeats=3, clip_skip=0, clip_final_norm=True, use_attention_mask=False):
+        return cls(text_enc, tokenizer, N_repeats=N_repeats, clip_skip=clip_skip, clip_final_norm=clip_final_norm,
+                   use_attention_mask=use_attention_mask)
 
     @classmethod
     def hook_pipe(cls, pipe, N_repeats=3, clip_skip=0, clip_final_norm=True, use_attention_mask=False):
-        return cls(pipe.text_encoder, pipe.tokenizer, N_repeats=N_repeats,
+        return cls(pipe.text_encoder, pipe.tokenizer, N_repeats=N_repeats, clip_skip=clip_skip, clip_final_norm=clip_final_norm,
+                   use_attention_mask=use_attention_mask)
hcpdiff/models/wrapper/pixart.py
ADDED
@@ -0,0 +1,19 @@
+from .sd import SD15Wrapper
+from hcpdiff.utils import pad_attn_bias
+
+class PixArtWrapper(SD15Wrapper):
+    def forward_denoiser(self, x_t, prompt_ids, encoder_hidden_states, timesteps, attn_mask=None, position_ids=None, resolution=None, aspect_ratio=None,
+                         plugin_input={}, **kwargs):
+        if attn_mask is not None:
+            attn_mask[:, :self.min_attnmask] = 1
+            encoder_hidden_states, attn_mask = pad_attn_bias(encoder_hidden_states, attn_mask)
+
+        input_all = dict(prompt_ids=prompt_ids, timesteps=timesteps, position_ids=position_ids, attn_mask=attn_mask,
+                         encoder_hidden_states=encoder_hidden_states, **plugin_input)
+        if hasattr(self.denoiser, 'input_feeder'):
+            for feeder in self.denoiser.input_feeder:
+                feeder(input_all)
+        added_cond_kwargs = {"resolution":resolution, "aspect_ratio":aspect_ratio}
+        model_pred = self.denoiser(x_t, encoder_hidden_states, timesteps, encoder_attention_mask=attn_mask,
+                                   added_cond_kwargs=added_cond_kwargs).sample # Predict the noise residual
+        return model_pred
hcpdiff/models/wrapper/sd.py
ADDED
@@ -0,0 +1,218 @@
+from contextlib import nullcontext
+from functools import partial
+from typing import Dict, Union
+
+import torch
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from rainbowneko.models.wrapper import BaseWrapper
+from torch import Tensor
+from torch import nn
+
+from hcpdiff.diffusion.sampler import BaseSampler
+from hcpdiff.models import TEEXHook
+from hcpdiff.models.compose import ComposeTEEXHook
+from hcpdiff.utils import pad_attn_bias
+from .utils import TEHookCFG, SD15_TEHookCFG, SDXL_TEHookCFG
+from ..cfg_context import CFGContext
+
+class SD15Wrapper(BaseWrapper):
+    def __init__(self, denoiser: UNet2DConditionModel, TE, vae: AutoencoderKL, noise_sampler: BaseSampler, tokenizer, min_attnmask=0,
+                 pred_type='eps', TE_hook_cfg:TEHookCFG=SD15_TEHookCFG, cfg_context=CFGContext(), key_map_in=None, key_map_out=None):
+        super().__init__()
+        self.key_mapper_in = self.build_mapper(key_map_in, None, (
+            'prompt -> prompt_ids', 'image -> image', 'attn_mask -> attn_mask', 'position_ids -> position_ids', 'neg_prompt -> neg_prompt_ids',
+            'neg_attn_mask -> neg_attn_mask', 'neg_position_ids -> neg_position_ids', 'plugin_input -> plugin_input'))
+        self.key_mapper_out = self.build_mapper(key_map_out, None, None)
+
+        self.denoiser = denoiser
+        self.TE = TE
+        self.vae = vae
+        self.noise_sampler = noise_sampler
+        self.tokenizer = tokenizer
+        self.min_attnmask = min_attnmask
+
+        self.pred_type = pred_type
+
+        self.TE_hook_cfg = TEHookCFG.create(TE_hook_cfg)
+        self.cfg_context = cfg_context
+        self.tokenizer.N_repeats = self.TE_hook_cfg.tokenizer_repeats
+
+    def post_init(self):
+        self.make_TE_hook(self.TE_hook_cfg)
+
+        self.vae_trainable = False
+        if self.vae is not None:
+            for p in self.vae.parameters():
+                if p.requires_grad:
+                    self.vae_trainable = True
+                    break
+
+        self.TE_trainable = False
+        for p in self.TE.parameters():
+            if p.requires_grad:
+                self.TE_trainable = True
+                break
+
+    def make_TE_hook(self, TE_hook_cfg):
+        # Hook and extend text_encoder
+        self.text_enc_hook = TEEXHook.hook(self.TE, self.tokenizer, N_repeats=TE_hook_cfg.tokenizer_repeats,
+                                           clip_skip=TE_hook_cfg.clip_skip, clip_final_norm=TE_hook_cfg.clip_final_norm)
+
+    def get_latents(self, image: Tensor):
+        if image.shape[1] == 3:
+            with torch.no_grad() if self.vae_trainable else nullcontext():
+                latents = self.vae.encode(image.to(dtype=self.vae.dtype)).latent_dist.sample()
+                latents = latents*self.vae.config.scaling_factor
+        else:
+            latents = image # Cached latents
+        return latents
+
+    def forward_TE(self, prompt_ids, timesteps, attn_mask=None, position_ids=None, plugin_input={}, **kwargs):
+        input_all = dict(prompt_ids=prompt_ids, timesteps=timesteps, position_ids=position_ids, attn_mask=attn_mask, **plugin_input)
+        if hasattr(self.TE, 'input_feeder'):
+            for feeder in self.TE.input_feeder:
+                feeder(input_all)
+        # Get the text embedding for conditioning
+        encoder_hidden_states = self.TE(prompt_ids, position_ids=position_ids, attention_mask=attn_mask, output_hidden_states=True)[0]
+        return encoder_hidden_states
+
+    def forward_denoiser(self, x_t, prompt_ids, encoder_hidden_states, timesteps, attn_mask=None, position_ids=None, plugin_input={}, **kwargs):
+        if attn_mask is not None:
+            attn_mask[:, :self.min_attnmask] = 1
+            encoder_hidden_states, attn_mask = pad_attn_bias(encoder_hidden_states, attn_mask)
+
+        input_all = dict(prompt_ids=prompt_ids, timesteps=timesteps, position_ids=position_ids, attn_mask=attn_mask,
+                         encoder_hidden_states=encoder_hidden_states, **plugin_input)
+        if hasattr(self.denoiser, 'input_feeder'):
+            for feeder in self.denoiser.input_feeder:
+                feeder(input_all)
+        model_pred = self.denoiser(x_t, timesteps, encoder_hidden_states, encoder_attention_mask=attn_mask).sample # Predict the noise residual
+        return model_pred
+
+    def model_forward(self, prompt_ids, image, attn_mask=None, position_ids=None, neg_prompt_ids=None, neg_attn_mask=None, neg_position_ids=None,
+                      plugin_input={}, **kwargs):
+        # input prepare
+        x_0 = self.get_latents(image)
+        x_t, noise, sigma, timesteps = self.noise_sampler.add_noise_rand_t(x_0)
+        x_t_in = x_t*self.noise_sampler.c_in(sigma).to(dtype=x_t.dtype)
+
+        if neg_prompt_ids:
+            prompt_ids = torch.cat([neg_prompt_ids, prompt_ids], dim=0)
+        if neg_attn_mask:
+            attn_mask = torch.cat([neg_attn_mask, attn_mask], dim=0)
+        if neg_position_ids:
+            position_ids = torch.cat([neg_position_ids, position_ids], dim=0)
+
+        # model forward
+        x_t_in, timesteps = self.cfg_context.pre(x_t_in, timesteps)
+        encoder_hidden_states = self.forward_TE(prompt_ids, timesteps, attn_mask=attn_mask, position_ids=position_ids,
+                                                plugin_input=plugin_input, **kwargs)
+        model_pred = self.forward_denoiser(x_t_in, prompt_ids, encoder_hidden_states, timesteps, attn_mask=attn_mask, position_ids=position_ids,
+                                           plugin_input=plugin_input, **kwargs)
+        model_pred = self.cfg_context.post(model_pred)
+
+        return dict(model_pred=model_pred, noise=noise, sigma=sigma, timesteps=timesteps, x_0=x_0, x_t=x_t, pred_type=self.pred_type,
+                    noise_sampler=self.noise_sampler)
+
+    def forward(self, ds_name=None, **kwargs):
+        model_args, model_kwargs = self.get_map_data(self.key_mapper_in, kwargs, ds_name)
+        out = self.model_forward(*model_args, **model_kwargs)
+        return self.get_map_data(self.key_mapper_out, out, ds_name=ds_name)[1]
+
+    def enable_gradient_checkpointing(self):
+        def grad_ckpt_enable(m):
+            if getattr(m, 'gradient_checkpointing', False):
+                m.training = True
+
+        self.denoiser.enable_gradient_checkpointing()
+        if self.TE_trainable:
+            self.TE.gradient_checkpointing_enable()
+        self.apply(grad_ckpt_enable)
+
+    def enable_xformers(self):
+        self.denoiser.enable_xformers_memory_efficient_attention()
+
+    @property
+    def trainable_parameters(self):
+        return [p for p in self.parameters() if p.requires_grad]
+
+    @property
+    def trainable_models(self) -> Dict[str, nn.Module]:
+        return {'self':self}
+
+    def set_dtype(self, dtype, vae_dtype):
+        self.dtype = dtype
+        self.vae_dtype = vae_dtype
+        # Move vae and text_encoder to device and cast to weight_dtype
+        if self.vae is not None:
+            self.vae = self.vae.to(dtype=vae_dtype)
+        if not self.TE_trainable:
+            self.TE = self.TE.to(dtype=dtype)
+
+    @classmethod
+    def from_pretrained(cls, models: Union[partial, Dict[str, nn.Module]], **kwargs):
+        models = models() if isinstance(models, partial) else models
+        return cls(models['denoiser'], models['TE'], models['vae'], models['noise_sampler'], models['tokenizer'], **kwargs)
+
+class SDXLWrapper(SD15Wrapper):
+    def __init__(self, denoiser: UNet2DConditionModel, TE, vae: AutoencoderKL, noise_sampler: BaseSampler, tokenizer, min_attnmask=0,
+                 pred_type='eps', TE_hook_cfg:TEHookCFG=SDXL_TEHookCFG, cfg_context=CFGContext(), key_map_in=None, key_map_out=None):
+        super().__init__(denoiser, TE, vae, noise_sampler, tokenizer, min_attnmask, pred_type, TE_hook_cfg, cfg_context, key_map_in, key_map_out)
+        self.key_mapper_in = self.build_mapper(key_map_in, None, (
+            'prompt -> prompt_ids', 'image -> image', 'attn_mask -> attn_mask', 'position_ids -> position_ids', 'neg_prompt -> neg_prompt_ids',
+            'neg_attn_mask -> neg_attn_mask', 'neg_position_ids -> neg_position_ids', 'plugin_input -> plugin_input', 'coord -> crop_info'))
+
+    def make_TE_hook(self, TE_hook_cfg):
+        # Hook and extend text_encoder
+        self.text_enc_hook = ComposeTEEXHook.hook(self.TE, self.tokenizer, N_repeats=TE_hook_cfg.tokenizer_repeats,
+                                                  clip_skip=TE_hook_cfg.clip_skip, clip_final_norm=TE_hook_cfg.clip_final_norm)
+
+    def forward_TE(self, prompt_ids, timesteps, attn_mask=None, position_ids=None, plugin_input={}, **kwargs):
+        input_all = dict(prompt_ids=prompt_ids, timesteps=timesteps, position_ids=position_ids, attn_mask=attn_mask, **plugin_input)
+        if hasattr(self.TE, 'input_feeder'):
+            for feeder in self.TE.input_feeder:
+                feeder(input_all)
+        # Get the text embedding for conditioning
+        encoder_hidden_states, pooled_output = self.TE(prompt_ids, position_ids=position_ids, attention_mask=attn_mask, output_hidden_states=True)
+        return encoder_hidden_states, pooled_output
+
+    def forward_denoiser(self, x_t, prompt_ids, encoder_hidden_states, timesteps, added_cond_kwargs, attn_mask=None, position_ids=None,
+                         plugin_input={}, **kwargs):
+        if attn_mask is not None:
+            attn_mask[:, :self.min_attnmask] = 1
+            encoder_hidden_states, attn_mask = pad_attn_bias(encoder_hidden_states, attn_mask)
+
+        input_all = dict(prompt_ids=prompt_ids, timesteps=timesteps, position_ids=position_ids, attn_mask=attn_mask,
+                         encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, **plugin_input)
+        if hasattr(self.denoiser, 'input_feeder'):
+            for feeder in self.denoiser.input_feeder:
+                feeder(input_all)
+        model_pred = self.denoiser(x_t, timesteps, encoder_hidden_states, encoder_attention_mask=attn_mask,
+                                   added_cond_kwargs=added_cond_kwargs).sample # Predict the noise residual
+        return model_pred
+
+    def model_forward(self, prompt_ids, image, attn_mask=None, position_ids=None, neg_prompt_ids=None, neg_attn_mask=None, neg_position_ids=None,
+                      crop_info=None, plugin_input={}):
+        # input prepare
+        x_0 = self.get_latents(image)
+        x_t, noise, sigma, timesteps = self.noise_sampler.add_noise_rand_t(x_0)
+        x_t_in = x_t*self.noise_sampler.c_in(sigma).to(dtype=x_t.dtype)
+
+        if neg_prompt_ids:
+            prompt_ids = torch.cat([neg_prompt_ids, prompt_ids], dim=0)
+        if neg_attn_mask:
+            attn_mask = torch.cat([neg_attn_mask, attn_mask], dim=0)
+        if neg_position_ids:
+            position_ids = torch.cat([neg_position_ids, position_ids], dim=0)
+
+        # model forward
+        x_t_in, timesteps = self.cfg_context.pre(x_t_in, timesteps)
+        encoder_hidden_states, pooled_output = self.forward_TE(prompt_ids, timesteps, attn_mask=attn_mask, position_ids=position_ids,
+                                                               plugin_input=plugin_input)
+        added_cond_kwargs = {"text_embeds":pooled_output[-1], "time_ids":crop_info}
+        model_pred = self.forward_denoiser(x_t_in, prompt_ids, encoder_hidden_states, timesteps, added_cond_kwargs=added_cond_kwargs,
+                                           attn_mask=attn_mask, position_ids=position_ids, plugin_input=plugin_input)
+        model_pred = self.cfg_context.post(model_pred)
+
+        return dict(model_pred=model_pred, noise=noise, sigma=sigma, timesteps=timesteps, x_0=x_0, x_t=x_t, pred_type=self.pred_type,
+                    noise_sampler=self.noise_sampler)
hcpdiff/models/wrapper/utils.py
ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from rainbowneko.utils import is_dict
+
+class TEHookCFG:
+    def __init__(self, tokenizer_repeats: int = 1, clip_skip: int = 0, clip_final_norm: bool = True):
+        self.tokenizer_repeats = tokenizer_repeats
+        self.clip_skip = clip_skip
+        self.clip_final_norm = clip_final_norm
+
+    @classmethod
+    def create(cls, cfg):
+        if is_dict(cfg):
+            return cls(**cfg)
+        elif isinstance(cfg, cls):
+            return cfg
+        else:
+            raise ValueError(f'Invalid TEHookCFG type: {type(cfg)}')
+
+SD15_TEHookCFG = TEHookCFG()
+SDXL_TEHookCFG = TEHookCFG(clip_skip=1, clip_final_norm=False)
hcpdiff/parser/__init__.py
ADDED
@@ -0,0 +1 @@
+from .embpt import CfgEmbPTParser
hcpdiff/parser/embpt.py
ADDED
@@ -0,0 +1,32 @@
+from typing import Dict, Tuple, List
+from rainbowneko.utils import Path_Like
+from hcpdiff.models import EmbeddingPTHook
+from torch import Tensor
+
+class CfgEmbPTParser:
+    def __init__(self, emb_dir: Path_Like, cfg_pt: Dict[str, Dict], lr: float = 1e-5, weight_decay: float = 0):
+        self.emb_dir = emb_dir
+        self.cfg_pt = cfg_pt
+        self.lr = lr
+        self.weight_decay = weight_decay
+
+    def get_params_group(self, model) -> Tuple[List, Dict[str, Tensor]]:
+        self.embedding_hook, self.ex_words_emb = EmbeddingPTHook.hook_from_dir(
+            self.emb_dir, model.tokenizer, model.TE, N_repeats=model.tokenizer.N_repeats)
+        self.embedding_hook.requires_grad_(False)
+
+        train_params_emb = []
+        train_pts = {}
+        for pt_name, info in self.cfg_pt.items():
+            word_emb = self.ex_words_emb[pt_name]
+            train_pts[pt_name] = word_emb
+            word_emb.requires_grad = True
+            self.embedding_hook.emb_train.append(word_emb)
+            param_group = {'params':word_emb}
+            if 'lr' in info:
+                param_group['lr'] = info.lr
+            if 'weight_decay' in info:
+                param_group['weight_decay'] = info.weight_decay
+            train_params_emb.append(param_group)
+
+        return train_params_emb, train_pts
hcpdiff/tools/convert_caption_txt2json.py
CHANGED
@@ -2,7 +2,7 @@ import argparse
 import json
 import os
 
-from
+from rainbowneko.utils import types_support
 
 parser = argparse.ArgumentParser(description='Stable Diffusion Training')
 parser.add_argument('--data_root', type=str, default='')