diffsynth-engine 0.0.0 (py3-none-any.whl)
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- diffsynth_engine/__init__.py +28 -0
- diffsynth_engine/algorithm/__init__.py +0 -0
- diffsynth_engine/algorithm/noise_scheduler/__init__.py +21 -0
- diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +10 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +5 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +28 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +25 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +50 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +26 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +25 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +19 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +21 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +77 -0
- diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +17 -0
- diffsynth_engine/algorithm/sampler/__init__.py +19 -0
- diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
- diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +22 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +54 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +32 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +125 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +29 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +53 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +59 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +29 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +12 -0
- diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +30 -0
- diffsynth_engine/conf/models/components/vae.json +254 -0
- diffsynth_engine/conf/models/flux/flux_dit.json +105 -0
- diffsynth_engine/conf/models/flux/flux_text_encoder.json +20 -0
- diffsynth_engine/conf/models/flux/flux_vae.json +250 -0
- diffsynth_engine/conf/models/sd/sd_text_encoder.json +220 -0
- diffsynth_engine/conf/models/sd/sd_unet.json +397 -0
- diffsynth_engine/conf/models/sd3/sd3_dit.json +908 -0
- diffsynth_engine/conf/models/sd3/sd3_text_encoder.json +756 -0
- diffsynth_engine/conf/models/sdxl/sdxl_text_encoder.json +455 -0
- diffsynth_engine/conf/models/sdxl/sdxl_unet.json +1056 -0
- diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json +13 -0
- diffsynth_engine/conf/models/wan/dit/14b-i2v.json +13 -0
- diffsynth_engine/conf/models/wan/dit/14b-t2v.json +13 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +48895 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +30 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +30 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +49410 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +125 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +129428 -0
- diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +940 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +48895 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +24 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +30 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +49410 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +40213 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +24 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +38 -0
- diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +49411 -0
- diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +308 -0
- diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
- diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +1028026 -0
- diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +2748 -0
- diffsynth_engine/kernels/__init__.py +0 -0
- diffsynth_engine/models/__init__.py +7 -0
- diffsynth_engine/models/base.py +64 -0
- diffsynth_engine/models/basic/__init__.py +0 -0
- diffsynth_engine/models/basic/attention.py +217 -0
- diffsynth_engine/models/basic/lora.py +293 -0
- diffsynth_engine/models/basic/relative_position_emb.py +56 -0
- diffsynth_engine/models/basic/timestep.py +81 -0
- diffsynth_engine/models/basic/transformer_helper.py +88 -0
- diffsynth_engine/models/basic/unet_helper.py +244 -0
- diffsynth_engine/models/components/__init__.py +0 -0
- diffsynth_engine/models/components/clip.py +56 -0
- diffsynth_engine/models/components/t5.py +222 -0
- diffsynth_engine/models/components/vae.py +392 -0
- diffsynth_engine/models/flux/__init__.py +14 -0
- diffsynth_engine/models/flux/flux_dit.py +476 -0
- diffsynth_engine/models/flux/flux_text_encoder.py +88 -0
- diffsynth_engine/models/flux/flux_vae.py +78 -0
- diffsynth_engine/models/sd/__init__.py +12 -0
- diffsynth_engine/models/sd/sd_text_encoder.py +142 -0
- diffsynth_engine/models/sd/sd_unet.py +293 -0
- diffsynth_engine/models/sd/sd_vae.py +38 -0
- diffsynth_engine/models/sd3/__init__.py +14 -0
- diffsynth_engine/models/sd3/sd3_dit.py +302 -0
- diffsynth_engine/models/sd3/sd3_text_encoder.py +163 -0
- diffsynth_engine/models/sd3/sd3_vae.py +43 -0
- diffsynth_engine/models/sdxl/__init__.py +13 -0
- diffsynth_engine/models/sdxl/sdxl_text_encoder.py +307 -0
- diffsynth_engine/models/sdxl/sdxl_unet.py +306 -0
- diffsynth_engine/models/sdxl/sdxl_vae.py +38 -0
- diffsynth_engine/models/utils.py +54 -0
- diffsynth_engine/models/wan/__init__.py +0 -0
- diffsynth_engine/models/wan/wan_dit.py +497 -0
- diffsynth_engine/models/wan/wan_image_encoder.py +494 -0
- diffsynth_engine/models/wan/wan_text_encoder.py +297 -0
- diffsynth_engine/models/wan/wan_vae.py +771 -0
- diffsynth_engine/pipelines/__init__.py +18 -0
- diffsynth_engine/pipelines/base.py +253 -0
- diffsynth_engine/pipelines/flux_image.py +512 -0
- diffsynth_engine/pipelines/sd_image.py +352 -0
- diffsynth_engine/pipelines/sdxl_image.py +395 -0
- diffsynth_engine/pipelines/wan_video.py +524 -0
- diffsynth_engine/tokenizers/__init__.py +6 -0
- diffsynth_engine/tokenizers/base.py +157 -0
- diffsynth_engine/tokenizers/clip.py +288 -0
- diffsynth_engine/tokenizers/t5.py +194 -0
- diffsynth_engine/tokenizers/wan.py +74 -0
- diffsynth_engine/utils/__init__.py +0 -0
- diffsynth_engine/utils/constants.py +34 -0
- diffsynth_engine/utils/download.py +135 -0
- diffsynth_engine/utils/env.py +7 -0
- diffsynth_engine/utils/flag.py +46 -0
- diffsynth_engine/utils/fp8_linear.py +64 -0
- diffsynth_engine/utils/gguf.py +415 -0
- diffsynth_engine/utils/loader.py +17 -0
- diffsynth_engine/utils/lock.py +56 -0
- diffsynth_engine/utils/logging.py +12 -0
- diffsynth_engine/utils/offload.py +44 -0
- diffsynth_engine/utils/parallel.py +390 -0
- diffsynth_engine/utils/prompt.py +9 -0
- diffsynth_engine/utils/video.py +40 -0
- diffsynth_engine-0.0.0.dist-info/LICENSE +201 -0
- diffsynth_engine-0.0.0.dist-info/METADATA +236 -0
- diffsynth_engine-0.0.0.dist-info/RECORD +127 -0
- diffsynth_engine-0.0.0.dist-info/WHEEL +5 -0
- diffsynth_engine-0.0.0.dist-info/top_level.txt +1 -0
diffsynth_engine/tokenizers/clip.py
@@ -0,0 +1,288 @@
+import os
+import json
+import ftfy
+import regex as re
+import torch
+from functools import lru_cache
+from typing import Dict, List, Union, Optional
+
+from diffsynth_engine.tokenizers.base import BaseTokenizer, TOKENIZER_CONFIG_FILE
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+CLIP_DEFAULT_MAX_LENGTH = 77
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to
+    whitespace/control characters that the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your
+    vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around
+    5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that,
+    we want lookup tables between utf-8 bytes and unicode strings.
+    """
+    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """
+    Return the set of symbol pairs in a word.
+
+    Word is represented as a tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+# Modified from transformers.models.clip.tokenization_clip.CLIPTokenizer and open_clip.tokenizer.SimpleTokenizer
+class CLIPTokenizer(BaseTokenizer):
+    """
+    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file: str,
+        merges_file: str,
+        bos_token: Optional[str] = "<|startoftext|>",
+        eos_token: Optional[str] = "<|endoftext|>",
+        unk_token: Optional[str] = "<|endoftext|>",
+        pad_token: Optional[str] = "<|endoftext|>",  # hack to enable padding
+        **kwargs,
+    ):
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
+
+        self.pat = re.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            re.IGNORECASE,
+        )
+
+        self.model_max_length = self.model_max_length if self.model_max_length else CLIP_DEFAULT_MAX_LENGTH
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path: Union[str, os.PathLike], **kwargs):
+        tokenizer_config_file = os.path.join(pretrained_model_path, TOKENIZER_CONFIG_FILE)
+        with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
+            init_kwargs = json.load(tokenizer_config_handle)
+            init_kwargs.update(**kwargs)
+        vocab_file = os.path.join(pretrained_model_path, cls.vocab_files_names["vocab_file"])
+        merges_file = os.path.join(pretrained_model_path, cls.vocab_files_names["merges_file"])
+        return cls(vocab_file=vocab_file, merges_file=merges_file, **init_kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return self.encoder
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def tokenize(self, texts: Union[str, List[str]]) -> Union[List[str], List[List[str]]]:
+        """Convert string to tokens."""
+        if isinstance(texts, str):
+            return self._tokenize(texts)
+
+        return [self._tokenize(text) for text in texts]
+
+    def _tokenize(self, text: str) -> List[str]:
+        bpe_tokens = []
+        text = whitespace_clean(ftfy.fix_text(text)).lower()
+
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def encode(self, texts: str) -> List[int]:
+        tokens = self.tokenize(texts)
+        return self.convert_tokens_to_ids(tokens)
+
+    def batch_encode(self, texts: List[str]) -> List[List[int]]:
+        return [self.encode(text) for text in texts]
+
+    def decode(
+        self, ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: Optional[bool] = None
+    ) -> str:
+        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens)
+        text = self.convert_tokens_to_string(tokens)
+
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            text = self.clean_up_tokenization(text)
+        return text
+
+    def batch_decode(
+        self, ids: List[List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: Optional[bool] = None
+    ) -> List[str]:
+        return [self.decode(index, skip_special_tokens, clean_up_tokenization_spaces) for index in ids]
+
+    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+        if isinstance(tokens, str):
+            return self.encoder.get(tokens, self.encoder.get(self.unk_token))
+
+        return [self.encoder.get(token, self.encoder.get(self.unk_token)) for token in tokens]
+
+    def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False) -> Union[str, List[str]]:
+        if isinstance(ids, int):
+            return self.decoder.get(ids)
+
+        tokens = []
+        for index in ids:
+            if skip_special_tokens and index in self.all_special_ids:
+                continue
+            tokens.append(self.decoder.get(index))
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        text = "".join(tokens)
+        byte_array = bytearray([self.byte_decoder[c] for c in text])
+        text = byte_array.decode("utf-8", errors="replace").replace("</w>", " ").strip()
+        return text
+
+    def __call__(
+        self,
+        texts: Union[str, List[str]],
+        max_length: Optional[int] = None,
+        **kwargs,
+    ) -> Dict[str, "torch.Tensor"]:
+        """
+        Tokenize text and prepare for model inputs.
+
+        Args:
+            texts (`str` or `List[str]`):
+                The sequence or batch of sequences to be encoded.
+
+            max_length (`int`, *optional*):
+                Each encoded sequence will be truncated or padded to max_length.
+
+        Returns:
+            `Dict[str, "torch.Tensor"]`: tensor dict compatible with model_input_names.
+        """
+
+        if isinstance(texts, str):
+            texts = [texts]
+
+        max_length = max_length if max_length else self.model_max_length
+
+        encoded = torch.zeros(len(texts), max_length, dtype=torch.long)
+        encoded.fill_(self.pad_token_id)
+        attention_mask = torch.zeros(len(texts), max_length, dtype=torch.long)
+
+        for i, text in enumerate(texts):
+            tokens = self.tokenize(text)
+            ids = [self.bos_token_id] + self.convert_tokens_to_ids(tokens) + [self.eos_token_id]
+            if len(ids) > max_length:
+                ids = ids[:max_length]
+                ids[-1] = self.eos_token_id
+            encoded[i, : len(ids)] = torch.tensor(ids)
+            attention_mask[i, : len(ids)] = torch.ones((1, len(ids)))
+
+        return {"input_ids": encoded, "attention_mask": attention_mask}
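A minimal usage sketch for the tokenizer above, assuming the wheel is installed and using the bundled SDXL CLIP tokenizer config (SDXL_TOKENIZER_CONF_PATH, defined in diffsynth_engine/utils/constants.py later in this diff); the prompts are illustrative:

    from diffsynth_engine.tokenizers.clip import CLIPTokenizer
    from diffsynth_engine.utils.constants import SDXL_TOKENIZER_CONF_PATH

    # builds the tokenizer from the bundled vocab.json / merges.txt / tokenizer_config.json
    tokenizer = CLIPTokenizer.from_pretrained(SDXL_TOKENIZER_CONF_PATH)

    # __call__ pads or truncates every sequence to model_max_length (77 for CLIP)
    batch = tokenizer(["a photo of a cat", "an astronaut riding a horse"])
    print(batch["input_ids"].shape)       # torch.Size([2, 77])
    print(batch["attention_mask"].sum())  # number of non-padding positions

    # round trip: ids back to text, dropping BOS/EOS/padding
    print(tokenizer.batch_decode(batch["input_ids"].tolist(), skip_special_tokens=True))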
diffsynth_engine/tokenizers/t5.py
@@ -0,0 +1,194 @@
+import os
+import json
+import torch
+from typing import Dict, List, Union, Optional
+from tokenizers import Tokenizer as TokenizerFast
+
+from diffsynth_engine.tokenizers.base import BaseTokenizer, TOKENIZER_CONFIG_FILE
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+T5_DEFAULT_MAX_LENGTH = 512
+
+
+class T5TokenizerFast(BaseTokenizer):
+    """
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        tokenizer_file (`str`):
+            Precompiled file for initializing a fast tokenizer.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        **kwargs,
+    ):
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+        fast_tokenizer = TokenizerFast.from_file(tokenizer_file)
+        self._tokenizer = fast_tokenizer
+        # disable truncation and padding
+        self._tokenizer.no_truncation()
+        self._tokenizer.no_padding()
+
+        self.model_max_length = self.model_max_length if self.model_max_length else T5_DEFAULT_MAX_LENGTH
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path: Union[str, os.PathLike], **kwargs):
+        tokenizer_config_file = os.path.join(pretrained_model_path, TOKENIZER_CONFIG_FILE)
+        with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
+            init_kwargs = json.load(tokenizer_config_handle)
+            init_kwargs.update(**kwargs)
+        vocab_file = os.path.join(pretrained_model_path, cls.vocab_files_names["vocab_file"])
+        tokenizer_file = os.path.join(pretrained_model_path, cls.vocab_files_names["tokenizer_file"])
+        return cls(vocab_file=vocab_file, tokenizer_file=tokenizer_file, **init_kwargs)
+
+    @property
+    def vocab_size(self):
+        return self._tokenizer.get_vocab_size(with_added_tokens=True)
+
+    def get_vocab(self):
+        return self._tokenizer.get_vocab(with_added_tokens=True)
+
+    def tokenize(self, texts: Union[str, List[str]]) -> Union[List[str], List[List[str]]]:
+        if isinstance(texts, str):
+            encoding = self._tokenizer.encode(texts)
+            return encoding.tokens
+
+        encodings = self._tokenizer.encode_batch(texts)
+        return [encoding.tokens for encoding in encodings]
+
+    def encode(self, texts: str) -> List[int]:
+        encoding = self._tokenizer.encode(texts, add_special_tokens=True)
+        return encoding.ids
+
+    def batch_encode(self, texts: List[str]) -> List[List[int]]:
+        encodings = self._tokenizer.encode_batch(texts, add_special_tokens=True)
+        return [encoding.ids for encoding in encodings]
+
+    def decode(
+        self, ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: Optional[bool] = None
+    ) -> str:
+        text = self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            text = self.clean_up_tokenization(text)
+        return text
+
+    def batch_decode(
+        self, ids: List[List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: Optional[bool] = None
+    ) -> List[str]:
+        texts = self._tokenizer.decode_batch(ids, skip_special_tokens=skip_special_tokens)
+
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            texts = [self.clean_up_tokenization(text) for text in texts]
+        return texts
+
+    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+        if isinstance(tokens, str):
+            index = self._tokenizer.token_to_id(tokens)
+            return index if index is not None else self._tokenizer.token_to_id(self.unk_token)
+
+        ids = [self._tokenizer.token_to_id(token) for token in tokens]
+        return [index if index is not None else self._tokenizer.token_to_id(self.unk_token) for index in ids]
+
+    def convert_ids_to_tokens(
+        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
+    ) -> Union[str, List[str]]:
+        if isinstance(ids, int):
+            return self._tokenizer.id_to_token(ids)
+
+        tokens = []
+        for index in ids:
+            if skip_special_tokens and index in self.all_special_ids:
+                continue
+            tokens.append(self._tokenizer.id_to_token(index))
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return self._tokenizer.decode(tokens)
+
+    def __call__(
+        self,
+        texts: Union[str, List[str]],
+        max_length: Optional[int] = None,
+        **kwargs,
+    ) -> Dict[str, "torch.Tensor"]:
+        """
+        Tokenize text and prepare for model inputs.
+
+        Args:
+            texts (`str` or `List[str]`):
+                The sequence or batch of sequences to be encoded.
+
+            max_length (`int`, *optional*):
+                Each encoded sequence will be truncated or padded to max_length.
+
+        Returns:
+            `Dict[str, "torch.Tensor"]`: tensor dict compatible with model_input_names.
+        """
+
+        if isinstance(texts, str):
+            texts = [texts]
+
+        max_length = max_length if max_length else self.model_max_length
+
+        encoded = torch.zeros(len(texts), max_length, dtype=torch.long)
+        encoded.fill_(self.pad_token_id)
+        attention_mask = torch.zeros(len(texts), max_length, dtype=torch.long)
+
+        batch_ids = self.batch_encode(texts)
+        for i, ids in enumerate(batch_ids):
+            if len(ids) > max_length:
+                ids = ids[:max_length]
+                ids[-1] = self.eos_token_id
+            encoded[i, : len(ids)] = torch.tensor(ids)
+            attention_mask[i, : len(ids)] = torch.ones((1, len(ids)))
+
+        return {"input_ids": encoded, "attention_mask": attention_mask}
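A similar sketch for the fast T5 tokenizer, assuming the bundled FLUX T5 config directory (FLUX_TOKENIZER_2_CONF_PATH from diffsynth_engine/utils/constants.py, which contains the spiece.model and tokenizer.json listed in the manifest above); max_length=256 is an illustrative choice:

    from diffsynth_engine.tokenizers.t5 import T5TokenizerFast
    from diffsynth_engine.utils.constants import FLUX_TOKENIZER_2_CONF_PATH

    tokenizer = T5TokenizerFast.from_pretrained(FLUX_TOKENIZER_2_CONF_PATH)

    # encode() lets the backing tokenizers library append special tokens (EOS)
    ids = tokenizer.encode("a photo of a cat")

    # __call__ right-pads each sequence to max_length and builds the attention mask
    batch = tokenizer(["a photo of a cat"], max_length=256)
    print(batch["input_ids"].shape)  # torch.Size([1, 256])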
diffsynth_engine/tokenizers/wan.py
@@ -0,0 +1,74 @@
+import html
+import string
+
+import ftfy
+import regex as re
+from .t5 import T5TokenizerFast
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def canonicalize(text, keep_punctuation_exact_string=None):
+    text = text.replace("_", " ")
+    if keep_punctuation_exact_string:
+        text = keep_punctuation_exact_string.join(
+            part.translate(str.maketrans("", "", string.punctuation))
+            for part in text.split(keep_punctuation_exact_string)
+        )
+    else:
+        text = text.translate(str.maketrans("", "", string.punctuation))
+    text = text.lower()
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+
+class WanT5Tokenizer:
+    def __init__(self, name, seq_len=None, clean=None, **kwargs):
+        assert clean in (None, "whitespace", "lower", "canonicalize")
+        self.name = name
+        self.seq_len = seq_len
+        self.clean = clean
+
+        # init tokenizer
+        self.tokenizer = T5TokenizerFast.from_pretrained(name, **kwargs)
+        self.vocab_size = self.tokenizer.vocab_size
+
+    def __call__(self, sequence, **kwargs):
+        return_mask = kwargs.pop("return_mask", False)
+
+        # arguments
+        _kwargs = {"return_tensors": "pt"}
+        if self.seq_len is not None:
+            _kwargs.update({"padding": "max_length", "truncation": True, "max_length": self.seq_len})
+        _kwargs.update(**kwargs)
+
+        # tokenization
+        if isinstance(sequence, str):
+            sequence = [sequence]
+        if self.clean:
+            sequence = [self._clean(u) for u in sequence]
+        ids = self.tokenizer(sequence, **_kwargs)
+        # output
+        if return_mask:
+            return ids["input_ids"], ids["attention_mask"]
+        else:
+            return ids["input_ids"]
+
+    def _clean(self, text):
+        if self.clean == "whitespace":
+            text = whitespace_clean(basic_clean(text))
+        elif self.clean == "lower":
+            text = whitespace_clean(basic_clean(text)).lower()
+        elif self.clean == "canonicalize":
+            text = canonicalize(basic_clean(text))
+        return text
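And a sketch for the Wan wrapper, assuming the bundled umt5-xxl config directory (WAN_TOKENIZER_CONF_PATH from diffsynth_engine/utils/constants.py); seq_len=512 mirrors T5_DEFAULT_MAX_LENGTH and is an illustrative choice, not a value mandated by this diff:

    from diffsynth_engine.tokenizers.wan import WanT5Tokenizer
    from diffsynth_engine.utils.constants import WAN_TOKENIZER_CONF_PATH

    tokenizer = WanT5Tokenizer(WAN_TOKENIZER_CONF_PATH, seq_len=512, clean="whitespace")

    # cleans the prompt, then delegates to T5TokenizerFast.__call__;
    # returns padded ids, plus the attention mask when return_mask=True
    ids, mask = tokenizer("a cat walks on the grass", return_mask=True)
    print(ids.shape, mask.shape)  # torch.Size([1, 512]) for both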
diffsynth_engine/utils/__init__.py
File without changes
diffsynth_engine/utils/constants.py
@@ -0,0 +1,34 @@
+import os
+
+PACKAGE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+REPO_ROOT = os.path.dirname(PACKAGE_ROOT)
+
+# conf
+CONF_PATH = os.path.join(PACKAGE_ROOT, "conf")
+# tokenizers
+FLUX_TOKENIZER_1_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "flux", "tokenizer_1")
+FLUX_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "flux", "tokenizer_2")
+SDXL_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "tokenizer")
+SDXL_TOKENIZER_2_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "sdxl", "tokenizer_2")
+WAN_TOKENIZER_CONF_PATH = os.path.join(CONF_PATH, "tokenizers", "wan", "umt5-xxl")
+# models
+VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "components", "vae.json")
+FLUX_DIT_CONFIG_FILE = os.path.join(CONF_PATH, "models", "flux", "flux_dit.json")
+FLUX_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "flux", "flux_text_encoder.json")
+FLUX_VAE_CONFIG_FILE = os.path.join(CONF_PATH, "models", "flux", "flux_vae.json")
+SD_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd", "sd_text_encoder.json")
+SD_UNET_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd", "sd_unet.json")
+SD3_DIT_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd3", "sd3_dit.json")
+SD3_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sd3", "sd3_text_encoder.json")
+SDXL_TEXT_ENCODER_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_text_encoder.json")
+SDXL_UNET_CONFIG_FILE = os.path.join(CONF_PATH, "models", "sdxl", "sdxl_unet.json")
+
+WAN_DIT_1_3B_T2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "1.3b-t2v.json")
+WAN_DIT_14B_I2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "14b-i2v.json")
+WAN_DIT_14B_T2V_CONFIG_FILE = os.path.join(CONF_PATH, "models", "wan", "dit", "14b-t2v.json")
+
+# data size
+KB = 1024
+MB = 1024 * KB
+GB = 1024 * MB
+TB = 1024 * GB
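To show how these constants compose, a short sketch; human_size is a hypothetical helper written for this example, not a function shipped in the package:

    import json

    from diffsynth_engine.utils.constants import WAN_DIT_1_3B_T2V_CONFIG_FILE, KB, MB, GB

    # the bundled model configs are plain JSON files resolved relative to the installed package
    with open(WAN_DIT_1_3B_T2V_CONFIG_FILE, encoding="utf-8") as f:
        wan_dit_config = json.load(f)

    # hypothetical helper: render a byte count using the size constants above
    def human_size(num_bytes: int) -> str:
        for unit, size in (("GB", GB), ("MB", MB), ("KB", KB)):
            if num_bytes >= size:
                return f"{num_bytes / size:.1f} {unit}"
        return f"{num_bytes} B"

    print(human_size(3 * GB + 512 * MB))  # 3.5 GB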