PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/sd/modeling/model_io.py ADDED Viewed

@@ -0,0 +1,385 @@
+# Copyright © 2023-2024 Apple Inc.
+import json
+import os
+from typing import Optional
+import mlx.core as mx
+from huggingface_hub import hf_hub_download
+from mlx.utils import tree_unflatten
+from .clip import CLIPTextModel
+from .config import AutoencoderConfig, CLIPTextModelConfig, DiffusionConfig, UNetConfig
+from .tokenizer import Tokenizer
+from .unet import UNetModel
+from .vae import Autoencoder
+_DEFAULT_MODEL = "stabilityai/stable-diffusion-2-1-base"
+# Component name -> repository-relative file, shared by every registered model.
+_COMMON_FILES = {
+    "unet_config": "unet/config.json",
+    "unet": "unet/diffusion_pytorch_model.safetensors",
+    "text_encoder_config": "text_encoder/config.json",
+    "text_encoder": "text_encoder/model.safetensors",
+    "vae_config": "vae/config.json",
+    "vae": "vae/diffusion_pytorch_model.safetensors",
+    "diffusion_config": "scheduler/scheduler_config.json",
+    "tokenizer_vocab": "tokenizer/vocab.json",
+    "tokenizer_merges": "tokenizer/merges.txt",
+}
+# Registered model id -> {component name -> repository-relative file}.
+_MODELS = {
+    # See https://huggingface.co/stabilityai/sdxl-turbo for the model details and license
+    "stabilityai/sdxl-turbo": {
+        **_COMMON_FILES,
+        # This entry additionally lists a second text encoder and tokenizer.
+        "text_encoder_2_config": "text_encoder_2/config.json",
+        "text_encoder_2": "text_encoder_2/model.safetensors",
+        "tokenizer_2_vocab": "tokenizer_2/vocab.json",
+        "tokenizer_2_merges": "tokenizer_2/merges.txt",
+    },
+    # See https://huggingface.co/stabilityai/stable-diffusion-2-1-base for the model details and license
+    "stabilityai/stable-diffusion-2-1-base": dict(_COMMON_FILES),
+}
+def map_unet_weights(key, value):
+    """Rename one UNet checkpoint weight and adapt its layout for this package.
+    Args:
+        key: Weight name as stored in the checkpoint file.
+        value: The weight array; must provide ``shape``, ``squeeze``,
+            ``transpose`` and ``reshape``.
+    Returns:
+        A list of ``(key, value)`` pairs: one pair in general, two when a
+        ``ff.net.0`` projection is split into ``linear1`` and ``linear2``.
+    """
+    renames = (
+        # Up/downsampling
+        ("downsamplers.0.conv", "downsample"),
+        ("upsamplers.0.conv", "upsample"),
+        # Mid block
+        ("mid_block.resnets.0", "mid_blocks.0"),
+        ("mid_block.attentions.0", "mid_blocks.1"),
+        ("mid_block.resnets.1", "mid_blocks.2"),
+        # Attention layers
+        ("to_k", "key_proj"),
+        ("to_out.0", "out_proj"),
+        ("to_q", "query_proj"),
+        ("to_v", "value_proj"),
+        # Transformer ffn (output layer)
+        ("ff.net.2", "linear3"),
+    )
+    # str.replace is a no-op when the pattern is absent, so no guard is needed.
+    for old, new in renames:
+        key = key.replace(old, new)
+    if "ff.net.0" in key:
+        # One stored tensor feeds two layers: split it in two equal parts.
+        first_half, second_half = mx.split(value, 2)
+        return [
+            (key.replace("ff.net.0.proj", "linear1"), first_half),
+            (key.replace("ff.net.0.proj", "linear2"), second_half),
+        ]
+    if "conv_shortcut.weight" in key:
+        value = value.squeeze()
+    # Transform the weights from 1x1 convs to linear
+    is_projection = "proj_in" in key or "proj_out" in key
+    if len(value.shape) == 4 and is_projection:
+        value = value.squeeze()
+    if len(value.shape) == 4:
+        # Move axis 1 to the last position.
+        moved = value.transpose(0, 2, 3, 1)
+        # NOTE(review): the flatten/reshape round trip keeps shape and values;
+        # presumably it forces the transposed data into a contiguous buffer - confirm.
+        value = moved.reshape(-1).reshape(moved.shape)
+    return [(key, value)]
+def map_clip_text_encoder_weights(key, value):
+    """Rename one CLIP text-encoder checkpoint weight; the value is passed through.
+    Args:
+        key: Weight name as stored in the checkpoint file.
+        value: The weight array (returned unchanged).
+    Returns:
+        A one-element list holding the ``(new_key, value)`` pair.
+    """
+    # Remove wrapper prefixes, outermost first; each is stripped at most once.
+    for prefix in ("text_model.", "embeddings.", "encoder."):
+        if key.startswith(prefix):
+            key = key[len(prefix):]
+    renames = (
+        # Attention layers
+        ("self_attn.", "attention."),
+        ("q_proj.", "query_proj."),
+        ("k_proj.", "key_proj."),
+        ("v_proj.", "value_proj."),
+        # ffn layers
+        ("mlp.fc1", "linear1"),
+        ("mlp.fc2", "linear2"),
+    )
+    for old, new in renames:
+        key = key.replace(old, new)
+    return [(key, value)]
+def map_vae_weights(key, value):
+    """Rename one autoencoder checkpoint weight and adapt its layout for this package.
+    Args:
+        key: Weight name as stored in the checkpoint file.
+        value: The weight array; must provide ``shape``, ``squeeze``,
+            ``transpose`` and ``reshape``.
+    Returns:
+        A one-element list holding the ``(new_key, new_value)`` pair.
+    """
+    renames = (
+        # Up/downsampling
+        ("downsamplers.0.conv", "downsample"),
+        ("upsamplers.0.conv", "upsample"),
+        # Attention layers
+        ("to_k", "key_proj"),
+        ("to_out.0", "out_proj"),
+        ("to_q", "query_proj"),
+        ("to_v", "value_proj"),
+        # Mid block
+        ("mid_block.resnets.0", "mid_blocks.0"),
+        ("mid_block.attentions.0", "mid_blocks.1"),
+        ("mid_block.resnets.1", "mid_blocks.2"),
+    )
+    # str.replace is a no-op when the pattern is absent, so no guard is needed.
+    for old, new in renames:
+        key = key.replace(old, new)
+    # quant/post_quant layers: renamed and stripped of size-1 axes
+    if "quant_conv" in key:
+        key = key.replace("quant_conv", "quant_proj")
+        value = value.squeeze()
+    # conv_shortcut becomes a linear weight
+    if "conv_shortcut.weight" in key:
+        value = value.squeeze()
+    if len(value.shape) == 4:
+        # Move axis 1 to the last position.
+        moved = value.transpose(0, 2, 3, 1)
+        # NOTE(review): the flatten/reshape round trip keeps shape and values;
+        # presumably it forces the transposed data into a contiguous buffer - confirm.
+        value = moved.reshape(-1).reshape(moved.shape)
+    return [(key, value)]
+def _flatten(params):
+    """Concatenate an iterable of ``(key, value)`` sequences into one flat list of tuples."""
+    flat = []
+    for pairs in params:
+        flat.extend((k, v) for (k, v) in pairs)
+    return flat
+def _load_safetensor_weights(mapper, model, weight_file, float16: bool = False):
+    """Load ``weight_file``, remap every entry through ``mapper`` and update ``model``.
+    Args:
+        mapper: Callable ``(key, value) -> list of (key, value)`` pairs.
+        model: Object whose ``update`` method receives the unflattened weights.
+        weight_file: Path handed to ``mx.load``.
+        float16: Cast every array to ``mx.float16`` when true, else ``mx.float32``.
+    """
+    target_dtype = mx.float32
+    if float16:
+        target_dtype = mx.float16
+    loaded = mx.load(weight_file)
+    # The cast happens before mapping, so mappers always see the target dtype.
+    remapped = _flatten(
+        [mapper(name, array.astype(target_dtype)) for name, array in loaded.items()]
+    )
+    model.update(tree_unflatten(remapped))
+def _check_key(key: str, part: str):
+    """Accept local-looking paths; otherwise require ``key`` to be registered in ``_MODELS``.
+    Args:
+        key: Model directory or registered model id.
+        part: Name of the calling loader, used as the error-message prefix.
+    Raises:
+        ValueError: ``key`` neither looks like a path nor is a registered id.
+    """
+    # NOTE(review): the ids registered in _MODELS contain "/", so they are
+    # accepted by this path test as well.
+    looks_local = os.path.exists(key) or '/' in key or '\\' in key
+    if looks_local or key in _MODELS:
+        return
+    known = ','.join(_MODELS.keys())
+    raise ValueError(
+        f"[{part}] '{key}' model not found, choose one of {{{known}}}"
+    )
+def _get_model_path(key: str, file_path: str):
+    """Return the path of ``file_path`` for model ``key`` (local directory or hub id).
+    Resolution order:
+      1. ``key`` exists on disk: ``file_path`` is joined onto it.
+      2. ``key`` is registered in ``_MODELS``: the file is fetched with
+         ``hf_hub_download``.
+      3. ``key`` contains a path separator: it is treated as a local directory
+         (which may not exist), exactly as before.
+      4. Anything else is handed to ``hf_hub_download``.
+    Args:
+        key: Model directory or Hugging Face repository id.
+        file_path: Repository-relative file name, e.g. ``"unet/config.json"``.
+    Returns:
+        The joined local path, or whatever ``hf_hub_download`` returns.
+    """
+    if os.path.exists(key):
+        # Local path
+        return os.path.join(key, file_path)
+    # Registered ids such as "stabilityai/sdxl-turbo" contain "/", so the
+    # registry has to be consulted before the separator test; otherwise the
+    # download branch is unreachable for every registered model.
+    if key in _MODELS:
+        return hf_hub_download(key, file_path)
+    if '/' in key or '\\' in key:
+        # Looks like a path that is not on disk; opening it fails downstream.
+        return os.path.join(key, file_path)
+    # HuggingFace path
+    return hf_hub_download(key, file_path)
+def load_unet(key: str = _DEFAULT_MODEL, float16: bool = False):
+    """Build the UNet from its JSON config and load its safetensors weights.
+    Args:
+        key: Local model directory, or a model id registered in ``_MODELS``.
+        float16: Load the weights as float16 instead of float32.
+    Returns:
+        The ``UNetModel`` with the checkpoint weights applied.
+    """
+    _check_key(key, "load_unet")
+    # NOTE(review): ids registered in _MODELS contain "/", so they take the
+    # local branch too.
+    is_local = os.path.exists(key) or '/' in key or '\\' in key
+    if is_local:
+        # Local path - use SDXL Turbo structure
+        config_file = "unet/config.json"
+        weights_file = "unet/diffusion_pytorch_model.safetensors"
+    else:
+        config_file = _MODELS[key]["unet_config"]
+        weights_file = _MODELS[key]["unet"]
+    with open(_get_model_path(key, config_file)) as f:
+        config = json.load(f)
+    n_blocks = len(config["block_out_channels"])
+    # A scalar head dimension is repeated for every block; a list is used as is.
+    head_dim = config["attention_head_dim"]
+    if isinstance(head_dim, int):
+        head_dim = [head_dim] * n_blocks
+    unet_config = UNetConfig(
+        in_channels=config["in_channels"],
+        out_channels=config["out_channels"],
+        block_out_channels=config["block_out_channels"],
+        layers_per_block=[config["layers_per_block"]] * n_blocks,
+        transformer_layers_per_block=config.get(
+            "transformer_layers_per_block", (1,) * 4
+        ),
+        num_attention_heads=head_dim,
+        cross_attention_dim=[config["cross_attention_dim"]] * n_blocks,
+        norm_num_groups=config["norm_num_groups"],
+        down_block_types=config["down_block_types"],
+        # The up-block list is stored reversed relative to the config file.
+        up_block_types=config["up_block_types"][::-1],
+        addition_embed_type=config.get("addition_embed_type", None),
+        addition_time_embed_dim=config.get("addition_time_embed_dim", None),
+        projection_class_embeddings_input_dim=config.get(
+            "projection_class_embeddings_input_dim", None
+        ),
+    )
+    model = UNetModel(unet_config)
+    # Resolve the weight file and map its entries into the model
+    weight_file = _get_model_path(key, weights_file)
+    _load_safetensor_weights(map_unet_weights, model, weight_file, float16)
+    return model
+def load_text_encoder(
+    key: str = _DEFAULT_MODEL,
+    float16: bool = False,
+    model_key: str = "text_encoder",
+    config_key: Optional[str] = None,
+):
+    """Build a CLIP text encoder from its JSON config and load its safetensors weights.
+    Args:
+        key: Local model directory, or a model id registered in ``_MODELS``.
+        float16: Load the weights as float16 instead of float32.
+        model_key: Component name; on the local branch also the sub-directory.
+        config_key: ``_MODELS`` key of the config file, defaulting to
+            ``model_key + "_config"``. Only read on the registry branch.
+    Returns:
+        The ``CLIPTextModel`` with the checkpoint weights applied.
+    """
+    _check_key(key, "load_text_encoder")
+    config_key = config_key or (model_key + "_config")
+    # NOTE(review): ids registered in _MODELS contain "/", so they take the
+    # local branch too.
+    is_local = os.path.exists(key) or '/' in key or '\\' in key
+    if is_local:
+        # Local path - use SDXL Turbo structure
+        config_file = f"{model_key}/config.json"
+        weights_file = f"{model_key}/model.safetensors"
+    else:
+        config_file = _MODELS[key][config_key]
+        weights_file = _MODELS[key][model_key]
+    with open(_get_model_path(key, config_file)) as f:
+        config = json.load(f)
+    # The projection size is only read for "...WithProjection" architectures.
+    projection_dim = None
+    if "WithProjection" in config["architectures"][0]:
+        projection_dim = config["projection_dim"]
+    encoder_config = CLIPTextModelConfig(
+        num_layers=config["num_hidden_layers"],
+        model_dims=config["hidden_size"],
+        num_heads=config["num_attention_heads"],
+        max_length=config["max_position_embeddings"],
+        vocab_size=config["vocab_size"],
+        projection_dim=projection_dim,
+        hidden_act=config.get("hidden_act", "quick_gelu"),
+    )
+    model = CLIPTextModel(encoder_config)
+    # Resolve the weight file and map its entries into the model
+    weight_file = _get_model_path(key, weights_file)
+    _load_safetensor_weights(map_clip_text_encoder_weights, model, weight_file, float16)
+    return model
+def load_autoencoder(key: str = _DEFAULT_MODEL, float16: bool = False):
+    """Build the autoencoder from its JSON config and load its safetensors weights.
+    Args:
+        key: Local model directory, or a model id registered in ``_MODELS``.
+        float16: Load the weights as float16 instead of float32.
+    Returns:
+        The ``Autoencoder`` with the checkpoint weights applied.
+    """
+    _check_key(key, "load_autoencoder")
+    # NOTE(review): ids registered in _MODELS contain "/", so they take the
+    # local branch too.
+    is_local = os.path.exists(key) or '/' in key or '\\' in key
+    if is_local:
+        # Local path - use SDXL Turbo structure
+        config_file = "vae/config.json"
+        weights_file = "vae/diffusion_pytorch_model.safetensors"
+    else:
+        config_file = _MODELS[key]["vae_config"]
+        weights_file = _MODELS[key]["vae"]
+    with open(_get_model_path(key, config_file)) as f:
+        config = json.load(f)
+    latent_channels = config["latent_channels"]
+    vae_config = AutoencoderConfig(
+        in_channels=config["in_channels"],
+        out_channels=config["out_channels"],
+        # The "out" latent width is twice the "in" latent width.
+        latent_channels_out=2 * latent_channels,
+        latent_channels_in=latent_channels,
+        block_out_channels=config["block_out_channels"],
+        layers_per_block=config["layers_per_block"],
+        norm_num_groups=config["norm_num_groups"],
+        scaling_factor=config.get("scaling_factor", 0.18215),
+    )
+    model = Autoencoder(vae_config)
+    # Resolve the weight file and map its entries into the model
+    weight_file = _get_model_path(key, weights_file)
+    _load_safetensor_weights(map_vae_weights, model, weight_file, float16)
+    return model
+def load_diffusion_config(key: str = _DEFAULT_MODEL):
+    """Read the scheduler JSON for ``key`` and return it as a ``DiffusionConfig``.
+    Args:
+        key: Local model directory, or a model id registered in ``_MODELS``.
+    Returns:
+        ``DiffusionConfig`` populated with the beta range, the beta schedule
+        name and the number of training steps.
+    """
+    _check_key(key, "load_diffusion_config")
+    # NOTE(review): ids registered in _MODELS contain "/", so they take the
+    # local branch too.
+    is_local = os.path.exists(key) or '/' in key or '\\' in key
+    # Local path - use SDXL Turbo structure
+    config_file = (
+        "scheduler/scheduler_config.json"
+        if is_local
+        else _MODELS[key]["diffusion_config"]
+    )
+    with open(_get_model_path(key, config_file)) as f:
+        scheduler = json.load(f)
+    return DiffusionConfig(
+        beta_start=scheduler["beta_start"],
+        beta_end=scheduler["beta_end"],
+        beta_schedule=scheduler["beta_schedule"],
+        # The JSON calls this field "num_train_timesteps".
+        num_train_steps=scheduler["num_train_timesteps"],
+    )
+def _local_tokenizer_file(component_key: str, extension: str) -> str:
+    """Turn ``"<directory>_<name>"`` into ``"<directory>/<name>.<extension>"``."""
+    # rpartition splits on the LAST underscore, so "tokenizer_2_vocab" maps to
+    # "tokenizer_2/vocab.json" instead of the non-existent "tokenizer/2.json".
+    directory, separator, name = component_key.rpartition("_")
+    if not separator or not directory or not name:
+        raise ValueError(
+            f"[load_tokenizer] cannot derive a file path from key '{component_key}'"
+        )
+    return f"{directory}/{name}.{extension}"
+def load_tokenizer(
+    key: str = _DEFAULT_MODEL,
+    vocab_key: str = "tokenizer_vocab",
+    merges_key: str = "tokenizer_merges",
+):
+    """Build a ``Tokenizer`` from a vocabulary JSON and a BPE merges file.
+    Args:
+        key: Local model directory, or a model id registered in ``_MODELS``.
+        vocab_key: Component name of the vocabulary, e.g. ``"tokenizer_vocab"``
+            or ``"tokenizer_2_vocab"``.
+        merges_key: Component name of the merges file, e.g.
+            ``"tokenizer_merges"`` or ``"tokenizer_2_merges"``.
+    Returns:
+        A ``Tokenizer`` built from the merge ranks and the vocabulary.
+    Raises:
+        ValueError: On the local branch, when a component name has no
+            ``<directory>_<name>`` form.
+    """
+    _check_key(key, "load_tokenizer")
+    # NOTE(review): ids registered in _MODELS contain "/", so they take the
+    # local branch too.
+    if os.path.exists(key) or '/' in key or '\\' in key:
+        # Local path - use SDXL Turbo structure
+        vocab_file = _get_model_path(key, _local_tokenizer_file(vocab_key, "json"))
+        merges_file = _get_model_path(key, _local_tokenizer_file(merges_key, "txt"))
+    else:
+        vocab_file = _get_model_path(key, _MODELS[key][vocab_key])
+        merges_file = _get_model_path(key, _MODELS[key][merges_key])
+    with open(vocab_file, encoding="utf-8") as f:
+        vocab = json.load(f)
+    with open(merges_file, encoding="utf-8") as f:
+        # Slice bounds are kept from the original: the first line is skipped
+        # (presumably a header - confirm) and the merge count is capped.
+        merge_lines = f.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
+    bpe_merges = [tuple(line.split()) for line in merge_lines]
+    # An earlier line in the file means a lower rank, i.e. a higher merge priority.
+    bpe_ranks = {pair: rank for rank, pair in enumerate(bpe_merges)}
+    return Tokenizer(bpe_ranks, vocab)

nexaai/mlx_backend/sd/modeling/sampler.py ADDED Viewed

@@ -0,0 +1,105 @@
+# Copyright © 2023 Apple Inc.
+import mlx.core as mx
+from .config import DiffusionConfig
+def _linspace(a, b, num):
+    """Return ``num`` evenly spaced values running from ``a`` to ``b`` inclusive.
+    NOTE(review): ``num == 1`` divides by zero.
+    """
+    fraction = mx.arange(0, num) / (num - 1)
+    span = b - a
+    return span * fraction + a
+def _interp(y, x_new):
+    """Linearly interpolate ``y``, sampled at ``0, 1, ..., len(y) - 1``, at ``x_new``."""
+    # Integer cast of the query positions gives the left neighbour index.
+    left = x_new.astype(mx.int32)
+    # The right neighbour is clamped to the last valid index.
+    right = mx.minimum(left + 1, len(y) - 1)
+    weight = x_new - left
+    return y[left] * (1 - weight) + weight * y[right]
+class SimpleEulerSampler:
+    """A simple Euler integrator that can be used to sample from our diffusion models.
+    The method ``step()`` performs one Euler step from x_t to x_t_prev.
+    """
+    def __init__(self, config: DiffusionConfig):
+        """Precompute the sigma table from the beta schedule described by ``config``."""
+        schedule = config.beta_schedule
+        if schedule == "linear":
+            betas = _linspace(
+                config.beta_start, config.beta_end, config.num_train_steps
+            )
+        elif schedule == "scaled_linear":
+            # Linear in sqrt(beta), squared afterwards.
+            betas = _linspace(
+                config.beta_start**0.5, config.beta_end**0.5, config.num_train_steps
+            ).square()
+        else:
+            raise NotImplementedError(f"{config.beta_schedule} is not implemented.")
+        alphas_cumprod = mx.cumprod(1 - betas)
+        sigmas = ((1 - alphas_cumprod) / alphas_cumprod).sqrt()
+        # Index 0 holds sigma = 0, followed by one entry per training step.
+        self._sigmas = mx.concatenate([mx.zeros(1), sigmas])
+    @property
+    def max_time(self):
+        """Largest valid time index of the sigma table."""
+        return len(self._sigmas) - 1
+    def sample_prior(self, shape, dtype=mx.float32, key=None):
+        """Draw normal noise of ``shape``, scaled by the last sigma and renormalized."""
+        sigma_max = self._sigmas[-1]
+        noise = mx.random.normal(shape, key=key)
+        scaled = noise * sigma_max * (sigma_max.square() + 1).rsqrt()
+        return scaled.astype(dtype)
+    def add_noise(self, x, t, key=None):
+        """Add normal noise scaled by sigma(t) to ``x`` and renormalize."""
+        noise = mx.random.normal(x.shape, key=key)
+        sigma = self.sigmas(t)
+        noisy = x + noise * sigma
+        return noisy * (sigma.square() + 1).rsqrt()
+    def sigmas(self, t):
+        """Interpolate the sigma table at the (possibly fractional) time ``t``."""
+        return _interp(self._sigmas, t)
+    def timesteps(self, num_steps: int, start_time=None, dtype=mx.float32):
+        """Return ``num_steps`` consecutive ``(t, t_prev)`` pairs from ``start_time`` to 0."""
+        last_index = len(self._sigmas) - 1
+        # Any falsy start_time (None or 0) selects the end of the table.
+        start_time = start_time or last_index
+        assert 0 < start_time <= last_index
+        grid = _linspace(start_time, 0, num_steps + 1).astype(dtype)
+        return [(t, t_prev) for t, t_prev in zip(grid, grid[1:])]
+    def step(self, eps_pred, x_t, t, t_prev):
+        """Perform one Euler step from ``x_t`` at time ``t`` to time ``t_prev``."""
+        work_dtype = eps_pred.dtype
+        sigma = self.sigmas(t).astype(work_dtype)
+        sigma_prev = self.sigmas(t_prev).astype(work_dtype)
+        dt = sigma_prev - sigma
+        # Undo the normalization at t, take the step, then renormalize at t_prev.
+        unscaled = (sigma.square() + 1).sqrt() * x_t + eps_pred * dt
+        return unscaled * (sigma_prev.square() + 1).rsqrt()
+class SimpleEulerAncestralSampler(SimpleEulerSampler):
+    """Variant of ``SimpleEulerSampler`` whose ``step()`` adds fresh noise at every step."""
+    def step(self, eps_pred, x_t, t, t_prev):
+        """Step from ``t`` to ``t_prev``: move down to ``sigma_down``, then add ``sigma_up`` noise."""
+        work_dtype = eps_pred.dtype
+        sigma = self.sigmas(t).astype(work_dtype)
+        sigma_prev = self.sigmas(t_prev).astype(work_dtype)
+        sigma_sq = sigma.square()
+        sigma_prev_sq = sigma_prev.square()
+        # The two satisfy sigma_up**2 + sigma_down**2 == sigma_prev**2.
+        sigma_up = (sigma_prev_sq * (sigma_sq - sigma_prev_sq) / sigma_sq).sqrt()
+        sigma_down = (sigma_prev_sq - sigma_up**2).sqrt()
+        dt = sigma_down - sigma
+        stepped = (sigma_sq + 1).sqrt() * x_t + eps_pred * dt
+        # The noise is drawn without an explicit key.
+        noise = mx.random.normal(stepped.shape).astype(stepped.dtype)
+        stepped = stepped + noise * sigma_up
+        return stepped * (sigma_prev_sq + 1).rsqrt()

nexaai/mlx_backend/sd/modeling/tokenizer.py ADDED Viewed

@@ -0,0 +1,100 @@
+# Copyright © 2023 Apple Inc.
+import regex
+class Tokenizer:
+    """A simple port of CLIPTokenizer from https://github.com/huggingface/transformers/ .
+    ``bpe_ranks`` maps a pair of symbols to its merge rank (lower merges first)
+    and ``vocab`` maps every symbol, including the two special tokens, to its id.
+    """
+    def __init__(self, bpe_ranks, vocab):
+        self.bpe_ranks = bpe_ranks
+        self.vocab = vocab
+        self.pat = regex.compile(
+            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
+            regex.IGNORECASE,
+        )
+        # Memoized bpe() results; every value is a list of symbols. The special
+        # tokens are handled directly in bpe(), so the cache starts empty.
+        self._cache = {}
+    @property
+    def bos(self):
+        """Text form of the start-of-text token."""
+        return "<|startoftext|>"
+    @property
+    def bos_token(self):
+        """Vocabulary id of the start-of-text token."""
+        return self.vocab[self.bos]
+    @property
+    def eos(self):
+        """Text form of the end-of-text token."""
+        return "<|endoftext|>"
+    @property
+    def eos_token(self):
+        """Vocabulary id of the end-of-text token."""
+        return self.vocab[self.eos]
+    def bpe(self, text):
+        """Split one word into byte-pair symbols.
+        Args:
+            text: A single piece produced by ``self.pat``.
+        Returns:
+            A list of symbols, the last one carrying the ``</w>`` suffix. The
+            special tokens come back as a one-element list and an empty
+            string as an empty list.
+        """
+        if not text:
+            return []
+        # Special tokens must stay whole. Returning the bare string here would
+        # make tokenize() iterate over its characters.
+        if text == self.bos or text == self.eos:
+            return [text]
+        if text in self._cache:
+            return self._cache[text]
+        unigrams = list(text[:-1]) + [text[-1] + "</w>"]
+        unique_bigrams = set(zip(unigrams, unigrams[1:]))
+        # In every iteration try to merge the two most likely bigrams. If none
+        # was merged we are done.
+        #
+        # Ported from https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/tokenization_clip.py
+        while unique_bigrams:
+            bigram = min(
+                unique_bigrams, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))
+            )
+            if bigram not in self.bpe_ranks:
+                break
+            # Merge every non-overlapping occurrence of the bigram, left to right.
+            merged = []
+            i = 0
+            while i < len(unigrams):
+                if i + 1 < len(unigrams) and (unigrams[i], unigrams[i + 1]) == bigram:
+                    merged.append(unigrams[i] + unigrams[i + 1])
+                    i += 2
+                else:
+                    merged.append(unigrams[i])
+                    i += 1
+            unigrams = merged
+            unique_bigrams = set(zip(unigrams, unigrams[1:]))
+        self._cache[text] = unigrams
+        return unigrams
+    def tokenize(self, text, prepend_bos=True, append_eos=True):
+        """Convert ``text`` (a string or a list of strings) to token ids.
+        Args:
+            text: A prompt, or a list of prompts tokenized one by one.
+            prepend_bos: Put the start-of-text id in front.
+            append_eos: Put the end-of-text id at the end.
+        Returns:
+            A list of ids, or a list of such lists for list input.
+        Raises:
+            KeyError: A produced symbol is missing from ``vocab``.
+        """
+        if isinstance(text, list):
+            return [self.tokenize(t, prepend_bos, append_eos) for t in text]
+        # Lower case cleanup and split according to self.pat. Hugging Face does
+        # a much more thorough job here but this should suffice for 95% of
+        # cases.
+        clean_text = regex.sub(r"\s+", " ", text.lower())
+        words = self.pat.findall(clean_text)
+        # Split the words according to the byte-pair merge file
+        symbols = [symbol for word in words for symbol in self.bpe(word)]
+        # Map to token ids and return
+        tokens = [self.vocab[symbol] for symbol in symbols]
+        if prepend_bos:
+            tokens = [self.bos_token] + tokens
+        if append_eos:
+            tokens.append(self.eos_token)
+        return tokens