PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546) hide show

helm/clients/image_generation/mindalle/models/stage2/layers.py ADDED Viewed

@@ -0,0 +1,144 @@
+# ------------------------------------------------------------------------------------
+# minDALL-E
+# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------
+# Modified from minGPT (https://github.com/karpathy/minGPT)
+# Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
+# ------------------------------------------------------------------------------------
+import math
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+class GELU(nn.Module):
+    def __init__(self, use_approx=False):
+        super().__init__()
+        self.use_approx = use_approx
+    def forward(self, x):
+        if self.use_approx:
+            return x * torch.sigmoid(1.702 * x)
+        else:
+            return F.gelu(x)
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(
+        self,
+        ctx_len: int,
+        embed_dim: int,
+        n_heads: int,
+        resid_pdrop: float,
+        attn_pdrop: float,
+        attn_bias: bool,
+        use_mask: bool = True,
+    ):
+        super().__init__()
+        assert embed_dim % n_heads == 0
+        # key, query, value projections for all heads
+        self.key = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
+        self.query = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
+        self.value = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
+        # regularization
+        self.attn_drop = nn.Dropout(attn_pdrop)
+        self.resid_drop = nn.Dropout(resid_pdrop)
+        # output projection
+        self.proj = nn.Linear(embed_dim, embed_dim, attn_bias)
+        self.n_heads = n_heads
+        self.ctx_len = ctx_len
+        self.use_mask = use_mask
+        if self.use_mask:
+            self.register_buffer("mask", torch.ones(ctx_len, ctx_len), persistent=False)
+            self.mask = torch.tril(self.mask).view(1, ctx_len, ctx_len)
+    def forward(self, x, use_cache=False, layer_past=None):
+        B, T, C = x.shape
+        x = x.transpose(0, 1).contiguous()  # (B, T, C) -> (T, B, C)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        k = self.key(x).view(T, B * self.n_heads, C // self.n_heads).transpose(0, 1)  # (B*nh, T, hs)
+        q = self.query(x).view(T, B * self.n_heads, C // self.n_heads).transpose(0, 1)  # (B*nh, T, hs)
+        v = self.value(x).view(T, B * self.n_heads, C // self.n_heads).transpose(0, 1)  # (B*nh, T, hs)
+        if use_cache:
+            present = torch.stack([k, v])
+        if layer_past is not None:
+            past_key, past_value = layer_past
+            k = torch.cat([past_key, k], dim=-2)
+            v = torch.cat([past_value, v], dim=-2)
+        if use_cache and layer_past is not None:
+            # Tensor shape below: (B * nh, 1, hs) X (B * nh, hs, K) -> (B * nh, 1, K)
+            att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_drop(att)
+            y = torch.bmm(att, v)  # (B*nh, 1, K) X (B*nh, K, hs) -> (B*nh, 1, hs)
+        else:
+            # Tensor shape below: (B * nh, T, hs) X (B * nh, hs, T) -> (B * nh, T, T)
+            att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
+            if self.use_mask:
+                mask = self.mask if T == self.ctx_len else self.mask[:, :T, :T]
+                att = att.masked_fill(mask == 0, float("-inf"))
+            att = F.softmax(att, dim=-1)
+            att = self.attn_drop(att)
+            y = torch.bmm(att, v)  # (B*nh, T, T) X (B*nh, T, hs) -> (B*nh, T, hs)
+        y = y.transpose(0, 1).contiguous().view(T, B, C)  # re-assemble all head outputs side by side
+        # output projection
+        y = self.resid_drop(self.proj(y))
+        if use_cache:
+            return y.transpose(0, 1).contiguous(), present  # (T, B, C) -> (B, T, C)
+        else:
+            return y.transpose(0, 1).contiguous()  # (T, B, C) -> (B, T, C)
+class Block(nn.Module):
+    def __init__(
+        self,
+        ctx_len: int,
+        embed_dim: int,
+        n_heads: int,
+        mlp_bias: bool,
+        attn_bias: bool,
+        resid_pdrop: bool,
+        attn_pdrop: bool,
+        gelu_use_approx: bool,
+    ):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(embed_dim)
+        self.ln2 = nn.LayerNorm(embed_dim)
+        self.attn = MultiHeadSelfAttention(
+            ctx_len=ctx_len,
+            embed_dim=embed_dim,
+            n_heads=n_heads,
+            attn_pdrop=attn_pdrop,
+            resid_pdrop=resid_pdrop,
+            attn_bias=attn_bias,
+            use_mask=True,
+        )
+        self.mlp = nn.Sequential(
+            nn.Linear(embed_dim, 4 * embed_dim, bias=mlp_bias),
+            GELU(gelu_use_approx),
+            nn.Linear(4 * embed_dim, embed_dim, bias=mlp_bias),
+            nn.Dropout(resid_pdrop),
+        )
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.mlp(self.ln2(x))
+        return x
+    def sample(self, x, layer_past=None):
+        attn, present = self.attn(self.ln1(x), use_cache=True, layer_past=layer_past)
+        x = x + attn
+        x = x + self.mlp(self.ln2(x))
+        return x, present

helm/clients/image_generation/mindalle/models/stage2/transformer.py ADDED Viewed

@@ -0,0 +1,268 @@
+# ------------------------------------------------------------------------------------
+# minDALL-E
+# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------
+# Modified from minGPT (https://github.com/karpathy/minGPT)
+# Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
+# ------------------------------------------------------------------------------------
+import torch
+import torch.nn as nn
+from typing import Optional, Tuple, List
+from torch.cuda.amp import autocast
+from .layers import Block
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from omegaconf import OmegaConf
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["heim"])
+class Transformer1d(nn.Module):
+    def __init__(self, vocab_size_txt: int, vocab_size_img: int, hparams: OmegaConf) -> None:
+        super().__init__()
+        assert hparams.n_layers == hparams.n_dense_layers
+        # input embedding for image and text
+        self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
+        self.tok_emb_txt = nn.Embedding(vocab_size_txt, hparams.embed_dim)
+        self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)
+        self.pos_emb_txt = nn.Embedding(hparams.ctx_len_txt, hparams.embed_dim)
+        self.drop = nn.Dropout(hparams.embd_pdrop)
+        # transformer blocks
+        self.blocks = [
+            Block(
+                ctx_len=hparams.ctx_len_img + hparams.ctx_len_txt,
+                embed_dim=hparams.embed_dim,
+                n_heads=hparams.n_heads,
+                mlp_bias=hparams.mlp_bias,
+                attn_bias=hparams.attn_bias,
+                resid_pdrop=hparams.resid_pdrop,
+                attn_pdrop=hparams.attn_pdrop,
+                gelu_use_approx=hparams.gelu_use_approx,
+            )
+            for i in range(1, hparams.n_layers + 1)
+        ]
+        self.blocks = nn.Sequential(*self.blocks)
+        # heads for image and text
+        self.ln_f = nn.LayerNorm(hparams.embed_dim)
+        self.head_img = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)
+        self.head_txt = nn.Linear(hparams.embed_dim, vocab_size_txt, bias=False)
+        self.ctx_len_img = hparams.ctx_len_img
+        self.ctx_len_txt = hparams.ctx_len_txt
+        self.n_layers = hparams.n_layers
+        self.apply(self._init_weights)
+    def _init_weights(self, module: nn.Module) -> None:
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    def forward(
+        self,
+        images: torch.LongTensor,
+        texts: torch.LongTensor,
+        pos_images: torch.LongTensor,
+        pos_texts: torch.LongTensor,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        B, T = images.shape
+        _, N = texts.shape
+        assert T <= self.ctx_len_img, "Already reached the maximum context length (image)."
+        assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."
+        texts = self.tok_emb_txt(texts)
+        images = self.tok_emb_img(images)
+        texts = texts + self.pos_emb_txt(pos_texts)
+        images = images + self.pos_emb_img(pos_images)
+        x = torch.cat([texts, images], axis=1).contiguous()
+        x = self.drop(x)
+        x = self.blocks(x)
+        x = self.ln_f(x)
+        texts = x[:, : N - 1].contiguous()
+        images = x[:, N - 1 : -1].contiguous()
+        logits_txt = self.head_txt(texts)
+        logits_img = self.head_img(images)
+        return logits_img, logits_txt
+    @torch.no_grad()
+    def sampling(
+        self,
+        images: torch.LongTensor,
+        texts: torch.LongTensor,
+        pos_images: torch.LongTensor,
+        pos_texts: torch.LongTensor,
+        use_fp16: bool = True,
+        past: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
+        _, N = texts.shape
+        assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."
+        with autocast(enabled=use_fp16):
+            if images is None:
+                assert past is None
+                texts = self.tok_emb_txt(texts)
+                x = texts + self.pos_emb_txt(pos_texts)
+                x = self.drop(x)
+                presents = []
+                for i, block in enumerate(self.blocks):
+                    x, present = block.sample(x, layer_past=None)
+                    presents.append(present)
+                x = self.ln_f(x)
+                x = x[:, N - 1].contiguous()
+                logits = self.head_img(x)
+            else:
+                if past is None:
+                    texts = self.tok_emb_txt(texts)
+                    images = self.tok_emb_img(images)
+                    texts = texts + self.pos_emb_txt(pos_texts)
+                    images = images + self.pos_emb_img(pos_images)
+                    x = torch.cat([texts, images], axis=1).contiguous()
+                else:
+                    images = self.tok_emb_img(images)
+                    x = images + self.pos_emb_img(pos_images)
+                x = self.drop(x)
+                if past is not None:
+                    past = torch.cat(past, dim=-2)
+                presents = []
+                for i, block in enumerate(self.blocks):
+                    x, present = block.sample(x, layer_past=None if past is None else past[i])
+                    presents.append(present)
+                x = self.ln_f(x)
+                x = x[:, -1].contiguous()
+                logits = self.head_img(x)
+            return logits, presents
+    def from_ckpt(self, path: str) -> None:
+        ckpt = torch.load(path, map_location="cpu")["state_dict"]
+        self.load_state_dict(ckpt, strict=True)
+        print(f"{path} succesfully restored..")
+class iGPT(nn.Module):
+    def __init__(self, vocab_size_img: int, use_cls_cond: bool, hparams: OmegaConf) -> None:
+        super().__init__()
+        self.use_cls_cond = use_cls_cond
+        # sos token embedding
+        if self.use_cls_cond:
+            self.sos = nn.Embedding(hparams.n_classes, hparams.embed_dim)
+        else:
+            self.sos = nn.Parameter(torch.randn(1, 1, hparams.embed_dim))
+        # input embedding
+        self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
+        self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)
+        self.drop = nn.Dropout(hparams.embd_pdrop)
+        # transformer blocks
+        self.blocks = [
+            Block(
+                ctx_len=hparams.ctx_len_img + 1,
+                embed_dim=hparams.embed_dim,
+                n_heads=hparams.n_heads,
+                mlp_bias=hparams.mlp_bias,
+                attn_bias=hparams.attn_bias,
+                resid_pdrop=hparams.resid_pdrop,
+                attn_pdrop=hparams.attn_pdrop,
+                gelu_use_approx=hparams.gelu_use_approx,
+            )
+            for i in range(1, hparams.n_layers + 1)
+        ]
+        self.blocks = nn.Sequential(*self.blocks)
+        # head
+        self.ln_f = nn.LayerNorm(hparams.embed_dim)
+        self.head = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)
+        self.ctx_len_img = hparams.ctx_len_img
+        self.n_layers = hparams.n_layers
+        self.apply(self._init_weights)
+    def _init_weights(self, module: nn.Module) -> None:
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+    @torch.no_grad()
+    def sampling(
+        self,
+        sos: torch.FloatTensor,
+        codes: torch.LongTensor,
+        pos_codes: torch.LongTensor,
+        n_samples: int = 16,
+        use_fp16: bool = True,
+        past: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
+        with autocast(enabled=use_fp16):
+            if codes is None:
+                assert past is None
+                xs = self.drop(sos)
+                presents = []
+                for i, block in enumerate(self.blocks):
+                    xs, present = block.sample(xs, layer_past=None)
+                    presents.append(present)
+                xs = self.ln_f(xs)
+                logits = self.head(xs)[:, -1]
+            else:
+                if past is None:
+                    xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
+                    xs = torch.cat([sos, xs], dim=1)
+                else:
+                    xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
+                xs = self.drop(xs)
+                past = torch.cat(past, dim=-2) if past is not None else past
+                presents = []
+                for i, block in enumerate(self.blocks):
+                    xs, present = block.sample(xs, layer_past=None if past is None else past[i])
+                    presents.append(present)
+                xs = self.ln_f(xs)
+                logits = self.head(xs)[:, -1]
+            return logits, presents
+    def forward(self, codes: torch.LongTensor, labels: Optional[torch.LongTensor] = None) -> torch.FloatTensor:
+        B, T = codes.shape
+        xps = torch.arange(T, device=codes.device).repeat((B, 1))
+        sos = self.sos.repeat((B, 1, 1)) if labels is None else self.sos(labels).unsqueeze(1)
+        h = self.tok_emb_img(codes) + self.pos_emb_img(xps)
+        h = torch.cat([sos, h[:, :-1]], dim=1).contiguous()
+        h = self.drop(h)
+        h = self.blocks(h)
+        h = self.ln_f(h)
+        logits = self.head(h)
+        return logits
+    def from_ckpt(self, path: str, strict: bool = True) -> None:
+        ckpt = torch.load(path, map_location="cpu")["state_dict"]
+        self.load_state_dict(ckpt, strict=strict)
+        print(f"{path} successfully restored..")

helm/clients/image_generation/mindalle/models/tokenizer.py ADDED Viewed

@@ -0,0 +1,30 @@
+# ------------------------------------------------------------------------------------
+# minDALL-E
+# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------
+import os
+from functools import partial
+from helm.common.optional_dependencies import handle_module_not_found_error
+def build_tokenizer(path: str, context_length: int = 64, *args, **kwargs):
+    try:
+        from tokenizers import CharBPETokenizer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["heim"])
+    from_file = partial(
+        CharBPETokenizer.from_file,
+        vocab_filename=os.path.join(path, "bpe-16k-vocab.json"),
+        merges_filename=os.path.join(path, "bpe-16k-merges.txt"),
+        unk_token="[UNK]",
+    )
+    tokenizer = from_file(*args, **kwargs)
+    tokenizer.add_special_tokens(["[PAD]"])
+    tokenizer.enable_padding(length=context_length, pad_id=tokenizer.token_to_id("[PAD]"))
+    tokenizer.enable_truncation(max_length=context_length)
+    print(f"{path} successfully restored..")
+    return tokenizer

helm/clients/image_generation/mindalle/utils/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .utils import *
+from .config import *
+from .sampling import *

helm/clients/image_generation/mindalle/utils/config.py ADDED Viewed

@@ -0,0 +1,129 @@
+# ------------------------------------------------------------------------------------
+# minDALL-E
+# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------
+from typing import Optional, List
+from dataclasses import dataclass, field
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from omegaconf import OmegaConf
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["heim"])
+@dataclass
+class DataConfig:
+    """Dataset and preprocessing section of the structured config (defaults as declared)."""
+    dataset: Optional[str] = None
+    tokenizer_type: str = "CharBPE"
+    # presumably the padded/truncated text length passed to the tokenizer — confirm against caller
+    context_length: int = 64
+    image_resolution: int = 256
+    transforms: str = "dalle-vqvae"
+    # NOTE(review): looks like a BPE dropout probability; None presumably disables it — confirm
+    bpe_pdrop: Optional[float] = None
+@dataclass
+class Stage1Hparams:
+    """Hyper-parameters of the stage-1 model (nested under ``Stage1Config.hparams``)."""
+    double_z: bool = False
+    z_channels: int = 256
+    resolution: int = 256
+    in_channels: int = 3
+    out_ch: int = 3
+    ch: int = 128
+    # List defaults use default_factory so each instance gets its own list.
+    ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
+    num_res_blocks: int = 2
+    attn_resolutions: List[int] = field(default_factory=lambda: [16])
+    pdrop: float = 0.0
+@dataclass
+class Stage2Hparams:
+    """Hyper-parameters of the stage-2 transformer (nested under ``Stage2Config.hparams``)."""
+    embed_dim: int = 1536
+    n_layers: int = 42
+    n_heads: int = 24
+    # Transformer1d asserts that n_layers == n_dense_layers.
+    n_dense_layers: int = 42
+    ctx_len_img: int = 256
+    ctx_len_txt: int = 64
+    # Dropout probabilities: embeddings, residual branches, attention weights.
+    embd_pdrop: float = 0.0
+    resid_pdrop: float = 0.0
+    attn_pdrop: float = 0.0
+    mlp_bias: bool = True
+    attn_bias: bool = True
+    gelu_use_approx: bool = False
+    use_head_txt: bool = True
+    # Read by iGPT only when class conditioning (use_cls_cond) is enabled.
+    n_classes: Optional[int] = None
+@dataclass
+class Stage1Config:
+    type: str = "vqgan"
+    embed_dim: int = 256
+    n_embed: int = 16384
+    hparams: Stage1Hparams = Stage1Hparams()
+@dataclass
+class Stage2Config:
+    type: str = "transformer1d"
+    vocab_size_txt: int = 16384
+    vocab_size_img: int = 16384
+    use_cls_cond: Optional[bool] = None
+    hparams: Stage2Hparams = Stage2Hparams()
+@dataclass
+class WarmupConfig:
+    """Learning-rate warm-up settings (defaults as declared).
+    NOTE(review): not referenced by any other config class in this file — confirm usage.
+    """
+    epoch: int = 1
+    multiplier: int = 1
+    buffer_epoch: int = 0
+    min_lr: float = 0.0
+    mode: str = "fix"
+    peak_lr: float = 1e-4
+    start_from_zero: bool = True
+@dataclass
+class OptConfig:
+    """Optimizer and scheduler section of ``FineTuningConfig`` (defaults as declared)."""
+    opt_type: str = "adamW"
+    base_lr: float = 1e-4
+    weight_decay: float = 1e-4
+    # default_factory gives each instance its own list.
+    betas: List[float] = field(default_factory=lambda: [0.9, 0.99])
+    grad_clip_norm: float = 1.0
+    sched_type: str = "cosine"
+    max_steps: int = 0
+    min_lr: float = 0.0
+@dataclass
+class ExpConfig:
+    """Experiment section of ``FineTuningConfig``: batch sizes, epochs and frequencies."""
+    local_batch_size: int = 4
+    total_batch_size: int = 512
+    valid_batch_size: int = 32
+    epochs: int = 10
+    # presumably measured in epochs — TODO confirm against the training loop
+    save_ckpt_freq: int = 2
+    test_freq: int = 1
+    use_amp: bool = True
+@dataclass
+class DefaultConfig:
+    dataset: DataConfig = DataConfig()
+    stage1: Stage1Config = Stage1Config()
+    stage2: Stage2Config = Stage2Config()
+@dataclass
+class FineTuningConfig:
+    dataset: DataConfig = DataConfig()
+    stage1: Stage1Config = Stage1Config()
+    stage2: Stage2Config = Stage2Config()
+    optimizer: OptConfig = OptConfig()
+    experiment: ExpConfig = ExpConfig()
+def get_base_config(use_default=True):
+    return OmegaConf.structured(DefaultConfig if use_default else FineTuningConfig)

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl