PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546) hide show

helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py ADDED Viewed

@@ -0,0 +1,190 @@
+# -*- encoding: utf-8 -*-
+"""
+@File    :   dsr_sampling.py
+@Time    :   2021/10/09 00:46:04
+@Author  :   Ming Ding
+@Contact :   dm18@mails.tsinghua.edu.cn
+"""
+# here put the import lib
+import os
+import math
+import torch
+import torch.nn.functional as F
+import numpy as np
+def top_k_logits_(logits, top_k=0, filter_value=-float("Inf")):
+    """Keep the ``top_k`` largest logits along the last dimension, in place.
+
+    Every entry strictly below the k-th largest value of its row is
+    overwritten with ``filter_value``; entries tied with the k-th value survive.
+
+    Args:
+        logits: tensor whose last dimension is filtered; modified in place.
+        top_k: number of entries to keep per row. ``0`` (the default) or a
+            negative value disables filtering; values larger than the last
+            dimension are clamped to it.
+        filter_value: value written over the filtered-out entries.
+
+    Returns:
+        The same ``logits`` tensor object that was passed in.
+    """
+    vocab_size = logits.size(-1)
+    # torch.topk(..., 0) returns an empty tensor and indexing its last element
+    # raises, so the default top_k=0 is treated as "no filtering".
+    if top_k <= 0 or vocab_size == 0:
+        return logits
+    # torch.topk also raises when k exceeds the dimension size, hence the clamp.
+    kth_largest = torch.topk(logits, min(top_k, vocab_size))[0][..., -1, None]
+    logits[logits < kth_largest] = filter_value
+    return logits
+class IterativeEntfilterStrategy:
+    """Two-stage sampler: pick a token cluster first, then a token inside it.
+
+    For every position the token probabilities are summed per cluster (using
+    the labels loaded from ``cluster_label.npy``), one of the ``topk`` most
+    probable clusters is sampled, and the final token is sampled only among
+    the tokens that belong to that cluster.
+    """
+    # Width of the per-cluster probability table; labels must lie in [0, 500).
+    _NUM_CLUSTERS = 500
+    def __init__(self, invalid_slices=None, temperature=1.0, topk=6, temperature2=0.9):
+        """
+        Args:
+            invalid_slices: slices of the vocabulary axis that may never be
+                sampled (their logits are set to -inf). ``None`` means none.
+            temperature: default divisor applied to the raw logits.
+            topk: number of candidate clusters to sample from.
+            temperature2: default divisor applied to the cluster-masked
+                logits before the final token sampling.
+        """
+        # A fresh list per instance instead of a shared mutable default.
+        self.invalid_slices = [] if invalid_slices is None else invalid_slices
+        self.temperature = temperature
+        self.topk = topk
+        # The label file is expected one directory above this module.
+        self.cluster_labels = torch.tensor(
+            np.load(f"{os.path.dirname(os.path.dirname(os.path.abspath(__file__)))}/cluster_label.npy"),
+            device="cuda" if torch.cuda.is_available() else "cpu",
+            dtype=torch.long,
+        )
+        self.temperature2 = temperature2
+    def forward(self, logits_, tokens, temperature=None, entfilter=None, filter_topk=5, temperature2=None):
+        """Sample one new token for every position of ``logits_``.
+
+        Args:
+            logits_: logits of shape [batch_size, seq_length, vocab]; the last
+                dimension must match the number of cluster labels.
+            tokens: current tokens, [batch_size, seq_length + 1]. Only the
+                first column is kept in the result.
+            temperature: overrides ``self.temperature`` when not ``None``.
+            entfilter: accepted for compatibility with ``filling_sequence_dsr``,
+                which passes it by keyword; currently unused.
+            filter_topk: accepted for the same reason; currently unused.
+            temperature2: overrides ``self.temperature2`` when not ``None``.
+
+        Returns:
+            Tensor shaped like ``tokens``: its first column followed by the
+            freshly sampled predictions.
+        """
+        # In iterative strategy, logits are of shape [batch_size, seq_length, vocab]
+        if temperature is None:
+            temperature = self.temperature
+        if temperature2 is None:
+            temperature2 = self.temperature2
+        logits = logits_.float() / temperature
+        for invalid_slice in self.invalid_slices:
+            logits[..., invalid_slice] = -float("Inf")
+        logits = logits.view(-1, logits.shape[-1])
+        num_rows = logits.shape[0]
+        # The labels were placed on a device at construction time; follow the
+        # logits so the scatter/compare below never mixes devices.
+        cluster_labels = self.cluster_labels.to(logits.device)
+        token_probs = F.softmax(logits.float(), dim=-1)
+        # Sum token probabilities per cluster.
+        cluster_probs = torch.zeros(num_rows, self._NUM_CLUSTERS, device=logits.device).scatter_add_(
+            1, cluster_labels.expand(*token_probs.shape), token_probs
+        )
+        best_scores, best_clusters = cluster_probs.topk(self.topk)
+        best_scores = best_scores / best_scores.sum(dim=-1, keepdim=True)
+        sampled_ids = torch.multinomial(best_scores, num_samples=1)
+        selected_clusters = torch.gather(best_clusters, dim=1, index=sampled_ids)
+        # cluster_labels [1, 20000] \in [0,500)
+        outside_cluster = cluster_labels.unsqueeze(0).expand(num_rows, -1) != selected_clusters
+        # -65504 is the most negative finite float16 value; presumably chosen
+        # so the mask stays finite even in half precision.
+        logits[outside_cluster] = -65504
+        # float is essential, due to a bug in Pytorch
+        probs = F.softmax(logits.float() / temperature2, dim=-1)
+        pred = torch.multinomial(probs, num_samples=1).view(*logits_.shape[:2])
+        assert tokens.shape[1] == pred.shape[1] + 1
+        return torch.cat((tokens[:, :1], pred), dim=1)
+# class IterativeEntfilterStrategy:
+#     def __init__(self, invalid_slices=[], temperature=1., topk=40):
+#         self.invalid_slices = invalid_slices
+#         self.temperature = temperature
+#         self.topk = topk
+#     def forward(self, logits, tokens, temperature=None, entfilter=None, filter_topk=5, temperature2=None):
+#         # In interative strategy, logits are of shape [batch_size, seq_length, hidden_size]
+#         if temperature is None:
+#             temperature = self.temperature
+#         logits = logits.float() / temperature
+#         for invalid_slice in self.invalid_slices:
+#             logits[..., invalid_slice] = -float('Inf')
+#         top_k_logits_(logits, self.topk)
+#         probs = F.softmax(logits, dim=-1)
+#         pred = torch.multinomial(probs.view(-1, logits.shape[-1]), num_samples=1).view(*logits.shape[:2], 1)
+#         pred.squeeze_(-1)
+#         assert tokens.shape[1] == pred.shape[1] + 1
+#         tokens = torch.cat((tokens[:, :1], pred), dim=1)
+#         return tokens
+def filling_sequence_dsr(
+    model,
+    seq0,
+    seq1,
+    warmup_steps=3,
+    block_hw=(4, 4),
+    strategy=None,
+):
+    """
+    Iteratively fill the trailing image part of a padded token sequence.
+
+    seq: [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1]
+        4095 {layout[2]} final_token.
+    Attention:
+    The sampling temperature are changing, temporally we hard code them here.
+    The temperature in the strategy is not used.
+
+    Args:
+        model: callable with a three-element ``layout`` attribute and a
+            ``parameters()`` method; called as ``model(tokens, position_ids,
+            attention_mask, log_attention_weights=...)`` and expected to
+            return the logits as its first output.
+        seq0: [batch, len0] non-negative tokens, ``len0 < layout[1]``; they
+            are left-padded with zeros up to ``layout[1]``.
+        seq1: [batch, layout[2] - layout[1] + 1] non-negative tokens.
+        warmup_steps: number of initial steps in which every image position
+            is resampled.
+        block_hw: ``(ll, rr)`` block size used to fix positions diagonal by
+            diagonal after the warm-up.
+        strategy: object exposing ``forward(logits, tokens, temperature,
+            **kwargs)``. ``None`` builds ``IterativeEntfilterStrategy(topk=10)``
+            on the first use inside the call, so importing this module no
+            longer loads the cluster-label file as a side effect.
+
+    Returns:
+        List of ``num_steps + 1`` tensors: a copy of ``tokens[:, layout[1] + 1:]``
+        before the first step and after every step.
+    """
+    if strategy is None:
+        strategy = IterativeEntfilterStrategy(topk=10)
+    assert hasattr(model, "layout")
+    layout = model.layout
+    assert len(seq0.shape) == 2 and len(seq1.shape) == 2 and seq0.shape[0] == seq1.shape[0]
+    assert len(layout) == 3
+    assert seq1.shape[1] == layout[-1] - layout[-2] + 1
+    assert (seq1 >= 0).all() and (seq0 >= 0).all()
+    device = seq0.device
+    # concat and pad sequences
+    batch_size = seq0.shape[0]
+    n_pad = layout[1] - seq0.shape[1]
+    assert n_pad > 0, "You should truncate long input before filling."
+    seq = torch.cat(
+        (torch.tensor([0] * n_pad, device=device, dtype=seq0.dtype).unsqueeze(0).expand(batch_size, n_pad), seq0, seq1),
+        dim=1,
+    )  # [b, layout[-1]+1]
+    assert seq.shape[1] == layout[-1] + 1
+    # build initial tokens, attention_mask, and position_ids
+    tokens = seq.clone()
+    attention_mask = torch.ones(layout[1], layout[1]).to(device)
+    attention_mask[: layout[0], layout[0] :] = 0
+    attention_mask[n_pad:, :n_pad] = 0
+    attention_mask = attention_mask.type_as(next(model.parameters()))  # if fp16
+    position_ids = torch.cat(
+        (
+            torch.zeros(n_pad, dtype=torch.long),
+            torch.arange(0, layout[0] - n_pad),
+            torch.arange(513, 513 + layout[1] - layout[0]),
+            torch.arange(1024, 1024 + layout[2] - layout[1]),
+        )
+    ).to(device)
+    log_attention_weights = torch.zeros(layout[1], layout[1], device=device).type_as(next(model.parameters()))
+    # Currently a no-op (the tensor is already zero); kept as the place where
+    # a text-attention bias would be set.
+    log_attention_weights[layout[0] :, n_pad : layout[0]] = 0.0
+    # prepare for iteration
+    n_image = layout[-1] - layout[-2]
+    unfixed = torch.zeros_like(tokens, dtype=torch.bool)
+    unfixed[:, -n_image:] = True
+    ll, rr = block_hw
+    edge_len = int(math.sqrt(n_image) + 1e-4)
+    num_steps = warmup_steps + ll - 1 + rr
+    real_temp = 1.0
+    def _as_blocks(mask):
+        # View the image part of a [batch, seq] mask as
+        # [batch, edge_len // ll, ll, edge_len // rr, rr]; writes go through to ``mask``.
+        return mask[..., -n_image:].view(batch_size, edge_len // ll, ll, edge_len // rr, rr)
+    # iterative refining
+    ret = [tokens[:, layout[-2] + 1 :].clone()]
+    for step_cnt in range(1, num_steps + 1):
+        logits, *_ = model(tokens[:, :-1], position_ids, attention_mask, log_attention_weights=log_attention_weights)
+        if step_cnt <= warmup_steps:
+            new_tokens = strategy.forward(logits, tokens, real_temp)
+            tokens[unfixed] = new_tokens[unfixed]
+        else:
+            # NOTE(review): the strategy's forward must accept these keywords.
+            new_tokens = strategy.forward(logits, tokens, real_temp, entfilter=1.3, filter_topk=5, temperature2=0.6)
+            # fixed tokens (update unfixed): positions fixed in this step.
+            newly_fixed = torch.zeros_like(tokens, dtype=torch.bool)
+            for x in range(min(ll, step_cnt - warmup_steps)):
+                y = step_cnt - warmup_steps - x - 1
+                if y < rr:
+                    _as_blocks(unfixed)[:, :, x, :, y] = False
+                    _as_blocks(newly_fixed)[:, :, x, :, y] = True
+            tokens[newly_fixed] = new_tokens[newly_fixed]
+        ret.append(tokens[:, layout[-2] + 1 :].clone())
+    return ret

helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py ADDED Viewed

@@ -0,0 +1,141 @@
+# -*- encoding: utf-8 -*-
+"""
+@File    :   iterative_sr.py
+@Time    :   2022/03/02 15:57:45
+@Author  :   Ming Ding
+@Contact :   dm18@mails.tsinghua.edu.cn
+"""
+import torch
+from icetk import icetk as tokenizer
+from .itersr_sampling import filling_sequence_itersr, IterativeEntfilterStrategy
+from .itersr_model import ItersrModel
+from helm.common.optional_dependencies import handle_module_not_found_error
+class IterativeSuperResolution:
+    """Iterative super-resolution stage built on ``ItersrModel``.
+
+    The instance loads a checkpoint once, keeps a CPU copy of the parameters
+    whose name contains "transformer" (so a shared transformer can be
+    restored before each call), and refines image tokens in six masking
+    rounds per chunk of at most ``max_bz`` samples.
+    """
+    # 6x6 tile of mask priorities, row by row. In round r (1..6) every
+    # position whose priority is >= r is masked out and resampled.
+    _MASK_PATTERN = (
+        -1, 0, 1, 2, 3, 4,
+        0, -1, 2, -1, -2, 5,
+        1, -2, 3, 4, 5, 6,
+        2, 3, 4, 5, -1, 1,
+        3, -1, -2, 0, -1, 2,
+        4, 5, 6, 1, 3, -2,
+    )
+    def __init__(self, args, path, max_bz=4, shared_transformer=None):
+        """
+        Args:
+            args: argument namespace; ``load``, ``kernel_size``,
+                ``kernel_size2``, ``new_sequence_length`` and ``layout`` are
+                overwritten here, and ``fp16``, ``temp_all_itersr`` and
+                ``topk_itersr`` are read.
+            path: checkpoint location, stored into ``args.load``.
+            max_bz: maximum number of samples processed per model call.
+            shared_transformer: forwarded to ``ItersrModel`` as ``transformer``.
+        """
+        try:
+            from SwissArmyTransformer.training.model_io import load_checkpoint
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        args.load = path
+        args.kernel_size = 5
+        args.kernel_size2 = 5
+        args.new_sequence_length = 4624
+        args.layout = [16, 3616]
+        model = ItersrModel(args, transformer=shared_transformer)
+        if args.fp16:
+            model = model.half()
+        load_checkpoint(model, args)  # on cpu
+        model.eval()
+        self.model = model.cuda() if torch.cuda.is_available() else model
+        # save cpu weights
+        self.saved_weights = {k: v.cpu() for k, v in model.named_parameters() if "transformer" in k}
+        invalid_slices = [slice(tokenizer.num_image_tokens, None)]
+        self.strategy = IterativeEntfilterStrategy(
+            invalid_slices, temperature=args.temp_all_itersr, topk=args.topk_itersr
+        )
+        self.max_bz = max_bz
+    def _restore_transformer_from_cpu(self, non_blocking=False):
+        """Copy the saved CPU weights back into the live model parameters.
+
+        Args:
+            non_blocking: forwarded to ``Tensor.copy_``.
+        """
+        # In-place writes into parameters must not be tracked by autograd.
+        with torch.no_grad():
+            for k, v in self.model.named_parameters():
+                if k in self.saved_weights:
+                    v.copy_(self.saved_weights[k], non_blocking=non_blocking)
+    def __call__(self, text_tokens, image_tokens, enhance=False, input_mask=None):
+        """Refine ``image_tokens`` conditioned on ``text_tokens``.
+
+        Args:
+            text_tokens: 1-D or [batch, length] text tokens; only the first 16
+                columns are used. The caller's tensor is not modified.
+            image_tokens: 1-D or [batch, length] image tokens. The caller's
+                tensor is not modified.
+            enhance: when true, each image is decoded, sharpened by a factor
+                of 1.5 and re-encoded at ``image_size=480`` before refinement.
+            input_mask: optional boolean mask AND-ed with every round's mask.
+
+        Returns:
+            The refined tokens of all chunks concatenated along dim 0,
+            including a final partial chunk when the batch size is not a
+            multiple of ``max_bz``.
+        """
+        try:
+            from PIL import ImageEnhance, Image
+        except ModuleNotFoundError as e:
+            handle_module_not_found_error(e, ["heim"])
+        # Out-of-place unsqueeze: do not reshape the caller's tensors.
+        if len(text_tokens.shape) == 1:
+            text_tokens = text_tokens.unsqueeze(0)
+        text_tokens = text_tokens.clone()[..., :16]
+        if len(image_tokens.shape) == 1:
+            image_tokens = image_tokens.unsqueeze(0)
+        if enhance:
+            new_image_tokens = []
+            for big_img in image_tokens:
+                decoded = tokenizer.decode(image_ids=big_img).squeeze(0)
+                ndarr = decoded.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
+                image_pil_raw = ImageEnhance.Sharpness(Image.fromarray(ndarr))
+                big_img2 = tokenizer.encode(image_pil=image_pil_raw.enhance(1.5), image_size=480).view(-1)
+                new_image_tokens.append(big_img2)
+            image_tokens = torch.stack(new_image_tokens)
+        self._restore_transformer_from_cpu()
+        model = self.model
+        # Tile the 6x6 pattern 10x10 times -> 3600 flattened priorities.
+        # Loop-invariant, so it is built once.
+        mask_raw = (
+            torch.tensor(self._MASK_PATTERN).view(1, 6, 1, 6).expand(10, 6, 10, 6).reshape(-1).contiguous()
+        )
+        topks = [60, 40, 40, 40, 20, 20, 10]  # indexed by round; entry 0 is unused
+        batch_size = text_tokens.shape[0]
+        # Ceiling division so a trailing partial chunk is processed too;
+        # at least one pass is made, as before.
+        num_chunks = max((batch_size + self.max_bz - 1) // self.max_bz, 1)
+        output_list = []
+        for tim in range(num_chunks):
+            chunk = slice(tim * self.max_bz, (tim + 1) * self.max_bz)
+            # Clone: masked_fill_ below would otherwise write into the caller's tensor.
+            big_img = image_tokens[chunk].clone()
+            text_seq = text_tokens[chunk]
+            for mask_ratio in range(1, 7):
+                self.strategy.topk = topks[mask_ratio]
+                mask = mask_raw.to(big_img.device) >= mask_ratio
+                if input_mask is not None:
+                    mask = mask & input_mask
+                big_img.masked_fill_(mask, tokenizer["<start_of_image>"])
+                seq1 = big_img
+                output1 = filling_sequence_itersr(
+                    model, text_seq, seq1, warmup_steps=1, block_hw=(1, 0), strategy=self.strategy
+                )
+                big_img = output1
+            output_list.append(output1.clone())
+        return torch.cat(output_list, dim=0)

helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py ADDED Viewed

@@ -0,0 +1,269 @@
+# -*- encoding: utf-8 -*-
+"""
+@File    :   itersr_model.py
+@Time    :   2021/10/02 01:36:32
+@Author  :   Ming Ding
+@Contact :   dm18@mails.tsinghua.edu.cn
+"""
+# here put the import lib
+import math
+import torch
+import torch.nn.functional as F
+from helm.common.optional_dependencies import handle_module_not_found_error
+try:
+    from deepspeed.runtime.activation_checkpointing.checkpointing import get_cuda_rng_tracker
+    from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin
+    from SwissArmyTransformer.mpu.utils import sqrt
+    from SwissArmyTransformer.model.transformer import split_tensor_along_last_dim
+    from SwissArmyTransformer.ops.local_attention_function import f_similar, f_weighting
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["heim"])
+class PositionEmbeddingMixin(BaseMixin):
+    """Mixin holding an extra position-embedding table for added positions."""
+    def __init__(
+        self, additional_sequence_length, hidden_size, init_method_std=0.02, reinit_slice=slice(512, 512 + 400)
+    ):
+        """
+        Args:
+            additional_sequence_length: number of rows in the extra table.
+            hidden_size: embedding width.
+            init_method_std: std of the normal initialisation of the table.
+            reinit_slice: rows of the parent transformer's position table
+                that ``reinit`` copies from.
+        """
+        super().__init__()
+        self.reinit_slice = reinit_slice
+        table = torch.nn.Embedding(additional_sequence_length, hidden_size)
+        self.position_embeddings = table
+        torch.nn.init.normal_(table.weight, mean=0.0, std=init_method_std)
+    def reinit(self, parent_model=None):
+        """Tile the parent's square block of position embeddings over the new table.
+
+        The selected parent rows are viewed as an ``old_edge x old_edge`` grid
+        and broadcast-copied into every ``old_edge``-sized tile of the new
+        ``new_edge x new_edge`` grid.
+        """
+        source = self.transformer.position_embeddings.weight.data[self.reinit_slice]
+        source_len, width = source.shape
+        target = self.position_embeddings.weight
+        assert width == target.shape[-1]
+        old_edge = sqrt(source_len)
+        new_edge = sqrt(target.shape[-2])
+        assert new_edge % old_edge == 0
+        tiles = new_edge // old_edge
+        tiled_target = target.data.view(tiles, old_edge, tiles, old_edge, width)
+        tiled_target.copy_(source.view(1, old_edge, 1, old_edge, width))
+class ItersrModel(BaseModel):
+    """Base model extended with extra position embeddings and 2D sparse attention.
+
+    Positions before ``layout[0]`` use the transformer's own position table;
+    the remaining positions use the ``extra_position_embedding`` mixin.
+    """
+    def __init__(self, args, transformer=None):
+        super().__init__(args, transformer=transformer)
+        base_length = args.max_sequence_length
+        self.original_sequence_length = base_length
+        extra_positions = PositionEmbeddingMixin(args.new_sequence_length - base_length, args.hidden_size)
+        self.add_mixin("extra_position_embedding", extra_positions)
+        # self.add_mixin('attention_plus', AttentionMixin(
+        #     num_layers=args.num_layers,
+        #     hidden_size=args.hidden_size
+        # ))
+        # [PAD]... [ROI1] text ... [BOI1] {layout[0]} 1024 {layout[1]} [EOI1] 4095 {layout[2]}
+        self.layout = args.layout
+        self.kernel_size = args.kernel_size
+        self.kernel_size2 = args.kernel_size2
+        self.log_attention_weights = None
+    def position_embedding_forward(self, position_ids, **kw_args):
+        """Embed the first ``layout[0]`` ids with the base table and the rest with the extra table."""
+        boundary = self.layout[0]
+        base_ids = position_ids[..., :boundary]
+        # Ids for the extra table are offset by the original sequence length.
+        extra_ids = position_ids[..., boundary:] - self.original_sequence_length
+        base_part = self.transformer.position_embeddings(base_ids)
+        extra_part = self.get_mixin("extra_position_embedding").position_embeddings(extra_ids)
+        return torch.cat((base_part, extra_part), dim=-2)
+    def attention_forward(self, hidden_states, mask, layer_id=None, log_attention_weights=None, **kw_args):
+        """Run ``sparse_attention_2d_text`` with the layer's own QKV and output projections."""
+        attention = self.transformer.layers[layer_id].attention
+        boundary = self.layout[0]
+        qkv = attention.query_key_value(hidden_states)
+        # base model qkv
+        text_q, text_k, text_v = split_tensor_along_last_dim(qkv[:, :boundary], 3)
+        # cuda2d model qkv
+        image_q, image_k, image_v = split_tensor_along_last_dim(qkv[:, boundary:], 3)
+        # Dropout is only applied in training mode.
+        if self.training:
+            dropout_fn = attention.attention_dropout
+        else:
+            dropout_fn = None
+        # cuda2d attention
+        context = sparse_attention_2d_text(
+            text_q,
+            text_k,
+            text_v,
+            image_q,
+            image_k,
+            image_v,
+            mask,
+            n_head=attention.num_attention_heads_per_partition,
+            text_len=boundary,
+            kernel_size=self.kernel_size,
+            attention_dropout=dropout_fn,
+            log_attention_weights=log_attention_weights,
+        )
+        return attention.dense(context)
+    def final_forward(self, logits, **kwargs):
+        """Project hidden states onto the first 20000 word embeddings and return float logits."""
+        head_weight = self.transformer.word_embeddings.weight[:20000]
+        return torch.nn.functional.linear(logits, head_weight).float()
+    # def disable_untrainable_params(self):
+    #     self.transformer.requires_grad_(False)
+    @classmethod
+    def add_model_specific_args(cls, parser):
+        """Register this model's command-line options on ``parser`` and return it."""
+        group = parser.add_argument_group("Cuda2dModel", "cuda2d model configurations")
+        options = (
+            ("--kernel-size", int, 5),
+            ("--kernel-size2", int, 5),
+            ("--layout", str, "16,3616"),
+            ("--new-sequence-length", int, 4096),
+        )
+        for flag, kind, default in options:
+            group.add_argument(flag, type=kind, default=default)
+        return parser
+def sparse_attention_2d_text(
+    q0,
+    k0,
+    v0,
+    q1,
+    k1,
+    v1,
+    attention_mask,
+    n_head,
+    text_len,
+    kernel_size=9,
+    attention_dropout=None,
+    log_attention_weights=None,
+    **kwargs,
+):
+    """
+    Two-level attention: full attention inside level 0 (text), and for level 1
+    (image grid) a joint softmax over cross attention to level 0 plus local
+    2D attention computed by ``f_similar`` / ``f_weighting``.
+
+    q0, k0, v0: [batch_size, s0, hidden_size] (16 text positions in this file's caller)
+    q1, k1, v1: [batch_size, s1, hidden_size]; s1 is viewed as an l1 x l1 grid
+        with l1 = sqrt(s1) (3600 positions per the original note)
+    n_head: int, number of attention heads; per-head width is hidden_size // n_head
+    attention_mask: last dimension must equal s0 (asserted below); it is
+        multiplied into the scores, so it must broadcast against them
+    text_len: not used in this function body
+    kernel_size: forwarded to f_similar / f_weighting as (kernel_size * 2 - 1, kernel_size)
+    attention_dropout: optional callable applied to the level-1 probabilities
+        only, inside ``get_cuda_rng_tracker().fork()``
+    log_attention_weights: optional tensor added to the level-1 -> level-0
+        scores before masking
+    Returns: [batch_size, s0 + s1, hidden_size], level-0 context first.
+    """
+    b, s0, h0 = q0.shape
+    b, s1, h1 = q1.shape
+    # h: per-head width; l1: grid edge (assumes sqrt returns an exact integer root -- TODO confirm)
+    h, l1 = h0 // n_head, sqrt(s1)
+    assert attention_mask.shape[-1] == s0, f"Mask Shape: {attention_mask.shape}"
+    # split heads: q0, v0 -> [b, n_head, s0, h]; k0T -> [b, n_head, h, s0] (already transposed)
+    q0 = q0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
+    v0 = v0.reshape(b, s0, n_head, h).permute(0, 2, 1, 3)
+    k0T = k0.reshape(b, s0, n_head, h).permute(0, 2, 3, 1)
+    # standard attention for level 0
+    attention_scores = torch.matmul(q0 / math.sqrt(q0.shape[-1]), k0T)
+    # masked positions (mask == 0) get a score of -10000 instead of their real score
+    attention_scores = torch.mul(attention_scores, attention_mask) - 10000.0 * (1.0 - attention_mask)
+    attention_probs0 = F.softmax(attention_scores, dim=-1)
+    # local attention for level 1
+    # q1/k1/v1 -> [b * n_head, head_width, l1, l1]; q1 is pre-scaled by 1/sqrt(head_width)
+    q1 = (
+        (q1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1) / math.sqrt(h1 // n_head))
+        .contiguous()
+        .view(b * n_head, h1 // n_head, l1, l1)
+    )
+    k1 = k1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b * n_head, h1 // n_head, l1, l1)
+    v1 = v1.view(b, s1, n_head, h1 // n_head).permute(0, 2, 3, 1).contiguous().view(b * n_head, h1 // n_head, l1, l1)
+    # NOTE(review): f_similar comes from SwissArmyTransformer; presumably it scores each
+    # grid cell against a local window of keys -- confirm against that library.
+    scores_1_to_1 = f_similar(q1, k1, kernel_size * 2 - 1, kernel_size, False)
+    # cross attention
+    # q1 was already scaled above, so no extra 1/sqrt factor is applied here
+    scores_1_to_0 = torch.matmul(q1.view(b, n_head, h, s1).transpose(-1, -2), k0T)
+    if log_attention_weights is not None:
+        scores_1_to_0 += log_attention_weights
+    scores_1_to_0 = torch.mul(scores_1_to_0, attention_mask) - 10000.0 * (1.0 - attention_mask)
+    # concatenate cross scores (first s0 columns) and local scores so that one softmax
+    # normalises over both
+    scores_1 = torch.cat(
+        (scores_1_to_0.view(b * n_head, s1, s0), scores_1_to_1.view(b * n_head, -1, scores_1_to_1.shape[3])), dim=-1
+    )
+    attention_probs1 = F.softmax(scores_1, dim=-1)
+    # dropout is applied to the level-1 probabilities only
+    if attention_dropout is not None:
+        with get_cuda_rng_tracker().fork():
+            attention_probs1 = attention_dropout(attention_probs1)
+    # weighting for level 0
+    context0 = torch.matmul(attention_probs0, v0)  # [b, n_head, s0, h]
+    # weighting for level 1
+    # the trailing columns of the joint softmax are the local-window probabilities
+    probs_1_to_1 = attention_probs1[:, :, -scores_1_to_1.shape[3] :].view_as(scores_1_to_1)
+    context1_to_1 = f_weighting(v1, probs_1_to_1.contiguous(), kernel_size * 2 - 1, kernel_size, False)
+    context1 = context1_to_1.view(b, n_head, h, l1**2)
+    # weighting for cross attention
+    # the leading s0 columns of the joint softmax are the probabilities over level 0
+    probs_1_to_0 = attention_probs1[:, :, : scores_1_to_0.shape[3]].view(b, n_head, -1, scores_1_to_0.shape[3])
+    context1_to_0 = torch.matmul(probs_1_to_0, v0)
+    context1 = context1.transpose(-1, -2) + context1_to_0
+    # merge heads back: [b, n_head, s0 + s1, h] -> [b, s0 + s1, h0]
+    output = torch.cat((context0, context1), dim=2).transpose(1, 2).reshape(b, s0 + s1, h0)
+    return output
+def sparse_attention_2d_notext(
+    q0,
+    k0,
+    v0,
+    q1,
+    k1,
+    v1,
+    attention_mask,
+    n_head,
+    text_len,
+    kernel_size=9,
+    attention_dropout=None,
+    log_attention_weights=None,
+    **kwargs,
+):
+    """
+    Two-level attention without cross attention: level 0 attends to itself
+    with standard attention, level 1 only uses local 2D attention.
+
+    q0, k0, v0: [batch_size, s0, hidden_size] (16 per the original note)
+    q1, k1, v1: [batch_size, s1, hidden_size] (3600 per the original note);
+        s1 is viewed as a square grid
+    n_head: int
+    attention_mask: 4-D, last dimension equal to s0 (asserted)
+    text_len, log_attention_weights: not used in this function body
+    Returns: [batch_size, s0 + s1, hidden_size], level-0 context first.
+    """
+    _, n_text, width0 = q0.shape
+    batch, n_image, width1 = q1.shape
+    head_dim0 = width0 // n_head
+    edge = sqrt(n_image)
+    assert len(attention_mask.shape) == 4 and attention_mask.shape[-1] == n_text, f"Mask Shape: {attention_mask.shape}"
+    head_dim1 = width1 // n_head
+    window = kernel_size * 2 - 1
+    def _text_heads(x):
+        # [b, s0, hidden] -> [b, s0, n_head, head_dim0]
+        return x.reshape(batch, n_text, n_head, head_dim0)
+    def _image_heads(x):
+        # [b, s1, hidden] -> [b, n_head, head_dim1, s1]
+        return x.view(batch, n_image, n_head, head_dim1).permute(0, 2, 3, 1)
+    def _as_grid(x):
+        # [b, n_head, head_dim1, s1] -> [b * n_head, head_dim1, edge, edge]
+        return x.contiguous().view(batch * n_head, head_dim1, edge, edge)
+    text_q = _text_heads(q0).permute(0, 2, 1, 3)
+    text_v = _text_heads(v0).permute(0, 2, 1, 3)
+    text_k_t = _text_heads(k0).permute(0, 2, 3, 1)
+    # standard attention for level 0
+    text_scores = torch.matmul(text_q / math.sqrt(head_dim0), text_k_t)
+    text_scores = torch.mul(text_scores, attention_mask) - 10000.0 * (1.0 - attention_mask)
+    text_probs = F.softmax(text_scores, dim=-1)
+    # local attention for level 1 (queries are pre-scaled)
+    image_q = _as_grid(_image_heads(q1) / math.sqrt(head_dim1))
+    image_k = _as_grid(_image_heads(k1))
+    image_v = _as_grid(_image_heads(v1))
+    local_probs = F.softmax(f_similar(image_q, image_k, window, kernel_size, False), dim=-1)
+    if attention_dropout is not None:
+        with get_cuda_rng_tracker().fork():
+            local_probs = attention_dropout(local_probs)
+    # weighting for level 0
+    text_context = torch.matmul(text_probs, text_v)  # [b, n_head, s0, h]
+    # weighting for level 1
+    image_context = f_weighting(image_v, local_probs.contiguous(), window, kernel_size, False)
+    image_context = image_context.view(batch, n_head, head_dim0, edge**2).transpose(-1, -2)
+    # merge heads back into [b, s0 + s1, hidden]
+    merged = torch.cat((text_context, image_context), dim=2)
+    return merged.transpose(1, 2).reshape(batch, n_text + n_image, width0)

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl