PyPI - mortm - Versions diffs - 4.5__py3-none-any.whl - Mend

mortm 4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

mortm/__init__.py +0 -0
mortm/constants.py +31 -0
mortm/models/__init__.py +0 -0
mortm/models/bertm.py +294 -0
mortm/models/modules/PositionalEncoding.py +27 -0
mortm/models/modules/__init__.py +0 -0
mortm/models/modules/attention.py +300 -0
mortm/models/modules/audio_patch.py +44 -0
mortm/models/modules/config.py +77 -0
mortm/models/modules/layers.py +471 -0
mortm/models/modules/progress.py +52 -0
mortm/models/mortm.py +338 -0
mortm/models/mortm_live.py +26 -0
mortm/models/v_mortm.py +65 -0
mortm/train/__init__.py +0 -0
mortm/train/config.py +55 -0
mortm/train/custom_token.py +603 -0
mortm/train/datasets.py +321 -0
mortm/train/epoch.py +20 -0
mortm/train/noam.py +7 -0
mortm/train/rl/__init__.py +0 -0
mortm/train/rl/reinforcement.py +207 -0
mortm/train/tokenizer.py +204 -0
mortm/train/train.py +686 -0
mortm/train/utils/__init__.py +0 -0
mortm/train/utils/chord_midi.py +47 -0
mortm/train/utils/loss.py +135 -0
mortm/utils/__init__.py +0 -0
mortm/utils/convert.py +1220 -0
mortm/utils/de_convert.py +40 -0
mortm/utils/eval.py +155 -0
mortm/utils/generate.py +149 -0
mortm/utils/gmail_messanger.py +66 -0
mortm/utils/key.py +354 -0
mortm/utils/messager.py +21 -0
mortm/utils/pianoroll_convert.py +182 -0
mortm/utils/tag.py +97 -0
mortm-4.5.dist-info/METADATA +254 -0
mortm-4.5.dist-info/RECORD +41 -0
mortm-4.5.dist-info/WHEEL +5 -0
mortm-4.5.dist-info/top_level.txt +1 -0

mortm/__init__.py ADDED Viewed

File without changes

mortm/constants.py ADDED Viewed

@@ -0,0 +1,31 @@
+import torch
+# Numeric limits for the token features.
+# NOTE(review): how these limits are consumed is not visible in this module -- confirm at the call sites.
+PITCH_MAX = 128
+VELO = 128
+LENGTH = 999
+LENGTH_HALF = 999
+BEGIN = 999
+BEGIN_HALF = 999
+ROOT = 99
+# String constants for the sequence-start, sequence-end and padding markers, plus the model name.
+START_SEQ_TOKEN = "<S_SEQ>"
+END_SEQ_TOKEN = "<E_SEQ>"
+PADDING_TOKEN = "<PAD>"
+MODEL_NAME = "MORTM"
+# Each *_BEGIN_ID is the previous group's first ID plus the number of IDs that group uses.
+# Resulting values: PADDING 0, SPECIAL 1, PITCH 3, VELOCITY 131, DURATION 259, START 359, SHIFT 391.
+PADDING_BEGIN_ID = 0
+SPECIAL_BEGIN_ID = PADDING_BEGIN_ID + 1
+PITCH_BEGIN_ID = SPECIAL_BEGIN_ID + 2
+VELOCITY_BEGIN_ID = PITCH_BEGIN_ID + 128
+DURATION_BEGIN_ID = VELOCITY_BEGIN_ID + 128
+START_BEGIN_ID = DURATION_BEGIN_ID + 100
+SHIFT_BEGIN_ID = START_BEGIN_ID + 32
+# ID ranges per token group.
+# NOTE(review): every range stops at "begin + size + 1" (exclusive), so for the first four groups
+# the last element equals the next group's begin ID (e.g. PITCH_GROUP contains VELOCITY_BEGIN_ID).
+# Confirm whether this one-ID overlap is intended before using the ranges for membership tests.
+PITCH_GROUP = range(PITCH_BEGIN_ID, PITCH_BEGIN_ID + 128 + 1)
+VELOCITY_GROUP = range(VELOCITY_BEGIN_ID, VELOCITY_BEGIN_ID + 128 + 1)
+DURATION_GROUP = range(DURATION_BEGIN_ID, DURATION_BEGIN_ID + 100 + 1)
+START_GROUP = range(START_BEGIN_ID, START_BEGIN_ID + 32 + 1)
+SHIFT_GROUP = range(SHIFT_BEGIN_ID, SHIFT_BEGIN_ID + 4 + 1)

mortm/models/__init__.py ADDED Viewed

File without changes

mortm/models/bertm.py ADDED Viewed

@@ -0,0 +1,294 @@
+import numpy as np
+import torch
+from torch.distributions import Categorical
+from .modules.layers import *
+from .modules.config import MORTMArgs
+from flash_attn.bert_padding import unpad_input, pad_input
+class ActorCritic(nn.Module):
+    """Decoder-only policy network (actor) with a scalar value head (critic).
+    The decoder output is projected by ``Wout`` to vocabulary logits and by
+    ``critic_hidden``/``critic_out`` to one value per position.
+    """
+    def __init__(self, args: MORTMArgs, progress):
+        """Build the embedding, the decoder and both output heads.
+        Args:
+            args: Model configuration; this constructor reads ``e_layer``,
+                ``d_layer``, ``num_heads``, ``d_model``, ``dim_feedforward``,
+                ``dropout``, ``use_lora``, ``vocab_size`` and, when LoRA is
+                enabled, ``lora_r``/``lora_alpha``.
+            progress: Object whose ``get_device()`` names the device that the
+                embedding, the non-LoRA output projection and the softmax are
+                moved to.
+        """
+        super(ActorCritic, self).__init__()
+        self.args = args
+        self.progress = progress
+        self.e_layer = args.e_layer
+        self.d_layer = args.d_layer
+        self.num_heads = args.num_heads
+        self.d_model = args.d_model
+        self.dim_feedforward = args.dim_feedforward
+        self.dropout = args.dropout
+        self.use_lora = args.use_lora
+        self.decoder = MORTMDecoder(args, progress=progress)
+        print(f"Input Vocab Size:{args.vocab_size}")
+        # Token ID 0 is the padding index of the embedding.
+        self.embedding: nn.Embedding = nn.Embedding(args.vocab_size, self.d_model, padding_idx=0).to(self.progress.get_device())
+        if not self.use_lora:
+            self.Wout: nn.Linear = nn.Linear(self.d_model, args.vocab_size).to(self.progress.get_device())
+        else:
+            self.Wout: lora.Linear = lora.Linear(self.d_model, args.vocab_size, r=args.lora_r, lora_alpha=args.lora_alpha)
+        self.critic_hidden = nn.Linear(self.d_model, self.d_model // 2)
+        self.critic_out = nn.Linear(self.d_model // 2, 1)  # one scalar value per position
+        self.softmax: nn.Softmax = nn.Softmax(dim=-1).to(self.progress.get_device())
+    @staticmethod
+    def _to_numpy(tensor):
+        """Convert a tensor to a NumPy array, detaching it from autograd first.
+        bfloat16 has no NumPy equivalent, so such tensors are upcast to
+        float32; every other dtype is passed through unchanged.
+        """
+        tensor = tensor.detach()
+        if tensor.dtype == torch.bfloat16:
+            tensor = tensor.float()
+        return tensor.cpu().numpy()
+    def evaluate_actions(self, sequence_tensors, padding_mask):
+        """Return per-step log-probabilities of the taken tokens and the critic values.
+        Args:
+            sequence_tensors: 2-D token tensor; column ``t`` is the input and
+                column ``t + 1`` the target of step ``t``.
+            padding_mask: Mask aligned with ``sequence_tensors`` or ``None``.
+                It is multiplied into the log-probabilities, so padded
+                positions must be 0/False.
+        Returns:
+            ``(log_probs, values)``; ``log_probs`` has the first two
+            dimensions of the logits and ``values`` is reshaped to its own
+            first two dimensions.
+        """
+        # 1. Build inputs and targets, shifted by one position.
+        input_ids = sequence_tensors[:, :-1]
+        target_ids = sequence_tensors[:, 1:]
+        # Shift the mask the same way.
+        if padding_mask is not None:
+            input_mask = padding_mask[:, :-1]
+        else:
+            input_mask = None
+        # 2. Run the model on the inputs to obtain the logits of every step.
+        logits, new_values = self.forward(input_ids, padding_mask=input_mask, is_causal=True)
+        reshaped_logits = logits.view(-1, self.args.vocab_size)
+        reshaped_targets = target_ids.reshape(-1).long()
+        # Evaluate all steps at once through a categorical distribution.
+        dist = Categorical(logits=reshaped_logits)
+        log_probs = dist.log_prob(reshaped_targets)
+        # Restore the (batch, seq_len - 1) layout.
+        log_probs = log_probs.view(logits.size(0), logits.size(1))
+        # Zero the log-probabilities of padded targets.
+        if padding_mask is not None:
+            log_probs = log_probs * padding_mask[:, 1:]
+        return log_probs, new_values.reshape(new_values.shape[0], new_values.shape[1])
+    def eval_seq(self, src, print_log=True):
+        """Generate tokens autoregressively with the KV cache (supports batches).
+        Args:
+            src: Prompt token IDs as a NumPy array or tensor, 1-D (single
+                sequence) or 2-D (batch). Token ID 0 is treated as padding.
+            print_log: When true, progress and debug information is printed.
+        Returns:
+            Three lists with one NumPy array per batch row: the kept tokens,
+            the critic values and the log-probabilities. Prompt padding is
+            removed and each row is cut after its first token 585.
+        Raises:
+            ValueError: If a kept token ID is not a valid vocabulary index.
+        """
+        self.eval()
+        is_running = True
+        device = self.progress.get_device()
+        # This module imports NumPy as ``np``; the bare name ``numpy`` is only
+        # defined if a star-import happens to re-export it.
+        if isinstance(src, np.ndarray):
+            src = torch.tensor(src, device=device)
+        if src.dim() == 1:
+            src = src.unsqueeze(0)
+        # --- 1. Prompt processing (pre-fill) ---
+        if print_log: print("--- Pre-fill Phase ---")
+        prompt_padding_mask = (src != self.embedding.padding_idx)
+        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+            logits, values = self.forward(src, padding_mask=prompt_padding_mask, is_causal=True, is_save_cache=True)
+        prob_list = Categorical(logits=logits)
+        # NOTE(review): for a right-padded prompt the last column is a padded
+        # position -- confirm that sampling from it is intended.
+        last_token_logits = logits[:, -1, :]
+        # next_tokens has shape (batch_size,).
+        next_tokens = self.top_p_sampling(last_token_logits, p=0.95, temperature=1.0)
+        # Tensor that accumulates every token (prompt + generated).
+        all_tokens = torch.cat([src, next_tokens.unsqueeze(1)], dim=1)
+        all_values = values
+        all_probs = prob_list.log_prob(all_tokens[:, 1:])
+        # --- 2. Token generation (decoding) ---
+        # Current sequence length. ``len(all_tokens)`` would be the batch size
+        # and would make the position_length cut-off below meaningless.
+        i = all_tokens.shape[1]
+        if print_log: print("\n--- Decoding Phase ---")
+        while is_running:
+            # The input is the token generated in the previous step.
+            input_tokens = next_tokens
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                logits, values = self.forward(input_tokens, padding_mask=None, is_causal=True, is_save_cache=True)
+            probs_list = Categorical(logits=logits.squeeze(1))
+            # next_tokens has shape (batch_size,).
+            next_tokens = self.top_p_sampling(logits.squeeze(1), p=0.95, temperature=1.0)
+            # Append the generated step to the accumulated tensors.
+            all_tokens = torch.cat([all_tokens, next_tokens.unsqueeze(1)], dim=1)
+            all_values = torch.cat([all_values, values.unsqueeze(1)], dim=1)
+            all_probs = torch.cat([all_probs, probs_list.log_prob(next_tokens).unsqueeze(1)], dim=1)
+            if print_log: print(f"\r Step {i+1}: Generated tokens {next_tokens.tolist()}", end="")
+            if self.is_end_point(all_tokens) or i > self.args.position_length:
+                is_running = False
+            i += 1
+        if print_log: print(all_tokens.shape, all_values.shape, all_probs.shape)
+        np_all_tokens = []
+        np_all_values = []
+        np_all_probs = []
+        if print_log: print(all_tokens.max())
+        for row, seq in enumerate(all_tokens):
+            np_seq = np.array([], dtype=int)
+            pad = (seq == 0).nonzero(as_tuple=True)[0]
+            end_hits = (seq == 585).nonzero(as_tuple=True)[0]
+            # Index of the first token 585, or the last index when none was
+            # generated. This is always < len(seq), so no separate
+            # "no end token" branch is needed below.
+            eseq = end_hits[0].item() if len(end_hits) != 0 else len(seq) - 1
+            if len(pad) != 0:
+                start = int(pad[0])
+                end = int(pad[-1])
+                # Values/log-probs are one element shorter than the tokens, so
+                # their slices are shifted by one; clamp so that a sequence
+                # starting with padding does not turn into the slice ``[:-1]``.
+                head = max(start - 1, 0)
+                np_seq = np.append(np_seq, seq[:start].cpu().numpy())
+                np_value = self._to_numpy(all_values[row, :head])
+                np_probs = self._to_numpy(all_probs[row, :head])
+                np_seq = np.append(np_seq, seq[end+1:eseq+1].cpu().numpy())
+                np_value = np.append(np_value, self._to_numpy(all_values[row, end:eseq]))
+                np_probs = np.append(np_probs, self._to_numpy(all_probs[row, end:eseq]))
+                if print_log:print(f"!WDFG : {np_seq.shape, np_value.shape, np_probs.shape}")
+            else:  # no padding in this row
+                np_seq = np.append(np_seq, seq[:eseq+1].cpu().numpy())
+                np_value = np.append(np.array([], dtype=float), self._to_numpy(all_values[row, :eseq]))
+                np_probs = np.append(np.array([], dtype=float), self._to_numpy(all_probs[row, :eseq]))
+                if print_log: print(f"Not ESEQ LEN : {np_seq.shape, np_value.shape, np_probs.shape}")
+            np_all_tokens.append(np_seq)
+            np_all_values.append(np_value)
+            np_all_probs.append(np_probs)
+            # Valid IDs are 0 .. vocab_size - 1, so vocab_size itself is already invalid.
+            if np_seq.size and np_seq.max() >= self.args.vocab_size:
+                raise ValueError(
+                    f"生成されたトークンIDが語彙サイズ({self.args.vocab_size})を超えています: {np_seq.max()}"
+                )
+        return np_all_tokens, np_all_values, np_all_probs
+    def forward(self, x, padding_mask=None, is_causal=False, is_save_cache=False):
+        """Run the decoder and return ``(vocabulary logits, critic values)``.
+        Args:
+            x: Token IDs. With a ``padding_mask`` the embedded input must be
+                3-D; without one it must be 2-D (i.e. ``x`` is 1-D), because
+                ``x.size()`` is unpacked into exactly two values.
+            padding_mask: Mask handed to ``unpad_input``; when given, padded
+                positions are removed before the decoder and restored after.
+            is_causal: Forwarded to the decoder as ``tgt_is_causal``.
+            is_save_cache: Forwarded to the decoder.
+        """
+        x: Tensor = self.embedding(x).to(dtype=torch.bfloat16)
+        if padding_mask is not None:
+            batch, tgt_len, embed_dim = x.size()
+            x, indices, cu_seqlens, max_s, used_seqlens = unpad_input(x, padding_mask)
+        else:
+            tgt_len, embed_dim = x.size()
+            batch = None
+            indices = cu_seqlens = max_s = used_seqlens = None
+        out = self.decoder(tgt=x, tgt_is_causal=is_causal, cu_seqlens=cu_seqlens, max_seqlen=max_s,
+                           batch_size=batch, indices=indices, is_save_cache=is_save_cache)
+        if padding_mask is not None:
+            out = pad_input(out, indices, batch, tgt_len)
+        with torch.autocast(device_type="cuda", dtype=torch.float32):
+            score: Tensor = self.Wout(out)
+            hidden = self.critic_hidden(out)
+            hidden = F.relu(hidden)
+            critic_score = self.critic_out(hidden)
+        return score, critic_score
+    def is_end_point(self, x: torch.Tensor) -> bool:
+        """Return True when every row of ``x`` contains token 585 or 586.
+        Args:
+            x: 2-D tensor of token IDs, one sequence per row.
+        NOTE(review): 585/586 are hard-coded here; presumably they are the
+        end-of-sequence markers of the tokenizer -- confirm.
+        """
+        mask = (x == 585) | (x == 586)
+        per_row_has_end = mask.any(dim=1)
+        return bool(per_row_has_end.all())
+    def top_p_sampling(self, logits: Tensor, p=0.9, temperature=1.0) -> Tensor:
+        """Sample one token per row with nucleus (top-p) sampling.
+        Args:
+            logits: Tensor whose last dimension is the vocabulary.
+            p: Cumulative-probability threshold of the nucleus.
+            temperature: Divisor applied to the logits before the softmax.
+        Returns:
+            The sampled token IDs with the last dimension squeezed away.
+        Raises:
+            ValueError: If a sampled ID is not a valid vocabulary index.
+        """
+        logits = logits / temperature
+        probs = self.softmax(logits)
+        sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+        sorted_probs_to_remove = cumulative_probs > p
+        # Shift right by one so the token that crosses the threshold is kept,
+        # and always keep the most probable token.
+        sorted_probs_to_remove[..., 1:] = sorted_probs_to_remove[..., :-1].clone()
+        sorted_probs_to_remove[..., 0] = 0
+        probs_to_keep = sorted_probs.masked_fill(sorted_probs_to_remove, 0)
+        # Add a tiny constant to the denominator to avoid division by zero.
+        probs_sum = probs_to_keep.sum(dim=-1, keepdim=True)
+        renormalized_probs = probs_to_keep / (probs_sum + 1e-9)
+        sampled_next_indices = torch.multinomial(renormalized_probs, num_samples=1)
+        sampled_original_indices = torch.gather(sorted_indices, dim=-1, index=sampled_next_indices)
+        r = sampled_original_indices.squeeze(-1)
+        # Take the vocabulary size from the logits; valid IDs are < vocab_size,
+        # which is also what the error message ("以上") states.
+        vocab_size = logits.shape[-1]
+        if r.max().item() >= vocab_size:
+            raise ValueError(
+                f"サンプリングされたトークンIDが語彙サイズ({vocab_size})以上です: {r.max().item()}"
+            )
+        return r
+class BERTM(nn.Module):
+    """Non-causal decoder stack with pooling that emits one score per pooled item."""
+    def __init__(self, args: MORTMArgs, progress):
+        """Create the embedding, decoder, pooling layer and the two-layer scoring head."""
+        super().__init__()
+        self.args = args  # kept for later reference
+        width = args.d_model
+        half_width = width // 2
+        self.embedding = nn.Embedding(args.vocab_size, width)
+        self.decoder = MORTMDecoder(args=args, progress=progress)
+        self.attn_pool = Pool(args)
+        self.hidden = nn.Linear(width, half_width)
+        self.Wout = nn.Linear(half_width, 1)  # input width matches the hidden layer's output
+    def forward(self, x: Tensor, padding_mask=None):
+        """Embed ``x``, run the decoder without a causal mask, pool, and score.
+        When ``padding_mask`` is given the embedded input is unpadded first
+        and the resulting cumulative sequence lengths drive the pooling;
+        otherwise the whole input is pooled as a single sequence.
+        """
+        embedded = self.embedding(x).to(dtype=torch.bfloat16)
+        if padding_mask is None:
+            cu_seqlens = None
+            max_s = None
+        else:
+            embedded, _indices, cu_seqlens, max_s, _used_seqlens = unpad_input(embedded, padding_mask)
+        decoded = self.decoder(tgt=embedded, tgt_is_causal=False, cu_seqlens=cu_seqlens, max_seqlen=max_s)
+        if cu_seqlens is None:
+            # No unpadding happened: describe the input as one sequence [0, len).
+            boundaries = torch.tensor([0, len(embedded)], dtype=torch.int32, device=embedded.device)
+        else:
+            boundaries = cu_seqlens
+        pooled = self.attn_pool(decoded, boundaries)
+        return self.Wout(F.relu(self.hidden(pooled)))

mortm/models/modules/PositionalEncoding.py ADDED Viewed

@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+import math
+from .progress import LearningProgress
+class PositionalEncoding(nn.Module):
+    """Fixed sinusoidal positional encoding followed by dropout."""
+    def __init__(self, d_model, progress: LearningProgress, dropout=0.1, max_len=5000):
+        """Precompute the encoding table and register it as the buffer ``pe``.
+        Args:
+            d_model: Width of the encoding; may be even or odd.
+            progress: Object whose ``get_device()`` gives the device of the table.
+            dropout: Dropout probability applied in ``forward``.
+            max_len: Number of positions to precompute.
+        """
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        device = progress.get_device()
+        # Build the positional-encoding table.
+        pe = torch.zeros(max_len, d_model, device=device)
+        position = torch.arange(0, max_len, dtype=torch.float, device=device).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)).to(device)
+        angles = position * div_term
+        pe[:, 0::2] = torch.sin(angles)
+        # With an odd d_model there is one fewer odd column than even columns,
+        # so drop the surplus frequency instead of failing on a shape mismatch.
+        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
+        # Stored as (max_len, 1, d_model).
+        pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer('pe', pe)
+    def forward(self, x):
+        """Add the first ``x.size(0)`` rows of the table to ``x`` and apply dropout."""
+        x = x + self.pe[:x.size(0), :]
+        return self.dropout(x)

mortm/models/modules/__init__.py ADDED Viewed

File without changes

mortm/models/modules/attention.py ADDED Viewed

@@ -0,0 +1,300 @@
+from typing import Optional
+from torch.nn.parameter import Parameter
+from torch.nn.init import *
+from typing import Optional, Tuple
+import loralib.layers as lora
+from torch.nn.functional import linear, softmax, dropout
+import torch
+import torch.nn as nn
+from torch import Tensor
+import math
+from einops import rearrange
+from .config import MORTMArgs
+# Optional flash-attn imports. A failed import is only reported on stdout; the names
+# stay undefined and fail later at their first use.
+# NOTE(review): IS_NOT_LINUX is set to True in BOTH branches, so every
+# ``if not IS_NOT_LINUX`` check elsewhere in this module is never taken.
+try:
+    from flash_attn.layers.rotary import RotaryEmbedding, apply_rotary_emb
+    IS_NOT_LINUX = True  # original author's note: this should really be False
+except ImportError as i:
+    IS_NOT_LINUX = True
+    print(f"モジュールをインストールできませんでした。（WindowsではFlashを利用できません）\n {i.name}")
+try:
+    from flash_attn.bert_padding import pad_input, unpad_input
+    from flash_attn.flash_attn_interface import *
+    from flash_attn.flash_attn_interface import flash_attn_varlen_kvpacked_func, flash_attn_qkvpacked_func
+except ImportError as i:
+    print(f"モジュールをインストールできませんでした。\n {i.name}")
+# The FlashAttention-2 functions are imported above
+# (assuming the library is already installed).
+def marge_cache(kv_cache: Optional[Tuple[Tensor, Tensor]], cache_seqlens: Optional[Tensor],
+                k: Tensor, v: Tensor) -> Tuple[Optional[Tuple[Tensor, Tensor]], Optional[Tensor]]:
+    """Write one new key/value step per batch row into the cache.
+    For every row ``i`` the entries ``k[i, 0]`` / ``v[i, 0]`` are stored at
+    slot ``cache_seqlens[i]`` and that counter is incremented in place. When a
+    slot lies beyond the current cache length, both caches are extended with
+    zero-filled slots along dimension 1.
+    Args:
+        kv_cache: Pair ``(k_cache, v_cache)``; a tuple or a list. A list is
+            updated in place and returned; a tuple cannot be item-assigned,
+            so a new tuple is returned instead.
+        cache_seqlens: Per-row write positions; modified in place.
+        k: New keys, indexed as ``k[i, 0]``.
+        v: New values, indexed as ``v[i, 0]``.
+    Returns:
+        ``(kv_cache, cache_seqlens)`` after the update.
+    """
+    k_cache, v_cache = kv_cache
+    for i in range(k.shape[0]):
+        pos = int(cache_seqlens[i])  # write position inside the sequence
+        missing = pos + 1 - k_cache.shape[1]
+        if missing > 0:
+            k_extra = (k_cache.shape[0], missing) + tuple(k_cache.shape[2:])
+            v_extra = (v_cache.shape[0], missing) + tuple(v_cache.shape[2:])
+            k_cache = torch.cat([k_cache, k_cache.new_zeros(k_extra)], dim=1)
+            v_cache = torch.cat([v_cache, v_cache.new_zeros(v_extra)], dim=1)
+        k_cache[i, pos, :, :] = k[i, 0]  # store at row i, slot pos
+        v_cache[i, pos, :, :] = v[i, 0]
+        cache_seqlens[i] += 1
+    if isinstance(kv_cache, list):
+        kv_cache[0], kv_cache[1] = k_cache, v_cache
+        return kv_cache, cache_seqlens
+    return (k_cache, v_cache), cache_seqlens
+def get_alibi_slopes(n_heads):
+    """
+    Compute the ALiBi slope of every attention head.
+    A power-of-two head count yields a plain geometric sequence; any other
+    count takes the sequence of the largest power of two below it and fills
+    the remainder with every second slope of the next larger power of two.
+    """
+    def _geometric(count):
+        ratio = 2 ** (-2 ** -(math.log2(count) - 3))
+        return [ratio * (ratio ** step) for step in range(count)]
+    exponent = math.log2(n_heads)
+    if exponent.is_integer():
+        return _geometric(n_heads)
+    lower_power = 2 ** math.floor(exponent)
+    result = _geometric(lower_power)
+    every_second = get_alibi_slopes(2 * lower_power)[0::2]
+    result.extend(every_second[: n_heads - lower_power])
+    return result
+class QKVLinear(nn.Module):
+    """Input and output projections shared by the attention modules.
+    Self-attention mode uses one fused QKV projection (optionally LoRA);
+    cross-attention mode uses a Q projection and a fused KV projection.
+    All projection layers are created in bfloat16.
+    """
+    def __init__(self, args: MORTMArgs, use_cross_attention: bool=False):
+        super().__init__()
+        self.num_heads = args.num_heads
+        self.drop_out = nn.Dropout(args.dropout)
+        self.use_cross_attention = use_cross_attention
+        width = args.d_model
+        bf16 = torch.bfloat16
+        if use_cross_attention:
+            self.q_weight = nn.Linear(width, width, bias=True, dtype=bf16)
+            self.kv_weight = nn.Linear(width, 2 * width, bias=True, dtype=bf16)
+            self.W_o = nn.Linear(width, width, dtype=bf16)
+        elif args.use_lora:
+            lora_options = dict(r=args.lora_r, lora_alpha=args.lora_alpha, dtype=bf16)
+            self.qkv_weight = lora.Linear(width, 3 * width, bias=False, **lora_options)
+            self.W_o = lora.Linear(width, width, **lora_options)
+        else:
+            self.qkv_weight = nn.Linear(width, 3 * width, bias=False, dtype=bf16)
+            self.W_o = nn.Linear(width, width, dtype=bf16)
+    def forward(self, q: Tensor, kv: Tensor = None):
+        """Project the 2-D inputs and split the result into heads.
+        Returns a ``(total, 3, heads, head_dim)`` tensor in self-attention
+        mode, or the pair ``(q, kv)`` shaped ``(total_q, heads, head_dim)``
+        and ``(total_kv, 2, heads, head_dim)`` in cross-attention mode.
+        """
+        heads = self.num_heads
+        if self.use_cross_attention:
+            rows_q, width_q = q.size()
+            rows_kv, width_kv = kv.size()
+            q_heads = self.q_weight(q).view(rows_q, heads, width_q // heads)
+            kv_heads = self.kv_weight(kv).view(rows_kv, 2, heads, width_kv // heads)
+            return q_heads, kv_heads
+        rows, width = q.size()
+        return self.qkv_weight(q).view(rows, 3, heads, width // heads)
+    def comp(self, o: Tensor):
+        """Apply the output projection ``W_o`` to the merged attention output."""
+        return self.W_o(o)
+class FlashSelfAttentionM(nn.Module):
+    """Self-attention on the FlashAttention-2 kernels with an optional KV cache.
+    Position information comes from ALiBi slopes or, when ``args.use_rope``
+    is set, from rotary embeddings. ``forward`` has three paths: packed
+    variable-length input (training / prompt pre-fill), one-token-per-row
+    cached decoding, and a plain single-sequence path.
+    """
+    def __init__(self, args: MORTMArgs, progress=None):
+        """Create the projections and the positional mechanism.
+        Args:
+            args: Model configuration (``d_model``, ``num_heads``, ``dropout``,
+                ``use_rope``, ``position_length`` and what ``QKVLinear`` reads).
+            progress: Optional object whose ``get_device()`` gives the device
+                for the ALiBi slopes / rotary embedding.
+        """
+        super(FlashSelfAttentionM, self).__init__()
+        self.batch_first = True
+        self._qkv_same_embed_dim = True
+        self.in_proj_bias = None
+        self.args = args
+        self.embed_dim = args.d_model
+        self.qkv_block = QKVLinear(args)
+        self.drop = args.dropout
+        self.kv_cache: Optional[Tuple[Tensor, Tensor]] = None
+        self.cache_seqlens: Tensor = None
+        # ``progress`` is optional, so resolve the device once for both branches.
+        device = progress.get_device() if progress else None
+        # Always define the attribute: the cached decoding path reads it even
+        # when RoPE is selected.
+        self.alibi_slopes = None
+        if not self.args.use_rope:
+            print("FlashAttention2のALiBiを使用します。")
+            self.alibi_slopes = torch.tensor(get_alibi_slopes(args.num_heads), dtype=torch.float32, device=device)
+        else:
+            print("FlashAttention2のRoPEを使用します。")
+            head_dim = args.d_model // args.num_heads
+            self.rotary_emb = RotaryEmbedding(dim=head_dim, base=10000.0, interleaved=False, device=device)
+    def _init_kv_cache(self, batch_size, device, dtype):
+        """Allocate the cache for ``batch_size`` rows on the first cached call."""
+        max_seq_len = self.args.position_length + 100  # maximum length plus head-room
+        head_dim = self.args.d_model // self.args.num_heads
+        shape = (batch_size, max_seq_len, self.args.num_heads, head_dim)
+        # torch.empty only reserves memory; zero-filling is not needed because
+        # cache_seqlens marks the valid prefix.
+        self.kv_cache = (
+            torch.empty(shape, device=device, dtype=dtype),
+            torch.empty(shape, device=device, dtype=dtype)
+        )
+        self.cache_seqlens = torch.zeros(batch_size, device=device, dtype=torch.int32)
+    def forward(self, x: Tensor, is_causal=False, cu_seqlens=None, max_seqlen=None,
+                batch_size=None, indices=None, is_save_cache=False):
+        """Apply self-attention and the output projection.
+        Args:
+            x: 2-D input of shape ``(total_tokens, d_model)``; float32 input
+                is cast to bfloat16.
+            is_causal: Whether to use a causal mask (the cached decoding path
+                is always causal).
+            cu_seqlens: Cumulative sequence lengths of the packed input, or
+                ``None`` for the decoding / single-sequence paths.
+            max_seqlen: Longest sequence in the packed input.
+            batch_size: Number of sequences; needed when the cache is filled.
+            indices: Not used by this method.
+            is_save_cache: Fill the KV cache (packed path) or decode one token
+                per row against it (``cu_seqlens is None``).
+        """
+        if x.dtype == torch.float32:
+            x = x.to(torch.bfloat16)
+        # The flash-attn functions apply dropout whenever dropout_p > 0; they
+        # know nothing about train()/eval(), so disable it outside training.
+        dropout_p = self.drop if self.training else 0.0
+        # --- Phase 1: training, or prompt processing at inference time ---
+        if cu_seqlens is not None:
+            # (Re)allocate the cache when the prompt is processed.
+            if is_save_cache and (self.kv_cache is None or self.kv_cache[0].shape[0] != batch_size):
+                self._init_kv_cache(batch_size, x.device, x.dtype)
+            qkv: Tensor = self.qkv_block(q=x)
+            # Positional mechanism and attention.
+            if not self.args.use_rope:
+                out = flash_attn_varlen_qkvpacked_func(qkv, dropout_p=dropout_p, causal=is_causal,
+                                                       cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
+                                                       alibi_slopes=self.alibi_slopes)
+            else:
+                q, k, v = qkv.unbind(1)
+                self.rotary_emb._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
+                q = apply_rotary_emb(q, self.rotary_emb._cos_cached, self.rotary_emb._sin_cached, interleaved=False, cu_seqlens=cu_seqlens)
+                k = apply_rotary_emb(k, self.rotary_emb._cos_cached, self.rotary_emb._sin_cached, interleaved=False, cu_seqlens=cu_seqlens)
+                qkv_rotated = torch.stack([q, k, v], dim=1)
+                out = flash_attn_varlen_qkvpacked_func(qkv_rotated, dropout_p=dropout_p, causal=is_causal,
+                                                       cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
+            # When requested, copy this pass's K/V into the pre-allocated cache.
+            if is_save_cache:
+                with torch.no_grad():
+                    # NOTE(review): K/V are taken from the un-rotated ``qkv``;
+                    # with RoPE it may be preferable to cache the rotated keys.
+                    _, k_unpad, v_unpad = qkv.unbind(dim=1)
+                    seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).to(torch.int32)
+                    # Write each sequence's K/V to the front of its cache row.
+                    for i in range(batch_size):
+                        start, end = cu_seqlens[i], cu_seqlens[i+1]
+                        seq_len = end - start
+                        self.kv_cache[0][i, :seq_len] = k_unpad[start:end]
+                        self.kv_cache[1][i, :seq_len] = v_unpad[start:end]
+                    self.cache_seqlens = seqlens
+        # --- Phase 2: inference, one token at a time ---
+        else:
+            if is_save_cache:
+                # On this path x is expected to be (batch_size, d_model).
+                qkv: Tensor = self.qkv_block(q=x)
+                # (batch_size, 3, num_heads, head_dim) -> (3, batch_size, num_heads, head_dim)
+                qkv = qkv.permute(1, 0, 2, 3)
+                q, k, v = qkv[0], qkv[1], qkv[2]
+                # (batch_size, num_heads, head_dim) -> (batch_size, 1, num_heads, head_dim),
+                # the layout flash_attn_with_kvcache expects.
+                q, k, v = q.unsqueeze(1), k.unsqueeze(1), v.unsqueeze(1)
+                # Arguments for RoPE / ALiBi.
+                # NOTE(review): IS_NOT_LINUX is currently always True, so the
+                # rotary arguments below are never passed.
+                rotary_kwargs = {}
+                if not IS_NOT_LINUX:
+                    self.rotary_emb._update_cos_sin_cache(self.args.position_length, device=x.device, dtype=x.dtype)
+                    rotary_kwargs = {
+                        "rotary_cos": self.rotary_emb.cos_cached,
+                        "rotary_sin": self.rotary_emb.sin_cached,
+                        "rotary_interleaved": False
+                    }
+                # flash_attn_with_kvcache appends k/v to the cache and attends in one call.
+                out = flash_attn_with_kvcache(
+                    q,
+                    k_cache=self.kv_cache[0],
+                    v_cache=self.kv_cache[1],
+                    k=k,
+                    v=v,
+                    cache_seqlens=self.cache_seqlens,
+                    alibi_slopes=self.alibi_slopes if IS_NOT_LINUX else None,
+                    causal=True,
+                    **rotary_kwargs
+                )
+                # Advance the valid length of the cache.
+                self.cache_seqlens += 1
+                # (batch_size, 1, heads, head_dim) -> (batch_size, heads, head_dim)
+                out = out.squeeze(1)
+            else:
+                qkv = self.qkv_block(q=x)
+                qkv = qkv.unsqueeze(0)
+                out = flash_attn_qkvpacked_func(qkv=qkv, dropout_p=dropout_p, causal=is_causal)
+                out = rearrange(out, "b s h d -> (b s) (h d)")
+                return self.qkv_block.comp(out)
+        # Final output projection.
+        out = rearrange(out, "total h d -> total (h d)")
+        return self.qkv_block.comp(out)
+    def compute_cache_seqlens(self, k: torch.Tensor) -> torch.Tensor:
+        """
+        Count, per batch row, the time steps of ``k`` that are not all-zero.
+        k: Tensor of shape [batch_size, max_seq_len, num_heads, head_dim]
+        Returns:
+            cache_seqlens: Tensor of shape [batch_size]
+        """
+        # A time step counts when the absolute values over heads and head_dim do not sum to zero.
+        is_nonzero = k.abs().sum(dim=(-1, -2)) != 0  # shape: [batch_size, max_seq_len]
+        # Summing the boolean mask gives the number of non-zero steps per row.
+        seqlens = is_nonzero.sum(dim=1)  # shape: [batch_size]
+        return seqlens
+class FlashCrossAttentionM(nn.Module):
+    """Non-causal cross-attention on the FlashAttention-2 kernels."""
+    def __init__(self, args: MORTMArgs, progress=None):
+        """Create the Q / KV / output projections; ``progress`` is not used here."""
+        super(FlashCrossAttentionM, self).__init__()
+        self.batch_first = True
+        self._qkv_same_embed_dim = True
+        self.in_proj_bias = None
+        self.args = args
+        self.embed_dim = args.d_model
+        self.qkv_block = QKVLinear(args, use_cross_attention=True)
+        self.drop = args.dropout
+    def forward(self, x: Tensor, encoder_x: Tensor, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=None,
+                max_seqlen_k=None):
+        """Attend from ``x`` (queries) to ``encoder_x`` (keys/values).
+        Args:
+            x: 2-D query input ``(total_q, d_model)``; float32 is cast to bfloat16.
+            encoder_x: 2-D key/value input ``(total_kv, d_model)``; float32 is
+                cast to bfloat16.
+            cu_seqlens_q, cu_seqlens_k: Cumulative sequence lengths of the
+                packed inputs; when ``cu_seqlens_q`` is ``None`` both inputs
+                are treated as one sequence each.
+            max_seqlen_q, max_seqlen_k: Longest sequence on each side.
+        """
+        if x.dtype == torch.float32:
+            x = x.to(torch.bfloat16)
+        if encoder_x.dtype == torch.float32:
+            encoder_x = encoder_x.to(torch.bfloat16)
+        # The flash-attn functions apply dropout whenever dropout_p > 0; they
+        # know nothing about train()/eval(), so disable it outside training.
+        dropout_p = self.drop if self.training else 0.0
+        q, kv = self.qkv_block(q=x, kv=encoder_x)
+        if cu_seqlens_q is not None:
+            # Packed variable-length batch (training or prompt processing).
+            out = flash_attn_varlen_kvpacked_func(
+                q=q,
+                kv=kv,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlen_k,
+                causal=False,
+                dropout_p=dropout_p
+            )
+            out = rearrange(out, "total h d -> total (h d)")
+        else:
+            # Single sequence: add a batch dimension of one for the kernel.
+            out = flash_attn_kvpacked_func(q=q.unsqueeze(0), kv=kv.unsqueeze(0), dropout_p=dropout_p, causal=False)
+            out = rearrange(out, "b s h d -> (b s) (h d)")
+        # Final output projection.
+        return self.qkv_block.comp(out)