univi-0.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
univi/models/mlp.py ADDED
@@ -0,0 +1,36 @@
+ # univi/models/mlp.py
+
+ from __future__ import annotations
+ from typing import List, Optional
+ from torch import nn
+
+
+ def build_mlp(
+     in_dim: int,
+     hidden_dims: List[int],
+     out_dim: int,
+     activation: Optional[nn.Module] = None,
+     dropout: float = 0.0,
+     batchnorm: bool = True,
+ ) -> nn.Sequential:
+     """
+     Generic MLP builder: [Linear -> BN -> Act -> Dropout]* + final Linear.
+     (Python gotcha: don't use nn.ReLU() as a default arg; it becomes a shared instance.)
+     """
+     if activation is None:
+         activation = nn.ReLU()
+
+     layers = []
+     last_dim = in_dim
+     for h in hidden_dims:
+         layers.append(nn.Linear(last_dim, h))
+         if batchnorm:
+             layers.append(nn.BatchNorm1d(h))
+         layers.append(activation.__class__() if isinstance(activation, nn.Module) else nn.ReLU())
+         if dropout and dropout > 0:
+             layers.append(nn.Dropout(float(dropout)))
+         last_dim = h
+
+     layers.append(nn.Linear(last_dim, out_dim))
+     return nn.Sequential(*layers)
+
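A minimal usage sketch for build_mlp (the sizes below are illustrative, not taken from the package):

# Sketch: a small encoder head built with build_mlp; dimensions are made up for illustration.
import torch
from univi.models.mlp import build_mlp

mlp = build_mlp(in_dim=2000, hidden_dims=[512, 128], out_dim=32, dropout=0.1, batchnorm=True)
x = torch.randn(8, 2000)   # (batch, features)
z = mlp(x)                 # (8, 32)

Because the builder instantiates a fresh activation per hidden layer (activation.__class__()), passing a configured module such as nn.LeakyReLU(0.2) keeps the layers independent but drops its constructor arguments.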
univi/models/tokenizers.py ADDED
@@ -0,0 +1,376 @@
+ # univi/models/tokenizers.py
+ from __future__ import annotations
+
+ from typing import Optional, Tuple, Sequence, Literal, Dict, Any
+
+ import torch
+ from torch import nn
+
+ from ..config import TokenizerConfig
+
+
+ class Tokenizer(nn.Module):
+     """
+     Base tokenizer interface.
+
+     Backwards-compatible:
+         forward(x) -> (tokens, key_padding_mask)
+
+     Extras:
+         - self.last_meta is updated on each forward()
+         - forward_with_meta(x) -> (tokens, key_padding_mask, meta)
+
+     Conventions
+     -----------
+     - tokens: (B, T, D_in)
+     - key_padding_mask: Optional[(B, T)] where True means "PAD / ignore"
+     - meta: dict (optional), e.g. {"token_pos": (B, T) basepair positions}
+     """
+     def __init__(self):
+         super().__init__()
+         self.last_meta: Dict[str, Any] = {}
+
+     @property
+     def d_in(self) -> int:
+         raise NotImplementedError
+
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         raise NotImplementedError
+
+     def forward_with_meta(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor], Dict[str, Any]]:
+         tokens, mask = self.forward(x)
+         return tokens, mask, dict(self.last_meta)
+
+
+ class TopKScalarTokenizer(Tokenizer):
+     """(B,F) -> (B,K,1) using top-k by absolute value per cell."""
+     def __init__(self, n_tokens: int, add_cls_token: bool = False):
+         super().__init__()
+         self.n_tokens = int(n_tokens)
+         self.add_cls_token = bool(add_cls_token)
+
+     @property
+     def d_in(self) -> int:
+         return 1
+
+     def forward(self, x: torch.Tensor):
+         B, F = x.shape
+         K = min(self.n_tokens, F)
+
+         _, idx = torch.topk(x.abs(), k=K, dim=1, largest=True, sorted=True)
+         vals = torch.gather(x, 1, idx)  # (B,K)
+         tokens = vals.unsqueeze(-1)  # (B,K,1)
+
+         key_padding_mask = None
+         self.last_meta = {"feature_idx": idx}
+
+         if self.add_cls_token:
+             cls = torch.zeros((B, 1, 1), device=x.device, dtype=x.dtype)
+             tokens = torch.cat([cls, tokens], dim=1)
+
+         return tokens, key_padding_mask
+
+
+ class TopKChannelsTokenizer(Tokenizer):
+     """
+     (B,F) -> (B,K,C) multi-dim tokens, where channels can include:
+     - value: raw x_i
+     - rank: rank within selected K (0..1)
+     - dropout: 1 if x_i == 0 else 0
+     """
+     def __init__(
+         self,
+         n_tokens: int,
+         channels: Sequence[Literal["value", "rank", "dropout"]] = ("value", "rank", "dropout"),
+         add_cls_token: bool = False,
+     ):
+         super().__init__()
+         self.n_tokens = int(n_tokens)
+         self.channels = tuple(channels)
+         self.add_cls_token = bool(add_cls_token)
+
+         if len(self.channels) == 0:
+             raise ValueError("TopKChannelsTokenizer requires at least one channel.")
+         for c in self.channels:
+             if c not in ("value", "rank", "dropout"):
+                 raise ValueError(f"Unknown channel {c!r}. Allowed: value, rank, dropout")
+
+     @property
+     def d_in(self) -> int:
+         return len(self.channels)
+
+     def forward(self, x: torch.Tensor):
+         B, F = x.shape
+         K = min(self.n_tokens, F)
+
+         _, idx = torch.topk(x.abs(), k=K, dim=1, largest=True, sorted=True)
+         vals = torch.gather(x, 1, idx)  # (B,K)
+
+         chans = []
+         for c in self.channels:
+             if c == "value":
+                 chans.append(vals)
+             elif c == "dropout":
+                 chans.append((vals == 0).to(vals.dtype))
+             elif c == "rank":
+                 r = torch.arange(K, device=x.device, dtype=vals.dtype).view(1, K).expand(B, K)
+                 chans.append(r / max(K - 1, 1))
+             else:
+                 raise RuntimeError("unreachable")
+
+         tokens = torch.stack(chans, dim=-1)  # (B,K,C)
+         key_padding_mask = None
+         self.last_meta = {"feature_idx": idx}
+
+         if self.add_cls_token:
+             cls = torch.zeros((B, 1, tokens.size(-1)), device=x.device, dtype=x.dtype)
+             tokens = torch.cat([cls, tokens], dim=1)
+
+         return tokens, key_padding_mask
+
+
+ class PatchTokenizer(Tokenizer):
+     """
+     Split features into patches:
+
+         (B,F) -> (B,T,patch_size) where T = ceil(F/patch_size)
+
+     Optionally project:
+         patch_vec (patch_size) -> patch_proj_dim
+     """
+     def __init__(
+         self,
+         patch_size: int,
+         add_cls_token: bool = False,
+         patch_proj_dim: Optional[int] = None,
+     ):
+         super().__init__()
+         self.patch_size = int(patch_size)
+         self.add_cls_token = bool(add_cls_token)
+         self.patch_proj_dim = int(patch_proj_dim) if patch_proj_dim is not None else None
+
+         if self.patch_size <= 0:
+             raise ValueError("patch_size must be > 0")
+
+         if self.patch_proj_dim is not None:
+             self.proj = nn.Sequential(
+                 nn.LayerNorm(self.patch_size),
+                 nn.Linear(self.patch_size, self.patch_proj_dim),
+                 nn.GELU(),
+                 nn.Linear(self.patch_proj_dim, self.patch_proj_dim),
+             )
+         else:
+             self.proj = None
+
+     @property
+     def d_in(self) -> int:
+         return self.patch_proj_dim if self.patch_proj_dim is not None else self.patch_size
+
+     def forward(self, x: torch.Tensor):
+         B, F = x.shape
+         P = self.patch_size
+         T = (F + P - 1) // P
+         pad = T * P - F
+
+         if pad > 0:
+             x_pad = torch.cat([x, torch.zeros((B, pad), device=x.device, dtype=x.dtype)], dim=1)
+         else:
+             x_pad = x
+
+         patches = x_pad.view(B, T, P)  # (B,T,P)
+         key_padding_mask = None
+
+         if self.proj is not None:
+             patches = self.proj(patches)  # (B,T,patch_proj_dim)
+
+         if self.add_cls_token:
+             cls = torch.zeros((B, 1, patches.size(-1)), device=x.device, dtype=x.dtype)
+             patches = torch.cat([cls, patches], dim=1)
+
+         self.last_meta = {}
+         return patches, key_padding_mask
+
+
+ class TopKEmbeddedTokenizer(Tokenizer):
+     """
+     Top-k tokenizer with explicit feature identity embeddings:
+
+         token = Emb(feature_id) + MLP(channels(value/rank/dropout))
+
+     Optional ATAC coordinate embeddings:
+         token += Emb(chrom_id) + MLP(midpoint_bp / coord_scale)
+
+     Meta
+     ----
+     self.last_meta will include:
+     - "feature_idx": (B,K) long
+     - "token_pos": (B,K) float basepairs (if use_coords=True)
+     """
+     def __init__(
+         self,
+         *,
+         n_tokens: int,
+         n_features: int,
+         d_model: int,
+         channels: Sequence[Literal["value", "rank", "dropout"]] = ("value", "rank", "dropout"),
+         add_cls_token: bool = False,
+         value_mlp_hidden: int = 256,
+         # coordinate extras
+         use_coords: bool = False,
+         chrom_vocab_size: int = 0,
+         feature_info: Optional[Dict[str, Any]] = None,
+         coord_scale: float = 1e6,
+     ):
+         super().__init__()
+         self.n_tokens = int(n_tokens)
+         self.n_features = int(n_features)
+         self._d_model = int(d_model)
+         self.channels = tuple(channels)
+         self.add_cls_token = bool(add_cls_token)
+         self.use_coords = bool(use_coords)
+         self.chrom_vocab_size = int(chrom_vocab_size)
+         self.coord_scale = float(coord_scale)
+
+         if len(self.channels) == 0:
+             raise ValueError("TopKEmbeddedTokenizer requires at least one channel.")
+         for c in self.channels:
+             if c not in ("value", "rank", "dropout"):
+                 raise ValueError(f"Unknown channel {c!r}. Allowed: value, rank, dropout")
+
+         self.id_embed = nn.Embedding(self.n_features, self._d_model)
+
+         c_in = len(self.channels)
+         self.val_proj = nn.Sequential(
+             nn.LayerNorm(c_in),
+             nn.Linear(c_in, int(value_mlp_hidden)),
+             nn.GELU(),
+             nn.Linear(int(value_mlp_hidden), self._d_model),
+         )
+
+         # Feature metadata buffers for coords (None placeholders, filled below when use_coords=True)
+         self.register_buffer("feature_chrom", None, persistent=False)
+         self.register_buffer("feature_start", None, persistent=False)
+         self.register_buffer("feature_end", None, persistent=False)
+
+         if self.use_coords:
+             if self.chrom_vocab_size <= 0:
+                 raise ValueError("chrom_vocab_size must be > 0 when use_coords=True.")
+             if feature_info is None:
+                 raise ValueError("feature_info must be provided when use_coords=True (keys: chrom,start,end).")
+             for k in ("chrom", "start", "end"):
+                 if k not in feature_info:
+                     raise ValueError(f"feature_info missing key {k!r} (required for coords).")
+
+             chrom = torch.as_tensor(feature_info["chrom"], dtype=torch.long)
+             start = torch.as_tensor(feature_info["start"], dtype=torch.float32)
+             end = torch.as_tensor(feature_info["end"], dtype=torch.float32)
+
+             if chrom.numel() != self.n_features or start.numel() != self.n_features or end.numel() != self.n_features:
+                 raise ValueError(
+                     f"feature_info arrays must have length n_features={self.n_features}; "
+                     f"got chrom={chrom.numel()}, start={start.numel()}, end={end.numel()}."
+                 )
+
+             # register buffers so they follow .to(device)
+             self.register_buffer("feature_chrom", chrom, persistent=False)
+             self.register_buffer("feature_start", start, persistent=False)
+             self.register_buffer("feature_end", end, persistent=False)
+
+             self.chrom_embed = nn.Embedding(self.chrom_vocab_size, self._d_model)
+             self.coord_mlp = nn.Sequential(
+                 nn.LayerNorm(1),
+                 nn.Linear(1, int(value_mlp_hidden)),
+                 nn.GELU(),
+                 nn.Linear(int(value_mlp_hidden), self._d_model),
+             )
+
+     @property
+     def d_in(self) -> int:
+         return self._d_model
+
+     def forward(self, x: torch.Tensor):
+         B, F = x.shape
+         if F != self.n_features:
+             raise ValueError(f"Expected F={self.n_features}, got {F}. Did you set TokenizerConfig.n_features correctly?")
+
+         K = min(self.n_tokens, F)
+
+         _, idx = torch.topk(x.abs(), k=K, dim=1, largest=True, sorted=True)  # (B,K)
+         vals = torch.gather(x, 1, idx)  # (B,K)
+
+         # channels -> (B,K,C)
+         chans = []
+         for c in self.channels:
+             if c == "value":
+                 chans.append(vals)
+             elif c == "dropout":
+                 chans.append((vals == 0).to(vals.dtype))
+             elif c == "rank":
+                 r = torch.arange(K, device=x.device, dtype=vals.dtype).view(1, K).expand(B, K)
+                 chans.append(r / max(K - 1, 1))
+             else:
+                 raise RuntimeError("unreachable")
+
+         ch = torch.stack(chans, dim=-1)  # (B,K,C)
+
+         id_emb = self.id_embed(idx)  # (B,K,D)
+         val_emb = self.val_proj(ch)  # (B,K,D)
+         tokens = id_emb + val_emb
+
+         meta: Dict[str, Any] = {"feature_idx": idx}
+
+         if self.use_coords:
+             # buffers exist because we register_buffer above
+             chrom = self.feature_chrom[idx]  # (B,K)
+             mid = 0.5 * (self.feature_start[idx] + self.feature_end[idx])  # (B,K)
+             mid_scaled = (mid / self.coord_scale).unsqueeze(-1)  # (B,K,1)
+
+             tokens = tokens + self.chrom_embed(chrom) + self.coord_mlp(mid_scaled)
+             meta["token_pos"] = mid  # basepairs
+
+         if self.add_cls_token:
+             cls = torch.zeros((B, 1, tokens.size(-1)), device=x.device, dtype=x.dtype)
+             tokens = torch.cat([cls, tokens], dim=1)
+             # keep meta aligned if present
+             if "token_pos" in meta:
+                 cls_pos = torch.zeros((B, 1), device=x.device, dtype=meta["token_pos"].dtype)
+                 meta["token_pos"] = torch.cat([cls_pos, meta["token_pos"]], dim=1)
+
+         self.last_meta = meta
+         return tokens, None
+
+
+ def build_tokenizer(cfg: TokenizerConfig) -> Tokenizer:
+     mode = (cfg.mode or "").lower().strip()
+
+     if mode == "topk_scalar":
+         return TopKScalarTokenizer(n_tokens=cfg.n_tokens, add_cls_token=cfg.add_cls_token)
+
+     if mode == "topk_channels":
+         return TopKChannelsTokenizer(n_tokens=cfg.n_tokens, channels=cfg.channels, add_cls_token=cfg.add_cls_token)
+
+     if mode == "patch":
+         return PatchTokenizer(
+             patch_size=cfg.patch_size,
+             add_cls_token=cfg.add_cls_token,
+             patch_proj_dim=cfg.patch_proj_dim,
+         )
+
+     if mode == "topk_embed":
+         if cfg.n_features is None or cfg.d_model is None:
+             raise ValueError("TokenizerConfig.mode='topk_embed' requires n_features and d_model to be set.")
+         return TopKEmbeddedTokenizer(
+             n_tokens=cfg.n_tokens,
+             n_features=int(cfg.n_features),
+             d_model=int(cfg.d_model),
+             channels=cfg.channels,
+             add_cls_token=cfg.add_cls_token,
+             value_mlp_hidden=int(cfg.value_mlp_hidden),
+             use_coords=bool(cfg.use_coords),
+             chrom_vocab_size=int(cfg.chrom_vocab_size),
+             feature_info=cfg.feature_info,
+             coord_scale=float(cfg.coord_scale),
+         )
+
+     raise ValueError(f"Unknown tokenizer mode {cfg.mode!r}")
+
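A minimal sketch of calling one of the tokenizers directly (batch and feature counts are illustrative; building through TokenizerConfig/build_tokenizer yields the same shapes):

# Sketch: tokenize a (B, F) matrix with TopKChannelsTokenizer; sizes are illustrative.
import torch
from univi.models.tokenizers import TopKChannelsTokenizer

tok = TopKChannelsTokenizer(n_tokens=256, channels=("value", "rank", "dropout"))
x = torch.randn(4, 2000)                                   # (B, F) per-cell feature vector
tokens, key_padding_mask, meta = tok.forward_with_meta(x)
# tokens: (4, 256, 3) -- K tokens, one channel per entry in `channels`
# key_padding_mask: None (all K tokens are real, nothing is padded)
# meta["feature_idx"]: (4, 256) indices of the selected features per cell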
univi/models/transformer.py ADDED
@@ -0,0 +1,249 @@
+ # univi/models/transformer.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Optional, Literal, List, Tuple, Union
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+
+ @dataclass
+ class TransformerConfig:
+     d_model: int
+     num_heads: int
+     num_layers: int
+     dim_feedforward: int = 4096
+     dropout: float = 0.1
+     attn_dropout: float = 0.1
+     activation: Literal["relu", "gelu"] = "gelu"
+     pooling: Literal["cls", "mean"] = "mean"
+     max_tokens: Optional[int] = None
+
+     # Optional: binned relative-position attention bias (e.g., genomic distance)
+     use_relpos_bias: bool = False
+     relpos_num_bins: int = 32
+     relpos_max_dist: float = 1e6  # basepairs
+
+
+ def _act(name: str):
+     name = str(name).lower().strip()
+     if name == "relu":
+         return F.relu
+     if name == "gelu":
+         return F.gelu
+     raise ValueError(f"Unknown activation: {name!r}")
+
+
+ class GenomicRelPosBias(nn.Module):
+     """
+     Simple distance-binned relative attention bias.
+
+     Given token positions pos (B,T) in basepairs, returns an additive bias
+     (B, H, T, T). Intended for ATAC peak midpoints.
+
+     Notes
+     -----
+     - Uses log1p compression to allocate more bins to shorter distances.
+     - Bias table is learned: (H, num_bins).
+     """
+     def __init__(self, num_heads: int, num_bins: int = 32, max_dist: float = 1e6):
+         super().__init__()
+         self.num_heads = int(num_heads)
+         self.num_bins = int(num_bins)
+         self.max_dist = float(max_dist)
+         self.bias = nn.Parameter(torch.zeros(self.num_heads, self.num_bins))
+
+     def _bin(self, dist: torch.Tensor) -> torch.Tensor:
+         # dist: (B,T,T) >= 0
+         d = dist.clamp(min=0.0, max=self.max_dist)
+         d = torch.log1p(d)
+         dmax = torch.log1p(torch.tensor(self.max_dist, device=d.device, dtype=d.dtype))
+         b = (d / dmax) * (self.num_bins - 1)
+         return b.to(torch.long)
+
+     def forward(self, pos: torch.Tensor) -> torch.Tensor:
+         # pos: (B,T)
+         dist = (pos[:, :, None] - pos[:, None, :]).abs()  # (B,T,T)
+         bins = self._bin(dist)  # (B,T,T)
+         # bias[:, bins] -> (H,B,T,T) then permute -> (B,H,T,T)
+         out = self.bias[:, bins]
+         return out.permute(1, 0, 2, 3).contiguous()
+
+
+ class TransformerBlock(nn.Module):
+     """
+     Single post-norm block (residual, then LayerNorm):
+         x -> MHA -> residual -> LN
+           -> FFN -> residual -> LN
+
+     Supports optional additive attention bias (e.g., relative position).
+     """
+     def __init__(self, cfg: TransformerConfig):
+         super().__init__()
+         self.cfg = cfg
+         d_model = int(cfg.d_model)
+         self.num_heads = int(cfg.num_heads)
+
+         self.attn = nn.MultiheadAttention(
+             embed_dim=d_model,
+             num_heads=self.num_heads,
+             dropout=float(cfg.attn_dropout),
+             batch_first=True,
+         )
+         self.attn_drop = nn.Dropout(float(cfg.dropout))
+         self.ln1 = nn.LayerNorm(d_model)
+
+         self.ff = nn.Sequential(
+             nn.Linear(d_model, int(cfg.dim_feedforward)),
+             nn.GELU() if str(cfg.activation).lower().strip() == "gelu" else nn.ReLU(),
+             nn.Dropout(float(cfg.dropout)),
+             nn.Linear(int(cfg.dim_feedforward), d_model),
+         )
+         self.ff_drop = nn.Dropout(float(cfg.dropout))
+         self.ln2 = nn.LayerNorm(d_model)
+
+         self.relpos: Optional[GenomicRelPosBias] = None
+         if bool(getattr(cfg, "use_relpos_bias", False)):
+             self.relpos = GenomicRelPosBias(
+                 num_heads=self.num_heads,
+                 num_bins=int(getattr(cfg, "relpos_num_bins", 32)),
+                 max_dist=float(getattr(cfg, "relpos_max_dist", 1e6)),
+             )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         *,
+         key_padding_mask: Optional[torch.Tensor] = None,
+         token_pos: Optional[torch.Tensor] = None,  # (B,T) basepairs or other coordinates
+         return_attn: bool = False,
+         attn_average_heads: bool = True,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         need_weights = bool(return_attn)
+
+         attn_mask = None
+         if self.relpos is not None and token_pos is not None:
+             # (B,H,T,T) -> (B*H,T,T) for nn.MultiheadAttention
+             bias = self.relpos(token_pos).to(dtype=x.dtype)
+             B, H, T, _ = bias.shape
+             attn_mask = bias.view(B * H, T, T)
+
+         attn_out, attn_w = self.attn(
+             x, x, x,
+             key_padding_mask=key_padding_mask,  # (B, T) True = PAD
+             attn_mask=attn_mask,  # None or (B*H,T,T)
+             need_weights=need_weights,
+             average_attn_weights=bool(attn_average_heads),
+         )
+
+         x = self.ln1(x + self.attn_drop(attn_out))
+         ff_out = self.ff(x)
+         x = self.ln2(x + self.ff_drop(ff_out))
+
+         if return_attn:
+             if attn_w is None:
+                 raise RuntimeError("Expected attn_w when return_attn=True, got None.")
+             return x, attn_w
+         return x
+
+
+ class TransformerEncoder(nn.Module):
+     """
+     Generic encoder:
+         tokens (B,T,D_in) -> proj -> blocks -> pool -> out_proj -> (B,d_out)
+
+     Optional:
+     - learned absolute positional embeddings (use_positional_encoding=True)
+     - relative attention bias via token_pos (if cfg.use_relpos_bias=True)
+     """
+     def __init__(
+         self,
+         *,
+         cfg: TransformerConfig,
+         d_in: int,
+         d_out: int,
+         use_positional_encoding: bool = True,
+     ):
+         super().__init__()
+         self.cfg = cfg
+         self.use_positional_encoding = bool(use_positional_encoding)
+
+         d_model = int(cfg.d_model)
+         self.input_proj = nn.Identity() if int(d_in) == d_model else nn.Linear(int(d_in), d_model, bias=True)
+         self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(int(cfg.num_layers))])
+         self.dropout = nn.Dropout(float(cfg.dropout))
+         self.out_proj = nn.Linear(d_model, int(d_out), bias=True)
+
+         self.pooling = str(cfg.pooling).lower().strip()
+         if self.pooling not in ("cls", "mean"):
+             raise ValueError(f"Unknown pooling={cfg.pooling!r}")
+
+         # learned positional embeddings (optional)
+         self.pos_emb: Optional[nn.Parameter] = None
+         if self.use_positional_encoding:
+             if cfg.max_tokens is None:
+                 raise ValueError("use_positional_encoding=True requires cfg.max_tokens to be set.")
+             max_tokens = int(cfg.max_tokens)
+             self.pos_emb = nn.Parameter(torch.zeros(1, max_tokens, d_model))
+             nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)
+
+     def _pool(self, x: torch.Tensor, *, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor:
+         if self.pooling == "cls":
+             return x[:, 0, :]
+
+         if key_padding_mask is None:
+             return x.mean(dim=1)
+
+         keep = (~key_padding_mask).to(dtype=x.dtype)  # (B, T)
+         denom = keep.sum(dim=1, keepdim=True).clamp_min(1.0)
+         return (x * keep.unsqueeze(-1)).sum(dim=1) / denom
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         *,
+         key_padding_mask: Optional[torch.Tensor] = None,
+         token_pos: Optional[torch.Tensor] = None,  # (B,T) for relpos bias (optional)
+         return_attn: bool = False,
+         attn_average_heads: bool = True,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+         x = self.input_proj(tokens)
+
+         if self.use_positional_encoding:
+             assert self.pos_emb is not None
+             T = x.shape[1]
+             if T > self.pos_emb.shape[1]:
+                 raise ValueError(f"Sequence length T={T} exceeds max_tokens={self.pos_emb.shape[1]}.")
+             x = x + self.pos_emb[:, :T, :]
+
+         x = self.dropout(x)
+
+         attn_all: List[torch.Tensor] = []
+         for blk in self.blocks:
+             if return_attn:
+                 x, aw = blk(
+                     x,
+                     key_padding_mask=key_padding_mask,
+                     token_pos=token_pos,
+                     return_attn=True,
+                     attn_average_heads=attn_average_heads,
+                 )
+                 attn_all.append(aw)
+             else:
+                 x = blk(
+                     x,
+                     key_padding_mask=key_padding_mask,
+                     token_pos=token_pos,
+                     return_attn=False,
+                     attn_average_heads=attn_average_heads,
+                 )
+
+         pooled = self._pool(x, key_padding_mask=key_padding_mask)
+         out = self.out_proj(pooled)
+
+         if return_attn:
+             return out, attn_all
+         return out
+
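A minimal sketch wiring PatchTokenizer into TransformerEncoder; all hyperparameters below are illustrative:

# Sketch: PatchTokenizer -> TransformerEncoder; hyperparameters are made up for illustration.
import torch
from univi.models.tokenizers import PatchTokenizer
from univi.models.transformer import TransformerConfig, TransformerEncoder

tok = PatchTokenizer(patch_size=64, patch_proj_dim=128)
cfg = TransformerConfig(d_model=128, num_heads=4, num_layers=2,
                        dim_feedforward=256, max_tokens=512, pooling="mean")
enc = TransformerEncoder(cfg=cfg, d_in=tok.d_in, d_out=64, use_positional_encoding=True)

x = torch.randn(4, 2000)                              # (B, F)
tokens, key_padding_mask = tok(x)                     # (4, 32, 128); mask is None for patch tokens
z = enc(tokens, key_padding_mask=key_padding_mask)    # (4, 64) pooled embedding per cell

With cfg.use_relpos_bias=True and a coordinate-aware tokenizer, the basepair positions in meta["token_pos"] can be passed as token_pos= so each block adds its distance-binned attention bias.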