returnn 1.20240731.223820__tar.gz → 1.20240808.234227__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn was flagged as potentially problematic by the registry diff service.
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/PKG-INFO +1 -1
- returnn-1.20240808.234227/_setup_info_generated.py +2 -0
- returnn-1.20240808.234227/returnn/frontend/conversions/hf_llama.py +246 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/transformer.py +60 -15
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_backend.py +3 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug.py +9 -2
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/file_cache.py +85 -21
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/math.py +5 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/SOURCES.txt +1 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Util.py +8 -3
- returnn-1.20240808.234227/tests/test_rf_decoder_transformer.py +163 -0
- returnn-1.20240731.223820/_setup_info_generated.py +0 -2
- returnn-1.20240731.223820/returnn/frontend/conversions/hf_llama.py +0 -56
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.editorconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.gitmodules +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.kateconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CHANGELOG.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CODEOWNERS +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CONTRIBUTING.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/MANIFEST.in +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/README.rst +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-fwd.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rf.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-torch.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/pyproject.toml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/requirements.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__main__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__setup__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/config.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/map.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/postprocessing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/batch.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/forward_iface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/const.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/container.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/device.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/init.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/module.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/state.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/types.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/common.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/git.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/import_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/native_op.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/pretrain.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/control.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/compat.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/network.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/updater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/updater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/bpe.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/fsa.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/pprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/task_system.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/rnn.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/setup.cfg +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lint_common.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/pylint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/rf_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/spelling.dic +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Config.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Fsa.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_ResNet.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_demos.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_array.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_const.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_container.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_math.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_tensor.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_tools.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_util.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/torch_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/collect-words.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/compile_native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-forward.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-network-json.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-pickle.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/hdf_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
@@ -0,0 +1,246 @@
+"""
+Import the parameters from the HuggingFace Llama model (PyTorch).
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING, Union
+import returnn.frontend as rf
+from returnn.frontend.decoder.transformer import TransformerDecoder, TransformerDecoderLayer, FeedForwardGated
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import (
+        LlamaModel,
+        LlamaForCausalLM,
+        LlamaDecoderLayer,
+        LlamaMLP,
+        LlamaRMSNorm,
+        LlamaAttention,
+    )
+
+
+def import_params_hf_llama_to_rf_transformer_decoder(
+    model_hf: Union[LlamaModel, LlamaForCausalLM], model_rf: TransformerDecoder
+):
+    """
+    Import params from HF Llama model to RF :class:`TransformerDecoder`.
+    """
+    import torch
+    from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaDecoderLayer
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will easy the debugging.
+
+    lm_head = None
+    if isinstance(model_hf, LlamaForCausalLM):
+        lm_head = model_hf.lm_head
+        model_hf = model_hf.model
+    else:
+        # Exclude logits.
+        num_params_rf -= model_rf.logits.weight.num_elements()
+    assert isinstance(model_hf, LlamaModel)
+    assert model_hf.norm.weight.shape[0] == model_rf.model_dim.dimension
+
+    assert len(model_hf.layers) == len(model_rf.layers)
+    for i, (layer_hf, layer_rf) in enumerate(zip(model_hf.layers, model_rf.layers)):
+        assert isinstance(layer_hf, LlamaDecoderLayer)
+        assert isinstance(layer_rf, TransformerDecoderLayer)
+        import_params_hf_llama_decoder_layer_to_rf(layer_hf, layer_rf)
+
+    assert model_hf.embed_tokens.weight.shape == model_rf.input_embedding.weight.raw_tensor.shape
+    with torch.no_grad():
+        model_rf.input_embedding.weight.raw_tensor.copy_(model_hf.embed_tokens.weight)  # (vocab,hidden)
+
+    assert isinstance(model_rf.final_layer_norm, rf.RMSNorm)
+    import_params_hf_llama_rms_norm_to_rf(model_hf.norm, model_rf.final_layer_norm)
+
+    if lm_head is not None:
+        assert lm_head.bias is None and model_rf.logits.bias is None  # not implemented
+        # Torch Linear: (out,in), but RF has (in,out).
+        with torch.no_grad():
+            model_rf.logits.weight.raw_tensor.copy_(lm_head.weight.T)  # (hidden,vocab)
+
+    assert num_params_rf == num_params_hf, f"missmatch num params: RF {num_params_rf} != HF {num_params_hf}"
+
+
+def import_params_hf_llama_decoder_layer_to_rf(model_hf: LlamaDecoderLayer, model_rf: TransformerDecoderLayer):
+    """
+    Import the parameters from the HF Llama decoder layer.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will easy the debugging.
+
+    assert isinstance(model_rf.ff, FeedForwardGated), f"unexpected: {model_rf.ff}"
+    import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf.mlp, model_rf.ff)
+
+    assert isinstance(model_rf.self_att, rf.RotaryPosCausalSelfAttention), f"unexpected: {model_rf.self_att}"
+    import_params_hf_llama_att_to_rf_rotary_att(model_hf.self_attn, model_rf.self_att)
+
+    assert isinstance(model_rf.self_att_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.self_att_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.input_layernorm, model_rf.self_att_layer_norm)
+
+    assert isinstance(model_rf.ff_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.ff_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.post_attention_layernorm, model_rf.ff_layer_norm)
+
+    assert num_params_rf == num_params_hf
+
+
+def import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf: LlamaMLP, model_rf: FeedForwardGated):
+    """
+    Import the parameters from the HF Llama MLP module.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension == model_rf.linear_ff.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    w1 = model_hf.gate_proj.weight.T  # (in,out)
+    w2 = model_hf.up_proj.weight.T  # (in,out)
+    w3 = model_hf.down_proj.weight.T  # (out,in)
+    assert model_hf.gate_proj.bias is None  # not implemented
+    assert model_hf.up_proj.bias is None  # not implemented
+    assert model_hf.down_proj.bias is None  # not implemented
+    with torch.no_grad():
+        w = torch.cat((w1, w2), dim=1)  # (in,out*2)
+        model_rf.linear_ff.weight.raw_tensor.copy_(w)
+        model_rf.linear_out.weight.raw_tensor.copy_(w3)
+
+
+def import_params_hf_llama_rms_norm_to_rf(model_hf: LlamaRMSNorm, model_rf: rf.RMSNorm):
+    """
+    Import the parameters from the HF Llama RMSNorm module.
+    """
+    import torch
+
+    assert model_hf.weight.shape[0] == model_rf.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    w = model_hf.weight  # (in,)
+    with torch.no_grad():
+        model_rf.scale.raw_tensor.copy_(w)
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF Llama attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf, f"num params RF {num_params_rf} != params HF {num_params_hf}"
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
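For orientation, a minimal usage sketch of the new conversion module follows; it is not part of the package. It assumes a Llama-like decoder configuration: the HF config fields and the TransformerDecoder constructor arguments shown here (encoder_dim, vocab_dim, model_dim, num_layers, decoder_layer_opts, with_bias) are assumptions about the RF API and are not taken from this diff; only import_params_hf_llama_to_rf_transformer_decoder and the classes it asserts on (rf.RMSNorm, rf.RotaryPosCausalSelfAttention, FeedForwardGated) come from the files above.

# Hypothetical sketch: build an RF TransformerDecoder that structurally matches
# Llama (RMSNorm, rotary causal self-attention without biases, gated feed-forward,
# no additive positional encoding) and copy the HF weights into it.
import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated
from returnn.frontend.conversions.hf_llama import import_params_hf_llama_to_rf_transformer_decoder
from transformers import LlamaForCausalLM  # HF transformers, assumed installed

model_hf = LlamaForCausalLM.from_pretrained("...")  # checkpoint name/path elided
cfg = model_hf.config

model_rf = TransformerDecoder(
    encoder_dim=None,  # decoder-only language model
    vocab_dim=Dim(cfg.vocab_size, name="vocab"),
    model_dim=Dim(cfg.hidden_size, name="model"),
    num_layers=cfg.num_hidden_layers,
    num_heads=cfg.num_attention_heads,
    ff=FeedForwardGated,
    ff_dim=cfg.intermediate_size,
    pos_enc=None,  # Llama uses rotary embeddings inside the attention instead
    norm=rf.RMSNorm,
    # kwarg names below (decoder_layer_opts, with_bias) are assumed, not from this diff
    decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention, self_att_opts=dict(with_bias=False)),
)

# Copies all weights (transposing Torch's (out,in) Linear layout to RF's (in,out))
# and asserts that parameter counts and shapes match.
import_params_hf_llama_to_rf_transformer_decoder(model_hf, model_rf)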
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/transformer.py
RENAMED
|
@@ -13,10 +13,12 @@ References:
|
|
|
13
13
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
from typing import Optional, Any, Union, Tuple, Dict, Callable, Sequence
|
|
16
|
+
from types import FunctionType
|
|
16
17
|
import functools
|
|
17
18
|
import logging
|
|
18
19
|
import copy as _copy
|
|
19
20
|
from returnn.util.basic import NotSpecified, BehaviorVersion
|
|
21
|
+
from returnn.util.math import ceil_div
|
|
20
22
|
import returnn.frontend as rf
|
|
21
23
|
from returnn.tensor import Tensor, Dim, single_step_dim
|
|
22
24
|
|
|
@@ -36,6 +38,7 @@ class TransformerDecoder(rf.Module):
|
|
|
36
38
|
ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
|
|
37
39
|
ff_dim: Union[Dim, int] = NotSpecified,
|
|
38
40
|
ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
|
|
41
|
+
pos_enc: Union[None, Callable, Dict[str, Any], rf.Module] = rf.sinusoidal_positional_encoding,
|
|
39
42
|
dropout: float = 0.1,
|
|
40
43
|
num_heads: int = 8,
|
|
41
44
|
att_dropout: float = 0.1,
|
|
@@ -57,6 +60,7 @@ class TransformerDecoder(rf.Module):
         :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
+        :param pos_enc: positional encoding. Default is sinusoidal positional encoding.
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
@@ -92,10 +96,21 @@ class TransformerDecoder(rf.Module):
         if embed_dim:
             self.input_embedding_proj = rf.Linear(embed_dim, model_dim, with_bias=False)

-
-
-
-
+        if pos_enc is None:
+            pass
+        elif isinstance(pos_enc, dict):
+            pos_enc = rf.build_from_dict(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        elif isinstance(pos_enc, rf.Module):
+            pass
+        elif isinstance(pos_enc, FunctionType):
+            pos_enc = functools.partial(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        else:
+            raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
+        self.pos_enc = pos_enc
         if share_embedding is None:
             if BehaviorVersion.get() < 20:
                 logging.getLogger("returnn.frontend").warning(
@@ -189,7 +204,8 @@ class TransformerDecoder(rf.Module):
         new_state = rf.State()

         decoded = self.input_embedding(source) * self.input_embedding_scale
-
+        if self.pos_enc is not None:
+            decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
         decoded = rf.dropout(decoded, self.input_dropout)
         if self.input_embedding_proj is not None:
             decoded = self.input_embedding_proj(decoded)
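A hedged construction sketch for the new pos_enc option: passing pos_enc=None skips the additive positional encoding, e.g. when the self-attention layers encode positions themselves. The other TransformerDecoder arguments shown are assumptions based on its docstring, not part of this diff.

    # Hedged sketch, not from the package: argument names besides pos_enc are assumed.
    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder

    vocab_dim = Dim(10_000, name="vocab")
    enc_dim = Dim(512, name="enc")
    decoder = TransformerDecoder(
        num_layers=6,
        encoder_dim=enc_dim,
        vocab_dim=vocab_dim,
        model_dim=512,
        pos_enc=None,  # default remains rf.sinusoidal_positional_encoding
    )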
@@ -228,7 +244,9 @@ class TransformerDecoderLayer(rf.Module):
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
-        self_att: Optional[
+        self_att: Optional[
+            Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Dict[str, Any]]
+        ] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
         norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
@@ -271,7 +289,7 @@ class TransformerDecoderLayer(rf.Module):
         self.ff = ff
         self.ff_layer_norm = _make_norm(norm, out_dim)

-        if self_att is None or isinstance(self_att, type):
+        if self_att is None or isinstance(self_att, type) or isinstance(self_att, dict):
             self_att_opts_ = dict(
                 in_dim=out_dim,
                 proj_dim=out_dim,
@@ -284,10 +302,16 @@ class TransformerDecoderLayer(rf.Module):
                 self_att_opts_.update(self_att_opts)
             if self_att is None:
                 self.self_att = rf.CausalSelfAttention(**self_att_opts_)
-
+            elif isinstance(self_att, type):
                 self.self_att = self_att(**self_att_opts_)
+            elif isinstance(self_att, dict):
+                self.self_att = rf.build_from_dict(self_att, **self_att_opts_)
+            else:
+                raise TypeError(f"unexpected self_att type {self_att!r}")
+        elif isinstance(self_att, rf.Module):
+            self.self_att = _copy.deepcopy(self_att)
         else:
-
+            raise TypeError(f"unexpected self_att type {self_att!r}")
         self.self_att_layer_norm = _make_norm(norm, out_dim)

         self.cross_att = None
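The widened self_att argument now also accepts a dict (resolved via rf.build_from_dict, with in_dim, num_heads etc. filled in by the layer) or a prebuilt module (deep-copied per layer). A hedged sketch; the "class"-key convention for the dict and the remaining arguments are assumptions, not taken from this diff.

    # Hedged sketch, not from the package: the "class" key convention for
    # rf.build_from_dict and the other constructor arguments are assumed.
    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder

    vocab_dim = Dim(10_000, name="vocab")
    lm = TransformerDecoder(
        num_layers=12,
        encoder_dim=None,  # assumed: no cross-attention, decoder-only LM
        vocab_dim=vocab_dim,
        model_dim=768,
        pos_enc=None,  # rotary attention already encodes positions
        self_att={"class": rf.RotaryPosCausalSelfAttention, "with_bias": False},
    )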
@@ -353,12 +377,15 @@ class FeedForward(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        with_bias: bool = True,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
         :param activation: activation function, relu by default
+        :param with_bias: whether to use bias in the linear layers.
+            True by default for compatibility, but nowadays it's common to use without bias.
         """
         super().__init__()

@@ -381,8 +408,8 @@ class FeedForward(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.activation = activation

-        self.linear_ff = rf.Linear(out_dim, ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)

     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
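With the new with_bias flag, the vanilla FF block can drop its biases; a short hedged sketch with example dimensions only:

    # Hedged sketch, not from the package.
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import FeedForward

    model_dim = Dim(512, name="model")
    ff = FeedForward(out_dim=model_dim, with_bias=False)  # ff_dim defaults to 4 times out_dim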
@@ -401,6 +428,8 @@ class FeedForwardGated(rf.Module):
         f(Linear(x)) * Linear(x)

     This is a feed-forward block based on SwiGLU, as defined in the paper.
+
+    Alternative to :class:`FeedForward`.
     """

     def __init__(
@@ -410,14 +439,30 @@ class FeedForwardGated(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+        with_bias: bool = False,
     ):
+        """
+        :param out_dim:
+        :param ff_dim: intermediate dimension.
+            Unlike :class:`FeedForward`:
+            If not provided, factor 4*2/3 to keep same number of parameters as in the original :class:`FeedForward`,
+            just as in the paper, and also making it a multiple of 256.
+        :param dropout:
+        :param activation: activation function for the gating. unlike :class:`FeedForward`, default is swish.
+        :param with_bias: whether to use bias in the linear layers.
+            unlike :class:`FeedForward`, default is False.
+        """
         super().__init__()

         if isinstance(ff_dim, int):
             ff_dim = Dim(ff_dim, name="transformer-ff-dim")
         if ff_dim is NotSpecified or ff_dim is None:
-            # Factor
-
+            # Factor 4 as usual.
+            # The additional factor 2/3 to keep same number of parameters as in the original FF block,
+            # just as in the paper.
+            ff_dim_ = out_dim.dimension * 4 * 2 // 3
+            ff_dim_ = ceil_div(ff_dim_, 256) * 256  # make multiple of 256
+            ff_dim = Dim(ff_dim_, name="transformer-ff-dim")
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")

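To make the new default concrete, a small standalone example of the ff_dim computation for FeedForwardGated (ceil_div is reproduced inline here so the snippet runs on its own; in the package it comes from returnn.util.math):

    # Worked example: factor 4 as usual, times 2/3 (SwiGLU paper), rounded up to a multiple of 256.
    def ceil_div(a: int, b: int) -> int:
        # same result as returnn.util.math.ceil_div for positive ints
        return -(-a // b)

    for model_dim in (512, 1024):
        ff_dim = model_dim * 4 * 2 // 3
        ff_dim = ceil_div(ff_dim, 256) * 256
        print(model_dim, "->", ff_dim)  # 512 -> 1536, 1024 -> 2816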
@@ -434,8 +479,8 @@ class FeedForwardGated(rf.Module):
         self.activation = activation

         # Factor 2 because we concatenate the two paths.
-        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)

     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_backend.py
RENAMED
@@ -983,6 +983,9 @@ class TorchBackend(Backend[torch.Tensor]):
         elif axis_int == 0 and source.batch_ndim == 2:
             # This is exactly what torch.embedding is intended for. Let's use that.
             out.raw_tensor = torch.embedding(source.raw_tensor, indices.raw_tensor)
+        elif indices.batch_ndim <= 1:
+            # Note: This also works when indices is on CPU and source is on GPU.
+            out.raw_tensor = source.raw_tensor[(slice(None),) * axis_int + (indices.raw_tensor,)]
         else:
             out_raw = torch.index_select(source.raw_tensor, dim=axis_int, index=indices.raw_tensor.flatten())
             out_shape = (
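The new branch uses plain advanced indexing instead of torch.index_select; a standalone check (plain PyTorch, not RETURNN code) that the two agree for a 1-D index tensor:

    import torch

    source = torch.randn(3, 5, 7)
    indices = torch.tensor([4, 0, 2])
    axis = 1

    a = source[(slice(None),) * axis + (indices,)]            # new gather path
    b = torch.index_select(source, dim=axis, index=indices)   # general path
    assert torch.equal(a, b)
    print(a.shape)  # torch.Size([3, 3, 7])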
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug.py
RENAMED
@@ -628,7 +628,10 @@ class PyTracer:
                    continue
                prev = self.captured_locals[func][-1].get(k, None)
                if prev is None or prev[-1] is not v:
-                    print(
+                    print(
+                        f"{func.__qualname__}[{len(self.captured_locals[func]) - 1}]"
+                        f" {type(v).__qualname__} var changed: {k} = {v}"
+                    )
                    self.captured_locals[func][-1].setdefault(k, []).append(v)
        return self
    return prev_trace_func_res
@@ -686,9 +689,13 @@ def check_py_traces_rf_to_pt_equal(
         else:
             raise TypeError(f"invalid dim type: {dim!r}")

+    def _format_check(check: Tuple[Union[FunctionType, Callable], int, str, int]) -> str:
+        func, i, var_name, j = check
+        return f"{func.__qualname__}[{i}] {var_name}[{j}]"
+
     non_matching = []
     for check_rf, check_pt, pt_dims in checks:
-        print(f"checking {check_rf} vs {check_pt} ({pt_dims})...")
+        print(f"checking {_format_check(check_rf)} vs {_format_check(check_pt)} ({pt_dims})...")
         tensor_rf: Tensor = _get_entry(trace_rf, *check_rf)
         tensor_pt: torch.Tensor = _get_entry(trace_pt, *check_pt)
         if callable(pt_dims):
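For illustration, a standalone toy of what the new _format_check helper produces for a check tuple (function, call index, variable name, value index); the helper body is copied from the diff, the sample function is made up:

    def _format_check(check):
        func, i, var_name, j = check
        return f"{func.__qualname__}[{i}] {var_name}[{j}]"

    def decode_step(x):  # stand-in for a traced function
        return x

    print(_format_check((decode_step, 0, "logits", -1)))  # -> "decode_step[0] logits[-1]"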