aigc-core 0.0.1 (aigc_core-0.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aigc_core-0.0.1.dist-info/METADATA ADDED
@@ -0,0 +1,12 @@
+ Metadata-Version: 2.4
+ Name: aigc-core
+ Version: 0.0.1
+ Summary: Add your description here
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ Requires-Dist: python-dotenv
+ Requires-Dist: datasets>=4.5.0
+ Requires-Dist: peft>=0.18.1
+ Requires-Dist: pip>=25.3
+ Requires-Dist: swanlab>=0.7.6
+ Requires-Dist: torch>=2.10.0
aigc_core-0.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,18 @@
+ aigcore/__init__.py,sha256=xfZFqDDVqXTRUglmrVcqZqYxc6xsWuWY3pK1SNJqFRY,411
+ aigcore/_logger.py,sha256=74DTl4hbAZdoIZ_WL5k4dcXqUQ2-VM0SEaDdNQM5yv0,2367
+ aigcore/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ aigcore/llm/__init__.py,sha256=YUDY2RxdCPX954l7pnB0UptTtx5BxdM7ZJC0boX0OI4,143
+ aigcore/llm/attention/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ aigcore/llm/attention/_self_attention.py,sha256=vi9H1rJPv1E68nOt6IlyiKDDm1hu7iEgNEQ78STTINw,1085
+ aigcore/llm/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ aigcore/llm/embed/_rope.py,sha256=PoP7FaemsbJmxx6uuM4yfHNMySi7akbEeH3ewBexxs0,3606
+ aigcore/llm/lora/__init__.py,sha256=16mRx18XLQkfDnIB2_dUJRdoRlQ0WCX_fJfNtmQhJDQ,143
+ aigcore/llm/lora/_lora_base.py,sha256=qGBicjOkkp71kGQqYaySmiJ9To8fHn1kk60glOlidF8,2635
+ aigcore/llm/model/__init__.py,sha256=kl-sn2CMGuZxr_5T6gehYqdi6j5fq7mFL0B2Jt4cnEw,89
+ aigcore/llm/model/_minimind.py,sha256=NIILze7afzSMt7FTzUap7g2XM7On071yh_mgQO7RQJU,3848
+ aigcore/llm/norm/__init__.py,sha256=emv2DeHDgM5JYY1AJWzB-oirJiGyxlNW01fe8ITzDnc,85
+ aigcore/llm/norm/_norm.py,sha256=5qd8KgZ2n3sFL7SVyHVeEOVZ2tvpPErzMMAzhbiF6_8,3592
+ aigc_core-0.0.1.dist-info/METADATA,sha256=ImOBo8RmCsxhTVY4iaZRiv0DZKQYj1KwQ5oisW24tyw,324
+ aigc_core-0.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ aigc_core-0.0.1.dist-info/top_level.txt,sha256=Sg7mTVSn-QTQ740jamTJRvF1W7bzwS_mNiBJqxbcUE4,8
+ aigc_core-0.0.1.dist-info/RECORD,,
aigc_core-0.0.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
aigc_core-0.0.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ aigcore
aigcore/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ An AIGC library that includes LLM and agent components.
+
+ PyPI: https://pypi.org/project/aigcore/
+ GitHub: https://github.com/torrentbrave/aigcore
+ """
+
+ __author__ = "BoHaoChen"
+
+ __connect__ = "X @TorrentBrave"
+
+ __version__ = "0.0.1"
+
+ from ._logger import logger, print
+ from . import llm
+ from . import agent
+
+
+ # Sentinel type; its single instance is exported below as NULL.
+ class Null:
+     pass
+
+
+ NULL = Null()
+
+ __all__ = ["logger", "print", "NULL"]
+ # agent/__init__.py ships empty, so fall back to an empty list if it defines no __all__
+ __all__.extend(llm.__all__ + getattr(agent, "__all__", []))
aigcore/_logger.py ADDED
@@ -0,0 +1,95 @@
+ import os
+ import datetime
+ import logging
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ __all__ = ["logger", "print"]
+
+ _print = print
+
+
+ def print(
+     *args,
+     **kwargs,
+ ) -> None:
+     """
+     Prints, and then flushes instantly.
+
+     The usage is the same as the built-in `print`.
+
+     Parameters
+     ----------
+     See also the built-in `print`.
+
+     Returns
+     -------
+     None
+
+     Notes
+     -----
+     `args` and `kwargs` are passed to the built-in `print`. `flush` is
+     overridden to True no matter what.
+     """
+     kwargs["flush"] = True
+     _print(*args, **kwargs)
+
+
+ def logger(
+     *,
+     name: str | None = None,
+     dir: str | None = None,
+ ) -> logging.Logger:
+     """
+     Returns a pre-configured `logging.Logger` object.
+
+     INFO and above are written to the .log file.
+
+     WARNING and above are additionally echoed to the console.
+
+     Parameters
+     ----------
+     name: str | None = None
+         `logging.Logger.name`. If None, it is set to 'aigcore' followed by
+         the current datetime. The filename of the .log file is set to
+         `name` followed by '.log'.
+         Specify a fixed name to keep everything in a single log file instead
+         of creating a new file on every call.
+     dir: str | None = None
+         The directory where the .log file is stored. If None, the LOGDIR
+         environment variable is used, falling back to 'logs'. The directory
+         is created if it does not exist.
+
+     Returns
+     -------
+     logging.Logger
+         A pre-configured `logging.Logger` object.
+     """
+     if dir is None:
+         dir = os.getenv("LOGDIR", "logs")
+
+     if name is None:
+         name = "aigcore" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+
+     os.makedirs(dir, exist_ok=True)
+     log_path = os.path.join(dir, f"{name}.log")
+
+     logger = logging.getLogger(name=name)
+     logger.setLevel(logging.INFO)
+     handler1 = logging.FileHandler(log_path, mode="w", encoding="utf-8")
+     handler1.setLevel(logging.INFO)
+     handler1.setFormatter(
+         logging.Formatter(fmt="# %(levelname)s %(asctime)s\n%(message)s\n")
+     )
+     logger.addHandler(handler1)
+     handler2 = logging.StreamHandler()
+     handler2.setLevel(logging.WARNING)
+     handler2.setFormatter(
+         logging.Formatter(fmt="# %(levelname)s %(asctime)s\n%(message)s\n")
+     )
+     logger.addHandler(handler2)
+     return logger
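
For context, a minimal usage sketch (editorial, not part of the wheel), assuming the `logger` and `print` helpers defined above are in scope and that writing to a local `logs/` directory is acceptable; the run name is illustrative:

log = logger(name="train-run")   # creates logs/train-run.log
log.info("written to the log file")
log.warning("written to the log file and echoed to the console")
print("flushed to stdout immediately")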
File without changes
aigcore/llm/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from . import lora
+ from . import model
+ from . import embed
+ from . import norm
+
+ __all__ = [
+     "lora",
+     "model",
+     "embed",
+     "norm",
+ ]
File without changes
aigcore/llm/attention/_self_attention.py ADDED
@@ -0,0 +1,35 @@
+ """
+ Visualizing the Self-Attention Mechanism - https://codingowen.github.io/blog/2025/02/27/self_attention_intuition/
+ Building the Self-Attention Mechanism from scratch
+ (1) https://codingowen.github.io/projects/self_attention/
+ (2) https://mohdfaraaz.medium.com/implementing-self-attention-from-scratch-in-pytorch-776ef7b8f13e
+ """
+
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class SelfAttention(nn.Module):
+     def __init__(self, d, d_k, d_q, d_v):
+         super().__init__()  # must be called, not merely referenced
+         self.d = d
+         self.d_k = d_k
+         self.d_q = d_q
+         self.d_v = d_v
+
+         # projection matrices for keys, queries, and values
+         self.W_K = nn.Parameter(torch.rand(d, d_k))
+         self.W_Q = nn.Parameter(torch.rand(d, d_q))
+         self.W_V = nn.Parameter(torch.rand(d, d_v))
+
+     def forward(self, X):
+         # X: [seq_len, d] -> K/Q: [seq_len, d_k], V: [seq_len, d_v]
+         K = X @ self.W_K
+         Q = X @ self.W_Q
+         V = X @ self.W_V
+
+         # scaled dot-product attention over a single (unbatched) sequence
+         attention_scores = Q @ K.T / math.sqrt(self.d_k)
+         attention_weights = F.softmax(attention_scores, dim=-1)
+         context_vector = attention_weights @ V
+
+         return context_vector
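
As an editorial illustration (not shipped with the package), a quick shape check assuming the `SelfAttention` class above is in scope; note that `d_q` must equal `d_k` for `Q @ K.T` to be defined, and `forward` expects a single unbatched sequence:

import torch

attn = SelfAttention(d=16, d_k=8, d_q=8, d_v=8)
X = torch.rand(5, 16)   # 5 tokens with feature dimension d = 16
out = attn(X)           # context vectors, shape [5, 8]
print(out.shape)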
File without changes
aigcore/llm/embed/_rope.py ADDED
@@ -0,0 +1,93 @@
+ import math
+ import torch
+ import torch.nn.init as init
+ import torch.nn.functional as F
+ from torch import nn
+ from transformers.activations import ACT2FN
+ from typing import Optional, Tuple, List, Union
+ from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+ def precompute_freqs_cis(
+     dim: int,
+     end: int = int(32 * 1024),
+     rope_base: float = 1e6,
+     rope_scaling: Optional[dict] = None,
+ ):
+     """
+     Precompute the cos and sin matrices required by rotary position embeddings (RoPE).
+     YaRN (Yet another RoPE extensioN): dynamically extends the model's context window
+     at inference time (extrapolation).
+
+     torch.arange(0, dim, 2) takes every second index from 0 up to dim;
+     the [: dim // 2] slice guarantees exactly dim // 2 frequencies.
+     """
+     freqs, attn_factor = (
+         1.0 / (rope_base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)),
+         1.0,
+     )
+     if rope_scaling is not None:
+         orig_max, factor, beta_fast, beta_slow, attn_factor = (
+             rope_scaling.get("original_max_position_embeddings", 2048),
+             rope_scaling.get("factor", 16),
+             rope_scaling.get("beta_fast", 32.0),
+             rope_scaling.get("beta_slow", 1.0),
+             rope_scaling.get("attention_factor", 1.0),
+         )
+         if end / orig_max > 1.0:
+             # YaRN: f'(i) = f(i) * ((1 - γ) + γ / s), where γ ∈ [0, 1] is a linear ramp
+             inv_dim = lambda b: (dim * math.log(orig_max / (b * 2 * math.pi))) / (
+                 2 * math.log(rope_base)
+             )
+             low, high = (
+                 max(math.floor(inv_dim(beta_fast)), 0),
+                 min(math.ceil(inv_dim(beta_slow)), dim // 2 - 1),
+             )
+             ramp = torch.clamp(
+                 (torch.arange(dim // 2, device=freqs.device).float() - low)
+                 / max(high - low, 0.001),
+                 0,
+                 1,
+             )
+             freqs = freqs * (1 - ramp + ramp / factor)
+
+     t = torch.arange(end, device=freqs.device)
+     freqs = torch.outer(t, freqs).float()
+     freqs_cos = torch.cat([torch.cos(freqs), torch.cos(freqs)], dim=-1) * attn_factor
+     freqs_sin = torch.cat([torch.sin(freqs), torch.sin(freqs)], dim=-1) * attn_factor
+     return freqs_cos, freqs_sin
+
+
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+     def rotate_half(x):
+         return torch.cat(
+             (-x[..., x.shape[-1] // 2 :], x[..., : x.shape[-1] // 2]), dim=-1
+         )
+
+     q_embed = (q * cos.unsqueeze(unsqueeze_dim)) + (
+         rotate_half(q) * sin.unsqueeze(unsqueeze_dim)
+     )
+     k_embed = (k * cos.unsqueeze(unsqueeze_dim)) + (
+         rotate_half(k) * sin.unsqueeze(unsqueeze_dim)
+     )
+     return q_embed, k_embed
+
+
+ """
+ Absolute position encoding: builds a 1-D vector and injects position information
+ into the embedding by addition.
+ Rotary position encoding: builds per-pair rotation matrices and injects position
+ information into the feature components by matrix multiplication.
+
+ P(m, i): the 2-D pair for position m and angle index i; d is the attention feature dimension
+ P(m, i) = [sin(m * theta_i), cos(m * theta_i)]
+
+ theta: defines the rotation angle
+ (1) controls how fast each pair rotates: theta_i = 1 / b^(2(i - 1) / d)
+ i in [1, d/2]: RoPE does not rotate all dimensions uniformly; it groups them in pairs,
+ treats each pair as a point in a 2-D plane, and i indexes those pairs.
+
+ Q1: Why do the slowly rotating (higher-index) pairs capture long-range distances?
+ A1: They avoid angle aliasing.
+ (1) Phase wrap-around: low-index pairs rotate fast, so for distant tokens the "pointer"
+     may have completed dozens of full turns; the model cannot tell whether two tokens
+     are 1, 101, or 201 positions apart.
+ (2) Unique position-to-angle mapping: with slow rotation, tokens even 1000 positions
+     apart may differ by only ~30 degrees, so the angle stays unambiguous.
+ """
aigcore/llm/lora/__init__.py ADDED
@@ -0,0 +1,9 @@
+ "The lora module"
+
+ from ._lora_base import load_lora, save_lora, apply_lora
+
+ __all__ = [
+     "load_lora",
+     "save_lora",
+     "apply_lora",
+ ]
aigcore/llm/lora/_lora_base.py ADDED
@@ -0,0 +1,80 @@
+ import torch
+ from torch import nn
+
+
+ class LoRA(nn.Module):
+     """
+     output = W_0 x + (alpha / rank) * B(A(x))
+
+     1. matrix B (all zeros):
+        ensures the initial state of the LoRA path is B(A(x)) = 0,
+        so the wrapped layer starts out identical to the original one
+     2. matrix A (random / Gaussian):
+        breaks symmetry.
+        If both A and B were zero, the gradients for all neurons would be identical.
+        Randomizing A ensures that different neurons can learn different features
+        once training begins and the weights start to update.
+     """
+
+     def __init__(self, in_features, out_features, rank):
+         super().__init__()
+         self.A = nn.Linear(in_features, rank, bias=False)
+         self.B = nn.Linear(rank, out_features, bias=False)
+         self.A.weight.data.normal_(0, std=0.02)
+         self.B.weight.data.zero_()
+
+     def forward(self, x):
+         return self.B(self.A(x))
+
+
+ def apply_lora(model, rank=8):
+     """
+     Explicitly binds a LoRA adapter to every square nn.Linear layer and
+     patches its forward pass to add the low-rank path.
+     """
+     for name, module in model.named_modules():
+         if (
+             isinstance(module, nn.Linear)
+             and module.weight.shape[0] == module.weight.shape[1]
+         ):
+             lora = LoRA(module.weight.shape[0], module.weight.shape[1], rank=rank).to(
+                 model.device
+             )
+             setattr(module, "lora", lora)
+             original_forward = module.forward
+
+             # default arguments freeze this layer's original forward and adapter
+             def forward_with_lora(x, layer1=original_forward, layer2=lora):
+                 return layer1(x) + layer2(x)
+
+             module.forward = forward_with_lora
+
+
+ def load_lora(model, path):
+     state_dict = torch.load(path, map_location=model.device)
+     # strip the "module." prefix left over from DataParallel/DDP checkpoints
+     state_dict = {
+         (k[7:] if k.startswith("module.") else k): v for k, v in state_dict.items()
+     }
+
+     for name, module in model.named_modules():
+         if hasattr(module, "lora"):
+             lora_state = {
+                 k.replace(f"{name}.lora.", ""): v
+                 for k, v in state_dict.items()
+                 if f"{name}.lora." in k
+             }
+             module.lora.load_state_dict(lora_state)
+
+
+ def save_lora(model, path):
+     """
+     raw_model = getattr(model, "_orig_mod", model)
+     If compiled with torch.compile: grabs the hidden original model (_orig_mod).
+     If not compiled: just uses the model as-is.
+     """
+     raw_model = getattr(model, "_orig_mod", model)
+     state_dict = {}
+     for name, module in raw_model.named_modules():
+         if hasattr(module, "lora"):
+             clean_name = name[7:] if name.startswith("module.") else name
+             lora_state = {
+                 f"{clean_name}.lora.{k}": v for k, v in module.lora.state_dict().items()
+             }
+             state_dict.update(lora_state)
+     torch.save(state_dict, path)
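
The intended workflow, sketched editorially (not shipped with the package), assuming the three helpers above are in scope; `TinyModel` is a hypothetical stand-in that exposes the `.device` attribute which `apply_lora` and `load_lora` rely on, and the checkpoint filename is illustrative:

import torch
from torch import nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)      # square layer -> receives a LoRA adapter
        self.head = nn.Linear(16, 4)       # non-square -> left untouched
        self.device = torch.device("cpu")  # apply_lora/load_lora expect this attribute

    def forward(self, x):
        return self.head(self.proj(x))

model = TinyModel()
apply_lora(model, rank=4)                  # patch proj.forward to add the LoRA path
out = model(torch.rand(2, 16))             # forward now includes the adapter
save_lora(model, "lora_adapter.pt")        # persist only the adapter weights
load_lora(model, "lora_adapter.pt")        # restore them into a wrapped model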
aigcore/llm/model/__init__.py ADDED
@@ -0,0 +1,4 @@
+ "The model module."
+
+ from ._minimind import minimind
+ # from ._minimindv import minimindv
aigcore/llm/model/_minimind.py ADDED
@@ -0,0 +1,103 @@
+ import math
+ import torch
+ import torch.nn.init as init
+ import torch.nn.functional as F
+ from torch import nn
+ from transformers.activations import ACT2FN
+ from typing import Optional, Tuple, List, Union
+ from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from ..norm import RMSNorm, LayerNorm
+
+
+ class MiniMindConfig(PretrainedConfig):
+     """
+     Base parameters:
+     (1) vocab_size: 6400
+     (2) hidden_size: 512
+     (3) num_hidden_layers: 8
+
+     YaRN (RoPE extension):
+     inference_rope_scaling: True
+     extends the context window so the model can handle sequences longer than it was trained on
+
+     MoE:
+     use_moe: True
+     """
+
+     model_type = "minimind"
+
+     def __init__(
+         self,
+         dropout: float = 0.0,
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         hidden_act: str = "silu",
+         hidden_size: int = 512,
+         intermediate_size: Optional[int] = None,
+         max_position_embeddings: int = 32768,
+         num_attention_heads: int = 8,
+         num_hidden_layers: int = 8,
+         num_key_value_heads: int = 2,
+         vocab_size: int = 6400,
+         rms_norm_eps: float = 1e-05,
+         rope_theta: float = 1000000.0,
+         inference_rope_scaling: bool = False,
+         flash_attn: bool = True,
+         ####################################################
+         # Here are the specific configurations of MoE.
+         # When use_moe is False, the following is invalid.
+         ####################################################
+         use_moe: bool = False,
+         num_experts_per_tok: int = 2,
+         n_routed_experts: int = 4,
+         n_shared_experts: int = 1,
+         scoring_func: str = "softmax",
+         aux_loss_alpha: float = 0.01,
+         seq_aux: bool = True,
+         norm_topk_prob: bool = True,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.dropout = dropout
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.hidden_act = hidden_act
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.num_key_value_heads = num_key_value_heads
+         self.vocab_size = vocab_size
+         self.rms_norm_eps = rms_norm_eps
+         self.rope_theta = rope_theta
+         self.inference_rope_scaling = inference_rope_scaling
+         # extrapolated length = factor * original_max_position_embeddings = 32768
+         self.rope_scaling = (
+             {
+                 "beta_fast": 32,
+                 "beta_slow": 1,
+                 "factor": 16,
+                 "original_max_position_embeddings": 2048,
+                 "attention_factor": 1.0,
+                 "type": "yarn",
+             }
+             if self.inference_rope_scaling
+             else None
+         )
+         self.flash_attn = flash_attn
+         ####################################################
+         # Here are the specific configurations of MoE.
+         # When use_moe is False, the following is invalid.
+         ####################################################
+         self.use_moe = use_moe
+         self.num_experts_per_tok = num_experts_per_tok  # number of experts selected per token
+         self.n_routed_experts = n_routed_experts  # total number of routed experts
+         self.n_shared_experts = n_shared_experts  # number of shared experts
+         self.scoring_func = scoring_func  # scoring function, defaults to 'softmax'
+         self.aux_loss_alpha = aux_loss_alpha  # weight of the auxiliary (load-balancing) loss
+         self.seq_aux = seq_aux  # whether to compute the auxiliary loss at sequence level
+         self.norm_topk_prob = norm_topk_prob  # whether to normalize the top-k probabilities
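
For orientation, an editorial sketch of constructing the config above with YaRN scaling enabled, assuming `MiniMindConfig` as shown is in scope (the values are its declared defaults):

cfg = MiniMindConfig(inference_rope_scaling=True)  # defaults: hidden_size=512, num_hidden_layers=8
print(cfg.rope_scaling["type"])                    # "yarn"
# extrapolated window = factor * original_max_position_embeddings
print(cfg.rope_scaling["factor"] * cfg.rope_scaling["original_max_position_embeddings"])  # 32768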
aigcore/llm/norm/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from ._norm import RMSNorm, LayerNorm
+
+ __all__ = [
+     "RMSNorm",
+     "LayerNorm",
+ ]
aigcore/llm/norm/_norm.py ADDED
@@ -0,0 +1,56 @@
+ """
+ Q1: Why do RMSNorm and LayerNorm both operate over the token's feature dimension rather than across the batch?
+ A1: BatchNorm is the normalization commonly used for image data.
+     Images have strong spatial correlation: neighboring pixels tend to share similar values or patterns, so pixel features within a batch follow similar distributions, which makes normalizing over the whole batch reasonable. By computing the mean and variance of each feature (e.g. each channel), BatchNorm dampens the effect of this spatial correlation and keeps each layer's input distribution stable during training, which speeds up convergence.
+     In NLP, each token is a unit with its own semantics and context, e.g. a word. A token's features come from the embedding layer or the Transformer layers and encode that token's meaning. Different tokens carry different content, so their features should be normalized independently.
+     Normalizing over the batch dimension ignores this independence: the statistics mix tokens with different content, coupling them together. Padding tokens, for instance, carry no real meaning yet would still enter the normalization statistics and degrade what the model learns.
+
+ Q2: Why RMSNorm instead of LayerNorm?
+ A2: (1) The computation is simpler: it skips the mean and has one fewer learnable parameter.
+         LayerNorm needs each token's mean and variance to standardize the input,
+         whereas RMSNorm only needs the sum of squared features, cutting compute and memory.
+     (2) In large models the feature dimension can be very large, so computing means and variances is relatively expensive. Dropping the mean saves compute, especially in high dimensions.
+     (3) Across a range of settings, RMSNorm has been reported to reduce computation time by roughly 7%-64%.
+
+ Q3: What does per-token normalization buy us?
+ A3: (1) Variable-length sequences: sequence length is dynamic at inference time. If normalization depended on other tokens (the batch dimension), the mean and variance would fluctuate sharply as sequences grow.
+     (2) Parallelism: each token's normalization can be computed independently and in parallel, without waiting for statistics from the rest of the batch.
+ """
+
+ import torch
+ from torch import nn
+
+
+ class RMSNorm(nn.Module):
+     """
+     x.shape: [batch_size, seq_length, embedding_dim]
+     gamma: learnable per-feature scale parameter, stored as `weight`
+
+     torch.rsqrt: reciprocal square root of x.pow(2).mean(-1, keepdim=True) + self.eps;
+         calling rsqrt directly is faster on GPU than sqrt followed by a division
+     keepdim: e.g. [1, 2, 4] -> [1, 2, 1] instead of [1, 2]
+
+     The standard normalization layer of the Llama family: compared with LayerNorm it drops the mean subtraction, so it is simpler to compute and trains stably.
+     """
+
+     def __init__(self, dim: int, eps: float = 1e-5):
+         super().__init__()  # required so that `weight` is registered as a parameter
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def _norm(self, x):
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+     def forward(self, x):
+         return self.weight * self._norm(x.float()).type_as(x)
+
+
+ class LayerNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-5):
+         super().__init__()  # required so that `weight` and `bias` are registered
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+         self.bias = nn.Parameter(torch.zeros(dim))
+
+     def forward(self, x):
+         mean = x.mean(dim=-1, keepdim=True)
+         var = x.var(dim=-1, keepdim=True, unbiased=False)
+         return self.weight * (x - mean) / (var + self.eps).sqrt() + self.bias
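
A quick editorial shape check for the two layers above (assuming the classes, with `super().__init__()` called as fixed here, are in scope):

import torch

x = torch.randn(2, 4, 512)         # [batch, seq_len, hidden]
rms = RMSNorm(512)
ln = LayerNorm(512)
print(rms(x).shape, ln(x).shape)   # both torch.Size([2, 4, 512])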