orbit-torch 0.0.4a1-py3-none-any.whl → 0.1.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. orbit/__init__.py +3 -1
  2. orbit/callback.py +4 -3
  3. orbit/dataset/__init__.py +1 -0
  4. orbit/dataset/cogn.py +138 -0
  5. orbit/dataset/data/cogn_en.jsonl +45 -0
  6. orbit/dataset/data/cogn_zh.jsonl +113 -0
  7. orbit/engine.py +210 -146
  8. orbit/kit/__init__.py +2 -0
  9. orbit/kit/interface.py +154 -0
  10. orbit/kit/wrapper.py +157 -0
  11. orbit/model/__init__.py +5 -0
  12. orbit/model/base.py +125 -0
  13. orbit/model/block/__init__.py +34 -0
  14. orbit/model/block/attention.py +265 -0
  15. orbit/model/block/bio.py +537 -0
  16. orbit/model/block/codebook.py +122 -0
  17. orbit/model/block/conv.py +505 -0
  18. orbit/model/block/embedding.py +252 -0
  19. orbit/model/block/film.py +176 -0
  20. orbit/model/block/fusion.py +335 -0
  21. orbit/model/block/gate.py +334 -0
  22. orbit/model/block/lora.py +776 -0
  23. orbit/model/block/mlp.py +68 -0
  24. orbit/model/block/moe.py +94 -0
  25. orbit/model/block/tcn.py +99 -0
  26. orbit/model/config.py +62 -0
  27. orbit/model/kit/__init__.py +6 -0
  28. orbit/model/kit/discriminator.py +46 -0
  29. orbit/model/kit/losses.py +193 -0
  30. orbit/model/motif/__init__.py +0 -0
  31. orbit/model/motif/vision/__init__.py +0 -0
  32. orbit/model/motif/vision/v1.py +645 -0
  33. orbit/model/registry.py +53 -0
  34. orbit/optim/__init__.py +2 -2
  35. orbit/optim/sam.py +10 -3
  36. orbit/plugin/__init__.py +12 -8
  37. orbit/plugin/board.py +1 -2
  38. orbit/plugin/checkpoint.py +137 -62
  39. orbit/plugin/classification.py +2 -2
  40. orbit/plugin/display_model.py +1 -2
  41. orbit/plugin/early_stopping.py +1 -2
  42. orbit/plugin/ema.py +1 -2
  43. orbit/plugin/gradient_accumulation.py +1 -2
  44. orbit/plugin/lora.py +346 -0
  45. orbit/plugin/memory_estimator.py +1 -2
  46. orbit/plugin/warmup.py +1 -2
  47. orbit/utils/__init__.py +24 -1
  48. orbit/utils/cuda.py +10 -0
  49. orbit/utils/freeze.py +61 -17
  50. orbit/utils/image.py +164 -0
  51. orbit/utils/initialization.py +184 -94
  52. orbit/utils/layer_io.py +66 -7
  53. orbit/utils/lora.py +480 -0
  54. orbit/utils/moe.py +55 -0
  55. orbit/utils/seed.py +3 -19
  56. orbit/utils/sft.py +93 -0
  57. orbit_torch-0.1.0b1.dist-info/METADATA +208 -0
  58. orbit_torch-0.1.0b1.dist-info/RECORD +65 -0
  59. orbit_torch-0.0.4a1.dist-info/METADATA +0 -25
  60. orbit_torch-0.0.4a1.dist-info/RECORD +0 -29
  61. {orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/WHEEL +0 -0
  62. {orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/top_level.txt +0 -0
orbit/model/block/embedding.py
@@ -0,0 +1,252 @@
+ import torch
+ import torch.nn as nn
+ import math
+
+ from orbit.model import BaseBlock, register_model
+
+
+ @register_model()
+ class RotaryPositionalEmbedding(BaseBlock):
+     '''
+     Rotary Positional Embedding (RoPE).
+     '''
+
+     def __init__(self, model_dim: int, max_len: int = 128000, base: int = 10000):
+         '''
+         Initialize the RoPE module.
+
+         Args:
+             model_dim (int): Model dimension (or head_dim). Must be even.
+             max_len (int, optional): Maximum sequence length for the precomputed embeddings. Defaults to 128000.
+             base (int, optional): Base for the frequency computation. Defaults to 10000.
+         '''
+         super(RotaryPositionalEmbedding, self).__init__()
+
+         self.model_dim = model_dim
+         self.max_len = max_len
+         self.base = base
+
+         inv_freq = 1.0 / (base ** (torch.arange(0, model_dim, 2).float() / model_dim))
+
+         t = torch.arange(max_len, dtype=torch.float)
+
+         freqs = torch.outer(t, inv_freq)
+
+         emb = torch.cat((freqs, freqs), dim=-1)
+
+         self.register_buffer('cos_cached', emb.cos())
+         self.register_buffer('sin_cached', emb.sin())
+
+     def _rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         '''
+         Split the vector into two halves and rotate: [-x2, x1].
+         Whether the input is 3D or 4D, the split acts on the last dimension (model_dim).
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             torch.Tensor: Rotated tensor.
+         '''
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+         '''
+         Apply rotary positional embedding.
+
+         Automatically adapts to two input layouts:
+         1. [Batch, Seq_Len, Dim]
+         2. [Batch, Head, Seq_Len, Head_Dim]
+
+         Args:
+             x (torch.Tensor): Input tensor.
+             start_pos (int, optional): Starting position index, used for KV-cache inference. Defaults to 0.
+
+         Returns:
+             torch.Tensor: Tensor with positional information applied.
+         '''
+         ndim = x.ndim
+         seq_len = x.shape[-2]
+
+         cos = self.cos_cached[start_pos : start_pos + seq_len, :]
+         sin = self.sin_cached[start_pos : start_pos + seq_len, :]
+
+         shape = [1] * (ndim - 2) + [seq_len, -1]
+         cos = cos.view(*shape)
+         sin = sin.view(*shape)
+
+         return (x * cos) + (self._rotate_half(x) * sin)
+
+
+ @register_model()
+ class SinusoidalPositionalEmbedding(BaseBlock):
+
+     def __init__(self, model_dim: int, max_len: int = 128000):
+         '''
+         Initialize the absolute (sinusoidal) positional embedding module.
+
+         Args:
+             model_dim (int): Model dimension.
+             max_len (int, optional): Maximum sequence length. Defaults to 128000.
+         '''
+         super(SinusoidalPositionalEmbedding, self).__init__()
+
+         self.model_dim = model_dim
+         self.max_len = max_len
+
+         pe = torch.zeros(max_len, model_dim)
+
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+
+         div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-math.log(10000.0) / model_dim))
+
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+
+         pe = pe.unsqueeze(0)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         '''
+         Forward pass.
+
+         Args:
+             x (torch.Tensor): Input tensor. Shape: [Batch_Size, Seq_Len, model_dim].
+
+         Returns:
+             torch.Tensor: Tensor with the positional encoding added.
+         '''
+         x = x + self.pe[:, :x.size(1), :]
+         return x
+
+ @register_model()
+ class MRoPEInterleavedEmbedding(BaseBlock):
+     '''
+     Interleaved multimodal rotary positional embedding (MRoPE-Interleave).
+     Supports three position axes (time t, height h, width w); frequency channels are assigned round-robin (thw…thw…thw).
+     '''
+     def __init__(self, model_dim: int, max_len: int = 128000, base: int = 10000, num_axes: int = 3):
+         '''
+         Initialize the MRoPEInterleaved module.
+
+         Args:
+             model_dim (int): Model dimension. Must be even and divisible by num_axes.
+             max_len (int, optional): Maximum sequence length for the precomputed embeddings. Defaults to 128000.
+             base (int, optional): Base for the frequency computation. Defaults to 10000.
+             num_axes (int, optional): Number of position axes (e.g. 3 for time, height, width). Defaults to 3.
+         '''
+         super().__init__()
+         assert model_dim % 2 == 0, 'model_dim must be even'
+         assert model_dim % num_axes == 0, f'model_dim {model_dim} not divisible by num_axes {num_axes}'
+
+         self.model_dim = model_dim
+         self.max_len = max_len
+         self.base = base
+         self.num_axes = num_axes
+
+         inv_freq = 1.0 / (base ** (torch.arange(0, model_dim, 2).float() / model_dim))
+
+         t_range = torch.arange(max_len, dtype=torch.float)
+         freqs = torch.outer(t_range, inv_freq) # [max_len, dim/2]
+
+         emb = torch.cat((freqs, freqs), dim=-1) # [max_len, dim]
+
+         self.register_buffer('cos_cached', emb.cos())
+         self.register_buffer('sin_cached', emb.sin())
+
+         self.register_buffer(
+             'axis_mask',
+             torch.arange(model_dim) % num_axes,
+             persistent=False
+         )
+
+         k = model_dim // num_axes
+         idx = []
+         for p in range(model_dim):
+             j = p % num_axes
+             i = p // num_axes
+             pos_in_old = j * k + i
+             idx.append(pos_in_old)
+
+         self.register_buffer('interleave_idx', torch.tensor(idx, dtype=torch.long), persistent=False)
+
+     def _rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         '''
+         Split the vector into two halves and rotate: [-x2, x1].
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             torch.Tensor: Rotated tensor.
+         '''
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, positions: torch.Tensor = None, start_pos: int = 0) -> torch.Tensor:
+         '''
+         Apply multimodal rotary positional embedding.
+
+         Args:
+             x (torch.Tensor): Input tensor. Shape: [Batch, Seq_Len, Dim] or [Batch, Head, Seq_Len, Head_Dim].
+             positions (torch.Tensor, optional): Position index tensor. Shape: [Batch, Seq_Len] or [Batch, Seq_Len, num_axes].
+                 A 2D tensor is automatically expanded to [Batch, Seq_Len, num_axes].
+                 If None and num_axes=1, linear position indices are created automatically.
+             start_pos (int, optional): Starting position index. Defaults to 0.
+
+         Returns:
+             torch.Tensor: Tensor with positional information applied.
+
+         Raises:
+             ValueError: If positions is None and num_axes > 1.
+         '''
+         ndim = x.ndim
+         seq_len = x.shape[-2]
+         batch_size = x.shape[0]
+
+         if positions is None:
+             if self.num_axes == 1:
+                 positions = torch.arange(0, seq_len, device=x.device, dtype=torch.long)
+             else:
+                 raise ValueError("positions must be provided when num_axes > 1 (e.g. for vision/multimodal inputs)")
+
+         if positions.ndim == 1:
+             positions = positions.unsqueeze(0).unsqueeze(-1).expand(batch_size, -1, self.num_axes)
+
+         if positions.ndim == 2:
+             positions = positions.unsqueeze(-1).expand(-1, -1, self.num_axes)
+
+         if positions.ndim == 3 and positions.shape[-1] == 1:
+             positions = positions.expand(-1, -1, self.num_axes)
+
+         batch_size = positions.shape[0]
+
+         cos_list, sin_list = [], []
+
+         for ax in range(self.num_axes):
+             pos_ax = positions[..., ax]
+             pos_ax = torch.clamp(pos_ax + start_pos, 0, self.max_len - 1).long()
+
+             cos_full = self.cos_cached[pos_ax]
+             sin_full = self.sin_cached[pos_ax]
+
+             mask = (self.axis_mask == ax)
+             cos_ax = cos_full[..., mask]
+             sin_ax = sin_full[..., mask]
+
+             cos_list.append(cos_ax)
+             sin_list.append(sin_ax)
+
+         cos_all = torch.cat(cos_list, dim=-1)
+         sin_all = torch.cat(sin_list, dim=-1)
+
+         cos_all = cos_all[..., self.interleave_idx]
+         sin_all = sin_all[..., self.interleave_idx]
+
+         if ndim == 4:
+             shape = [batch_size, 1, seq_len, -1]
+             cos_all = cos_all.view(*shape)
+             sin_all = sin_all.view(*shape)
+
+         return (x * cos_all) + (self._rotate_half(x) * sin_all)
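Usage note (not part of the diff): a minimal sketch of how these blocks compose, assuming the classes are importable from orbit.model.block.embedding as the file path above suggests. Shapes follow the docstrings: RoPE rotates channels in place and returns the input shape; MRoPE-Interleave expects one (t, h, w) index triple per token.

# Minimal usage sketch (assumption: import path taken from this diff's file layout).
import torch
from orbit.model.block.embedding import RotaryPositionalEmbedding, MRoPEInterleavedEmbedding

rope = RotaryPositionalEmbedding(model_dim=64)
q = torch.randn(2, 8, 16, 64)                    # [Batch, Head, Seq_Len, Head_Dim]
q_rot = rope(q)                                  # same shape, positions 0..15
q_new = rope(q[:, :, -1:, :], start_pos=15)      # KV-cache step: rotate one token at position 15

mrope = MRoPEInterleavedEmbedding(model_dim=96, num_axes=3)
# One (t, h, w) triple per token, e.g. a single frame of a 4x4 patch grid.
h, w = torch.meshgrid(torch.arange(4), torch.arange(4), indexing='ij')
thw = torch.stack([torch.zeros(16, dtype=torch.long), h.reshape(-1), w.reshape(-1)], dim=-1)
x = torch.randn(2, 16, 96)                       # [Batch, Seq_Len, Dim]
x_rot = mrope(x, positions=thw.unsqueeze(0).expand(2, -1, -1))

Because axis_mask assigns channels round-robin (arange(model_dim) % num_axes), each axis sees frequencies spread across the whole spectrum rather than one contiguous band, which is the point of the interleaved variant over a block-wise MRoPE split.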
orbit/model/block/film.py
@@ -0,0 +1,176 @@
+ import torch
+ import torch.nn as nn
+
+ from typing import Optional
+ from dataclasses import dataclass
+
+ from orbit.model import BaseBlock, register_model
+
+ @dataclass
+ class FiLMOutput:
+     ''' Output container for the FiLM module.
+
+     Attributes:
+         output (torch.Tensor): Features modulated by gamma and beta.
+         gate (Optional[torch.Tensor]): Gating values for residual connections.
+     '''
+     output: torch.Tensor
+     gate: Optional[torch.Tensor] = None
+
+     @property
+     def gated_output(self):
+         if self.gate is None: return self.output
+         return self.output * self.gate
+
+
+ @register_model()
+ class FiLM(BaseBlock):
+     ''' Feature-wise Linear Modulation (FiLM) module.
+
+     Applies an affine transform to the input features: FiLM(x) = (1 + gamma(z)) * x + beta(z),
+     where gamma and beta are generated from the conditioning input z.
+     At initialization, gamma and beta are 0, so the module is an identity mapping.
+
+     Args:
+         in_features (int): Input feature dimension.
+         cond_features (int): Conditioning feature dimension.
+         use_beta (bool, optional): Whether to use the shift term (beta). Defaults to True.
+         use_gamma (bool, optional): Whether to use the scale term (gamma). Defaults to True.
+         use_gate (bool, optional): Whether to use the gating term. Defaults to True.
+         use_context_gate (bool, optional): Whether to use a context gate.
+             If True, the gate is generated from the concatenation of the input and conditioning features, overriding the use_gate setting. Defaults to False.
+         channel_first (bool, optional): Whether the feature dimension is dim 1 (as in CNNs, [B, C, H, W]).
+             If False, the features are assumed to be in the last dimension (as in Transformers, [B, L, C]). Defaults to False.
+     '''
+     def __init__(
+         self,
+         in_features: int,
+         cond_features: int,
+         use_beta: bool = True,
+         use_gamma: bool = True,
+         use_gate: bool = True,
+         use_context_gate: bool = False,
+         channel_first: bool = False
+     ):
+         super(FiLM, self).__init__()
+
+         if use_context_gate: use_gate = False
+
+         self.in_features = in_features
+         self.cond_features = cond_features
+         self.use_beta = use_beta
+         self.use_gamma = use_gamma
+         self.use_gate = use_gate
+         self.use_context_gate = use_context_gate
+         self.channel_first = channel_first
+
+         self.out_dim = 0
+         if use_gamma: self.out_dim += in_features
+         if use_beta: self.out_dim += in_features
+         if use_gate: self.out_dim += in_features
+
+         self.gate_proj = nn.Linear(in_features + cond_features, in_features) if use_context_gate else nn.Identity()
+
+         if self.out_dim > 0:
+             self.proj = nn.Linear(cond_features, self.out_dim)
+         else: self.proj = None
+
+         self._init_weights(self)
+
+     def _init_weights(self, model: nn.Module):
+         ''' Initialize weights.
+
+         The projection layer's weights and bias are initialized to 0 to guarantee an identity mapping at initialization.
+         If a context gate is used, its projection is initialized with Xavier uniform.
+
+         Args:
+             model (nn.Module): Model to initialize.
+         '''
+         if model is self and self.proj is not None:
+             nn.init.constant_(self.proj.weight, 0)
+             nn.init.constant_(self.proj.bias, 0)
+         if isinstance(self.gate_proj, nn.Identity): return
+         nn.init.xavier_uniform_(self.gate_proj.weight, gain=0.1)
+         nn.init.zeros_(self.gate_proj.bias)
+
+     def _reshape(self, param: torch.Tensor, ref_ndim: int) -> torch.Tensor:
+         ''' Reshape the parameter tensor to match the input features' dimensions for broadcasting.
+
+         Args:
+             param (torch.Tensor): Parameter tensor to reshape.
+             ref_ndim (int): Number of dimensions of the reference tensor (usually the input features x).
+
+         Returns:
+             torch.Tensor: Reshaped parameter tensor.
+         '''
+         if self.channel_first:
+             param = param.movedim(-1, 1)
+             for _ in range(ref_ndim - param.ndim):
+                 param = param.unsqueeze(-1)
+         else:
+             for _ in range(ref_ndim - param.ndim):
+                 param = param.unsqueeze(-2)
+         return param
+
+     def forward(self, x: torch.Tensor, cond: torch.Tensor) -> FiLMOutput:
+         ''' Forward pass.
+
+         Args:
+             x (torch.Tensor): Input features. Shape [B, C, ...] (if channel_first=True)
+                 or [B, ..., C] (if channel_first=False).
+             cond (torch.Tensor): Conditioning input. Shape [B, ..., cond_features].
+
+         Returns:
+             FiLMOutput: Modulated features.
+         '''
+         if self.proj is None: return FiLMOutput(output=x)
+
+         params = self.proj(cond)
+
+         count = sum([self.use_gamma, self.use_beta, self.use_gate])
+         if count > 1:
+             params_list = params.chunk(count, dim=-1)
+         else:
+             params_list = [params]
+
+         idx = 0
+         gamma, beta, gate = None, None, None
+         if self.use_gamma:
+             gamma = params_list[idx]
+             idx += 1
+         if self.use_beta:
+             beta = params_list[idx]
+             idx += 1
+         if self.use_gate:
+             gate = params_list[idx]
+             idx += 1
+
+         out = x
+         if gamma is not None:
+             out = out * (1 + self._reshape(gamma, x.ndim))
+         if beta is not None:
+             out = out + self._reshape(beta, x.ndim)
+
+         final_gate = None
+         if self.use_context_gate:
+             if cond.ndim < x.ndim:
+                 shape = list(x.shape)
+                 feat_dim = 1 if self.channel_first else -1
+                 shape[feat_dim] = -1
+                 cond_expanded = self._reshape(cond, x.ndim).expand(shape)
+             else:
+                 cond_expanded = cond
+
+             feat_dim = 1 if self.channel_first else -1
+             context_input = torch.cat([x, cond_expanded], dim=feat_dim)
+
+             if self.channel_first:
+                 context_input = context_input.movedim(1, -1)
+                 final_gate = self.gate_proj(context_input).movedim(-1, 1)
+             else:
+                 final_gate = self.gate_proj(context_input)
+
+         elif gate is not None:
+             final_gate = self._reshape(gate, x.ndim)
+
+         return FiLMOutput(output=out, gate=final_gate)
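Usage note (not part of the diff): a minimal sketch of the FiLM block on Transformer-style features, assuming the import path from this diff. Because proj is zero-initialized, params are all zero at the start, so the output equals x and the gate is zero, making a gated residual update a no-op at initialization.

# Minimal usage sketch (assumption: import path taken from this diff's file layout).
import torch
from orbit.model.block.film import FiLM

film = FiLM(in_features=256, cond_features=64)   # gamma, beta and gate all enabled
x = torch.randn(2, 10, 256)                      # [B, L, C] features
cond = torch.randn(2, 64)                        # one conditioning vector per sample

out = film(x, cond)                              # FiLMOutput(output=..., gate=...)
h = x + out.gated_output                         # gated residual; equals x at init (gate == 0)

# CNN-style layout: feature dim at position 1, so set channel_first=True.
film_cnn = FiLM(in_features=32, cond_features=64, channel_first=True)
feat = torch.randn(2, 32, 8, 8)                  # [B, C, H, W]
out_cnn = film_cnn(feat, cond)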