orbit-torch 0.0.4a1__py3-none-any.whl → 0.1.0b1__py3-none-any.whl

Files changed (62)
  1. orbit/__init__.py +3 -1
  2. orbit/callback.py +4 -3
  3. orbit/dataset/__init__.py +1 -0
  4. orbit/dataset/cogn.py +138 -0
  5. orbit/dataset/data/cogn_en.jsonl +45 -0
  6. orbit/dataset/data/cogn_zh.jsonl +113 -0
  7. orbit/engine.py +210 -146
  8. orbit/kit/__init__.py +2 -0
  9. orbit/kit/interface.py +154 -0
  10. orbit/kit/wrapper.py +157 -0
  11. orbit/model/__init__.py +5 -0
  12. orbit/model/base.py +125 -0
  13. orbit/model/block/__init__.py +34 -0
  14. orbit/model/block/attention.py +265 -0
  15. orbit/model/block/bio.py +537 -0
  16. orbit/model/block/codebook.py +122 -0
  17. orbit/model/block/conv.py +505 -0
  18. orbit/model/block/embedding.py +252 -0
  19. orbit/model/block/film.py +176 -0
  20. orbit/model/block/fusion.py +335 -0
  21. orbit/model/block/gate.py +334 -0
  22. orbit/model/block/lora.py +776 -0
  23. orbit/model/block/mlp.py +68 -0
  24. orbit/model/block/moe.py +94 -0
  25. orbit/model/block/tcn.py +99 -0
  26. orbit/model/config.py +62 -0
  27. orbit/model/kit/__init__.py +6 -0
  28. orbit/model/kit/discriminator.py +46 -0
  29. orbit/model/kit/losses.py +193 -0
  30. orbit/model/motif/__init__.py +0 -0
  31. orbit/model/motif/vision/__init__.py +0 -0
  32. orbit/model/motif/vision/v1.py +645 -0
  33. orbit/model/registry.py +53 -0
  34. orbit/optim/__init__.py +2 -2
  35. orbit/optim/sam.py +10 -3
  36. orbit/plugin/__init__.py +12 -8
  37. orbit/plugin/board.py +1 -2
  38. orbit/plugin/checkpoint.py +137 -62
  39. orbit/plugin/classification.py +2 -2
  40. orbit/plugin/display_model.py +1 -2
  41. orbit/plugin/early_stopping.py +1 -2
  42. orbit/plugin/ema.py +1 -2
  43. orbit/plugin/gradient_accumulation.py +1 -2
  44. orbit/plugin/lora.py +346 -0
  45. orbit/plugin/memory_estimator.py +1 -2
  46. orbit/plugin/warmup.py +1 -2
  47. orbit/utils/__init__.py +24 -1
  48. orbit/utils/cuda.py +10 -0
  49. orbit/utils/freeze.py +61 -17
  50. orbit/utils/image.py +164 -0
  51. orbit/utils/initialization.py +184 -94
  52. orbit/utils/layer_io.py +66 -7
  53. orbit/utils/lora.py +480 -0
  54. orbit/utils/moe.py +55 -0
  55. orbit/utils/seed.py +3 -19
  56. orbit/utils/sft.py +93 -0
  57. orbit_torch-0.1.0b1.dist-info/METADATA +208 -0
  58. orbit_torch-0.1.0b1.dist-info/RECORD +65 -0
  59. orbit_torch-0.0.4a1.dist-info/METADATA +0 -25
  60. orbit_torch-0.0.4a1.dist-info/RECORD +0 -29
  61. {orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/WHEEL +0 -0
  62. {orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/top_level.txt +0 -0
orbit/utils/image.py ADDED
@@ -0,0 +1,164 @@
+import torch
+import torch.nn.functional as F
+from typing import Tuple
+from dataclasses import dataclass
+
+@dataclass
+class PatchOutput:
+    output: torch.Tensor
+    mask: torch.Tensor
+    patch_size: Tuple[int, int]
+    num_patches: Tuple[int, int]
+
+def pad_to_patch_size(image: torch.Tensor, patch_size: Tuple[int, int]) -> PatchOutput:
+    '''Pad an image to fit the patch size, without splitting it.
+
+    Takes an image tensor of shape [..., channels, width, height] and
+    zero-pads it on the right and bottom so that the padded size is
+    divisible by patch_size.
+
+    Args:
+        image (torch.Tensor): Input image tensor of shape [..., channels, w, h].
+            The last two dimensions are treated as spatial (width, height).
+        patch_size (Tuple[int, int]): Patch size (a, b), where 'a' corresponds
+            to the width dimension and 'b' to the height dimension.
+
+    Returns:
+        PatchOutput: A dataclass with the fields:
+            - output (torch.Tensor): Padded image tensor of shape [..., channels, w_padded, h_padded].
+            - mask (torch.Tensor): Mask tensor of shape [..., 1, w_padded, h_padded];
+              1 in valid regions, 0 in padded regions.
+            - patch_size (Tuple[int, int]): The input patch size (a, b).
+            - num_patches (Tuple[int, int]): Number of patches along width and height (num_w, num_h).
+
+    Raises:
+        ValueError: If the input image has fewer than 3 dimensions.
+    '''
+    if image.ndim < 3:
+        raise ValueError(f'Input image must have at least 3 dimensions, got {image.ndim}')
+
+    w, h = image.shape[-2], image.shape[-1]
+    a, b = patch_size
+
+    pad_w = (a - w % a) % a
+    pad_h = (b - h % b) % b
+
+    image_padded = F.pad(image, (0, pad_h, 0, pad_w))
+
+    w_padded = w + pad_w
+    h_padded = h + pad_h
+
+    num_w = w_padded // a
+    num_h = h_padded // b
+
+    mask = torch.ones((*image.shape[:-3], 1, w, h), dtype=image.dtype, device=image.device)
+    mask_padded = F.pad(mask, (0, pad_h, 0, pad_w), value=0)
+
+    return PatchOutput(
+        output=image_padded,
+        mask=mask_padded,
+        patch_size=patch_size,
+        num_patches=(num_w, num_h)
+    )
+
+def split_to_patches(image: torch.Tensor, patch_size: Tuple[int, int]) -> PatchOutput:
+    '''Split an image tensor into sub-images, with automatic padding.
+
+    Takes an image tensor of shape [..., channels, width, height] and divides
+    it into patches of shape [channels, patch_width, patch_height]. If the
+    image size is not divisible by patch_size, the image is zero-padded on
+    the right and bottom. The resulting tensor has shape
+    [..., num_patches_total, channels, patch_width, patch_height].
+
+    Args:
+        image (torch.Tensor): Input image tensor of shape [..., channels, w, h].
+            The last two dimensions are treated as spatial (width, height).
+        patch_size (Tuple[int, int]): Patch size (a, b), where 'a' corresponds
+            to the width dimension and 'b' to the height dimension.
+
+    Returns:
+        PatchOutput: A dataclass with the fields:
+            - output (torch.Tensor): Patch tensor of shape [..., num_w * num_h, channels, a, b].
+            - mask (torch.Tensor): Mask tensor of shape [..., num_w * num_h, 1, a, b];
+              1 in valid regions, 0 in padded regions.
+            - patch_size (Tuple[int, int]): The input patch size (a, b).
+            - num_patches (Tuple[int, int]): Number of patches along width and height (num_w, num_h).
+
+    Raises:
+        ValueError: If the input image has fewer than 3 dimensions.
+    '''
+    padded_output = pad_to_patch_size(image, patch_size)
+    image_padded = padded_output.output
+    mask_padded = padded_output.mask
+    num_w, num_h = padded_output.num_patches
+    a, b = patch_size
+
+    def split(x, nw, nh):
+        # x shape: [..., C, W, H]
+        reshaped = x.view(*x.shape[:-2], nw, a, nh, b)
+        # permute to [..., nw, nh, C, a, b]
+        permuted = reshaped.permute(
+            *range(reshaped.ndim - 5),
+            reshaped.ndim - 4,  # nw
+            reshaped.ndim - 2,  # nh
+            reshaped.ndim - 5,  # C
+            reshaped.ndim - 3,  # a
+            reshaped.ndim - 1   # b
+        )
+        return permuted.reshape(*x.shape[:-3], nw * nh, x.shape[-3], a, b)
+
+    output_patches = split(image_padded, num_w, num_h)
+    mask_patches = split(mask_padded, num_w, num_h)
+
+    return PatchOutput(
+        output=output_patches,
+        mask=mask_patches,
+        patch_size=patch_size,
+        num_patches=(num_w, num_h)
+    )
+
+def reconstruct_from_patches(patches: torch.Tensor, num_patches: Tuple[int, int], mask: torch.Tensor = None) -> torch.Tensor:
+    '''Reconstruct an image from patches.
+
+    The inverse of split_to_patches: reassembles a patch tensor into the
+    original image. If a mask is provided, the padding is cropped off
+    according to the mask.
+
+    Args:
+        patches (torch.Tensor): Patch tensor of shape [..., num_patches, channels, patch_width, patch_height].
+        num_patches (Tuple[int, int]): Number of patches along width and height (num_w, num_h).
+        mask (torch.Tensor, optional): Mask for removing padding, with the same
+            shape as patches but a single channel. If provided, the reconstructed
+            image is cropped to the valid region.
+
+    Returns:
+        torch.Tensor: Reconstructed image tensor of shape [..., channels, width, height].
+    '''
+    nw, nh = num_patches
+    if patches.shape[-4] != nw * nh:
+        raise ValueError(f"Number of patches in tensor ({patches.shape[-4]}) does not match num_patches argument ({nw} * {nh} = {nw*nh})")
+
+    a, b = patches.shape[-2], patches.shape[-1]
+
+    def unsplit(x):
+        # x: [..., nw*nh, C, a, b]
+        reshaped = x.view(*x.shape[:-4], nw, nh, *x.shape[-3:])
+        # permute to [..., C, nw, a, nh, b]
+        permuted = reshaped.permute(
+            *range(reshaped.ndim - 5),
+            reshaped.ndim - 3,  # C
+            reshaped.ndim - 5,  # nw
+            reshaped.ndim - 2,  # a
+            reshaped.ndim - 4,  # nh
+            reshaped.ndim - 1   # b
+        )
+        return permuted.reshape(*x.shape[:-4], x.shape[-3], nw * a, nh * b)
+
+    reconstructed = unsplit(patches)
+
+    if mask is not None:
+        reconstructed_mask = unsplit(mask)
+        m = reconstructed_mask.view(-1, reconstructed_mask.shape[-2], reconstructed_mask.shape[-1])
+
+        valid_h = (m[0, 0, :] > 0.5).sum().item()
+        valid_w = (m[0, :, 0] > 0.5).sum().item()
+
+        reconstructed = reconstructed[..., :int(valid_w), :int(valid_h)]
+
+    return reconstructed
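
For orientation, a minimal round trip with the utilities added above (the import path follows the file layout in this diff; the shapes and the 16x16 patch size are illustrative only):

import torch
from orbit.utils.image import split_to_patches, reconstruct_from_patches

image = torch.randn(2, 3, 100, 130)          # [batch, channels, w, h]; neither 100 nor 130 is a multiple of 16
patched = split_to_patches(image, (16, 16))  # zero-pads to 112 x 144, then splits
print(patched.output.shape)                  # torch.Size([2, 63, 3, 16, 16]) -> 7 * 9 = 63 patches
print(patched.num_patches)                   # (7, 9)

# Passing the mask back in crops the zero padding off again
restored = reconstruct_from_patches(patched.output, patched.num_patches, mask=patched.mask)
assert restored.shape == image.shape and torch.allclose(restored, image)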
orbit/utils/initialization.py CHANGED
@@ -4,13 +4,8 @@ import re
 import torch
 import torch.nn as nn
 from torch.nn.init import _calculate_fan_in_and_fan_out
-
-try:
-    from rich.console import Console
-    from rich.table import Table
-    RICH_AVAILABLE = True
-except ImportError:
-    RICH_AVAILABLE = False
+from rich.console import Console
+from rich.table import Table
 
 def _no_grad_trunc_normal_(tensor, mean, std, a, b):
     '''Helper for truncated-normal initialization; runs in no-grad mode.
@@ -68,11 +63,63 @@ def constant_init(module, val, bias=0):
         val (float): Constant value for the weight.
         bias (float): Constant value for the bias.
     '''
+    if isinstance(module, (nn.Parameter, torch.Tensor)):
+        nn.init.constant_(module, val)
+        return
+
     if hasattr(module, 'weight') and module.weight is not None:
         nn.init.constant_(module.weight, val)
     if hasattr(module, 'bias') and module.bias is not None:
         nn.init.constant_(module.bias, bias)
 
+def _init_tensor_impl(tensor, method, distribution, a, mode, nonlinearity, gain, std, trunc_a, trunc_b):
+    '''Internal: apply a single initialization method to one tensor.'''
+    info = ""
+    if method == 'kaiming':
+        if distribution == 'uniform':
+            nn.init.kaiming_uniform_(
+                tensor, a=a, mode=mode, nonlinearity=nonlinearity)
+        else:
+            nn.init.kaiming_normal_(
+                tensor, a=a, mode=mode, nonlinearity=nonlinearity)
+        info = f'Kaiming ({distribution}), mode={mode}, nonlin={nonlinearity}'
+
+    elif method == 'xavier':
+        if distribution == 'uniform':
+            nn.init.xavier_uniform_(tensor, gain=gain)
+        else:
+            nn.init.xavier_normal_(tensor, gain=gain)
+        info = f'Xavier ({distribution}), gain={gain}'
+
+    elif method == 'c2_xavier':
+        fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+        c2_std = math.sqrt(1.0 / float(fan_in))
+        nn.init.normal_(tensor, mean=0.0, std=c2_std)
+        info = f'C2 Xavier (Normal), std={c2_std:.4f}'
+
+    elif method == 'orthogonal':
+        nn.init.orthogonal_(tensor, gain=gain)
+        info = f'Orthogonal, gain={gain}'
+
+    elif method == 'trunc_normal':
+        trunc_normal_(
+            tensor, mean=0., std=std, a=trunc_a, b=trunc_b)
+        info = f'Trunc Normal, std={std}, a={trunc_a}, b={trunc_b}'
+
+    elif method == 'normal':
+        nn.init.normal_(tensor, mean=0., std=std)
+        info = f'Normal, std={std}'
+
+    elif method == 'constant':
+        nn.init.constant_(tensor, val=gain)
+        info = f'Constant, val={gain}'
+
+    else:
+        nn.init.xavier_uniform_(tensor, gain=gain)
+        info = f'Xavier (Uniform) [Default], gain={gain}'
+
+    return info
+
 def init_weights(module, method='kaiming', distribution='normal', bias=0,
                  a=0, mode='fan_out', nonlinearity='relu',
                  gain=1,
@@ -98,45 +145,56 @@ def init_weights(module, method='kaiming', distribution='normal', bias=0,
         std (float): Standard deviation for Normal/Truncated Normal.
         trunc_a (float): Lower bound for Truncated Normal.
         trunc_b (float): Upper bound for Truncated Normal.
+
+    Returns:
+        str: Initialization details string, or None if no initialization was performed.
     '''
-    if hasattr(module, 'weight') and module.weight is not None:
-        if method == 'kaiming':
-            if distribution == 'uniform':
-                nn.init.kaiming_uniform_(
-                    module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
-            else:
-                nn.init.kaiming_normal_(
-                    module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
-
-        elif method == 'xavier':
-            if distribution == 'uniform':
-                nn.init.xavier_uniform_(module.weight, gain=gain)
-            else:
-                nn.init.xavier_normal_(module.weight, gain=gain)
-
-        elif method == 'c2_xavier':
-            fan_in, fan_out = _calculate_fan_in_and_fan_out(module.weight)
-            c2_std = math.sqrt(1.0 / float(fan_in))
-            nn.init.normal_(module.weight, mean=0.0, std=c2_std)
-
-        elif method == 'orthogonal':
-            nn.init.orthogonal_(module.weight, gain=gain)
-
-        elif method == 'trunc_normal':
-            trunc_normal_(
-                module.weight, mean=0., std=std, a=trunc_a, b=trunc_b)
-
-        elif method == 'normal':
-            nn.init.normal_(module.weight, mean=0., std=std)
-
-        elif method == 'constant':
-            nn.init.constant_(module.weight, val=gain)
-
-        else:
-            nn.init.xavier_uniform_(module.weight, gain=gain)
-
-    if hasattr(module, 'bias') and module.bias is not None:
-        nn.init.constant_(module.bias, bias)
+    # 1. Handle a bare Parameter/Tensor directly
+    if isinstance(module, (nn.Parameter, torch.Tensor)):
+        return _init_tensor_impl(module, method, distribution, a, mode, nonlinearity, gain, std, trunc_a, trunc_b)
+
+    # 2. Handle a Module
+    info_parts = []
+    handled_params = set()
+
+    def init_and_record(tensor, name, is_bias=False):
+        if id(tensor) in handled_params:
+            return
+
+        if is_bias:
+            nn.init.constant_(tensor, bias)
+            # Keep the report compact: a standard bias of 0 hardly needs detail, but keep it for clarity
+            info = f"bias={bias}" if name == 'bias' else f"{name}: Constant({bias})"
+        else:
+            info = _init_tensor_impl(tensor, method, distribution, a, mode, nonlinearity, gain, std, trunc_a, trunc_b)
+            if name != 'weight':
+                info = f"{name}: {info}"
+
+        info_parts.append(info)
+        handled_params.add(id(tensor))
+
+    # A. Handle the standard 'weight' attribute first
+    if hasattr(module, 'weight') and module.weight is not None:
+        init_and_record(module.weight, 'weight', is_bias=False)
+
+    # B. Then the standard 'bias' attribute
+    if hasattr(module, 'bias') and module.bias is not None:
+        init_and_record(module.bias, 'bias', is_bias=True)
+
+    # C. Iterate over all registered parameters (handles custom names)
+    # recurse=False ensures only this module's direct parameters are handled
+    for name, param in module.named_parameters(recurse=False):
+        if id(param) in handled_params:
+            continue
+
+        # Heuristic: ndim < 2 counts as a bias-like parameter, anything else as weight-like
+        is_bias_like = param.ndim < 2
+        init_and_record(param, name, is_bias=is_bias_like)
+
+    if not info_parts:
+        return None
+
+    return ", ".join(info_parts)
 
 def init_layer_norm(module, weight=1.0, bias=0.0):
     '''Initialize a LayerNorm or GroupNorm module.
@@ -145,11 +203,21 @@ def init_layer_norm(module, weight=1.0, bias=0.0):
         module (nn.Module): The normalization module.
         weight (float): Initial value for the weight (gamma).
         bias (float): Initial value for the bias (beta).
+
+    Returns:
+        str: Initialization details string.
     '''
+    initialized = False
     if hasattr(module, 'weight') and module.weight is not None:
         nn.init.constant_(module.weight, weight)
+        initialized = True
     if hasattr(module, 'bias') and module.bias is not None:
         nn.init.constant_(module.bias, bias)
+        initialized = True
+
+    if initialized:
+        return f'Norm (w={weight}, b={bias})'
+    return None
 
 def init_embedding(module, init_method='normal', std=0.02, a=0., b=1., padding_idx=None):
     '''Initialize an Embedding layer.
@@ -161,17 +229,32 @@ def init_embedding(module, init_method='normal', std=0.02, a=0., b=1., padding_idx=None):
         a (float): Lower bound for the Uniform or Truncated Normal distribution.
         b (float): Upper bound for the Uniform or Truncated Normal distribution.
         padding_idx (int, optional): If given, the weight at the padding index is initialized to 0.
+
+    Returns:
+        str: Initialization details string.
     '''
-    if hasattr(module, 'weight') and module.weight is not None:
-        if init_method == 'normal':
-            nn.init.normal_(module.weight, mean=0., std=std)
-        elif init_method == 'trunc_normal':
-            trunc_normal_(module.weight, mean=0., std=std, a=a, b=b)
-        elif init_method == 'uniform':
-            nn.init.uniform_(module.weight, a=a, b=b)
+    if not (hasattr(module, 'weight') and module.weight is not None):
+        return None
+
+    info = ""
+    if init_method == 'normal':
+        nn.init.normal_(module.weight, mean=0., std=std)
+        info = f'Normal (std={std})'
+    elif init_method == 'trunc_normal':
+        trunc_normal_(module.weight, mean=0., std=std, a=a, b=b)
+        info = f'Trunc Normal (std={std}, [{a}, {b}])'
+    elif init_method == 'uniform':
+        nn.init.uniform_(module.weight, a=a, b=b)
+        info = f'Uniform ([{a}, {b}])'
+    else:
+        nn.init.normal_(module.weight, mean=0., std=std)
+        info = f'Normal (std={std})'
 
     if padding_idx is not None:
         module.weight.data[padding_idx].zero_()
+        info += f', pad_idx={padding_idx}'
+
+    return info
 
 def init_weights_transformer(model, n_layer=None, initializer_range=0.02,
                              residual_proj_names=('linear_out', 'fc2', 'c_proj'),
@@ -277,6 +360,25 @@ class WeightInitializer:
         '''
         init_info = []
 
+        # Handle a single Parameter/Tensor
+        if isinstance(model, (nn.Parameter, torch.Tensor)):
+            info = init_weights(
+                model,
+                method=self.method,
+                distribution=self.distribution,
+                bias=self.init_bias,
+                mode=self.mode,
+                nonlinearity=self.nonlinearity,
+                std=self.std,
+                trunc_a=self.trunc_a,
+                trunc_b=self.trunc_b
+            )
+            if info:
+                init_info.append(('Parameter/Tensor', type(model).__name__, info))
+            if verbose:
+                _print_init_info(init_info)
+            return
+
         for name, module in model.named_modules():
             current_config = {}
             if override:
@@ -291,8 +393,19 @@
             nonlinearity = current_config.get('nonlinearity', self.nonlinearity)
             std = current_config.get('std', self.std)
 
-            if isinstance(module, (nn.Conv2d, nn.Conv1d, nn.Conv3d, nn.Linear)):
-                init_weights(
+            info = None
+
+            if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm, nn.BatchNorm1d)):
+                info = init_layer_norm(
+                    module, weight=self.init_norm_weight, bias=self.init_norm_bias)
+
+            elif isinstance(module, nn.Embedding):
+                emb_method = method if method in ['normal', 'trunc_normal', 'uniform'] else 'normal'
+                info = init_embedding(module, init_method=emb_method, std=std)
+
+            else:
+                # Try generic initialization (Linear, Conv, or any other layer with weight/bias)
+                info = init_weights(
                     module,
                     method=method,
                     distribution=distribution,
@@ -303,31 +416,15 @@
                     trunc_a=self.trunc_a,
                     trunc_b=self.trunc_b
                 )
-                info = f'{method} ({distribution})'
-                if method == 'kaiming':
-                    info += f', mode={mode}, nonlin={nonlinearity}'
-                elif method == 'normal':
-                    info += f', std={std}'
-                init_info.append((name, module.__class__.__name__, info))
-
-            elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm, nn.BatchNorm1d)):
-                init_layer_norm(
-                    module, weight=self.init_norm_weight, bias=self.init_norm_bias)
-                init_info.append((name, module.__class__.__name__, 'Norm (1/0)'))
 
-            elif isinstance(module, nn.Embedding):
-                init_embedding(
-                    module,
-                    init_method=method if method in ['normal', 'trunc_normal', 'uniform'] else 'normal',
-                    std=std
-                )
-                init_info.append((name, 'Embedding', f'{method} (std={std})'))
+            if info:
+                init_info.append((name, module.__class__.__name__, info))
 
         if verbose:
             _print_init_info(init_info)
 
 def _print_init_info(init_info):
-    '''Helper for printing initialization info, with optional rich formatting.
+    '''Helper for printing initialization info, formatted with rich.
 
     Args:
         init_info (list): A list of (layer_name, module_type, details) tuples.
@@ -335,22 +432,16 @@ def _print_init_info(init_info):
     if not init_info:
         return
 
-    if RICH_AVAILABLE:
-        console = Console()
-        table = Table(title="Weight Initialization Report", show_header=True, header_style="bold magenta")
-        table.add_column("Layer Name", style="cyan")
-        table.add_column("Module Type", style="green")
-        table.add_column("Initialization Details", style="yellow")
+    console = Console()
+    table = Table(title="Weight Initialization Report", show_header=True, header_style="bold magenta")
+    table.add_column("Layer Name", style="cyan")
+    table.add_column("Module Type", style="green")
+    table.add_column("Initialization Details", style="yellow")
 
-        for name, type_name, details in init_info:
-            table.add_row(name, type_name, details)
-
-        console.print(table)
-    else:
-        print(f"{'Layer Name':<40} | {'Module Type':<20} | {'Initialization Details'}")
-        print("-" * 90)
-        for name, type_name, details in init_info:
-            print(f"{name:<40} | {type_name:<20} | {details}")
+    for name, type_name, details in init_info:
+        table.add_row(str(name), str(type_name), str(details))
+
+    console.print(table)
 
 def initialize_weights(model, method='kaiming', override=None, verbose=False, **kwargs):
     '''Convenience function for initializing model weights.
@@ -398,6 +489,9 @@ class AutoInitializer:
             'transformer_detected': False
         }
 
+        if isinstance(self.model, (nn.Parameter, torch.Tensor)):
+            return stats
+
         # Rough depth estimate: count the layers that hold parameters
        param_layers = [m for m in self.model.modules() if isinstance(m, (nn.Linear, nn.Conv2d, nn.Conv1d))]
        stats['depth'] = len(param_layers)
@@ -471,16 +565,12 @@
         method, nonlinearity, override = self.recommend_config()
 
         if verbose:
-            if RICH_AVAILABLE:
-                console = Console()
-                console.print(f"[bold cyan]Auto Initialization Analysis:[/bold cyan]")
-                console.print(f"  Depth: {self.stats['depth']}")
-                console.print(f"  Activations: {self.stats['activations']}")
-                console.print(f"  Transformer Detected: {self.stats['transformer_detected']}")
-                console.print(f"[bold green]Recommended Strategy:[/bold green] {method} (nonlin={nonlinearity})")
-            else:
-                print(f"Auto Init: Depth={self.stats['depth']}, Acts={self.stats['activations']}")
-                print(f"Strategy: {method}, {nonlinearity}")
+            console = Console()
+            console.print(f"[bold cyan]Auto Initialization Analysis:[/bold cyan]")
+            console.print(f"  Depth: {self.stats['depth']}")
+            console.print(f"  Activations: {self.stats['activations']}")
+            console.print(f"  Transformer Detected: {self.stats['transformer_detected']}")
+            console.print(f"[bold green]Recommended Strategy:[/bold green] {method} (nonlin={nonlinearity})")
 
         initialize_weights(
             self.model,
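
A short usage sketch of the reworked initialization path. Only the signatures visible in the hunks above are assumed (initialize_weights, plus the new Parameter/Tensor support and returned details string in init_weights); the toy model is hypothetical:

import torch
import torch.nn as nn
from orbit.utils.initialization import initialize_weights, init_weights

model = nn.Sequential(
    nn.Linear(64, 128),   # handled by the generic init_weights path
    nn.LayerNorm(128),    # routed to init_layer_norm
    nn.Linear(128, 10),
)

# Walks named_modules() and, with verbose=True, prints the rich
# "Weight Initialization Report" table built by _print_init_info
initialize_weights(model, method='kaiming', verbose=True)

# init_weights now also accepts a bare Parameter and reports what it did
info = init_weights(nn.Parameter(torch.empty(128, 64)),
                    method='xavier', distribution='uniform')
print(info)  # "Xavier (uniform), gain=1"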
orbit/utils/layer_io.py CHANGED
@@ -2,7 +2,10 @@ import torch
 import torch.nn as nn
 from typing import Union
 
-def get_module_by_name(model: nn.Module, name: str) -> nn.Module:
+from safetensors.torch import save_file as safe_save_file
+from safetensors.torch import load_file as safe_load_file
+
+def get_model_by_name(model: nn.Module, name: str) -> nn.Module:
     '''Get a submodule of a model by name.
 
     Args:
@@ -23,7 +26,7 @@ def get_module_by_name(model: nn.Module, name: str) -> nn.Module:
         module = getattr(module, n)
     return module
 
-def save_layer_weights(model: nn.Module, layer_name: str, file_path: str) -> None:
+def save_layer(model: nn.Module, layer_name: str, file_path: str) -> None:
     '''Save the weights of a named layer to a file.
 
     Args:
@@ -31,10 +34,13 @@ def save_layer_weights(model: nn.Module, layer_name: str, file_path: str) -> None:
         layer_name (str): Name of the layer whose weights to save.
         file_path (str): Path to save to.
     '''
-    module = get_module_by_name(model, layer_name)
-    torch.save(module.state_dict(), file_path)
+    module = get_model_by_name(model, layer_name)
+    if file_path.endswith('.safetensors'):
+        safe_save_file(module.state_dict(), file_path)
+    else:
+        torch.save(module.state_dict(), file_path)
 
-def load_layer_weights(
+def load_layer(
     model: nn.Module,
     layer_name: str,
     file_path: str,
@@ -50,6 +56,59 @@ def load_layer_weights(
         strict (bool): Whether to strictly match keys. Defaults to True.
         map_location (str or torch.device): Device to load onto. Defaults to 'cpu'.
     '''
-    state_dict = torch.load(file_path, map_location=map_location)
-    module = get_module_by_name(model, layer_name)
+    if file_path.endswith('.safetensors'):
+        state_dict = safe_load_file(file_path, device=str(map_location))
+    else:
+        state_dict = torch.load(file_path, map_location=map_location)
+
+    if isinstance(state_dict, dict) and 'model_state_dict' in state_dict:
+        state_dict = state_dict['model_state_dict']
+
+    module = get_model_by_name(model, layer_name)
+
+    prefix = layer_name + '.'
+    if any(k.startswith(prefix) for k in state_dict.keys()):
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            if k.startswith(prefix):
+                new_key = k[len(prefix):]
+                new_state_dict[new_key] = v
+        state_dict = new_state_dict
+
     module.load_state_dict(state_dict, strict=strict)
+
+def save_model(model: nn.Module, file_path: str) -> None:
+    '''Save the weights of an entire model to a file.
+
+    Args:
+        model (nn.Module): The target model.
+        file_path (str): Path to save to.
+    '''
+    if file_path.endswith('.safetensors'):
+        safe_save_file(model.state_dict(), file_path)
+    else:
+        torch.save(model.state_dict(), file_path)
+
+def load_model(
+    model: nn.Module,
+    file_path: str,
+    strict: bool = True,
+    map_location: Union[str, torch.device] = 'cpu'
+) -> None:
+    '''Load weights from a file into an entire model.
+
+    Args:
+        model (nn.Module): The target model.
+        file_path (str): Path to the weights file.
+        strict (bool): Whether to strictly match keys. Defaults to True.
+        map_location (str or torch.device): Device to load onto. Defaults to 'cpu'.
+    '''
+    if file_path.endswith('.safetensors'):
+        state_dict = safe_load_file(file_path, device=str(map_location))
+    else:
+        state_dict = torch.load(file_path, map_location=map_location)
+
+    if isinstance(state_dict, dict) and 'model_state_dict' in state_dict:
+        state_dict = state_dict['model_state_dict']
+
+    model.load_state_dict(state_dict, strict=strict)
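
A sketch of the renamed I/O helpers, using only the functions defined in this file; the model and file names are hypothetical. The extension picks the backend: '.safetensors' goes through safetensors, anything else through torch.save/torch.load:

import torch.nn as nn
from orbit.utils.layer_io import save_layer, load_layer, save_model, load_model

model = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 4))

# Whole-model round trip in safetensors format
save_model(model, 'model.safetensors')
load_model(model, 'model.safetensors')

# Layer-level round trip; '0' names the first Linear inside the Sequential
save_layer(model, '0', 'layer0.safetensors')
load_layer(model, '0', 'layer0.safetensors')

Note that load_layer also accepts a full-model checkpoint and strips the 'layer_name.' prefix from matching keys before loading, so a single layer can be restored straight from a whole-model file.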