orbit-torch 0.0.4a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orbit/__init__.py +3 -0
- orbit/callback.py +54 -0
- orbit/engine.py +802 -0
- orbit/optim/__init__.py +2 -0
- orbit/optim/muon.py +193 -0
- orbit/optim/sam.py +92 -0
- orbit/plugin/__init__.py +10 -0
- orbit/plugin/board.py +61 -0
- orbit/plugin/checkpoint.py +245 -0
- orbit/plugin/classification.py +190 -0
- orbit/plugin/data/mentor_i18n.json +102 -0
- orbit/plugin/display_model.py +75 -0
- orbit/plugin/early_stopping.py +101 -0
- orbit/plugin/ema.py +97 -0
- orbit/plugin/gradient_accumulation.py +32 -0
- orbit/plugin/memory_estimator.py +234 -0
- orbit/plugin/mentor.py +313 -0
- orbit/plugin/overfit.py +30 -0
- orbit/plugin/warmup.py +119 -0
- orbit/utils/__init__.py +29 -0
- orbit/utils/freeze.py +59 -0
- orbit/utils/initialization.py +501 -0
- orbit/utils/layer_io.py +55 -0
- orbit/utils/mask.py +92 -0
- orbit/utils/seed.py +66 -0
- orbit_torch-0.0.4a1.dist-info/METADATA +25 -0
- orbit_torch-0.0.4a1.dist-info/RECORD +29 -0
- orbit_torch-0.0.4a1.dist-info/WHEEL +5 -0
- orbit_torch-0.0.4a1.dist-info/top_level.txt +1 -0
orbit/plugin/classification.py
ADDED
@@ -0,0 +1,190 @@
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from rich.table import Table
from typing import List, Optional, TYPE_CHECKING
import rich.box as box

from orbit.callback import Callback, Event

if TYPE_CHECKING:
    from ..engine import Engine

class ClassificationReport(Callback):
    def __init__(
        self,
        num_classes: int,
        class_names: Optional[List[str]] = None,
        top_k: int = 1,
        cm_cmap: str = 'Blues'
    ):
        """
        Evaluation and visualization callback dedicated to classification tasks.

        Args:
            num_classes (int): Total number of classes.
            class_names (List[str]): Optional list of class names, e.g. ["Cat", "Dog", ...].
            top_k (int): Additionally compute Top-K accuracy.
            cm_cmap (str): Colormap for the confusion-matrix heatmap.
        """
        super().__init__()
        self.num_classes = num_classes
        self.class_names = class_names if class_names else [str(i) for i in range(num_classes)]
        self.top_k = top_k
        self.cm_cmap = cm_cmap

        # Buffers for predictions collected during evaluation
        self.preds = []
        self.targets = []

    def on_eval_start(self, event: Event):
        """Clear the buffers before each validation round."""
        self.preds = []
        self.targets = []

    def on_batch_end(self, event: Event):
        """Collect predictions during the evaluation phase."""
        engine = event.engine
        if engine.state == "EVAL":
            # Assumes engine.output holds logits [Batch, NumClasses]
            # and engine.target holds labels [Batch].

            # Keep the raw outputs (needed for Top-K); the argmax for the
            # confusion matrix is taken later. Tensors are stored on the CPU
            # to save GPU memory.
            self.preds.append(engine.output.detach().cpu())
            self.targets.append(engine.target.detach().cpu())

    def on_eval_end(self, event: Event):
        """Compute metrics and draw plots once validation ends."""
        if not self.preds:
            return
        engine = event.engine

        # 1. Concatenate all batches
        all_logits = torch.cat(self.preds)     # [N, C]
        all_targets = torch.cat(self.targets)  # [N]

        # Convert to predicted class indices [N]
        all_preds_idx = all_logits.argmax(dim=1)

        # Convert to numpy for sklearn
        y_true = all_targets.numpy()
        y_pred = all_preds_idx.numpy()

        # --- A. Basic accuracy, stored into the engine metrics ---
        acc = accuracy_score(y_true, y_pred)
        engine.metrics['val_acc'] = acc

        # --- Top-K accuracy ---
        topk_acc = None
        if self.top_k > 1:
            _, indices = all_logits.topk(self.top_k, dim=1)
            correct = indices.eq(all_targets.view(-1, 1).expand_as(indices))
            topk_acc = correct.sum().item() / len(all_targets)
            engine.metrics[f'val_acc_top{self.top_k}'] = topk_acc

        # --- B. Print the classification report to the console ---
        report = classification_report(
            y_true, y_pred,
            target_names=self.class_names,
            output_dict=True,
            zero_division=0
        )
        self._print_rich_table(engine, report, acc, topk_acc)

        # --- C. Draw the confusion matrix ---
        # Only plot when a TensorBoard writer is attached
        if hasattr(engine, 'writer') and engine.writer is not None:
            fig = self._plot_confusion_matrix(y_true, y_pred)
            engine.writer.add_figure("Eval/Confusion_Matrix", fig, global_step=engine.epoch)
            plt.close(fig)  # Close the figure to release memory

    def _print_rich_table(self, engine, report: dict, acc: float, topk_acc: Optional[float] = None):
        """Print a formatted classification report with Rich."""
        table = Table(title=f"[bold]Evaluation Report (Ep {engine.epoch+1})[/]", box=box.HORIZONTALS)
        table.add_column("Class", style="cyan")
        table.add_column("Precision", justify="right")
        table.add_column("Recall", justify="right")
        table.add_column("F1-Score", justify="right")

        # Cap the number of rows shown to avoid flooding the console
        max_display = 20
        items_to_show = []

        if len(self.class_names) > max_display:
            items_to_show.extend(self.class_names[:10])
            items_to_show.append(None)  # None marks an ellipsis row
            items_to_show.extend(self.class_names[-10:])
        else:
            items_to_show = self.class_names

        for class_name in items_to_show:
            if class_name is None:
                table.add_row("...", "...", "...", "...")
                continue

            if class_name in report:
                row = report[class_name]
                table.add_row(
                    class_name,
                    f"{row['precision']:.3f}",
                    f"{row['recall']:.3f}",
                    f"{row['f1-score']:.3f}",
                )

        avg = report['weighted avg']
        table.add_row(
            "[bold]Weighted Avg[/]",
            f"[bold]{avg['precision']:.3f}[/]",
            f"[bold]{avg['recall']:.3f}[/]",
            f"[bold]{avg['f1-score']:.3f}[/]",
            end_section=True
        )

        with engine.out_logs:
            engine.print(table)

        engine.print(f"Accuracy: [green]{acc*100:.2f}%[/]", plugin='ClassReport')
        if topk_acc is not None:
            engine.print(f"Top-{self.top_k} Accuracy: [green]{topk_acc*100:.2f}%[/]", plugin='ClassReport')

    def _plot_confusion_matrix(self, y_true, y_pred):
        """Draw the confusion matrix with Seaborn."""
        cm = confusion_matrix(y_true, y_pred)

        num_classes = len(self.class_names)

        # Scale the figure dynamically: base size 10, grow a little per
        # class, and cap the size so large label sets stay manageable.
        fig_base = 10
        fig_scale = 0.3
        figsize_dim = max(fig_base, min(50, num_classes * fig_scale))

        # Create the figure
        fig, ax = plt.subplots(figsize=(figsize_dim, figsize_dim))

        # Only annotate cell values when the matrix is small enough to read
        do_annot = True
        if num_classes > 20:
            do_annot = False

        sns.heatmap(
            cm,
            annot=do_annot,
            fmt='d',
            cmap=self.cm_cmap,
            xticklabels=self.class_names,
            yticklabels=self.class_names,
            ax=ax,
            square=True
        )

        # Adjust tick-label styling for large matrices
        if num_classes > 20:
            plt.xticks(rotation=90, fontsize=8)
            plt.yticks(rotation=0, fontsize=8)

        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'Confusion Matrix ({num_classes} classes)')
        plt.tight_layout()
        return fig
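The Top-K branch above broadcasts the target column against the [N, K] matrix of top-scoring class indices and counts the rows where the true label appears. A standalone sketch of the same computation on made-up tensors:

import torch

# Toy logits for 4 samples over 3 classes, plus their true labels.
logits = torch.tensor([[2.0, 1.0, 0.1],
                       [0.2, 3.0, 0.5],
                       [1.5, 0.3, 1.4],
                       [0.1, 0.2, 4.0]])
targets = torch.tensor([1, 1, 0, 2])

# Same scheme as ClassificationReport: take the top-2 indices per row,
# then test whether the true label appears among them.
_, indices = logits.topk(2, dim=1)                            # [4, 2]
correct = indices.eq(targets.view(-1, 1).expand_as(indices))  # [4, 2] bool
top2_acc = correct.sum().item() / len(targets)
print(f"Top-2 accuracy: {top2_acc:.2f}")  # 1.00: every label is in the top 2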
orbit/plugin/data/mentor_i18n.json
ADDED
@@ -0,0 +1,102 @@
{
    "en": {
        "nan_loss_title": "Loss is NaN or Infinity!",
        "nan_loss_msg": "Gradient explosion detected. Try:\n1. Lower learning rate significantly.\n2. Enable gradient clipping (engine.grad_clip_norm).\n3. Check your data for anomalies.",

        "divergence_title": "⚠️ Loss Divergence/Instability Detected",
        "divergence_msg_unstable": "Train loss has increased for {count} consecutive epochs ({loss:.4f}).",
        "divergence_msg_spike": "Train loss has spiked significantly ({loss:.4f} vs min {min_loss:.4f}).",

        "stagnation_title": "🛑 Loss Stagnation Detected",
        "stagnation_msg": "Train loss hasn't improved for {patience} epochs.",

        "advice_add_warmup": "- Add Warmup: You are not using Warmup. Unstable gradients at start can cause divergence.",
        "advice_lower_lr": "- Lower LR: Try reducing your initial learning rate.",
        "advice_warmup_start_lr": "- Warmup Start LR: Your warmup starting learning rate might be too high.",
        "advice_post_warmup_lr": "- Post-Warmup LR: The learning rate after warmup might be too high.",

        "advice_small_bs": "- Small Batch Size: Effective Batch Size is {eff_bs}. Small batches have noisy gradients.",
        "advice_increase_accum": " -> Try increasing `accumulation_steps` (currently {accum_steps}) or batch size.",

        "advice_grad_clip": "- Gradient Clipping: Enable `grad_clip_norm` in Engine if not already used.",

        "advice_warmup_duration": "- Warmup Duration: You are in warmup (Epoch {epoch}).",
        "advice_warmup_too_long": " -> Your warmup is {warmup_epochs} epochs (>20% of total). Consider shortening it.",
        "advice_check_start_lr": "- Start LR: Check if your warmup start LR is too small.",

        "advice_lr_general": "- Learning Rate: LR might be too small (slow convergence) or too large (bouncing).",
        "advice_add_scheduler": "- Add Scheduler: You are not using an LR Scheduler. Dynamic LR reduction often breaks stagnation.",
        "advice_check_scheduler": "- Scheduler: Check if your scheduler decayed LR too early or too aggressively.",

        "advice_large_bs": "- Large Batch Size: Effective BS is {eff_bs}. Large batches generalize worse.",
        "advice_reduce_bs": " -> Try increasing LR (Linear Scaling Rule) or reducing batch size.",

        "advice_check_init": "- Bad Initialization: Weights might be initialized poorly. Try Xavier or Kaiming initialization.",
        "advice_overfit_single_batch": "- Debugging: Try overfitting a single batch. If the model can't learn even one batch, there's a bug in the model or data pipeline.",
        "advice_data_hard": "- Data Complexity: The data might be too hard or noisy. Check labels and input features.",

        "overfitting_title": "📉 Overfitting Detected",
        "overfitting_msg": "Validation loss is rising while training loss decreases.",
        "advice_regularization": "- Regularization: Add Dropout or Weight Decay (L2 regularization).",
        "advice_data_aug": "- Data Augmentation: Increase data augmentation to improve generalization.",
        "advice_early_stopping": "- Early Stopping: Consider stopping training now to prevent further degradation.",

        "oscillation_title": "〰️ Loss Oscillation Detected",
        "oscillation_msg": "Training loss is oscillating significantly (Std Dev: {std:.4f}).",
        "advice_lower_lr_oscillation": "- Lower LR: The learning rate is likely too high, causing the model to overshoot minima.",
        "advice_oscillation_scheduler": "- Add Scheduler: Dynamic LR reduction helps stabilize training when loss oscillates.",
        "advice_oscillation_grad_clip": "- Gradient Clipping: Enable `grad_clip_norm` to prevent large gradient updates from destabilizing the model.",

        "mentor_watching": "[dim]Mentor watching: Eff. Batch Size={eff_bs} (BS={bs} * Accum={accum})[/]"
    },
    "zh": {
        "nan_loss_title": "Loss 变为 NaN 或无穷大!",
        "nan_loss_msg": "检测到梯度爆炸。尝试:\n1. 显著降低学习率。\n2. 启用梯度裁剪 (engine.grad_clip_norm)。\n3. 检查数据是否存在异常。",

        "divergence_title": "⚠️ 检测到 Loss 发散/不稳定",
        "divergence_msg_unstable": "训练 Loss 已连续 {count} 个 Epoch 上升 ({loss:.4f})。",
        "divergence_msg_spike": "训练 Loss 显著飙升 ({loss:.4f} vs 最小值 {min_loss:.4f})。",

        "stagnation_title": "🛑 检测到 Loss 停滞",
        "stagnation_msg": "训练 Loss 已连续 {patience} 个 Epoch 未改善。",

        "advice_add_warmup": "- 添加预热 (Warmup): 您未使用 Warmup。初始阶段的不稳定梯度可能导致发散。",
        "advice_lower_lr": "- 降低 LR: 尝试降低初始学习率。",
        "advice_warmup_start_lr": "- Warmup 初始 LR: 您的 Warmup 起始学习率可能过高。",
        "advice_post_warmup_lr": "- Warmup 后 LR: Warmup 结束后的学习率可能过高。",

        "advice_small_bs": "- Batch Size 过小: 有效 Batch Size 为 {eff_bs}。小批量会导致梯度噪声大。",
        "advice_increase_accum": " -> 尝试增加 `accumulation_steps` (当前为 {accum_steps}) 或 Batch Size。",

        "advice_grad_clip": "- 梯度裁剪: 如果尚未启用,请在 Engine 中启用 `grad_clip_norm`。",

        "advice_warmup_duration": "- Warmup 持续时间: 当前处于 Warmup 阶段 (Epoch {epoch})。",
        "advice_warmup_too_long": " -> 您的 Warmup 长达 {warmup_epochs} 个 Epoch (>总数的 20%)。考虑缩短它。",
        "advice_check_start_lr": "- 初始 LR: 检查 Warmup 起始 LR 是否过小。",

        "advice_lr_general": "- 学习率: LR 可能太小 (收敛慢) 或太大 (在最小值附近震荡)。",
        "advice_add_scheduler": "- 添加调度器: 您未使用 LR Scheduler。动态降低 LR 通常能打破停滞。",
        "advice_check_scheduler": "- 检查调度器: 检查 Scheduler 是否过早或过激进地降低了 LR。",

        "advice_large_bs": "- Batch Size 过大: 有效 BS 为 {eff_bs}。大批量通常泛化能力较差。",
        "advice_reduce_bs": " -> 尝试增加 LR (线性缩放规则) 或减小 Batch Size。",

        "advice_check_init": "- 初始化糟糕: 权重初始化可能不当。尝试使用 Xavier 或 Kaiming 初始化。",
        "advice_overfit_single_batch": "- 调试建议: 尝试使用单 Batch 过拟合。如果模型连一个 Batch 都学不会,说明模型或数据管道有 Bug。",
        "advice_data_hard": "- 数据难度: 数据可能太难或噪声太大。检查标签和输入特征。",

        "overfitting_title": "📉 检测到过拟合",
        "overfitting_msg": "验证集 Loss 正在上升,而训练集 Loss 仍在下降。",
        "advice_regularization": "- 正则化: 添加 Dropout 或权重衰减 (Weight Decay/L2 正则)。",
        "advice_data_aug": "- 数据增强: 增加数据增强强度以提高泛化能力。",
        "advice_early_stopping": "- 早停 (Early Stopping): 考虑立即停止训练以防止性能进一步恶化。",

        "oscillation_title": "〰️ 检测到 Loss 震荡",
        "oscillation_msg": "训练 Loss 波动剧烈 (标准差: {std:.4f})。",
        "advice_lower_lr_oscillation": "- 降低 LR: 学习率可能过高,导致模型在极小值附近跳跃。",
        "advice_oscillation_scheduler": "- 添加调度器: 动态降低 LR 有助于在 Loss 震荡时稳定训练。",
        "advice_oscillation_grad_clip": "- 梯度裁剪: 启用 `grad_clip_norm` 以防止大梯度更新导致模型不稳定。",

        "mentor_watching": "[dim]Mentor watching: Eff. Batch Size={eff_bs} (BS={bs} * Accum={accum})[/]"
    }
}
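The catalog above is keyed by language code ("en"/"zh") and its values use str.format placeholders such as {count} and {loss:.4f}. The Mentor plugin's actual loading code lives in orbit/plugin/mentor.py and is not shown in this excerpt; the snippet below is only a sketch of how such a packaged resource could be read, and load_messages is a hypothetical helper, not part of the package:

import json
from importlib import resources

def load_messages(lang: str = "en") -> dict:
    # Hypothetical loader: read the JSON shipped in orbit.plugin.data
    # and fall back to English for unknown language codes.
    path = resources.files("orbit.plugin.data").joinpath("mentor_i18n.json")
    with path.open(encoding="utf-8") as f:
        catalog = json.load(f)
    return catalog.get(lang, catalog["en"])

msgs = load_messages("en")
# Placeholders are filled with str.format:
print(msgs["divergence_msg_unstable"].format(count=3, loss=2.7182))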
orbit/plugin/display_model.py
ADDED
@@ -0,0 +1,75 @@
import torch.nn as nn
from rich.table import Table
from rich.console import Console
from typing import TYPE_CHECKING
import rich.box as box

from orbit.callback import Callback, Event

if TYPE_CHECKING:
    from ..engine import Engine

class ModelSummary(Callback):
    def __init__(self, max_depth: int = 3):
        super().__init__()
        self.max_depth = max_depth

    def on_init(self, event: Event):
        """
        Automatically print the model structure when the Engine is initialized.
        """
        engine = event.engine
        self.display(engine.model, engine.console)

    def display(self, model: nn.Module, console: Console):
        """Core printing logic."""
        table = Table(title=f"[bold]Model Summary: {model.__class__.__name__}[/]", box=box.HORIZONTALS)

        table.add_column("Layer (Type)", style="cyan", no_wrap=True)
        table.add_column("Output Shape", style="magenta")
        table.add_column("Param #", justify="right", style="green")
        table.add_column("Trainable", justify="right", style="yellow")

        total_params = 0
        trainable_params = 0

        # Iterate over top-level modules only. A full recursive walk is more
        # involved; for a readable table we show first-level children.
        for name, module in model.named_children():
            # Count this module's parameters
            num_params = sum(p.numel() for p in module.parameters())
            num_trainable = sum(p.numel() for p in module.parameters() if p.requires_grad)

            total_params += num_params
            trainable_params += num_trainable

            is_trainable = "[bold green]Yes[/]" if num_trainable > 0 else "[dim]No[/]"

            layer_name = f"{name} ({module.__class__.__name__})"

            table.add_row(
                layer_name,
                "-",
                f"{num_params:,}",
                is_trainable
            )

        # Estimate the total model size in MB (float32 = 4 bytes per parameter)
        total_size_mb = total_params * 4 / (1024 ** 2)

        console.print(table)

        # Ratio of trainable parameters (guard against an empty model).
        # Kept separate from the raw count so the counts below stay correct.
        trainable_ratio = trainable_params / total_params if total_params > 0 else 0.0

        # Print the summary
        summary_table = Table(show_header=False, box=None)
        summary_table.add_row("Total Params:", f"[bold cyan]{total_params:,}[/]")
        summary_table.add_row("Trainable Params:", f"[bold green]{trainable_params:,}[/] ({trainable_ratio:.1%})")
        summary_table.add_row("Non-trainable Params:", f"[dim]{total_params - trainable_params:,}[/]")
        summary_table.add_row("Est. Params Size (MB):", f"[bold blue]{total_size_mb:.2f} MB[/]")

        console.print(summary_table)
        console.print()
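As a quick sanity check of the counting logic, the same named_children traversal can be run on a toy model outside the Engine (the model below is made up for illustration):

import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 2))
for name, module in model.named_children():
    n = sum(p.numel() for p in module.parameters())
    print(f"{name} ({module.__class__.__name__}): {n:,} params")
# Linear(10, 20): 10*20 + 20 = 220; ReLU: 0; Linear(20, 2): 20*2 + 2 = 42.
total = sum(p.numel() for p in model.parameters())
print(f"total: {total:,}, est. size: {total * 4 / 1024**2:.4f} MB")  # float32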
orbit/plugin/early_stopping.py
ADDED
@@ -0,0 +1,101 @@
import numpy as np
from typing import TYPE_CHECKING
from orbit.callback import Callback, Event

if TYPE_CHECKING:
    from orbit.engine import Engine

class EarlyStopping(Callback):
    """
    Early stopping plugin.
    Stops training when the monitored metric has not improved for
    'patience' epochs. Supports state saving for checkpoint resumption.
    """
    def __init__(
        self,
        monitor: str = 'val_loss',
        mode: str = 'min',
        patience: int = 5,
        min_delta: float = 0.0,
        verbose: bool = True
    ):
        """
        Args:
            monitor (str): Name of the metric to monitor (e.g., 'val_loss', 'val_acc').
            mode (str): 'min' (lower is better) or 'max' (higher is better).
            patience (int): Number of epochs without improvement to tolerate.
            min_delta (float): Minimum change that counts as an improvement.
            verbose (bool): Whether to print messages.
        """
        super().__init__()
        self.monitor = monitor
        self.mode = mode
        self.patience = patience
        self.min_delta = min_delta
        self.verbose = verbose

        self.wait_count = 0
        self.best_score = np.inf if mode == 'min' else -np.inf

        # Internal state key
        self._meta_key = 'early_stopping'

    def on_train_start(self, event: Event):
        """Try to restore state from engine.meta."""
        engine = event.engine
        if self._meta_key in engine.meta:
            state = engine.meta[self._meta_key]
            self.best_score = state.get('best_score', self.best_score)
            self.wait_count = state.get('wait_count', 0)
            if self.verbose:
                engine.print(f"[cyan]Resumed: Best Score={self.best_score:.4f}, Wait={self.wait_count}/{self.patience}[/]", plugin='EarlyStopping')

    def on_epoch_end(self, event: Event):
        """Check the metric at the end of every epoch."""
        engine = event.engine
        # 0. Skip early stopping while in the warmup phase
        if engine.is_in_warmup():
            if self.verbose:
                engine.print("[dim]Skipping EarlyStopping during warmup.[/]", plugin='EarlyStopping')
            return

        # 1. Fetch the current metric
        current_score = engine.metrics.get(self.monitor)

        if current_score is None:
            # Metric missing (e.g., only Train ran, no Eval): skip the check
            return

        # 2. Decide whether the metric improved
        improved = False
        if self.mode == 'min':
            if current_score < self.best_score - self.min_delta:
                improved = True
        else:
            if current_score > self.best_score + self.min_delta:
                improved = True

        # 3. Update the state
        if improved:
            old_best = self.best_score
            self.best_score = current_score
            self.wait_count = 0
            if self.verbose:
                if old_best == np.inf or old_best == -np.inf:
                    engine.print(f"{self.monitor} improved to [green]{current_score:.4f}[/]", plugin='EarlyStopping')
                else:
                    engine.print(f"{self.monitor} improved [green]{old_best:.4f} -> {current_score:.4f}[/]", plugin='EarlyStopping')
        else:
            self.wait_count += 1
            if self.verbose:
                engine.print(f"[yellow]{self.monitor} did not improve ({self.wait_count}/{self.patience}). Best: {self.best_score:.4f}[/]", plugin='EarlyStopping')

            if self.wait_count >= self.patience:
                engine.stop(source="EarlyStopping", reason=f"No improvement in {self.patience} epochs")
                engine.print(f"[red][bold]Stopping training (no improvement in {self.patience} epochs).[/]", plugin='EarlyStopping')

        # 4. Save state to meta so the Checkpoint plugin can persist it
        engine.meta[self._meta_key] = {
            'best_score': self.best_score,
            'wait_count': self.wait_count
        }
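The improvement test above is deliberately strict: in 'min' mode a score only counts as progress if it beats the best score by more than min_delta, so tiny fluctuations still burn patience. A standalone trace of that logic with made-up validation losses:

import numpy as np

best, wait = np.inf, 0
patience, min_delta = 3, 0.01
for epoch, val_loss in enumerate([1.00, 0.90, 0.895, 0.894, 0.893]):
    if val_loss < best - min_delta:  # same test as mode='min' above
        best, wait = val_loss, 0
    else:
        wait += 1
    print(f"epoch {epoch}: val_loss={val_loss:.3f} best={best:.3f} wait={wait}")
    if wait >= patience:
        print("-> would call engine.stop()")  # triggers at epoch 4 here
        break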
orbit/plugin/ema.py
ADDED
@@ -0,0 +1,97 @@
from copy import deepcopy
import torch
from orbit.callback import Callback, Event
from typing import TYPE_CHECKING, Dict

if TYPE_CHECKING:
    from orbit.engine import Engine

class EMA(Callback):
    """
    Exponential Moving Average (EMA) plugin.
    Maintains a moving-average copy of the model parameters during training
    and uses it for evaluation/prediction. Typically improves the model's
    generalization and robustness.
    """
    def __init__(self, decay: float = 0.999, start_step: int = 0):
        """
        Args:
            decay (float): Decay rate, usually close to 1 (e.g., 0.999, 0.9999).
            start_step (int): Global step at which EMA updates begin.
        """
        super().__init__()
        self.decay = decay
        self.start_step = start_step
        self.shadow: Dict[str, torch.Tensor] = {}
        self.backup: Dict[str, torch.Tensor] = {}

        # Internal state key used for checkpoint save/restore
        self._meta_key = 'ema_state'

    def on_init(self, event: Event):
        # Initialize the shadow weights.
        # Note: the model should already be on the correct device here.
        engine = event.engine
        for name, param in engine.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

        engine.print(f"[magenta]Enabled (decay={self.decay})[/]", plugin='EMA')

    def on_train_start(self, event: Event):
        """Try to restore the EMA state from a checkpoint."""
        engine = event.engine
        if self._meta_key in engine.meta:
            saved_shadow = engine.meta[self._meta_key]
            # Make sure the restored weights live on the correct device
            for k, v in saved_shadow.items():
                if k in self.shadow:
                    self.shadow[k] = v.to(engine.device)
            engine.print("[green]Resumed EMA state from checkpoint[/]", plugin='EMA')

    def on_batch_end(self, event: Event):
        """Update the EMA weights after every batch."""
        engine = event.engine
        if engine.state == 'TRAIN' and engine.global_step >= self.start_step:
            for name, param in engine.model.named_parameters():
                if param.requires_grad:
                    # shadow = decay * shadow + (1 - decay) * param
                    self.shadow[name].data.mul_(self.decay).add_(param.data, alpha=1.0 - self.decay)

    def on_eval_start(self, event: Event):
        """Before evaluation: back up the current weights and apply the EMA weights."""
        engine = event.engine
        if engine.global_step < self.start_step:
            return

        self.backup = {
            name: p.data.clone()
            for name, p in engine.model.named_parameters()
            if p.requires_grad
        }

        for name, param in engine.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.shadow[name])

        engine.print("[dim]Switched to EMA weights for evaluation[/]", plugin='EMA')

    def on_eval_end(self, event: Event):
        """After evaluation: restore the original training weights."""
        engine = event.engine
        if not self.backup:
            return

        for name, param in engine.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.backup[name])

        self.backup = {}  # Clear the backup
        engine.print("[dim]Restored training weights[/]", plugin='EMA')

    def on_epoch_end(self, event: Event):
        """
        At the end of each epoch: store the EMA state in meta so the
        Checkpoint plugin can save it.
        Note: this grows the checkpoint file (roughly 2x the model size).
        """
        engine = event.engine
        engine.meta[self._meta_key] = self.shadow
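The update in on_batch_end is the standard exponential moving average, shadow = decay * shadow + (1 - decay) * param, which roughly averages the last 1 / (1 - decay) steps. A standalone trace with toy tensors (the plugin itself starts the shadow from a clone of the real weights, not from zero):

import torch

decay = 0.999
shadow = torch.zeros(3)  # toy starting point for illustration
param = torch.ones(3)    # pretend training has moved the weights to 1.0
for _ in range(1000):
    # Same rule as on_batch_end
    shadow.mul_(decay).add_(param, alpha=1.0 - decay)
print(shadow)  # ≈ 0.632 = 1 - 0.999**1000: ~1/(1-decay) steps to catch up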
orbit/plugin/gradient_accumulation.py
ADDED
@@ -0,0 +1,32 @@
from typing import TYPE_CHECKING
from orbit.callback import Callback, Event

if TYPE_CHECKING:
    from orbit.engine import Engine

class GradientAccumulation(Callback):
    """
    Gradient accumulation plugin.
    Implemented by configuring the Engine's accumulation_steps attribute.
    """
    def __init__(self, steps: int = 1):
        """
        Args:
            steps (int): Number of accumulation steps. Defaults to 1 (no
                accumulation). With steps=4, parameters are updated once
                every 4 batches, so the effective batch size is the
                original batch size * 4.
        """
        super().__init__()
        self.steps = steps

        if self.steps < 1:
            raise ValueError("Gradient accumulation steps must be >= 1")

    def on_init(self, event: Event):
        """
        Configure the Engine during initialization.
        """
        engine = event.engine
        engine.accumulation_steps = self.steps
        if self.steps > 1:
            engine.print(f"[magenta]Enabled: steps={self.steps}[/]", plugin='GradAccum')
|