PyPI - orbit-torch - Versions diffs - 0.0.4a1__py3-none-any.whl → 0.1.0b1__py3-none-any.whl - Mend

orbit-torch 0.0.4a1py3-none-any.whl → 0.1.0b1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

orbit/__init__.py +3 -1
orbit/callback.py +4 -3
orbit/dataset/__init__.py +1 -0
orbit/dataset/cogn.py +138 -0
orbit/dataset/data/cogn_en.jsonl +45 -0
orbit/dataset/data/cogn_zh.jsonl +113 -0
orbit/engine.py +210 -146
orbit/kit/__init__.py +2 -0
orbit/kit/interface.py +154 -0
orbit/kit/wrapper.py +157 -0
orbit/model/__init__.py +5 -0
orbit/model/base.py +125 -0
orbit/model/block/__init__.py +34 -0
orbit/model/block/attention.py +265 -0
orbit/model/block/bio.py +537 -0
orbit/model/block/codebook.py +122 -0
orbit/model/block/conv.py +505 -0
orbit/model/block/embedding.py +252 -0
orbit/model/block/film.py +176 -0
orbit/model/block/fusion.py +335 -0
orbit/model/block/gate.py +334 -0
orbit/model/block/lora.py +776 -0
orbit/model/block/mlp.py +68 -0
orbit/model/block/moe.py +94 -0
orbit/model/block/tcn.py +99 -0
orbit/model/config.py +62 -0
orbit/model/kit/__init__.py +6 -0
orbit/model/kit/discriminator.py +46 -0
orbit/model/kit/losses.py +193 -0
orbit/model/motif/__init__.py +0 -0
orbit/model/motif/vision/__init__.py +0 -0
orbit/model/motif/vision/v1.py +645 -0
orbit/model/registry.py +53 -0
orbit/optim/__init__.py +2 -2
orbit/optim/sam.py +10 -3
orbit/plugin/__init__.py +12 -8
orbit/plugin/board.py +1 -2
orbit/plugin/checkpoint.py +137 -62
orbit/plugin/classification.py +2 -2
orbit/plugin/display_model.py +1 -2
orbit/plugin/early_stopping.py +1 -2
orbit/plugin/ema.py +1 -2
orbit/plugin/gradient_accumulation.py +1 -2
orbit/plugin/lora.py +346 -0
orbit/plugin/memory_estimator.py +1 -2
orbit/plugin/warmup.py +1 -2
orbit/utils/__init__.py +24 -1
orbit/utils/cuda.py +10 -0
orbit/utils/freeze.py +61 -17
orbit/utils/image.py +164 -0
orbit/utils/initialization.py +184 -94
orbit/utils/layer_io.py +66 -7
orbit/utils/lora.py +480 -0
orbit/utils/moe.py +55 -0
orbit/utils/seed.py +3 -19
orbit/utils/sft.py +93 -0
orbit_torch-0.1.0b1.dist-info/METADATA +208 -0
orbit_torch-0.1.0b1.dist-info/RECORD +65 -0
orbit_torch-0.0.4a1.dist-info/METADATA +0 -25
orbit_torch-0.0.4a1.dist-info/RECORD +0 -29
{orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/WHEEL +0 -0
{orbit_torch-0.0.4a1.dist-info → orbit_torch-0.1.0b1.dist-info}/top_level.txt +0 -0

orbit/optim/sam.py CHANGED Viewed

@@ -34,7 +34,7 @@ class SAM(torch.optim.Optimizer):
     @torch.no_grad()
     def first_step(self, zero_grad=False):
-        '''计算并应用参数扰动 epsilon。
+        r'''计算并应用参数扰动 epsilon。
         该步骤根据当前梯度将模型参数 $w$ 更新为 $w + \epsilon$。
@@ -55,7 +55,7 @@ class SAM(torch.optim.Optimizer):
     @torch.no_grad()
     def second_step(self, zero_grad=False):
-        '''恢复参数并执行真正的梯度更新。
+        r'''恢复参数并执行真正的梯度更新。
         该步骤将参数从 $w + \epsilon$ 恢复回 $w$，并利用扰动位置的梯度更新 $w$。
@@ -89,4 +89,11 @@ class SAM(torch.optim.Optimizer):
         return norm
     def step(self, closure=None):
-        raise NotImplementedError('SAM requires steps to be run manually: first_step and second_step')
+        raise NotImplementedError('SAM requires steps to be run manually: first_step and second_step')
+    def load_state_dict(self, state_dict):
+        self.base_optimizer.load_state_dict(state_dict)
+        self.param_groups = self.base_optimizer.param_groups
+    def state_dict(self):
+        return self.base_optimizer.state_dict()

orbit/plugin/__init__.py CHANGED Viewed

@@ -1,10 +1,14 @@
-try: from orbit.plugin.classification import ClassificationReport
+try: from .classification import ClassificationReport
 except: pass
-from orbit.plugin.warmup import Warmup
-from orbit.plugin.early_stopping import EarlyStopping
-from orbit.plugin.gradient_accumulation import GradientAccumulation
-from orbit.plugin.mentor import Mentor
-from orbit.plugin.ema import EMA # Not tested
-from orbit.plugin.memory_estimator import MemoryEstimator
-from orbit.plugin.overfit import Overfit
+from .checkpoint import Checkpoint
+from .board import Board
+from .display_model import ModelSummary
+from .warmup import Warmup
+from .early_stopping import EarlyStopping
+from .gradient_accumulation import GradientAccumulation
+from .mentor import Mentor
+from .ema import EMA # Not tested
+from .memory_estimator import MemoryEstimator
+from .overfit import Overfit
+from .lora import LoRA

orbit/plugin/board.py CHANGED Viewed

@@ -2,8 +2,7 @@ from torch.utils.tensorboard import SummaryWriter
 from typing import Optional, TYPE_CHECKING
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from ..engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class Board(Callback):
     def __init__(self, name: str, log_dir: str):

orbit/plugin/checkpoint.py CHANGED Viewed

@@ -1,10 +1,14 @@
 import os
 import torch
+import json
 from typing import TYPE_CHECKING, List, Tuple
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from orbit.engine import Engine
+from safetensors.torch import save_file as safe_save_file
+from safetensors.torch import load_file as safe_load_file
+from safetensors.torch import safe_open
+if TYPE_CHECKING: from orbit.engine import Engine
 class Checkpoint(Callback):
     def __init__(
@@ -17,6 +21,7 @@ class Checkpoint(Callback):
         save_top_k: int = 1,
         save_last: bool = True,
         every_n_train_steps: int = None,
+        use_safetensors: bool = False,
         verbose: bool = True
     ):
         """
@@ -29,6 +34,7 @@ class Checkpoint(Callback):
             save_top_k (int): 保存最好的 K 个模型。设为 0 则禁用 Top-K 保存。
             save_last (bool): 是否总是保存 '{name}_last.pt'。
             every_n_train_steps (int): 每隔多少个训练步保存一次。
+            use_safetensors (bool): 是否使用 safetensors 格式保存。注意：safetensors 不支持保存优化器状态。
             verbose (bool): 是否打印保存信息。
         """
         super().__init__()
@@ -40,8 +46,12 @@ class Checkpoint(Callback):
         self.save_top_k = save_top_k
         self.save_last = save_last
         self.every_n_train_steps = every_n_train_steps
+        self.use_safetensors = use_safetensors
         self.verbose = verbose
+        if self.use_safetensors and not self.save_weights_only:
+            print("[yellow]Warning: safetensors does not support saving optimizer state. Setting save_weights_only=True implicitly.[/]")
         # 维护 Top-K 模型列表: [(score, filename), ...]
         self.best_k_models: List[Tuple[float, str]] = []
@@ -65,12 +75,20 @@ class Checkpoint(Callback):
         if self._meta_key in engine.meta:
             self.best_k_models = engine.meta[self._meta_key].get('best_k_models', [])
-        load_path = os.path.join(self.path, self.name + "_last.pt").replace("\\", "/")
+        ext = ".safetensors" if self.use_safetensors else ".pt"
+        load_path = os.path.join(self.path, self.name + "_last" + ext).replace("\\", "/")
         if os.path.exists(load_path):
             self._load(engine, load_path)
         else:
-            engine.print(f"[yellow]Warning: Resume checkpoint '{load_path}' not found. Starting from scratch.[/]", plugin='Checkpointing')
+            # 尝试查找另一种格式
+            alt_ext = ".pt" if self.use_safetensors else ".safetensors"
+            alt_path = os.path.join(self.path, self.name + "_last" + alt_ext).replace("\\", "/")
+            if os.path.exists(alt_path):
+                engine.print(f"[yellow]Found checkpoint with alternative extension: {alt_path}[/]", plugin='Checkpointing')
+                self._load(engine, alt_path)
+            else:
+                engine.print(f"[yellow]Warning: Resume checkpoint '{load_path}' not found. Starting from scratch.[/]", plugin='Checkpointing')
     def on_batch_end(self, event: Event):
         """
@@ -81,7 +99,8 @@ class Checkpoint(Callback):
             step = event.engine.global_step
             if step > 0 and step % self.every_n_train_steps == 0:
                 # 保存 step checkpoint
-                filename = f"{self.name}_step_{step}.pt"
+                ext = ".safetensors" if self.use_safetensors else ".pt"
+                filename = f"{self.name}_step_{step}{ext}"
                 # 传递 is_step=True
                 self._save(event.engine, filename, verbose=self.verbose, is_step=True)
@@ -93,7 +112,7 @@ class Checkpoint(Callback):
                 # 同时更新 last checkpoint
                 if self.save_last:
-                    self._save(event.engine, f"{self.name}_last.pt", verbose=False, is_step=True)
+                    self._save(event.engine, f"{self.name}_last{ext}", verbose=False, is_step=True)
     def on_epoch_end(self, event: Event):
         """
@@ -102,9 +121,11 @@ class Checkpoint(Callback):
         2. 如果设置了 monitor，保存 top_k
         """
         engine = event.engine
+        ext = ".safetensors" if self.use_safetensors else ".pt"
         # 1. Save Last
         if self.save_last:
-            self._save(engine, f"{self.name}_last.pt", verbose=False) # last 不需要每次都啰嗦
+            self._save(engine, f"{self.name}_last{ext}", verbose=False) # last 不需要每次都啰嗦
         # 2. Save Top K
         if self.monitor and self.save_top_k > 0:
@@ -119,7 +140,8 @@ class Checkpoint(Callback):
     def _check_and_save_top_k(self, engine: 'Engine', current_score: float):
         """检查并保存 Top-K 模型"""
-        filename = f"{self.name}_ep{engine.epoch+1}_{self.monitor}_{current_score:.4f}.pt"
+        ext = ".safetensors" if self.use_safetensors else ".pt"
+        filename = f"{self.name}_ep{engine.epoch+1}_{self.monitor}_{current_score:.4f}{ext}"
         # 逻辑简化：总是先加入，然后排序，如果超过 K 个，删除最差的
         self.best_k_models.append((current_score, filename))
@@ -154,24 +176,44 @@ class Checkpoint(Callback):
         # 获取原始模型 (去除 DataParallel 包装) 以保证 Checkpoint 通用性
         raw_model = engine.unwrap_model()
-        state = {
-            'epoch': engine.epoch,
-            'global_step': engine.global_step,
-            'batch_idx': engine.batch_idx,
-            'is_step': is_step,
-            'model_state_dict': raw_model.state_dict(),
-            'optimizer_state_dict': engine.optimizer.state_dict() if engine.optimizer else None,
-            'scheduler_state_dict': engine.scheduler.state_dict() if engine.scheduler else None,
-            'scaler_state_dict': engine.scaler.state_dict() if engine.scaler else None,
-            'meta': engine.meta,
-        }
-        if self.save_weights_only:
-            state = raw_model.state_dict()
         file_path = os.path.join(self.path, filename)
         try:
-            torch.save(state, file_path)
+            if self.use_safetensors and filename.endswith('.safetensors'):
+                # Safetensors 模式：仅保存权重和元数据
+                metadata = {
+                    'epoch': str(engine.epoch),
+                    'global_step': str(engine.global_step),
+                    'batch_idx': str(engine.batch_idx),
+                    'is_step': str(is_step),
+                    # 注意：safetensors metadata 只能是字符串
+                }
+                # 尝试序列化 meta
+                try:
+                    metadata['meta'] = json.dumps(engine.meta)
+                except:
+                    pass
+                safe_save_file(raw_model.state_dict(), file_path, metadata=metadata)
+            else:
+                # Torch 模式：保存完整状态
+                state = {
+                    'epoch': engine.epoch,
+                    'global_step': engine.global_step,
+                    'batch_idx': engine.batch_idx,
+                    'is_step': is_step,
+                    'model_state_dict': raw_model.state_dict(),
+                    'optimizer_state_dict': engine.optimizer.state_dict() if engine.optimizer else None,
+                    'scheduler_state_dict': engine.scheduler.state_dict() if engine.scheduler else None,
+                    'scaler_state_dict': engine.scaler.state_dict() if engine.scaler else None,
+                    'meta': engine.meta,
+                }
+                if self.save_weights_only:
+                    state = raw_model.state_dict()
+                torch.save(state, file_path)
             if verbose:
                 # 使用相对路径显示，更简洁
                 rel_path = os.path.relpath(file_path)
@@ -194,52 +236,85 @@ class Checkpoint(Callback):
         """加载 Checkpoint 的核心逻辑"""
         engine.print(f"[cyan]Loading checkpoint from: {file_path}[/]", plugin='Checkpointing')
         try:
-            # 加载到设备
-            checkpoint = torch.load(file_path, map_location=engine.device)
             # 获取原始模型以进行加载
             raw_model = engine.unwrap_model()
-            # 1. 加载模型权重
-            if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-                raw_model.load_state_dict(checkpoint['model_state_dict'])
-            else:
-                raw_model.load_state_dict(checkpoint)
-                engine.print("[yellow]Loaded model weights only (legacy format).[/]", plugin='Checkpointing')
-                return
-            # 2. 恢复训练状态
-            if not self.save_weights_only:
-                if engine.optimizer and 'optimizer_state_dict' in checkpoint:
-                    engine.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
-                if engine.scheduler and 'scheduler_state_dict' in checkpoint:
-                    engine.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+            if file_path.endswith('.safetensors'):
+                # 加载权重
+                state_dict = safe_load_file(file_path, device=str(engine.device))
+                raw_model.load_state_dict(state_dict)
-                if engine.scaler and 'scaler_state_dict' in checkpoint:
-                    engine.scaler.load_state_dict(checkpoint['scaler_state_dict'])
+                # 尝试恢复元数据
+                with safe_open(file_path, framework="pt", device=str(engine.device)) as f:
+                    metadata = f.metadata()
+                    if metadata:
+                        loaded_epoch = int(metadata.get('epoch', 0))
+                        loaded_batch_idx = int(metadata.get('batch_idx', -1))
+                        is_step = metadata.get('is_step', 'False') == 'True'
+                        engine.global_step = int(metadata.get('global_step', 0))
+                        if 'meta' in metadata:
+                            try:
+                                engine.meta.update(json.loads(metadata['meta']))
+                            except: pass
+                        if is_step:
+                            engine.start_epoch = loaded_epoch
+                            engine.start_batch_idx = loaded_batch_idx
+                            msg = f"Epoch {engine.start_epoch}, Batch {engine.start_batch_idx + 1}"
+                        else:
+                            engine.start_epoch = loaded_epoch + 1
+                            engine.start_batch_idx = -1
+                            msg = f"Epoch {engine.start_epoch}"
+                        engine.print(f"[green]Resumed weights from {msg}. Note: Optimizer state not restored (safetensors).[/]", plugin='Checkpointing')
+                    else:
+                        engine.print("[yellow]Loaded weights only (no metadata in safetensors).[/]", plugin='Checkpointing')
-                if 'meta' in checkpoint:
-                    engine.meta.update(checkpoint['meta'])
+            else:
+                # Torch 加载
+                checkpoint = torch.load(file_path, map_location=engine.device)
-                loaded_epoch = checkpoint.get('epoch', 0)
-                loaded_batch_idx = checkpoint.get('batch_idx', -1)
-                is_step = checkpoint.get('is_step', False)
-                if is_step:
-                    # 如果是 Step Checkpoint，从当前 Epoch 的下一个 Batch 继续
-                    engine.start_epoch = loaded_epoch
-                    engine.start_batch_idx = loaded_batch_idx
-                    msg = f"Epoch {engine.start_epoch}, Batch {engine.start_batch_idx + 1}"
+                # 1. 加载模型权重
+                if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+                    raw_model.load_state_dict(checkpoint['model_state_dict'])
                 else:
-                    # 如果是 Epoch Checkpoint，从下一个 Epoch 开始
-                    engine.start_epoch = loaded_epoch + 1
-                    engine.start_batch_idx = -1
-                    msg = f"Epoch {engine.start_epoch}"
-                engine.global_step = checkpoint.get('global_step', 0)
+                    raw_model.load_state_dict(checkpoint)
+                    engine.print("[yellow]Loaded model weights only (legacy format).[/]", plugin='Checkpointing')
+                    return
-                engine.print(f"[green]Successfully resumed training from {msg}, Global Step {engine.global_step}[/]", plugin='Checkpointing')
+                # 2. 恢复训练状态
+                if not self.save_weights_only:
+                    if engine.optimizer and 'optimizer_state_dict' in checkpoint:
+                        engine.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+                    if engine.scheduler and 'scheduler_state_dict' in checkpoint:
+                        engine.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+                    if engine.scaler and 'scaler_state_dict' in checkpoint:
+                        engine.scaler.load_state_dict(checkpoint['scaler_state_dict'])
+                    if 'meta' in checkpoint:
+                        engine.meta.update(checkpoint['meta'])
+                    loaded_epoch = checkpoint.get('epoch', 0)
+                    loaded_batch_idx = checkpoint.get('batch_idx', -1)
+                    is_step = checkpoint.get('is_step', False)
+                    if is_step:
+                        # 如果是 Step Checkpoint，从当前 Epoch 的下一个 Batch 继续
+                        engine.start_epoch = loaded_epoch
+                        engine.start_batch_idx = loaded_batch_idx
+                        msg = f"Epoch {engine.start_epoch}, Batch {engine.start_batch_idx + 1}"
+                    else:
+                        # 如果是 Epoch Checkpoint，从下一个 Epoch 开始
+                        engine.start_epoch = loaded_epoch + 1
+                        engine.start_batch_idx = -1
+                        msg = f"Epoch {engine.start_epoch}"
+                    engine.global_step = checkpoint.get('global_step', 0)
+                    engine.print(f"[green]Successfully resumed training from {msg}, Global Step {engine.global_step}[/]", plugin='Checkpointing')
         except Exception as e:
             engine.print(f"[red]Failed to load checkpoint: {e}[/]", plugin='Checkpointing')

orbit/plugin/classification.py CHANGED Viewed

@@ -7,8 +7,8 @@ from typing import List, Optional, TYPE_CHECKING
 import rich.box as box
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from ..engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class ClassificationReport(Callback):
     def __init__(

orbit/plugin/display_model.py CHANGED Viewed

@@ -6,8 +6,7 @@ import rich.box as box
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from ..engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class ModelSummary(Callback):
     def __init__(self, max_depth: int = 3):

orbit/plugin/early_stopping.py CHANGED Viewed

@@ -2,8 +2,7 @@ import numpy as np
 from typing import TYPE_CHECKING
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from orbit.engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class EarlyStopping(Callback):
     """

orbit/plugin/ema.py CHANGED Viewed

@@ -3,8 +3,7 @@ import torch
 from orbit.callback import Callback, Event
 from typing import TYPE_CHECKING, Dict
-if TYPE_CHECKING:
-    from orbit.engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class EMA(Callback):
     """

orbit/plugin/gradient_accumulation.py CHANGED Viewed

@@ -1,8 +1,7 @@
 from typing import TYPE_CHECKING
 from orbit.callback import Callback, Event
-if TYPE_CHECKING:
-    from orbit.engine import Engine
+if TYPE_CHECKING: from orbit.engine import Engine
 class GradientAccumulation(Callback):
     """

orbit-torch 0.0.4a1__py3-none-any.whl → 0.1.0b1__py3-none-any.whl

orbit-torch 0.0.4a1py3-none-any.whl → 0.1.0b1py3-none-any.whl