ema-pytorch 0.3.1__tar.gz → 0.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/PKG-INFO +2 -3
- ema_pytorch-0.6.3/README.md +109 -0
- ema_pytorch-0.6.3/ema_pytorch/__init__.py +6 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch/ema_pytorch.py +109 -23
- ema_pytorch-0.6.3/ema_pytorch/post_hoc_ema.py +420 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/PKG-INFO +2 -3
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/SOURCES.txt +1 -0
- ema_pytorch-0.6.3/ema_pytorch.egg-info/requires.txt +1 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/setup.py +2 -3
- ema-pytorch-0.3.1/README.md +0 -54
- ema-pytorch-0.3.1/ema_pytorch/__init__.py +0 -1
- ema-pytorch-0.3.1/ema_pytorch.egg-info/requires.txt +0 -2
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/LICENSE +0 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/dependency_links.txt +0 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/top_level.txt +0 -0
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/setup.cfg +0 -0
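For orientation before the hunks below: this release drops the beartype dependency, raises the torch floor from 1.6 to 2.0, and adds a post-hoc (Karras et al.) EMA implementation alongside the existing wrapper. A minimal sketch of the resulting public surface, inferred from the new README rather than from the rewritten `__init__.py` (whose +6 lines are not shown in this diff):

```python
# a sketch only; import names are taken from the new README's examples
import torch
from ema_pytorch import EMA, PostHocEMA

net = torch.nn.Linear(512, 512)

ema = EMA(net, beta = 0.9999, update_every = 10)     # classic EMA wrapper, same call pattern as 0.3.x
emas = PostHocEMA(net, sigma_rels = (0.05, 0.3))     # new in this release: multiple Karras EMAs plus checkpointing
```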
{ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ema-pytorch
-Version: 0.3.1
+Version: 0.6.3
 Summary: Easy way to keep track of exponential moving average version of your pytorch module
 Home-page: https://github.com/lucidrains/ema-pytorch
 Author: Phil Wang
@@ -14,5 +14,4 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: beartype
-Requires-Dist: torch>=1.6
+Requires-Dist: torch>=2.0
ema_pytorch-0.6.3/README.md (new file)

@@ -0,0 +1,109 @@
+## EMA - Pytorch
+
+A simple way to keep track of an Exponential Moving Average (EMA) version of your pytorch model
+
+## Install
+
+```bash
+$ pip install ema-pytorch
+```
+
+## Usage
+
+```python
+import torch
+from ema_pytorch import EMA
+
+# your neural network as a pytorch module
+
+net = torch.nn.Linear(512, 512)
+
+# wrap your neural network, specify the decay (beta)
+
+ema = EMA(
+    net,
+    beta = 0.9999,              # exponential moving average factor
+    update_after_step = 100,    # only after this number of .update() calls will it start updating
+    update_every = 10,          # how often to actually update, to save on compute (updates every 10th .update() call)
+)
+
+# mutate your network, with SGD or otherwise
+
+with torch.no_grad():
+    net.weight.copy_(torch.randn_like(net.weight))
+    net.bias.copy_(torch.randn_like(net.bias))
+
+# you will call the update function on your moving average wrapper
+
+ema.update()
+
+# then, later on, you can invoke the EMA model the same way as your network
+
+data = torch.randn(1, 512)
+
+output = net(data)
+ema_output = ema(data)
+
+# if you want to save your ema model, it is recommended you save the entire wrapper
+# as it contains the number of steps taken (there is a warmup logic in there, recommended by @crowsonkb, validated for a number of projects now)
+# however, if you wish to access the copy of your model with EMA, then it will live at ema.ema_model
+```
+
+In order to use the post-hoc synthesized EMA, proposed by Karras et al. in <a href="https://arxiv.org/abs/2312.02696">a recent paper</a>, follow the example below
+
+```python
+import torch
+from ema_pytorch import PostHocEMA
+
+# your neural network as a pytorch module
+
+net = torch.nn.Linear(512, 512)
+
+# wrap your neural network, specify the sigma_rels or gammas
+
+emas = PostHocEMA(
+    net,
+    sigma_rels = (0.05, 0.3),         # a tuple with the hyperparameter for the multiple EMAs. you need at least 2 here to synthesize a new one
+    update_every = 10,                # how often to actually update, to save on compute (updates every 10th .update() call)
+    checkpoint_every_num_steps = 10,
+    checkpoint_folder = './post-hoc-ema-checkpoints'  # the folder of saved checkpoints for each sigma_rel (gamma) across timesteps with the hparam above, used to synthesizing a new EMA model after training
+)
+
+net.train()
+
+for _ in range(1000):
+    # mutate your network, with SGD or otherwise
+
+    with torch.no_grad():
+        net.weight.copy_(torch.randn_like(net.weight))
+        net.bias.copy_(torch.randn_like(net.bias))
+
+    # you will call the update function on your moving average wrapper
+
+    emas.update()
+
+# now that you have a few checkpoints
+# you can synthesize an EMA model with a different sigma_rel (say 0.15)
+
+synthesized_ema = emas.synthesize_ema_model(sigma_rel = 0.15)
+
+# output with synthesized EMA
+
+data = torch.randn(1, 512)
+
+synthesized_ema_output = synthesized_ema(data)
+```
+
+## Citations
+
+```bibtex
+@article{Karras2023AnalyzingAI,
+    title   = {Analyzing and Improving the Training Dynamics of Diffusion Models},
+    author  = {Tero Karras and Miika Aittala and Jaakko Lehtinen and Janne Hellsten and Timo Aila and Samuli Laine},
+    journal = {ArXiv},
+    year    = {2023},
+    volume  = {abs/2312.02696},
+    url     = {https://api.semanticscholar.org/CorpusID:265659032}
+}
+```
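The README above recommends saving the whole wrapper because it carries the step counter used by the warmup logic. A minimal save/load sketch under that advice, assuming the loading side rebuilds the wrapper with the same constructor arguments (the path is illustrative):

```python
import torch
from ema_pytorch import EMA

net = torch.nn.Linear(512, 512)
ema = EMA(net, beta = 0.9999, update_after_step = 100, update_every = 10)

# ... training loop with ema.update() calls ...

torch.save(ema.state_dict(), './ema.pt')   # includes the step/initted buffers and, by default, the online model

# later: rebuild an identically configured wrapper, then restore everything in one go
ema_restored = EMA(torch.nn.Linear(512, 512), beta = 0.9999, update_after_step = 100, update_every = 10)
ema_restored.load_state_dict(torch.load('./ema.pt'))
```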
{ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch/ema_pytorch.py

@@ -1,3 +1,6 @@
+from __future__ import annotations
+from typing import Set, Tuple
+
 from copy import deepcopy
 from functools import partial
 
@@ -5,26 +8,35 @@ import torch
 from torch import nn, Tensor
 from torch.nn import Module
 
-from beartype import beartype
-from beartype.typing import Set, Optional
-
 def exists(val):
     return val is not None
 
 def get_module_device(m: Module):
     return next(m.parameters()).device
 
-def inplace_copy(tgt: Tensor, src: Tensor, *, auto_move_device = False):
+def maybe_coerce_dtype(t, dtype):
+    if t.dtype == dtype:
+        return t
+
+    return t.to(dtype)
+
+def inplace_copy(tgt: Tensor, src: Tensor, *, auto_move_device = False, coerce_dtype = False):
     if auto_move_device:
-        src = src.to(tgt.device)
+        src = src.to(tgt.device)
+
+    if coerce_dtype:
+        src = maybe_coerce_dtype(src, tgt.dtype)
 
-    tgt.copy_(src)
+    tgt.copy_(src)
 
-def inplace_lerp(tgt: Tensor, src: Tensor, weight, *, auto_move_device = False):
+def inplace_lerp(tgt: Tensor, src: Tensor, weight, *, auto_move_device = False, coerce_dtype = False):
     if auto_move_device:
-        src = src.to(tgt.device)
+        src = src.to(tgt.device)
 
-    tgt.lerp_(src, weight)
+    if coerce_dtype:
+        src = maybe_coerce_dtype(src, tgt.dtype)
+
+    tgt.lerp_(src, weight)
 
 class EMA(Module):
     """
@@ -43,15 +55,14 @@ class EMA(Module):
 
     Args:
         inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
-        power (float): Exponential factor of EMA warmup. Default:
+        power (float): Exponential factor of EMA warmup. Default: 2/3.
         min_value (float): The minimum EMA decay rate. Default: 0.
     """
 
-    @beartype
     def __init__(
         self,
        model: Module,
-        ema_model: Optional[Module] = None,
+        ema_model: Module | None = None,  # if your model has lazylinears or other types of non-deepcopyable modules, you can pass in your own ema model
         beta = 0.9999,
         update_after_step = 100,
         update_every = 10,
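The docstring above parameterizes the @crowsonkb-style warmup that `get_current_decay()` applies (that method is unchanged by this diff and therefore not shown). As a hedged sketch of the schedule those three knobs control, written against the usual formulation:

```python
# sketch only: mirrors the common inv_gamma / power / min_value warmup, not copied verbatim from this diff
def current_decay(step, beta = 0.9999, inv_gamma = 1., power = 2 / 3, min_value = 0., update_after_step = 100):
    epoch = max(step - update_after_step - 1, 0)

    if epoch <= 0:
        return 0.                                   # still in the holdoff: the EMA just copies the online weights

    value = 1 - (1 + epoch / inv_gamma) ** -power   # ramps from 0 toward 1 as training progresses
    return min(max(value, min_value), beta)         # clamped between min_value and beta
```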
@@ -62,11 +73,17 @@
         ignore_names: Set[str] = set(),
         ignore_startswith_names: Set[str] = set(),
         include_online_model = True,  # set this to False if you do not wish for the online model to be saved along with the ema model (managed externally)
-        allow_different_devices = False
+        allow_different_devices = False,  # if the EMA model is on a different device (say CPU), automatically move the tensor
+        use_foreach = False,
+        forward_method_names: Tuple[str, ...] = (),
+        move_ema_to_online_device = False,
+        coerce_dtype = False
     ):
         super().__init__()
         self.beta = beta
 
+        self.is_frozen = beta == 1.
+
         # whether to include the online model within the module tree, so that state_dict also saves it
 
         self.include_online_model = include_online_model
@@ -88,17 +105,24 @@
             print('Your model was not copyable. Please make sure you are not using any LazyLinear')
             exit()
 
-        self.ema_model.requires_grad_(False)
+        for p in self.ema_model.parameters():
+            p.detach_()
+
+        # forwarding methods
+
+        for forward_method_name in forward_method_names:
+            fn = getattr(self.ema_model, forward_method_name)
+            setattr(self, forward_method_name, fn)
 
         # parameter and buffer names
 
-        self.parameter_names = {name for name, param in self.ema_model.named_parameters() if param.dtype in [torch.float, torch.float16]}
-        self.buffer_names = {name for name, buffer in self.ema_model.named_buffers() if buffer.dtype in [torch.float, torch.float16]}
+        self.parameter_names = {name for name, param in self.ema_model.named_parameters() if torch.is_floating_point(param) or torch.is_complex(param)}
+        self.buffer_names = {name for name, buffer in self.ema_model.named_buffers() if torch.is_floating_point(buffer) or torch.is_complex(buffer)}
 
         # tensor update functions
 
-        self.inplace_copy = partial(inplace_copy, auto_move_device = allow_different_devices)
-        self.inplace_lerp = partial(inplace_lerp, auto_move_device = allow_different_devices)
+        self.inplace_copy = partial(inplace_copy, auto_move_device = allow_different_devices, coerce_dtype = coerce_dtype)
+        self.inplace_lerp = partial(inplace_lerp, auto_move_device = allow_different_devices, coerce_dtype = coerce_dtype)
 
         # updating hyperparameters
 
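A small illustration of the new parameter/buffer filtering above: only floating point or complex tensors take part in the EMA, so integer buffers (counters, index tables) are skipped rather than lerped:

```python
import torch
from ema_pytorch import EMA

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)
        self.register_buffer('num_calls', torch.tensor(0))   # int64 buffer, excluded by the filter

ema = EMA(Net())

assert 'linear.weight' in ema.parameter_names
assert 'num_calls' not in ema.buffer_names
```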
@@ -119,6 +143,21 @@
 
         self.allow_different_devices = allow_different_devices
 
+        # whether to coerce dtype when copy or lerp from online to EMA model
+
+        self.coerce_dtype = coerce_dtype
+
+        # whether to move EMA model to online model device automatically
+
+        self.move_ema_to_online_device = move_ema_to_online_device
+
+        # whether to use foreach
+
+        if use_foreach:
+            assert hasattr(torch, '_foreach_lerp_') and hasattr(torch, '_foreach_copy_'), 'your version of torch does not have the prerequisite foreach functions'
+
+        self.use_foreach = use_foreach
+
         # init and step states
 
         self.register_buffer('initted', torch.tensor(False))
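For context on the guard above: the fast path added later in this diff leans on torch's private `_foreach_*` kernels, which batch an elementwise operation across a list of tensors in one call; the `hasattr` assert exists precisely because older torch builds lack them. A tiny hedged illustration of what they do:

```python
import torch

tgts = [torch.zeros(3), torch.zeros(3)]
srcs = [torch.ones(3), torch.full((3,), 2.)]

torch._foreach_lerp_(tgts, srcs, 0.1)   # in place, per pair: tgt becomes tgt + 0.1 * (src - tgt)
torch._foreach_copy_(tgts, srcs)        # in place, per pair: tgt becomes src
```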
@@ -193,9 +232,25 @@
 
     @torch.no_grad()
     def update_moving_average(self, ma_model, current_model):
-        copy, lerp = self.inplace_copy, self.inplace_lerp
+        if self.is_frozen:
+            return
+
+        # move ema model to online model device if not same and needed
+
+        if self.move_ema_to_online_device and get_module_device(ma_model) != get_module_device(current_model):
+            ma_model.to(get_module_device(current_model))
+
+        # get current decay
+
         current_decay = self.get_current_decay()
 
+        # store all source and target tensors to copy or lerp
+
+        tensors_to_copy = []
+        tensors_to_lerp = []
+
+        # loop through parameters
+
         for (name, current_params), (_, ma_params) in zip(self.get_params_iter(current_model), self.get_params_iter(ma_model)):
             if name in self.ignore_names:
                 continue
@@ -204,10 +259,12 @@
                 continue
 
             if name in self.param_or_buffer_names_no_ema:
-                copy(ma_params.data, current_params.data)
+                tensors_to_copy.append((ma_params.data, current_params.data))
                 continue
 
-            lerp(ma_params.data, current_params.data, 1. - current_decay)
+            tensors_to_lerp.append((ma_params.data, current_params.data))
+
+        # loop through buffers
 
         for (name, current_buffer), (_, ma_buffer) in zip(self.get_buffers_iter(current_model), self.get_buffers_iter(ma_model)):
             if name in self.ignore_names:
@@ -217,10 +274,39 @@
                 continue
 
             if name in self.param_or_buffer_names_no_ema:
-                copy(ma_buffer.data, current_buffer.data)
+                tensors_to_copy.append((ma_buffer.data, current_buffer.data))
                 continue
 
-            lerp(ma_buffer.data, current_buffer.data, 1. - current_decay)
+            tensors_to_lerp.append((ma_buffer.data, current_buffer.data))
+
+        # execute inplace copy or lerp
+
+        if not self.use_foreach:
+
+            for tgt, src in tensors_to_copy:
+                self.inplace_copy(tgt, src)
+
+            for tgt, src in tensors_to_lerp:
+                self.inplace_lerp(tgt, src, 1. - current_decay)
+
+        else:
+            # use foreach if available and specified
+
+            if self.allow_different_devices:
+                tensors_to_copy = [(tgt, src.to(tgt.device)) for tgt, src in tensors_to_copy]
+                tensors_to_lerp = [(tgt, src.to(tgt.device)) for tgt, src in tensors_to_lerp]
+
+            if self.coerce_dtype:
+                tensors_to_copy = [(tgt, maybe_coerce_dtype(src, tgt.dtype)) for tgt, src in tensors_to_copy]
+                tensors_to_lerp = [(tgt, maybe_coerce_dtype(src, tgt.dtype)) for tgt, src in tensors_to_lerp]
+
+            if len(tensors_to_copy) > 0:
+                tgt_copy, src_copy = zip(*tensors_to_copy)
+                torch._foreach_copy_(tgt_copy, src_copy)
+
+            if len(tensors_to_lerp) > 0:
+                tgt_lerp, src_lerp = zip(*tensors_to_lerp)
+                torch._foreach_lerp_(tgt_lerp, src_lerp, 1. - current_decay)
 
     def __call__(self, *args, **kwargs):
         return self.ema_model(*args, **kwargs)
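Putting the new constructor options together, a hedged usage sketch (the method name passed to `forward_method_names` is hypothetical; any callable attribute of your module could be forwarded this way):

```python
import torch
from ema_pytorch import EMA

net = torch.nn.Linear(512, 512)

ema = EMA(
    net,
    beta = 0.9999,
    update_every = 10,
    allow_different_devices = True,      # EMA copy may live on another device; tensors are moved during the update
    coerce_dtype = True,                 # cast online tensors to the EMA dtype when copying / lerping
    move_ema_to_online_device = False,   # alternatively, move the whole EMA module onto the online device first
    use_foreach = True,                  # batch the update through torch._foreach_copy_ / torch._foreach_lerp_
    forward_method_names = ()            # e.g. ('sample',) would also expose ema.sample(...) (hypothetical method name)
)

# note: beta == 1. marks the EMA as frozen, so update_moving_average() returns immediately
```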
ema_pytorch-0.6.3/ema_pytorch/post_hoc_ema.py (new file)

@@ -0,0 +1,420 @@
+from __future__ import annotations
+
+from pathlib import Path
+from copy import deepcopy
+from functools import partial
+
+import torch
+from torch import nn, Tensor
+from torch.nn import Module, ModuleList
+
+import numpy as np
+
+from typing import Set, Tuple
+
+def exists(val):
+    return val is not None
+
+def default(val, d):
+    return val if exists(val) else d
+
+def first(arr):
+    return arr[0]
+
+def get_module_device(m: Module):
+    return next(m.parameters()).device
+
+def inplace_copy(tgt: Tensor, src: Tensor, *, auto_move_device = False):
+    if auto_move_device:
+        src = src.to(tgt.device)
+
+    tgt.copy_(src)
+
+def inplace_lerp(tgt: Tensor, src: Tensor, weight, *, auto_move_device = False):
+    if auto_move_device:
+        src = src.to(tgt.device)
+
+    tgt.lerp_(src, weight)
+
+# algorithm 2 in https://arxiv.org/abs/2312.02696
+
+def sigma_rel_to_gamma(sigma_rel):
+    t = sigma_rel ** -2
+    return np.roots([1, 7, 16 - t, 12 - t]).real.max().item()
+
+class KarrasEMA(Module):
+    """
+    exponential moving average module that uses hyperparameters from the paper https://arxiv.org/abs/2312.02696
+    can either use gamma or sigma_rel from paper
+    """
+
+    def __init__(
+        self,
+        model: Module,
+        sigma_rel: float | None = None,
+        gamma: float | None = None,
+        ema_model: Module | None = None,  # if your model has lazylinears or other types of non-deepcopyable modules, you can pass in your own ema model
+        update_every: int = 100,
+        frozen: bool = False,
+        param_or_buffer_names_no_ema: Set[str] = set(),
+        ignore_names: Set[str] = set(),
+        ignore_startswith_names: Set[str] = set(),
+        allow_different_devices = False,  # if the EMA model is on a different device (say CPU), automatically move the tensor
+        move_ema_to_online_device = False  # will move entire EMA model to the same device as online model, if different
+    ):
+        super().__init__()
+
+        assert exists(sigma_rel) ^ exists(gamma), 'either sigma_rel or gamma is given. gamma is derived from sigma_rel as in the paper, then beta is dervied from gamma'
+
+        if exists(sigma_rel):
+            gamma = sigma_rel_to_gamma(sigma_rel)
+
+        self.gamma = gamma
+        self.frozen = frozen
+
+        self.online_model = [model]
+
+        # ema model
+
+        self.ema_model = ema_model
+
+        if not exists(self.ema_model):
+            try:
+                self.ema_model = deepcopy(model)
+            except Exception as e:
+                print(f'Error: While trying to deepcopy model: {e}')
+                print('Your model was not copyable. Please make sure you are not using any LazyLinear')
+                exit()
+
+        for p in self.ema_model.parameters():
+            p.detach_()
+
+        # parameter and buffer names
+
+        self.parameter_names = {name for name, param in self.ema_model.named_parameters() if torch.is_floating_point(param) or torch.is_complex(param)}
+        self.buffer_names = {name for name, buffer in self.ema_model.named_buffers() if torch.is_floating_point(buffer) or torch.is_complex(buffer)}
+
+        # tensor update functions
+
+        self.inplace_copy = partial(inplace_copy, auto_move_device = allow_different_devices)
+        self.inplace_lerp = partial(inplace_lerp, auto_move_device = allow_different_devices)
+
+        # updating hyperparameters
+
+        self.update_every = update_every
+
+        assert isinstance(param_or_buffer_names_no_ema, (set, list))
+        self.param_or_buffer_names_no_ema = param_or_buffer_names_no_ema # parameter or buffer
+
+        self.ignore_names = ignore_names
+        self.ignore_startswith_names = ignore_startswith_names
+
+        # whether to manage if EMA model is kept on a different device
+
+        self.allow_different_devices = allow_different_devices
+
+        # whether to move EMA model to online model device automatically
+
+        self.move_ema_to_online_device = move_ema_to_online_device
+
+        # init and step states
+
+        self.register_buffer('initted', torch.tensor(False))
+        self.register_buffer('step', torch.tensor(0))
+
+    @property
+    def model(self):
+        return first(self.online_model)
+
+    @property
+    def beta(self):
+        return (1. - 1. / (self.step.item() + 1.)) ** (1. + self.gamma)
+
+    def eval(self):
+        return self.ema_model.eval()
+
+    def restore_ema_model_device(self):
+        device = self.initted.device
+        self.ema_model.to(device)
+
+    def get_params_iter(self, model):
+        for name, param in model.named_parameters():
+            if name not in self.parameter_names:
+                continue
+            yield name, param
+
+    def get_buffers_iter(self, model):
+        for name, buffer in model.named_buffers():
+            if name not in self.buffer_names:
+                continue
+            yield name, buffer
+
+    def copy_params_from_model_to_ema(self):
+        copy = self.inplace_copy
+
+        for (_, ma_params), (_, current_params) in zip(self.get_params_iter(self.ema_model), self.get_params_iter(self.model)):
+            copy(ma_params.data, current_params.data)
+
+        for (_, ma_buffers), (_, current_buffers) in zip(self.get_buffers_iter(self.ema_model), self.get_buffers_iter(self.model)):
+            copy(ma_buffers.data, current_buffers.data)
+
+    def copy_params_from_ema_to_model(self):
+        copy = self.inplace_copy
+
+        for (_, ma_params), (_, current_params) in zip(self.get_params_iter(self.ema_model), self.get_params_iter(self.model)):
+            copy(current_params.data, ma_params.data)
+
+        for (_, ma_buffers), (_, current_buffers) in zip(self.get_buffers_iter(self.ema_model), self.get_buffers_iter(self.model)):
+            copy(current_buffers.data, ma_buffers.data)
+
+    def update(self):
+        step = self.step.item()
+        self.step += 1
+
+        if (step % self.update_every) != 0:
+            return
+
+        if not self.initted.item():
+            self.copy_params_from_model_to_ema()
+            self.initted.data.copy_(torch.tensor(True))
+
+        self.update_moving_average(self.ema_model, self.model)
+
+    def iter_all_ema_params_and_buffers(self):
+        for name, ma_params in self.get_params_iter(self.ema_model):
+            if name in self.ignore_names:
+                continue
+
+            if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+                continue
+
+            if name in self.param_or_buffer_names_no_ema:
+                continue
+
+            yield ma_params
+
+        for name, ma_buffer in self.get_buffers_iter(self.ema_model):
+            if name in self.ignore_names:
+                continue
+
+            if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+                continue
+
+            if name in self.param_or_buffer_names_no_ema:
+                continue
+
+            yield ma_buffer
+
+    @torch.no_grad()
+    def update_moving_average(self, ma_model, current_model):
+        if self.frozen:
+            return
+
+        # move ema model to online model device if not same and needed
+
+        if self.move_ema_to_online_device and get_module_device(ma_model) != get_module_device(current_model):
+            ma_model.to(get_module_device(current_model))
+
+        # get some functions and current decay
+
+        copy, lerp = self.inplace_copy, self.inplace_lerp
+        current_decay = self.beta
+
+        for (name, current_params), (_, ma_params) in zip(self.get_params_iter(current_model), self.get_params_iter(ma_model)):
+            if name in self.ignore_names:
+                continue
+
+            if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+                continue
+
+            if name in self.param_or_buffer_names_no_ema:
+                copy(ma_params.data, current_params.data)
+                continue
+
+            lerp(ma_params.data, current_params.data, 1. - current_decay)
+
+        for (name, current_buffer), (_, ma_buffer) in zip(self.get_buffers_iter(current_model), self.get_buffers_iter(ma_model)):
+            if name in self.ignore_names:
+                continue
+
+            if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
+                continue
+
+            if name in self.param_or_buffer_names_no_ema:
+                copy(ma_buffer.data, current_buffer.data)
+                continue
+
+            lerp(ma_buffer.data, current_buffer.data, 1. - current_decay)
+
+    def __call__(self, *args, **kwargs):
+        return self.ema_model(*args, **kwargs)
+
+# post hoc ema wrapper
+
+# solving of the weights for combining all checkpoints into a newly synthesized EMA at desired gamma
+# Algorithm 3 copied from paper, redone in torch
+
+def p_dot_p(t_a, gamma_a, t_b, gamma_b):
+    t_ratio = t_a / t_b
+    t_exp = torch.where(t_a < t_b , gamma_b , -gamma_a)
+    t_max = torch.maximum(t_a , t_b)
+    num = (gamma_a + 1) * (gamma_b + 1) * t_ratio ** t_exp
+    den = (gamma_a + gamma_b + 1) * t_max
+    return num / den
+
+def solve_weights(t_i, gamma_i, t_r, gamma_r):
+    rv = lambda x: x.double().reshape(-1, 1)
+    cv = lambda x: x.double().reshape(1, -1)
+    A = p_dot_p(rv(t_i), rv(gamma_i), cv(t_i), cv(gamma_i))
+    b = p_dot_p(rv(t_i), rv(gamma_i), cv(t_r), cv(gamma_r))
+    return torch.linalg.solve(A, b)
+
+class PostHocEMA(Module):
+
+    def __init__(
+        self,
+        model: Module,
+        sigma_rels: Tuple[float, ...] | None = None,
+        gammas: Tuple[float, ...] | None = None,
+        checkpoint_every_num_steps: int = 1000,
+        checkpoint_folder: str = './post-hoc-ema-checkpoints',
+        checkpoint_dtype: torch.dtype = torch.float16,
+        **kwargs
+    ):
+        super().__init__()
+        assert exists(sigma_rels) ^ exists(gammas)
+
+        if exists(sigma_rels):
+            gammas = tuple(map(sigma_rel_to_gamma, sigma_rels))
+
+        assert len(gammas) > 1, 'at least 2 ema models with different gammas in order to synthesize new ema models of a different gamma'
+        assert len(set(gammas)) == len(gammas), 'calculated gammas must be all unique'
+
+        self.gammas = gammas
+        self.num_ema_models = len(gammas)
+
+        self._model = [model]
+        self.ema_models = ModuleList([KarrasEMA(model, gamma = gamma, **kwargs) for gamma in gammas])
+
+        self.checkpoint_folder = Path(checkpoint_folder)
+        self.checkpoint_folder.mkdir(exist_ok = True, parents = True)
+        assert self.checkpoint_folder.is_dir()
+
+        self.checkpoint_every_num_steps = checkpoint_every_num_steps
+        self.checkpoint_dtype = checkpoint_dtype
+        self.ema_kwargs = kwargs
+
+    @property
+    def model(self):
+        return first(self._model)
+
+    @property
+    def step(self):
+        return first(self.ema_models).step
+
+    @property
+    def device(self):
+        return self.step.device
+
+    def copy_params_from_model_to_ema(self):
+        for ema_model in self.ema_models:
+            ema_model.copy_params_from_model_to_ema()
+
+    def copy_params_from_ema_to_model(self):
+        for ema_model in self.ema_models:
+            ema_model.copy_params_from_ema_to_model()
+
+    def update(self):
+        for ema_model in self.ema_models:
+            ema_model.update()
+
+        if not (self.step.item() % self.checkpoint_every_num_steps):
+            self.checkpoint()
+
+    def checkpoint(self):
+        step = self.step.item()
+
+        for ind, ema_model in enumerate(self.ema_models):
+            filename = f'{ind}.{step}.pt'
+            path = self.checkpoint_folder / filename
+
+            pkg = deepcopy(ema_model).to(self.checkpoint_dtype).state_dict()
+            torch.save(pkg, str(path))
+
+    def synthesize_ema_model(
+        self,
+        gamma: float | None = None,
+        sigma_rel: float | None = None,
+        step: int | None = None,
+    ) -> KarrasEMA:
+        assert exists(gamma) ^ exists(sigma_rel)
+        device = self.device
+
+        if exists(sigma_rel):
+            gamma = sigma_rel_to_gamma(sigma_rel)
+
+        synthesized_ema_model = KarrasEMA(
+            model = self.model,
+            gamma = gamma,
+            **self.ema_kwargs
+        )
+
+        synthesized_ema_model
+
+        # get all checkpoints
+
+        gammas = []
+        timesteps = []
+        checkpoints = [*self.checkpoint_folder.glob('*.pt')]
+
+        for file in checkpoints:
+            gamma_ind, timestep = map(int, file.stem.split('.'))
+            gammas.append(self.gammas[gamma_ind])
+            timesteps.append(timestep)
+
+        step = default(step, max(timesteps))
+        assert step <= max(timesteps), f'you can only synthesize for a timestep that is less than the max timestep {max(timesteps)}'
+
+        # line up with Algorithm 3
+
+        gamma_i = torch.tensor(gammas, device = device)
+        t_i = torch.tensor(timesteps, device = device)
+
+        gamma_r = torch.tensor([gamma], device = device)
+        t_r = torch.tensor([step], device = device)
+
+        # solve for weights for combining all checkpoints into synthesized, using least squares as in paper
+
+        weights = solve_weights(t_i, gamma_i, t_r, gamma_r)
+        weights = weights.squeeze(-1)
+
+        # now sum up all the checkpoints using the weights one by one
+
+        tmp_ema_model = KarrasEMA(
+            model = self.model,
+            gamma = gamma,
+            **self.ema_kwargs
+        )
+
+        for ind, (checkpoint, weight) in enumerate(zip(checkpoints, weights.tolist())):
+            is_first = ind == 0
+
+            # load checkpoint into a temporary ema model
+
+            ckpt_state_dict = torch.load(str(checkpoint), weights_only=True)
+            tmp_ema_model.load_state_dict(ckpt_state_dict)
+
+            # add weighted checkpoint to synthesized
+
+            for ckpt_tensor, synth_tensor in zip(tmp_ema_model.iter_all_ema_params_and_buffers(), synthesized_ema_model.iter_all_ema_params_and_buffers()):
+                if is_first:
+                    synth_tensor.zero_()
+
+                synth_tensor.add_(ckpt_tensor * weight)
+
+        # return the synthesized model
+
+        return synthesized_ema_model
+
+    def __call__(self, *args, **kwargs):
+        return tuple(ema_model(*args, **kwargs) for ema_model in self.ema_models)
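For reference, the two numeric helpers in the new file transcribe directly into the paper's notation (Karras et al. 2023, Algorithms 2 and 3); writing them out makes the code above easier to check:

```latex
% Algorithm 2: gamma from sigma_rel. With t = \sigma_{rel}^{-2}, gamma is the largest real root of
\gamma^3 + 7\gamma^2 + (16 - t)\,\gamma + (12 - t) = 0

% Algorithm 3: inner product of two EMA profiles, as computed by p_dot_p
\langle p_a, p_b \rangle
  = \frac{(\gamma_a + 1)(\gamma_b + 1)\,(t_a / t_b)^{e}}{(\gamma_a + \gamma_b + 1)\,\max(t_a, t_b)},
\qquad
e = \begin{cases} \gamma_b & \text{if } t_a < t_b \\ -\gamma_a & \text{otherwise} \end{cases}
```

`solve_weights` then solves the linear system built from these inner products, which gives the least-squares combination of the stored checkpoints at the requested gamma and timestep.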
{ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ema-pytorch
-Version: 0.3.1
+Version: 0.6.3
 Summary: Easy way to keep track of exponential moving average version of your pytorch module
 Home-page: https://github.com/lucidrains/ema-pytorch
 Author: Phil Wang
@@ -14,5 +14,4 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: beartype
-Requires-Dist: torch>=1.6
+Requires-Dist: torch>=2.0
ema_pytorch-0.6.3/ema_pytorch.egg-info/requires.txt (new file)

@@ -0,0 +1 @@
+torch>=2.0
{ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/setup.py

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'ema-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.3.1',
+  version = '0.6.3',
   license='MIT',
   description = 'Easy way to keep track of exponential moving average version of your pytorch module',
   author = 'Phil Wang',
@@ -16,8 +16,7 @@ setup(
     'exponential moving average'
   ],
   install_requires=[
-    'beartype',
-    'torch>=1.6',
+    'torch>=2.0',
   ],
   classifiers=[
     'Development Status :: 4 - Beta',
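Given the raised floor in `install_requires`, a hedged upgrade path (pip resolves the exact versions; beartype is simply no longer pulled in):

```bash
$ pip install -U ema-pytorch 'torch>=2.0'
```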
ema-pytorch-0.3.1/README.md (DELETED)

@@ -1,54 +0,0 @@
-## EMA - Pytorch
-
-A simple way to keep track of an Exponential Moving Average (EMA) version of your pytorch model
-
-## Install
-
-```bash
-$ pip install ema-pytorch
-```
-
-## Usage
-
-```python
-import torch
-from ema_pytorch import EMA
-
-# your neural network as a pytorch module
-
-net = torch.nn.Linear(512, 512)
-
-# wrap your neural network, specify the decay (beta)
-
-ema = EMA(
-    net,
-    beta = 0.9999,              # exponential moving average factor
-    update_after_step = 100,    # only after this number of .update() calls will it start updating
-    update_every = 10,          # how often to actually update, to save on compute (updates every 10th .update() call)
-)
-
-# mutate your network, with SGD or otherwise
-
-with torch.no_grad():
-    net.weight.copy_(torch.randn_like(net.weight))
-    net.bias.copy_(torch.randn_like(net.bias))
-
-# you will call the update function on your moving average wrapper
-
-ema.update()
-
-# then, later on, you can invoke the EMA model the same way as your network
-
-data = torch.randn(1, 512)
-
-output = net(data)
-ema_output = ema(data)
-
-# if you want to save your ema model, it is recommended you save the entire wrapper
-# as it contains the number of steps taken (there is a warmup logic in there, recommended by @crowsonkb, validated for a number of projects now)
-# however, if you wish to access the copy of your model with EMA, then it will live at ema.ema_model
-```
-
-## Todo
-
-- [ ] address the issue of annealing EMA to 1 near the end of training for BYOL https://github.com/lucidrains/byol-pytorch/issues/82
ema-pytorch-0.3.1/ema_pytorch/__init__.py (DELETED)

@@ -1 +0,0 @@
-from ema_pytorch.ema_pytorch import EMA
Files without changes:

- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/LICENSE
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/dependency_links.txt
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/ema_pytorch.egg-info/top_level.txt
- {ema-pytorch-0.3.1 → ema_pytorch-0.6.3}/setup.cfg