gradboard-0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gradboard might be problematic.

@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 nicholasbailey87
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,62 @@
+ Metadata-Version: 2.3
+ Name: gradboard
+ Version: 0.1.1
+ Summary: Easily snowboard down gnarly loss gradients
+ License: MIT
+ Author: Nicholas Bailey
+ Requires-Python: >=3.11
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: numpy (>=2.0.2,<3.0.0)
+ Requires-Dist: scipy (>=1.15.3,<2.0.0)
+ Description-Content-Type: text/markdown
+
+ # gradboard
+ ![snowboarder](snowboarder.png "Image of a snowboarder")
+
+ Easily snowboard down gnarly loss gradients
+
+ ## Getting started
+
+ You can install gradboard with
+
+ ```
+ pip install gradboard
+ ```
+
+ PyTorch is a peer dependency of `gradboard`, which means:
+
+ * You will need PyTorch installed in order to use `gradboard`
+ * PyTorch will **not** be installed automatically when you install `gradboard`
+
+ We take this approach because PyTorch versioning is environment-specific and
+ we don't know where you will want to use `gradboard`. If we automatically installed
+ PyTorch for you, there's a good chance we would get it wrong!
+
+ Therefore, please make sure you also install PyTorch.
+
+ ## Usage examples
+
+ ### Decent model training outcomes without tuning hyperparameters
+
+ `gradboard` includes
+
+ * An implementation of AdamS as proposed in Xie et al. (2023) "On the Overlooked
+   Pitfalls of Weight Decay and How to Mitigate Them: A Gradient-Norm
+   Perspective" (https://openreview.net/pdf?id=vnGcubtzR1), which in practice
+   makes model training more robust to the weight decay setting.
+ * Utilities for implementing popular learning rate schedules
+ * An implementation of an automatic max/min learning rate finder based on Smith
+   (2017) "Cyclical Learning Rates for Training Neural Networks"
+   (https://arxiv.org/abs/1506.01186)
+ * Sensible defaults
+
+ In practice this means that you can train a neural network and get decent performance
+ right out of the box, just by using `PASS` (the point-and-shoot scheduler), even
+ for unfamiliar architectures or problem domains.
@@ -0,0 +1,45 @@
+ # gradboard
+ ![snowboarder](snowboarder.png "Image of a snowboarder")
+
+ Easily snowboard down gnarly loss gradients
+
+ ## Getting started
+
+ You can install gradboard with
+
+ ```
+ pip install gradboard
+ ```
+
+ PyTorch is a peer dependency of `gradboard`, which means:
+
+ * You will need PyTorch installed in order to use `gradboard`
+ * PyTorch will **not** be installed automatically when you install `gradboard`
+
+ We take this approach because PyTorch versioning is environment-specific and
+ we don't know where you will want to use `gradboard`. If we automatically installed
+ PyTorch for you, there's a good chance we would get it wrong!
+
+ Therefore, please make sure you also install PyTorch.
+
+ ## Usage examples
+
+ ### Decent model training outcomes without tuning hyperparameters
+
+ `gradboard` includes
+
+ * An implementation of AdamS as proposed in Xie et al. (2023) "On the Overlooked
+   Pitfalls of Weight Decay and How to Mitigate Them: A Gradient-Norm
+   Perspective" (https://openreview.net/pdf?id=vnGcubtzR1), which in practice
+   makes model training more robust to the weight decay setting.
+ * Utilities for implementing popular learning rate schedules
+ * An implementation of an automatic max/min learning rate finder based on Smith
+   (2017) "Cyclical Learning Rates for Training Neural Networks"
+   (https://arxiv.org/abs/1506.01186)
+ * Sensible defaults
+
+ In practice this means that you can train a neural network and get decent performance
+ right out of the box, just by using `PASS` (the point-and-shoot scheduler), even
+ for unfamiliar architectures or problem domains.
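
To make the quick start concrete, here is a minimal sketch of an end-to-end training loop built from the modules in this release. Only the class names and the relative import `.cycles` are visible in this diff, so the module paths `gradboard.optimisers` and `gradboard.schedulers`, along with the toy model and data, are assumptions for illustration rather than the package's documented API.

```python
import torch
from torch import nn

from gradboard.cycles import Cycle                      # learning rate schedules
from gradboard.optimisers import AdamS, get_optimiser   # assumed module name
from gradboard.schedulers import PASS                   # assumed module name

# Toy regression problem standing in for a real model and dataset
model = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 1))
loss_fn = nn.MSELoss()

# One descending half-cosine over 10 epochs of 2,000 examples, batch size 50
schedule = Cycle("half_cosine", training_examples=2_000, epochs=10, batch_size=50)

# AdamS with weight decay switched off for biases, norms, embeddings, etc.
optimiser = get_optimiser(model, optimiser=AdamS)

# PASS starts a learning rate range test immediately (range_test=True is the default)
scheduler = PASS(schedule, model, optimiser)

while not scheduler.finished:
    x = torch.randn(50, 10)
    y = x.sum(dim=1, keepdim=True)
    optimiser.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimiser.step()
    # During the range test this records (lr, loss) and raises the lr by 5%;
    # once the loss diverges to NaN it restores the saved weights, picks
    # max_lr and cool_point, and then follows the half-cosine schedule
    scheduler.step(loss.item())
```
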
@@ -0,0 +1,277 @@
+ """
+ Utilities for generating a range of learning rate schedules.
+ """
+
+ import math
+ from typing import Optional, List, Union, Callable
+
+
+ def ascent(step: int, total_steps: int) -> float:
+     """
+     Get a sequence of numbers evenly spaced between 0 and 1 so that the first
+     number is 0 and the last is approximately 1 and there are `total_steps`
+     numbers in the sequence.
+     """
+     return round(step / (total_steps - 0.999), 8)
+
+
+ def triangle(step: int, total_steps: int) -> float:
+     """
+     Get a triangular sequence of numbers between 0 and 1, going up in half of
+     `total_steps` and coming down in the other half, peaking at ~1.
+     """
+     half = int(math.ceil(total_steps / 2))
+     if step < half:
+         return 2 * ascent(step, total_steps)
+     else:
+         return 2 - 2 * ascent(step, total_steps)
+
+
+ def cosine(step: int, total_steps: int) -> float:
+     """
+     Get a sequence of numbers between 0 and 1 in the shape of a cosine wave with
+     wavelength `total_steps`.
+     """
+     assert total_steps != 0
+     angle = (step / (total_steps - 0.999)) * (2 * math.pi)
+     return round((math.cos(angle) + 1) / 2, 8)
+
+
+ def half_cosine(step: int, total_steps: int) -> float:
+     """
+     Get a sequence of numbers between 0 and 1 in the shape of the descending
+     half of a cosine wave with wavelength 2*`total_steps`.
+     """
+     return cosine(step, (total_steps * 2) - 1)
+
+
+ def cycloid(step: int, total_steps: int) -> float:
+     """
+     Get a sequence of numbers between 0 and 1 in the shape of a cycloid with
+     circle diameter 1.0 and `total_steps/(2*math.pi)` steps per cycle.
+     """
+     x = step * (math.pi / (total_steps - 1))
+
+     def fx(t):
+         return 0.5 * (t - math.sin(t)) - x
+
+     def fx_prime(t):
+         return 0.5 - 0.5 * math.cos(t)
+
+     def fy_prime(t):
+         return 0.5 - 0.5 * -math.sin(t)
+
+     angle_estimate = 0.5 * x
+
+     # XXX: 200 iterations is too many! Use a more efficient root finding algorithm
+     for _ in range(200):
+         if abs(fx_prime(angle_estimate)) > 0.1:
+             update = fx(angle_estimate) / fx_prime(angle_estimate)
+         else:
+             update = fx(angle_estimate) / fy_prime(angle_estimate)
+         angle_estimate = angle_estimate - update
+
+     return 0.5 * (1 - math.cos(angle_estimate))
+
+
+ def half_cycloid(step: int, total_steps: int) -> float:
+     """
+     Get the descending half of a cycloid arch: a sequence of numbers going
+     from ~1 down to 0 over `total_steps` steps.
+     """
+     return cycloid(total_steps + step, 2 * total_steps)
+
+
+ class Cycle:
+     def __init__(
+         self,
+         generating_function: Union[str, Callable],
+         training_examples,
+         epochs,
+         batch_size,
+         t_0: Optional[int] = None,
+         t_mult: float = 1.0,
+         t_scale: float = 1.0,
+         low=0.0,
+         high=1.0,
+         reflect=False,
+     ):
+         self.training_examples = training_examples
+         self.epochs = epochs
+         self.batch_size = batch_size
+         self.total_steps = int(
+             epochs * (math.floor(training_examples / batch_size) + 1)
+         )
+
+         self.t_0 = (
+             t_0 * (training_examples / batch_size)
+             if t_0 is not None
+             else self.total_steps
+         )
+         self.t_mult = t_mult
+         self.t_scale = t_scale
+
+         self.low = low
+         self.high = high
+
+         self.reflect = reflect
+
+         if callable(generating_function):
+             self._generating_function = generating_function
+         elif generating_function == "ascent":
+             self._generating_function = ascent
+         elif generating_function == "triangle":
+             self._generating_function = triangle
+         elif generating_function == "cosine":
+             self._generating_function = cosine
+         elif generating_function == "half_cosine":
+             self._generating_function = half_cosine
+         elif generating_function == "half_cycloid":
+             self._generating_function = half_cycloid
+         else:
+             raise NotImplementedError(
+                 "`generating_function` must be a callable object or one of "
+                 '"ascent", "triangle", "cosine", "half_cosine" or "half_cycloid"'
+             )
+
+     def _get_window(self, step):
+         # Find which restart window `step` falls into, and the step index
+         # local to that window
+         windows = self._windows()
+         cumulative = [
+             sum([w[0] for w in windows][: i + 1]) for i in range(len(windows))
+         ]
+         position = None
+         local_step = None
+         for i, c in enumerate(cumulative):
+             if c > step:
+                 position = i
+                 local_step = step if i == 0 else step - cumulative[i - 1]
+                 break
+         window_width, window_height = windows[position]
+         return window_width, local_step, window_height
+
+     def _generate(self, step) -> float:
+         total_steps, step, scale = self._get_window(step)
+         y = self._generating_function(step, total_steps)
+         y = y * scale
+         y = 1 - y if self.reflect else y
+         return y * (self.high - self.low) + self.low
+
+     def __call__(self, n):
+         return self._generate(n)
+
+     def __len__(self):
+         return self.total_steps
+
+     def _windows(self):
+         assert self.t_mult > 0
+
+         # Get tile widths
+         widths = [self.t_0]
+         while True:
+             next_item = widths[-1] * self.t_mult
+             if sum(widths) + next_item <= self.total_steps:
+                 widths.append(next_item)
+             else:
+                 break
+         for i in range(1, len(widths)):
+             widths[i] = int(widths[i] * (self.total_steps / sum(widths)))
+         widths[-1] += self.total_steps - sum(widths)
+
+         # Get tile heights
+         heights = [1.0 * self.t_scale**i for i in range(len(widths))]
+
+         return list(zip(widths, heights, strict=True))
+
+     def stats(self) -> dict:
+         """
+         Returns a dict with the area under the schedule (as a fraction of the
+         area of a schedule held constant at its maximum), the fraction of
+         ascent and descent steps, and the average up/down gradients of a
+         learning rate schedule.
+         """
+         total_area = 0
+         max_area = 0
+         ascent_steps = 0
+         descent_steps = 0
+         total_up_gradient = 0
+         total_down_gradient = 0
+         total_gradient = 0
+         previous_lr = None
+         for s in range(self.total_steps):
+             height = self(s)
+             total_area += height
+             max_area += 1
+             if previous_lr is None:
+                 pass
+             elif previous_lr > height:
+                 descent_steps += 1
+                 total_down_gradient += height - previous_lr
+                 total_gradient += height - previous_lr
+             elif previous_lr < height:
+                 ascent_steps += 1
+                 total_up_gradient += height - previous_lr
+                 total_gradient += height - previous_lr
+             else:
+                 total_gradient += height
+             previous_lr = height
+         return {
+             "area": total_area / max_area,
+             "pc_ascent": round(ascent_steps / self.total_steps, 3),
+             "pc_descent": round(descent_steps / self.total_steps, 3),
+             "avg_up_gradient": round(
+                 total_up_gradient / ascent_steps if ascent_steps > 0 else 0.0, 3
+             ),
+             "avg_down_gradient": round(
+                 total_down_gradient / descent_steps if descent_steps > 0 else 0.0, 3
+             ),
+             "avg_gradient": round(-(self.high - self.low) / self.total_steps, 3),
+         }
+
+
+ class CycleProduct(Cycle):
+     def __init__(self, cycles: List[Cycle], reflect=False):
+         main_training_examples = cycles[0].training_examples
+         main_batch_size = cycles[0].batch_size
+
+         assert all(c.training_examples == main_training_examples for c in cycles)
+         assert all(c.batch_size == main_batch_size for c in cycles)
+
+         self.cycles = cycles
+         self.reflect = reflect
+
+         def generating_function(step: int, total_steps: int) -> float:
+             output = self.cycles[0](step)
+             for c in self.cycles[1:]:
+                 output *= c(step % c.total_steps)
+             return output
+
+         super().__init__(
+             generating_function=generating_function,
+             training_examples=self.cycles[0].training_examples,
+             epochs=self.cycles[0].epochs,
+             batch_size=self.cycles[0].batch_size,
+             reflect=reflect,
+         )
+
+
+ class CycleSequence:
+     def __init__(self, cycles: List[Cycle]):
+         self.total_steps = sum([c.total_steps for c in cycles])
+         self.cycles = cycles
+
+     def _generate(self, step):
+         cycle, step = self._get_cycle_and_step(step)
+         return self.cycles[cycle](step)
+
+     def _get_cycle_and_step(self, step):
+         cycle_lengths = [c.total_steps for c in self.cycles]
+         cumulative = [sum(cycle_lengths[: i + 1]) for i in range(len(cycle_lengths))]
+         cycle = None
+         local_step = None
+         for i, c in enumerate(cumulative):
+             if c > step:
+                 cycle = i
+                 local_step = step if i == 0 else step - cumulative[i - 1]
+                 break
+         return cycle, local_step
+
+     def __call__(self, step):
+         return self._generate(step)
+
+     def __len__(self):
+         return self.total_steps
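
A small sketch of how the pieces in this module compose. The import path `gradboard.cycles` is inferred from the relative import `.cycles` used by the scheduler module later in this diff; the dataset sizes are arbitrary placeholders.

```python
from gradboard.cycles import Cycle, CycleSequence  # assumed import path

# Linear warmup for 1 epoch, then a descending half-cosine for 9 epochs,
# over a dataset of 10,000 examples with batch size 100
warmup = Cycle("ascent", training_examples=10_000, epochs=1, batch_size=100)
decay = Cycle("half_cosine", training_examples=10_000, epochs=9, batch_size=100)
schedule = CycleSequence([warmup, decay])

# Schedules map a step index to a value between `low` and `high`
multipliers = [schedule(step) for step in range(len(schedule))]
print(multipliers[0], multipliers[100], multipliers[-1])  # ~0, ~1, ~0

# Cycle.stats() summarises a single cycle: area under the curve, the fraction
# of ascending/descending steps, and average up/down gradients
print(decay.stats())
```
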
@@ -0,0 +1,163 @@
+ import math
+ import torch
+ from torch.optim.optimizer import Optimizer
+ from torch.optim import AdamW
+
+
+ class AdamS(Optimizer):
+     r"""
+     Implements Adam with stable weight decay (AdamS) as proposed in
+     "On the Overlooked Pitfalls of Weight Decay and How to Mitigate Them:
+     A Gradient-Norm Perspective" (https://openreview.net/pdf?id=vnGcubtzR1).
+
+     This implementation is adapted from the repo
+     http://github.com/zeke-xie/stable-weight-decay-regularization/
+     blob/master/swd_optim/adams.py (MIT license ca. July 2025)
+
+     Arguments:
+         params (iterable): iterable of parameters to optimize or dicts defining
+             parameter groups
+         lr (float, optional): learning rate (default: 1e-3)
+         betas (Tuple[float, float], optional): coefficients used for computing
+             running averages of gradient and its square (default: (0.9, 0.999))
+         eps (float, optional): term added to the denominator to improve
+             numerical stability (default: 1e-8)
+         weight_decay (float, optional): weight decay coefficient (default: 1e-4)
+     """
+
+     def __init__(
+         self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4
+     ):
+         if not 0.0 <= lr:
+             raise ValueError("Invalid learning rate: {}".format(lr))
+         if not 0.0 <= eps:
+             raise ValueError("Invalid epsilon value: {}".format(eps))
+         if not 0.0 <= betas[0] < 1.0:
+             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+         if not 0.0 <= betas[1] < 1.0:
+             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+         if not 0.0 <= weight_decay:
+             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+         defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay}
+         super().__init__(params, defaults)
+
+     @torch.no_grad()
+     def step(self, closure=None):
+         """Performs a single optimization step.
+
+         Arguments:
+             closure (callable, optional): A closure that reevaluates the model
+                 and returns the loss.
+         """
+         loss = None
+         if closure is not None:
+             with torch.enable_grad():
+                 loss = closure()
+
+         param_size = 0
+         exp_avg_sq_hat_sum = 0.0
+
+         for group in self.param_groups:
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+                 param_size += p.numel()
+
+                 # Perform optimization step
+                 grad = p.grad
+                 if grad.is_sparse:
+                     raise RuntimeError("AdamS does not support sparse gradients")
+
+                 state = self.state[p]
+
+                 # State initialization
+                 if len(state) == 0:
+                     state["step"] = 0
+                     # Exponential moving average of gradient values
+                     state["exp_avg"] = torch.zeros_like(
+                         p, memory_format=torch.preserve_format
+                     )
+                     # Exponential moving average of squared gradient values
+                     state["exp_avg_sq"] = torch.zeros_like(
+                         p, memory_format=torch.preserve_format
+                     )
+
+                 beta1, beta2 = group["betas"]
+                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+
+                 state["step"] += 1
+                 bias_correction2 = 1 - beta2 ** state["step"]
+
+                 # Decay the first and second moment running average coefficient
+                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                 exp_avg_sq_hat = exp_avg_sq / bias_correction2
+
+                 exp_avg_sq_hat_sum += exp_avg_sq_hat.sum()
+
+         # Calculate the sqrt of the mean of all elements in exp_avg_sq_hat
+         exp_avg_mean_sqrt = math.sqrt(exp_avg_sq_hat_sum / param_size)
+
+         for group in self.param_groups:
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+
+                 state = self.state[p]
+
+                 # Perform stable weight decay
+                 if group["weight_decay"] != 0:
+                     p.data.mul_(
+                         1 - group["weight_decay"] * group["lr"] / exp_avg_mean_sqrt
+                     )
+
+                 beta1, beta2 = group["betas"]
+                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                 bias_correction1 = 1 - beta1 ** state["step"]
+                 bias_correction2 = 1 - beta2 ** state["step"]
+
+                 exp_avg_sq_hat = exp_avg_sq / bias_correction2
+
+                 denom = exp_avg_sq_hat.sqrt().add(group["eps"])
+
+                 step_size = group["lr"] / bias_correction1
+                 p.addcdiv_(exp_avg, denom, value=-step_size)
+
+                 # Make sure internal tensors are still leaf tensors
+                 # state['exp_avg'] = state['exp_avg'].detach()
+                 # state['exp_avg_sq'] = state['exp_avg_sq'].detach()
+
+         return loss
+
+
+ def get_optimiser(model, optimiser=AdamW, lr=7e-4, weight_decay=5e-2):
+     """
+     Build an optimiser with weight decay disabled for parameters (biases,
+     norms, embeddings, etc.) that usually should not be decayed.
+
+     Defaults are from one of the presets from the accompanying repo to Hassani
+     et al. (2023) "Escaping the Big Data Paradigm with Compact Transformers",
+     https://github.com/SHI-Labs/Compact-Transformers/blob/main/configs/
+     pretrained/cct_7-3x1_cifar100_1500epochs.yml
+     """
+     weight_decay_exclude = []
+     for keyword in [
+         "bias",
+         "norm",
+         "embedding",
+         "swiglu_beta",
+         "sigma",
+         "scale",
+         "input_query",
+         "reentrant_query",
+     ]:
+         weight_decay_exclude += [
+             p for name, p in model.named_parameters() if keyword in name.lower()
+         ]
+     weight_decay_exclude = set(weight_decay_exclude)
+     weight_decay_include = set(model.parameters()) - weight_decay_exclude
+     return optimiser(
+         [
+             {"params": list(weight_decay_include)},
+             {"params": list(weight_decay_exclude), "weight_decay": 0.0},
+         ],
+         weight_decay=weight_decay,
+         lr=lr,
+     )
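
As a sketch of how `get_optimiser` groups parameters, here is a hypothetical example (the module path `gradboard.optimisers` is an assumption, and the tiny model is a placeholder). Parameters whose names contain keywords such as `bias` or `norm` end up in a group with weight decay disabled:

```python
import torch
from torch import nn

from gradboard.optimisers import AdamS, get_optimiser  # assumed module path


class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_in = nn.Linear(16, 32)
        self.norm = nn.LayerNorm(32)
        self.linear_out = nn.Linear(32, 1)

    def forward(self, x):
        return self.linear_out(self.norm(torch.relu(self.linear_in(x))))


model = TinyNet()
optimiser = get_optimiser(model, optimiser=AdamS, lr=7e-4, weight_decay=5e-2)

# Two parameter groups: weights with weight decay 5e-2, and biases plus the
# LayerNorm parameters with weight decay 0.0
for group in optimiser.param_groups:
    print(len(group["params"]), group["weight_decay"])

# A single optimisation step works like any other torch optimiser
loss = model(torch.randn(8, 16)).pow(2).mean()
loss.backward()
optimiser.step()
```
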
@@ -0,0 +1,212 @@
+ """
+ Based on Smith (2017) https://arxiv.org/abs/1506.01186
+ """
+
+ from typing import Optional
+ import copy
+ import math
+
+ from scipy.ndimage import gaussian_filter1d
+
+ from torch.amp import GradScaler
+
+ from .cycles import Cycle
+
+
+ class PASS:
+     """
+     A self-configuring learning rate scheduler
+     """
+
+     def __init__(
+         self,
+         learning_rate_schedule: Cycle,
+         model,
+         optimiser,
+         scaler: Optional[GradScaler] = None,
+         range_test: bool = True,
+         max_lr: Optional[float] = None,
+         cool_point: Optional[float] = None,
+     ):
+         # Either run a range test, or supply both max_lr and cool_point by hand
+         assert (max_lr is not None) == (cool_point is not None)
+         assert ((max_lr is not None) and (cool_point is not None)) != range_test
+
+         self.model = model
+         self.optimiser = optimiser
+         self.scaler = scaler
+
+         self.learning_rate_schedule = learning_rate_schedule
+
+         self.range_test = range_test
+
+         self.max_lr = max_lr
+         self.cool_point = cool_point
+
+         self.original_states = self._saved_states()
+
+         self.range_test_results = []
+
+         self.step_count = 0
+
+         if range_test:
+             self.start_range_test()  # sets LR to 1E-7
+
+     @property
+     def lr(self):
+         """
+         Return the first lr from self.optimiser.param_groups
+         (we assume they are all the same!)
+         """
+         for group in self.optimiser.param_groups:
+             return group["lr"]
+
+     @property
+     def in_range_test(self):
+         if not self.range_test:
+             return False
+         elif (len(self.range_test_results) == 0) or (
+             not math.isnan(self.range_test_results[-1][1])
+         ):
+             return True
+         else:
+             return False
+
+     @property
+     def trained(self):
+         if not self.range_test:
+             return True
+         elif math.isnan(self.range_test_results[-1][1]):
+             return True
+         else:
+             return False
+
+     @property
+     def finished(self):
+         return self.step_count >= len(self.learning_rate_schedule) - 1
+
+     def _saved_states(self):
+         saved_states = {
+             "model": copy.deepcopy(self.model.state_dict()),
+             "optimiser": copy.deepcopy(self.optimiser.state_dict()),
+         }
+         if self.scaler is not None:
+             saved_states["scaler"] = copy.deepcopy(self.scaler.state_dict())
+         return saved_states
+
+     def save_states(self):
+         self.saved_states = self._saved_states()
+
+     def load_states(self, saved_states):
+         self.model.load_state_dict(saved_states["model"])
+         self.optimiser.load_state_dict(saved_states["optimiser"])
+         if self.scaler is not None:
+             self.scaler.load_state_dict(saved_states["scaler"])
+
+     def recover_states(self):
+         self.load_states(self.saved_states)
+
+     @property
+     def _schedule_lr(self):
+         return (
+             self.learning_rate_schedule(
+                 min(self.step_count, self.learning_rate_schedule.total_steps)
+             )
+             * (self.max_lr - self.cool_point)
+             + self.cool_point
+         )
+
+     def set_lr(self, lr):
+         for group in self.optimiser.param_groups:
+             group["lr"] = lr
+
+     def scale_lr(self, scaling_factor):
+         self.set_lr(self.lr * scaling_factor)
+
+     def start_range_test(self):
+         self.save_states()
+         self.optimiser.load_state_dict(self.original_states["optimiser"])
+         if self.scaler is not None:
+             self.scaler.load_state_dict(self.original_states["scaler"])
+         self.set_lr(1e-7)
+
+     def end_range_test(self):
+         self.recover_states()
+         self.update_learning_rates()
+
+     def _smoothed_range_test(self, range_test_results):
+         range_test_results = sorted(range_test_results, key=lambda x: x[0])
+         learning_rates = [t[0] for t in range_test_results]
+         losses = [t[1] for t in range_test_results]
+         # The final loss (at the highest learning rate) is NaN, which is what
+         # ended the range test, so replace it with a large finite value
+         losses = losses[:-1] + [10 * max(losses[:-1])]
+         smoothed_losses = gaussian_filter1d(losses, 3)
+         return list(zip(learning_rates, smoothed_losses, strict=True))
+
+     def _plot_range_test(self, range_test_results):
+         """
+         Returns a tuple with x values (learning rates) and y values (losses)
+         which can then be passed to e.g. pyplot. We recommend presenting
+         the plot with a logarithmic x axis.
+         """
+         range_test_results = sorted(range_test_results, key=lambda x: x[0])
+         learning_rates = [t[0] for t in range_test_results]
+         losses = [t[1] for t in range_test_results]
+         return learning_rates, losses
+
+     def _apply_range_test_result(self):
+         """
+         Pick max_lr and cool_point from the smoothed range test curve: find the
+         learning rate at which the smoothed loss has fallen halfway from its
+         pre-minimum peak to its minimum, then set max_lr to three times that
+         learning rate and cool_point to a third of it.
+         """
+         range_test_results = self._smoothed_range_test(self.range_test_results)
+
+         minimum = min(range_test_results, key=lambda x: x[1])
+         points_left_of_min = [p for p in range_test_results if p[0] < minimum[0]]
+         highest_point_left_of_min = max(points_left_of_min, key=lambda x: x[1])
+         halfway = (highest_point_left_of_min[1] + minimum[1]) / 2
+         for r in range_test_results:
+             if r[1] < halfway:
+                 self.max_lr = r[0] * 3
+                 self.cool_point = r[0] / 3
+                 print("High LR", self.max_lr)
+                 print("Cool point", self.cool_point)
+                 break
+
+     def update_learning_rates(self):
+         if self.finished:
+             pass
+         else:
+             self.set_lr(self._schedule_lr)
+
+     def _append_to_range_test(self, loss_item: float):
+         self.range_test_results.append((self.lr, loss_item))
+
+         if math.isnan(loss_item):
+             self._apply_range_test_result()
+             self.end_range_test()
+         else:
+             # Continue range test, step up learning rate
+             self.scale_lr(1.05)
+
+     def step(self, loss_item: float):
+         """
+         This function manages the process of
+         * Doing an initial range test
+         * Training for one microcycle using the learning rates from the
+           initial range test ("burn in")
+         * Doing a second range test to set the learning rate schedule for
+           the rest of training
+         * Updating learning rates during training according to the macrocycle
+         """
+         if self.in_range_test:  # True at init unless self.range_test = False
+             assert self.step_count == 0  # No weight updates yet
+             self._append_to_range_test(loss_item)
+         elif self.trained and not self.finished:
+             self.step_count += 1
+             self.update_learning_rates()
+         else:
+             pass
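
If you already know a good learning rate range, a range test is unnecessary. The sketch below skips it and drives the schedule directly between `cool_point` and `max_lr`; the module paths `gradboard.schedulers` and `gradboard.optimisers`, the toy model and the data are assumptions for illustration.

```python
import torch
from torch import nn

from gradboard.cycles import Cycle
from gradboard.optimisers import AdamS, get_optimiser  # assumed module path
from gradboard.schedulers import PASS                  # assumed module path

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
optimiser = get_optimiser(model, optimiser=AdamS)
schedule = Cycle("triangle", training_examples=1_000, epochs=5, batch_size=50)

# No range test: supply max_lr and cool_point yourself
scheduler = PASS(
    schedule, model, optimiser, range_test=False, max_lr=3e-3, cool_point=3e-4
)

loss_fn = nn.MSELoss()
while not scheduler.finished:
    x = torch.randn(50, 10)
    y = x.sum(dim=1, keepdim=True)
    optimiser.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    optimiser.step()
    scheduler.step(loss.item())  # rescales the schedule into [cool_point, max_lr]
```
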
@@ -0,0 +1,39 @@
+ [project]
+ name = "gradboard"
+ version = "0.1.1"
+ description = "Easily snowboard down gnarly loss gradients"
+ authors = [
+     {name = "Nicholas Bailey"}
+ ]
+ license = {text = "MIT"}
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "numpy (>=2.0.2,<3.0.0)",
+     "scipy (>=1.15.3,<2.0.0)"
+ ]
+
+ [tool.poetry]
+
+ [tool.poetry.group.dev.dependencies]
+ black = "^25.1.0"
+ flake8 = "7.3.0"
+ pytest = "^8.4.1"
+ pytest-cov = "^6.2.1"
+
+ [tool.black]
+ line-length = 88
+ target-version = ['py312']
+ include = '\.pyi?$'
+ extend-exclude = '''
+ # A regex preceded with ^/ will apply only to files and directories
+ # in the root of the project.
+ (
+   ^/foo.py     # exclude a file named foo.py in the root of the project
+   | .*_pb2.py  # exclude autogenerated Protocol Buffer files anywhere in the project
+ )
+ '''
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"