nshtrainer-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. nshtrainer/__init__.py +64 -0
  2. nshtrainer/_experimental/__init__.py +2 -0
  3. nshtrainer/_experimental/flops/__init__.py +48 -0
  4. nshtrainer/_experimental/flops/flop_counter.py +787 -0
  5. nshtrainer/_experimental/flops/module_tracker.py +140 -0
  6. nshtrainer/_snoop.py +216 -0
  7. nshtrainer/_submit/print_environment_info.py +31 -0
  8. nshtrainer/_submit/session/_output.py +12 -0
  9. nshtrainer/_submit/session/_script.py +109 -0
  10. nshtrainer/_submit/session/lsf.py +467 -0
  11. nshtrainer/_submit/session/slurm.py +573 -0
  12. nshtrainer/_submit/session/unified.py +350 -0
  13. nshtrainer/actsave/__init__.py +7 -0
  14. nshtrainer/actsave/_callback.py +75 -0
  15. nshtrainer/actsave/_loader.py +144 -0
  16. nshtrainer/actsave/_saver.py +337 -0
  17. nshtrainer/callbacks/__init__.py +35 -0
  18. nshtrainer/callbacks/_throughput_monitor_callback.py +549 -0
  19. nshtrainer/callbacks/base.py +113 -0
  20. nshtrainer/callbacks/early_stopping.py +112 -0
  21. nshtrainer/callbacks/ema.py +383 -0
  22. nshtrainer/callbacks/finite_checks.py +75 -0
  23. nshtrainer/callbacks/gradient_skipping.py +103 -0
  24. nshtrainer/callbacks/interval.py +322 -0
  25. nshtrainer/callbacks/latest_epoch_checkpoint.py +45 -0
  26. nshtrainer/callbacks/log_epoch.py +35 -0
  27. nshtrainer/callbacks/norm_logging.py +187 -0
  28. nshtrainer/callbacks/on_exception_checkpoint.py +44 -0
  29. nshtrainer/callbacks/print_table.py +90 -0
  30. nshtrainer/callbacks/throughput_monitor.py +56 -0
  31. nshtrainer/callbacks/timer.py +157 -0
  32. nshtrainer/callbacks/wandb_watch.py +103 -0
  33. nshtrainer/config.py +289 -0
  34. nshtrainer/data/__init__.py +4 -0
  35. nshtrainer/data/balanced_batch_sampler.py +132 -0
  36. nshtrainer/data/transform.py +67 -0
  37. nshtrainer/lr_scheduler/__init__.py +18 -0
  38. nshtrainer/lr_scheduler/_base.py +101 -0
  39. nshtrainer/lr_scheduler/linear_warmup_cosine.py +138 -0
  40. nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +73 -0
  41. nshtrainer/model/__init__.py +44 -0
  42. nshtrainer/model/base.py +641 -0
  43. nshtrainer/model/config.py +2064 -0
  44. nshtrainer/model/modules/callback.py +157 -0
  45. nshtrainer/model/modules/debug.py +42 -0
  46. nshtrainer/model/modules/distributed.py +70 -0
  47. nshtrainer/model/modules/logger.py +170 -0
  48. nshtrainer/model/modules/profiler.py +24 -0
  49. nshtrainer/model/modules/rlp_sanity_checks.py +202 -0
  50. nshtrainer/model/modules/shared_parameters.py +72 -0
  51. nshtrainer/nn/__init__.py +19 -0
  52. nshtrainer/nn/mlp.py +106 -0
  53. nshtrainer/nn/module_dict.py +66 -0
  54. nshtrainer/nn/module_list.py +50 -0
  55. nshtrainer/nn/nonlinearity.py +157 -0
  56. nshtrainer/optimizer.py +62 -0
  57. nshtrainer/runner.py +21 -0
  58. nshtrainer/scripts/check_env.py +41 -0
  59. nshtrainer/scripts/find_packages.py +51 -0
  60. nshtrainer/trainer/__init__.py +1 -0
  61. nshtrainer/trainer/signal_connector.py +208 -0
  62. nshtrainer/trainer/trainer.py +340 -0
  63. nshtrainer/typecheck.py +144 -0
  64. nshtrainer/util/environment.py +119 -0
  65. nshtrainer/util/seed.py +11 -0
  66. nshtrainer/util/singleton.py +89 -0
  67. nshtrainer/util/slurm.py +49 -0
  68. nshtrainer/util/typed.py +2 -0
  69. nshtrainer/util/typing_utils.py +19 -0
  70. nshtrainer-0.1.0.dist-info/METADATA +18 -0
  71. nshtrainer-0.1.0.dist-info/RECORD +72 -0
  72. nshtrainer-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,549 @@
+ # type: ignore
+ # Copyright The Lightning AI team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import time
+ from collections import deque
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Deque,
+     Dict,
+     List,
+     Optional,
+     TypeVar,
+     Union,
+ )
+
+ import torch
+ from lightning.fabric.plugins import Precision as FabricPrecision
+ from lightning.fabric.utilities.throughput import (
+     _plugin_to_compute_dtype as fabric_plugin_to_compute_dtype,
+ )
+ from lightning.fabric.utilities.throughput import get_available_flops
+ from lightning.pytorch.callbacks import Callback
+ from lightning.pytorch.plugins import (
+     BitsandbytesPrecision,
+     DeepSpeedPrecision,
+     DoublePrecision,
+     FSDPPrecision,
+     HalfPrecision,
+     MixedPrecision,
+     Precision,
+     TransformerEnginePrecision,
+     XLAPrecision,
+ )
+ from lightning.pytorch.trainer.states import RunningStage, TrainerFn
+ from lightning.pytorch.utilities.rank_zero import rank_zero_only, rank_zero_warn
+ from typing_extensions import override
+
+ if TYPE_CHECKING:
+     from lightning.pytorch import LightningModule, Trainer
+
+
+ _THROUGHPUT_METRICS = Dict[str, Union[int, float]]
+
+ T = TypeVar("T", bound=float)
+
+
+ class _MonotonicWindow(List[T]):
+     """Custom fixed size list that only supports right-append and ensures that all values increase monotonically."""
+
+     def __init__(self, maxlen: int) -> None:
+         super().__init__()
+         self.maxlen = maxlen
+
+     @property
+     def last(self) -> Optional[T]:
+         if len(self) > 0:
+             return self[-1]
+         return None
+
+     @override
+     def append(self, x: T) -> None:
+         last = self.last
+         if last is not None and last >= x:
+             rank_zero_warn(
+                 f"Expected the value to increase, last: {last}, current: {x}"
+             )
+         list.append(self, x)
+         # truncate excess
+         if len(self) > self.maxlen:
+             del self[0]
+
+     @override
+     def __setitem__(self, key: Any, value: Any) -> None:
+         # assigning is not implemented since we don't use it. it could be by checking all previous values
+         raise NotImplementedError("__setitem__ is not supported")
+
+
+ class Throughput:
+     """Computes throughput.
+
+     +------------------------+-------------------------------------------------------------------------------------+
+     | Key | Value |
+     +========================+=====================================================================================+
+     | batches_per_sec | Rolling average (over ``window_size`` most recent updates) of the number of batches |
+     | | processed per second |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | samples_per_sec | Rolling average (over ``window_size`` most recent updates) of the number of samples |
+     | | processed per second |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | items_per_sec | Rolling average (over ``window_size`` most recent updates) of the number of items |
+     | | processed per second |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | flops_per_sec | Rolling average (over ``window_size`` most recent updates) of the number of flops |
+     | | processed per second |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | device/batches_per_sec | batches_per_sec divided by world size |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | device/samples_per_sec | samples_per_sec divided by world size |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | device/items_per_sec | items_per_sec divided by world size. This may include padding depending on the data |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | device/flops_per_sec | flops_per_sec divided by world size |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | device/mfu | device/flops_per_sec divided by available_flops |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | time | Total elapsed time |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | batches | Total batches seen |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | samples | Total samples seen |
+     +--------------------------+-----------------------------------------------------------------------------------+
+     | lengths | Total items seen |
+     +--------------------------+-----------------------------------------------------------------------------------+
+
+     Example::
+
+         throughput = Throughput()
+         t0 = time()
+         for i in range(1000):
+             do_work()
+             if torch.cuda.is_available(): torch.cuda.synchronize()  # required or else time() won't be correct
+             throughput.update(time=time() - t0, samples=i)
+             if i % 10 == 0:
+                 print(throughput.compute())
+
+     Notes:
+         - The implementation assumes that all devices have the same FLOPs, since it normalizes by the world size and
+           only takes a single ``available_flops`` value.
+         - items_per_sec, flops_per_sec and MFU do not account for padding if present. We suggest using
+           samples_per_sec or batches_per_sec to measure throughput under this circumstance.
+
+     Args:
+         available_flops: Number of theoretical flops available for a single device.
+         world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
+         window_size: Number of batches to use for a rolling average.
+         separator: Key separator to use when creating per-device and global metrics.
+
+     """
+
+     def __init__(
+         self,
+         available_flops: Optional[float] = None,
+         world_size: int = 1,
+         window_size: int = 100,
+         separator: str = "/",
+     ) -> None:
+         self.available_flops = available_flops
+         self.separator = separator
+         assert world_size > 0
+         self.world_size = world_size
+
+         # throughput is computed over a window of values. at least 2 is enforced since it looks at the difference
+         # between the first and last elements
+         assert window_size > 1
+         # custom class instead of `deque(maxlen=)` because it's easy for users to mess up their timer/counters and log
+         # values that do not increase monotonically. this class will warn if that happens.
+         self._time: _MonotonicWindow[float] = _MonotonicWindow(maxlen=window_size)
+         self._batches: _MonotonicWindow[int] = _MonotonicWindow(maxlen=window_size)
+         self._samples: _MonotonicWindow[int] = _MonotonicWindow(maxlen=window_size)
+         self._lengths: _MonotonicWindow[int] = _MonotonicWindow(maxlen=window_size)
+         self._flops: Deque[int] = deque(maxlen=window_size)
+
+     def update(
+         self,
+         *,
+         time: float,
+         batches: int,
+         samples: int,
+         lengths: Optional[int] = None,
+         flops: Optional[int] = None,
+     ) -> None:
+         """Update throughput metrics.
+
+         Args:
+             time: Total elapsed time in seconds. It should monotonically increase by the iteration time with each
+                 call.
+             batches: Total batches seen per device. It should monotonically increase with each call.
+             samples: Total samples seen per device. It should monotonically increase by the batch size with each call.
+             lengths: Total length of the samples seen. It should monotonically increase by the lengths of a batch with
+                 each call.
+             flops: Flops elapsed per device since the last ``update()`` call. You can easily compute this by using
+                 :func:`measure_flops` and multiplying it by the number of batches that have been processed.
+                 The value might be different on each device if the batch size is not the same.
+
+         """
+         self._time.append(time)
+         if samples < batches:
+             raise ValueError(
+                 f"Expected samples ({samples}) to be greater than or equal to batches ({batches})"
+             )
+         self._batches.append(batches)
+         self._samples.append(samples)
+         if lengths is not None:
+             if lengths < samples:
+                 raise ValueError(
+                     f"Expected lengths ({lengths}) to be greater than or equal to samples ({samples})"
+                 )
+             self._lengths.append(lengths)
+             if len(self._samples) != len(self._lengths):
+                 raise RuntimeError(
+                     f"If lengths are passed ({len(self._lengths)}), there needs to be the same number of samples"
+                     f" ({len(self._samples)})"
+                 )
+         if flops is not None:
+             # sum of flops across ranks
+             self._flops.append(flops * self.world_size)
+
+     def compute(self) -> _THROUGHPUT_METRICS:
+         """Compute throughput metrics."""
+         metrics = {
+             "time": self._time[-1],
+             "batches": self._batches[-1],
+             "samples": self._samples[-1],
+         }
+         if self._lengths:
+             metrics["lengths"] = self._lengths[-1]
+
+         add_global_metrics = self.world_size > 1
+         # a different but valid design choice would be to still compute all these metrics even if the window of values
+         # has not been filled
+         if len(self._time) == self._time.maxlen:
+             elapsed_time = self._time[-1] - self._time[0]
+             elapsed_batches = self._batches[-1] - self._batches[0]
+             elapsed_samples = self._samples[-1] - self._samples[0]
+             # we are safe from ZeroDivisionError thanks to `_MonotonicWindow`
+             dev_samples_per_sec = elapsed_samples / elapsed_time
+             dev_batches_per_sec = elapsed_batches / elapsed_time
+             metrics.update(
+                 {
+                     f"device{self.separator}batches_per_sec": dev_batches_per_sec,
+                     f"device{self.separator}samples_per_sec": dev_samples_per_sec,
+                 }
+             )
+             if add_global_metrics:
+                 metrics.update(
+                     {
+                         "batches_per_sec": dev_batches_per_sec * self.world_size,
+                         "samples_per_sec": dev_samples_per_sec * self.world_size,
+                     }
+                 )
+
+             if len(self._lengths) == self._lengths.maxlen:
+                 elapsed_lengths = self._lengths[-1] - self._lengths[0]
+                 dev_items_per_sec = elapsed_lengths / elapsed_time
+                 metrics[f"device{self.separator}items_per_sec"] = dev_items_per_sec
+                 if add_global_metrics:
+                     items_per_sec = dev_items_per_sec * self.world_size
+                     metrics["items_per_sec"] = items_per_sec
+
+         if len(self._flops) == self._flops.maxlen:
+             elapsed_flops = sum(self._flops) - self._flops[0]
+             elapsed_time = self._time[-1] - self._time[0]
+             flops_per_sec = elapsed_flops / elapsed_time
+             dev_flops_per_sec = flops_per_sec / self.world_size
+             if add_global_metrics:
+                 metrics["flops_per_sec"] = flops_per_sec
+             metrics[f"device{self.separator}flops_per_sec"] = dev_flops_per_sec
+             if self.available_flops:
+                 metrics[f"device{self.separator}mfu"] = (
+                     dev_flops_per_sec / self.available_flops
+                 )
+
+         return metrics
+
+     def reset(self) -> None:
+         self._time.clear()
+         self._batches.clear()
+         self._samples.clear()
+         self._lengths.clear()
+         self._flops.clear()
+
+
+ class ThroughputMonitor(Callback):
+     r"""Computes and logs throughput with the :class:`~lightning.fabric.utilities.throughput.Throughput`
+
+     Example::
+
+         class MyModel(LightningModule):
+             def setup(self, stage):
+                 with torch.device("meta"):
+                     model = MyModel()
+
+                     def sample_forward():
+                         batch = torch.randn(..., device="meta")
+                         return model(batch)
+
+                     self.flops_per_batch = measure_flops(model, sample_forward, loss_fn=torch.Tensor.sum)
+
+
+         logger = ...
+         throughput = ThroughputMonitor(batch_size_fn=lambda batch: batch.size(0))
+         trainer = Trainer(max_steps=1000, log_every_n_steps=10, callbacks=throughput, logger=logger)
+         model = MyModel()
+         trainer.fit(model)
+
+     Notes:
+         - It assumes that the batch size is the same during all iterations.
+         - It will try to access a ``flops_per_batch`` attribute on your ``LightningModule`` on every iteration.
+           We suggest using the :func:`~lightning.fabric.utilities.throughput.measure_flops` function for this.
+           You might want to compute it differently each time based on your setup.
+
+     Args:
+         batch_size_fn: A function to compute the number of samples given a batch.
+         length_fn: A function to compute the number of items in a sample given a batch.
+         \**kwargs: See available parameters in
+             :class:`~lightning.fabric.utilities.throughput.Throughput`
+
+     """
+
+     def __init__(
+         self,
+         batch_size_fn: Callable[[Any], int],
+         length_fn: Optional[Callable[[Any], int | None]] = None,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__()
+         self.kwargs = kwargs
+         self.batch_size_fn = batch_size_fn
+         self.length_fn = length_fn
+         self.available_flops: Optional[int] = None
+         self._throughputs: Dict[RunningStage, Throughput] = {}
+         self._t0s: Dict[RunningStage, float] = {}
+         self._samples: Dict[RunningStage, int] = {}
+         self._lengths: Dict[RunningStage, int] = {}
+
+     @override
+     def setup(
+         self, trainer: "Trainer", pl_module: "LightningModule", stage: str
+     ) -> None:
+         dtype = _plugin_to_compute_dtype(trainer.precision_plugin)
+         self.available_flops = get_available_flops(trainer.strategy.root_device, dtype)
+
+         if stage == TrainerFn.FITTING and trainer.enable_validation:
+             # `fit` includes validation inside
+             throughput = Throughput(
+                 available_flops=self.available_flops,
+                 world_size=trainer.world_size,
+                 **self.kwargs,
+             )
+             self._throughputs[RunningStage.VALIDATING] = throughput
+
+         throughput = Throughput(
+             available_flops=self.available_flops,
+             world_size=trainer.world_size,
+             **self.kwargs,
+         )
+         stage = trainer.state.stage
+         assert stage is not None
+         self._throughputs[stage] = throughput
+
+     def _start(self, trainer: "Trainer") -> None:
+         stage = trainer.state.stage
+         assert stage is not None
+         self._throughputs[stage].reset()
+         self._samples[stage] = 0
+         self._lengths[stage] = 0
+         self._t0s[stage] = time.perf_counter()
+
+     def _update(
+         self,
+         trainer: "Trainer",
+         pl_module: "LightningModule",
+         batch: Any,
+         iter_num: int,
+     ) -> None:
+         stage = trainer.state.stage
+         assert stage is not None
+         throughput = self._throughputs[stage]
+
+         if trainer.strategy.root_device.type == "cuda":
+             # required or else perf_counter() won't be correct
+             torch.cuda.synchronize()
+
+         elapsed = time.perf_counter() - self._t0s[stage]
+         if self.length_fn is not None:
+             with torch.inference_mode():
+                 if (length := self.length_fn(batch)) is not None:
+                     self._lengths[stage] += length
+
+         if hasattr(pl_module, "flops_per_batch"):
+             flops_per_batch = pl_module.flops_per_batch
+         else:
+             rank_zero_warn(
+                 "When using the `ThroughputMonitor`, you need to define a `flops_per_batch` attribute or property"
+                 f" in {type(pl_module).__name__} to compute the FLOPs."
+             )
+             flops_per_batch = None
+
+         with torch.inference_mode():
+             self._samples[stage] += self.batch_size_fn(batch)
+
+         throughput.update(
+             time=elapsed,
+             batches=iter_num,
+             samples=self._samples[stage],
+             lengths=None if self.length_fn is None else self._lengths[stage],
+             flops=flops_per_batch,
+         )
+
+     def _compute(self, trainer: "Trainer", iter_num: Optional[int] = None) -> None:
+         if not trainer._logger_connector.should_update_logs:
+             return
+         stage = trainer.state.stage
+         assert stage is not None
+         throughput = self._throughputs[stage]
+         metrics = throughput.compute()
+         # prefix with the stage to avoid collisions
+         metrics = {
+             f"{stage.value}{throughput.separator}{k}": v for k, v in metrics.items()
+         }
+         trainer._logger_connector.log_metrics(metrics, step=iter_num)  # type: ignore[arg-type]
+
+     @override
+     @rank_zero_only
+     def on_train_start(self, trainer: "Trainer", *_: Any) -> None:
+         self._start(trainer)
+
+     @override
+     @rank_zero_only
+     def on_train_batch_end(
+         self,
+         trainer: "Trainer",
+         pl_module: "LightningModule",
+         outputs: Any,
+         batch: Any,
+         *_: Any,
+     ) -> None:
+         self._update(trainer, pl_module, batch, trainer.fit_loop.total_batch_idx + 1)
+         # log only when gradient accumulation is over. this ensures that we only measure when the effective batch has
+         # finished and the `optimizer.step()` time is included
+         if not trainer.fit_loop._should_accumulate():
+             self._compute(trainer)
+
+     @override
+     @rank_zero_only
+     def on_validation_start(self, trainer: "Trainer", *_: Any) -> None:
+         if trainer.sanity_checking:
+             return
+         self._start(trainer)
+
+     @override
+     @rank_zero_only
+     def on_validation_batch_end(
+         self,
+         trainer: "Trainer",
+         pl_module: "LightningModule",
+         outputs: Any,
+         batch: Any,
+         *_: Any,
+         **__: Any,
+     ) -> None:
+         if trainer.sanity_checking:
+             return
+         iter_num = trainer._evaluation_loop.batch_progress.total.ready
+         self._update(trainer, pl_module, batch, iter_num)
+         self._compute(trainer, iter_num)
+
+     @override
+     @rank_zero_only
+     def on_validation_end(self, trainer: "Trainer", *_: Any) -> None:
+         if trainer.sanity_checking or trainer.state.fn != TrainerFn.FITTING:
+             return
+         # add the validation time to the training time before continuing to avoid sinking the training throughput
+         training_finished = self._t0s[RunningStage.TRAINING] + sum(
+             self._throughputs[RunningStage.TRAINING]._time
+         )
+         time_between_train_and_val = (
+             self._t0s[RunningStage.VALIDATING] - training_finished
+         )
+         val_time = sum(self._throughputs[RunningStage.VALIDATING]._time)
+         self._t0s[RunningStage.TRAINING] += time_between_train_and_val + val_time
+
+     @override
+     @rank_zero_only
+     def on_test_start(self, trainer: "Trainer", *_: Any) -> None:
+         self._start(trainer)
+
+     @override
+     @rank_zero_only
+     def on_test_batch_end(
+         self,
+         trainer: "Trainer",
+         pl_module: "LightningModule",
+         outputs: Any,
+         batch: Any,
+         *_: Any,
+         **__: Any,
+     ) -> None:
+         iter_num = trainer._evaluation_loop.batch_progress.total.ready
+         self._update(trainer, pl_module, batch, iter_num)
+         self._compute(trainer, iter_num)
+
+     @override
+     @rank_zero_only
+     def on_predict_start(self, trainer: "Trainer", *_: Any) -> None:
+         self._start(trainer)
+
+     @override
+     @rank_zero_only
+     def on_predict_batch_end(
+         self,
+         trainer: "Trainer",
+         pl_module: "LightningModule",
+         outputs: Any,
+         batch: Any,
+         *_: Any,
+         **__: Any,
+     ) -> None:
+         iter_num = trainer.predict_loop.batch_progress.total.ready
+         self._update(trainer, pl_module, batch, iter_num)
+         self._compute(trainer, iter_num)
+
+
+ def _plugin_to_compute_dtype(plugin: Union[FabricPrecision, Precision]) -> torch.dtype:
+     # TODO: integrate this into the precision plugins
+     if not isinstance(plugin, Precision):
+         return fabric_plugin_to_compute_dtype(plugin)
+     if isinstance(plugin, BitsandbytesPrecision):
+         return plugin.dtype
+     if isinstance(plugin, HalfPrecision):
+         return plugin._desired_input_dtype
+     if isinstance(plugin, MixedPrecision):
+         return torch.bfloat16 if plugin.precision == "bf16-mixed" else torch.half
+     if isinstance(plugin, DoublePrecision):
+         return torch.double
+     if isinstance(plugin, (XLAPrecision, DeepSpeedPrecision)):
+         return plugin._desired_dtype
+     if isinstance(plugin, TransformerEnginePrecision):
+         return torch.int8
+     if isinstance(plugin, FSDPPrecision):
+         return plugin.mixed_precision_config.reduce_dtype or torch.float32
+     if isinstance(plugin, Precision):
+         return torch.float32
+     raise NotImplementedError(plugin)
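A minimal usage sketch of the callback above (illustrative only: the import path follows this wheel's file layout, while the model, dataset, and hyperparameter names are hypothetical). It wires ``ThroughputMonitor`` into a ``Trainer`` and defines the ``flops_per_batch`` attribute that ``_update`` reads on every logged iteration::

    import torch
    import torch.nn as nn
    from lightning.fabric.utilities.throughput import measure_flops
    from lightning.pytorch import LightningModule, Trainer
    from torch.utils.data import DataLoader, Dataset

    from nshtrainer.callbacks._throughput_monitor_callback import ThroughputMonitor


    class RandomVectors(Dataset):  # hypothetical toy dataset
        def __len__(self):
            return 64

        def __getitem__(self, index):
            return torch.randn(32)


    class LitModel(LightningModule):  # hypothetical module
        def __init__(self):
            super().__init__()
            self.net = nn.Linear(32, 1)

        def setup(self, stage):
            # Count the FLOPs of one forward/backward pass on meta tensors and expose
            # them as `flops_per_batch`, which ThroughputMonitor._update looks up.
            with torch.device("meta"):
                net = nn.Linear(32, 1)
                self.flops_per_batch = measure_flops(
                    net,
                    lambda: net(torch.randn(8, 32, device="meta")),
                    loss_fn=torch.Tensor.sum,
                )

        def training_step(self, batch, batch_idx):
            return self.net(batch).pow(2).mean()

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=1e-2)


    monitor = ThroughputMonitor(
        batch_size_fn=lambda batch: batch.size(0),  # samples per batch
        window_size=10,  # forwarded to Throughput via **kwargs
    )
    trainer = Trainer(max_steps=100, log_every_n_steps=10, callbacks=[monitor])
    trainer.fit(LitModel(), DataLoader(RandomVectors(), batch_size=8))

Metrics are logged under a per-stage prefix (``_compute`` prepends ``stage.value`` and the separator), so the training rates appear as keys such as ``train/device/batches_per_sec`` and ``train/device/mfu``.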
@@ -0,0 +1,113 @@
+ from abc import ABC, abstractmethod
+ from collections import Counter
+ from collections.abc import Iterable
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, TypeAlias, TypedDict
+
+ from lightning.pytorch import Callback
+
+ from ..config import TypedConfig
+
+ if TYPE_CHECKING:
+     from ..model.config import BaseConfig
+
+
+ class CallbackMetadataDict(TypedDict, total=False):
+     ignore_if_exists: bool
+     """If `True`, the callback will not be added if another callback with the same class already exists."""
+
+     priority: int
+     """Priority of the callback. Callbacks with higher priority will be loaded first."""
+
+
+ class CallbackMetadataConfig(TypedConfig):
+     ignore_if_exists: bool = False
+     """If `True`, the callback will not be added if another callback with the same class already exists."""
+
+     priority: int = 0
+     """Priority of the callback. Callbacks with higher priority will be loaded first."""
+
+
+ @dataclass(frozen=True)
+ class CallbackWithMetadata:
+     callback: Callback
+     metadata: CallbackMetadataConfig
+
+
+ ConstructedCallback: TypeAlias = Callback | CallbackWithMetadata
+
+
+ class CallbackConfigBase(TypedConfig, ABC):
+     metadata: CallbackMetadataConfig = CallbackMetadataConfig()
+     """Metadata for the callback."""
+
+     def with_metadata(self, callback: Callback, **metadata: CallbackMetadataDict):
+         return CallbackWithMetadata(
+             callback=callback, metadata=self.metadata.model_copy(update=metadata)
+         )
+
+     @abstractmethod
+     def construct_callbacks(
+         self, root_config: "BaseConfig"
+     ) -> Iterable[Callback | CallbackWithMetadata]: ...
+
+
+ # region Config resolution helpers
+ def _construct_callbacks_with_metadata(
+     config: CallbackConfigBase, root_config: "BaseConfig"
+ ) -> Iterable[CallbackWithMetadata]:
+     for callback in config.construct_callbacks(root_config):
+         if isinstance(callback, CallbackWithMetadata):
+             yield callback
+             continue
+
+         callback = config.with_metadata(callback)
+         yield callback
+
+
+ def _filter_ignore_if_exists(callbacks: list[CallbackWithMetadata]):
+     # First, let's do a pass over all callbacks to hold the count of each callback class
+     callback_classes = Counter(callback.callback.__class__ for callback in callbacks)
+
+     # Drop duplicates that opted in via `ignore_if_exists`
+     callbacks_filtered: list[CallbackWithMetadata] = []
+     for callback in callbacks:
+         # If `ignore_if_exists` is `True` and there is already a callback of the same class, skip this callback
+         if (
+             callback.metadata.ignore_if_exists
+             and callback_classes[callback.callback.__class__] > 1
+         ):
+             continue
+
+         callbacks_filtered.append(callback)
+
+     return callbacks_filtered
+
+
+ def _process_and_filter_callbacks(
+     callbacks: Iterable[CallbackWithMetadata],
+ ) -> list[Callback]:
+     callbacks = list(callbacks)
+
+     # Sort by priority (higher priority first)
+     callbacks.sort(key=lambda callback: callback.metadata.priority, reverse=True)
+
+     # Process `ignore_if_exists`
+     callbacks = _filter_ignore_if_exists(callbacks)
+
+     return [callback.callback for callback in callbacks]
+
+
+ def resolve_all_callbacks(root_config: "BaseConfig"):
+     callback_configs = [
+         config for config in root_config.ll_all_callback_configs() if config is not None
+     ]
+     callbacks = _process_and_filter_callbacks(
+         callback
+         for callback_config in callback_configs
+         for callback in _construct_callbacks_with_metadata(callback_config, root_config)
+     )
+     return callbacks
+
+
+ # endregion
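For orientation, a sketch of how a concrete config could plug into the helpers above (illustrative: ``MyEarlyStoppingConfig`` and its fields are hypothetical, and it assumes ``TypedConfig`` supports the pydantic-style field declarations implied by ``model_copy`` above)::

    from collections.abc import Iterable

    from lightning.pytorch import Callback
    from lightning.pytorch.callbacks import EarlyStopping

    from nshtrainer.callbacks.base import CallbackConfigBase, CallbackWithMetadata


    class MyEarlyStoppingConfig(CallbackConfigBase):  # hypothetical config
        monitor: str = "val/loss"
        patience: int = 10

        def construct_callbacks(
            self, root_config
        ) -> Iterable[Callback | CallbackWithMetadata]:
            # `with_metadata` copies `self.metadata` with the given overrides, so
            # `resolve_all_callbacks` can sort by priority and drop this callback
            # when another instance of the same class is already present.
            yield self.with_metadata(
                EarlyStopping(monitor=self.monitor, patience=self.patience),
                priority=100,
                ignore_if_exists=True,
            )

Callbacks yielded without metadata are wrapped with the config's default ``metadata`` by ``_construct_callbacks_with_metadata``; ``resolve_all_callbacks`` then sorts by ``priority`` and applies ``ignore_if_exists``, returning plain ``Callback`` objects.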