libinephany 0.18.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libinephany/observations/observation_utils.py +19 -2
- libinephany/observations/observers/base_observers.py +20 -8
- libinephany/observations/observers/global_observers/__init__.py +19 -1
- libinephany/observations/observers/global_observers/constants.py +2 -0
- libinephany/observations/observers/global_observers/gradient_observers.py +320 -3
- libinephany/observations/observers/global_observers/hyperparameter_observers.py +26 -18
- libinephany/observations/observers/global_observers/model_observers.py +220 -6
- libinephany/observations/observers/global_observers/progress_observers.py +7 -1
- libinephany/observations/observers/local_observers.py +158 -25
- libinephany/observations/statistic_trackers.py +435 -23
- libinephany/pydantic_models/schemas/tensor_statistics.py +33 -32
- libinephany/pydantic_models/states/hyperparameter_states.py +32 -30
- {libinephany-0.18.1.dist-info → libinephany-1.0.0.dist-info}/METADATA +1 -1
- {libinephany-0.18.1.dist-info → libinephany-1.0.0.dist-info}/RECORD +17 -17
- {libinephany-0.18.1.dist-info → libinephany-1.0.0.dist-info}/WHEEL +0 -0
- {libinephany-0.18.1.dist-info → libinephany-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {libinephany-0.18.1.dist-info → libinephany-1.0.0.dist-info}/top_level.txt +0 -0
libinephany/observations/statistic_trackers.py

@@ -11,6 +11,7 @@ from typing import Any, Callable, final
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
 import torch.optim as optim
 from torch.distributed import ReduceOp

@@ -75,7 +76,7 @@ class Statistic(ABC):
         self.max_cache_size = max_statistic_cache_size
         self.downsample_percent = tensor_stats_downsample_percentage
         self.sample_frequency = statistic_sample_frequency
-        self.
+        self.include_statistics: list[str] | None = None

     @final
     @property
@@ -194,12 +195,17 @@ class Statistic(ABC):
         Processes the tensor cache to build a TensorStatistic model.
         """

+        if not self.include_statistics:
+            raise ValueError(f"{self.__class__.__name__} must be provided with include_statistics.")
+
         if self._tensor_cache:
             concatenated = torch.cat(self._tensor_cache)
             self._tensor_cache = []

             statistics = TensorStatistics.build(
-                tensor=concatenated,
+                tensor=concatenated,
+                include_statistics=self.include_statistics,
+                sample_percentage=self.downsample_percent,
             )
             self._data.append(statistics)  # type: ignore

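Note: with this change, field selection moves to a single include_statistics list that the base class validates and threads into TensorStatistics.build together with the downsample percentage. A minimal standalone sketch of that include-list pattern in plain PyTorch; the statistic names "mean", "standard_deviation" and "norm" are placeholders, not necessarily fields of the library's TensorStatistics model:

    import torch

    def build_statistics(
        tensor: torch.Tensor, include_statistics: list[str], sample_percentage: float = 1.0
    ) -> dict[str, float]:
        # Optionally downsample the flattened tensor before summarising it.
        flat = tensor.detach().view(-1)
        if sample_percentage < 1.0:
            sample_size = max(1, int(flat.numel() * sample_percentage))
            flat = flat[torch.randperm(flat.numel())[:sample_size]]

        # Compute every candidate statistic, then keep only the requested fields.
        candidates = {
            "mean": flat.mean().item(),
            "standard_deviation": flat.std().item(),
            "norm": flat.norm(p=2).item(),
        }
        return {name: value for name, value in candidates.items() if name in include_statistics}

    stats = build_statistics(torch.randn(10_000), include_statistics=["mean", "norm"], sample_percentage=0.25)
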
@@ -367,18 +373,18 @@ class FirstOrderGradients(Statistic):
     def __init__(
         self,
         *,
-
+        include_statistics: list[str] | None = None,
         **kwargs,
     ) -> None:
         """
-        :param
-        fields in the model to
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

-        self.
+        self.include_statistics = include_statistics

     def _get_storage_format(self) -> StatisticStorageTypes:
         """
@@ -421,22 +427,22 @@ class SecondOrderGradients(Statistic):
     def __init__(
         self,
         *,
+        include_statistics: list[str] | None = None,
         compute_hessian_diagonal: bool = False,
-        skip_statistics: list[str] | None = None,
         **kwargs,
     ) -> None:
         """
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param compute_hessian_diagonal: Whether to compute the Hessian diagonal to determine second order gradients
         or use the squared first order gradients as approximations in the same way Adam does.
-        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
-        fields in the model to not include in returned observations.
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

         self.compute_hessian_diagonal = compute_hessian_diagonal
-        self.
+        self.include_statistics = include_statistics

     @property
     def requires_gradient_graphs(self) -> bool:
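Note: every statistic that previously accepted skip_statistics now accepts include_statistics instead (FirstOrderGradients, SecondOrderGradients, ActivationStatistics, ParameterUpdateStatistics, ParameterStatistics and LAMBTrustRatioStatistics below). A sketch of the corresponding call-site migration; the field names are placeholders and any other base-class keyword arguments are omitted:

    from libinephany.observations.statistic_trackers import SecondOrderGradients

    # libinephany 0.18.x: fields were selected by exclusion.
    # tracker = SecondOrderGradients(compute_hessian_diagonal=False, skip_statistics=["kurtosis"])

    # libinephany 1.0.0: fields are selected by inclusion.
    tracker = SecondOrderGradients(compute_hessian_diagonal=False, include_statistics=["mean", "norm"])
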
@@ -519,18 +525,18 @@ class ActivationStatistics(Statistic):
     def __init__(
         self,
         *,
-
+        include_statistics: list[str] | None = None,
         **kwargs,
     ) -> None:
         """
-        :param
-        fields in the model to
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

-        self.
+        self.include_statistics = include_statistics

     @property
     def uses_forward_hook(self) -> bool:
@@ -553,6 +559,9 @@ class ActivationStatistics(Statistic):
         :return: Forward hook to register the function with.
         """

+        if self.include_statistics is None:
+            raise ValueError("include_statistics is required to use forward hooks!")
+
         def hook(module: nn.Module, layer_input: torch.Tensor, layer_output: torch.Tensor) -> None:
             """
             :param module: Module the hook was registered with. Not used here.
@@ -562,7 +571,9 @@ class ActivationStatistics(Statistic):

             if self._sample_number % self.sample_frequency == 0:
                 statistics = TensorStatistics.build(
-                    tensor=layer_output,
+                    tensor=layer_output,
+                    include_statistics=self.include_statistics,
+                    sample_percentage=self.downsample_percent,
                 )
                 self._data.append(statistics)  # type: ignore

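Note: ActivationStatistics now refuses to hand out a forward hook without include_statistics and passes the downsample percentage through when summarising layer outputs. A standalone sketch of the same forward-hook sampling pattern in plain PyTorch; the counter and summary used here are illustrative, not the library's implementation:

    import torch
    import torch.nn as nn

    sample_frequency = 4
    sample_number = 0
    collected: list[float] = []

    def hook(module: nn.Module, layer_input: tuple[torch.Tensor, ...], layer_output: torch.Tensor) -> None:
        # Summarise the layer output only every `sample_frequency` forward passes.
        global sample_number
        if sample_number % sample_frequency == 0:
            collected.append(layer_output.detach().abs().mean().item())
        sample_number += 1

    layer = nn.Linear(8, 8)
    handle = layer.register_forward_hook(hook)
    for _ in range(10):
        layer(torch.randn(2, 8))
    handle.remove()
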
@@ -602,18 +613,18 @@ class ParameterUpdateStatistics(Statistic):
     def __init__(
         self,
         *,
-
+        include_statistics: list[str] | None = None,
         **kwargs,
     ) -> None:
         """
-        :param
-        fields in the model to
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

-        self.
+        self.include_statistics = include_statistics

     def _get_storage_format(self) -> StatisticStorageTypes:
         """
@@ -649,23 +660,28 @@ class ParameterUpdateStatistics(Statistic):
         return update_tensor


+class LHOPTParameterUpdateStatistics(ParameterUpdateStatistics):
+
+    pass
+
+
 class ParameterStatistics(Statistic):

     def __init__(
         self,
         *,
-
+        include_statistics: list[str] | None = None,
         **kwargs,
     ) -> None:
         """
-        :param
-        fields in the model to
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

-        self.
+        self.include_statistics = include_statistics

     def _get_storage_format(self) -> StatisticStorageTypes:
         """
@@ -694,21 +710,30 @@ class ParameterStatistics(Statistic):
         return torch.cat([p.data.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p)])


+class LHOPTParameterStatistics(ParameterStatistics):
+
+    pass
+
+
 class LAMBTrustRatioStatistics(Statistic):

     def __init__(
         self,
         *,
+        include_statistics: list[str] | None = None,
         use_log_transform: bool = False,
         **kwargs,
     ) -> None:
         """
+        :param include_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to include in returned observations.
         :param use_log_transform: Whether to transform the LAMB trust ratio by taking ln(1 + R).
         :param kwargs: Other observation keyword arguments.
         """

         super().__init__(**kwargs)

+        self.include_statistics = include_statistics
         self.use_log_transform = use_log_transform

     def _get_storage_format(self) -> StatisticStorageTypes:
@@ -759,6 +784,11 @@ class LAMBTrustRatioStatistics(Statistic):
         return lamb_trust_ratio


+class LHOPTLAMBTrustRatioStatistics(LAMBTrustRatioStatistics):
+
+    pass
+
+
 class NumberOfParameters(Statistic):

     def __init__(
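Note: the three LHOPT* classes added above (LHOPTParameterUpdateStatistics, LHOPTParameterStatistics, LHOPTLAMBTrustRatioStatistics) are empty subclasses. Presumably a distinct class name lets LHOPT-flavoured observers reference and configure these statistics separately from their parents while inheriting behaviour unchanged; a standalone sketch of that naming pattern, not the library's registry:

    class ParameterStats:
        """Stand-in for a concrete statistic class."""

    class LHOPTParameterStats(ParameterStats):
        """Empty subclass: same behaviour, distinct name."""

    # Configuration keyed by class name can now differ per variant.
    config = {
        "ParameterStats": {"sample_frequency": 1},
        "LHOPTParameterStats": {"sample_frequency": 4},
    }
    print(config[LHOPTParameterStats.__name__])  # -> {'sample_frequency': 4}
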
@@ -958,3 +988,385 @@ class GradientVarianceFraction(Statistic):
             return 0.0

         return variance_parameters / total_parameters
+
+
+class AverageParameterUpdateMagnitudeStatistics(Statistic):
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None or a float.
+        """
+
+        update_tensor = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+
+        # when update tensor is none, return 0.0
+        if update_tensor is None:
+            return 0.0
+
+        update_tensor = update_tensor.view(-1)
+        update_tensor = update_tensor.abs()
+
+        average_update_magnitude = update_tensor.mean().item()
+
+        return average_update_magnitude
+
+
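Note: AverageParameterUpdateMagnitudeStatistics reduces the update tensor to a single float, the mean absolute parameter update for one optimiser step. A standalone sketch of the same quantity computed from before/after parameter snapshots rather than via observation_utils.form_update_tensor:

    import torch

    param = torch.nn.Parameter(torch.randn(100))
    optimizer = torch.optim.SGD([param], lr=0.1)

    before = param.detach().clone()
    (param ** 2).sum().backward()
    optimizer.step()

    # Mean |delta theta| for this step.
    update = (param.detach() - before).view(-1)
    average_update_magnitude = update.abs().mean().item()
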
+class MomentumGradientRatioStatistics(Statistic):
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+
+        momentum = observation_utils.form_momentum_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+        if momentum is None:
+            return None
+
+        gradients_list = [
+            p.grad.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        # Handle empty gradients list
+        if not gradients_list:
+            return 0.0
+
+        gradients = torch.cat(gradients_list).view(-1)
+
+        # momentum_gradient_ratio r^t=\frac{\|g^t\|_2}{\|\nabla f(w^t)\|_2}
+        gradients_norm = gradients.norm(p=2)
+        momentum_norm = momentum.norm(p=2)
+
+        if momentum_norm == 0:
+            momentum_gradient_ratio = 0.0
+        else:
+            momentum_gradient_ratio = (gradients_norm / momentum_norm).item()
+
+        return momentum_gradient_ratio
+
+
|
+
class LogOfNoiseScaleStatistics(Statistic):
|
1090
|
+
"""
|
1091
|
+
Statistics for the log of noise scale in training.
|
1092
|
+
|
1093
|
+
Tracks the log of noise scale B_{noise} using the formula:
|
1094
|
+
B_{noise} = tr(ΣH) / (G^T H G) ≈ (B/ε) * tr(HΣ) / tr(H^3 Σ)
|
1095
|
+
where:
|
1096
|
+
- H is the Hessian matrix
|
1097
|
+
- G is the gradient vector
|
1098
|
+
- Σ is the noise covariance matrix
|
1099
|
+
- B is the batch size
|
1100
|
+
- ε is the learning rate
|
1101
|
+
"""
|
1102
|
+
|
1103
|
+
@property
|
1104
|
+
def requires_gradient_graphs(self) -> bool:
|
1105
|
+
"""
|
1106
|
+
:return: Whether the statistic requires gradient graphs to be retained.
|
1107
|
+
"""
|
1108
|
+
|
1109
|
+
return False
|
1110
|
+
|
1111
|
+
@staticmethod
|
1112
|
+
def compute_hessian_diagonals(parameters: list[torch.Tensor]) -> torch.Tensor:
|
1113
|
+
"""
|
1114
|
+
:param parameters: Parameters to compute the hessian diagonal matrices for.
|
1115
|
+
:return: Tensor containing the hessian diagonal matrices for all given parameters.
|
1116
|
+
"""
|
1117
|
+
|
1118
|
+
hessian_diagonals = []
|
1119
|
+
|
1120
|
+
for parameter in parameters:
|
1121
|
+
if parameter.grad is not None:
|
1122
|
+
so_gradient = torch.autograd.grad(
|
1123
|
+
outputs=parameter.grad.clone(),
|
1124
|
+
inputs=parameter,
|
1125
|
+
grad_outputs=torch.ones_like(parameter.grad, requires_grad=True),
|
1126
|
+
only_inputs=True,
|
1127
|
+
retain_graph=True,
|
1128
|
+
create_graph=True,
|
1129
|
+
allow_unused=True,
|
1130
|
+
)[0]
|
1131
|
+
|
1132
|
+
if so_gradient is not None:
|
1133
|
+
hessian_diagonals.append(so_gradient.view(-1))
|
1134
|
+
else:
|
1135
|
+
hessian_diagonals.append(torch.zeros_like(parameter.view(-1)))
|
1136
|
+
|
1137
|
+
return torch.cat(hessian_diagonals)
|
1138
|
+
|
1139
|
+
def _get_storage_format(self) -> StatisticStorageTypes:
|
1140
|
+
"""
|
1141
|
+
:return: Storage format this observation stores data in. Must be one of the enum attributes in the
|
1142
|
+
StatisticStorageTypes enumeration class.
|
1143
|
+
"""
|
1144
|
+
|
1145
|
+
return StatisticStorageTypes.FLOAT
|
1146
|
+
|
1147
|
+
def _gather(
|
1148
|
+
self,
|
1149
|
+
*,
|
1150
|
+
optimizer: optim.Optimizer,
|
1151
|
+
model: nn.Module,
|
1152
|
+
parameters: list[torch.Tensor],
|
1153
|
+
parameter_group: dict[str, Any],
|
1154
|
+
) -> torch.Tensor | TensorStatistics | float | None:
|
1155
|
+
"""
|
1156
|
+
:param optimizer: Optimizer the given parameters and parameter group came from.
|
1157
|
+
:param model: Inner model to gather statistics from.
|
1158
|
+
:param parameters: List of parameters to gather statistics from.
|
1159
|
+
:param parameter_group: Parameter group the parameters originate from.
|
1160
|
+
:return: None, TensorStatistics model or a float.
|
1161
|
+
|
1162
|
+
Computes the log of noise scale using the approximate formula:
|
1163
|
+
log(B_{noise}) ≈ log(B/ε) (move to observer) + log(tr(HΣ)) - log(tr(H^3 Σ))
|
1164
|
+
where:
|
1165
|
+
- H is the Hessian matrix
|
1166
|
+
- Σ is the noise covariance matrix
|
1167
|
+
- B is the batch size
|
1168
|
+
- ε is the learning rate
|
1169
|
+
|
1170
|
+
"""
|
1171
|
+
|
1172
|
+
# Compute Hessian diagonals as in SecondOrderGradients Observation
|
1173
|
+
# hessian_diagonals = self.compute_hessian_diagonals(parameters)
|
1174
|
+
# use squared first order gradients as approximations
|
1175
|
+
fo_gradients = [
|
1176
|
+
p.grad.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
|
1177
|
+
]
|
1178
|
+
if not fo_gradients:
|
1179
|
+
return None
|
1180
|
+
|
1181
|
+
hessian_diagonals = torch.cat(fo_gradients) ** 2
|
1182
|
+
|
1183
|
+
if hessian_diagonals.numel() == 0: # No gradients
|
1184
|
+
return None
|
1185
|
+
|
1186
|
+
# For noise covariance matrix Σ, we'll use the identity matrix as an approximation
|
1187
|
+
# This is a common assumption when the exact noise structure is unknown
|
1188
|
+
noise_covariance = torch.ones_like(hessian_diagonals)
|
1189
|
+
|
1190
|
+
# Compute tr(HΣ)
|
1191
|
+
trace_hessian_noise_covariance = torch.sum(hessian_diagonals * noise_covariance)
|
1192
|
+
|
1193
|
+
# Avoid division by zero and log of zero
|
1194
|
+
if trace_hessian_noise_covariance <= 0:
|
1195
|
+
return None
|
1196
|
+
|
1197
|
+
log_trace_hessian_noise_covariance = torch.log(trace_hessian_noise_covariance).item()
|
1198
|
+
|
1199
|
+
# Compute tr(H^3 Σ)
|
1200
|
+
trace_hessian_cubed_noise_covariance = torch.sum(hessian_diagonals**3 * noise_covariance)
|
1201
|
+
if trace_hessian_cubed_noise_covariance <= 0:
|
1202
|
+
return None
|
1203
|
+
|
1204
|
+
log_trace_hessian_cubed_noise_covariance = torch.log(trace_hessian_cubed_noise_covariance).item()
|
1205
|
+
|
1206
|
+
# Compute final result: log(B_{noise}) ≈ log(tr(HΣ)) - log(tr(H^3 Σ))
|
1207
|
+
# Note: log(B/ε) term is handled in the observer layer
|
1208
|
+
log_noise_scale_without_log_b_over_epsilon = (
|
1209
|
+
log_trace_hessian_noise_covariance - log_trace_hessian_cubed_noise_covariance
|
1210
|
+
)
|
1211
|
+
|
1212
|
+
return log_noise_scale_without_log_b_over_epsilon
|
1213
|
+
|
1214
|
+
|
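Note: in the implementation above, the Hessian diagonal is approximated by the squared first-order gradients (h_i = g_i^2), the noise covariance Σ by the identity, and the log(B/ε) term is deferred to the observer layer. Under those assumptions the docstring's formula reduces to the following (a LaTeX restatement, not additional behaviour):

    \log B_{\mathrm{noise}}
      \approx \log\frac{B}{\epsilon}
            + \log\operatorname{tr}(H\Sigma)
            - \log\operatorname{tr}(H^{3}\Sigma)
      = \underbrace{\log\frac{B}{\epsilon}}_{\text{added by the observer}}
            + \log\sum_i g_i^{2}
            - \log\sum_i g_i^{6}

so the statistic itself returns only the last two terms.
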
+class CosineSimilarityObserverOfGradientAndMomentumStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and momentum.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        momentum = observation_utils.form_momentum_tensor(
+            optimizer=optimizer, parameters=parameters_with_grads, parameter_group=parameter_group
+        )
+        if momentum is None:
+            return None
+
+        gradients_2d = gradients.unsqueeze(0)
+        momentum_2d = momentum.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, momentum_2d, dim=1).item()
+
+        return cosine_similarity
+
+
+class CosineSimilarityObserverOfGradientAndUpdateStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and update.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        # Filter parameters that have gradients to ensure consistent tensor sizes
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        update_tensor = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters_with_grads, parameter_group=parameter_group
+        )
+
+        if update_tensor is None:
+            return None
+
+        gradients_2d = gradients.unsqueeze(0)
+        update_tensor_2d = update_tensor.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, update_tensor_2d, dim=1).item()
+
+        return cosine_similarity
+
+
+class CosineSimilarityOfGradientAndParameterStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and parameter.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        # Filter parameters that have gradients to ensure consistent tensor sizes
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        parameters_list = [p.view(-1) for p in parameters_with_grads]
+
+        if not parameters_list:
+            return None
+
+        parameters_tensor = torch.cat(parameters_list).view(-1)
+
+        gradients_2d = gradients.unsqueeze(0)
+        parameters_tensor_2d = parameters_tensor.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, parameters_tensor_2d, dim=1).item()
+
+        return cosine_similarity
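
Note: the three cosine-similarity statistics share one pattern: flatten and concatenate the per-parameter tensors, add a batch dimension, and compare along dim=1 with torch.nn.functional.cosine_similarity (hence the new F import at the top of the file). A standalone sketch of that pattern with stand-in tensors:

    import torch
    import torch.nn.functional as F

    gradients = torch.randn(1000)  # stand-in for the concatenated gradients
    other = torch.randn(1000)      # stand-in for momentum, the update tensor, or the parameters

    cosine_similarity = F.cosine_similarity(gradients.unsqueeze(0), other.unsqueeze(0), dim=1).item()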