libinephany 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libinephany/observations/observation_utils.py +19 -2
- libinephany/observations/observers/global_observers/__init__.py +19 -1
- libinephany/observations/observers/global_observers/constants.py +2 -0
- libinephany/observations/observers/global_observers/gradient_observers.py +319 -1
- libinephany/observations/observers/global_observers/model_observers.py +219 -3
- libinephany/observations/observers/local_observers.py +127 -1
- libinephany/observations/statistic_trackers.py +595 -0
- libinephany/utils/constants.py +3 -3
- {libinephany-0.18.0.dist-info → libinephany-0.19.0.dist-info}/METADATA +1 -1
- {libinephany-0.18.0.dist-info → libinephany-0.19.0.dist-info}/RECORD +13 -13
- {libinephany-0.18.0.dist-info → libinephany-0.19.0.dist-info}/WHEEL +0 -0
- {libinephany-0.18.0.dist-info → libinephany-0.19.0.dist-info}/licenses/LICENSE +0 -0
- {libinephany-0.18.0.dist-info → libinephany-0.19.0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@ from typing import Any, Callable, final
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
 import torch.optim as optim
 from torch.distributed import ReduceOp

@@ -597,6 +598,58 @@ class ActivationStatistics(Statistic):
         return None


+class InnerStepParameterUpdateStatistics(Statistic):
+
+    def __init__(
+        self,
+        *,
+        skip_statistics: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to not include in returned observations.
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.skip_statistics = skip_statistics
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.TENSOR_STATISTICS
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+
+        update_tensor = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+
+        if update_tensor is None:
+            update_tensor = torch.cat([torch.zeros(p.view(-1).shape, device=p.device) for p in parameters])
+
+        return update_tensor
+
+
 class ParameterUpdateStatistics(Statistic):

     def __init__(
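These hunks appear to come from libinephany/observations/statistic_trackers.py, the file that gains 595 lines in the listing above. As a rough, self-contained sketch of the fallback in `InnerStepParameterUpdateStatistics._gather`: when no per-parameter update can be formed, a flat zero vector matching the concatenated parameter size is returned so downstream statistics stay shape-stable. `gather_update_or_zeros` is an illustrative name, not part of the library; the real method obtains the update via `observation_utils.form_update_tensor`.

```python
import torch

def gather_update_or_zeros(update_tensor: torch.Tensor | None,
                           parameters: list[torch.Tensor]) -> torch.Tensor:
    """Mimic the zero-fill fallback: one flat vector sized like all parameters combined."""
    if update_tensor is None:
        return torch.cat([torch.zeros(p.view(-1).shape, device=p.device) for p in parameters])
    return update_tensor

params = [torch.randn(3, 4), torch.randn(5)]
print(gather_update_or_zeros(None, params).shape)  # torch.Size([17])
```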
@@ -694,6 +747,51 @@ class ParameterStatistics(Statistic):
         return torch.cat([p.data.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p)])


+class InnerStepParameterStatistics(Statistic):
+
+    def __init__(
+        self,
+        *,
+        skip_statistics: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to not include in returned observations.
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.skip_statistics = skip_statistics
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.TENSOR_STATISTICS
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+
+        return torch.cat([p.data.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p)])
+
+
 class LAMBTrustRatioStatistics(Statistic):

     def __init__(
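`InnerStepParameterStatistics` simply snapshots the current weights as one flat vector per inner step. A minimal sketch of that flattening, assuming plain tensors and omitting the `observation_utils.tensor_on_local_rank` filter the real method applies:

```python
import torch

def flatten_parameters(parameters: list[torch.Tensor]) -> torch.Tensor:
    """Concatenate every parameter as a 1-D view into a single flat vector."""
    return torch.cat([p.data.view(-1) for p in parameters])

weights = flatten_parameters([torch.randn(2, 2), torch.randn(3)])
print(weights.shape)  # torch.Size([7])
```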
@@ -759,6 +857,71 @@ class LAMBTrustRatioStatistics(Statistic):
         return lamb_trust_ratio


+class LHOPTLAMBTrustRatioStatistics(Statistic):
+
+    def __init__(
+        self,
+        *,
+        use_log_transform: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+        :param use_log_transform: Whether to transform the LAMB trust ratio by taking ln(1 + R).
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.use_log_transform = use_log_transform
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+
+        weights_list = [p.data.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p)]
+        if weights_list:
+            weights = torch.cat(weights_list)
+
+        else:
+            weights = None
+
+        updates = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+
+        update_norm = torch.norm(updates, p=2).item() if updates is not None else 0
+        weight_norm = torch.norm(weights, p=2).item() if weights is not None else 0
+
+        lamb_trust_ratio = 0.0
+        if update_norm > 0:
+            lamb_trust_ratio = weight_norm / update_norm
+
+        if self.use_log_transform:
+            lamb_trust_ratio = math.log(1 + lamb_trust_ratio)
+
+        return lamb_trust_ratio
+
+
 class NumberOfParameters(Statistic):

     def __init__(
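The new LHOPT-style statistic reduces each parameter group to the trust ratio R = ||w||₂ / ||u||₂, optionally transformed to ln(1 + R) to compress large values when the update norm is tiny. Below is a standalone sketch of that arithmetic only; `lhopt_trust_ratio` is an illustrative name, and the real statistic pulls the update vector from the optimizer state via `observation_utils.form_update_tensor`.

```python
import math
import torch

def lhopt_trust_ratio(weights: torch.Tensor, update: torch.Tensor, use_log_transform: bool = False) -> float:
    """Trust ratio R = ||w||_2 / ||u||_2, with an optional ln(1 + R) transform."""
    update_norm = torch.norm(update, p=2).item()
    weight_norm = torch.norm(weights, p=2).item()
    ratio = weight_norm / update_norm if update_norm > 0 else 0.0
    return math.log(1 + ratio) if use_log_transform else ratio

w = torch.full((4,), 2.0)   # ||w|| = 4
u = torch.full((4,), 0.5)   # ||u|| = 1
print(lhopt_trust_ratio(w, u))                           # 4.0
print(lhopt_trust_ratio(w, u, use_log_transform=True))   # ln(5) ≈ 1.609
```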
@@ -958,3 +1121,435 @@ class GradientVarianceFraction(Statistic):
             return 0.0

         return variance_parameters / total_parameters
+
+
+class AverageParameterUpdateMagnitudeStatistics(Statistic):
+
+    def __init__(
+        self,
+        *,
+        skip_statistics: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to not include in returned observations.
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.skip_statistics = skip_statistics
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None or a float.
+        """
+
+        update_tensor = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+
+        # When the update tensor is None, return 0.0.
+        if update_tensor is None:
+            return 0.0
+
+        update_tensor = update_tensor.view(-1)
+        update_tensor = update_tensor.abs()
+
+        average_update_magnitude = update_tensor.mean().item()
+
+        return average_update_magnitude
+
+
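`AverageParameterUpdateMagnitudeStatistics` collapses the optimizer update to a single scalar: the mean absolute entry of the flattened update, with 0.0 when no update is available. A sketch of just that reduction (the helper name is illustrative):

```python
import torch

def average_update_magnitude(update_tensor: torch.Tensor | None) -> float:
    """Mean |entry| of the flattened update, or 0.0 when there is no update."""
    if update_tensor is None:
        return 0.0
    return update_tensor.view(-1).abs().mean().item()

print(average_update_magnitude(torch.tensor([[0.1, -0.3], [0.2, -0.2]])))  # ~0.2
```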
+class MomentumGradientRatioStatistics(Statistic):
+
+    def __init__(
+        self,
+        *,
+        skip_statistics: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to not include in returned observations.
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.skip_statistics = skip_statistics
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+
+        momentum = observation_utils.form_momentum_tensor(
+            optimizer=optimizer, parameters=parameters, parameter_group=parameter_group
+        )
+        if momentum is None:
+            return None
+
+        gradients_list = [
+            p.grad.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        # Handle empty gradients list
+        if not gradients_list:
+            return 0.0
+
+        gradients = torch.cat(gradients_list).view(-1)
+
+        # momentum_gradient_ratio r^t=\frac{\|g^t\|_2}{\|\nabla f(w^t)\|_2}
+        gradients_norm = gradients.norm(p=2)
+        momentum_norm = momentum.norm(p=2)
+
+        if momentum_norm == 0:
+            momentum_gradient_ratio = 0.0
+        else:
+            momentum_gradient_ratio = (gradients_norm / momentum_norm).item()
+
+        return momentum_gradient_ratio
+
+
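`MomentumGradientRatioStatistics` reports ||g||₂ / ||m||₂ and falls back to 0.0 when the momentum norm vanishes. A toy sketch with synthetic vectors; the real statistic reads the momentum buffer through `observation_utils.form_momentum_tensor`:

```python
import torch

def momentum_gradient_ratio(gradients: torch.Tensor, momentum: torch.Tensor) -> float:
    """||g||_2 / ||m||_2, defined as 0.0 when the momentum norm is zero."""
    momentum_norm = momentum.norm(p=2)
    if momentum_norm == 0:
        return 0.0
    return (gradients.norm(p=2) / momentum_norm).item()

g = torch.tensor([3.0, 4.0])   # ||g|| = 5
m = torch.tensor([0.6, 0.8])   # ||m|| = 1
print(momentum_gradient_ratio(g, m))  # 5.0
```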
+class LogOfNoiseScaleStatistics(Statistic):
+    """
+    Statistics for the log of noise scale in training.
+
+    Tracks the log of noise scale B_{noise} using the formula:
+        B_{noise} = tr(ΣH) / (G^T H G) ≈ (B/ε) * tr(HΣ) / tr(H^3 Σ)
+    where:
+        - H is the Hessian matrix
+        - G is the gradient vector
+        - Σ is the noise covariance matrix
+        - B is the batch size
+        - ε is the learning rate
+    """
+
+    def __init__(
+        self,
+        *,
+        skip_statistics: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        """
+        :param skip_statistics: If the observation uses the TensorStatistic model to return observations, names of the
+        fields in the model to not include in returned observations.
+        :param kwargs: Other observation keyword arguments.
+        """
+
+        super().__init__(**kwargs)
+
+        self.skip_statistics = skip_statistics
+
+    @property
+    def requires_gradient_graphs(self) -> bool:
+        """
+        :return: Whether the statistic requires gradient graphs to be retained.
+        """
+
+        return False
+
+    @staticmethod
+    def compute_hessian_diagonals(parameters: list[torch.Tensor]) -> torch.Tensor:
+        """
+        :param parameters: Parameters to compute the hessian diagonal matrices for.
+        :return: Tensor containing the hessian diagonal matrices for all given parameters.
+        """
+
+        hessian_diagonals = []
+
+        for parameter in parameters:
+            if parameter.grad is not None:
+                so_gradient = torch.autograd.grad(
+                    outputs=parameter.grad.clone(),
+                    inputs=parameter,
+                    grad_outputs=torch.ones_like(parameter.grad, requires_grad=True),
+                    only_inputs=True,
+                    retain_graph=True,
+                    create_graph=True,
+                    allow_unused=True,
+                )[0]
+
+                if so_gradient is not None:
+                    hessian_diagonals.append(so_gradient.view(-1))
+                else:
+                    hessian_diagonals.append(torch.zeros_like(parameter.view(-1)))
+
+        return torch.cat(hessian_diagonals)
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+
+        Computes the log of noise scale using the approximate formula:
+            log(B_{noise}) ≈ log(B/ε) (handled in the observer) + log(tr(HΣ)) - log(tr(H^3 Σ))
+        where:
+            - H is the Hessian matrix
+            - Σ is the noise covariance matrix
+            - B is the batch size
+            - ε is the learning rate
+        """
+
+        # Compute Hessian diagonals as in the SecondOrderGradients observation:
+        # hessian_diagonals = self.compute_hessian_diagonals(parameters)
+        # Instead, use squared first-order gradients as an approximation.
+        fo_gradients = [
+            p.grad.view(-1) for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+        if not fo_gradients:
+            return None
+
+        hessian_diagonals = torch.cat(fo_gradients) ** 2
+
+        if hessian_diagonals.numel() == 0:  # No gradients
+            return None
+
+        # For the noise covariance matrix Σ, use the identity matrix as an approximation.
+        # This is a common assumption when the exact noise structure is unknown.
+        noise_covariance = torch.ones_like(hessian_diagonals)
+
+        # Compute tr(HΣ)
+        trace_hessian_noise_covariance = torch.sum(hessian_diagonals * noise_covariance)
+
+        # Avoid division by zero and log of zero
+        if trace_hessian_noise_covariance <= 0:
+            return None
+
+        log_trace_hessian_noise_covariance = torch.log(trace_hessian_noise_covariance).item()
+
+        # Compute tr(H^3 Σ)
+        trace_hessian_cubed_noise_covariance = torch.sum(hessian_diagonals**3 * noise_covariance)
+        if trace_hessian_cubed_noise_covariance <= 0:
+            return None
+
+        log_trace_hessian_cubed_noise_covariance = torch.log(trace_hessian_cubed_noise_covariance).item()
+
+        # Compute the final result: log(B_{noise}) ≈ log(tr(HΣ)) - log(tr(H^3 Σ)).
+        # Note: the log(B/ε) term is handled in the observer layer.
+        log_noise_scale_without_log_b_over_epsilon = (
+            log_trace_hessian_noise_covariance - log_trace_hessian_cubed_noise_covariance
+        )
+
+        return log_noise_scale_without_log_b_over_epsilon
+
+
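With squared first-order gradients standing in for the Hessian diagonal and Σ approximated by the identity, the value returned above reduces to log(Σ g²) − log(Σ g⁶); the log(B/ε) term is added in the observer layer. A compact sketch of that proxy (`log_noise_scale_proxy` is a hypothetical name):

```python
import torch

def log_noise_scale_proxy(flat_gradients: torch.Tensor) -> float | None:
    """log(tr(HΣ)) - log(tr(H^3 Σ)) with H ≈ diag(g^2) and Σ = I."""
    hessian_diag = flat_gradients ** 2
    trace_h = hessian_diag.sum()               # tr(HΣ)    = Σ g^2
    trace_h_cubed = (hessian_diag ** 3).sum()  # tr(H^3 Σ) = Σ g^6
    if trace_h <= 0 or trace_h_cubed <= 0:
        return None
    return (torch.log(trace_h) - torch.log(trace_h_cubed)).item()

print(log_noise_scale_proxy(torch.tensor([0.1, -0.2, 0.05])))
```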
+class CosineSimilarityObserverOfGradientAndMomentumStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and momentum.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        momentum = observation_utils.form_momentum_tensor(
+            optimizer=optimizer, parameters=parameters_with_grads, parameter_group=parameter_group
+        )
+        if momentum is None:
+            return None
+
+        gradients_2d = gradients.unsqueeze(0)
+        momentum_2d = momentum.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, momentum_2d, dim=1).item()
+
+        return cosine_similarity
+
+
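All three cosine-similarity statistics in this hunk share one pattern: flatten both vectors, lift them to shape (1, N), and call `torch.nn.functional.cosine_similarity` along dim=1. A minimal sketch of the shared computation:

```python
import torch
import torch.nn.functional as F

def flat_cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
    """Cosine similarity between two tensors treated as single flat vectors."""
    return F.cosine_similarity(a.view(1, -1), b.view(1, -1), dim=1).item()

g = torch.tensor([1.0, 0.0])
m = torch.tensor([1.0, 1.0])
print(flat_cosine_similarity(g, m))  # ≈ 0.7071
```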
+class CosineSimilarityObserverOfGradientAndUpdateStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and update.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        # Filter parameters that have gradients to ensure consistent tensor sizes
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        update_tensor = observation_utils.form_update_tensor(
+            optimizer=optimizer, parameters=parameters_with_grads, parameter_group=parameter_group
+        )
+
+        if update_tensor is None:
+            return None
+
+        gradients_2d = gradients.unsqueeze(0)
+        update_tensor_2d = update_tensor.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, update_tensor_2d, dim=1).item()
+
+        return cosine_similarity
+
+
+class CosineSimilarityOfGradientAndParameterStatistics(Statistic):
+    """
+    Statistics for the cosine similarity of gradient and parameter.
+    """
+
+    def _get_storage_format(self) -> StatisticStorageTypes:
+        """
+        :return: Storage format this observation stores data in. Must be one of the enum attributes in the
+        StatisticStorageTypes enumeration class.
+        """
+
+        return StatisticStorageTypes.FLOAT
+
+    def _gather(
+        self,
+        *,
+        optimizer: optim.Optimizer,
+        model: nn.Module,
+        parameters: list[torch.Tensor],
+        parameter_group: dict[str, Any],
+    ) -> torch.Tensor | TensorStatistics | float | None:
+        """
+        :param optimizer: Optimizer the given parameters and parameter group came from.
+        :param model: Inner model to gather statistics from.
+        :param parameters: List of parameters to gather statistics from.
+        :param parameter_group: Parameter group the parameters originate from.
+        :return: None, TensorStatistics model or a float.
+        """
+        # Filter parameters that have gradients to ensure consistent tensor sizes
+        parameters_with_grads = [
+            p for p in parameters if observation_utils.tensor_on_local_rank(p) and p.grad is not None
+        ]
+
+        if not parameters_with_grads:
+            return None
+
+        gradients_list = [p.grad.view(-1) for p in parameters_with_grads]
+        gradients = torch.cat(gradients_list).view(-1)
+
+        parameters_list = [p.view(-1) for p in parameters_with_grads]
+
+        if not parameters_list:
+            return None
+
+        parameters_tensor = torch.cat(parameters_list).view(-1)
+
+        gradients_2d = gradients.unsqueeze(0)
+        parameters_tensor_2d = parameters_tensor.unsqueeze(0)
+
+        cosine_similarity = F.cosine_similarity(gradients_2d, parameters_tensor_2d, dim=1).item()
+
+        return cosine_similarity
libinephany/utils/constants.py
CHANGED
@@ -43,7 +43,7 @@ AGENT_PREFIX_EPS = "adam-eps"
 AGENT_PREFIX_SGD_MOMENTUM = "sgd-momentum"

 AGENT_BATCH_SIZE = "batch-size"
-
+AGENT_PREFIX_GRADIENT_ACCUMULATION = "gradient-accumulation"

 AGENT_BANDIT_SUFFIX = "bandit-agent"

@@ -68,7 +68,7 @@ PREFIXES = [
     AGENT_PREFIX_BETA_TWO,
     AGENT_PREFIX_EPS,
     AGENT_PREFIX_SGD_MOMENTUM,
-
+    AGENT_PREFIX_GRADIENT_ACCUMULATION,
 ]
 PREFIXES_TO_HPARAMS = {
     AGENT_PREFIX_LR: LEARNING_RATE,
@@ -79,6 +79,6 @@ PREFIXES_TO_HPARAMS = {
     AGENT_PREFIX_BETA_TWO: ADAM_BETA_TWO,
     AGENT_PREFIX_EPS: ADAM_EPS,
     AGENT_PREFIX_SGD_MOMENTUM: SGD_MOMENTUM,
-
+    AGENT_PREFIX_GRADIENT_ACCUMULATION: GRADIENT_ACCUMULATION,
 }
 HPARAMS_TO_PREFIXES = {hparam: prefix for prefix, hparam in PREFIXES_TO_HPARAMS.items()}
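For context, the constants change only extends the existing prefix ↔ hyperparameter mapping; `HPARAMS_TO_PREFIXES` is derived by inverting `PREFIXES_TO_HPARAMS`, so the new gradient-accumulation agent is addressable in both directions. The sketch below uses placeholder values; only the "gradient-accumulation" prefix string is taken from the diff:

```python
# Placeholder mapping illustrating the inversion pattern; not the library's real values.
PREFIXES_TO_HPARAMS = {
    "learning-rate": "learning_rate",
    "gradient-accumulation": "gradient_accumulation",  # new agent prefix in 0.19.0
}
HPARAMS_TO_PREFIXES = {hparam: prefix for prefix, hparam in PREFIXES_TO_HPARAMS.items()}

print(HPARAMS_TO_PREFIXES["gradient_accumulation"])  # "gradient-accumulation"
```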