PyPI - libinephany - Versions diffs - 0.15.0__tar.gz → 0.15.2__tar.gz - Mend

libinephany 0.15.0 (tar.gz) → 0.15.2 (tar.gz)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (65)

libinephany-0.15.2/CODE_VERSION.cfg ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.15.2

{libinephany-0.15.0/libinephany.egg-info → libinephany-0.15.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: libinephany
-Version: 0.15.0
+Version: 0.15.2
 Summary: Inephany library containing code commonly used by multiple subpackages.
 Author-email: Inephany <info@inephany.com>
 License: Apache 2.0

{libinephany-0.15.0 → libinephany-0.15.2}/libinephany/observations/observation_utils.py RENAMED Viewed

@@ -14,7 +14,7 @@ import torch
 import torch.optim as optim
 from libinephany.pydantic_models.schemas.tensor_statistics import TensorStatistics
-from libinephany.utils import optim_utils
+from libinephany.utils import optim_utils, torch_distributed_utils
 # ======================================================================================================================
 #
@@ -173,7 +173,14 @@ def tensor_on_local_rank(tensor: torch.Tensor | None) -> bool:
     :return: Whether the tensor is owned by the local rank.
     """
-    return tensor is not None and tensor.grad is not None and tensor.numel() > 0
+    valid_tensor = tensor is not None and tensor.grad is not None and tensor.numel() > 0
+    if valid_tensor and tensor.is_cuda:
+        local_rank = torch_distributed_utils.get_local_rank()
+        return tensor.device.index == local_rank
+    return valid_tensor
 def form_update_tensor(

{libinephany-0.15.0 → libinephany-0.15.2}/libinephany/observations/observers/global_observers.py RENAMED Viewed

@@ -36,7 +36,7 @@ class InitialHyperparameters(GlobalObserver):
         super().__init__(**kwargs)
-        force_skip = ["samples"]
+        force_skip = ["samples", "gradient_accumulation"]
         skip_hparams = force_skip if skip_hparams is None else skip_hparams + force_skip
         self.skip_hparams = [] if skip_hparams is None else skip_hparams
         self.pad_with = pad_with

{libinephany-0.15.0 → libinephany-0.15.2}/libinephany/observations/statistic_trackers.py RENAMED Viewed

@@ -193,13 +193,14 @@ class Statistic(ABC):
         Processes the tensor cache to build a TensorStatistic model.
         """
-        concatenated = torch.cat(self._tensor_cache)
-        self._tensor_cache = []
+        if self._tensor_cache:
+            concatenated = torch.cat(self._tensor_cache)
+            self._tensor_cache = []
-        statistics = TensorStatistics.build(
-            tensor=concatenated, skip_statistics=self.skip_statistics, sample_percentage=self.downsample_percent
-        )
-        self._data.append(statistics)  # type: ignore
+            statistics = TensorStatistics.build(
+                tensor=concatenated, skip_statistics=self.skip_statistics, sample_percentage=self.downsample_percent
+            )
+            self._data.append(statistics)  # type: ignore
     @staticmethod
     @final
@@ -213,10 +214,10 @@ class Statistic(ABC):
         if torch_distributed_utils.is_scheduler_master_rank():
             if isinstance(statistic, torch.Tensor):
-                shape = statistic.shape
+                shape = statistic.view(-1).shape
             elif isinstance(statistic, TensorStatistics):
-                shape = statistic.to_tensor().shape
+                shape = statistic.to_tensor().view(-1).shape
             elif statistic is not None:
                 shape = torch.tensor([statistic]).shape
@@ -239,23 +240,21 @@ class Statistic(ABC):
         if not torch_distributed_utils.is_distributed():
             return statistic
-        if statistic is None:
-            shape = self._determine_reduction_shape(statistic=statistic)
-            if shape is None:
-                return statistic
+        shape = self._determine_reduction_shape(statistic=statistic)
-            to_reduce = torch.zeros(shape)
+        if statistic is None:
+            to_reduce = torch.zeros(shape, dtype=torch.float64)
         elif isinstance(statistic, torch.Tensor):
-            to_reduce = statistic.clone()
+            to_reduce = statistic.clone().to(torch.float64).view(-1)
         elif isinstance(statistic, TensorStatistics):
-            to_reduce = statistic.to_tensor()
+            to_reduce = statistic.to_tensor().to(torch.float64).view(-1)
         else:
-            to_reduce = torch.tensor([statistic])
+            to_reduce = torch.tensor([statistic], dtype=torch.float64)
+        to_reduce = to_reduce.to(torch_distributed_utils.get_local_device())
         dist.reduce(to_reduce, dst=MASTER_SCHEDULER_RANK, op=ReduceOp.SUM)
         if not torch_distributed_utils.is_scheduler_master_rank():
@@ -288,23 +287,21 @@ class Statistic(ABC):
             statistic = self._gather(
                 optimizer=optimizer, model=model, parameters=parameters, parameter_group=parameter_group
             )
-            statistic = self._distributed_reduce(statistic=statistic)
-            if not torch_distributed_utils.is_scheduler_master_rank():
-                return
+            statistic = self._distributed_reduce(statistic=statistic)
-            if isinstance(statistic, torch.Tensor):
-                statistic = statistic.view(-1)
-                self._tensor_cache.append(statistic)
+            if torch_distributed_utils.is_scheduler_master_rank():
+                if isinstance(statistic, torch.Tensor):
+                    statistic = statistic.view(-1)
+                    self._tensor_cache.append(statistic)
-                if len(self._tensor_cache) >= self.max_cache_size:
-                    self._process_tensor_cache()
+                    if len(self._tensor_cache) >= self.max_cache_size:
+                        self._process_tensor_cache()
-            elif statistic is not None:
-                self._data.append(statistic)  # type: ignore
+                elif statistic is not None:
+                    self._data.append(statistic)  # type: ignore
-        if torch_distributed_utils.is_scheduler_master_rank():
-            self._sample_number += 1
+        self._sample_number += 1
     @final
     def fetch(self) -> TensorStatistics | float | None:

{libinephany-0.15.0 → libinephany-0.15.2}/libinephany/utils/torch_distributed_utils.py RENAMED Viewed

@@ -4,8 +4,10 @@
 #
 # ======================================================================================================================
+import os
 from typing import Any
+import torch
 import torch.distributed as dist
 # ======================================================================================================================
@@ -14,7 +16,11 @@ import torch.distributed as dist
 #
 # ======================================================================================================================
+CUDA = "cuda"
+CPU = "cpu"
+CUDA_PREFIX = f"{CUDA}:"
 MASTER_SCHEDULER_RANK = 0
+LOCAL_RANK = "LOCAL_RANK"
 # ======================================================================================================================
 #
@@ -48,7 +54,10 @@ def get_local_rank() -> int:
     :return: Distributed computing rank of this process.
     """
-    return dist.get_rank() if is_distributed() else MASTER_SCHEDULER_RANK
+    if not is_distributed():
+        return MASTER_SCHEDULER_RANK
+    return dist.get_rank()
 def is_scheduler_master_rank() -> bool:
@@ -83,3 +92,15 @@ def barrier() -> None:
     if is_distributed():
         dist.barrier()
+def get_local_device() -> torch.device:
+    """
+    :return: Local device of the current rank.
+    """
+    if not is_distributed():
+        return torch.device(CUDA if torch.cuda.is_available() else CPU)
+    local_device_rank = os.environ.get(LOCAL_RANK, MASTER_SCHEDULER_RANK)
+    return torch.device(f"{CUDA_PREFIX}{local_device_rank}" if torch.cuda.is_available() else CPU)

{libinephany-0.15.0 → libinephany-0.15.2/libinephany.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: libinephany
-Version: 0.15.0
+Version: 0.15.2
 Summary: Inephany library containing code commonly used by multiple subpackages.
 Author-email: Inephany <info@inephany.com>
 License: Apache 2.0