torchax 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchax might be problematic.

torchax/train.py CHANGED
@@ -7,14 +7,11 @@ from torchax import interop
 from torchax.interop import torch_view, jax_view
 import optax

-
 remat = torch_view(jax.remat)
 mark_sharding = torch_view(jax.lax.with_sharding_constraint)


-def make_train_step(model_fn,
-                    loss_fn, optax_optimizer,
-                    remat_policy=None):
+def make_train_step(model_fn, loss_fn, optax_optimizer, remat_policy=None):
   """Make a function that do one train step given model and loss.

   model_fn: a function representing the model's forward:
@@ -32,7 +29,8 @@ def make_train_step(model_fn,
     to do gradient checkpointing. If None, then it means checkpoint everything.
   """
   env = torchax.default_env()
-  def loss(weights, buffers, args, label): # inputs are XLATensor
+
+  def loss(weights, buffers, args, label):  # inputs are XLATensor
     with env, jax.named_scope('compute_loss'):
       res = model_fn(weights, buffers, args)
       l = loss_fn(res, label)
@@ -41,26 +39,24 @@ def make_train_step(model_fn,
     loss = interop.gradient_checkpoint(loss, kwargs={'policy': remat_policy})
   grad_fn = interop.jax_value_and_grad(loss)

-  def step(weights, buffers, opt_state, args, label): #inputs are array
+  def step(weights, buffers, opt_state, args, label):  #inputs are array
     with jax.named_scope('compute_gradient'):
-      loss, gradient = grad_fn(weights, buffers, args, label)
+      loss, gradient = grad_fn(weights, buffers, args, label)

     with jax.named_scope("optimizer_updates"):
-      updates, opt_state = interop.call_jax(
-          optax_optimizer.update,
-          gradient, opt_state, weights)
-      weights = interop.call_jax(optax.apply_updates, weights, updates)
+      updates, opt_state = interop.call_jax(optax_optimizer.update, gradient,
+                                            opt_state, weights)
+      weights = interop.call_jax(optax.apply_updates, weights, updates)
     return loss, weights, opt_state

   # TODO: apply jax.jit so the user don't have to.
   return step

-
-
-
+
 class Container:
   pass

+
 class ScannedModule(torch.nn.Module):

   def __init__(self, module_list, checkpoint_policy=None):
@@ -75,9 +71,9 @@ class ScannedModule(torch.nn.Module):
     weights = self._stack_layer_weights(module_list)
     self.layer_weights_keys = list(self.c.one_mod.state_dict().keys())
     self.params = torch.nn.ParameterDict({
-      self._param_name_new(k): v for k, v in weights.items()
+        self._param_name_new(k): v for k, v in weights.items()
     })
-
+
   def _stack_layer_weights(self, module_list):
     # Create weights such that, for every [n, m] weights
     # becomes [k, n, m] where k is number of layer
@@ -85,36 +81,37 @@ class ScannedModule(torch.nn.Module):
     temp = collections.defaultdict(list)
     for m in module_list:
       for k, v in m.state_dict().items():
-        temp[k].append(v)
+        temp[k].append(v)
     res = {k: torch.stack(v) for k, v in temp.items()}
     return res

-
   def _param_name_new(self, old):
-    return '___'.join(old.split('.'))
+    return '___'.join(old.split('.'))

   def _param_name_old(self, new):
-    return '.'.join(new.split('___'))
+    return '.'.join(new.split('___'))

   def forward(self, *args, **kwargs):
-    assert not kwargs
-    weights = {k: self.params[self._param_name_new(k)] for k in self.layer_weights_keys}
-    scan = interop.torch_view(jax.lax.scan)
-
-    def eval_one_layer(args, weight):
-      # unpack args
-      h, *rest = args
-      newh = torch.func.functional_call(self.c.one_mod, weight, args)
-      # next layer's input; and residual to be added to list
-      return (newh, *rest), None
-
-    _eval_one_layer = interop.gradient_checkpoint(
-        eval_one_layer,
-        kwargs={'policy': self.checkpoint_policy},
-    )
-    h, _ = scan(
-        _eval_one_layer,
-        args,
-        weights,
-    )
-    return h[0]
+    assert not kwargs
+    weights = {
+        k: self.params[self._param_name_new(k)] for k in self.layer_weights_keys
+    }
+    scan = interop.torch_view(jax.lax.scan)
+
+    def eval_one_layer(args, weight):
+      # unpack args
+      h, *rest = args
+      newh = torch.func.functional_call(self.c.one_mod, weight, args)
+      # next layer's input; and residual to be added to list
+      return (newh, *rest), None
+
+    _eval_one_layer = interop.gradient_checkpoint(
+        eval_one_layer,
+        kwargs={'policy': self.checkpoint_policy},
+    )
+    h, _ = scan(
+        _eval_one_layer,
+        args,
+        weights,
+    )
+    return h[0]
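
A minimal usage sketch of the make_train_step API shown above. Only make_train_step's signature and the step function's inputs/outputs come from the diff; the toy model, the functional forward wrapper, and the interop.call_jax call used to initialize the optimizer state are illustrative assumptions.

# Sketch: one-step training loop built around make_train_step (hypothetical
# model and data; not code from the package).
import optax
import torch
import torchax
from torchax import interop
from torchax.train import make_train_step

env = torchax.default_env()
with env:
  model = torch.nn.Linear(8, 2)               # placeholder model
  weights = dict(model.named_parameters())
  buffers = dict(model.named_buffers())

  def model_fn(weights, buffers, args):
    # Forward pass in functional form, as make_train_step expects.
    return torch.func.functional_call(model, {**weights, **buffers}, args)

  optimizer = optax.adam(1e-3)
  # Assumption: optimizer state can be built through interop.call_jax, the
  # same bridge the diff uses for optimizer.update.
  opt_state = interop.call_jax(optimizer.init, weights)

  step = make_train_step(model_fn, torch.nn.functional.mse_loss, optimizer)
  # Per the TODO in the diff, the caller is expected to jit `step` themselves;
  # it returns (loss, new_weights, new_opt_state).
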
torchax/util.py ADDED
@@ -0,0 +1,88 @@
+from typing import Any, Callable
+
+
+def partition(original: list[Any],
+              func: Callable[[Any], bool]) -> tuple[list[Any], list[Any]]:
+  """Partitions elements into two parallel lists based on a predicate function.
+
+  Iterates through the 'original' list, applying 'func' to each element 'a'.
+  - If `func(a)` returns True, 'a' is appended to the first list ('truthy')
+    and `None` is appended to the second list ('falsy').
+  - If `func(a)` returns False, `None` is appended to the first list ('truthy')
+    and 'a' is appended to the second list ('falsy').
+
+  The result is two lists of the same length as the 'original' list, acting
+  as parallel representations of the partitioned elements, using `None` as
+  placeholders.
+
+  This is useful when we want to mark a group of elements as static (via passing
+  static_argnums) or donated (via donate_argnums) when combining with jax.jit
+  and friends.
+
+  Args:
+    original: The list of elements to partition.
+    func: A callable (function or lambda) that accepts an element from
+      'original' and returns a boolean value (True or False).
+
+  Returns:
+    A tuple containing two lists (`truthy`, `falsy`), both of the same
+    length as `original`:
+    - The first list contains elements `x` where `func(x)` was True, and
+      `None` otherwise.
+    - The second list contains elements `x` where `func(x)` was False, and
+      `None` otherwise.
+
+  Example:
+    >>> def is_even(n): return n % 2 == 0
+    >>> nums = [1, 2, 3, 4, 5, 6]
+    >>> truthy_list, falsy_list = partition(nums, is_even)
+    >>> truthy_list
+    [None, 2, None, 4, None, 6]
+    >>> falsy_list
+    [1, None, 3, None, 5, None]
+  """
+  truthy = []
+  falsy = []
+  for a in original:
+    t, f = (a, None) if func(a) else (None, a)
+    truthy.append(t)
+    falsy.append(f)
+  return truthy, falsy
+
+
+def merge(list1: list[Any], list2: list[Any]) -> list[Any]:
+  """Merges two lists element-wise, prioritizing non-None elements from list1.
+
+  Creates a new list where each element is taken from the corresponding position
+  in 'list1', unless that element is None, in which case the element from the
+  corresponding position in 'list2' is used. Assumes both lists have the
+  same length.
+
+  Invariant: merge(*partion(input_list, predicate)) == input_list for any predicate
+
+  Args:
+    list1: The primary list. Its elements are preferred unless they are None.
+    list2: The secondary list. Its elements are used as fallbacks when the
+      corresponding element in list1 is None.
+
+  Returns:
+    A new list representing the merged result.
+
+  Raises:
+    AssertionError: If 'list1' and 'list2' do not have the same length.
+
+  Example:
+    >>> l1 = [1, None, 3, None]
+    >>> l2 = [None, 2, None, 4]
+    >>> merge(l1, l2)
+    [1, 2, 3, 4]
+    >>> l3 = [None, 'b', None]
+    >>> l4 = ['a', None, 'c']
+    >>> merge(l3, l4)
+    ['a', 'b', 'c']
+  """
+  assert len(list1) == len(list2)
+  res = []
+  for a, b in zip(list1, list2):
+    res.append(b if a is None else a)
+  return res
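
The partition/merge pair round-trips a mixed argument list, which is what makes it convenient for marking some arguments static or donated before jax.jit, as the docstrings note. A small sketch (the argument list and predicate are made up):

from torchax.util import partition, merge

args = ['adam', 0.9, 'cosine', 128]   # mixed static (str) and dynamic values
static, dynamic = partition(args, lambda a: isinstance(a, str))
# static  == ['adam', None, 'cosine', None]
# dynamic == [None, 0.9, None, 128]

# The invariant stated in merge()'s docstring: the split is lossless.
assert merge(static, dynamic) == args
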
torchax/view.py ADDED
@@ -0,0 +1,377 @@
+import torch
+import torch.utils._pytree as torch_pytree
+import jax
+from enum import Enum
+from typing import Union, List, Tuple, Optional, Any, cast
+from abc import ABC, abstractmethod
+
+# Reference to original PyTorch native functions
+# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml
+
+
+class ViewInfoType(Enum):
+  INVALID = 0
+  NARROW = 1
+  NO_OP = 2
+  PERMUTE = 3
+  RESHAPE = 4
+  RESIZE = 5
+  SELECT = 6
+  AS_STRIDED = 7
+  DIAGONAL = 8
+
+
+class ViewInfo(ABC):
+  """
+  Abstract base class for all view operations.
+  Defines the interface for applying and updating view transformations.
+  """
+
+  def __init__(
+      self,
+      view_info_type: ViewInfoType = ViewInfoType.INVALID,
+  ):
+    """
+    Initialize a ViewInfo object.
+
+    Args:
+      view_info_type: The type of view operation
+    """
+    self.view_info_type = view_info_type
+
+  @abstractmethod
+  def update_tensor(self, new_value: jax.Array,
+                    jax_array: jax.Array) -> jax.Array:
+    """
+    Apply this view transformation to a JAX array and update its value.
+
+    Args:
+      new_value: The new values to set in the view
+      jax_array: The parent array to update
+
+    Returns:
+      Updated array
+    """
+    pass
+
+  @abstractmethod
+  def transform_tensor(self, jax_array: jax.Array) -> jax.Array:
+    """
+    Apply this view transformation to a JAX array.
+
+    Args:
+      jax_array: The array to transform
+
+    Returns:
+      Transformed array
+    """
+    pass
+
+  @abstractmethod
+  def calculate_output_shape(self, source: jax.Array) -> List[int]:
+    """
+    Calculate the resulting shape after applying this view.
+
+    Args:
+      source: Original jax array before transformation
+
+    Returns:
+      Resulting shape after transformation
+    """
+    pass
+
+
+class NarrowInfo(ViewInfo):
+  """
+  Represents a slicing operation on a tensor.
+  Handles operations like tensor[1:3, :, 2:5:2].
+  """
+
+  def __init__(self, slices: Union[slice, Tuple[slice]]) -> None:
+    """
+    Args:
+      slices: The slice(s) to apply to the tensor.
+        E.g. jax_array.at[slices] will return the transformed tensor.
+    """
+    super().__init__(ViewInfoType.NARROW)
+    self.slices = slices
+
+  def __eq__(self, other: object) -> bool:
+    if not isinstance(other, NarrowInfo):
+      return False
+    return self.slices == other.slices
+
+  def transform_tensor(self, jax_array: jax.Array) -> jax.Array:
+    try:
+      return jax_array[self.slices]
+    except IndexError as e:
+      raise IndexError("Invalid slice operation") from e
+
+  def update_tensor(self, new_value: jax.Array,
+                    jax_array: jax.Array) -> jax.Array:
+    return jax_array.at[self.slices].set(new_value)
+
+  def calculate_output_shape(self, source: jax.Array) -> List[int]:
+    return source[self.slices].shape
+
+
+class SelectInfo(ViewInfo):
+  """
+  Represents a selection operation on a tensor.
+  Typically used for indexing operations that select specific elements.
+  """
+
+  def __init__(self,
+               dim: int = 0,
+               start: int = 0,
+               end: int = 0,
+               stride: int = 0) -> None:
+    super().__init__(ViewInfoType.SELECT)
+    self.dim: int = dim
+    self.start: int = start
+    self.end: int = end
+    self.stride: int = stride
+
+  def __eq__(self, other: object) -> bool:
+    if not isinstance(other, SelectInfo):
+      return False
+    return (self.dim == other.dim and self.start == other.start and
+            self.end == other.end and self.stride == other.stride)
+
+  def transform_tensor(self, jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("SelectInfo.apply not implemented")
+
+  def update_tensor(self, new_value: jax.Array,
+                    jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("SelectInfo.update not implemented")
+
+  def calculate_output_shape(self, source: jax.Array) -> List[int]:
+    raise NotImplementedError(
+        "SelectInfo.calculate_output_shape not implemented")
+
+
+class AsStridedInfo(ViewInfo):
+  """
+  Information for as_strided operations.
+  """
+
+  def __init__(self, stride: List[int], offset: int = 0) -> None:
+    super().__init__(ViewInfoType.AS_STRIDED)
+    self.stride: List[int] = stride
+    self.offset: int = offset
+
+  def __eq__(self, other: object) -> bool:
+    if not isinstance(other, AsStridedInfo):
+      return False
+    return self.offset == other.offset and self.stride == other.stride
+
+  def transform_tensor(self, jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("AsStridedInfo.apply not implemented")
+
+  def update_tensor(self, new_value: jax.Array,
+                    jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("AsStridedInfo.update not implemented")
+
+  def calculate_output_shape(self, source: jax.Array) -> List[int]:
+    raise NotImplementedError(
+        "AsStridedInfo.calculate_output_shape not implemented")
+
+
+class DiagonalInfo(ViewInfo):
+  """
+  Information for diagonal operations.
+  Extracts diagonal elements from a tensor.
+  """
+
+  def __init__(self, offset: int = 0, dim1: int = 0, dim2: int = 1) -> None:
+    """
+    Args:
+      offset: Offset from the main diagonal
+      dim1: First dimension for diagonal extraction
+      dim2: Second dimension for diagonal extraction
+    """
+    super().__init__(ViewInfoType.DIAGONAL)
+    self.offset: int = offset
+    self.dim1: int = dim1
+    self.dim2: int = dim2
+
+  def __eq__(self, other: object) -> bool:
+    if not isinstance(other, DiagonalInfo):
+      return False
+    return (self.offset == other.offset and self.dim1 == other.dim1 and
+            self.dim2 == other.dim2)
+
+  def transform_tensor(self, jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("DiagonalInfo.apply not implemented")
+
+  def update_tensor(self, new_value: jax.Array,
+                    jax_array: jax.Array) -> jax.Array:
+    raise NotImplementedError("DiagonalInfo.update not implemented")
+
+  def calculate_output_shape(self, source: jax.Array) -> List[int]:
+    raise NotImplementedError(
+        "DiagonalInfo.calculate_output_shape not implemented")
+
+
+class View(torch.Tensor):
+  """
+  A View is a reference to another Tensor or another View,
+  with a transformation applied to it.
+  """
+
+  @staticmethod
+  def __new__(cls, parent: Union["torchax.Tensor", "View"], view_info: ViewInfo,
+              env: Any) -> "View":
+    """
+    Args:
+      parent: Parent tensor or view
+      view_info: Information about the view transformation
+      env: Environment for tensor operations
+    """
+    shape = view_info.calculate_output_shape(parent.jax())
+    return torch.Tensor._make_wrapper_subclass(
+        cls,
+        shape,
+        device="meta",
+        dtype=parent.dtype,
+        requires_grad=False,
+    )
+
+  def __init__(self, parent: Union["torchax.Tensor", "View"],
+               view_info: ViewInfo, env: Any) -> None:
+    super().__init__()
+    self.parent = parent
+    self.view_info = view_info
+    self._env = env
+
+  def get_transformation_chain(self) -> List[ViewInfo]:
+    """
+    Get all view transformations from the source tensor to this view.
+    """
+    if isinstance(self.parent, View):
+      transformations = self.parent.get_transformation_chain()
+      transformations.append(self.view_info)
+      return transformations
+    else:
+      return [self.view_info]
+
+  __torch_function__ = torch._C._disabled_torch_function_impl
+
+  def source_jax(self) -> jax.Array:
+    """
+    Returns the source tensor.
+    """
+    if isinstance(self.parent, View):
+      return self.parent.source_jax()
+    else:
+      return self.parent.jax()
+
+  def replace_source_jax(self, new_value: jax.Array) -> None:
+    """
+    Update the source tensor with new values.
+    """
+    if isinstance(self.parent, View):
+      self.parent.replace_source_jax(new_value)
+    else:
+      assert new_value.shape == self.parent._elem.shape
+      self.parent._elem = new_value
+
+  def torch(self) -> "torchax.Tensor":
+    """
+    Returns a Torchax tensor representing this view after all transformations
+    """
+    from torchax.tensor import Tensor
+
+    return Tensor(self.jax(), self._env)
+
+  def update(
+      self,
+      new_values: Union[jax.Array, "View", "torchax.Tensor"],
+      view_infos: Optional[List[ViewInfo]] = None,
+  ) -> None:
+    """
+    Update this view with new values, propagating changes back to source.
+    If view_infos is None, it will use the transformation chain
+    from the source tensor.
+    """
+    if view_infos is None:
+      view_infos = self.get_transformation_chain()
+
+    # Get the source JAX array
+    source_array = self.source_jax()
+
+    # Get the new value
+    from torchax.tensor import Tensor
+
+    if isinstance(new_values, View) or isinstance(new_values, Tensor):
+      new_values = new_values.jax()
+
+    # Apply all view transformations to the source array
+    # And store intermediate values
+    intermediate_values = [source_array]
+    for view_info in view_infos[:-1]:
+      intermediate_values.append(
+          view_info.transform_tensor(intermediate_values[-1]))
+
+    # TODO: Investigate efficiency of this algorithm
+    # Update the source array with the new value by
+    # applying inverse transformations in reverse order
+    for view_info, parent_array in zip(
+        reversed(view_infos), reversed(intermediate_values)):
+      # Apply the inverse transformation to propagate changes back
+      new_values = view_info.update_tensor(new_values, parent_array)
+
+    # Update the source tensor with the new values
+    self.replace_source_jax(new_values)
+
+  @classmethod
+  def __torch_dispatch__(
+      cls,
+      func: Any,
+      types: Tuple[Any, ...],
+      args: Tuple[Any, ...] = (),
+      kwargs: Optional[dict] = None,
+  ) -> Any:
+    raise AssertionError(
+        'torchax Tensors can only do math within the torchax environment.'
+        'Please wrap your code with `with torchax.default_env()` or '
+        'call torchax.enable_globally() before.')
+
+  def create_sub_view(self, view_info: ViewInfo) -> "View":
+    """
+    Create a new view that is a child of this view.
+    """
+    return View(self, view_info, self._env)
+
+  def __str__(self) -> str:
+    return f"View({self.torch()})"
+
+  def jax(self) -> jax.Array:
+    """
+    Returns a copy of the source tensor after transformations.
+    """
+    result = self.source_jax()
+    for view_info in self.get_transformation_chain():
+      result = view_info.transform_tensor(result)
+    return result
+
+  def __setitem__(self, indexes, val):
+    view_infos = self.get_transformation_chain() + [NarrowInfo(indexes)]
+    self.update(view_infos=view_infos, new_values=val)
+
+  def dim(self):
+    return self.ndim
+
+  @property
+  def device(self):
+    return torch.device("jax:0")
+
+  @property
+  def jax_device(self):
+    return self.jax().device
+
+  @property
+  def ndim(self):
+    return len(self.shape)
+
+  __repr__ = __str__
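
A rough sketch of how View composes with NarrowInfo and writes updates back to its parent. The Tensor(jax_array, env) constructor and the .jax() method are used as they appear inside view.py itself; the shapes and values are illustrative assumptions.

# Sketch: a slice view over a torchax Tensor, with a write-back through
# __setitem__ (hypothetical values; not code from the package).
import jax.numpy as jnp
import torchax
from torchax.tensor import Tensor
from torchax.view import NarrowInfo, View

env = torchax.default_env()
base = Tensor(jnp.zeros((4, 4)), env)          # source tensor holding a jax.Array
row1 = View(base, NarrowInfo(slices=1), env)   # view of base[1], shape (4,)

# __setitem__ appends a NarrowInfo to the transformation chain and propagates
# the new values back through update_tensor(), ending in replace_source_jax().
row1[0:2] = Tensor(jnp.ones(2), env)

print(base.jax()[1, :2])   # now all ones; row1.jax() reflects the same values
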