torchax 0.0.10.dev20251114__py3-none-any.whl → 0.0.11.dev202612__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

torchax/interop.py CHANGED
@@ -15,20 +15,24 @@
  import collections
  import copy
  import functools
- import torch
- from inspect import signature
  from functools import wraps
- from torch.nn.utils import stateless as torch_stateless
+ from inspect import signature
+
  import jax
  import jax.numpy as jnp
+ import torch
  from jax import tree_util as pytree
- from jax.experimental.shard_map import shard_map
- from torchax import tensor
- from torchax import util
- from torchax.ops import mappings
+ from torch.nn.utils import stateless as torch_stateless
+
  import torchax
+ from torchax import tensor, util
+ from torchax.ops import mappings
+ from torchax.types import JaxCallable, JaxValue, TorchCallable, TorchValue

- from torchax.types import JaxValue, TorchValue, JaxCallable, TorchCallable
+ try:
+ from jax import shard_map as shard_map # for jax since v0.8.0
+ except ImportError:
+ from jax.experimental.shard_map import shard_map


  def extract_all_buffers(m: torch.nn.Module):
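The try/except added above only changes where `shard_map` is imported from; its call signature is the same on both sides. A minimal standalone sketch of the same fallback pattern (not taken from torchax; assumes the number of local devices divides the array's leading dimension):

import functools

import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P

try:
    from jax import shard_map  # public location since jax 0.8.0, per the hunk above
except ImportError:
    from jax.experimental.shard_map import shard_map  # older jax releases

# One-dimensional mesh over whatever devices are available.
mesh = Mesh(jax.devices(), axis_names=("x",))

@functools.partial(shard_map, mesh=mesh, in_specs=P("x"), out_specs=P("x"))
def double(block):
    # Each device sees only its own shard of the input array.
    return block * 2

print(double(jnp.arange(8.0)))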
@@ -39,7 +43,7 @@ def extract_all_buffers(m: torch.nn.Module):
  for k in dir(module):
  try:
  v = getattr(module, k)
- except:
+ except Exception:
  continue
  qual_name = prefix + k
  if isinstance(v, torch.nn.parameter.Parameter) and v.requires_grad:
@@ -47,14 +51,13 @@ def extract_all_buffers(m: torch.nn.Module):
  elif isinstance(v, torch.Tensor):
  buffers[qual_name] = v
  for name, child in module.named_children():
- extract_one(child, prefix + name + '.')
+ extract_one(child, prefix + name + ".")

- extract_one(m, '')
+ extract_one(m, "")
  return params, buffers


  def set_all_buffers(m, params, buffers):
-
  def set_one(module, prefix):
  for k in dir(module):
  qual_name = prefix + k
@@ -64,17 +67,15 @@ def set_all_buffers(m, params, buffers):
  print(k, potential_v)
  setattr(module, k, torch.nn.Parameter(potential_v))
  for name, child in module.named_children():
- set_one(child, prefix + name + '.')
+ set_one(child, prefix + name + ".")

- set_one(m, '')
+ set_one(m, "")


  class JittableModule(torch.nn.Module):
-
- def __init__(self,
- m: torch.nn.Module,
- extra_jit_args={},
- dedup_parameters=True):
+ def __init__(self, m: torch.nn.Module, extra_jit_args=None, dedup_parameters=True):
+ if extra_jit_args is None:
+ extra_jit_args = {}
  super().__init__()
  self.params, self.buffers = extract_all_buffers(m)
  self._model = m
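Replacing `extra_jit_args={}` with a `None` default sidesteps Python's shared-mutable-default pitfall: a `{}` default is created once at function definition time and silently shared by every call. A small illustration with a hypothetical class (not from torchax):

class Widget:
    def __init__(self, options={}):        # one dict object shared by every instance
        self.options = options

a, b = Widget(), Widget()
a.options["donate_argnums"] = (0,)
print(b.options)                            # {'donate_argnums': (0,)} -- leaked into b

class SafeWidget:
    def __init__(self, options=None):       # the pattern the new __init__ uses
        self.options = {} if options is None else options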
@@ -119,7 +120,7 @@ class JittableModule(torch.nn.Module):
  else:
  if not callable(method_or_name):
  raise TypeError(
- f"method_or_name should be a callable or a string, got {type(method_or_name)}"
+ f"method_or_name should be a callable or a string, got {type(method_or_name)}"
  )
  method = method_or_name
  args = (self._model,) + args
@@ -130,8 +131,8 @@ class JittableModule(torch.nn.Module):
  def jittable_call(self, method_name: str, *args, **kwargs):
  if method_name not in self._jitted:
  jitted = jax_jit(
- functools.partial(self.functional_call, method_name),
- kwargs_for_jax_jit=self._extra_jit_args,
+ functools.partial(self.functional_call, method_name),
+ kwargs_for_jax_jit=self._extra_jit_args,
  )

  def jitted_forward(*args, **kwargs):
@@ -141,10 +142,10 @@ class JittableModule(torch.nn.Module):
  return self._jitted[method_name](*args, **kwargs)

  def forward(self, *args, **kwargs):
- return self.jittable_call('forward', *args, **kwargs)
+ return self.jittable_call("forward", *args, **kwargs)

  def __getattr__(self, key):
- if key == '_model':
+ if key == "_model":
  return super().__getattr__(key)
  if key in self._jitted:
  return self._jitted[key]
@@ -152,8 +153,9 @@ class JittableModule(torch.nn.Module):

  def make_jitted(self, key):
  jitted = jax_jit(
- functools.partial(self.functional_call, key),
- kwargs_for_jax_jit=self._extra_jit_args)
+ functools.partial(self.functional_call, key),
+ kwargs_for_jax_jit=self._extra_jit_args,
+ )

  def call(*args, **kwargs):
  return jitted(self.params, self.buffers, *args, **kwargs)
@@ -162,7 +164,6 @@ class JittableModule(torch.nn.Module):


  class CompileMixin:
-
  def functional_call(self, method, params, buffers, *args, **kwargs):
  kwargs = kwargs or {}
  params_copy = copy.copy(params)
@@ -172,24 +173,23 @@ class CompileMixin:
  return res

  def jit(self, method):
- jitted = jax_jit(functools.partial(self.functional_call, method_name))
+ jitted = jax_jit(functools.partial(self.functional_call, method_name)) # noqa: F821

  def call(*args, **kwargs):
- return jitted(self.named_paramters(), self.named_buffers(), *args,
- **kwargs)
+ return jitted(self.named_paramters(), self.named_buffers(), *args, **kwargs)

  return call


  def compile_nn_module(m: torch.nn.Module, methods=None):
  if methods is None:
- methods = ['forward']
+ methods = ["forward"]

- new_parent = type(
- m.__class__.__name__ + '_with_CompileMixin',
- (CompileMixin, m.__class__),
+ type(
+ m.__class__.__name__ + "_with_CompileMixin",
+ (CompileMixin, m.__class__),
  )
- m.__class__ = NewParent
+ m.__class__ = NewParent # noqa: F821


  def _torch_view(t: JaxValue) -> TorchValue:
@@ -227,15 +227,17 @@ def _jax_view(t: TorchValue) -> JaxValue:
  jax_view = functools.partial(pytree.tree_map, _jax_view)


- def call_jax(jax_func: JaxCallable, *args: TorchValue,
- **kwargs: TorchValue) -> TorchValue:
+ def call_jax(
+ jax_func: JaxCallable, *args: TorchValue, **kwargs: TorchValue
+ ) -> TorchValue:
  args, kwargs = jax_view((args, kwargs))
  res: JaxValue = jax_func(*args, **kwargs)
  return torch_view(res)


- def call_torch(torch_func: TorchCallable, *args: JaxValue,
- **kwargs: JaxValue) -> JaxValue:
+ def call_torch(
+ torch_func: TorchCallable, *args: JaxValue, **kwargs: JaxValue
+ ) -> JaxValue:
  args, kwargs = torch_view((args, kwargs))
  with torchax.default_env():
  res: TorchValue = torch_func(*args, **kwargs)
@@ -245,10 +247,10 @@ def call_torch(torch_func: TorchCallable, *args: JaxValue,
  def j2t_autograd(fn, call_jax=call_jax):
  """Given a JAX function, returns a PyTorch autograd function implemented with `jax.vjp(fn)`.

- It wraps `fn` with `jax.vjp` to compute both the output and residuals (intermediate
- activations). The wrapped function is then run via `call_jax` and integrated into
- the PyTorch autograd framework by saving the residuals into the context object.
- """
+ It wraps `fn` with `jax.vjp` to compute both the output and residuals (intermediate
+ activations). The wrapped function is then run via `call_jax` and integrated into
+ the PyTorch autograd framework by saving the residuals into the context object.
+ """

  # NOTE(qihqi): This function cannot be inlined from the callsite
  # Becuase if it does, then it won't hit the compilation cache for
@@ -261,7 +263,7 @@ def j2t_autograd(fn, call_jax=call_jax):
  primals should be a tuple (args, kwargs).
  """
  import jax
- from jax.tree_util import tree_flatten, tree_unflatten
+ from jax.tree_util import tree_unflatten

  def fn_wrapper(*tensors):
  # Reconstruct the original args and kwargs
@@ -277,6 +279,7 @@ def j2t_autograd(fn, call_jax=call_jax):
  Unflattening `saved_tensors` with `vjp_spec` should restore the original vjp function.
  """
  from jax.tree_util import tree_unflatten
+
  fun_vjp = tree_unflatten(vjp_spec, saved_tensors)
  return fun_vjp(grad_out)

@@ -285,12 +288,11 @@ def j2t_autograd(fn, call_jax=call_jax):
  from jax.tree_util import tree_flatten

  class JaxFun(torch.autograd.Function):
-
  @staticmethod
  def forward(ctx, tree_def, *flat_args_kwargs):
-
- tensors, other = util.partition(flat_args_kwargs,
- lambda x: isinstance(x, torch.Tensor))
+ tensors, other = util.partition(
+ flat_args_kwargs, lambda x: isinstance(x, torch.Tensor)
+ )
  # We want the arguments that don't require grads to be closured?

  y, fun_vjp = call_jax(_jax_forward, fn, other, tree_def, tensors)
@@ -308,8 +310,9 @@ def j2t_autograd(fn, call_jax=call_jax):
  assert len(grad_out) > 0
  grad_out = grad_out if len(grad_out) > 1 else grad_out[0]

- input_grads_structured = call_jax(_jax_backward, ctx.vjp_spec,
- ctx.saved_tensors, grad_out)
+ input_grads_structured = call_jax(
+ _jax_backward, ctx.vjp_spec, ctx.saved_tensors, grad_out
+ )

  # Construct the gradient tuple to be returned.
  # It needs to match the inputs to forward: (tree_def, *flat_inputs)
@@ -318,7 +321,8 @@ def j2t_autograd(fn, call_jax=call_jax):
  # We need to put a None for inputs that did not require gradients.
  final_grads = [None]
  for needs_grad, grad in zip(
- ctx.needs_input_grad[1:], input_grads_structured, strict=True):
+ ctx.needs_input_grad[1:], input_grads_structured, strict=True
+ ):
  final_grads.append(grad if needs_grad else None)

  return tuple(final_grads)
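For readers unfamiliar with the pattern `j2t_autograd` implements, the core idea is to run `jax.vjp` during `forward` and replay the returned vjp closure during `backward`. A stripped-down, single-tensor sketch of that bridge (residuals are kept directly on `ctx` rather than flattened into `save_for_backward` as torchax does, and the host round-trips via NumPy are purely for illustration):

import jax
import jax.numpy as jnp
import torch

class SquareViaJax(torch.autograd.Function):
    """Bridge a JAX function into torch autograd with jax.vjp."""

    @staticmethod
    def forward(ctx, x):
        # Run the JAX primal and keep the vjp closure (it holds the residuals).
        y, vjp_fn = jax.vjp(lambda a: a ** 2, jnp.asarray(x.detach().cpu().numpy()))
        ctx.vjp_fn = vjp_fn
        return torch.from_numpy(jax.device_get(y).copy())

    @staticmethod
    def backward(ctx, grad_out):
        # Replay the vjp with the incoming cotangent to get the input gradient.
        (grad_x,) = ctx.vjp_fn(jnp.asarray(grad_out.detach().cpu().numpy()))
        return torch.from_numpy(jax.device_get(grad_x).copy())

x = torch.tensor([3.0], requires_grad=True)
SquareViaJax.apply(x).sum().backward()
print(x.grad)  # tensor([6.])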
@@ -343,27 +347,27 @@ def wrap_jax_jit(torch_function, jax_jit_func=jax.jit, kwargs_for_jax=None):
  return torch_view(jitted)


- def jax_jit(torch_function,
- kwargs_for_jax_jit=None,
- fix_for_buffer_donation=False):
+ def jax_jit(torch_function, kwargs_for_jax_jit=None, fix_for_buffer_donation=False):
  return wrap_jax_jit(
- torch_function, jax_jit_func=jax.jit, kwargs_for_jax=kwargs_for_jax_jit)
+ torch_function, jax_jit_func=jax.jit, kwargs_for_jax=kwargs_for_jax_jit
+ )


  def jax_shard_map(torch_function, kwargs_for_jax_shard_map=None):
  return wrap_jax_jit(
- torch_function,
- jax_jit_func=shard_map,
- kwargs_for_jax=kwargs_for_jax_shard_map)
+ torch_function, jax_jit_func=shard_map, kwargs_for_jax=kwargs_for_jax_shard_map
+ )


  def jax_value_and_grad(torch_function, kwargs_for_value_and_grad=None):
  return wrap_jax_jit(
- torch_function,
- jax_jit_func=jax.value_and_grad,
- kwargs_for_jax=kwargs_for_value_and_grad)
+ torch_function,
+ jax_jit_func=jax.value_and_grad,
+ kwargs_for_jax=kwargs_for_value_and_grad,
+ )


  def gradient_checkpoint(torch_function, kwargs=None):
  return wrap_jax_jit(
- torch_function, jax_jit_func=jax.checkpoint, kwargs_for_jax=kwargs)
+ torch_function, jax_jit_func=jax.checkpoint, kwargs_for_jax=kwargs
+ )
torchax/mesh_util.py CHANGED
@@ -13,8 +13,9 @@
  # limitations under the License.

  import jax
- from jax.sharding import PartitionSpec, NamedSharding
  import torch
+ from jax.sharding import NamedSharding, PartitionSpec
+
  import torchax
  from torchax import interop

@@ -94,12 +95,13 @@ class SingleAxisSharder:
  `_shard_first_multiple_of`.
  """
  del name
- sharding = _shard_first_multiple_of(self.axis_name, shapedtype.shape,
- self.axis_size)
+ sharding = _shard_first_multiple_of(
+ self.axis_name, shapedtype.shape, self.axis_size
+ )
  if not self.replicate_unshardable and all(s is None for s in sharding):
  raise AssertionError(
- f"Unable to find a dim to shard because "
- f"None of the dims ({shapedtype.shape}) in shape is multiple of {self.axis_size}"
+ f"Unable to find a dim to shard because "
+ f"None of the dims ({shapedtype.shape}) in shape is multiple of {self.axis_size}"
  )
  return sharding

@@ -159,15 +161,14 @@ class Mesh:
  self.jax_mesh = jax_mesh
  if sharder is None:
  assert len(self.jax_mesh.axis_names) == 1
- sharder = SingleAxisSharder(self.jax_mesh.axis_names[0],
- len(self.mesh.device_ids))
+ sharder = SingleAxisSharder(
+ self.jax_mesh.axis_names[0], len(self.mesh.device_ids)
+ )
  self._sharder = sharder

- def initialize_model_sharded(self,
- model_class,
- init_args,
- init_kwargs=None,
- override_sharder=None):
+ def initialize_model_sharded(
+ self, model_class, init_args, init_kwargs=None, override_sharder=None
+ ):
  """Initializes a PyTorch model with its parameters sharded across the mesh.

  This method orchestrates the initialization of a `torch.nn.Module` such
@@ -208,17 +209,18 @@ class Mesh:

  states = model.state_dict()
  output_shards = {
- name: NamedSharding(self.jax_mesh, sharder(name, tensor))
- for name, tensor in states.items()
+ name: NamedSharding(self.jax_mesh, sharder(name, tensor))
+ for name, tensor in states.items()
  }

  def model_initializer():
- with torchax.default_env(), torch.device('meta'):
+ with torchax.default_env(), torch.device("meta"):
  model = model_class(*init_args, **init_kwargs)
  return dict(model.state_dict())

  jitted = interop.jax_jit(
- model_initializer, kwargs_for_jax_jit={"out_shardings": output_shards})
+ model_initializer, kwargs_for_jax_jit={"out_shardings": output_shards}
+ )
  weights_dict = jitted()

  model.load_state_dict(weights_dict, assign=True)
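The jitted initializer above relies on a plain JAX mechanism: `jax.jit` with `out_shardings` places each output directly onto the mesh, so parameters are created already sharded rather than materialized on one device first. A minimal JAX-only sketch of that mechanism with hypothetical shapes and names (assumes the sharded dimension is divisible by the device count):

import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

mesh = Mesh(jax.devices(), axis_names=("axis",))
row_sharded = NamedSharding(mesh, PartitionSpec("axis"))

def init_weights():
    # Stand-in for model_initializer(): build the parameter pytree.
    return {"linear.weight": jnp.ones((8, 4)), "linear.bias": jnp.zeros((8,))}

jitted = jax.jit(
    init_weights,
    out_shardings={"linear.weight": row_sharded, "linear.bias": row_sharded},
)
weights = jitted()
print(weights["linear.weight"].sharding)  # NamedSharding over the "axis" dimension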
@@ -228,7 +230,7 @@ class Mesh:
  sharder = override_sharder or self._sharder
  states = model.state_dict()
  output_shards = {
- name: NamedSharding(self.jax_mesh, sharder(name, tensor))
- for name, tensor in states.items()
+ name: NamedSharding(self.jax_mesh, sharder(name, tensor))
+ for name, tensor in states.items()
  }
  model.load_state_dict(output_shards, assign=True)
torchax/ops/__init__.py CHANGED
@@ -12,13 +12,14 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+
  def all_aten_jax_ops():
  # to load the ops
  import torchax.ops.jaten # type: ignore
  import torchax.ops.ops_registry # type: ignore

  return {
- key: val.func
- for key, val in torchax.ops.ops_registry.all_aten_ops.items()
- if val.is_jax_function
+ key: val.func
+ for key, val in torchax.ops.ops_registry.all_aten_ops.items()
+ if val.is_jax_function
  }