torchax 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchax might be problematic.
- torchax/__init__.py +5 -41
- torchax/amp.py +2 -3
- torchax/config.py +5 -1
- torchax/configuration.py +30 -0
- torchax/device_module.py +7 -0
- torchax/environment.py +1 -0
- torchax/interop.py +27 -14
- torchax/mesh_util.py +10 -1
- torchax/ops/jaten.py +5 -3
- torchax/ops/jtorch.py +18 -10
- torchax/tensor.py +127 -115
- {torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/METADATA +1 -1
- {torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/RECORD +15 -14
- torchax/distributed.py +0 -241
- {torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/WHEEL +0 -0
- {torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/licenses/LICENSE +0 -0
torchax/__init__.py
CHANGED

@@ -6,10 +6,9 @@ import os
 import torch
 from torch.utils import _pytree as pytree
 from torchax import tensor
-from torchax import distributed  # noqa: F401
 from contextlib import contextmanager

-__version__ = "0.0.5"
+__version__ = "0.0.6"
 VERSION = __version__

 __all__ = [

@@ -50,10 +49,11 @@ def extract_jax(mod: torch.nn.Module, env=None):
   states = env.t2j_copy(states)

   #@jax.jit
-  def jax_func(states,
-    (states,
+  def jax_func(states, args, kwargs=None):
+    (states, args, kwargs) = env.j2t_iso((states, args, kwargs))
     with env:
-      res = torch.func.functional_call(
+      res = torch.func.functional_call(
+          mod, states, args, kwargs, tie_weights=False)
     return env.t2j_iso(res)

   return states, jax_func

@@ -81,11 +81,6 @@ def disable_temporarily():

 torch.utils.rename_privateuse1_backend('jax')
 unsupported_dtype = [torch.quint8]
-torch.utils.generate_methods_for_privateuse1_backend(
-    for_tensor=True,
-    for_module=True,
-    for_storage=True,
-    unsupported_dtype=unsupported_dtype)

 import jax
 import torchax.device_module

@@ -129,34 +124,3 @@ def compile(fn, options: Optional[CompileOptions] = None):
     raise RuntimeError('dynamo mode is not supported yet')
   elif options.mode == 'export':
     raise RuntimeError('export mode is not supported yet')
-
-
-@contextmanager
-def jax_device(target_device: str, env: tensor.Environment | None = None):
-  """
-  to("jax") cannot differentiate the device/platform (cpu vs tpu).
-  Use this context manager to control jax array's storage device
-
-  Examples:
-
-  a = torch.ones(3, 3)
-
-  with jax_device("cpu"):
-    b = a.to("jax")
-
-  with jax_device("tpu"):
-    c = a.to("jax")
-
-  with jax_device("tpu"):
-    c = b.to("jax")
-
-  """
-  if env is None:
-    env = default_env()
-
-  prev_target_device = env.target_device
-  try:
-    env.target_device = target_device
-    yield env
-  finally:
-    env.target_device = prev_target_device
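The user-visible part of the extract_jax hunk is the new optional kwargs pytree accepted by jax_func. A minimal usage sketch (not part of the diff; it assumes a CPU JAX backend and the public extract_jax API shown above):

    import jax.numpy as jnp
    import torch
    import torchax

    model = torch.nn.Linear(4, 2)
    # states is a pytree of jax arrays; jax_func is a pure function over them.
    states, jax_func = torchax.extract_jax(model)

    x = jnp.ones((1, 4), dtype=jnp.float32)
    out = jax_func(states, (x,))        # positional args only, as in 0.0.5
    out = jax_func(states, (x,), {})    # 0.0.6 additionally accepts a kwargs dict
    print(out.shape)                    # (1, 2)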
torchax/amp.py
CHANGED

@@ -61,9 +61,8 @@ def autocast(device, dtype=torch.bfloat16, env=None):
   if env is None:
     import torchax
     env = torchax.default_env()
-  env.autocast_dtype
-
-  env.autocast_dtype = old
+  with env.override_property(autocast_dtype=dtype):
+    yield


 # https://github.com/pytorch/pytorch/blob/05faba40287cf7d8734da96cb2e904f39710bf29/aten/src/ATen/autocast_mode.cpp#L327
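A usage sketch of the rewritten autocast (inferred from the signature above, not taken from the diff): the dtype is now pushed through env.override_property, so the override is scoped to the with block and unwinds automatically.

    import torch
    import torchax
    from torchax import amp

    env = torchax.default_env()
    with env:
      x = torch.randn(8, 8, device='jax')
      with amp.autocast('jax', dtype=torch.bfloat16, env=env):
        y = x @ x   # ops with an autocast policy run under bfloat16 here
      z = x @ x     # the scoped override is popped once the block exits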
torchax/config.py
CHANGED

@@ -10,6 +10,11 @@ class Configuration:

   use_int32_for_index: bool = False

+  # normally, math between CPU torch.Tensor with torchax.Tensor is not
+  # allowed. However, if that torch.Tensor happens to be scalar, then we
+  # can use scalar * tensor math to handle it
+  allow_mixed_math_with_scalar_tensor: bool = True
+
   # If true, we will convert Views into torchax.Tensors eagerly
   force_materialize_views: bool = False


@@ -22,5 +27,4 @@ class Configuration:

   # device
   treat_cuda_as_jax_device: bool = True
-  use_torch_native_for_cpu_tensor: bool = True
   internal_respect_torch_return_dtypes: bool = False
torchax/configuration.py
ADDED

@@ -0,0 +1,30 @@
+import dataclasses
+
+
+@dataclasses.dataclass
+class Configuration:
+  debug_print_each_op: bool = False
+  debug_accuracy_for_each_op: bool = False
+  debug_mixed_tensor: bool = False
+  debug_print_each_op_operands: bool = False
+
+  use_int32_for_index: bool = False
+
+  # normally, math between CPU torch.Tensor with torchax.Tensor is not
+  # allowed. However, if that torch.Tensor happens to be scalar, then we
+  # can use scalar * tensor math to handle it
+  allow_mixed_math_with_scalar_tensor: bool = True
+
+  # If true, we will convert Views into torchax.Tensors eagerly
+  force_materialize_views: bool = False
+
+  # Use DLPack for converting jax.Arrays <-> and torch.Tensor
+  use_dlpack_for_data_conversion: bool = False
+
+  # Flash attention
+  use_tpu_flash_attention: bool = False
+  shmap_flash_attention: bool = False
+
+  # device
+  treat_cuda_as_jax_device: bool = True
+  internal_respect_torch_return_dtypes: bool = False
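Since the new module is a plain dataclass, a customized instance can be built with keyword overrides. A short sketch (field names come from the file above; passing the instance to an Environment is an assumption based on the `configuration or config.Configuration()` fallback visible in the tensor.py hunk further down):

    from torchax import configuration

    cfg = configuration.Configuration(
        debug_print_each_op=True,
        allow_mixed_math_with_scalar_tensor=False,
        use_tpu_flash_attention=True,
    )
    print(cfg.treat_cuda_as_jax_device)   # untouched fields keep their defaults: True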
torchax/device_module.py
CHANGED
torchax/environment.py
ADDED

@@ -0,0 +1 @@
+
torchax/interop.py
CHANGED

@@ -11,6 +11,7 @@ from jax import tree_util as pytree
 from jax.experimental.shard_map import shard_map
 from torchax import tensor
 from torchax import util
+from torchax.ops import mappings
 import torchax

 from torchax.types import JaxValue, TorchValue, JaxCallable, TorchCallable

@@ -90,7 +91,7 @@ class JittableModule(torch.nn.Module):
   def __call__(self, *args, **kwargs):
     return self.forward(*args, **kwargs)

-  def functional_call(self,
+  def functional_call(self, method_or_name, params, buffers, *args, **kwargs):
     kwargs = kwargs or {}
     params_copy = copy.copy(params)
     params_copy.update(buffers)

@@ -98,22 +99,35 @@ class JittableModule(torch.nn.Module):
     for k, v in self._extra_dumped_weights.items():
       for new_key in v:
         params_copy[new_key] = params_copy[k]
+
+    if isinstance(method_or_name, str):
+      method = getattr(self._model, method_or_name)
+    else:
+      if not callable(method_or_name):
+        raise TypeError(
+            f"method_or_name should be a callable or a string, got {type(method_or_name)}"
+        )
+      method = method_or_name
+      args = (self._model,) + args
     with torch_stateless._reparametrize_module(self._model, params_copy):
-      res =
+      res = method(*args, **kwargs)
     return res

-  def
-    if
+  def jittable_call(self, method_name: str, *args, **kwargs):
+    if method_name not in self._jitted:
       jitted = jax_jit(
-          functools.partial(self.functional_call,
+          functools.partial(self.functional_call, method_name),
           kwargs_for_jax_jit=self._extra_jit_args,
       )

       def jitted_forward(*args, **kwargs):
         return jitted(self.params, self.buffers, *args, **kwargs)

-      self._jitted[
-    return self._jitted[
+      self._jitted[method_name] = jitted_forward
+    return self._jitted[method_name](*args, **kwargs)
+
+  def forward(self, *args, **kwargs):
+    return self.jittable_call('forward', *args, **kwargs)

   def __getattr__(self, key):
     if key == '_model':

@@ -170,8 +184,8 @@ def _torch_view(t: JaxValue) -> TorchValue:
   if isinstance(t, jax.Array):
     # TODO
     return tensor.Tensor(t, torchax.default_env())
-  if isinstance(t,
-    return
+  if isinstance(t, jnp.dtype):
+    return mappings.j2t_dtype(t)
   if callable(t):  # t is a JaxCallable
     return functools.partial(call_jax, t)
   # regular types are not changed

@@ -188,7 +202,7 @@ def _jax_view(t: TorchValue) -> JaxValue:
     assert isinstance(t, tensor.Tensor) or isinstance(t, tensor.View), type(t)
     return t.jax()
   if isinstance(t, type(torch.int32)):
-    return
+    return mappings.t2j_dtype(t)

   # torch.nn.Module needs special handling
   if not isinstance(t, torch.nn.Module) and callable(t):  # t is a TorchCallable

@@ -225,8 +239,7 @@ def j2t_autograd(fn, call_jax=call_jax):

   @wraps(fn)
   def inner(*args, **kwargs):
-    from jax.tree_util import tree_flatten
-    from jax.util import safe_zip
+    from jax.tree_util import tree_flatten

     class JaxFun(torch.autograd.Function):

@@ -261,8 +274,8 @@ def j2t_autograd(fn, call_jax=call_jax):
         # The subsequent gradients correspond to flat_inputs.
         # We need to put a None for inputs that did not require gradients.
         final_grads = [None]
-        for needs_grad, grad in
-
+        for needs_grad, grad in zip(
+            ctx.needs_input_grad[1:], input_grads_structured, strict=True):
           final_grads.append(grad if needs_grad else None)

         return tuple(final_grads)
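A self-contained sketch of the new method_or_name dispatch added to functional_call (the Wrapper class and names here are illustrative, not torchax APIs): a string selects a bound method on the wrapped model, while a callable is invoked with the model prepended to the positional arguments.

    import torch

    class Wrapper:

      def __init__(self, model):
        self._model = model

      def call(self, method_or_name, *args, **kwargs):
        if isinstance(method_or_name, str):
          method = getattr(self._model, method_or_name)
        else:
          if not callable(method_or_name):
            raise TypeError("method_or_name should be a callable or a string")
          method = method_or_name
          args = (self._model,) + args
        return method(*args, **kwargs)

    m = torch.nn.Linear(3, 3)
    w = Wrapper(m)
    x = torch.ones(1, 3)
    print(w.call('forward', x))                  # dispatch by method name
    print(w.call(lambda mod, t: mod(t) * 2, x))  # dispatch by callable, model prepended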
torchax/mesh_util.py
CHANGED

@@ -199,7 +199,7 @@ class Mesh:
     }

     def model_initializer():
-      with torchax.default_env():
+      with torchax.default_env(), torch.device('meta'):
        model = model_class(*init_args, **init_kwargs)
        return dict(model.state_dict())


@@ -209,3 +209,12 @@ class Mesh:

     model.load_state_dict(weights_dict, assign=True)
     return model
+
+  def shard_model(self, model, override_sharder=None):
+    sharder = override_sharder or self._sharder
+    states = model.state_dict()
+    output_shards = {
+        name: NamedSharding(self.jax_mesh, sharder(name, tensor))
+        for name, tensor in states.items()
+    }
+    model.load_state_dict(output_shards, assign=True)
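The only change to model_initializer is the extra torch.device('meta') context. A standalone illustration of what that buys (plain PyTorch, independent of torchax): parameters are created without real storage, so a large model can be constructed cheaply and its weights materialized or sharded afterwards, which is what the new shard_model does via load_state_dict(..., assign=True).

    import torch

    with torch.device('meta'):
      m = torch.nn.Linear(1024, 1024)

    print(m.weight.device)                 # meta: no storage was allocated
    print(m.weight.shape, m.weight.dtype)  # shape/dtype metadata is still available
    # Real (or sharded) tensors can later be attached with
    # m.load_state_dict(real_weights, assign=True).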
torchax/ops/jaten.py
CHANGED

@@ -736,7 +736,6 @@ def _aten_empty_strided(sizes, stride, dtype=None, **kwargs):
   return jnp.empty(sizes, dtype=dtype)


-@op(torch.ops.aten.index_put_)
 @op(torch.ops.aten.index_put)
 def _aten_index_put(self, indexes, values, accumulate=False):
   indexes = [slice(None, None, None) if i is None else i for i in indexes]

@@ -3532,7 +3531,7 @@ def _aten_tensor_split(ary, indices_or_sections, axis=0):

 @op(torch.ops.aten.randn, needs_env=True)
 @op_base.convert_dtype()
-def
+def _aten_randn(
     *size,
     generator=None,
     out=None,

@@ -3652,7 +3651,7 @@ def _aten_native_batch_norm(input,
 @op(torch.ops.aten.normal, needs_env=True)
 def _aten_normal(self, mean=0, std=1, generator=None, env=None):
   shape = self.shape
-  res =
+  res = _aten_randn(*shape, generator=generator, env=env)
   return res * std + mean


@@ -5541,6 +5540,7 @@ def _aten_floor_divide(x, y):


 @op(torch.ops.aten._assert_tensor_metadata)
+@op(torch.ops.aten._assert_scalar)
 def _aten__assert_tensor_metadata(*args, **kwargs):
   pass


@@ -5617,6 +5617,8 @@ mutation_ops_to_functional = {
         op_base.InplaceOp(torch.ops.aten.floor_divide),
     torch.ops.aten.remainder_:
         op_base.InplaceOp(torch.ops.aten.remainder),
+    torch.ops.aten.index_put_:
+        op_base.InplaceOp(torch.ops.aten.index_put),
 }

 # Note: tuple comparisons work intuitively, e.g. `_jax_version >= (0, 4, 32)`.
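Net effect of the index_put_ change: the in-place variant is no longer registered as its own lowering but is routed through mutation_ops_to_functional to the functional index_put. A small JAX-only sketch of the functional form this ultimately maps onto (illustrative, not the torchax implementation itself):

    import jax.numpy as jnp

    x = jnp.zeros((3, 3))
    rows = jnp.array([0, 2])
    print(x.at[rows].set(1.0))  # index_put(..., accumulate=False)
    print(x.at[rows].add(1.0))  # index_put(..., accumulate=True)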
torchax/ops/jtorch.py
CHANGED

@@ -179,6 +179,13 @@ def _tpu_flash_attention(query, key, value, env):
   return wrap_flash_attention(query, key, value)


+@register_function(torch.nn.functional.one_hot)
+def one_hot(tensor, num_classes=-1):
+  if num_classes == -1:
+    num_classes = jnp.max(tensor) + 1
+  return jax.nn.one_hot(tensor, num_classes).astype(jnp.int64)
+
+
 @register_function(torch.nn.functional.pad)
 def pad(tensor, pad, mode="constant", value=None):
   # For padding modes that have different names between Torch and NumPy, this

@@ -341,7 +348,7 @@ def empty(*size: Sequence[int], dtype=None, **kwargs):
   return jnp.empty(size, dtype=dtype)


-@register_function(torch.arange, is_jax_function=
+@register_function(torch.arange, is_jax_function=True)
 def arange(
     start,
     end=None,

@@ -358,10 +365,10 @@ def arange(
     start = 0
   if step is None:
     step = 1
-  return
+  return jaten._aten_arange(start, end, step, dtype=dtype)


-@register_function(torch.empty_strided, is_jax_function=
+@register_function(torch.empty_strided, is_jax_function=True)
 def empty_strided(
     size,
     stride,

@@ -372,7 +379,7 @@ def empty_strided(
     requires_grad=False,
     pin_memory=False,
 ):
-  return empty(size, dtype=dtype)
+  return empty(size, dtype=dtype, requires_grad=requires_grad)


 @register_function(torch.unravel_index)

@@ -380,14 +387,14 @@ def unravel_index(indices, shape):
   return jnp.unravel_index(indices, shape)


-@register_function(torch.rand, is_jax_function=
+@register_function(torch.rand, is_jax_function=True, needs_env=True)
 def rand(*size, **kwargs):
   if len(size) == 1 and isinstance(size[0], collections.abc.Iterable):
     size = size[0]
-  return
+  return jaten._rand(size, **kwargs)


-@register_function(torch.randn, is_jax_function=
+@register_function(torch.randn, is_jax_function=True, needs_env=True)
 def randn(
     *size,
     generator=None,

@@ -397,15 +404,16 @@ def randn(
     device=None,
     requires_grad=False,
     pin_memory=False,
+    env=None,
 ):
   if len(size) == 1 and isinstance(size[0], collections.abc.Iterable):
     size = size[0]
-  return
+  return jaten._aten_randn(size, generator=generator, dtype=dtype, env=env)


-@register_function(torch.randint, is_jax_function=False)
+@register_function(torch.randint, is_jax_function=False, needs_env=True)
 def randint(*args, **kwargs):
-  return
+  return jaten._aten_randint(*args, **kwargs)


 @register_function(torch.logdet)
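For the new one_hot lowering, a short sketch of the semantics it mirrors: torch.nn.functional.one_hot returns integer tensors while jax.nn.one_hot returns floats, hence the cast; note that without x64 enabled JAX narrows the int64 cast to int32.

    import jax
    import jax.numpy as jnp

    labels = jnp.array([0, 2, 1])
    num_classes = int(jnp.max(labels)) + 1   # the num_classes == -1 branch
    onehot = jax.nn.one_hot(labels, num_classes).astype(jnp.int64)
    print(onehot)        # [[1 0 0] [0 0 1] [0 1 0]]
    print(onehot.dtype)  # int32 unless jax_enable_x64 is set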
torchax/tensor.py
CHANGED

@@ -1,3 +1,4 @@
+import threading
 import logging
 import sys
 import contextlib

@@ -16,7 +17,6 @@ from torchax.view import View
 from torchax import config
 from torchax.ops import mappings, ops_registry
 from torchax import amp
-from jax.experimental import mutable_array

 logger = logging.getLogger(__name__)


@@ -25,14 +25,6 @@ class OperatorNotFound(Exception):
   pass


-def wrap(jaxarray):
-  return torch_pytree.tree_map_only(jnp.ndarray, Tensor, jaxarray)
-
-
-def unwrap(torchtensors):
-  return torch_pytree.tree_map_only(Tensor, lambda x: x._elem, torchtensors)
-
-
 @contextlib.contextmanager
 def log_nested(env, message):
   if env.config.debug_print_each_op:

@@ -48,7 +40,7 @@ log_nested.level = 0
 class Tensor(torch.Tensor):

   @staticmethod
-  def __new__(cls, elem, env):
+  def __new__(cls, elem, env, requires_grad=False):
     dtype = mappings.j2t_dtype(elem.dtype)
     shape = list(elem.shape)
     for i, s in enumerate(shape):

@@ -56,15 +48,19 @@ class Tensor(torch.Tensor):
         shape[i] = 1
     if dtype is None:
       dtype = torch.float32
+    #dispatch_keys = torch.DispatchKeySet(torch._C.DispatchKey.PrivateUse1).add(torch._C.DispatchKey.AutogradPrivateUse1)
+    if not (dtype.is_floating_point or dtype.is_complex):
+      requires_grad = False
+
     return torch.Tensor._make_wrapper_subclass(
         cls,
         shape,
         dtype=dtype,
-        device=
-        requires_grad=
+        device='meta',
+        requires_grad=requires_grad,
     )

-  def __init__(self, elem: jax.Array, env: "Environment"):
+  def __init__(self, elem: jax.Array, env: "Environment", requires_grad=False):
     super().__init__()
     self._elem = elem
     self._env = env

@@ -74,9 +70,6 @@ class Tensor(torch.Tensor):

   __repr__ = __str__

-  def __jax_array__(self):
-    return self._elem
-
   @property
   def shape(self):
     return torch.Size(self._elem.shape)

@@ -109,6 +102,8 @@ class Tensor(torch.Tensor):
     # TODO(hanq): figure out why is dispatch mode not sufficient
     if func == torch.ops._c10d_functional.wait_tensor.default:
       return args[0]._env.dispatch(func, types, args, kwargs)
+    if func == torch.ops.prim.device.default:
+      return torch.device('privateuseone', 0)
     raise AssertionError(
         'torchax Tensors can only do math within the torchax environment.'
         'Please wrap your code with `with torchax.default_env()` or '

@@ -298,6 +293,38 @@ TENSOR_CONSTRUCTORS = {
 SUPPORTED_JAX_PLATFROM = ["cpu", "tpu"]


+class RuntimeProperty:
+  mesh: Any
+  prng: Any
+  autocast_dtype: Any
+
+  def __init__(self, mesh, prng, autocast_dtype):
+    self.mesh = mesh
+    self.prng = prng
+    self.autocast_dtype = autocast_dtype
+
+  def override(self, **kwargs):
+    return OverrideProperty(self, kwargs)
+
+  def get_and_rotate_prng_key(self):
+    old_key = self.prng
+    new_prng_key, next_key = jax.random.split(old_key)
+    self.prng = new_prng_key
+    return next_key
+
+
+class OverrideProperty(RuntimeProperty):
+
+  def __init__(self, parent, override):
+    self.parent = parent
+    self._override = dict(override)
+
+  def __getattr__(self, name):
+    if name in self._override:
+      return self._override[name]
+    return getattr(self.parent, name)
+
+
 class Environment(contextlib.ContextDecorator):
   """This class holds a set of configurations and "globals" needed


@@ -321,62 +348,55 @@ class Environment(contextlib.ContextDecorator):

     self.load_ops()

-
+    _mesh = None
     self.config = configuration or config.Configuration()

-    self._manually_entered = False
     self.enabled = False

-
-    jax.random.key(torch.initial_seed() % (1 << 63)))
-    self.autocast_dtype = None
-    self._target_device = jax.local_devices()[0].platform
+    autocast_dtype = None

-
-
-
+    _prng_key = jax.random.key(torch.initial_seed() % (1 << 63))
+    self._property = threading.local()
+    self._property.content = [
+        RuntimeProperty(
+            mesh=_mesh, prng=_prng_key, autocast_dtype=autocast_dtype)
+    ]

-  @
-  def
-    self.
+  @property
+  def param(self):
+    return self._property.content[-1]

   def manual_seed(self, key):
-
+    jax_key = jax.random.PRNGKey(key)
+    new_prop = self.param.override(prng=jax_key)
+    self._property.content.append(new_prop)

   @property
   def prng_key(self):
-    return self.
+    return self.param.prng

-  def
+  def _should_use_torchax_tensor(self, device):
     if device is None:
       device = torch.get_default_device()

     if isinstance(device, torch.device):
-      device =
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-      return jax.devices("cpu")[0]
-    case "tpu":
-      return jax.devices("tpu")[0]
-    case _:
-      raise AttributeError(
-          f"Cannot handle env.target_device {self.target_device}")
-
-    return None  # fallback to torch
+      device = device.type
+
+    if ':' in device:
+      device = device.split(':')[0]
+
+    match device:
+      case 'cpu':
+        return False
+      case 'cuda':
+        return self.config.treat_cuda_as_jax_device
+      case 'jax':
+        return True
+      case 'privateuseone':
+        return True
+      case 'meta':
+        return self.enabled
+    return False

   def load_ops(self):
     from torchax.ops import jaten, jtorch, jc10d, jtorchvision_nms

@@ -423,80 +443,63 @@ class Environment(contextlib.ContextDecorator):

     return op

+  def _is_same_device(self, the_tensor, new_device):
+    if new_device is None:
+      return True
+    if new_device == 'meta' and the_tensor.device.type == 'jax':
+      return True
+    if the_tensor.device.type != new_device:
+      if the_tensor.device.type == 'cuda':
+        return self.config.treat_cuda_as_jax_device
+      return False
+    return True
+
   def _to_copy(self, the_tensor, new_dtype, new_device):
     if isinstance(the_tensor, View):
       the_tensor = the_tensor.torch()
-
-
-
-
-
-
-      arr = arr.astype(mappings.t2j_dtype(new_dtype))
-
-    if new_device is not None:
-      match str(new_device).lower():
-        case "cpu":
-          # converting to a non-jax device: let torch native handle it
-          torch_tensor = self.j2t_copy(arr) if isinstance(the_tensor,
-                                                          Tensor) else arr
-          with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
-            return torch_tensor.to(new_device)
-        case "jax":
-          # move torchax.tensor / jax tensor between devices
-          # I don't know ifgit this will work after the model is jitted
-          if self.target_device != the_tensor.jax_device.platform:
-            arr = jax.device_put(the_tensor.jax(),
-                                 jax.devices(self.target_device)[0])
-          return Tensor(arr, self)
-        case _:
-          logging.error(f"torchax.Tenosr cannot handle device {new_device}")
-
-    else:
-      if new_dtype is not None and new_dtype != the_tensor.dtype:
+    if isinstance(new_device, torch.device):
+      new_device = new_device.type
+    res = the_tensor
+    if not self._is_same_device(the_tensor, new_device):
+      if isinstance(the_tensor, Tensor):
+        torch_tensor = self.j2t_copy(the_tensor._elem)
         with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
-
-
-    if new_device is None:  ## device is None means don't change device
-      return the_tensor
-
-    jax_device = self.get_as_jax_device(new_device)
-    if jax_device:
+          return torch_tensor.to(device=new_device, dtype=new_dtype)
+      else:
         arr = self.t2j_copy(the_tensor)
-
+        res = Tensor(arr, self, the_tensor.requires_grad)
+
+    if new_dtype is not None and new_dtype != the_tensor.dtype:
+      if isinstance(the_tensor, Tensor):
+        res = res.apply_jax(jnp.astype, mappings.t2j_dtype(new_dtype))
       else:
         with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
-          return the_tensor.to(new_device)
-
-    return Tensor(arr, self)
+          return the_tensor.to(device=new_device, dtype=new_dtype)
+    return res

   def get_and_rotate_prng_key(self,
                               generator: Optional[torch.Generator] = None):
     if generator is not None:
-
-
-      old_key = self._prng_key[...]
-      new_prng_key, next_key = jax.random.split(old_key)
-      self._prng_key[...] = new_prng_key
-      return next_key
+      return jax.random.PRNGKey(generator.initial_seed() % (2**63))
+    return self.param.get_and_rotate_prng_key()

   def _handle_tensor_constructor(self, func, args, kwargs):
     device = kwargs.get("device")
-
-
-    if not self._manually_entered and jax_device is None:
-      # let torch handle it
-      with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
-        return func(*args, **kwargs)
-    with jax.default_device(jax_device):
+    if self._should_use_torchax_tensor(device):
+      # don't set default device, let caller set it
       requires_grad = kwargs.get("requires_grad", False)
       op = self._get_op_or_decomp(func)
+      if op.needs_env:
+        kwargs['env'] = self
+      if op.is_jax_function:
+        (args, kwargs) = self.t2j_iso((args, kwargs))
       res = op.func(*args, **kwargs)
       if isinstance(res, jax.Array):
-        res = Tensor(res, self)
-        if requires_grad:
-          res.requires_grad = True
+        res = Tensor(res, self, requires_grad)
       return res
+    else:
+      with mode_utils.no_dispatch(), torch._C.DisableTorchFunction():
+        return func(*args, **kwargs)

   def _torch_Tensor_to(self, args, kwargs):
     the_tensor = args[0]

@@ -560,11 +563,11 @@ class Environment(contextlib.ContextDecorator):
     args, kwargs = self.v2t_iso((args, kwargs))

     with self:
-      if self.autocast_dtype is not None:
+      if self.param.autocast_dtype is not None:
        autocast_policy = amp.autocast_policy.get(func)
        if autocast_policy is not None:
          args, kwargs = amp.execute_policy(autocast_policy, args, kwargs,
-                                            self.autocast_dtype)
+                                            self.param.autocast_dtype)

      if op.is_jax_function:
        args, kwargs = self.t2j_iso((args, kwargs))

@@ -609,11 +612,9 @@ class Environment(contextlib.ContextDecorator):

   def __enter__(self):
     self.enable_torch_modes()
-    self._manually_entered = True
     return self

   def __exit__(self, *exc):
-    self._manually_entered = False
     self.disable_torch_modes(*exc)

   def _move_one_value(self, val):

@@ -639,6 +640,10 @@ class Environment(contextlib.ContextDecorator):
     """

     def to_jax(x):
+      if self.config.allow_mixed_math_with_scalar_tensor and not isinstance(
+          x, Tensor):
+        if x.squeeze().ndim == 0:
+          return x.item()
      if isinstance(
          x, torch.distributed._functional_collectives.AsyncCollectiveTensor):
        x = x.wait()

@@ -697,3 +702,10 @@ class Environment(contextlib.ContextDecorator):
        is_user_defined=True,
        needs_env=False,
    )
+
+  @contextlib.contextmanager
+  def override_property(self, **kwargs):
+    new_prop = self.param.override(**kwargs)
+    self._property.content.append(new_prop)
+    yield
+    self._property.content.pop()
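The largest conceptual change in tensor.py is the thread-local stack of RuntimeProperty objects that replaces the loose autocast_dtype and _prng_key attributes; amp.autocast and Environment.manual_seed now push overrides onto it. A self-contained sketch of that override-stack pattern (stripped of JAX and torch so it runs on its own; the names mirror the diff but the bodies are simplified):

    import contextlib
    import threading

    class RuntimeProperty:

      def __init__(self, prng, autocast_dtype):
        self.prng = prng
        self.autocast_dtype = autocast_dtype

      def override(self, **kwargs):
        return OverrideProperty(self, kwargs)

    class OverrideProperty(RuntimeProperty):

      def __init__(self, parent, override):
        self.parent = parent
        self._override = dict(override)

      def __getattr__(self, name):
        # Fall back to the parent for anything not explicitly overridden.
        if name in self._override:
          return self._override[name]
        return getattr(self.parent, name)

    class Env:

      def __init__(self):
        self._property = threading.local()
        self._property.content = [RuntimeProperty(prng=0, autocast_dtype=None)]

      @property
      def param(self):
        return self._property.content[-1]

      @contextlib.contextmanager
      def override_property(self, **kwargs):
        self._property.content.append(self.param.override(**kwargs))
        yield
        self._property.content.pop()

    env = Env()
    with env.override_property(autocast_dtype='bfloat16'):
      print(env.param.autocast_dtype)  # 'bfloat16' inside the scope
    print(env.param.autocast_dtype)    # None again after the pop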
{torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchax
-Version: 0.0.5
+Version: 0.0.6
 Summary: torchax is a library for running Jax and PyTorch together
 Project-URL: Homepage, https://github.com/pytorch/xla/tree/master/torchax
 Author-email: Han Qi <qihan.dev@gmail.com>, Pytorch/XLA team <pytorchxla-dev@google.com>
{torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/RECORD
CHANGED

@@ -1,32 +1,33 @@
 torchax/CONTRIBUTING.md,sha256=VOL0us6kS-uc4yE6IlSm6SDHYHnx-gw-0upFnP0VkSQ,1369
-torchax/__init__.py,sha256=
-torchax/amp.py,sha256=
-torchax/config.py,sha256=
+torchax/__init__.py,sha256=c98iIGugRTbEVcsx8eWnbAjsC4mpcDrK23ZQqiMycLg,3157
+torchax/amp.py,sha256=-k8t4lrCsJLKHEhI6J0aHE3MAPEL-4DP6wCKtMwo1AM,11791
+torchax/config.py,sha256=O9yF96AShWb02hcwkT5ToPTt_hpOo3dMJNO30A7dmac,922
+torchax/configuration.py,sha256=O9yF96AShWb02hcwkT5ToPTt_hpOo3dMJNO30A7dmac,922
 torchax/decompositions.py,sha256=1p5TFZfAJ2Bs9BiSO1vXbnWEXnbPfC_gCQ54rDXhd9k,28859
-torchax/device_module.py,sha256=
-torchax/
+torchax/device_module.py,sha256=7fkdPwXG0qCBTmvDYHp0fvv4xK0W9avV_Ua3MeMzczE,349
+torchax/environment.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 torchax/export.py,sha256=xU-UbrQBvQWUy-GM2FfeIHymlEdmYDYcPymjlcXM23w,8969
 torchax/flax.py,sha256=2Tg8inGskgAfByPxJQh4ItZHHAb-960gYq156bSO8V4,1280
-torchax/interop.py,sha256=
-torchax/mesh_util.py,sha256=
-torchax/tensor.py,sha256=
+torchax/interop.py,sha256=7HvJwtxdodcCrMyJzs-Wr47hkHuoh6CWb2-YKoBwqV0,11076
+torchax/mesh_util.py,sha256=Ab4ic2eHWmQ3Mw3jpERvi-TKLIcDvQQoC6tuIZ9ig7Q,9314
+torchax/tensor.py,sha256=XjAp7khpQNhoVsSMzDj-V8l4DFT9jBaL4NVCi88a6K0,20893
 torchax/tf_integration.py,sha256=d_h4vSJm7N9rJXpUPNCDOiUz3J1-UPo3KU8D9Wi4nnc,4074
 torchax/train.py,sha256=rtvj6HkdnG9fc3VWYPNwHuxGlUxFJkUXJWED8azgtok,3855
 torchax/types.py,sha256=j4ERjkgDgwhgi9zrwwbbiv4HMDlrJ1IEMUCmP_BIJ9M,388
 torchax/util.py,sha256=cb-eudDE7AX2s-6zYtXdowgyzyvqPqE9MPP82PfH23g,3069
 torchax/view.py,sha256=1ekqRN04lAPd_icgZMKbSYWhr738DzVloc34ynml4wo,11121
 torchax/ops/__init__.py,sha256=Vr1p8zDHwfXZBUbw70iNiCJLZLNdI6gR_vUlaiA7Usg,270
-torchax/ops/jaten.py,sha256=
+torchax/ops/jaten.py,sha256=WxfZU6p7b7OR98B3z0LCXKlV6U5aslXxJMJirBr6lns,165835
 torchax/ops/jax_reimplement.py,sha256=idkmFWNCXBilkmaHBGdivKz0XhsjSpqLNlGXxbBOKWQ,7302
 torchax/ops/jc10d.py,sha256=OzSYYle_5jBmNVP64SuJPz9S-rRGD6H7e1a9HHIKsjU,1322
 torchax/ops/jimage.py,sha256=P0lAauYX_au_xjIHDsG7H6jO7Jf54_VCAjzZuIZdhO0,3182
 torchax/ops/jlibrary.py,sha256=YfYUQbf5dKiMtEHUMfdgHTeLuNvvSTJ-l8s7wQNIvO0,2930
-torchax/ops/jtorch.py,sha256=
+torchax/ops/jtorch.py,sha256=wR4ZdDscxqG4VpxjcLGzgdUKmipa3fp7S0mK3DcD--A,17161
 torchax/ops/jtorchvision_nms.py,sha256=HSnhwU0gFaHucT7EvrEruJdnWkAWTw4T35GY525ohO8,8903
 torchax/ops/mappings.py,sha256=AESERtXJ6i_Hm0ycwEw7z5OJnHu-7QteWlSs-mlUPE4,3492
 torchax/ops/op_base.py,sha256=MLKFxMojIXgz4lkTE6k-8F-ddve-9vEiXkzj3P-YJPs,3739
 torchax/ops/ops_registry.py,sha256=qADpG1up0JOThoybiOQoRDWtAe5TOkHlqcj1bSHjtGY,1594
-torchax-0.0.5.dist-info/METADATA,sha256=
-torchax-0.0.5.dist-info/WHEEL,sha256=
-torchax-0.0.5.dist-info/licenses/LICENSE,sha256=
-torchax-0.0.5.dist-info/RECORD,,
+torchax-0.0.6.dist-info/METADATA,sha256=uB9hoyxdfrAD14pHy0U8Gh1uCHbYwok-oEW12pEa6qs,10753
+torchax-0.0.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+torchax-0.0.6.dist-info/licenses/LICENSE,sha256=ZHyir3-ltOerFLt9JH1bjf7lIxIWipFmqeMnB_8z_aU,1498
+torchax-0.0.6.dist-info/RECORD,,
torchax/distributed.py
DELETED

@@ -1,241 +0,0 @@
-"""`torch.distributed` backend implemented with JAX collective ops.
-
-EXPERIMENTAL: This module is still highly experimental, and it may be removed
-before any stable release.
-
-Note: JAX collective ops require that axis names be defined in `pmap` or
-`shmap`. The distributed backend only supports one axis, named `torch_dist`.
-This name is defined by our mirror implementation of `spawn`.
-"""
-
-import datetime
-import functools
-import logging
-import os
-from typing import List, Optional, Union
-
-import jax
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.distributed._functional_collectives
-from torch._C._distributed_c10d import ProcessGroup  # type: ignore
-import torch.distributed
-import torchax
-from jax.sharding import NamedSharding
-from jax.sharding import Mesh, PartitionSpec as P
-from jax.experimental import mesh_utils
-import torch.utils._pytree as torch_pytree
-from torchax import interop
-
-
-class ProcessGroupJax(ProcessGroup):
-  """Distributed backend implemented with JAX."""
-
-  def __init__(self, prefix_store, rank, size, timeout):
-    super().__init__(rank, size)
-    self._group_name = None
-
-  def getBackendName(self):
-    return "jax"
-
-  # TODO(wcromar): why doesn't default group name setter work?
-  # https://github.com/pytorch/pytorch/blob/7b1988f9222f3dec5cc2012afce84218199748ae/torch/csrc/distributed/c10d/ProcessGroup.cpp#L148-L152
-  def _set_group_name(self, name: str) -> None:
-    self._group_name = name
-
-  @property
-  def group_name(self):
-    assert self._group_name
-    return self._group_name
-
-  @staticmethod
-  def _work(
-      tensors: Union[torch.Tensor, List[torch.Tensor],
-                     List[List[torch.Tensor]]],
-  ) -> dist.Work:
-    fut = torch.futures.Future()
-    fut.set_result(tensors)
-    return torch._C._distributed_c10d._create_work_from_future(fut)
-
-  def _allgather_base(
-      self,
-      output: torch.Tensor,
-      input: torch.Tensor,
-      opts=...,
-  ) -> dist.Work:
-    assert isinstance(input, torchax.tensor.Tensor)
-    assert isinstance(output, torchax.tensor.Tensor)
-    torch.distributed._functional_collectives.all_gather_tensor_inplace(
-        output, input, group=self)
-    return self._work(output)
-
-  def allreduce(
-      self,
-      tensors: List[torch.Tensor],
-      opts: dist.AllreduceOptions = ...,
-  ) -> dist.Work:
-    assert len(tensors) == 1
-    assert isinstance(tensors[0], torchax.tensor.Tensor)
-    torch.distributed._functional_collectives.all_reduce_inplace(
-        tensors[0],
-        torch.distributed._functional_collectives.REDUCE_OP_TO_STR[
-            opts.reduceOp.op],
-        self,
-    )
-
-    return self._work(tensors)
-
-  def broadcast(
-      self,
-      tensors: List[torch.Tensor],
-      opts: dist.BroadcastOptions = ...,
-  ) -> dist.Work:
-    assert len(tensors) == 1
-    assert isinstance(tensors[0], torchax.tensor.Tensor)
-    tensors[0].copy_(
-        torch.distributed._functional_collectives.broadcast(
-            tensors[0], opts.rootRank, group=self))
-
-    return self._work(tensors)
-
-
-dist.Backend.register_backend("jax", ProcessGroupJax, devices=["jax"])
-
-
-def jax_rendezvous_handler(url: str,
-                           timeout: datetime.timedelta = ...,
-                           **kwargs):
-  """Initialize distributed store with JAX process IDs.
-
-  Requires `$MASTER_ADDR` and `$MASTER_PORT`.
-  """
-  # TODO(wcromar): jax.distributed.initialize(...) for multiprocess on GPU
-  # TODO(wcromar): Can we use the XLA coordinator as a Store? This isn't part
-  # of their public Python API
-  master_ip = os.environ["MASTER_ADDR"]
-  master_port = int(os.environ["MASTER_PORT"])
-  # TODO(wcromar): Use `torchrun`'s store if available
-  store = dist.TCPStore(
-      master_ip,
-      master_port,
-      jax.process_count(),
-      is_master=jax.process_index() == 0,
-  )
-
-  yield (store, jax.process_index(), jax.process_count())
-
-
-dist.register_rendezvous_handler("jax", jax_rendezvous_handler)
-
-
-def spawn(f, args=(), env: Optional[torchax.tensor.Environment] = None):
-  """Wrap `f` in a JAX `pmap` with the axis name `torch_dist` defined.
-
-  `f` is expected to take the replica index as a positional argument, similar
-  to `torch.multiprocessing.spawn`.
-
-  Note: `spawn` does not actually create parallel processes.
-  """
-  env = env or torchax.default_env()
-
-  def jax_wrapper(index, jax_args):
-    index, args = env.j2t_iso([index, jax_args])
-    torch_outputs = f(index, *args)
-    return env.t2j_iso(torch_outputs)
-
-  jax_outputs = jax.pmap(
-      jax_wrapper, axis_name="torch_dist")(np.arange(jax.device_count()),
-                                           env.t2j_iso(args))
-  return env.j2t_iso(jax_outputs)
-
-
-class DistributedDataParallel(torch.nn.Module):
-  """Re-implementation of DistributedDataParallel using JAX SPMD.
-
-  Splits inputs along batch dimension (assumed to be 0) across all devices in
-  JAX runtime, including remote devices. Each process should load a distinct
-  shard of the input data using e.g. DistributedSampler. Each process' shard
-  is then further split among the addressable devices (e.g. local TPU chips)
-  by `shard_input`.
-
-  Note: since parameters are replicated across addressable devices, inputs
-  must also be SPMD sharded using `shard_input` or `replicate_input`.
-
-  Example usage:
-
-  ```
-  jax_model = torchax.distributed.DistributedDataParallel(create_model())
-  for data, dataloader:
-    jax_data = jax_model.shard_input(data)
-    jax_output = jax_model(jax_data)
-  ```
-  """
-
-  def __init__(
-      self,
-      module: torch.nn.Module,
-      env: Optional[torchax.tensor.Environment] = None,
-      **kwargs,
-  ):
-    if kwargs:
-      logging.warning(f"Unsupported kwargs {kwargs}")
-
-    super().__init__()
-    self._env = env or torchax.default_env()
-    self._mesh = Mesh(
-        mesh_utils.create_device_mesh((jax.device_count(),)),
-        axis_names=("batch",),
-    )
-    replicated_state = torch_pytree.tree_map_only(
-        torch.Tensor,
-        lambda t: self._env.j2t_iso(
-            jax.device_put(
-                self._env.to_xla(t)._elem, NamedSharding(self._mesh, P()))),
-        module.state_dict(),
-    )
-    # TODO: broadcast
-    module.load_state_dict(replicated_state, assign=True)
-    self._module = module
-
-  def shard_input(self, inp):
-    per_process_batch_size = inp.shape[0]  # assumes batch dim is 0
-    per_replica_batch_size = per_process_batch_size // jax.local_device_count()
-    per_replica_batches = torch.chunk(inp, jax.local_device_count())
-    global_batch_size = per_replica_batch_size * jax.device_count()
-    global_batch_shape = (global_batch_size,) + inp.shape[1:]
-
-    sharding = NamedSharding(self._mesh, P("batch"))
-    return self._env.j2t_iso(
-        jax.make_array_from_single_device_arrays(
-            global_batch_shape,
-            NamedSharding(self._mesh, P("batch")),
-            arrays=[
-                jax.device_put(self._env.to_xla(batch)._elem, device) for batch,
-                device in zip(per_replica_batches, sharding.addressable_devices)
-            ],
-        ))
-
-  def replicate_input(self, inp):
-    return self._env.j2t_iso(
-        jax.device_put(inp._elem, NamedSharding(self._mesh, P())))
-
-  def jit_step(self, func):
-
-    @functools.partial(
-        interop.jax_jit, kwargs_for_jax_jit={'donate_argnums': 0})
-    def _jit_fn(states, args):
-      self.load_state_dict(states)
-      outputs = func(*args)
-      return self.state_dict(), outputs
-
-    @functools.wraps(func)
-    def inner(*args):
-      jax_states = self.state_dict()
-      new_states, outputs = _jit_fn(jax_states, args)
-      self.load_state_dict(new_states)
-      return outputs
-
-    return inner
-
-  def forward(self, *args):
-    with self._env:
-      return self._module(*args)
{torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/WHEEL
File without changes

{torchax-0.0.5.dist-info → torchax-0.0.6.dist-info}/licenses/LICENSE
File without changes