torchax: 0.0.4-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchax might be problematic.
- torchax/CONTRIBUTING.md +2 -2
- torchax/__init__.py +57 -19
- torchax/amp.py +333 -0
- torchax/config.py +19 -12
- torchax/decompositions.py +663 -195
- torchax/device_module.py +7 -1
- torchax/distributed.py +55 -60
- torchax/export.py +26 -17
- torchax/flax.py +39 -0
- torchax/interop.py +275 -141
- torchax/mesh_util.py +211 -0
- torchax/ops/jaten.py +1718 -1294
- torchax/ops/jax_reimplement.py +23 -21
- torchax/ops/jc10d.py +5 -4
- torchax/ops/jimage.py +113 -0
- torchax/ops/jlibrary.py +9 -2
- torchax/ops/jtorch.py +219 -78
- torchax/ops/jtorchvision_nms.py +32 -43
- torchax/ops/mappings.py +77 -35
- torchax/ops/op_base.py +59 -32
- torchax/ops/ops_registry.py +40 -35
- torchax/tensor.py +417 -275
- torchax/train.py +38 -41
- torchax/util.py +88 -0
- torchax/view.py +377 -0
- {torchax-0.0.4.dist-info → torchax-0.0.5.dist-info}/METADATA +111 -145
- torchax-0.0.5.dist-info/RECORD +32 -0
- torchax/environment.py +0 -2
- torchax-0.0.4.dist-info/RECORD +0 -27
- {torchax-0.0.4.dist-info → torchax-0.0.5.dist-info}/WHEEL +0 -0
- {torchax-0.0.4.dist-info → torchax-0.0.5.dist-info}/licenses/LICENSE +0 -0
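
The largest change in this release is the rewrite of torchax/ops/jaten.py shown below, which maps ATen operators onto plain jax.numpy implementations. As a hedged illustration only (it assumes nothing beyond an installed JAX and mirrors the one-line `_aten_triu(m, k=0)` body visible in the diff), the mapping style reads like this:

import jax.numpy as jnp

x = jnp.arange(16).reshape(4, 4)
# torch.ops.aten.triu is registered in jaten.py as a thin wrapper over jnp.triu,
# so this JAX call mirrors torch.triu(x, diagonal=1) on a torchax tensor.
print(jnp.triu(x, 1))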
torchax/ops/jaten.py
CHANGED
|
@@ -15,75 +15,22 @@ from torchax.ops import ops_registry
|
|
|
15
15
|
from torchax.ops import op_base, mappings
|
|
16
16
|
from torchax import interop
|
|
17
17
|
from torchax.ops import jax_reimplement
|
|
18
|
-
|
|
18
|
+
from torchax.view import View
|
|
19
19
|
# Keys are OpOverload, value is a callable that takes
|
|
20
20
|
# Tensor
|
|
21
21
|
all_ops = {}
|
|
22
22
|
|
|
23
|
-
# list all Aten ops from pytorch that does mutation
|
|
24
|
-
# and need to be implemented in jax
|
|
25
|
-
|
|
26
|
-
mutation_ops_to_functional = {
|
|
27
|
-
torch.ops.aten.add_: torch.ops.aten.add,
|
|
28
|
-
torch.ops.aten.sub_: torch.ops.aten.sub,
|
|
29
|
-
torch.ops.aten.mul_: torch.ops.aten.mul,
|
|
30
|
-
torch.ops.aten.div_: torch.ops.aten.div,
|
|
31
|
-
torch.ops.aten.pow_: torch.ops.aten.pow,
|
|
32
|
-
torch.ops.aten.lt_: torch.ops.aten.lt,
|
|
33
|
-
torch.ops.aten.le_: torch.ops.aten.le,
|
|
34
|
-
torch.ops.aten.gt_: torch.ops.aten.gt,
|
|
35
|
-
torch.ops.aten.ge_: torch.ops.aten.ge,
|
|
36
|
-
torch.ops.aten.eq_: torch.ops.aten.eq,
|
|
37
|
-
torch.ops.aten.ne_: torch.ops.aten.ne,
|
|
38
|
-
torch.ops.aten.bernoulli_: torch.ops.aten.bernoulli.p,
|
|
39
|
-
torch.ops.aten.geometric_: torch.ops.aten.geometric,
|
|
40
|
-
torch.ops.aten.normal_: torch.ops.aten.normal,
|
|
41
|
-
torch.ops.aten.random_: torch.ops.aten.uniform,
|
|
42
|
-
torch.ops.aten.uniform_: torch.ops.aten.uniform,
|
|
43
|
-
torch.ops.aten.relu_: torch.ops.aten.relu,
|
|
44
|
-
# squeeze_ is expected to change tensor's shape. So replace with new value
|
|
45
|
-
torch.ops.aten.squeeze_: (torch.ops.aten.squeeze, True),
|
|
46
|
-
torch.ops.aten.sqrt_: torch.ops.aten.sqrt,
|
|
47
|
-
torch.ops.aten.clamp_: torch.ops.aten.clamp,
|
|
48
|
-
torch.ops.aten.clamp_min_: torch.ops.aten.clamp_min,
|
|
49
|
-
torch.ops.aten.sigmoid_: torch.ops.aten.sigmoid,
|
|
50
|
-
torch.ops.aten.tanh_: torch.ops.aten.tanh,
|
|
51
|
-
torch.ops.aten.ceil_: torch.ops.aten.ceil,
|
|
52
|
-
torch.ops.aten.logical_not_: torch.ops.aten.logical_not,
|
|
53
|
-
torch.ops.aten.unsqueeze_: torch.ops.aten.unsqueeze,
|
|
54
|
-
torch.ops.aten.transpose_: torch.ops.aten.transpose,
|
|
55
|
-
torch.ops.aten.log_normal_: torch.ops.aten.log_normal,
|
|
56
|
-
torch.ops.aten.scatter_add_: torch.ops.aten.scatter_add,
|
|
57
|
-
torch.ops.aten.scatter_reduce_.two: torch.ops.aten.scatter_reduce,
|
|
58
|
-
torch.ops.aten.scatter_: torch.ops.aten.scatter,
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
# Note: tuple comparisons work intuitively, e.g. `_jax_version >= (0, 4, 32)`.
|
|
62
|
-
_jax_version = tuple(int(v) for v in jax.version._version.split("."))
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def make_mutation(op):
|
|
66
|
-
if type(mutation_ops_to_functional[op]) is tuple:
|
|
67
|
-
return op_base.InplaceOp(mutation_ops_to_functional[op][0],
|
|
68
|
-
replace=mutation_ops_to_functional[op][1],
|
|
69
|
-
position_to_mutate=0)
|
|
70
|
-
return op_base.InplaceOp(mutation_ops_to_functional[op], position_to_mutate=0)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
for op in mutation_ops_to_functional.keys():
|
|
74
|
-
ops_registry.register_torch_dispatch_op(
|
|
75
|
-
op, make_mutation(op), is_jax_function=False
|
|
76
|
-
)
|
|
77
|
-
|
|
78
23
|
|
|
79
24
|
def op(*aten, **kwargs):
|
|
25
|
+
|
|
80
26
|
def inner(func):
|
|
81
27
|
for a in aten:
|
|
82
28
|
ops_registry.register_torch_dispatch_op(a, func, **kwargs)
|
|
83
29
|
continue
|
|
84
30
|
|
|
85
31
|
if isinstance(a, torch._ops.OpOverloadPacket):
|
|
86
|
-
opname = a.default.name() if 'default' in a.overloads(
|
|
32
|
+
opname = a.default.name() if 'default' in a.overloads(
|
|
33
|
+
) else a._qualified_op_name
|
|
87
34
|
elif isinstance(a, torch._ops.OpOverload):
|
|
88
35
|
opname = a.name()
|
|
89
36
|
else:
|
|
@@ -91,17 +38,18 @@ def op(*aten, **kwargs):
|
|
|
91
38
|
|
|
92
39
|
torchfunc = functools.partial(interop.call_jax, func)
|
|
93
40
|
# HACK: to_copy is where we make the initial conversion from CPU tensor to JAX tensor
|
|
94
|
-
torch.library.impl(opname, 'privateuseone')(
|
|
41
|
+
torch.library.impl(opname, 'privateuseone')(
|
|
42
|
+
torchfunc if a != torch.ops.aten._to_copy else func)
|
|
95
43
|
return func
|
|
96
44
|
|
|
97
45
|
return inner
|
|
98
46
|
|
|
99
47
|
|
|
100
48
|
@op(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
49
|
+
torch.ops.aten.view_copy,
|
|
50
|
+
torch.ops.aten.view,
|
|
51
|
+
torch.ops.aten._unsafe_view,
|
|
52
|
+
torch.ops.aten.reshape,
|
|
105
53
|
)
|
|
106
54
|
def _aten_unsafe_view(x, shape):
|
|
107
55
|
return jnp.reshape(x, shape)
|
|
@@ -121,8 +69,19 @@ def _aten_add(x, y, *, alpha=1):
|
|
|
121
69
|
return res
|
|
122
70
|
|
|
123
71
|
|
|
124
|
-
@op(torch.ops.aten.copy_,
|
|
125
|
-
|
|
72
|
+
@op(torch.ops.aten.copy_,
|
|
73
|
+
is_jax_function=False,
|
|
74
|
+
is_view_op=True,
|
|
75
|
+
needs_env=True)
|
|
76
|
+
def _aten_copy(x, y, memory_format=None, env=None):
|
|
77
|
+
|
|
78
|
+
if y.device.type == 'cpu':
|
|
79
|
+
y = env.to_xla(y)
|
|
80
|
+
|
|
81
|
+
if isinstance(x, View):
|
|
82
|
+
x.update(y)
|
|
83
|
+
return x
|
|
84
|
+
|
|
126
85
|
if x.ndim == 1 and y.ndim == 0:
|
|
127
86
|
# case of torch.empty((1,)).copy_(tensor(N))
|
|
128
87
|
# we need to return 0D tensor([N]) and not scalar tensor(N)
|
|
@@ -147,14 +106,20 @@ def _aten_trunc(x):
|
|
|
147
106
|
|
|
148
107
|
@op(torch.ops.aten.index_copy)
|
|
149
108
|
def _aten_index_copy(x, dim, indexes, source):
|
|
109
|
+
if x.ndim == 0:
|
|
110
|
+
return source
|
|
111
|
+
if x.ndim == 1:
|
|
112
|
+
source = jnp.squeeze(source)
|
|
150
113
|
# return jax.lax.scatter(x, index, dim)
|
|
114
|
+
if dim < 0:
|
|
115
|
+
dim = dim + x.ndim
|
|
151
116
|
dims = []
|
|
152
117
|
for i in range(len(x.shape)):
|
|
153
118
|
if i == dim:
|
|
154
119
|
dims.append(indexes)
|
|
155
120
|
else:
|
|
156
121
|
dims.append(slice(None, None, None))
|
|
157
|
-
return x.at[
|
|
122
|
+
return x.at[tuple(dims)].set(source)
|
|
158
123
|
|
|
159
124
|
|
|
160
125
|
# aten.cauchy_
|
|
@@ -199,7 +164,9 @@ def _aten_complex(real, imag):
|
|
|
199
164
|
Returns:
|
|
200
165
|
A complex array with the specified real and imaginary parts.
|
|
201
166
|
"""
|
|
202
|
-
return jnp.array(
|
|
167
|
+
return jnp.array(
|
|
168
|
+
real, dtype=jnp.float32) + 1j * jnp.array(
|
|
169
|
+
imag, dtype=jnp.float32)
|
|
203
170
|
|
|
204
171
|
|
|
205
172
|
# aten.exponential_
|
|
@@ -223,13 +190,14 @@ def _aten_exponential_(x, lambd=1.0):
|
|
|
223
190
|
# aten.linalg_householder_product
|
|
224
191
|
@op(torch.ops.aten.linalg_householder_product)
|
|
225
192
|
def _aten_linalg_householder_product(input, tau):
|
|
226
|
-
return jax.lax.linalg.householder_product(a
|
|
193
|
+
return jax.lax.linalg.householder_product(a=input, taus=tau)
|
|
227
194
|
|
|
228
195
|
|
|
229
196
|
@op(torch.ops.aten.select)
|
|
230
197
|
def _aten_select(x, dim, indexes):
|
|
231
198
|
return jax.lax.index_in_dim(x, index=indexes, axis=dim, keepdims=False)
|
|
232
199
|
|
|
200
|
+
|
|
233
201
|
@op(torch.ops.aten.index_select)
|
|
234
202
|
@op(torch.ops.aten.select_copy)
|
|
235
203
|
def _aten_index_select(x, dim, index):
|
|
@@ -249,11 +217,10 @@ def _aten_linalg_cholesky_ex(input, upper=False, check_errors=False):
|
|
|
249
217
|
raise NotImplementedError(
|
|
250
218
|
"check_errors=True is not supported in this JAX implementation. "
|
|
251
219
|
"Check for positive definiteness using jnp.linalg.eigvalsh before "
|
|
252
|
-
"calling this function."
|
|
253
|
-
)
|
|
220
|
+
"calling this function.")
|
|
254
221
|
|
|
255
222
|
L = jax.scipy.linalg.cholesky(input, lower=not upper)
|
|
256
|
-
if len(L.shape) >2:
|
|
223
|
+
if len(L.shape) > 2:
|
|
257
224
|
info = jnp.zeros(shape=L.shape[:-2], dtype=jnp.int32)
|
|
258
225
|
else:
|
|
259
226
|
info = jnp.array(0, dtype=jnp.int32)
|
|
@@ -263,7 +230,7 @@ def _aten_linalg_cholesky_ex(input, upper=False, check_errors=False):
|
|
|
263
230
|
@op(torch.ops.aten.cholesky_solve)
|
|
264
231
|
def _aten_cholesky_solve(input, input2, upper=False):
|
|
265
232
|
# Ensure input2 is lower triangular for cho_solve
|
|
266
|
-
L = input2 if not upper else input2.T
|
|
233
|
+
L = input2 if not upper else input2.T
|
|
267
234
|
# Use cho_solve to solve the linear system
|
|
268
235
|
solution = jax.scipy.linalg.cho_solve((L, True), input)
|
|
269
236
|
return solution
|
|
@@ -275,7 +242,7 @@ def _aten_special_zeta(x, q):
|
|
|
275
242
|
res = jax.scipy.special.zeta(x, q)
|
|
276
243
|
if isinstance(x, int) or isinstance(q, int):
|
|
277
244
|
res = res.astype(new_dtype)
|
|
278
|
-
return res
|
|
245
|
+
return res # jax.scipy.special.zeta(x, q)
|
|
279
246
|
|
|
280
247
|
|
|
281
248
|
# aten.igammac
|
|
@@ -286,7 +253,7 @@ def _aten_igammac(input, other):
|
|
|
286
253
|
if isinstance(other, jnp.ndarray):
|
|
287
254
|
other = jnp.where(other < 0, jnp.nan, other)
|
|
288
255
|
else:
|
|
289
|
-
if (input==0 and other==0) or (input < 0) or (other < 0):
|
|
256
|
+
if (input == 0 and other == 0) or (input < 0) or (other < 0):
|
|
290
257
|
other = jnp.nan
|
|
291
258
|
return jnp.array(jax.scipy.special.gammaincc(input, other))
|
|
292
259
|
|
|
@@ -294,7 +261,7 @@ def _aten_igammac(input, other):
|
|
|
294
261
|
@op(torch.ops.aten.mean)
|
|
295
262
|
def _aten_mean(x, dim=None, keepdim=False):
|
|
296
263
|
if x.shape == () and dim is not None:
|
|
297
|
-
dim = None
|
|
264
|
+
dim = None # disable dim for jax array without dim
|
|
298
265
|
return jnp.mean(x, dim, keepdims=keepdim)
|
|
299
266
|
|
|
300
267
|
|
|
@@ -310,13 +277,14 @@ def _torch_binary_scalar_type(scalar, tensor):
|
|
|
310
277
|
|
|
311
278
|
|
|
312
279
|
@op(torch.ops.aten.searchsorted.Tensor)
|
|
313
|
-
def _aten_searchsorted(sorted_sequence, values):
|
|
280
|
+
def _aten_searchsorted(sorted_sequence, values):
|
|
314
281
|
new_dtype = mappings.t2j_dtype(torch.get_default_dtype())
|
|
315
282
|
res = jnp.searchsorted(sorted_sequence, values)
|
|
316
|
-
if sorted_sequence.dtype == np.dtype(
|
|
283
|
+
if sorted_sequence.dtype == np.dtype(
|
|
284
|
+
np.int32) or sorted_sequence.dtype == np.dtype(np.int32):
|
|
317
285
|
# res = res.astype(new_dtype)
|
|
318
286
|
res = res.astype(np.dtype(np.int64))
|
|
319
|
-
return res
|
|
287
|
+
return res # jnp.searchsorted(sorted_sequence, values)
|
|
320
288
|
|
|
321
289
|
|
|
322
290
|
@op(torch.ops.aten.sub.Tensor)
|
|
@@ -328,7 +296,7 @@ def _aten_sub(x, y, alpha=1):
|
|
|
328
296
|
if isinstance(y, float):
|
|
329
297
|
dtype = _torch_binary_scalar_type(y, x)
|
|
330
298
|
y = jnp.array(y, dtype=dtype)
|
|
331
|
-
return x - y*alpha
|
|
299
|
+
return x - y * alpha
|
|
332
300
|
|
|
333
301
|
|
|
334
302
|
@op(torch.ops.aten.numpy_T)
|
|
@@ -345,7 +313,6 @@ def _aten_numpy_T(input):
|
|
|
345
313
|
return jnp.transpose(input)
|
|
346
314
|
|
|
347
315
|
|
|
348
|
-
|
|
349
316
|
@op(torch.ops.aten.mm)
|
|
350
317
|
def _aten_mm(x, y):
|
|
351
318
|
res = x @ y
|
|
@@ -379,13 +346,15 @@ def _aten_t(x):
|
|
|
379
346
|
@op(torch.ops.aten.transpose)
|
|
380
347
|
@op(torch.ops.aten.transpose_copy)
|
|
381
348
|
def _aten_transpose(x, dim0, dim1):
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
349
|
+
if x.ndim == 0:
|
|
350
|
+
return x
|
|
351
|
+
dim0 = dim0 if dim0 >= 0 else dim0 + x.ndim
|
|
352
|
+
dim1 = dim1 if dim1 >= 0 else dim1 + x.ndim
|
|
353
|
+
return jnp.swapaxes(x, dim0, dim1)
|
|
385
354
|
|
|
386
355
|
|
|
387
356
|
@op(torch.ops.aten.triu)
|
|
388
|
-
def _aten_triu(m, k):
|
|
357
|
+
def _aten_triu(m, k=0):
|
|
389
358
|
return jnp.triu(m, k)
|
|
390
359
|
|
|
391
360
|
|
|
@@ -406,6 +375,7 @@ def _aten_slice(self, dim=0, start=None, end=None, step=1):
|
|
|
406
375
|
return self[tuple(dims)]
|
|
407
376
|
|
|
408
377
|
|
|
378
|
+
@op(torch.ops.aten.positive)
|
|
409
379
|
@op(torch.ops.aten.detach)
|
|
410
380
|
def _aten_detach(self):
|
|
411
381
|
return self
|
|
@@ -439,7 +409,8 @@ def _aten_resize_as_(x, y):
|
|
|
439
409
|
|
|
440
410
|
@op(torch.ops.aten.repeat_interleave.Tensor)
|
|
441
411
|
def repeat_interleave(repeats, dim=0):
|
|
442
|
-
return jnp.repeat(
|
|
412
|
+
return jnp.repeat(np.arange(repeats.shape[dim]), repeats)
|
|
413
|
+
|
|
443
414
|
|
|
444
415
|
@op(torch.ops.aten.repeat_interleave.self_int)
|
|
445
416
|
@op(torch.ops.aten.repeat_interleave.self_Tensor)
|
|
@@ -451,12 +422,6 @@ def repeat_interleave(self, repeats, dim=0):
|
|
|
451
422
|
return jnp.repeat(self, repeats, dim, total_repeat_length=total_repeat_length)
|
|
452
423
|
|
|
453
424
|
|
|
454
|
-
# aten.upsample_bilinear2d
|
|
455
|
-
@op(torch.ops.aten.upsample_bilinear2d)
|
|
456
|
-
def _aten_upsample_bilinear2d(x, output_size, align_corners=False, scale_h=None, scale_w=None):
|
|
457
|
-
return _aten_upsample_bilinear2d_aa(x, output_size=output_size, align_corners=align_corners, scale_factors=None, scales_h=scale_h, scales_w=scale_w)
|
|
458
|
-
|
|
459
|
-
|
|
460
425
|
@op(torch.ops.aten.view_as_real)
|
|
461
426
|
def _aten_view_as_real(x):
|
|
462
427
|
real = jnp.real(x)
|
|
@@ -473,7 +438,7 @@ def _aten_stack(tensors, dim=0):
|
|
|
473
438
|
@op(torch.ops.aten._softmax)
|
|
474
439
|
@op(torch.ops.aten.softmax)
|
|
475
440
|
@op(torch.ops.aten.softmax.int)
|
|
476
|
-
def _aten_softmax(x, dim, halftofloat
|
|
441
|
+
def _aten_softmax(x, dim, halftofloat=False):
|
|
477
442
|
if x.shape == ():
|
|
478
443
|
return jax.nn.softmax(x.reshape([1]), axis=0).reshape([])
|
|
479
444
|
return jax.nn.softmax(x, dim)
|
|
@@ -482,10 +447,12 @@ def _aten_softmax(x, dim, halftofloat = False):
|
|
|
482
447
|
def _is_int(x):
|
|
483
448
|
if isinstance(x, int):
|
|
484
449
|
return True
|
|
485
|
-
if isinstance(x, jax.Array) and (x.dtype.name.startswith('int') or
|
|
450
|
+
if isinstance(x, jax.Array) and (x.dtype.name.startswith('int') or
|
|
451
|
+
x.dtype.name.startswith('uint')):
|
|
486
452
|
return True
|
|
487
453
|
return False
|
|
488
454
|
|
|
455
|
+
|
|
489
456
|
def highest_precision_int_dtype(tensor1, tensor2):
|
|
490
457
|
if isinstance(tensor1, int):
|
|
491
458
|
return tensor2.dtype
|
|
@@ -493,12 +460,20 @@ def highest_precision_int_dtype(tensor1, tensor2):
|
|
|
493
460
|
return tensor1.dtype
|
|
494
461
|
|
|
495
462
|
dtype_hierarchy = {
|
|
496
|
-
'uint8': 8,
|
|
497
|
-
'
|
|
498
|
-
'
|
|
499
|
-
'
|
|
463
|
+
'uint8': 8,
|
|
464
|
+
'int8': 8,
|
|
465
|
+
'uint16': 16,
|
|
466
|
+
'int16': 16,
|
|
467
|
+
'uint32': 32,
|
|
468
|
+
'int32': 32,
|
|
469
|
+
'uint64': 64,
|
|
470
|
+
'int64': 64,
|
|
500
471
|
}
|
|
501
|
-
return max(
|
|
472
|
+
return max(
|
|
473
|
+
tensor1.dtype,
|
|
474
|
+
tensor2.dtype,
|
|
475
|
+
key=lambda dtype: dtype_hierarchy[str(dtype)])
|
|
476
|
+
|
|
502
477
|
|
|
503
478
|
@op(torch.ops.aten.pow)
|
|
504
479
|
def _aten_pow(x, y):
|
|
@@ -553,11 +528,13 @@ def _aten_div(x, y, rounding_mode=""):
|
|
|
553
528
|
def _aten_true_divide(x, y):
|
|
554
529
|
return x / y
|
|
555
530
|
|
|
531
|
+
|
|
556
532
|
@op(torch.ops.aten.dist)
|
|
557
533
|
def _aten_dist(input, other, p=2):
|
|
558
534
|
diff = jnp.abs(jnp.subtract(input, other))
|
|
559
535
|
return _aten_linalg_vector_norm(diff, ord=p)
|
|
560
536
|
|
|
537
|
+
|
|
561
538
|
@op(torch.ops.aten.bmm)
|
|
562
539
|
def _aten_bmm(x, y):
|
|
563
540
|
res = x @ y
|
|
@@ -567,9 +544,14 @@ def _aten_bmm(x, y):
|
|
|
567
544
|
|
|
568
545
|
@op(torch.ops.aten.embedding)
|
|
569
546
|
# embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False)
|
|
570
|
-
def _aten_embedding(a,
|
|
547
|
+
def _aten_embedding(a,
|
|
548
|
+
w,
|
|
549
|
+
padding_idx=-1,
|
|
550
|
+
scale_grad_by_freq=False,
|
|
551
|
+
sparse=False):
|
|
571
552
|
return jnp.take(a, w, axis=0)
|
|
572
553
|
|
|
554
|
+
|
|
573
555
|
@op(torch.ops.aten.embedding_renorm_)
|
|
574
556
|
def _aten_embedding_renorm_(weight, indices, max_norm, norm_type):
|
|
575
557
|
# Adapted from https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/Embedding.cpp
|
|
@@ -587,27 +569,26 @@ def _aten_embedding_renorm_(weight, indices, max_norm, norm_type):
|
|
|
587
569
|
|
|
588
570
|
indices_to_update = unique_indices[indice_idx]
|
|
589
571
|
|
|
590
|
-
weight = weight.at[indices_to_update].set(
|
|
591
|
-
|
|
592
|
-
)
|
|
572
|
+
weight = weight.at[indices_to_update].set(weight[indices_to_update] *
|
|
573
|
+
scale[:, None])
|
|
593
574
|
return weight
|
|
594
575
|
|
|
576
|
+
|
|
595
577
|
#- func: _embedding_bag_forward_only(
|
|
596
578
|
# Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False,
|
|
597
579
|
# int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
|
|
598
580
|
@op(torch.ops.aten._embedding_bag)
|
|
599
581
|
@op(torch.ops.aten._embedding_bag_forward_only)
|
|
600
|
-
def _aten__embedding_bag(
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
"""Jax implementation of the PyTorch _embedding_bag function.
|
|
582
|
+
def _aten__embedding_bag(weight,
|
|
583
|
+
indices,
|
|
584
|
+
offsets=None,
|
|
585
|
+
scale_grad_by_freq=False,
|
|
586
|
+
mode=0,
|
|
587
|
+
sparse=False,
|
|
588
|
+
per_sample_weights=None,
|
|
589
|
+
include_last_offset=False,
|
|
590
|
+
padding_idx=-1):
|
|
591
|
+
"""Jax implementation of the PyTorch _embedding_bag function.
|
|
611
592
|
|
|
612
593
|
Args:
|
|
613
594
|
weight: The learnable weights of the module of shape (num_embeddings, embedding_dim).
|
|
@@ -623,48 +604,50 @@ def _aten__embedding_bag(
|
|
|
623
604
|
Returns:
|
|
624
605
|
A tuple of (output, offset2bag, bag_size, max_indices).
|
|
625
606
|
"""
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
607
|
+
embedded = _aten_embedding(weight, indices, padding_idx)
|
|
608
|
+
|
|
609
|
+
if offsets is None:
|
|
610
|
+
# offsets is None only when indices.ndim > 1
|
|
611
|
+
if mode == 0: # sum
|
|
612
|
+
output = jnp.sum(embedded, axis=1)
|
|
613
|
+
elif mode == 1: # mean
|
|
614
|
+
output = jnp.mean(embedded, axis=1)
|
|
615
|
+
elif mode == 2: # max
|
|
616
|
+
output = jnp.max(embedded, axis=1)
|
|
617
|
+
return output, None, None, None
|
|
618
|
+
|
|
619
|
+
if isinstance(offsets, jax.Array):
|
|
620
|
+
offsets_np = np.array(offsets)
|
|
621
|
+
else:
|
|
622
|
+
offsets_np = offsets
|
|
623
|
+
offset2bag = np.zeros(indices.shape[0], dtype=np.int64)
|
|
624
|
+
bag_size = np.zeros(offsets_np.shape[0], dtype=np.int64)
|
|
625
|
+
max_indices = jnp.full_like(indices, -1)
|
|
645
626
|
|
|
646
|
-
|
|
647
|
-
|
|
627
|
+
for bag in range(offsets_np.shape[0]):
|
|
628
|
+
start = int(offsets_np[bag])
|
|
648
629
|
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
630
|
+
end = int(indices.shape[0] if bag +
|
|
631
|
+
1 == offsets_np.shape[0] else offsets_np[bag + 1])
|
|
632
|
+
bag_size[bag] = end - start
|
|
633
|
+
offset2bag = offset2bag.at[start:end].set(bag)
|
|
652
634
|
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
635
|
+
if end - start > 0:
|
|
636
|
+
if mode == 0:
|
|
637
|
+
output_bag = jnp.sum(embedded[start:end], axis=0)
|
|
638
|
+
elif mode == 1:
|
|
639
|
+
output_bag = jnp.mean(embedded[start:end], axis=0)
|
|
640
|
+
elif mode == 2:
|
|
641
|
+
output_bag = jnp.max(embedded[start:end], axis=0)
|
|
642
|
+
max_indices = max_indices.at[start:end].set(
|
|
643
|
+
jnp.argmax(embedded[start:end], axis=0))
|
|
661
644
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
645
|
+
# The original code returned offset2bag, bag_size, and max_indices as numpy arrays.
|
|
646
|
+
# Converting them to JAX arrays for consistency.
|
|
647
|
+
offset2bag = jnp.array(offset2bag)
|
|
648
|
+
bag_size = jnp.array(bag_size)
|
|
666
649
|
|
|
667
|
-
|
|
650
|
+
return output_bag, offset2bag, bag_size, max_indices
|
|
668
651
|
|
|
669
652
|
|
|
670
653
|
@op(torch.ops.aten.rsqrt)
|
|
@@ -676,6 +659,7 @@ def _aten_rsqrt(x):
|
|
|
676
659
|
@op(torch.ops.aten.expand)
|
|
677
660
|
@op(torch.ops.aten.expand_copy)
|
|
678
661
|
def _aten_expand(x, dims):
|
|
662
|
+
|
|
679
663
|
def fix_dims(d, xs):
|
|
680
664
|
if d == -1:
|
|
681
665
|
return xs
|
|
@@ -683,7 +667,9 @@ def _aten_expand(x, dims):
|
|
|
683
667
|
|
|
684
668
|
shape = list(x.shape)
|
|
685
669
|
if len(shape) < len(dims):
|
|
686
|
-
shape = [
|
|
670
|
+
shape = [
|
|
671
|
+
1,
|
|
672
|
+
] * (len(dims) - len(shape)) + shape
|
|
687
673
|
# make sure that dims and shape is the same by
|
|
688
674
|
# left pad with 1s. Otherwise the zip below will
|
|
689
675
|
# truncate
|
|
@@ -705,15 +691,15 @@ def _aten__to_copy(self, **kwargs):
|
|
|
705
691
|
|
|
706
692
|
|
|
707
693
|
@op(torch.ops.aten.empty)
|
|
708
|
-
@op_base.convert_dtype()
|
|
694
|
+
@op_base.convert_dtype(use_default_dtype=False)
|
|
709
695
|
def _aten_empty(size: Sequence[int], *, dtype=None, **kwargs):
|
|
710
696
|
return jnp.empty(size, dtype=dtype)
|
|
711
697
|
|
|
712
698
|
|
|
713
699
|
@op(torch.ops.aten.empty_like)
|
|
714
|
-
@op_base.convert_dtype()
|
|
700
|
+
@op_base.convert_dtype(use_default_dtype=False)
|
|
715
701
|
def _aten_empty_like(input, *, dtype=None, **kwargs):
|
|
716
|
-
return jnp.empty_like(input, dtype
|
|
702
|
+
return jnp.empty_like(input, dtype)
|
|
717
703
|
|
|
718
704
|
|
|
719
705
|
@op(torch.ops.aten.ones)
|
|
@@ -784,8 +770,8 @@ def split_with_sizes(x, sizes, dim=0):
|
|
|
784
770
|
A list of sub-arrays.
|
|
785
771
|
"""
|
|
786
772
|
if isinstance(sizes, int):
|
|
787
|
-
# split equal size
|
|
788
|
-
new_sizes = [sizes] * (x.shape[dim] // sizes)
|
|
773
|
+
# split equal size, round up
|
|
774
|
+
new_sizes = [sizes] * (-(-x.shape[dim] // sizes))
|
|
789
775
|
sizes = new_sizes
|
|
790
776
|
rank = x.ndim
|
|
791
777
|
splits = np.cumsum(sizes) # Cumulative sum for split points
|
|
@@ -796,14 +782,15 @@ def split_with_sizes(x, sizes, dim=0):
|
|
|
796
782
|
return tuple(res)
|
|
797
783
|
|
|
798
784
|
return [
|
|
799
|
-
|
|
800
|
-
|
|
785
|
+
x[make_range(rank, dim, start, end)]
|
|
786
|
+
for start, end in zip([0] + list(splits[:-1]), splits)
|
|
801
787
|
]
|
|
802
788
|
|
|
803
789
|
|
|
804
790
|
@op(torch.ops.aten.permute)
|
|
805
791
|
@op(torch.ops.aten.permute_copy)
|
|
806
792
|
def permute(t, dims):
|
|
793
|
+
# TODO: return a View instead
|
|
807
794
|
return jnp.transpose(t, dims)
|
|
808
795
|
|
|
809
796
|
|
|
@@ -819,6 +806,7 @@ def _aten_unsqueeze(self, dim):
|
|
|
819
806
|
def _aten_ne(x, y):
|
|
820
807
|
return jnp.not_equal(x, y)
|
|
821
808
|
|
|
809
|
+
|
|
822
810
|
# Create indices along a specific axis
|
|
823
811
|
#
|
|
824
812
|
# For example
|
|
@@ -832,14 +820,12 @@ def _aten_ne(x, y):
|
|
|
832
820
|
def _indices_along_axis(x, axis):
|
|
833
821
|
return jnp.expand_dims(
|
|
834
822
|
jnp.arange(x.shape[axis]),
|
|
835
|
-
axis
|
|
836
|
-
|
|
823
|
+
axis=[d for d in range(len(x.shape)) if d != axis])
|
|
824
|
+
|
|
837
825
|
|
|
838
826
|
def _broadcast_indices(indices, shape):
|
|
839
|
-
return jnp.broadcast_to(
|
|
840
|
-
|
|
841
|
-
shape
|
|
842
|
-
)
|
|
827
|
+
return jnp.broadcast_to(indices, shape)
|
|
828
|
+
|
|
843
829
|
|
|
844
830
|
@op(torch.ops.aten.cummax)
|
|
845
831
|
def _aten_cummax(x, dim):
|
|
@@ -851,36 +837,45 @@ def _aten_cummax(x, dim):
|
|
|
851
837
|
indice_along_axis = _indices_along_axis(x, axis)
|
|
852
838
|
indices = _broadcast_indices(indice_along_axis, x.shape)
|
|
853
839
|
|
|
854
|
-
|
|
855
840
|
def cummax_reduce_func(carry, elem):
|
|
856
|
-
v1, v2 = carry['val'], elem['val']
|
|
841
|
+
v1, v2 = carry['val'], elem['val']
|
|
857
842
|
i1, i2 = carry['idx'], elem['idx']
|
|
858
843
|
|
|
859
844
|
v = jnp.maximum(v1, v2)
|
|
860
845
|
i = jnp.where(v1 > v2, i1, i2)
|
|
861
846
|
return {'val': v, 'idx': i}
|
|
862
|
-
|
|
847
|
+
|
|
848
|
+
res = jax.lax.associative_scan(
|
|
849
|
+
cummax_reduce_func, {
|
|
850
|
+
'val': x,
|
|
851
|
+
'idx': indices
|
|
852
|
+
}, axis=axis)
|
|
863
853
|
return res['val'], res['idx']
|
|
864
854
|
|
|
855
|
+
|
|
865
856
|
@op(torch.ops.aten.cummin)
|
|
866
857
|
def _aten_cummin(x, dim):
|
|
867
858
|
if not x.shape:
|
|
868
859
|
return x, jnp.zeros_like(x, dtype=jnp.int64)
|
|
869
|
-
|
|
860
|
+
|
|
870
861
|
axis = dim
|
|
871
862
|
|
|
872
863
|
indice_along_axis = _indices_along_axis(x, axis)
|
|
873
864
|
indices = _broadcast_indices(indice_along_axis, x.shape)
|
|
874
865
|
|
|
875
866
|
def cummin_reduce_func(carry, elem):
|
|
876
|
-
v1, v2 = carry['val'], elem['val']
|
|
867
|
+
v1, v2 = carry['val'], elem['val']
|
|
877
868
|
i1, i2 = carry['idx'], elem['idx']
|
|
878
869
|
|
|
879
870
|
v = jnp.minimum(v1, v2)
|
|
880
871
|
i = jnp.where(v1 < v2, i1, i2)
|
|
881
872
|
return {'val': v, 'idx': i}
|
|
882
873
|
|
|
883
|
-
res = jax.lax.associative_scan(
|
|
874
|
+
res = jax.lax.associative_scan(
|
|
875
|
+
cummin_reduce_func, {
|
|
876
|
+
'val': x,
|
|
877
|
+
'idx': indices
|
|
878
|
+
}, axis=axis)
|
|
884
879
|
return res['val'], res['idx']
|
|
885
880
|
|
|
886
881
|
|
|
@@ -908,9 +903,11 @@ def _aten_cumprod(input, dim, dtype=None, out=None):
|
|
|
908
903
|
|
|
909
904
|
|
|
910
905
|
@op(torch.ops.aten.native_layer_norm)
|
|
911
|
-
def _aten_native_layer_norm(
|
|
912
|
-
|
|
913
|
-
|
|
906
|
+
def _aten_native_layer_norm(input,
|
|
907
|
+
normalized_shape,
|
|
908
|
+
weight=None,
|
|
909
|
+
bias=None,
|
|
910
|
+
eps=1e-5):
|
|
914
911
|
"""Implements layer normalization in Jax as defined by `aten::native_layer_norm`.
|
|
915
912
|
|
|
916
913
|
Args:
|
|
@@ -944,7 +941,7 @@ def _aten_native_layer_norm(
|
|
|
944
941
|
norm_x += bias
|
|
945
942
|
return norm_x, mean, rstd
|
|
946
943
|
|
|
947
|
-
|
|
944
|
+
|
|
948
945
|
@op(torch.ops.aten.matmul)
|
|
949
946
|
def _aten_matmul(x, y):
|
|
950
947
|
return x @ y
|
|
@@ -960,6 +957,7 @@ def _aten_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0):
|
|
|
960
957
|
self += alpha * jnp.matmul(mat1, mat2)
|
|
961
958
|
return self
|
|
962
959
|
|
|
960
|
+
|
|
963
961
|
@op(torch.ops.aten.sparse_sampled_addmm)
|
|
964
962
|
def _aten_sparse_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0):
|
|
965
963
|
alpha = jnp.array(alpha).astype(mat1.dtype)
|
|
@@ -974,9 +972,8 @@ def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1):
|
|
|
974
972
|
alpha = jnp.array(alpha).astype(batch1.dtype)
|
|
975
973
|
beta = jnp.array(beta).astype(batch1.dtype)
|
|
976
974
|
mm = jnp.einsum("bxy, byz -> xz", batch1, batch2)
|
|
977
|
-
return jax.lax.cond(
|
|
978
|
-
|
|
979
|
-
)
|
|
975
|
+
return jax.lax.cond(beta == 0, lambda: alpha * mm,
|
|
976
|
+
lambda: beta * input + alpha * mm)
|
|
980
977
|
|
|
981
978
|
|
|
982
979
|
@op(torch.ops.aten.gelu)
|
|
@@ -987,73 +984,69 @@ def _aten_gelu(self, *, approximate="none"):
|
|
|
987
984
|
|
|
988
985
|
@op(torch.ops.aten.squeeze)
|
|
989
986
|
@op(torch.ops.aten.squeeze_copy)
|
|
990
|
-
def _aten_squeeze_dim(self, dim):
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
Args:
|
|
994
|
-
self: The input tensor.
|
|
995
|
-
dim: The dimension to squeeze.
|
|
996
|
-
|
|
997
|
-
Returns:
|
|
998
|
-
The squeezed tensor with the specified dimension removed if it is 1,
|
|
999
|
-
otherwise the original tensor is returned.
|
|
1000
|
-
"""
|
|
1001
|
-
|
|
1002
|
-
# Validate input arguments
|
|
1003
|
-
if not isinstance(self, jnp.ndarray):
|
|
1004
|
-
raise TypeError(f"Expected a Jax tensor, got {type(self)}.")
|
|
1005
|
-
if isinstance(dim, int):
|
|
1006
|
-
dim = [dim]
|
|
1007
|
-
|
|
1008
|
-
# Check if the specified dimension has size 1
|
|
1009
|
-
if (len(self.shape) == 0) or all([self.shape[d] != 1 for d in dim]):
|
|
987
|
+
def _aten_squeeze_dim(self, dim=None):
|
|
988
|
+
if self.ndim == 0:
|
|
1010
989
|
return self
|
|
990
|
+
if dim is not None:
|
|
991
|
+
if isinstance(dim, int):
|
|
992
|
+
if self.shape[dim] != 1:
|
|
993
|
+
return self
|
|
994
|
+
if dim < 0:
|
|
995
|
+
dim += self.ndim
|
|
996
|
+
else:
|
|
997
|
+
# NOTE: torch leaves the dims that is not 1 unchanged,
|
|
998
|
+
# but jax raises error.
|
|
999
|
+
dim = [
|
|
1000
|
+
i if i >= 0 else (i + self.ndim) for i in dim if self.shape[i] == 1
|
|
1001
|
+
]
|
|
1011
1002
|
|
|
1012
|
-
|
|
1013
|
-
new_shape = list(self.shape)
|
|
1014
|
-
|
|
1015
|
-
def fix_dim(p):
|
|
1016
|
-
if p < 0:
|
|
1017
|
-
return p + len(self.shape)
|
|
1018
|
-
return p
|
|
1003
|
+
return jnp.squeeze(self, dim)
|
|
1019
1004
|
|
|
1020
|
-
dim = [fix_dim(d) for d in dim]
|
|
1021
|
-
new_shape = [p for i, p in enumerate(self.shape) if i not in dim or p != 1]
|
|
1022
|
-
return self.reshape(new_shape)
|
|
1023
1005
|
|
|
1024
1006
|
@op(torch.ops.aten.bucketize)
|
|
1025
|
-
def _aten_bucketize(input,
|
|
1026
|
-
|
|
1007
|
+
def _aten_bucketize(input,
|
|
1008
|
+
boundaries,
|
|
1009
|
+
*,
|
|
1010
|
+
out_int32=False,
|
|
1011
|
+
right=False,
|
|
1012
|
+
out=None):
|
|
1027
1013
|
return_type = jnp.int32 if out_int32 else jnp.int64
|
|
1028
1014
|
return jnp.digitize(input, boundaries, right=not right).astype(return_type)
|
|
1029
1015
|
|
|
1030
1016
|
|
|
1031
1017
|
@op(torch.ops.aten.conv2d)
|
|
1032
1018
|
def _aten_conv2d(
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1019
|
+
input,
|
|
1020
|
+
weight,
|
|
1021
|
+
bias,
|
|
1022
|
+
stride,
|
|
1023
|
+
padding,
|
|
1024
|
+
dilation,
|
|
1025
|
+
groups,
|
|
1040
1026
|
):
|
|
1041
1027
|
return _aten_convolution(
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1028
|
+
input,
|
|
1029
|
+
weight,
|
|
1030
|
+
bias,
|
|
1031
|
+
stride,
|
|
1032
|
+
padding,
|
|
1033
|
+
dilation,
|
|
1034
|
+
transposed=False,
|
|
1035
|
+
output_padding=1,
|
|
1036
|
+
groups=groups)
|
|
1037
|
+
|
|
1045
1038
|
|
|
1046
1039
|
@op(torch.ops.aten.convolution)
|
|
1047
1040
|
def _aten_convolution(
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1041
|
+
input,
|
|
1042
|
+
weight,
|
|
1043
|
+
bias,
|
|
1044
|
+
stride,
|
|
1045
|
+
padding,
|
|
1046
|
+
dilation,
|
|
1047
|
+
transposed,
|
|
1048
|
+
output_padding,
|
|
1049
|
+
groups,
|
|
1057
1050
|
):
|
|
1058
1051
|
num_shape_dim = weight.ndim - 1
|
|
1059
1052
|
batch_dims = input.shape[:-num_shape_dim]
|
|
@@ -1068,7 +1061,7 @@ def _aten_convolution(
|
|
|
1068
1061
|
# See https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d.html
|
|
1069
1062
|
pad_out = []
|
|
1070
1063
|
for i in range(num_spatial_dims):
|
|
1071
|
-
front = dilation[i] * (weight.shape[i+2] - 1) - padding[i]
|
|
1064
|
+
front = dilation[i] * (weight.shape[i + 2] - 1) - padding[i]
|
|
1072
1065
|
back = front + output_padding[i]
|
|
1073
1066
|
pad_out.append((front, back))
|
|
1074
1067
|
return pad_out
|
|
@@ -1089,39 +1082,38 @@ def _aten_convolution(
|
|
|
1089
1082
|
rhs_spec.append(i + 2)
|
|
1090
1083
|
out_spec.append(i + 2)
|
|
1091
1084
|
return jax.lax.ConvDimensionNumbers(
|
|
1092
|
-
|
|
1093
|
-
)
|
|
1085
|
+
*map(tuple, (lhs_spec, rhs_spec, out_spec)))
|
|
1094
1086
|
|
|
1095
1087
|
if transposed:
|
|
1096
|
-
rhs = jnp.flip(weight, range(2, 1+num_shape_dim))
|
|
1088
|
+
rhs = jnp.flip(weight, range(2, 1 + num_shape_dim))
|
|
1097
1089
|
if groups != 1:
|
|
1098
1090
|
# reshape filters for tranposed depthwise convolution
|
|
1099
1091
|
assert rhs.shape[0] % groups == 0
|
|
1100
|
-
rhs_shape = [rhs.shape[0]//groups, rhs.shape[1]*groups]
|
|
1092
|
+
rhs_shape = [rhs.shape[0] // groups, rhs.shape[1] * groups]
|
|
1101
1093
|
rhs_shape.extend(rhs.shape[2:])
|
|
1102
1094
|
rhs = jnp.reshape(rhs, rhs_shape)
|
|
1103
1095
|
res = jax.lax.conv_general_dilated(
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1096
|
+
input,
|
|
1097
|
+
rhs,
|
|
1098
|
+
(1,) * len(stride),
|
|
1099
|
+
make_padding(padding, len(stride)),
|
|
1100
|
+
lhs_dilation=stride,
|
|
1101
|
+
rhs_dilation=dilation,
|
|
1102
|
+
dimension_numbers=create_default_conv_dimension_numbers(len(stride)),
|
|
1103
|
+
feature_group_count=groups,
|
|
1104
|
+
batch_group_count=1,
|
|
1113
1105
|
)
|
|
1114
1106
|
else:
|
|
1115
1107
|
res = jax.lax.conv_general_dilated(
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1108
|
+
input,
|
|
1109
|
+
weight,
|
|
1110
|
+
stride,
|
|
1111
|
+
make_padding(padding, len(stride)),
|
|
1112
|
+
lhs_dilation=(1,) * len(stride),
|
|
1113
|
+
rhs_dilation=dilation,
|
|
1114
|
+
dimension_numbers=create_default_conv_dimension_numbers(len(stride)),
|
|
1115
|
+
feature_group_count=groups,
|
|
1116
|
+
batch_group_count=1,
|
|
1125
1117
|
)
|
|
1126
1118
|
|
|
1127
1119
|
if bias is not None:
|
|
@@ -1137,10 +1129,9 @@ def _aten_convolution(
|
|
|
1137
1129
|
|
|
1138
1130
|
|
|
1139
1131
|
# _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps)
|
|
1140
|
-
@op(torch.ops.aten._native_batch_norm_legit)
|
|
1141
|
-
def _aten__native_batch_norm_legit(
|
|
1142
|
-
|
|
1143
|
-
):
|
|
1132
|
+
@op(torch.ops.aten._native_batch_norm_legit.default)
|
|
1133
|
+
def _aten__native_batch_norm_legit(input, weight, bias, running_mean,
|
|
1134
|
+
running_var, training, momentum, eps):
|
|
1144
1135
|
"""JAX implementation of batch normalization with optional parameters.
|
|
1145
1136
|
Refers to https://github.com/pytorch/pytorch/blob/cd3a71f754a2248bcfe500de7c9860bd7d2002bf/torch/_decomp/decompositions.py#L1713.
|
|
1146
1137
|
|
|
@@ -1161,8 +1152,7 @@ def _aten__native_batch_norm_legit(
|
|
|
1161
1152
|
DeviceArray: Reversed batch variance (C,) or empty if training is False
|
|
1162
1153
|
"""
|
|
1163
1154
|
reduction_dims = [0] + list(range(2, input.ndim))
|
|
1164
|
-
reshape_dims = [1, -1] + [1]*(input.ndim-2)
|
|
1165
|
-
|
|
1155
|
+
reshape_dims = [1, -1] + [1] * (input.ndim - 2)
|
|
1166
1156
|
if training:
|
|
1167
1157
|
# Calculate batch mean and variance
|
|
1168
1158
|
mean = jnp.mean(input, axis=reduction_dims, keepdims=True)
|
|
@@ -1175,7 +1165,9 @@ def _aten__native_batch_norm_legit(
|
|
|
1175
1165
|
saved_rstd = jnp.squeeze(rstd, reduction_dims)
|
|
1176
1166
|
else:
|
|
1177
1167
|
rstd = jax.lax.rsqrt(running_var.reshape(reshape_dims) + eps)
|
|
1178
|
-
saved_mean = jnp.array(
|
|
1168
|
+
saved_mean = jnp.array(
|
|
1169
|
+
[], dtype=input.dtype
|
|
1170
|
+
) # No need to calculate batch statistics in inference mode
|
|
1179
1171
|
saved_rstd = jnp.array([], dtype=input.dtype)
|
|
1180
1172
|
|
|
1181
1173
|
# Normalize
|
|
@@ -1190,19 +1182,17 @@ def _aten__native_batch_norm_legit(
|
|
|
1190
1182
|
if weight is not None:
|
|
1191
1183
|
x_hat *= weight.reshape(reshape_dims) # Reshape weight for broadcasting
|
|
1192
1184
|
if bias is not None:
|
|
1193
|
-
x_hat += bias.reshape(reshape_dims)
|
|
1185
|
+
x_hat += bias.reshape(reshape_dims) # Reshape bias for broadcasting
|
|
1194
1186
|
|
|
1195
1187
|
return x_hat, saved_mean, saved_rstd
|
|
1196
1188
|
|
|
1197
1189
|
|
|
1198
|
-
|
|
1199
1190
|
@op(torch.ops.aten._native_batch_norm_legit_no_training)
|
|
1200
|
-
def _aten__native_batch_norm_legit_no_training(
|
|
1201
|
-
|
|
1202
|
-
):
|
|
1203
|
-
return _aten__native_batch_norm_legit(
|
|
1204
|
-
|
|
1205
|
-
)
|
|
1191
|
+
def _aten__native_batch_norm_legit_no_training(input, weight, bias,
|
|
1192
|
+
running_mean, running_var,
|
|
1193
|
+
momentum, eps):
|
|
1194
|
+
return _aten__native_batch_norm_legit(input, weight, bias, running_mean,
|
|
1195
|
+
running_var, False, momentum, eps)
|
|
1206
1196
|
|
|
1207
1197
|
|
|
1208
1198
|
@op(torch.ops.aten.relu)
|
|
@@ -1212,7 +1202,15 @@ def _aten_relu(self):
|
|
|
1212
1202
|
|
|
1213
1203
|
@op(torch.ops.aten.cat)
|
|
1214
1204
|
def _aten_cat(tensors, dims=0):
|
|
1215
|
-
|
|
1205
|
+
# handle empty tensors as a special case.
|
|
1206
|
+
# torch.cat will ignore the empty tensor, while jnp.concatenate
|
|
1207
|
+
# will error if the dims > 0.
|
|
1208
|
+
filtered_tensors = [
|
|
1209
|
+
t for t in tensors if not (t.ndim == 1 and t.shape[0] == 0)
|
|
1210
|
+
]
|
|
1211
|
+
if filtered_tensors:
|
|
1212
|
+
return jnp.concatenate(filtered_tensors, dims)
|
|
1213
|
+
return tensors[0]
|
|
1216
1214
|
|
|
1217
1215
|
|
|
1218
1216
|
def _ceil_mode_padding(
|
|
@@ -1220,6 +1218,7 @@ def _ceil_mode_padding(
|
|
|
1220
1218
|
input_shape: list[int],
|
|
1221
1219
|
kernel_size: list[int],
|
|
1222
1220
|
stride: list[int],
|
|
1221
|
+
dilation: list[int],
|
|
1223
1222
|
ceil_mode: bool,
|
|
1224
1223
|
):
|
|
1225
1224
|
"""Creates low and high padding specification for the given padding (which is symmetric) and ceil mode.
|
|
@@ -1232,20 +1231,13 @@ def _ceil_mode_padding(
|
|
|
1232
1231
|
right_padding = left_padding
|
|
1233
1232
|
|
|
1234
1233
|
input_size = input_shape[2 + i]
|
|
1235
|
-
output_size_rem = (input_size + 2 * left_padding -
|
|
1236
|
-
|
|
1237
|
-
]
|
|
1234
|
+
output_size_rem = (input_size + 2 * left_padding -
|
|
1235
|
+
(kernel_size[i] - 1) * dilation[i] - 1) % stride[i]
|
|
1238
1236
|
if ceil_mode and output_size_rem != 0:
|
|
1239
1237
|
extra_padding = stride[i] - output_size_rem
|
|
1240
|
-
new_output_size = (
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
+ right_padding
|
|
1244
|
-
+ extra_padding
|
|
1245
|
-
- kernel_size[i]
|
|
1246
|
-
+ stride[i]
|
|
1247
|
-
- 1
|
|
1248
|
-
) // stride[i] + 1
|
|
1238
|
+
new_output_size = (input_size + left_padding + right_padding +
|
|
1239
|
+
extra_padding - (kernel_size[i] - 1) * dilation[i] -
|
|
1240
|
+
1 + stride[i] - 1) // stride[i] + 1
|
|
1249
1241
|
# Ensure that the last pooling starts inside the image.
|
|
1250
1242
|
size_to_compare = input_size + left_padding
|
|
1251
1243
|
|
|
@@ -1258,30 +1250,36 @@ def _ceil_mode_padding(
|
|
|
1258
1250
|
|
|
1259
1251
|
@op(torch.ops.aten.max_pool2d_with_indices)
|
|
1260
1252
|
@op(torch.ops.aten.max_pool3d_with_indices)
|
|
1261
|
-
def _aten_max_pool2d_with_indices(
|
|
1262
|
-
|
|
1263
|
-
|
|
1253
|
+
def _aten_max_pool2d_with_indices(inputs,
|
|
1254
|
+
kernel_size,
|
|
1255
|
+
strides=None,
|
|
1256
|
+
padding=0,
|
|
1257
|
+
dilation=1,
|
|
1258
|
+
ceil_mode=False):
|
|
1264
1259
|
num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
|
|
1265
1260
|
kernel_size = tuple(kernel_size)
|
|
1266
|
-
|
|
1261
|
+
# Default stride is kernel_size
|
|
1262
|
+
strides = tuple(strides) if strides else kernel_size
|
|
1267
1263
|
if isinstance(padding, int):
|
|
1268
1264
|
padding = [padding for _ in range(len(kernel_size))]
|
|
1265
|
+
if isinstance(dilation, int):
|
|
1266
|
+
dilation = tuple(dilation for _ in range(len(kernel_size)))
|
|
1267
|
+
elif isinstance(dilation, list):
|
|
1268
|
+
dilation = tuple(dilation)
|
|
1269
1269
|
|
|
1270
1270
|
input_shape = inputs.shape
|
|
1271
1271
|
if num_batch_dims == 0:
|
|
1272
1272
|
input_shape = [1, *input_shape]
|
|
1273
|
-
padding = _ceil_mode_padding(
|
|
1274
|
-
|
|
1275
|
-
)
|
|
1273
|
+
padding = _ceil_mode_padding(padding, input_shape, kernel_size, strides,
|
|
1274
|
+
dilation, ceil_mode)
|
|
1276
1275
|
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
strides
|
|
1282
|
-
), f"len({window_shape}) must equal len({strides})"
|
|
1276
|
+
assert len(kernel_size) == len(
|
|
1277
|
+
strides), f"len({kernel_size=}) must equal len({strides=})"
|
|
1278
|
+
assert len(kernel_size) == len(
|
|
1279
|
+
dilation), f"len({kernel_size=}) must equal len({dilation=})"
|
|
1283
1280
|
strides = (1,) * (1 + num_batch_dims) + strides
|
|
1284
|
-
dims = (1,) * (1 + num_batch_dims) +
|
|
1281
|
+
dims = (1,) * (1 + num_batch_dims) + kernel_size
|
|
1282
|
+
dilation = (1,) * (1 + num_batch_dims) + dilation
|
|
1285
1283
|
|
|
1286
1284
|
is_single_input = False
|
|
1287
1285
|
if num_batch_dims == 0:
|
|
@@ -1290,26 +1288,27 @@ def _aten_max_pool2d_with_indices(
|
|
|
1290
1288
|
inputs = inputs[None]
|
|
1291
1289
|
strides = (1,) + strides
|
|
1292
1290
|
dims = (1,) + dims
|
|
1291
|
+
dilation = (1,) + dilation
|
|
1293
1292
|
is_single_input = True
|
|
1294
1293
|
|
|
1295
1294
|
assert inputs.ndim == len(dims), f"len({inputs.shape}) != len({dims})"
|
|
1296
1295
|
if not isinstance(padding, str):
|
|
1297
1296
|
padding = tuple(map(tuple, padding))
|
|
1298
|
-
assert len(padding) == len(
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
)
|
|
1302
|
-
|
|
1303
|
-
[len(x) == 2 for x in padding]
|
|
1304
|
-
), f"each entry in padding {padding} must be length 2"
|
|
1297
|
+
assert len(padding) == len(kernel_size), (
|
|
1298
|
+
f"padding {padding} must specify pads for same number of dims as "
|
|
1299
|
+
f"kernel_size {kernel_size}")
|
|
1300
|
+
assert all([len(x) == 2 for x in padding
|
|
1301
|
+
]), f"each entry in padding {padding} must be length 2"
|
|
1305
1302
|
padding = ((0, 0), (0, 0)) + padding
|
|
1306
1303
|
|
|
1307
|
-
indices = jnp.arange(np.prod(inputs.shape))
|
|
1304
|
+
indices = jnp.arange(np.prod(inputs.shape[-len(kernel_size):]))
|
|
1305
|
+
indices = indices.reshape(inputs.shape[-len(kernel_size):])
|
|
1306
|
+
indices = jnp.broadcast_to(indices, inputs.shape)
|
|
1308
1307
|
|
|
1309
1308
|
def reduce_fn(a, b):
|
|
1310
1309
|
ai, av = a
|
|
1311
1310
|
bi, bv = b
|
|
1312
|
-
which = av
|
|
1311
|
+
which = av >= bv # torch breaks ties in favor of later indices
|
|
1313
1312
|
return jnp.where(which, ai, bi), jnp.where(which, av, bv)
|
|
1314
1313
|
|
|
1315
1314
|
init_val = -jnp.inf
|
|
@@ -1317,44 +1316,90 @@ def _aten_max_pool2d_with_indices(
|
|
|
1317
1316
|
init_val = -(1 << 31)
|
|
1318
1317
|
init_val = jnp.array(init_val).astype(inputs.dtype)
|
|
1319
1318
|
|
|
1320
|
-
# Separate maxpool result and indices into two reduce_window ops. Since
|
|
1321
|
-
# the indices tensor is usually unused in inference, separating the two
|
|
1319
|
+
# Separate maxpool result and indices into two reduce_window ops. Since
|
|
1320
|
+
# the indices tensor is usually unused in inference, separating the two
|
|
1322
1321
|
# can help DCE computations for argmax.
|
|
1323
1322
|
y = jax.lax.reduce_window(
|
|
1324
|
-
inputs,
|
|
1325
|
-
|
|
1323
|
+
inputs,
|
|
1324
|
+
init_val,
|
|
1325
|
+
jax.lax.max,
|
|
1326
|
+
dims,
|
|
1327
|
+
strides,
|
|
1328
|
+
padding,
|
|
1329
|
+
window_dilation=dilation)
|
|
1326
1330
|
indices, _ = jax.lax.reduce_window(
|
|
1327
|
-
(indices, inputs),
|
|
1331
|
+
(indices, inputs),
|
|
1332
|
+
(0, init_val),
|
|
1333
|
+
reduce_fn,
|
|
1334
|
+
dims,
|
|
1335
|
+
strides,
|
|
1336
|
+
padding,
|
|
1337
|
+
window_dilation=dilation,
|
|
1328
1338
|
)
|
|
1329
1339
|
if is_single_input:
|
|
1330
1340
|
indices = jnp.squeeze(indices, axis=0)
|
|
1331
1341
|
y = jnp.squeeze(y, axis=0)
|
|
1332
|
-
|
|
1342
|
+
|
|
1333
1343
|
return y, indices
|
|
1334
1344
|
|
|
1335
1345
|
|
|
1346
|
+
# Aten ops registered under the `xla` library.
|
|
1347
|
+
try:
|
|
1348
|
+
|
|
1349
|
+
@op(torch.ops.xla.max_pool2d_forward)
|
|
1350
|
+
def _xla_max_pool2d_forward(*args, **kwargs):
|
|
1351
|
+
return _aten_max_pool2d_with_indices(*args, **kwargs)[0]
|
|
1352
|
+
|
|
1353
|
+
@op(torch.ops.xla.aot_mark_sharding)
|
|
1354
|
+
def _xla_aot_mark_sharding(t, mesh: str, partition_spec: str):
|
|
1355
|
+
from jax.sharding import PartitionSpec as P, NamedSharding
|
|
1356
|
+
import ast
|
|
1357
|
+
import torch_xla.distributed.spmd as xs
|
|
1358
|
+
pmesh = xs.Mesh.from_str(mesh)
|
|
1359
|
+
assert pmesh is not None
|
|
1360
|
+
partition_spec_eval = ast.literal_eval(partition_spec)
|
|
1361
|
+
jmesh = pmesh.get_jax_mesh()
|
|
1362
|
+
return jax.lax.with_sharding_constraint(
|
|
1363
|
+
t, NamedSharding(jmesh, P(*partition_spec_eval)))
|
|
1364
|
+
|
|
1365
|
+
@op(torch.ops.xla.einsum_linear_forward)
|
|
1366
|
+
def _xla_einsum_linear_forward(input, weight, bias):
|
|
1367
|
+
with jax.named_scope('einsum_linear_forward'):
|
|
1368
|
+
product = jax.numpy.einsum('...n,mn->...m', input, weight)
|
|
1369
|
+
if bias is not None:
|
|
1370
|
+
return product + bias
|
|
1371
|
+
return product
|
|
1372
|
+
|
|
1373
|
+
except AttributeError:
|
|
1374
|
+
pass
|
|
1375
|
+
|
|
1336
1376
|
# TODO add more ops
|
|
1337
1377
|
|
|
1338
1378
|
|
|
1339
1379
|
@op(torch.ops.aten.min)
|
|
1340
1380
|
def _aten_min(x, dim=None, keepdim=False):
|
|
1341
1381
|
if dim is not None:
|
|
1342
|
-
return _with_reduction_scalar(jnp.min, x, dim,
|
|
1382
|
+
return _with_reduction_scalar(jnp.min, x, dim,
|
|
1383
|
+
keepdim), _with_reduction_scalar(
|
|
1384
|
+
jnp.argmin, x, dim,
|
|
1385
|
+
keepdim).astype(jnp.int64)
|
|
1343
1386
|
else:
|
|
1344
1387
|
return _with_reduction_scalar(jnp.min, x, dim, keepdim)
|
|
1345
1388
|
|
|
1346
1389
|
|
|
1347
1390
|
@op(torch.ops.aten.mode)
|
|
1348
1391
|
def _aten_mode(input, dim=-1, keepdim=False, *, out=None):
|
|
1349
|
-
if input.ndim == 0:
|
|
1392
|
+
if input.ndim == 0: # single number
|
|
1350
1393
|
return input, jnp.array(0)
|
|
1351
|
-
dim = (input.ndim +
|
|
1394
|
+
dim = (input.ndim +
|
|
1395
|
+
dim) % input.ndim # jnp.scipy.stats.mode does not accept -1 as dim
|
|
1352
1396
|
# keepdims must be True for accurate broadcasting
|
|
1353
1397
|
mode, _ = jax.scipy.stats.mode(input, axis=dim, keepdims=True)
|
|
1354
1398
|
mode_broadcast = jnp.broadcast_to(mode, input.shape)
|
|
1355
1399
|
if not keepdim:
|
|
1356
1400
|
mode = mode.squeeze(axis=dim)
|
|
1357
|
-
indices = jnp.argmax(
|
|
1401
|
+
indices = jnp.argmax(
|
|
1402
|
+
jnp.equal(mode_broadcast, input), axis=dim, keepdims=keepdim)
|
|
1358
1403
|
return mode, indices
|
|
1359
1404
|
|
|
1360
1405
|
|
|
@@ -1388,8 +1433,7 @@ def _aten_var(x, dim=None, *, correction=1, keepdim=False, out=None):
|
|
|
1388
1433
|
@op(torch.ops.prims.broadcast_in_dim)
|
|
1389
1434
|
def _prims_broadcast_in_dim(t, shape, broadcast_dimensions):
|
|
1390
1435
|
return jax.lax.broadcast_in_dim(
|
|
1391
|
-
|
|
1392
|
-
)
|
|
1436
|
+
t, shape, broadcast_dimensions=broadcast_dimensions)
|
|
1393
1437
|
|
|
1394
1438
|
|
|
1395
1439
|
# aten.native_group_norm -- should use decomp table
|
|
@@ -1432,17 +1476,15 @@ def _aten_native_group_norm(input, weight, bias, N, C, HxW, group, eps=1e-5):
|
|
|
1432
1476
|
normalized = (x - mean) * rstd
|
|
1433
1477
|
return normalized, mean, rstd
|
|
1434
1478
|
|
|
1435
|
-
normalized, group_mean, group_rstd = jax.lax.map(
|
|
1436
|
-
|
|
1437
|
-
)
|
|
1479
|
+
normalized, group_mean, group_rstd = jax.lax.map(group_norm_body,
|
|
1480
|
+
reshaped_input)
|
|
1438
1481
|
|
|
1439
1482
|
# Reshape back to original input shape
|
|
1440
1483
|
output = jnp.reshape(normalized, input_shape)
|
|
1441
1484
|
|
|
1442
1485
|
# **Affine transformation**
|
|
1443
|
-
affine_shape = [
|
|
1444
|
-
|
|
1445
|
-
] # Shape for broadcasting
|
|
1486
|
+
affine_shape = [-1 if i == 1 else 1 for i in range(input.ndim)
|
|
1487
|
+
] # Shape for broadcasting
|
|
1446
1488
|
if weight is not None and bias is not None:
|
|
1447
1489
|
output = bias.reshape(affine_shape) + output * weight.reshape(affine_shape)
|
|
1448
1490
|
elif weight is not None:
|
|
@@ -1474,22 +1516,25 @@ def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None):
|
|
|
1474
1516
|
The tensor containing the calculated vector norms.
|
|
1475
1517
|
"""
|
|
1476
1518
|
|
|
1477
|
-
if ord not in {2, float("inf"), float("-inf"), "fro"
|
|
1519
|
+
if ord not in {2, float("inf"), float("-inf"), "fro"
|
|
1520
|
+
} and not isinstance(ord, (int, float)):
|
|
1478
1521
|
raise ValueError(
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1522
|
+
f"Unsupported ord value: {ord}. Supported values are 2, inf, -inf, and"
|
|
1523
|
+
" 'fro'.")
|
|
1524
|
+
|
|
1483
1525
|
# Special cases (for efficiency and clarity)
|
|
1484
1526
|
if ord == 0:
|
|
1485
1527
|
if self.shape == ():
|
|
1486
1528
|
# float sets it to float64. set it back to input type
|
|
1487
1529
|
result = jnp.astype(jnp.array(float(self != 0)), self.dtype)
|
|
1488
1530
|
else:
|
|
1489
|
-
result = _with_reduction_scalar(jnp.sum, jnp.where(self != 0, 1, 0), dim,
|
|
1531
|
+
result = _with_reduction_scalar(jnp.sum, jnp.where(self != 0, 1, 0), dim,
|
|
1532
|
+
keepdim)
|
|
1490
1533
|
|
|
1491
1534
|
elif ord == 2: # Euclidean norm
|
|
1492
|
-
result = jnp.sqrt(
|
|
1535
|
+
result = jnp.sqrt(
|
|
1536
|
+
_with_reduction_scalar(jnp.sum,
|
|
1537
|
+
jnp.abs(self)**2, dim, keepdim))
|
|
1493
1538
|
|
|
1494
1539
|
elif ord == float("inf"):
|
|
1495
1540
|
result = _with_reduction_scalar(jnp.max, jnp.abs(self), dim, keepdim)
|
|
@@ -1498,12 +1543,14 @@ def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None):
|
|
|
1498
1543
|
result = _with_reduction_scalar(jnp.min, jnp.abs(self), dim, keepdim)
|
|
1499
1544
|
|
|
1500
1545
|
elif ord == "fro": # Frobenius norm
|
|
1501
|
-
result = jnp.sqrt(
|
|
1546
|
+
result = jnp.sqrt(
|
|
1547
|
+
_with_reduction_scalar(jnp.sum,
|
|
1548
|
+
jnp.abs(self)**2, dim, keepdim))
|
|
1502
1549
|
|
|
1503
1550
|
else: # General case (e.g., ord = 1, ord = 3)
|
|
1504
|
-
result = _with_reduction_scalar(jnp.sum,
|
|
1505
|
-
|
|
1506
|
-
|
|
1551
|
+
result = _with_reduction_scalar(jnp.sum,
|
|
1552
|
+
jnp.abs(self)**ord, dim,
|
|
1553
|
+
keepdim)**(1.0 / ord)
|
|
1507
1554
|
|
|
1508
1555
|
# (Optional) dtype conversion
|
|
1509
1556
|
if dtype is not None:
|
|
@@ -1539,9 +1586,12 @@ def _aten_sinh(self):
|
|
|
1539
1586
|
|
|
1540
1587
|
# aten.native_layer_norm_backward
|
|
1541
1588
|
@op(torch.ops.aten.native_layer_norm_backward)
|
|
1542
|
-
def _aten_native_layer_norm_backward(
|
|
1543
|
-
|
|
1544
|
-
|
|
1589
|
+
def _aten_native_layer_norm_backward(grad_out,
|
|
1590
|
+
input,
|
|
1591
|
+
normalized_shape,
|
|
1592
|
+
weight,
|
|
1593
|
+
bias,
|
|
1594
|
+
eps=1e-5):
|
|
1545
1595
|
"""Implements the backward pass of layer normalization in Jax as defined by `aten::native_layer_norm_backward`.
|
|
1546
1596
|
|
|
1547
1597
|
Args:
|
|
@@ -1555,9 +1605,8 @@ def _aten_native_layer_norm_backward(
|
|
|
1555
1605
|
Returns:
|
|
1556
1606
|
A tuple of (grad_input, grad_weight, grad_bias).
|
|
1557
1607
|
"""
|
|
1558
|
-
return jax.lax.native_layer_norm_backward(
|
|
1559
|
-
|
|
1560
|
-
)
|
|
1608
|
+
return jax.lax.native_layer_norm_backward(grad_out, input, normalized_shape,
|
|
1609
|
+
weight, bias, eps)
|
|
1561
1610
|
|
|
1562
1611
|
|
|
1563
1612
|
# aten.reflection_pad3d_backward
|
|
@@ -1585,12 +1634,14 @@ def _aten_bitwise_not(self):
|
|
|
1585
1634
|
|
|
1586
1635
|
|
|
1587
1636
|
# aten.bitwise_left_shift
|
|
1637
|
+
@op(torch.ops.aten.__lshift__)
|
|
1588
1638
|
@op(torch.ops.aten.bitwise_left_shift)
|
|
1589
1639
|
def _aten_bitwise_left_shift(input, other):
|
|
1590
1640
|
return jnp.left_shift(input, other)
|
|
1591
1641
|
|
|
1592
1642
|
|
|
1593
1643
|
# aten.bitwise_right_shift
|
|
1644
|
+
@op(torch.ops.aten.__rshift__)
|
|
1594
1645
|
@op(torch.ops.aten.bitwise_right_shift)
|
|
1595
1646
|
def _aten_bitwise_right_shift(input, other):
|
|
1596
1647
|
return jnp.right_shift(input, other)
|
|
@@ -1671,10 +1722,8 @@ def _scatter_index(dim, index):
|
|
|
1671
1722
|
target_shape = [1] * len(index_shape)
|
|
1672
1723
|
target_shape[i] = index_shape[i]
|
|
1673
1724
|
input_indexes.append(
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
)
|
|
1677
|
-
)
|
|
1725
|
+
jnp.broadcast_to(
|
|
1726
|
+
jnp.arange(index_shape[i]).reshape(target_shape), index_shape))
|
|
1678
1727
|
return tuple(input_indexes), tuple(source_indexes)
|
|
1679
1728
|
|
|
1680
1729
|
|
|
@@ -1686,6 +1735,7 @@ def _aten_scatter_add(input, dim, index, src):
|
|
|
1686
1735
|
input_indexes, source_indexes = _scatter_index(dim, index)
|
|
1687
1736
|
return input.at[input_indexes].add(src[source_indexes])
|
|
1688
1737
|
|
|
1738
|
+
|
|
1689
1739
|
# aten.masked_scatter
|
|
1690
1740
|
@op(torch.ops.aten.masked_scatter)
|
|
1691
1741
|
def _aten_masked_scatter(self, mask, source):
|
|
@@ -1707,6 +1757,7 @@ def _aten_masked_scatter(self, mask, source):
|
|
|
1707
1757
|
|
|
1708
1758
|
return final_arr
|
|
1709
1759
|
|
|
1760
|
+
|
|
1710
1761
|
@op(torch.ops.aten.masked_select)
|
|
1711
1762
|
def _aten_masked_select(self, mask, *args, **kwargs):
|
|
1712
1763
|
broadcast_shape = jnp.broadcast_shapes(self.shape, mask.shape)
|
|
@@ -1722,6 +1773,7 @@ def _aten_masked_select(self, mask, *args, **kwargs):
|
|
|
1722
1773
|
|
|
1723
1774
|
return self_flat[true_indices]
|
|
1724
1775
|
|
|
1776
|
+
|
|
1725
1777
|
# aten.logical_not
|
|
1726
1778
|
|
|
1727
1779
|
|
|
@@ -1730,11 +1782,13 @@ def _aten_masked_select(self, mask, *args, **kwargs):
|
|
|
1730
1782
|
def _aten_sign(x):
|
|
1731
1783
|
return jnp.sign(x)
|
|
1732
1784
|
|
|
1785
|
+
|
|
1733
1786
|
# aten.signbit
|
|
1734
1787
|
@op(torch.ops.aten.signbit)
|
|
1735
1788
|
def _aten_signbit(x):
|
|
1736
1789
|
return jnp.signbit(x)
|
|
1737
1790
|
|
|
1791
|
+
|
|
1738
1792
|
# aten.sigmoid
|
|
1739
1793
|
@op(torch.ops.aten.sigmoid)
|
|
1740
1794
|
@op_base.promote_int_input
|
|
@@ -1760,7 +1814,13 @@ def _aten_atan(self):
|
|
|
1760
1814
|
|
|
1761
1815
|
@op(torch.ops.aten.scatter_reduce)
|
|
1762
1816
|
@op(torch.ops.aten.scatter)
|
|
1763
|
-
def _aten_scatter_reduce(input,
|
|
1817
|
+
def _aten_scatter_reduce(input,
|
|
1818
|
+
dim,
|
|
1819
|
+
index,
|
|
1820
|
+
src,
|
|
1821
|
+
reduce=None,
|
|
1822
|
+
*,
|
|
1823
|
+
include_self=True):
|
|
1764
1824
|
if not isinstance(src, jnp.ndarray):
|
|
1765
1825
|
src = jnp.array(src, dtype=input.dtype)
|
|
1766
1826
|
input_indexes, source_indexes = _scatter_index(dim, index)
|
|
@@ -1817,41 +1877,6 @@ def _aten_gt(self, other):
   return self > other
 
 
-# aten.pixel_shuffle
-@op(torch.ops.aten.pixel_shuffle)
-def _aten_pixel_shuffle(x, upscale_factor):
-  """PixelShuffle implementation in JAX.
-
-  Args:
-    x: Input tensor. Typically a feature map.
-    upscale_factor: Integer by which to upscale the spatial dimensions.
-
-  Returns:
-    Tensor after PixelShuffle operation.
-  """
-
-  batch_size, channels, height, width = x.shape
-
-  if channels % (upscale_factor**2) != 0:
-    raise ValueError(
-        "Number of channels must be divisible by the square of the upscale factor."
-    )
-
-  new_channels = channels // (upscale_factor**2)
-  new_height = height * upscale_factor
-  new_width = width * upscale_factor
-
-  x = x.reshape(
-      batch_size, new_channels, upscale_factor, upscale_factor, height, width
-  )
-  x = jnp.transpose(
-      x, (0, 1, 2, 4, 3, 5)
-  )  # Move channels to spatial dimensions
-  x = x.reshape(batch_size, new_channels, new_height, new_width)
-
-  return x
-
-
 # aten.sym_stride
 # aten.lt
 @op(torch.ops.aten.lt)
@@ -1883,8 +1908,7 @@ def pool(inputs, init, reduce_fn, window_shape, strides, padding):
   num_batch_dims = inputs.ndim - (len(window_shape) + 1)
   strides = strides or (1,) * len(window_shape)
   assert len(window_shape) == len(
- (1 line not shown)
-  ), f"len({window_shape}) must equal len({strides})"
+      strides), f"len({window_shape}) must equal len({strides})"
   strides = (1,) * (1 + num_batch_dims) + strides
   dims = (1,) * (1 + num_batch_dims) + window_shape
 
@@ -1901,23 +1925,22 @@ def pool(inputs, init, reduce_fn, window_shape, strides, padding):
   if not isinstance(padding, str):
     padding = tuple(map(tuple, padding))
     assert len(padding) == len(window_shape), (
- (2 lines not shown)
-    )
- (1 line not shown)
-        [len(x) == 2 for x in padding]
-    ), f"each entry in padding {padding} must be length 2"
+        f"padding {padding} must specify pads for same number of dims as "
+        f"window_shape {window_shape}")
+    assert all([len(x) == 2 for x in padding
+               ]), f"each entry in padding {padding} must be length 2"
     padding = ((0, 0), (0, 0)) + padding
   y = jax.lax.reduce_window(inputs, init, reduce_fn, dims, strides, padding)
   if is_single_input:
     y = jnp.squeeze(y, axis=0)
   return y
 
- (1 line not shown)
+
 @op(torch.ops.aten._adaptive_avg_pool2d)
 @op(torch.ops.aten._adaptive_avg_pool3d)
-def adaptive_avg_pool2or3d(input: jnp.ndarray,
- (1 line not shown)
+def adaptive_avg_pool2or3d(input: jnp.ndarray,
+                           output_size: Tuple[int, int]) -> jnp.ndarray:
+  """
   Applies a 2/3D adaptive average pooling over an input signal composed of several input planes.
 
   See :class:`~torch.nn.AdaptiveAvgPool2d` for details and output shape.
@@ -1929,124 +1952,128 @@ def adaptive_avg_pool2or3d(input: jnp.ndarray, output_size: Tuple[int, int]) ->
   Context:
   https://github.com/pytorch/pytorch/blob/main/torch/_decomp/decompositions.py#L2401
   """
- (69 lines not shown)
+  shape = input.shape
+  ndim = len(shape)
+  out_dim = len(output_size)
+  num_spatial_dim = ndim - out_dim
+
+  # Preconditions
+
+  assert ndim in (
+      out_dim + 1, out_dim + 2
+  ), f"adaptive_avg_pool{num_spatial_dim}d(): Expected {num_spatial_dim+1}D or {num_spatial_dim+2}D tensor, but got {ndim}"
+  for d in input.shape[-2:]:
+    assert d != 0, "adaptive_avg_pool{num_spactial_dim}d(): Expected input to have non-zero size for " \
+                   f"non-batch dimensions, but input has shape {tuple(shape)}."
+
+  # Optimisation (we should also do this in the kernel implementation)
+  if all(s % o == 0 for o, s in zip(output_size, shape[-out_dim:])):
+    stride = tuple(i // o for i, o in zip(shape[-out_dim:], output_size))
+    kernel = tuple(i - (o - 1) * s
+                   for i, o, s in zip(shape[-out_dim:], output_size, stride))
+    return _aten_avg_pool(
+        input,
+        kernel,
+        strides=stride,
+    )
+
+  def start_index(a, b, c):
+    return (a * c) // b
+
+  def end_index(a, b, c):
+    return ((a + 1) * c + b - 1) // b
+
+  def compute_idx(in_size, out_size):
+    orange = jnp.arange(out_size, dtype=jnp.int64)
+    i0 = start_index(orange, out_size, in_size)
+    # Let length = end_index - start_index, i.e. the length of the pooling kernels
+    # length.max() can be computed analytically as follows:
+    maxlength = in_size // out_size + 1
+    in_size_mod = in_size % out_size
+    # adaptive = True iff there are kernels with different lengths
+    adaptive = not (in_size_mod == 0 or out_size % in_size_mod == 0)
+    if adaptive:
+      maxlength += 1
+    elif in_size_mod == 0:
+      maxlength -= 1
+
+    range_max = jnp.arange(maxlength, dtype=jnp.int64)
+    idx = i0[:, None] + range_max
+    if adaptive:
+      # Need to clamp to avoid accessing out-of-bounds memory
+      idx = jnp.minimum(idx, in_size - 1)
+
+      # Compute the length
+      i1 = end_index(orange, out_size, in_size)
+      length = i1 - i0
+    else:
+      length = maxlength
+    return idx, length, range_max, adaptive
+
+  idx, length, range_max, adaptive = [[None] * out_dim for _ in range(4)]
+  # length is not None if it's constant, otherwise we'll need to compute it
+  for i, (s, o) in enumerate(zip(shape[-out_dim:], output_size)):
+    idx[i], length[i], range_max[i], adaptive[i] = compute_idx(s, o)
+
+  def _unsqueeze_to_dim(x, dim):
+    ndim = len(x.shape)
+    return jax.lax.expand_dims(x, tuple(range(ndim, dim)))
+
+  if out_dim == 2:
+    # NOTE: unsqueeze to insert extra 1 in ranks; so they
+    # would broadcast
+    vals = input[..., _unsqueeze_to_dim(idx[0], 4), idx[1]]
+    reduce_axis = (-3, -1)
+  else:
+    assert out_dim == 3
+    vals = input[...,
+                 _unsqueeze_to_dim(idx[0], 6),
+                 _unsqueeze_to_dim(idx[1], 4), idx[2]]
+    reduce_axis = (-5, -3, -1)
+
+  # Shortcut for the simpler case
+  if not any(adaptive):
+    return jnp.mean(vals, axis=reduce_axis)
+
+  def maybe_mask(vals, length, range_max, adaptive, dim):
+    if isinstance(length, int):
+      return vals, length
     else:
- (23 lines not shown)
-      # Compute the length of each window
-      length = _unsqueeze_to_dim(length, -dim)
-      return vals, length
- (1 line not shown)
-  for i in range(len(length)):
-    vals, length[i] = maybe_mask(vals, length[i], range_max[i], adaptive=adaptive[i], dim=(i - out_dim))
- (1 line not shown)
-  # We unroll the sum as we assume that the kernels are going to be small
-  ret = jnp.sum(vals, axis=reduce_axis)
-  # NOTE: math.prod because we want to expand it to length[0] * length[1] * ...
-  # this is multiplication with broadcasting, not regular pointwise product
-  return ret / math.prod(length)
- (1 line not shown)
+      # zero-out the things we didn't really want to select
+      assert dim < 0
+      # hack
+      mask = range_max >= length[:, None]
+      if dim == -2:
+        mask = _unsqueeze_to_dim(mask, 4)
+      elif dim == -3:
+        mask = _unsqueeze_to_dim(mask, 6)
+      vals = jnp.where(mask, 0.0, vals)
+      # Compute the length of each window
+      length = _unsqueeze_to_dim(length, -dim)
+      return vals, length
+
+  for i in range(len(length)):
+    vals, length[i] = maybe_mask(
+        vals, length[i], range_max[i], adaptive=adaptive[i], dim=(i - out_dim))
+
+  # We unroll the sum as we assume that the kernels are going to be small
+  ret = jnp.sum(vals, axis=reduce_axis)
+  # NOTE: math.prod because we want to expand it to length[0] * length[1] * ...
+  # this is multiplication with broadcasting, not regular pointwise product
+  return ret / math.prod(length)
+
 
 @op(torch.ops.aten.avg_pool1d)
 @op(torch.ops.aten.avg_pool2d)
 @op(torch.ops.aten.avg_pool3d)
 def _aten_avg_pool(
- (7 lines not shown)
+    inputs,
+    kernel_size,
+    strides=None,
+    padding=0,
+    ceil_mode=False,
+    count_include_pad=True,
+    divisor_override=None,
 ):
   num_batch_dims = len(inputs.shape) - len(kernel_size) - 1
   kernel_size = tuple(kernel_size)
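The start_index/end_index helpers above assign output cell a the window [a*c // b, ceil((a+1)*c / b)) of the input axis. A small self-contained check of those formulas with assumed example sizes:

# Window boundaries for adaptively pooling an input of length 10 into 4 outputs.
def start_index(a, b, c):
  return (a * c) // b

def end_index(a, b, c):
  return ((a + 1) * c + b - 1) // b

in_size, out_size = 10, 4
windows = [(start_index(i, out_size, in_size), end_index(i, out_size, in_size))
           for i in range(out_size)]
print(windows)  # [(0, 3), (2, 5), (5, 8), (7, 10)]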
@@ -2060,7 +2087,7 @@ def _aten_avg_pool(
   if num_batch_dims == 0:
     input_shape = [1, *input_shape]
   padding = _ceil_mode_padding(padding, input_shape, kernel_size, strides,
-                               ceil_mode)
+                               [1] * len(kernel_size), ceil_mode)
 
   y = pool(inputs, 0.0, jax.lax.add, kernel_size, strides, padding)
   if divisor_override is not None:
@@ -2102,9 +2129,11 @@ def _aten_avg_pool(
   )
   return y.astype(inputs.dtype)
 
+
 # helper function to generate all indices to iterate through ndarray
-def _generate_indices(dims, skip_dim_indices
+def _generate_indices(dims, skip_dim_indices=[]):
   res = []
+
   def _helper(curr_dim_idx, sofar):
     if curr_dim_idx in skip_dim_indices:
       _helper(curr_dim_idx + 1, sofar[:])
@@ -2115,10 +2144,11 @@ def _generate_indices(dims, skip_dim_indices = []):
     for i in range(dims[curr_dim_idx]):
       sofar[curr_dim_idx] = i
       _helper(curr_dim_idx + 1, sofar[:])
- (1 line not shown)
+
   _helper(0, [0 for _ in dims])
   return res
 
+
 # aten.sym_numel
 # aten.reciprocal
 @op(torch.ops.aten.reciprocal)
@@ -2174,10 +2204,14 @@ def _aten_round(input, decimals=0):
 @op(torch.ops.aten.max)
 def _aten_max(self, dim=None, keepdim=False):
   if dim is not None:
-    return _with_reduction_scalar(jnp.max, self, dim,
+    return _with_reduction_scalar(jnp.max, self, dim,
+                                  keepdim), _with_reduction_scalar(
+                                      jnp.argmax, self, dim,
+                                      keepdim).astype(jnp.int64)
   else:
     return _with_reduction_scalar(jnp.max, self, dim, keepdim)
 
+
 # aten.maximum
 @op(torch.ops.aten.maximum)
 def _aten_maximum(self, other):
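For the dim form, torch.max returns a (values, indices) pair, which the handler above builds from two separate reductions (with the index cast to int64). A quick illustration in plain jax.numpy:

import jax.numpy as jnp

x = jnp.array([[3.0, 7.0, 5.0],
               [9.0, 1.0, 2.0]])
values = jnp.max(x, axis=1)
indices = jnp.argmax(x, axis=1)  # the wrapper additionally casts these to int64
print(values)   # [7. 9.]
print(indices)  # [1 0]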
@@ -2216,27 +2250,28 @@ def _with_reduction_scalar(jax_func, self, dim, keepdim):
 def _aten_any(self, dim=None, keepdim=False):
   return _with_reduction_scalar(jnp.any, self, dim, keepdim)
 
+
 # aten.arange
 @op(torch.ops.aten.arange.start_step)
 @op(torch.ops.aten.arange.start)
 @op(torch.ops.aten.arange.default)
 @op_base.convert_dtype(use_default_dtype=False)
 def _aten_arange(
- (9 lines not shown)
+    start,
+    end=None,
+    step=None,
+    *,
+    dtype=None,
+    layout=None,
+    requires_grad=False,
+    device=None,
+    pin_memory=False,
 ):
   return jnp.arange(
- (4 lines not shown)
+      op_base.maybe_convert_constant_dtype(start, dtype),
+      op_base.maybe_convert_constant_dtype(end, dtype),
+      op_base.maybe_convert_constant_dtype(step, dtype),
+      dtype=dtype,
   )
 
 
@@ -2245,6 +2280,7 @@ def _aten_arange(
 def _aten_argmax(self, dim=None, keepdim=False):
   return _with_reduction_scalar(jnp.argmax, self, dim, keepdim)
 
+
 def _strided_index(sizes, strides, storage_offset=None):
   ind = jnp.zeros(sizes, dtype=jnp.int32)
 
@@ -2257,6 +2293,7 @@ def _strided_index(sizes, strides, storage_offset=None):
     ind += storage_offset
   return ind
 
+
 # aten.as_strided
 @op(torch.ops.aten.as_strided)
 @op(torch.ops.aten.as_strided_copy)
@@ -2311,7 +2348,7 @@ def _aten_broadcast_tensors(*tensors):
     Args:
       shapes: A list of tuples representing the shapes of the input tensors.
 
-    Returns:
+    Returns:
       A tuple representing the broadcasted output shape.
     """
 
@@ -2344,11 +2381,13 @@ def _aten_broadcast_tensors(*tensors):
     A tuple specifying which dimensions of the input tensor should be broadcasted.
     """
 
-    res = tuple(
+    res = tuple(
+        i for i, (in_dim, out_dim) in enumerate(zip(input_shape, output_shape)))
     return res
 
   # clean some function's previous wrap
-  if len(tensors)==1 and len(tensors[0])>=1 and isinstance(
+  if len(tensors) == 1 and len(tensors[0]) >= 1 and isinstance(
+      tensors[0][0], jax.Array):
     tensors = tensors[0]
 
   # Get the shapes of all input tensors
@@ -2357,7 +2396,8 @@ def _aten_broadcast_tensors(*tensors):
   output_shape = _get_broadcast_shape(shapes)
   # Broadcast each tensor to the output shape
   broadcasted_tensors = [
-      jax.lax.broadcast_in_dim(t, output_shape,
+      jax.lax.broadcast_in_dim(t, output_shape,
+                               _broadcast_dimensions(t.shape, output_shape))
      for t in tensors
   ]
 
@@ -2376,6 +2416,7 @@ def _aten_broadcast_to(input, shape):
 def _aten_clamp(self, min=None, max=None):
   return jnp.clip(self, min, max)
 
+
 @op(torch.ops.aten.clamp_min)
 def _aten_clamp_min(input, min):
   return jnp.clip(input, min=min)
@@ -2394,7 +2435,7 @@ def _aten_constant_pad_nd(input, padding, value=0):
   rev_padding = [(padding[i - 1], padding[i], 0) for i in range(m - 1, 0, -2)]
   pad_dim = tuple(([(0, 0, 0)] * (len(input.shape) - m // 2)) + rev_padding)
   value_casted = jax.numpy.array(value, dtype=input.dtype)
-  return jax.lax.pad(input, padding_value=value_casted, padding_config
+  return jax.lax.pad(input, padding_value=value_casted, padding_config=pad_dim)
 
 
 # aten.convolution_backward
@@ -2421,9 +2462,8 @@ def _aten_cdist_forward(x1, x2, p, compute_mode=""):
 @op(torch.ops.aten._pdist_forward)
 def _aten__pdist_forward(x, p=2):
   pairwise_dists = _aten_cdist_forward(x, x, p)
-  condensed_dists = pairwise_dists[
- (1 line not shown)
-  ]
+  condensed_dists = pairwise_dists[jnp.triu_indices(
+      pairwise_dists.shape[0], k=1)]
   return condensed_dists
 
 
@@ -2449,25 +2489,33 @@ def _aten_cosh(input):
   return jnp.cosh(input)
 
 
+@op(torch.ops.aten.diag)
+def _aten_diag(input, diagonal=0):
+  return jnp.diag(input, diagonal)
+
+
 # aten.diagonal
 @op(torch.ops.aten.diagonal)
+@op(torch.ops.aten.diagonal_copy)
 def _aten_diagonal(input, offset=0, dim1=0, dim2=1):
   return jnp.diagonal(input, offset, dim1, dim2)
 
 
 def diag_indices_with_offset(input_shape, offset, dim1=0, dim2=1):
- (12 lines not shown)
+  input_len = len(input_shape)
+  if dim1 == dim2 or not (0 <= dim1 < input_len and 0 <= dim2 < input_len):
+    raise ValueError("dim1 and dim2 must be different and in range [0, " +
+                     str(input_len - 1) + "]")
+
+  size1, size2 = input_shape[dim1], input_shape[dim2]
+  if offset >= 0:
+    indices1 = jnp.arange(min(size1, size2 - offset))
+    indices2 = jnp.arange(offset, offset + len(indices1))
+  else:
+    indices2 = jnp.arange(min(size1 + offset, size2))
+    indices1 = jnp.arange(-offset, -offset + len(indices2))
+  return [indices1, indices2]
+
 
 @op(torch.ops.aten.diagonal_scatter)
 def _aten_diagonal_scatter(input, src, offset=0, dim1=0, dim2=1):
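diag_indices_with_offset mirrors the index arithmetic behind jnp.diagonal with a nonzero offset. A small sanity sketch of that arithmetic (hypothetical 3x4 input):

import jax.numpy as jnp

x = jnp.arange(12).reshape(3, 4)
# Row/column indices of the offset=1 diagonal, matching jnp.diagonal(x, 1).
rows = jnp.arange(min(3, 4 - 1))     # [0 1 2]
cols = jnp.arange(1, 1 + len(rows))  # [1 2 3]
print(x[rows, cols])                 # [ 1  6 11]
print(jnp.diagonal(x, offset=1))     # [ 1  6 11]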
@@ -2476,17 +2524,17 @@ def _aten_diagonal_scatter(input, src, offset=0, dim1=0, dim2=1):
   if input.ndim == 2:
     return input.at[tuple(indexes)].set(src)
   else:
-    # src has the same shape as the output of
+    # src has the same shape as the output of
     # jnp.diagonal(input, offset, dim1, dim2).
     # Last dimension always contains the diagonal elements,
     # while the preceding dimensions represent the "slices"
     # from which these diagonals are extracted. Thus,
     # we alter input axes to match this assumption, write src
     # and then move the axes back to the original state.
-    input = jnp.moveaxis(input, (dim1, dim2), (-2
-    multi_indexes = [slice(None)]*(input.ndim-2) + indexes
+    input = jnp.moveaxis(input, (dim1, dim2), (-2, -1))
+    multi_indexes = [slice(None)] * (input.ndim - 2) + indexes
     input = input.at[tuple(multi_indexes)].set(src)
-    return jnp.moveaxis(input, (-2
+    return jnp.moveaxis(input, (-2, -1), (dim1, dim2))
 
 
 # aten.diagflat
@@ -2507,9 +2555,9 @@ def _aten_eq(input1, input2):
 
 
 # aten.equal
-@op(torch.ops.aten.equal
+@op(torch.ops.aten.equal)
 def _aten_equal(input, other):
-  res = jnp.array_equal(input
+  res = jnp.array_equal(input, other)
   return bool(res)
 
 
@@ -2559,7 +2607,12 @@ def _aten_exp2(input):
 # aten.fill
 @op(torch.ops.aten.fill)
 @op(torch.ops.aten.full_like)
-def _aten_fill(x,
+def _aten_fill(x,
+               value,
+               dtype=None,
+               pin_memory=None,
+               memory_format=None,
+               device=None):
   if dtype is None:
     dtype = x.dtype
   else:
@@ -2634,7 +2687,8 @@ def _aten_glu(x, dim=-1):
 # aten.hardtanh
 @op(torch.ops.aten.hardtanh)
 def _aten_hardtanh(input, min_val=-1, max_val=1, inplace=False):
-  if input.dtype == np.int64 and isinstance(max_val, float) and isinstance(
+  if input.dtype == np.int64 and isinstance(max_val, float) and isinstance(
+      min_val, float):
     min_val = int(min_val)
     max_val = int(max_val)
   return jnp.clip(input, min_val, max_val)
@@ -2644,7 +2698,7 @@ def _aten_hardtanh(input, min_val=-1, max_val=1, inplace=False):
 @op(torch.ops.aten.histc)
 def _aten_histc(input, bins=100, min=0, max=0):
   # TODO(@manfei): this function might cause some uncertainty
-  if min==0 and max==0:
+  if min == 0 and max == 0:
     if isinstance(input, jnp.ndarray) and input.size == 0:
       min = 0
       max = 0
@@ -2652,7 +2706,8 @@ def _aten_histc(input, bins=100, min=0, max=0):
       min = jnp.min(input)
       max = jnp.max(input)
   range_value = (min, max)
-  hist, bin_edges = jnp.histogram(
+  hist, bin_edges = jnp.histogram(
+      input, bins=bins, range=range_value, weights=None, density=None)
   return hist
 
 
@@ -2667,22 +2722,28 @@ def _aten_digamma(input, *, out=None):
   # replace indices where input == 0 with -inf in res
   return jnp.where(jnp.equal(input, jnp.zeros(input.shape)), -jnp.inf, res)
 
+
 @op(torch.ops.aten.igamma)
 def _aten_igamma(input, other):
   return jax.scipy.special.gammainc(input, other)
 
+
 @op(torch.ops.aten.lgamma)
 def _aten_lgamma(input, *, out=None):
   return jax.scipy.special.gammaln(input).astype(jnp.float32)
 
+
 @op(torch.ops.aten.mvlgamma)
 def _aten_mvlgamma(input, p, *, out=None):
- (1 line not shown)
+  input = input.astype(mappings.t2j_dtype(torch.get_default_dtype()))
+  return jax.scipy.special.multigammaln(input, p)
+
 
 @op(torch.ops.aten.linalg_eig)
 def _aten_linalg_eig(A):
   return jnp.linalg.eig(A)
 
+
 @op(torch.ops.aten._linalg_eigh)
 def _aten_linalg_eigh(A, UPLO='L'):
   return jnp.linalg.eigh(A, UPLO)
@@ -2704,7 +2765,9 @@ def _aten_linalg_lstsq(A, B, rcond=None, driver='gelsy'):
     A_reshaped = A.reshape((batch_size,) + A.shape[-2:])
     B_reshaped = B.reshape((batch_size,) + B.shape[-2:])
 
-    X, residuals, rank, singular_values = jax.vmap(
+    X, residuals, rank, singular_values = jax.vmap(
+        jnp.linalg.lstsq, in_axes=(0,
+                                   0))(A_reshaped, B_reshaped, rcond=rcond)
 
     X = X.reshape(batch_shape + X.shape[-2:])
 
@@ -2720,7 +2783,8 @@ def _aten_linalg_lstsq(A, B, rcond=None, driver='gelsy'):
       residuals = residuals.reshape(batch_shape + residuals.shape[-1:])
 
     if driver in ['gelsd', 'gelss']:
-      singular_values = singular_values.reshape(batch_shape +
+      singular_values = singular_values.reshape(batch_shape +
+                                                singular_values.shape[-1:])
     else:
       singular_values = jnp.array([], dtype=input_dtype)
 
@@ -2729,17 +2793,17 @@ def _aten_linalg_lstsq(A, B, rcond=None, driver='gelsy'):
     X, residuals, rank, singular_values = jnp.linalg.lstsq(A, B, rcond=rcond)
 
     if driver not in ['gelsd', 'gelsy', 'gelss']:
- (1 line not shown)
+      rank = jnp.array([], dtype=jnp.int64)
 
   rank_value = None
   if rank.size > 0:
- (2 lines not shown)
+    rank_value = int(rank.item())
+    rank = jnp.array(rank_value, dtype=jnp.int64)
 
   # When driver is ‘gels’, assume that A is full-rank.
-  full_rank =
+  full_rank = driver == 'gels' or rank_value == n
   if driver == 'gelsy' or m <= n or (not full_rank):
- (1 line not shown)
+    residuals = jnp.array([], dtype=input_dtype)
 
   if driver not in ['gelsd', 'gelss']:
     singular_values = jnp.array([], dtype=input_dtype)
@@ -2753,8 +2817,7 @@ def _aten_linalg_ldl_factor_ex(A, hermitian=False, check_errors=False):
   # https://github.com/jax-ml/jax/issues/12779
   # TODO: Not tested for complex inputs. Does not support hermitian=True
   pivots = jnp.broadcast_to(
-      jnp.arange(1, A.shape[-1]+1, dtype=jnp.int32), A.shape[:-1]
-  )
+      jnp.arange(1, A.shape[-1] + 1, dtype=jnp.int32), A.shape[:-1])
   info = jnp.zeros(A.shape[:-2], jnp.int32)
   C = jnp.linalg.cholesky(A)
   if C.size == 0:
@@ -2767,7 +2830,7 @@ def _aten_linalg_ldl_factor_ex(A, hermitian=False, check_errors=False):
 
   D = C * jnp.eye(C.shape[-1], dtype=A.dtype)
   LD = C @ jnp.linalg.inv(D)
-  LD = fill_diagonal_batch(LD, D*D)
+  LD = fill_diagonal_batch(LD, D * D)
   return LD, pivots, info
 
 
@@ -2787,9 +2850,9 @@ def _aten_linalg_lu(A, pivot=True, out=None):
   U = jnp.triu(lu[..., :k, :])
 
   def perm_to_P(perm):
- (3 lines not shown)
+    m = perm.shape[-1]
+    P = jnp.eye(m, dtype=dtype)[perm].T
+    return P
 
   if permutation.ndim > 1:
     num_batch_dims = permutation.ndim - 1
@@ -2798,7 +2861,7 @@ def _aten_linalg_lu(A, pivot=True, out=None):
 
   P = perm_to_P(permutation)
 
-  return P,L,U
+  return P, L, U
 
 
 @op(torch.ops.aten.linalg_lu_factor_ex)
@@ -2810,6 +2873,21 @@ def _aten_linalg_lu_factor_ex(A, pivot=True, check_errors=False):
   return lu, pivots, info
 
 
+@op(torch.ops.aten.linalg_lu_solve)
+def _aten_linalg_lu_solve(LU, pivots, B, left=True, adjoint=False):
+  # JAX pivots are offset by 1 compared to torch
+  pivots = pivots - 1
+  if not left:
+    # XA = B is same as A'X = B'
+    trans = 0 if adjoint else 2
+    x = jax.scipy.linalg.lu_solve((LU, pivots), jnp.matrix_transpose(B), trans)
+    x = jnp.matrix_transpose(x)
+  else:
+    trans = 2 if adjoint else 0
+    x = jax.scipy.linalg.lu_solve((LU, pivots), B, trans)
+  return x
+
+
 @op(torch.ops.aten.gcd)
 def _aten_gcd(input, other):
   return jnp.gcd(input, other)
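The new linalg_lu_solve shim bridges a convention gap: torch LU pivots are 1-based while jax.scipy.linalg.lu_solve expects 0-based pivots, hence the `pivots - 1` above. A standalone sketch of the underlying JAX call, using a factorization from jax.scipy.linalg.lu_factor (which already yields 0-based pivots):

import jax.numpy as jnp
from jax.scipy.linalg import lu_factor, lu_solve

A = jnp.array([[4.0, 3.0], [6.0, 3.0]])
b = jnp.array([10.0, 12.0])
lu, piv = lu_factor(A)          # piv is already 0-based here
x = lu_solve((lu, piv), b)      # solves A @ x = b
print(jnp.allclose(A @ x, b))   # True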
@@ -2874,12 +2952,14 @@ def _aten_log2(x):
 
 # aten.logical_and
 @op(torch.ops.aten.logical_and)
+@op(torch.ops.aten.__and__)
 def _aten_logical_and(self, other):
   return jnp.logical_and(self, other)
 
 
 # aten.logical_or
 @op(torch.ops.aten.logical_or)
+@op(torch.ops.aten.__or__)
 def _aten_logical_or(self, other):
   return jnp.logical_or(self, other)
 
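Routing aten.__and__ and aten.__or__ to the same handlers means these operator overloads fall through to jnp.logical_and / jnp.logical_or; a tiny illustration of the underlying JAX calls:

import jax.numpy as jnp

a = jnp.array([True, True, False])
b = jnp.array([True, False, False])
print(jnp.logical_and(a, b))  # [ True False False]
print(jnp.logical_or(a, b))   # [ True  True False]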
@@ -2894,7 +2974,7 @@ def _aten_logical_not(self):
 @op(torch.ops.aten._log_softmax)
 def _aten_log_softmax(self, axis=-1, half_to_float=False):
   if self.shape == ():
- (1 line not shown)
+    return jnp.astype(0.0, self.dtype)
   return jax.nn.log_softmax(self, axis)
 
 
@@ -2921,6 +3001,7 @@ def _aten_logcumsumexp(self, dim=None):
 # aten.max_pool3d_backward
 # aten.logical_xor
 @op(torch.ops.aten.logical_xor)
+@op(torch.ops.aten.__xor__)
 def _aten_logical_xor(self, other):
   return jnp.logical_xor(self, other)
 
@@ -2933,19 +3014,22 @@ def _aten_logical_xor(self, other):
 def _aten_neg(x):
   return -1 * x
 
+
 @op(torch.ops.aten.nextafter)
 def _aten_nextafter(input, other, *, out=None):
   return jnp.nextafter(input, other)
 
 
 @op(torch.ops.aten.nonzero_static)
-def _aten_nonzero_static(input, size, fill_value
+def _aten_nonzero_static(input, size, fill_value=-1):
   indices = jnp.argwhere(input)
 
   if size < indices.shape[0]:
     indices = indices[:size]
   elif size > indices.shape[0]:
-    padding = jnp.full((size - indices.shape[0], indices.shape[1]),
+    padding = jnp.full((size - indices.shape[0], indices.shape[1]),
+                       fill_value,
+                       dtype=indices.dtype)
     indices = jnp.concatenate((indices, padding))
 
   return indices
@@ -2954,9 +3038,11 @@ def _aten_nonzero_static(input, size, fill_value = -1):
 # aten.nonzero
 @op(torch.ops.aten.nonzero)
 def _aten_nonzero(x, as_tuple=False):
-  if jnp.ndim(x) == 0 and (as_tuple or x.item()==0):
+  if jnp.ndim(x) == 0 and (as_tuple or x.item() == 0):
     return torch.empty(0, 0, dtype=torch.int64)
-  if jnp.ndim(
+  if jnp.ndim(
+      x
+  ) == 0:  # when x is scalar, return torch.tensor([], size=(1, 0), dtype=torch.int64)
     res = torch.empty(1, 0, dtype=torch.int64)
     return jnp.array(res.numpy())
   index_tuple = jnp.nonzero(x)
@@ -2997,15 +3083,15 @@ def _aten_put(self, index, source, accumulate=False):
 # aten.randperm
 # randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None)
 @op(torch.ops.aten.randperm, needs_env=True)
-def _aten_randperm(
- (8 lines not shown)
+def _aten_randperm(n,
+                   *,
+                   generator=None,
+                   dtype=None,
+                   layout=None,
+                   device=None,
+                   pin_memory=None,
+                   env=None):
+  """
   Generates a random permutation of integers from 0 to n-1.
 
   Args:
@@ -3019,14 +3105,14 @@ def _aten_randperm(
   Returns:
     A DeviceArray containing a random permutation of integers from 0 to n-1.
   """
- (8 lines not shown)
+  if dtype:
+    dtype = mappings.t2j_dtype(dtype)
+  else:
+    dtype = jnp.int64.dtype
+  key = env.get_and_rotate_prng_key(generator)
+  indices = jnp.arange(n, dtype=dtype)
+  permutation = jax.random.permutation(key, indices)
+  return permutation
 
 
 # aten.reflection_pad3d
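The permutation itself comes from jax.random.permutation under an explicitly threaded PRNG key; a minimal sketch without the torchax env plumbing (key value is illustrative):

import jax
import jax.numpy as jnp

key = jax.random.PRNGKey(0)
perm = jax.random.permutation(key, jnp.arange(8))
print(perm)  # a permutation of 0..7, deterministic for this key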
@@ -3071,8 +3157,8 @@ def _aten_sort(a, dim=-1, descending=False, stable=False):
   if a.shape == ():
     return (a, jnp.astype(0, 'int64'))
   return (
- (2 lines not shown)
+      jnp.sort(a, axis=dim, stable=stable, descending=descending),
+      jnp.argsort(a, axis=dim, stable=stable, descending=descending),
   )
 
 
@@ -3114,8 +3200,8 @@ def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None):
   if dim != -1 and dim != len(input.shape) - 1:
     transpose_shape = list(range(len(input.shape)))
     transpose_shape[dim], transpose_shape[-1] = (
- (2 lines not shown)
+        transpose_shape[-1],
+        transpose_shape[dim],
     )
     input = jnp.transpose(input, transpose_shape)
 
@@ -3124,8 +3210,7 @@ def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None):
   if sorted:
     values = jnp.sort(values, descending=True)
     indices = jnp.take_along_axis(
- (1 line not shown)
-    )
+        indices, jnp.argsort(values, axis=-1, descending=True), axis=-1)
 
   if not largest:
     values = -values  # Negate values back if we found smallest
@@ -3140,21 +3225,39 @@ def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None):
 # aten.tril_indices
 #tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None)
 @op(torch.ops.aten.tril_indices)
-def _aten_tril_indices(row,
+def _aten_tril_indices(row,
+                       col,
+                       offset=0,
+                       *,
+                       dtype=jnp.int64.dtype,
+                       layout=None,
+                       device=None,
+                       pin_memory=None):
   a, b = jnp.tril_indices(row, offset, col)
   return jnp.stack((a, b))
 
+
 # aten.tril_indices
 #tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None)
 @op(torch.ops.aten.triu_indices)
-def _aten_triu_indices(row,
+def _aten_triu_indices(row,
+                       col,
+                       offset=0,
+                       *,
+                       dtype=jnp.int64.dtype,
+                       layout=None,
+                       device=None,
+                       pin_memory=None):
   a, b = jnp.triu_indices(row, offset, col)
   return jnp.stack((a, b))
 
 
 @op(torch.ops.aten.unbind_copy)
 def _aten_unbind(a, dim=0):
-  return [
+  return [
+      jax.lax.index_in_dim(a, i, dim, keepdims=False)
+      for i in range(a.shape[dim])
+  ]
 
 
 # aten.unique_dim
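Both index helpers simply stack the row and column index arrays returned by jnp into torch's 2 x N layout; for example:

import jax.numpy as jnp

rows, cols = jnp.tril_indices(3, 0, 3)
print(jnp.stack((rows, cols)))
# [[0 1 1 2 2 2]
#  [0 0 1 0 1 2]]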
@@ -3167,12 +3270,13 @@ def _aten_unique_dim(input_tensor,
                      sort=True,
                      return_inverse=False,
                      return_counts=False):
-  result_tensor_or_tuple = jnp.unique(
- (5 lines not shown)
+  result_tensor_or_tuple = jnp.unique(
+      input_tensor,
+      return_index=False,
+      return_inverse=return_inverse,
+      return_counts=return_counts,
+      axis=dim,
+      equal_nan=False)
   result_list = (
       list(result_tensor_or_tuple) if isinstance(result_tensor_or_tuple, tuple)
       else [result_tensor_or_tuple])
@@ -3197,15 +3301,14 @@ def _aten_unique_dim(input_tensor,
 # NOTE: Like the CUDA and CPU implementations, this implementation always sorts
 # the tensor regardless of the `sorted` argument passed to `torch.unique`.
 @op(torch.ops.aten._unique)
-def _aten_unique(input_tensor,
- (7 lines not shown)
-      equal_nan=False)
+def _aten_unique(input_tensor, sort=True, return_inverse=False):
+  result_tensor_or_tuple = jnp.unique(
+      input_tensor,
+      return_index=False,
+      return_inverse=return_inverse,
+      return_counts=False,
+      axis=None,
+      equal_nan=False)
   if return_inverse:
     return result_tensor_or_tuple
   else:
@@ -3221,11 +3324,12 @@ def _aten_unique2(input_tensor,
                   sort=True,
                   return_inverse=False,
                   return_counts=False):
-  return _aten_unique_dim(
- (4 lines not shown)
+  return _aten_unique_dim(
+      input_tensor=input_tensor,
+      dim=None,
+      sort=sort,
+      return_inverse=return_inverse,
+      return_counts=return_counts)
 
 
 # aten.unique_consecutive
@@ -3255,17 +3359,18 @@ def _aten_unique_consecutive(input_tensor,
   if dim < 0:
     dim += ndim
 
-  nd_slice_0 = tuple(
- (1 line not shown)
-  nd_slice_1 = tuple(
- (1 line not shown)
+  nd_slice_0 = tuple(
+      slice(None, -1) if d == dim else slice(None) for d in range(ndim))
+  nd_slice_1 = tuple(
+      slice(1, None) if d == dim else slice(None) for d in range(ndim))
 
   axes_to_reduce = tuple(d for d in range(ndim) if d != dim)
 
   does_not_equal_prior = (
-      jnp.any(
- (2 lines not shown)
+      jnp.any(
+          input_tensor[nd_slice_0] != input_tensor[nd_slice_1],
+          axis=axes_to_reduce,
+          keepdims=False))
 
   if input_tensor.shape[dim] != 0:
     # Prepend `True` to represent the first element of the input.
@@ -3273,18 +3378,17 @@ def _aten_unique_consecutive(input_tensor,
 
   include_indices = jnp.argwhere(does_not_equal_prior)[:, 0]
 
-  output_tensor = input_tensor[
- (1 line not shown)
+  output_tensor = input_tensor[tuple(
+      include_indices if d == dim else slice(None) for d in range(ndim))]
 
   if return_inverse or return_counts:
-    counts = (
- (1 line not shown)
+    counts = (
+        jnp.append(include_indices[1:], input_tensor.shape[dim]) -
+        include_indices[:])
 
     inverse = (
         jnp.reshape(jnp.repeat(jnp.arange(len(counts)), counts), inverse_shape)
-        if return_inverse
-        else None
-    )
+        if return_inverse else None)
 
     return output_tensor, inverse, counts
 
@@ -3302,25 +3406,33 @@ def _aten_unique_consecutive(input_tensor,
 @op(torch.ops.aten.where.ScalarSelf)
 @op(torch.ops.aten.where.ScalarOther)
 @op(torch.ops.aten.where.Scalar)
-def _aten_where(condition, x
+def _aten_where(condition, x=None, y=None):
   return jnp.where(condition, x, y)
 
 
 # aten.to.dtype
 # Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None
 @op(torch.ops.aten.to.dtype)
-def _aten_to_dtype(
- (2 lines not shown)
+def _aten_to_dtype(a,
+                   dtype,
+                   non_blocking=False,
+                   copy=False,
+                   memory_format=None):
   if dtype:
     jaxdtype = mappings.t2j_dtype(dtype)
     return a.astype(jaxdtype)
 
 
 @op(torch.ops.aten.to.dtype_layout)
-def _aten_to_dtype_layout(
- (2 lines not shown)
+def _aten_to_dtype_layout(a,
+                          *,
+                          dtype=None,
+                          layout=None,
+                          device=None,
+                          pin_memory=None,
+                          non_blocking=False,
+                          copy=False,
+                          memory_format=None):
   return _aten_to_dtype(
       a,
       dtype,
@@ -3328,6 +3440,7 @@ def _aten_to_dtype_layout(
       copy=copy,
       memory_format=memory_format)
 
+
 # aten.to.device
 
 
@@ -3348,9 +3461,11 @@ def _aten_var_mean_correction(tensor, dim=None, correction=1, keepdim=False):
 
 @op(torch.ops.aten.scalar_tensor)
 @op_base.convert_dtype()
-def _aten_scalar_tensor(
- (2 lines not shown)
+def _aten_scalar_tensor(s,
+                        dtype=None,
+                        layout=None,
+                        device=None,
+                        pin_memory=None):
   return jnp.array(s, dtype=dtype)
 
 
@@ -3360,9 +3475,9 @@ def _aten_to_device(x, device, dtype):
 
 
 @op(torch.ops.aten.max_pool2d_with_indices_backward)
-def max_pool2d_with_indices_backward_custom(
- (1 line not shown)
-):
+def max_pool2d_with_indices_backward_custom(grad_output, self, kernel_size,
+                                            stride, padding, dilation,
+                                            ceil_mode, indices):
   """
   Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward.
 
@@ -3418,15 +3533,15 @@ def _aten_tensor_split(ary, indices_or_sections, axis=0):
 @op(torch.ops.aten.randn, needs_env=True)
 @op_base.convert_dtype()
 def _randn(
- (9 lines not shown)
+    *size,
+    generator=None,
+    out=None,
+    dtype=None,
+    layout=torch.strided,
+    device=None,
+    requires_grad=False,
+    pin_memory=False,
+    env=None,
 ):
   shape = size
   if len(shape) == 1 and isinstance(shape[0], (list, tuple)):
@@ -3437,13 +3552,14 @@ def _randn(
     res = res.astype(dtype)
   return res
 
+
 @op(torch.ops.aten.bernoulli.p, needs_env=True)
-def
- (5 lines not shown)
+def _aten_bernoulli(
+    self,
+    p=0.5,
+    *,
+    generator=None,
+    env=None,
 ):
   key = env.get_and_rotate_prng_key(generator)
   res = jax.random.uniform(key, self.shape) < p
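The Bernoulli sampler is a thresholded uniform draw; a standalone sketch with an explicit key (illustrative values only, without the torchax env plumbing):

import jax
import jax.numpy as jnp

key = jax.random.PRNGKey(0)
p = 0.3
samples = jax.random.uniform(key, (5,)) < p  # boolean Bernoulli(p) draws
print(samples.astype(jnp.float32))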
@@ -3460,14 +3576,14 @@ def geometric(self, p, *, generator=None, env=None):
 @op(torch.ops.aten.randn_like, needs_env=True)
 @op_base.convert_dtype()
 def _aten_randn_like(
- (8 lines not shown)
+    x,
+    *,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=False,
+    memory_format=torch.preserve_format,
+    env=None,
 ):
   key = env.get_and_rotate_prng_key()
   return jax.random.normal(key, dtype=dtype or x.dtype, shape=x.shape)
@@ -3476,15 +3592,15 @@ def _aten_randn_like(
 @op(torch.ops.aten.rand, needs_env=True)
 @op_base.convert_dtype()
 def _rand(
- (9 lines not shown)
+    *size,
+    generator=None,
+    out=None,
+    dtype=None,
+    layout=torch.strided,
+    device=None,
+    requires_grad=False,
+    pin_memory=False,
+    env=None,
 ):
   shape = size
   if len(shape) == 1 and isinstance(shape[0], (list, tuple)):
@@ -3505,18 +3621,32 @@ def _aten_outer(a, b):
 def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False):
   return jnp.allclose(input, other, rtol, atol, equal_nan)
 
+
 @op(torch.ops.aten.native_batch_norm)
-def _aten_native_batch_norm(input,
+def _aten_native_batch_norm(input,
+                            weight,
+                            bias,
+                            running_mean,
+                            running_var,
+                            training=False,
+                            momentum=0.1,
+                            eps=1e-5):
 
   if running_mean is None:
-    running_mean = jnp.zeros(
+    running_mean = jnp.zeros(
+        input.shape[1], dtype=input.dtype)  # Initialize running mean if None
   if running_var is None:
-    running_var = jnp.ones(
+    running_var = jnp.ones(
+        input.shape[1],
+        dtype=input.dtype)  # Initialize running variance if None
 
   if training:
-    return _aten__native_batch_norm_legit(input, weight, bias, running_mean,
+    return _aten__native_batch_norm_legit(input, weight, bias, running_mean,
+                                          running_var, training, momentum, eps)
   else:
-    return _aten__native_batch_norm_legit_no_training(input, weight, bias,
+    return _aten__native_batch_norm_legit_no_training(input, weight, bias,
+                                                      running_mean, running_var,
+                                                      momentum, eps)
 
 
 @op(torch.ops.aten.normal, needs_env=True)
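In the no-training path, batch norm reduces to per-channel normalization with the running statistics. A minimal numeric sketch of that formula (shapes and names are illustrative, not the torchax internals):

import jax.numpy as jnp

x = jnp.ones((2, 3, 4, 4))   # NCHW
mean = jnp.zeros(3)
var = jnp.ones(3)
weight = jnp.ones(3)
bias = jnp.zeros(3)
eps = 1e-5
# Broadcast per-channel statistics over N, H, W.
shape = (1, 3, 1, 1)
y = (x - mean.reshape(shape)) / jnp.sqrt(var.reshape(shape) + eps)
y = y * weight.reshape(shape) + bias.reshape(shape)
print(y.shape)  # (2, 3, 4, 4)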
@@ -3525,12 +3655,14 @@ def _aten_normal(self, mean=0, std=1, generator=None, env=None):
|
|
|
3525
3655
|
res = _randn(*shape, generator=generator, env=env)
|
|
3526
3656
|
return res * std + mean
|
|
3527
3657
|
|
|
3658
|
+
|
|
3528
3659
|
# TODO: not clear what this function should actually do
|
|
3529
3660
|
# https://github.com/pytorch/pytorch/blob/d96c80649f301129219469d8b4353e52edab3b78/aten/src/ATen/native/native_functions.yaml#L7933-L7940
|
|
3530
3661
|
@op(torch.ops.aten.lift_fresh)
|
|
3531
3662
|
def _aten_lift_fresh(self):
|
|
3532
3663
|
return self
|
|
3533
3664
|
|
|
3665
|
+
|
|
3534
3666
|
@op(torch.ops.aten.uniform, needs_env=True)
|
|
3535
3667
|
def _aten_uniform(self, from_=0, to=1, *, generator=None, env=None):
|
|
3536
3668
|
assert from_ <= to, f'Uniform from(passed in {from_}) must be less than to(passed in {to})'
|
|
@@ -3538,16 +3670,18 @@ def _aten_uniform(self, from_=0, to=1, *, generator=None, env=None):
|
|
|
3538
3670
|
res = _rand(*shape, generator=generator, env=env)
|
|
3539
3671
|
return res * (to - from_) + from_
|
|
3540
3672
|
|
|
3673
|
+
|
|
3541
3674
|
#func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
|
|
3542
3675
|
|
|
3676
|
+
|
|
3543
3677
|
@op(torch.ops.aten.randint, needs_env=True)
|
|
3544
3678
|
@op_base.convert_dtype(use_default_dtype=False)
|
|
3545
3679
|
def _aten_randint(
|
|
3546
|
-
|
|
3547
|
-
|
|
3548
|
-
|
|
3549
|
-
|
|
3550
|
-
|
|
3680
|
+
*args,
|
|
3681
|
+
generator=None,
|
|
3682
|
+
dtype=None,
|
|
3683
|
+
env=None,
|
|
3684
|
+
**kwargs,
|
|
3551
3685
|
):
|
|
3552
3686
|
if len(args) == 3:
|
|
3553
3687
|
# low, high, size
|
|
@@ -3556,7 +3690,8 @@ def _aten_randint(
|
|
|
3556
3690
|
high, size = args
|
|
3557
3691
|
low = 0
|
|
3558
3692
|
else:
|
|
3559
|
-
raise AssertionError(
|
|
3693
|
+
raise AssertionError(
|
|
3694
|
+
f'Expected at 2 or 3 args for Aten::randint, got {len(args)}')
|
|
3560
3695
|
|
|
3561
3696
|
key = env.get_and_rotate_prng_key(generator)
|
|
3562
3697
|
res = jax.random.randint(key, size, low, high)
|
|
@@ -3564,15 +3699,18 @@ def _aten_randint(
|
|
|
3564
3699
|
res = res.astype(dtype)
|
|
3565
3700
|
return res
|
|
3566
3701
|
|
|
3567
|
-
|
|
3702
|
+
|
|
3703
|
+
@op(torch.ops.aten.randint_like,
|
|
3704
|
+
torch.ops.aten.randint.generator,
|
|
3705
|
+
needs_env=True)
|
|
3568
3706
|
@op_base.convert_dtype(use_default_dtype=False)
|
|
3569
3707
|
def _aten_randint_like(
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3575
|
-
|
|
3708
|
+
input,
|
|
3709
|
+
*args,
|
|
3710
|
+
generator=None,
|
|
3711
|
+
dtype=None,
|
|
3712
|
+
env=None,
|
|
3713
|
+
**kwargs,
|
|
3576
3714
|
):
|
|
3577
3715
|
if len(args) == 2:
|
|
3578
3716
|
low, high = args
|
|
@@ -3580,7 +3718,8 @@ def _aten_randint_like(
|
|
|
3580
3718
|
high = args[0]
|
|
3581
3719
|
low = 0
|
|
3582
3720
|
else:
|
|
3583
|
-
raise AssertionError(
|
|
3721
|
+
raise AssertionError(
|
|
3722
|
+
f'Expected at 1 or 2 args for Aten::randint_like, got {len(args)}')
|
|
3584
3723
|
|
|
3585
3724
|
shape = input.shape
|
|
3586
3725
|
dtype = dtype or input.dtype
|
|
@@ -3590,6 +3729,7 @@ def _aten_randint_like(
|
|
|
3590
3729
|
res = res.astype(dtype)
|
|
3591
3730
|
return res
|
|
3592
3731
|
|
|
3732
|
+
|
|
3593
3733
|
@op(torch.ops.aten.dim, is_jax_function=False)
|
|
3594
3734
|
def _aten_dim(self):
|
|
3595
3735
|
return len(self.shape)
|
|
@@ -3602,10 +3742,11 @@ def _aten_copysign(input, other, *, out=None):
|
|
|
3602
3742
|
# regardless of their exact integer dtype, whereas jax.copysign returns
|
|
3603
3743
|
# float64 when one or both of them is int64.
|
|
3604
3744
|
if jnp.issubdtype(input.dtype, jnp.integer) and jnp.issubdtype(
|
|
3605
|
-
|
|
3606
|
-
):
|
|
3745
|
+
other.dtype, jnp.integer):
|
|
3607
3746
|
result = result.astype(jnp.float32)
|
|
3608
3747
|
return result
|
|
3748
|
+
|
|
3749
|
+
|
|
3609
3750
|
@op(torch.ops.aten.i0)
|
|
3610
3751
|
@op_base.promote_int_input
|
|
3611
3752
|
def _aten_i0(self):
|
|
@@ -3637,6 +3778,7 @@ def _aten_special_laguerre_polynomial_l(self, n):
|
|
|
3637
3778
|
|
|
3638
3779
|
@jnp.vectorize
|
|
3639
3780
|
def vectorized(x, n_i):
|
|
3781
|
+
|
|
3640
3782
|
def negative_n(x):
|
|
3641
3783
|
return jnp.zeros_like(x)
|
|
3642
3784
|
|
|
@@ -3650,6 +3792,7 @@ def _aten_special_laguerre_polynomial_l(self, n):
|
|
|
3650
3792
|
return jnp.ones_like(x)
|
|
3651
3793
|
|
|
3652
3794
|
def default(x):
|
|
3795
|
+
|
|
3653
3796
|
def f(k, carry):
|
|
3654
3797
|
p, q = carry
|
|
3655
3798
|
return (q, ((k * 2 + (jnp.ones_like(x) - x)) * q - k * p) / (k + 1))
|
|
@@ -3658,9 +3801,9 @@ def _aten_special_laguerre_polynomial_l(self, n):
|
|
|
3658
3801
|
return q
|
|
3659
3802
|
|
|
3660
3803
|
return jnp.piecewise(
|
|
3661
|
-
x, [n_i == 1, n_i == 0,
|
|
3662
|
-
|
|
3663
|
-
|
|
3804
|
+
x, [n_i == 1, n_i == 0,
|
|
3805
|
+
jnp.abs(n_i) == jnp.zeros_like(x), n_i < 0],
|
|
3806
|
+
[one_n, zero_n, zero_abs, negative_n, default])
|
|
3664
3807
|
|
|
3665
3808
|
return vectorized(self, n.astype(jnp.int64))
|
|
3666
3809
|
|
|
@@ -3760,125 +3903,124 @@ def _aten_special_modified_bessel_i0(self):
|
|
|
3760
3903
|
return jnp.exp(x) * (0.5 * (b - p)) / jnp.sqrt(x)
|
|
3761
3904
|
|
|
3762
3905
|
self = jnp.abs(self)
|
|
3763
|
-
return jnp.piecewise(
|
|
3764
|
-
|
|
3765
|
-
)
|
|
3906
|
+
return jnp.piecewise(self, [self <= 8], [small, default])
|
|
3907
|
+
|
|
3766
3908
|
|
|
3767
3909
|
@op(torch.ops.aten.special_modified_bessel_i1)
|
|
3768
3910
|
@op_base.promote_int_input
|
|
3769
3911
|
def _aten_special_modified_bessel_i1(self):
|
|
3770
|
-
|
|
3771
|
-
|
|
3772
|
-
def small(x):
|
|
3773
|
-
A = jnp.array(
|
|
3774
|
-
[
|
|
3775
|
-
2.77791411276104639959e-18,
|
|
3776
|
-
-2.11142121435816608115e-17,
|
|
3777
|
-
1.55363195773620046921e-16,
|
|
3778
|
-
-1.10559694773538630805e-15,
|
|
3779
|
-
7.60068429473540693410e-15,
|
|
3780
|
-
-5.04218550472791168711e-14,
|
|
3781
|
-
3.22379336594557470981e-13,
|
|
3782
|
-
-1.98397439776494371520e-12,
|
|
3783
|
-
1.17361862988909016308e-11,
|
|
3784
|
-
-6.66348972350202774223e-11,
|
|
3785
|
-
3.62559028155211703701e-10,
|
|
3786
|
-
-1.88724975172282928790e-09,
|
|
3787
|
-
9.38153738649577178388e-09,
|
|
3788
|
-
-4.44505912879632808065e-08,
|
|
3789
|
-
2.00329475355213526229e-07,
|
|
3790
|
-
-8.56872026469545474066e-07,
|
|
3791
|
-
3.47025130813767847674e-06,
|
|
3792
|
-
-1.32731636560394358279e-05,
|
|
3793
|
-
4.78156510755005422638e-05,
|
|
3794
|
-
-1.61760815825896745588e-04,
|
|
3795
|
-
5.12285956168575772895e-04,
|
|
3796
|
-
-1.51357245063125314899e-03,
|
|
3797
|
-
4.15642294431288815669e-03,
|
|
3798
|
-
-1.05640848946261981558e-02,
|
|
3799
|
-
2.47264490306265168283e-02,
|
|
3800
|
-
-5.29459812080949914269e-02,
|
|
3801
|
-
1.02643658689847095384e-01,
|
|
3802
|
-
-1.76416518357834055153e-01,
|
|
3803
|
-
-            2.52587186443633654823e-01,
-        ],
-        dtype=self.dtype,
-    )
-
-    def f(carry, val):
-      p, q, a = carry
-      p, q = q, a
-      return (p, q, ((jnp.abs(x) / 2.0) - 2.0) * q - p + val), None
-
-    (p, _, a), _ = jax.lax.scan(
-        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=A)
-
-    return jax.lax.cond(
-        x < 0, lambda: -(0.5 * (a - p) * jnp.abs(x) * jnp.exp(jnp.abs(x))), lambda: 0.5 * (a - p) * jnp.abs(x) * jnp.exp(jnp.abs(x))
-    )
+  # Adapted from https://github.com/pytorch/pytorch/blob/f8f41dcb24cb4f4e87a51bb04847942dd835e496/aten/src/ATen/native/Math.h#L3271-L3364
 
-      return (p, q, (32.0 / jnp.abs(x) - 2.0) * q - p + val), None
-    (p, _, b), _ = jax.lax.scan(
-        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=B)
-    return jax.lax.cond(
-        x < 0, lambda: -(jnp.exp(jnp.abs(x)) * (0.5 * (b - p)) / jnp.sqrt(jnp.abs(x))), lambda: jnp.exp(jnp.abs(x)) * (0.5 * (b - p)) / jnp.sqrt(jnp.abs(x))
-    )
+  def small(x):
+    A = jnp.array(
+        [
+            2.77791411276104639959e-18,
+            -2.11142121435816608115e-17,
+            1.55363195773620046921e-16,
+            -1.10559694773538630805e-15,
+            7.60068429473540693410e-15,
+            -5.04218550472791168711e-14,
+            3.22379336594557470981e-13,
+            -1.98397439776494371520e-12,
+            1.17361862988909016308e-11,
+            -6.66348972350202774223e-11,
+            3.62559028155211703701e-10,
+            -1.88724975172282928790e-09,
+            9.38153738649577178388e-09,
+            -4.44505912879632808065e-08,
+            2.00329475355213526229e-07,
+            -8.56872026469545474066e-07,
+            3.47025130813767847674e-06,
+            -1.32731636560394358279e-05,
+            4.78156510755005422638e-05,
+            -1.61760815825896745588e-04,
+            5.12285956168575772895e-04,
+            -1.51357245063125314899e-03,
+            4.15642294431288815669e-03,
+            -1.05640848946261981558e-02,
+            2.47264490306265168283e-02,
+            -5.29459812080949914269e-02,
+            1.02643658689847095384e-01,
+            -1.76416518357834055153e-01,
+            2.52587186443633654823e-01,
+        ],
+        dtype=self.dtype,
+    )
 
+    def f(carry, val):
+      p, q, a = carry
+      p, q = q, a
+      return (p, q, ((jnp.abs(x) / 2.0) - 2.0) * q - p + val), None
+
+    (p, _, a), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=A)
+
+    return jax.lax.cond(
+        x < 0, lambda: -(0.5 * (a - p) * jnp.abs(x) * jnp.exp(jnp.abs(x))),
+        lambda: 0.5 * (a - p) * jnp.abs(x) * jnp.exp(jnp.abs(x)))
+
+  def default(x):
+    B = jnp.array(
+        [
+            7.51729631084210481353e-18,
+            4.41434832307170791151e-18,
+            -4.65030536848935832153e-17,
+            -3.20952592199342395980e-17,
+            2.96262899764595013876e-16,
+            3.30820231092092828324e-16,
+            -1.88035477551078244854e-15,
+            -3.81440307243700780478e-15,
+            1.04202769841288027642e-14,
+            4.27244001671195135429e-14,
+            -2.10154184277266431302e-14,
+            -4.08355111109219731823e-13,
+            -7.19855177624590851209e-13,
+            2.03562854414708950722e-12,
+            1.41258074366137813316e-11,
+            3.25260358301548823856e-11,
+            -1.89749581235054123450e-11,
+            -5.58974346219658380687e-10,
+            -3.83538038596423702205e-09,
+            -2.63146884688951950684e-08,
+            -2.51223623787020892529e-07,
+            -3.88256480887769039346e-06,
+            -1.10588938762623716291e-04,
+            -9.76109749136146840777e-03,
+            7.78576235018280120474e-01,
+        ],
+        dtype=self.dtype,
     )
 
+    def f(carry, val):
+      p, q, b = carry
+      p, q = q, b
+      return (p, q, (32.0 / jnp.abs(x) - 2.0) * q - p + val), None
+
+    (p, _, b), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=B)
+
+    return jax.lax.cond(
+        x < 0, lambda: -(jnp.exp(jnp.abs(x)) *
+                         (0.5 * (b - p)) / jnp.sqrt(jnp.abs(x))),
+        lambda: jnp.exp(jnp.abs(x)) * (0.5 * (b - p)) / jnp.sqrt(jnp.abs(x)))
+
+  return jnp.piecewise(self, [self <= 8], [small, default])
+
+
 @op(torch.ops.aten.special_modified_bessel_k0)
 @op_base.promote_int_input
 def _aten_special_modified_bessel_k0(self):
+  # Adapted from https://github.com/pytorch/pytorch/blob/f8f41dcb24cb4f4e87a51bb04847942dd835e496/aten/src/ATen/native/Math.h#L3367-L3441
 
+  def zero(x):
+    return jnp.array(jnp.inf, x.dtype)
 
+  def negative(x):
+    return jnp.array(jnp.nan, x.dtype)
 
+  def small(x):
+    A = jnp.array(
+        [
             1.37446543561352307156e-16,
             4.25981614279661018399e-14,
             1.03496952576338420167e-11,
@@ -3889,23 +4031,24 @@ def _aten_special_modified_bessel_k0(self):
             3.59799365153615016266e-02,
             3.44289899924628486886e-01,
             -5.35327393233902768720e-01,
+        ],
+        dtype=self.dtype,
+    )
 
+    def f(carry, val):
+      p, q, a = carry
+      p, q = q, a
+      return (p, q, (x * x - 2.0) * q - p + val), None
+
+    (p, _, a), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=A)
 
-    return 0.5 * (a - p) - jnp.log(0.5 * x) * _aten_special_modified_bessel_i0(x)
+    return 0.5 * (a - p) - jnp.log(
+        0.5 * x) * _aten_special_modified_bessel_i0(x)
 
+  def default(x):
+    B = jnp.array(
+        [
             5.30043377268626276149e-18,
             -1.64758043015242134646e-17,
             5.21039150503902756861e-17,
@@ -3931,38 +4074,38 @@ def _aten_special_modified_bessel_k0(self):
             1.56988388573005337491e-03,
             -3.14481013119645005427e-02,
             2.44030308206595545468e+00,
+        ],
+        dtype=self.dtype,
+    )
+
+    def f(carry, val):
+      p, q, b = carry
+      p, q = q, b
+      return (p, q, (8.0 / x - 2.0) * q - p + val), None
 
-      p, q = q, b
-      return (p, q, (8.0 / x - 2.0) * q - p + val), None
+    (p, _, b), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=B)
 
+    return jnp.exp(-x) * (0.5 * (b - p)) / jnp.sqrt(x)
+
+  return jnp.piecewise(self, [self <= 2, self < 0, self == 0],
+                       [small, negative, zero, default])
 
-  return jnp.piecewise(
-      self, [self <= 2, self < 0, self == 0], [small, negative, zero, default]
-  )
 
 @op(torch.ops.aten.special_modified_bessel_k1)
 @op_base.promote_int_input
 def _aten_special_modified_bessel_k1(self):
+  # Adapted from https://github.com/pytorch/pytorch/blob/f8f41dcb24cb4f4e87a51bb04847942dd835e496/aten/src/ATen/native/Math.h#L3444-L3519
 
+  def zero(x):
+    return jnp.array(jnp.inf, x.dtype)
 
+  def negative(x):
+    return jnp.array(jnp.nan, x.dtype)
 
+  def small(x):
+    A = jnp.array(
+        [
             -7.02386347938628759343e-18,
             -2.42744985051936593393e-15,
             -6.66690169419932900609e-13,
@@ -3974,24 +4117,25 @@ def _aten_special_modified_bessel_k1(self):
             -1.22611180822657148235e-01,
             -3.53155960776544875667e-01,
             1.52530022733894777053e+00,
+        ],
+        dtype=self.dtype,
+    )
 
+    def f(carry, val):
+      p, q, a = carry
+      p, q = q, a
+      a = (x * x - 2.0) * q - p + val
+      return (p, q, a), None
+
+    (p, _, a), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=A)
 
-    return jnp.log(0.5 * x) * _aten_special_modified_bessel_i1(x) + 0.5 * (a - p) / x
+    return jnp.log(
+        0.5 * x) * _aten_special_modified_bessel_i1(x) + 0.5 * (a - p) / x
 
+  def default(x):
+    B = jnp.array(
+        [
             -5.75674448366501715755e-18,
             1.79405087314755922667e-17,
             -5.68946255844285935196e-17,
@@ -4017,24 +4161,24 @@ def _aten_special_modified_bessel_k1(self):
             -2.85781685962277938680e-03,
             1.03923736576817238437e-01,
             2.72062619048444266945e+00,
+        ],
+        dtype=self.dtype,
+    )
+
+    def f(carry, val):
+      p, q, b = carry
+      p, q = q, b
+      b = (8.0 / x - 2.0) * q - p + val
+      return (p, q, b), None
+
+    (p, _, b), _ = jax.lax.scan(
+        f, init=(jnp.zeros_like(x), jnp.zeros_like(x), 0), xs=B)
 
-      p, q, b = carry
-      p, q = q, b
-      b = (8.0 / x - 2.0) * q - p + val
-      return (p, q, b), None
+    return jnp.exp(-x) * (0.5 * (b - p)) / jnp.sqrt(x)
 
-    return jnp.exp(-x) * (0.5 * (b - p)) / jnp.sqrt(x)
+  return jnp.piecewise(self, [self <= 2, self < 0, self == 0],
+                       [small, negative, zero, default])
 
-  return jnp.piecewise(
-      self, [self <= 2, self < 0, self == 0], [small, negative, zero, default]
-  )
 
 @op(torch.ops.aten.polygamma)
 def _aten_polygamma(x, n):
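Note, not part of the diff: the modified-Bessel kernels above all evaluate a Chebyshev-style series with the same carry pattern, shift the last two partial sums and fold in one coefficient per jax.lax.scan step, then dispatch with jnp.piecewise. A minimal standalone sketch of that recurrence; `chebyshev_series`, `coeffs` and `x` are made-up names, not values from this release.

    import jax
    import jax.numpy as jnp

    def chebyshev_series(x, coeffs):
      # Same carry pattern as the kernels above: keep the last two partial
      # sums (p, q) and fold in one coefficient per scan step.
      def step(carry, c):
        p, q, a = carry
        p, q = q, a
        return (p, q, (x * x - 2.0) * q - p + c), None

      init = (jnp.zeros_like(x), jnp.zeros_like(x), jnp.zeros_like(x))
      (p, _, a), _ = jax.lax.scan(step, init, xs=coeffs)
      return 0.5 * (a - p)
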
@@ -4042,10 +4186,12 @@ def _aten_polygamma(x, n):
   n = n.astype(mappings.t2j_dtype(torch.get_default_dtype()))
   return jax.lax.polygamma(jnp.float32(x), n)
 
+
 @op(torch.ops.aten.special_ndtri)
 @op_base.promote_int_input
 def _aten_special_ndtri(self):
+  return jax.scipy.special.ndtri(self)
+
 
 @op(torch.ops.aten.special_bessel_j0)
 @op_base.promote_int_input
@@ -4057,112 +4203,104 @@ def _aten_special_bessel_j0(self):
 
   def small(x):
     RP = jnp.array(
+        [
+            -4.79443220978201773821e09,
+            1.95617491946556577543e12,
+            -2.49248344360967716204e14,
+            9.70862251047306323952e15,
+        ],
+        dtype=self.dtype,
     )
     RQ = jnp.array(
+        [
+            4.99563147152651017219e02,
+            1.73785401676374683123e05,
+            4.84409658339962045305e07,
+            1.11855537045356834862e10,
+            2.11277520115489217587e12,
+            3.10518229857422583814e14,
+            3.18121955943204943306e16,
+            1.71086294081043136091e18,
+        ],
+        dtype=self.dtype,
     )
 
     rp = op_base.foreach_loop(RP, lambda carry, rp_i: carry * (x * x) + rp_i)
     rq = op_base.foreach_loop(RQ, lambda carry, rq_i: carry * (x * x) + rq_i)
 
-    return (
-        * (x * x - 3.04712623436620863991e01)
-        * rp
-        / rq
-    )
+    return ((x * x - 5.78318596294678452118e00) *
+            (x * x - 3.04712623436620863991e01) * rp / rq)
 
   def default(x):
     PP = jnp.array(
+        [
+            7.96936729297347051624e-04,
+            8.28352392107440799803e-02,
+            1.23953371646414299388e00,
+            5.44725003058768775090e00,
+            8.74716500199817011941e00,
+            5.30324038235394892183e00,
+            9.99999999999999997821e-01,
+        ],
+        dtype=self.dtype,
     )
     PQ = jnp.array(
+        [
+            9.24408810558863637013e-04,
+            8.56288474354474431428e-02,
+            1.25352743901058953537e00,
+            5.47097740330417105182e00,
+            8.76190883237069594232e00,
+            5.30605288235394617618e00,
+            1.00000000000000000218e00,
+        ],
+        dtype=self.dtype,
     )
     QP = jnp.array(
+        [
+            -1.13663838898469149931e-02,
+            -1.28252718670509318512e00,
+            -1.95539544257735972385e01,
+            -9.32060152123768231369e01,
+            -1.77681167980488050595e02,
+            -1.47077505154951170175e02,
+            -5.14105326766599330220e01,
+            -6.05014350600728481186e00,
+        ],
+        dtype=self.dtype,
     )
     QQ = jnp.array(
+        [
+            6.43178256118178023184e01,
+            8.56430025976980587198e02,
+            3.88240183605401609683e03,
+            7.24046774195652478189e03,
+            5.93072701187316984827e03,
+            2.06209331660327847417e03,
+            2.42005740240291393179e02,
+        ],
+        dtype=self.dtype,
     )
 
-    pp = op_base.foreach_loop(
-        * 0.797884560802865355879892119868763737
-        / jnp.sqrt(x)
-    )
+    pp = op_base.foreach_loop(
+        PP, lambda carry, pp_i: carry * (25.0 / (x * x)) + pp_i)
+    pq = op_base.foreach_loop(
+        PQ, lambda carry, pq_i: carry * (25.0 / (x * x)) + pq_i)
+    qp = op_base.foreach_loop(
+        QP, lambda carry, qp_i: carry * (25.0 / (x * x)) + qp_i)
+    qq = op_base.foreach_loop(
+        QQ, lambda carry, qq_i: carry * (25.0 / (x * x)) + qq_i)
+
+    return ((pp / pq * jnp.cos(x - 0.785398163397448309615660845819875721) -
+             5.0 / x *
+             (qp / qq) * jnp.sin(x - 0.785398163397448309615660845819875721)) *
+            0.797884560802865355879892119868763737 / jnp.sqrt(x))
 
   self = jnp.abs(self)
   # Last True condition in `piecewise` takes priority, but last function is
   # default. See https://github.com/numpy/numpy/issues/16475
-  return jnp.piecewise(
-  )
+  return jnp.piecewise(self, [self <= 5.0, self < 0.00001],
+                       [small, very_small, default])
 
 
 @op(torch.ops.aten.special_bessel_j1)
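Note, not part of the diff: the rational approximations above are evaluated Horner-style, each op_base.foreach_loop call folds `carry * z + c_i` over a coefficient table with the highest-order coefficient first. A rough plain-jnp equivalent of that fold; `horner` is a made-up helper name and `coeffs` a plain Python list, only the folding pattern is taken from the code above.

    import jax.numpy as jnp

    def horner(coeffs, z):
      # Fold `carry * z + c` over the table, highest-order coefficient first,
      # the same shape as the lambdas passed to op_base.foreach_loop above.
      acc = jnp.zeros_like(z)
      for c in coeffs:
        acc = acc * z + c
      return acc

    # e.g. the small-|x| branch of j0 is, up to the two quadratic factors,
    # horner(RP, x * x) / horner(RQ, x * x)
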
@@ -4172,114 +4310,106 @@ def _aten_special_bessel_j1(self):
 
   def small(x):
     RP = jnp.array(
+        [
+            -8.99971225705559398224e08,
+            4.52228297998194034323e11,
+            -7.27494245221818276015e13,
+            3.68295732863852883286e15,
+        ],
+        dtype=self.dtype,
     )
     RQ = jnp.array(
+        [
+            6.20836478118054335476e02,
+            2.56987256757748830383e05,
+            8.35146791431949253037e07,
+            2.21511595479792499675e10,
+            4.74914122079991414898e12,
+            7.84369607876235854894e14,
+            8.95222336184627338078e16,
+            5.32278620332680085395e18,
+        ],
+        dtype=self.dtype,
     )
 
     rp = op_base.foreach_loop(RP, lambda carry, rp_i: carry * (x * x) + rp_i)
     rq = op_base.foreach_loop(RQ, lambda carry, rq_i: carry * (x * x) + rq_i)
 
-    return (
-        / rq
-        * x
-        * (x * x - 1.46819706421238932572e01)
-        * (x * x - 4.92184563216946036703e01)
-    )
+    return (rp / rq * x * (x * x - 1.46819706421238932572e01) *
+            (x * x - 4.92184563216946036703e01))
 
   def default(x):
     PP = jnp.array(
+        [
+            7.62125616208173112003e-04,
+            7.31397056940917570436e-02,
+            1.12719608129684925192e00,
+            5.11207951146807644818e00,
+            8.42404590141772420927e00,
+            5.21451598682361504063e00,
+            1.00000000000000000254e00,
+        ],
+        dtype=self.dtype,
     )
     PQ = jnp.array(
+        [
+            5.71323128072548699714e-04,
+            6.88455908754495404082e-02,
+            1.10514232634061696926e00,
+            5.07386386128601488557e00,
+            8.39985554327604159757e00,
+            5.20982848682361821619e00,
+            9.99999999999999997461e-01,
+        ],
+        dtype=self.dtype,
     )
     QP = jnp.array(
+        [
+            5.10862594750176621635e-02,
+            4.98213872951233449420e00,
+            7.58238284132545283818e01,
+            3.66779609360150777800e02,
+            7.10856304998926107277e02,
+            5.97489612400613639965e02,
+            2.11688757100572135698e02,
+            2.52070205858023719784e01,
+        ],
+        dtype=self.dtype,
     )
     QQ = jnp.array(
+        [
+            7.42373277035675149943e01,
+            1.05644886038262816351e03,
+            4.98641058337653607651e03,
+            9.56231892404756170795e03,
+            7.99704160447350683650e03,
+            2.82619278517639096600e03,
+            3.36093607810698293419e02,
+        ],
+        dtype=self.dtype,
     )
 
-    pp = op_base.foreach_loop(
-        * 0.797884560802865355879892119868763737
-        / jnp.sqrt(x)
-    )
+    pp = op_base.foreach_loop(
+        PP, lambda carry, pp_i: carry * (25.0 / (x * x)) + pp_i)
+    pq = op_base.foreach_loop(
+        PQ, lambda carry, pq_i: carry * (25.0 / (x * x)) + pq_i)
+    qp = op_base.foreach_loop(
+        QP, lambda carry, qp_i: carry * (25.0 / (x * x)) + qp_i)
+    qq = op_base.foreach_loop(
+        QQ, lambda carry, qq_i: carry * (25.0 / (x * x)) + qq_i)
+
+    return ((pp / pq * jnp.cos(x - 2.356194490192344928846982537459627163) -
+             5.0 / x *
+             (qp / qq) * jnp.sin(x - 2.356194490192344928846982537459627163)) *
+            0.797884560802865355879892119868763737 / jnp.sqrt(x))
 
   # If x < 0, bessel_j1(x) = -bessel_j1(-x)
   sign = jnp.sign(self)
   self = jnp.abs(self)
   return sign * jnp.piecewise(
+      self,
+      [self <= 5.0],
+      [small, default],
   )
 
 
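Note, not part of the diff: J1 is an odd function, which is why the kernel above evaluates on |x| and multiplies the sign back in. A tiny sketch of that wrapper; `as_odd` and `f` are made-up names.

    import jax.numpy as jnp

    def as_odd(f, x):
      # J1(-x) = -J1(x): evaluate on |x| and restore the sign, as above.
      return jnp.sign(x) * f(jnp.abs(x))
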
@@ -4296,85 +4426,86 @@ def _aten_special_bessel_y0(self):
 
   def small(x):
     YP = jnp.array(
+        [
+            1.55924367855235737965e04,
+            -1.46639295903971606143e07,
+            5.43526477051876500413e09,
+            -9.82136065717911466409e11,
+            8.75906394395366999549e13,
+            -3.46628303384729719441e15,
+            4.42733268572569800351e16,
+            -1.84950800436986690637e16,
+        ],
+        dtype=self.dtype,
     )
     YQ = jnp.array(
+        [
+            1.04128353664259848412e03,
+            6.26107330137134956842e05,
+            2.68919633393814121987e08,
+            8.64002487103935000337e10,
+            2.02979612750105546709e13,
+            3.17157752842975028269e15,
+            2.50596256172653059228e17,
+        ],
+        dtype=self.dtype,
     )
 
     yp = op_base.foreach_loop(YP, lambda carry, yp_i: carry * (x * x) + yp_i)
     yq = op_base.foreach_loop(YQ, lambda carry, yq_i: carry * (x * x) + yq_i)
 
-    return yp / yq + (0.636619772367581343075535053490057448 * jnp.log(x) *
+    return yp / yq + (0.636619772367581343075535053490057448 * jnp.log(x) *
+                      _aten_special_bessel_j0(x))
 
   def default(x):
     PP = jnp.array(
+        [
+            7.96936729297347051624e-04,
+            8.28352392107440799803e-02,
+            1.23953371646414299388e00,
+            5.44725003058768775090e00,
+            8.74716500199817011941e00,
+            5.30324038235394892183e00,
+            9.99999999999999997821e-01,
+        ],
+        dtype=self.dtype,
     )
     PQ = jnp.array(
+        [
+            9.24408810558863637013e-04,
+            8.56288474354474431428e-02,
+            1.25352743901058953537e00,
+            5.47097740330417105182e00,
+            8.76190883237069594232e00,
+            5.30605288235394617618e00,
+            1.00000000000000000218e00,
+        ],
+        dtype=self.dtype,
     )
     QP = jnp.array(
+        [
+            -1.13663838898469149931e-02,
+            -1.28252718670509318512e00,
+            -1.95539544257735972385e01,
+            -9.32060152123768231369e01,
+            -1.77681167980488050595e02,
+            -1.47077505154951170175e02,
+            -5.14105326766599330220e01,
+            -6.05014350600728481186e00,
+        ],
+        dtype=self.dtype,
     )
     QQ = jnp.array(
+        [
+            6.43178256118178023184e01,
+            8.56430025976980587198e02,
+            3.88240183605401609683e03,
+            7.24046774195652478189e03,
+            5.93072701187316984827e03,
+            2.06209331660327847417e03,
+            2.42005740240291393179e02,
+        ],
+        dtype=self.dtype,
     )
 
     factor = 25.0 / (x * x)
@@ -4383,22 +4514,15 @@ def _aten_special_bessel_y0(self):
     qp = op_base.foreach_loop(QP, lambda carry, qp_i: carry * factor + qp_i)
     qq = op_base.foreach_loop(QQ, lambda carry, qq_i: carry * factor + qq_i)
 
-    return (
-        / x
-        * (qp / qq)
-        * jnp.cos(x - 0.785398163397448309615660845819875721)
-    )
-        * 0.797884560802865355879892119868763737
-        / jnp.sqrt(x)
-    )
+    return ((pp / pq * jnp.sin(x - 0.785398163397448309615660845819875721) +
+             5.0 / x *
+             (qp / qq) * jnp.cos(x - 0.785398163397448309615660845819875721)) *
+            0.797884560802865355879892119868763737 / jnp.sqrt(x))
 
   return jnp.piecewise(
+      self,
+      [self <= 5.0, self < 0., self == 0.],
+      [small, negative, zero, default],
   )
 
 
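Note, not part of the diff: the piecewise dispatches above rely on the numpy/jax convention that the last matching condition wins and the extra trailing function is the default, as the comment in the j0 hunk points out. A small illustration with made-up branch functions.

    import jax.numpy as jnp

    x = jnp.array([-2.0, 0.0, 3.0, 9.0])
    out = jnp.piecewise(
        x,
        [x <= 5.0, x < 0.0, x == 0.0],
        [lambda v: v,                   # "small": used where no later condition matches
         lambda v: -jnp.ones_like(v),   # "negative": overrides "small" where x < 0
         lambda v: jnp.zeros_like(v),   # "zero": overrides both where x == 0
         lambda v: v * 10.0])           # default: where no condition is True
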
@@ -4415,90 +4539,86 @@ def _aten_special_bessel_y1(self):
 
   def small(x):
     YP = jnp.array(
+        [
+            1.26320474790178026440e09,
+            -6.47355876379160291031e11,
+            1.14509511541823727583e14,
+            -8.12770255501325109621e15,
+            2.02439475713594898196e17,
+            -7.78877196265950026825e17,
+        ],
+        dtype=self.dtype,
     )
     YQ = jnp.array(
+        [
+            5.94301592346128195359e02,
+            2.35564092943068577943e05,
+            7.34811944459721705660e07,
+            1.87601316108706159478e10,
+            3.88231277496238566008e12,
+            6.20557727146953693363e14,
+            6.87141087355300489866e16,
+            3.97270608116560655612e18,
+        ],
+        dtype=self.dtype,
     )
 
     yp = op_base.foreach_loop(YP, lambda carry, yp_i: carry * (x * x) + yp_i)
     yq = op_base.foreach_loop(YQ, lambda carry, yq_i: carry * (x * x) + yq_i)
 
-    return (
-        0.636619772367581343075535053490057448
-        * (_aten_special_bessel_j1(x) * jnp.log(x) - 1.0 / x)
-    )
-    )
+    return (x * (yp / yq) +
+            (0.636619772367581343075535053490057448 *
+             (_aten_special_bessel_j1(x) * jnp.log(x) - 1.0 / x)))
 
   def default(x):
     PP = jnp.array(
+        [
+            7.62125616208173112003e-04,
+            7.31397056940917570436e-02,
+            1.12719608129684925192e00,
+            5.11207951146807644818e00,
+            8.42404590141772420927e00,
+            5.21451598682361504063e00,
+            1.00000000000000000254e00,
+        ],
+        dtype=self.dtype,
     )
     PQ = jnp.array(
+        [
+            5.71323128072548699714e-04,
+            6.88455908754495404082e-02,
+            1.10514232634061696926e00,
+            5.07386386128601488557e00,
+            8.39985554327604159757e00,
+            5.20982848682361821619e00,
+            9.99999999999999997461e-01,
+        ],
+        dtype=self.dtype,
     )
     QP = jnp.array(
+        [
+            5.10862594750176621635e-02,
+            4.98213872951233449420e00,
+            7.58238284132545283818e01,
+            3.66779609360150777800e02,
+            7.10856304998926107277e02,
+            5.97489612400613639965e02,
+            2.11688757100572135698e02,
+            2.52070205858023719784e01,
+        ],
+        dtype=self.dtype,
     )
     QQ = jnp.array(
+        [
+            7.42373277035675149943e01,
+            1.05644886038262816351e03,
+            4.98641058337653607651e03,
+            9.56231892404756170795e03,
+            7.99704160447350683650e03,
+            2.82619278517639096600e03,
+            3.36093607810698293419e02,
+        ],
+        dtype=self.dtype,
     )
 
     factor = 25.0 / (x * x)
@@ -4507,22 +4627,15 @@ def _aten_special_bessel_y1(self):
     qp = op_base.foreach_loop(QP, lambda carry, qp_i: carry * factor + qp_i)
     qq = op_base.foreach_loop(QQ, lambda carry, qq_i: carry * factor + qq_i)
 
-    return (
-        / x
-        * (qp / qq)
-        * jnp.cos(x - 2.356194490192344928846982537459627163)
-    )
-        * 0.797884560802865355879892119868763737
-        / jnp.sqrt(x)
-    )
+    return ((pp / pq * jnp.sin(x - 2.356194490192344928846982537459627163) +
+             5.0 / x *
+             (qp / qq) * jnp.cos(x - 2.356194490192344928846982537459627163)) *
+            0.797884560802865355879892119868763737 / jnp.sqrt(x))
 
   return jnp.piecewise(
+      self,
+      [self <= 5.0, self < 0., self == 0.],
+      [small, negative, zero, default],
   )
 
 
@@ -4533,11 +4646,13 @@ def _aten_special_chebyshev_polynomial_t(self, n):
 
   @jnp.vectorize
   def vectorized(x, n_i):
+
     def negative_n(x):
       return jnp.zeros_like(x)
 
     def one_x(x):
-      return jnp.where((x > 0) | (n_i % 2 == 0), jnp.ones_like(x),
+      return jnp.where((x > 0) | (n_i % 2 == 0), jnp.ones_like(x),
+                       -jnp.ones_like(x))
 
     def large_n_small_x(x):
       return jnp.cos(n_i * jnp.acos(x))
@@ -4549,24 +4664,18 @@ def _aten_special_chebyshev_polynomial_t(self, n):
       return x
 
     def default(x):
+
       def f(_, carry):
        p, q = carry
        return (q, 2 * x * q - p)
 
-      _, r
+      _, r = jax.lax.fori_loop(0, n_i - 1, f, init_val=(1., x))
      return r
 
-    return jnp.piecewise(
-        n_i == 0,
-        (n_i == 6) & (jnp.abs(x) < 1),
-        jnp.abs(x) == 1.,
-        n_i < 0
-        ],
-        [one_n, zero_n, large_n_small_x, one_x, negative_n, default]
-    )
+    return jnp.piecewise(x, [
+        n_i == 1, n_i == 0, (n_i == 6) & (jnp.abs(x) < 1),
+        jnp.abs(x) == 1., n_i < 0
+    ], [one_n, zero_n, large_n_small_x, one_x, negative_n, default])
 
   # Explcicitly vectorize since we must vectorizes over both self and n
   return vectorized(self, n.astype(jnp.int64))
@@ -4579,6 +4688,7 @@ def _aten_special_chebyshev_polynomial_u(self, n):
 
   @jnp.vectorize
   def vectorized(x, n_i):
+
     def negative_n(x):
       return jnp.zeros_like(x)
 
@@ -4588,9 +4698,9 @@ def _aten_special_chebyshev_polynomial_u(self, n):
     def large_n_small_x(x):
       sin_acos_x = jnp.sin(jnp.acos(x))
       return jnp.where(
+          sin_acos_x != 0,
+          jnp.sin((n_i + 1) * jnp.acos(x)) / sin_acos_x,
+          (n_i + 1) * jnp.cos((n_i + 1) * jnp.acos(x)) / x,
       )
 
     def zero_n(x):
@@ -4600,6 +4710,7 @@ def _aten_special_chebyshev_polynomial_u(self, n):
       return 2 * x
 
     def default(x):
+
       def f(_, carry):
        p, q = carry
        return (q, 2 * x * q - p)
@@ -4608,15 +4719,15 @@ def _aten_special_chebyshev_polynomial_u(self, n):
      return r
 
     return jnp.piecewise(
+        x,
+        [
+            n_i == 1,
+            n_i == 0,
+            (n_i > 8) & (jnp.abs(x) < 1),
+            jnp.abs(x) == 1.0,
+            n_i < 0,
+        ],
+        [one_n, zero_n, large_n_small_x, one_x, negative_n, default],
     )
 
   return vectorized(self, n.astype(jnp.int64))
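Note, not part of the diff: both Chebyshev kernels drive the three-term recurrence T_{k+1} = 2·x·T_k − T_{k−1} with jax.lax.fori_loop, exactly as the `f` bodies above do. A standalone sketch for polynomials of the first kind; `chebyshev_t` is a made-up helper and `n` is assumed to be a Python int ≥ 1.

    import jax
    import jax.numpy as jnp

    def chebyshev_t(x, n):
      # T_0 = 1, T_1 = x, T_{k+1} = 2 x T_k - T_{k-1}; same update as `f` above.
      def body(_, carry):
        p, q = carry
        return (q, 2 * x * q - p)

      _, t_n = jax.lax.fori_loop(0, n - 1, body, (jnp.ones_like(x), x))
      return t_n  # valid for n >= 1
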
@@ -4627,6 +4738,7 @@ def _aten_special_chebyshev_polynomial_u(self, n):
 def _aten_special_erfcx(x):
   return jnp.exp(x * x) * jax.lax.erfc(x)
 
+
 @op(torch.ops.aten.erfc)
 @op_base.promote_int_input
 def _aten_erfcx(x):
@@ -4640,6 +4752,7 @@ def _aten_special_hermite_polynomial_h(self, n):
 
   @jnp.vectorize
   def vectorized(x, n_i):
+
     def negative_n(x):
       return jnp.zeros_like(x)
 
@@ -4650,6 +4763,7 @@ def _aten_special_hermite_polynomial_h(self, n):
       return 2 * x
 
     def default(x):
+
       def f(k, carry):
        p, q = carry
        return (q, 2 * x * q - 2 * k * p)
@@ -4657,9 +4771,8 @@ def _aten_special_hermite_polynomial_h(self, n):
      _, r = jax.lax.fori_loop(1, n_i, f, init_val=(1.0, 2 * x))
      return r
 
-    return jnp.piecewise(
-    )
+    return jnp.piecewise(x, [n_i == 1, n_i == 0, n_i < 0],
+                         [one_n, zero_n, negative_n, default])
 
   return vectorized(self, n.astype(jnp.int64))
 
@@ -4671,6 +4784,7 @@ def _aten_special_hermite_polynomial_he(self, n):
 
   @jnp.vectorize
   def vectorized(x, n_i):
+
    def negative_n(x):
      return jnp.zeros_like(x)
 
@@ -4681,6 +4795,7 @@ def _aten_special_hermite_polynomial_he(self, n):
      return x
 
    def default(x):
+
      def f(k, carry):
        p, q = carry
        return (q, x * q - k * p)
@@ -4688,24 +4803,34 @@ def _aten_special_hermite_polynomial_he(self, n):
      _, r = jax.lax.fori_loop(1, n_i, f, init_val=(1.0, x))
      return r
 
-    return jnp.piecewise(
-    )
+    return jnp.piecewise(x, [n_i == 1.0, n_i == 0.0, n_i < 0],
+                         [one_n, zero_n, negative_n, default])
 
   return vectorized(self, n.astype(jnp.int64))
 
 
 @op(torch.ops.aten.multinomial, needs_env=True)
-def _aten_multinomial(input,
+def _aten_multinomial(input,
+                      num_samples,
+                      replacement=False,
+                      *,
+                      generator=None,
+                      out=None,
+                      env=None):
+  assert num_samples <= input.shape[
+      -1] or replacement, "cannot take a larger sample than population when replacement=False"
   key = env.get_and_rotate_prng_key(generator)
   if input.ndim == 1:
+    return jax.random.choice(
+        key, input.shape[-1], (num_samples,), replace=replacement, p=input)
   else:
+    return jnp.array([
+        jax.random.choice(
+            key,
+            input.shape[-1], (num_samples,),
+            replace=replacement,
+            p=input[i, :]) for i in range(input.shape[0])
+    ])
 
 
 @op(torch.ops.aten.narrow)
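Note, not part of the diff: the rewritten multinomial above defers the actual sampling to jax.random.choice. A minimal usage sketch; the PRNGKey here is a stand-in, the op itself draws its key from the torchax environment via env.get_and_rotate_prng_key.

    import jax
    import jax.numpy as jnp

    key = jax.random.PRNGKey(0)  # stand-in key for illustration only
    probs = jnp.array([0.1, 0.2, 0.7])
    draws = jax.random.choice(key, probs.shape[-1], (5,), replace=True, p=probs)
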
@@ -4738,7 +4863,12 @@ def _aten_flatten(x, start_dim=0, end_dim=-1):
 
 @op(torch.ops.aten.new_empty)
 def _new_empty(self, size, **kwargs):
+  dtype = kwargs.get('dtype')
+  if dtype is not None:
+    dtype = mappings.t2j_dtype(dtype)
+  else:
+    dtype = self.dtype
+  return jnp.empty(size, dtype=dtype)
 
 
 @op(torch.ops.aten.new_empty_strided)
@@ -4756,10 +4886,8 @@ def _aten_unsafe_index_put(self, indices, values, accumulate=False):
   return _aten_index_put(self, indices, values, accumulate)
 
 
-@op(torch.ops.aten.conj_physical,
-    torch.ops.aten.
-    torch.ops.aten._conj_physical,
-    torch.ops.aten._conj)
+@op(torch.ops.aten.conj_physical, torch.ops.aten.conj,
+    torch.ops.aten._conj_physical, torch.ops.aten._conj)
 def _aten_conj_physical(self):
   return jnp.conjugate(self)
 
@@ -4768,6 +4896,7 @@ def _aten_conj_physical(self):
 def _aten_log_sigmoid(x):
   return jax.nn.log_sigmoid(x)
 
+
 # torch.qr
 @op(torch.ops.aten.qr)
 def _aten_qr(input, *args, **kwargs):
@@ -4778,6 +4907,7 @@ def _aten_qr(input, *args, **kwargs):
     jax_mode = "complete"
   return jax.numpy.linalg.qr(input, mode=jax_mode)
 
+
 # torch.linalg.qr
 @op(torch.ops.aten.linalg_qr)
 def _aten_linalg_qr(input, *args, **kwargs):
@@ -4820,19 +4950,25 @@ def _aten__linalg_solve_ex(a, b):
   res = jnp.linalg.solve(a, b)
   if batched:
     res = res.squeeze(-1)
-  info_shape = a.shape[
+  info_shape = a.shape[:-2]
   info = jnp.zeros(info_shape, dtype=mappings.t2j_dtype(torch.int32))
   return res, info
 
 
 # torch.linalg.solve_triangular
 @op(torch.ops.aten.linalg_solve_triangular)
-def _aten_linalg_solve_triangular(a,
+def _aten_linalg_solve_triangular(a,
+                                  b,
+                                  *,
+                                  upper=True,
+                                  left=True,
+                                  unitriangular=False):
   if left is False:
     a = jnp.matrix_transpose(a)
     b = jnp.matrix_transpose(b)
     upper = not upper
-  res = jax.scipy.linalg.solve_triangular(
+  res = jax.scipy.linalg.solve_triangular(
+      a, b, lower=not upper, unit_diagonal=unitriangular)
   if left is False:
     res = jnp.matrix_transpose(res)
   return res
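Note, not part of the diff: the solve_triangular rewrite handles left=False (solve X·A = B) by transposing both operands, which also flips upper and lower. A small sketch of that identity; `solve_right` is a made-up helper name.

    import jax.numpy as jnp
    import jax.scipy.linalg

    # X A = B  <=>  A^T X^T = B^T; transposing a triangular A swaps upper/lower.
    def solve_right(a, b, upper=True, unitriangular=False):
      at = jnp.matrix_transpose(a)
      bt = jnp.matrix_transpose(b)
      xt = jax.scipy.linalg.solve_triangular(
          at, bt, lower=upper, unit_diagonal=unitriangular)
      return jnp.matrix_transpose(xt)
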
@@ -4852,21 +4988,31 @@ def _aten__linalg_check_errors(*args, **kwargs):
 
 @op(torch.ops.aten.median)
 def _aten_median(self, dim=None, keepdim=False):
-  output = _with_reduction_scalar(
+  output = _with_reduction_scalar(
+      functools.partial(jnp.quantile, q=0.5, method='lower'),
+      self,
+      dim=dim,
+      keepdim=keepdim).astype(self.dtype)
   if dim is None:
     return output
   else:
-    index = _with_reduction_scalar(_get_median_index, self, dim,
+    index = _with_reduction_scalar(_get_median_index, self, dim,
+                                   keepdim).astype(jnp.int64)
     return output, index
 
 
 @op(torch.ops.aten.nanmedian)
 def _aten_nanmedian(input, dim=None, keepdim=False, *, out=None):
-  output = _with_reduction_scalar(
+  output = _with_reduction_scalar(
+      functools.partial(jnp.nanquantile, q=0.5, method='lower'),
+      input,
+      dim=dim,
+      keepdim=keepdim).astype(input.dtype)
   if dim is None:
     return output
   else:
-    index = _with_reduction_scalar(_get_median_index, input, dim,
+    index = _with_reduction_scalar(_get_median_index, input, dim,
+                                   keepdim).astype(jnp.int64)
     return output, index
 
 
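Note, not part of the diff: median and nanmedian above use the 'lower' quantile method so the result is an actual element of the input rather than an interpolated value, which matches torch.median's convention. A quick check of that behaviour on made-up data.

    import functools
    import jax.numpy as jnp

    x = jnp.array([1., 2., 3., 4.])
    lower_median = functools.partial(jnp.quantile, q=0.5, method='lower')(x)
    # -> 2.0, an element of x; linear interpolation would give 2.5 here.
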
@@ -4874,20 +5020,31 @@ def _get_median_index(x, axis=None, keepdims=False):
   sorted_arg = jnp.argsort(x, axis=axis)
   n = x.shape[axis] if axis is not None else x.size
   if n % 2 == 1:
+    index = n // 2
   else:
+    index = (n // 2) - 1
   if axis is None:
+    median_index = sorted_arg[index]
   else:
+    median_index = jnp.take(sorted_arg, index, axis=axis)
   if keepdims and axis is not None:
+    median_index = jnp.expand_dims(median_index, axis)
   return median_index
 
+
 @op(torch.ops.aten.triangular_solve)
-def _aten_triangular_solve(b,
+def _aten_triangular_solve(b,
+                           a,
+                           upper=True,
+                           transpose=False,
+                           unittriangular=False):
+  return (jax.lax.linalg.triangular_solve(
+      a,
+      b,
+      left_side=True,
+      lower=not upper,
+      transpose_a=transpose,
+      unit_diagonal=unittriangular), a)
 
 
 # func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
@@ -4895,16 +5052,16 @@ def _aten_triangular_solve(b, a, upper=True, transpose=False, unittriangular=Fal
 def _aten__fft_c2c(self, dim, normalization, forward):
   if forward:
     norm = [
+        'backward',
+        'ortho',
+        'forward',
     ][normalization]
     return jnp.fft.fftn(self, axes=dim, norm=norm)
   else:
     norm = [
+        'forward',
+        'ortho',
+        'backward',
     ][normalization]
     return jnp.fft.ifftn(self, axes=dim, norm=norm)
 
@@ -4912,21 +5069,22 @@ def _aten__fft_c2c(self, dim, normalization, forward):
 @op(torch.ops.aten._fft_r2c)
 def _aten__fft_r2c(self, dim, normalization, onesided):
   norm = [
+      'backward',
+      'ortho',
+      'forward',
   ][normalization]
   if onesided:
     return jnp.fft.rfftn(self, axes=dim, norm=norm)
   else:
     return jnp.fft.fftn(self, axes=dim, norm=norm)
 
+
 @op(torch.ops.aten._fft_c2r)
 def _aten__fft_c2r(self, dim, normalization, last_dim_size):
   norm = [
+      'forward',
+      'ortho',
+      'backward',
   ][normalization]
   if len(dim) == 1:
     s = [last_dim_size]
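Note, not part of the diff: the integer `normalization` argument of these FFT ops simply indexes into the string lists above. A tiny sanity check of the forward-direction mapping; `FORWARD_NORM` is a made-up name for the same list.

    import jax.numpy as jnp

    FORWARD_NORM = ['backward', 'ortho', 'forward']
    x = jnp.arange(8.0)
    assert jnp.allclose(
        jnp.fft.fftn(x, norm=FORWARD_NORM[1]),
        jnp.fft.fftn(x, norm='ortho'))
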
@@ -4936,34 +5094,49 @@ def _aten__fft_c2r(self, dim, normalization, last_dim_size):
 
 
 @op(torch.ops.aten._trilinear)
-def _aten_trilinear(i1,
+def _aten_trilinear(i1,
+                    i2,
+                    i3,
+                    expand1,
+                    expand2,
+                    expand3,
+                    sumdim,
+                    unroll_dim=1):
+  return _aten_sum(
+      jnp.expand_dims(i1, expand1) * jnp.expand_dims(i2, expand2) *
+      jnp.expand_dims(i3, expand3), sumdim)
 
 
 @op(torch.ops.aten.max_unpool2d)
 @op(torch.ops.aten.max_unpool3d)
 def _aten_max_unpoolxd(input, indices, output_size, stride=None, padding=0):
   if output_size is None:
-    raise ValueError(
+    raise ValueError(
+        "output_size value is not set correctly. It cannot be None or empty.")
 
   output_size = [input.shape[0], input.shape[1]] + output_size
   output = jnp.zeros(output_size, dtype=input.dtype)
 
   for idx in np.ndindex(input.shape):
+    max_index = indices[idx]
+    spatial_dims = output_size[2:]  # (D, H, W)
+    unpooled_spatial_idx = np.unravel_index(max_index, spatial_dims)
+    full_idx = idx[:2] + unpooled_spatial_idx
+    output = output.at[full_idx].set(input[idx])
 
   return output
 
-def
+
+def _aten_upsample(input,
+                   output_size,
+                   align_corners,
+                   antialias,
+                   method,
+                   scale_factors=None,
+                   scales_h=None,
+                   scales_w=None):
   # input: is of type jaxlib.xla_extension.ArrayImpl
   image = input
-  method = "bilinear"
-  antialias = True  # ignored for upsampling
 
   # https://jax.readthedocs.io/en/latest/_autosummary/jax.image.resize.html
   # Resize does not distinguish batch, channel size.
@@ -4977,12 +5150,12 @@ def _aten_upsample_bilinear2d_aa(input, output_size, align_corners, scale_factor
   shape = list(image.shape)
   # overriding output_size
   if scale_factors:
-    shape[-1] = int(math.floor(shape[-1]*scale_factors[-1]))
-    shape[-2] = int(math.floor(shape[-2]*scale_factors[-2]))
+    shape[-1] = int(math.floor(shape[-1] * scale_factors[-1]))
+    shape[-2] = int(math.floor(shape[-2] * scale_factors[-2]))
   if scales_h:
-    shape[-2] = int(math.floor(shape[-2]*scales_h))
+    shape[-2] = int(math.floor(shape[-2] * scales_h))
   if scales_w:
-    shape[-1] = int(math.floor(shape[-1]*scales_w))
+    shape[-1] = int(math.floor(shape[-1] * scales_w))
   # output_size overrides scale_factors, scales_*
   if output_size:
     shape[-1] = output_size[-1]
@@ -4992,11 +5165,11 @@ def _aten_upsample_bilinear2d_aa(input, output_size, align_corners, scale_factor
   if shape == list(image.shape):
     return image
 
-  spatial_dims = (2,3)
+  spatial_dims = (2, 3)
   if len(shape) == 3:
-    spatial_dims = (1,2)
+    spatial_dims = (1, 2)
 
-  scale = list([shape[i] / image.shape[i]
+  scale = list([shape[i] / image.shape[i] for i in spatial_dims])
   if scale_factors:
     scale = scale_factors
   if scales_h:
@@ -5008,7 +5181,9 @@ def _aten_upsample_bilinear2d_aa(input, output_size, align_corners, scale_factor
   # align_corners is not supported in resize()
   # https://github.com/jax-ml/jax/issues/11206
   if align_corners:
-    scale = jnp.array([
+    scale = jnp.array([
+        (shape[i] - 1.0) / (image.shape[i] - 1.0) for i in spatial_dims
+    ])
 
   translation = jnp.array([0 for i in spatial_dims])
 
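Note, not part of the diff: the consolidated `_aten_upsample` above computes a target spatial shape and then resizes with JAX's image utilities. A simpler, related sketch with jax.image.resize, not the exact call used in the function above; the NCHW shape is made up.

    import jax
    import jax.numpy as jnp

    image = jnp.ones((1, 3, 8, 8))                    # made-up NCHW input
    out = jax.image.resize(image, (1, 3, 16, 16),     # batch/channel dims kept as-is
                           method="bilinear", antialias=True)
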
@@ -5022,12 +5197,53 @@ def _aten_upsample_bilinear2d_aa(input, output_size, align_corners, scale_factor
       antialias=antialias,
   )
 
+
+@op(torch.ops.aten._upsample_bilinear2d_aa)
+def _aten_upsample_billinear_aa(input,
+                                output_size,
+                                align_corners,
+                                scale_factors=None,
+                                scales_h=None,
+                                scales_w=None):
+  return _aten_upsample(
+      input,
+      output_size,
+      align_corners,
+      True,  # antialias
+      "bilinear",  # method
+      scale_factors,
+      scales_h,
+      scales_w)
+
+
+@op(torch.ops.aten._upsample_bicubic2d_aa)
+def _aten_upsample_bicubic2d_aa(input,
+                                output_size,
+                                align_corners,
+                                scale_factors=None,
+                                scales_h=None,
+                                scales_w=None):
+  return _aten_upsample(
+      input,
+      output_size,
+      align_corners,
+      True,  # antialias
+      "bicubic",  # method
+      scale_factors,
+      scales_h,
+      scales_w)
+
+
 @op(torch.ops.aten.polar)
 def _aten_polar(abs, angle, *, out=None):
   return jax.lax.complex(abs * jnp.cos(angle), abs * jnp.sin(angle))
 
+
 @op(torch.ops.aten.cdist)
-def _aten_cdist(x1,
+def _aten_cdist(x1,
+                x2,
+                p=2.0,
+                compute_mode='use_mm_for_euclid_dist_if_necessary'):
   x1 = x1.astype(jnp.float32)
   x2 = x2.astype(jnp.float32)
 
@@ -5036,7 +5252,8 @@ def _aten_cdist(x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary
     return _hamming_distance(x1, x2).astype(jnp.float32)
   elif p == 2.0:
     # Use optimized Euclidean distance calculation
-    if compute_mode == 'use_mm_for_euclid_dist_if_necessary' and (
+    if compute_mode == 'use_mm_for_euclid_dist_if_necessary' and (
+        x1.shape[-2] > 25 or x2.shape[-2] > 25):
       return _euclidean_mm(x1, x2)
     elif compute_mode == 'use_mm_for_euclid_dist':
       return _euclidean_mm(x1, x2)
@@ -5045,7 +5262,8 @@ def _aten_cdist(x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary
   else:
     # General p-norm distance calculation
     diff = jnp.abs(jnp.expand_dims(x1, -2) - jnp.expand_dims(x2, -3))
-    return jnp.sum(jnp.power(diff, p), axis=-1).astype(jnp.float32)
+    return jnp.sum(jnp.power(diff, p), axis=-1).astype(jnp.float32)**(1 / p)
+
 
 def _hamming_distance(x1, x2):
   """
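Note, not part of the diff: the general-p branch of cdist now takes the final 1/p root, giving a true Minkowski distance. A compact standalone version of that formula; `pnorm_cdist` is a made-up helper name.

    import jax.numpy as jnp

    def pnorm_cdist(x1, x2, p):
      # Pairwise Minkowski distance: (sum_k |x1_ik - x2_jk|^p) ** (1/p),
      # i.e. the general-p branch above including the 1/p root.
      diff = jnp.abs(jnp.expand_dims(x1, -2) - jnp.expand_dims(x2, -3))
      return jnp.sum(diff**p, axis=-1)**(1.0 / p)
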
@@ -5064,6 +5282,7 @@ def _hamming_distance(x1, x2):
 
   return hamming_dist
 
+
 def _euclidean_mm(x1, x2):
   """
   Computes the Euclidean distance using matrix multiplication.
@@ -5075,8 +5294,8 @@ def _euclidean_mm(x1, x2):
   Returns:
     JAX array of shape (..., P, R) representing pairwise Euclidean distances.
   """
-  x1_sq = jnp.sum(x1
-  x2_sq = jnp.sum(x2
+  x1_sq = jnp.sum(x1**2, axis=-1, keepdims=True).astype(jnp.float32)
+  x2_sq = jnp.sum(x2**2, axis=-1, keepdims=True).astype(jnp.float32)
 
   x2_sq = jnp.swapaxes(x2_sq, -2, -1)
 
@@ -5088,6 +5307,7 @@ def _euclidean_mm(x1, x2):
 
   return dist
 
+
 def _euclidean_direct(x1, x2):
   """
   Computes the Euclidean distance directly without matrix multiplication.
@@ -5101,7 +5321,7 @@ def _euclidean_direct(x1, x2):
   """
   diff = jnp.expand_dims(x1, -2) - jnp.expand_dims(x2, -3)
 
-  dist_sq = jnp.sum(diff
+  dist_sq = jnp.sum(diff**2, axis=-1).astype(jnp.float32)
 
   dist_sq = jnp.maximum(dist_sq, 0.0)
 
@@ -5109,13 +5329,14 @@ def _euclidean_direct(x1, x2):
 
   return dist
 
+
 @op(torch.ops.aten.lu_unpack)
 def _aten_lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
   # lu_unpack doesnt exist in jax.
   # Get commonly used data shape variables
   n = LU_data.shape[-2]
   m = LU_data.shape[-1]
-  dim = min(n,m)
+  dim = min(n, m)
 
   ### Compute the Lower and Upper triangle
   if unpack_data:
@@ -5130,7 +5351,7 @@ def _aten_lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
     start_indices = jnp.zeros(len(LU_data.shape), dtype=int)
     limit_indices = list(LU_data.shape)
     limit_indices[-1] = dim
-    L = jax.lax.slice(L, start_indices, limit_indices)
+    L = jax.lax.slice(L, start_indices, limit_indices)
 
     # Extract upper triangle
     U = jnp.triu(LU_data)
@@ -5160,13 +5381,15 @@ def _aten_lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
 
     # closure to be called for each input 2D matrix.
     def _lu_unpack_2d(p, pivot):
-      _pivot = pivot - 1
+      _pivot = pivot - 1  # pivots are offset by 1 in jax
       indices = jnp.array([*range(n)], dtype=jnp.int32)
+
       def update_indices(i, _indices):
         tmp = _indices[i]
         _indices = _indices.at[i].set(_indices[_pivot[i]])
         _indices = _indices.at[_pivot[i]].set(tmp)
         return _indices
+
       indices = jax.lax.fori_loop(0, _pivot.size, update_indices, indices)
       p = p[jnp.array(indices)]
       p = jnp.transpose(p)
@@ -5191,7 +5414,7 @@ def _aten_lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
     reshapedPivot = LU_pivots.reshape(newPivotshape)
 
     # vmap the reshaped 3d tensors
-    v_lu_unpack_2d = jax.vmap(_lu_unpack_2d, in_axes=(0,0))
+    v_lu_unpack_2d = jax.vmap(_lu_unpack_2d, in_axes=(0, 0))
     unpackedP = v_lu_unpack_2d(reshapedP, reshapedPivot)
 
     # reshape result back to P's shape
@@ -5210,3 +5433,204 @@ def linear(input, weight, bias=None):
|
|
|
5210
5433
|
if bias is not None:
|
|
5211
5434
|
res += bias
|
|
5212
5435
|
return res
|
|
5436
|
+
|
|
5437
|
+
|
|
5438
|
+
@op(torch.ops.aten.kthvalue)
|
|
5439
|
+
def kthvalue(input, k, dim=None, keepdim=False, *, out=None):
|
|
5440
|
+
if input.ndim == 0:
|
|
5441
|
+
return input, jnp.array(0)
|
|
5442
|
+
dimension = -1
|
|
5443
|
+
if dim is not None:
|
|
5444
|
+
dimension = dim
|
|
5445
|
+
while dimension < 0:
|
|
5446
|
+
dimension = dimension + input.ndim
|
|
5447
|
+
values = jax.lax.index_in_dim(
|
|
5448
|
+
jnp.partition(input, k - 1, dimension), k - 1, dimension, keepdim)
|
|
5449
|
+
indices = jax.lax.index_in_dim(
|
|
5450
|
+
jnp.argpartition(input, k - 1, dimension).astype('int64'), k - 1,
|
|
5451
|
+
dimension, keepdim)
|
|
5452
|
+
return values, indices
+
+
+@op(torch.ops.aten.take)
+def _aten_take(self, index):
+  return self.flatten()[index]
+
+
+# func: pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor
+@op(torch.ops.aten.pad)
+def _aten_pad(self, pad, mode='constant', value=None):
+  if not isinstance(pad, (tuple, list)) or len(pad) % 2 != 0:
+    raise ValueError("Padding must be a sequence of even length.")
+
+  num_dims = self.ndim
+  if len(pad) > 2 * num_dims:
+    raise ValueError(
+        f"Padding sequence length ({len(pad)}) exceeds 2 * number of dimensions ({2 * num_dims})."
+    )
+
+  # JAX's pad function expects padding for each dimension as a tuple of (low, high)
+  # We need to reverse the pad sequence and group them for JAX.
+  # pad = [p_l0, p_r0, p_l1, p_r1, ...]
+  # becomes ((..., ..., (p_l1, p_r1), (p_l0, p_r0)))
+  jax_pad_width = []
+  # Iterate in reverse pairs
+  for i in range(len(pad) // 2):
+    jax_pad_width.append((pad[(2 * i)], pad[(2 * i + 1)]))
+
+  # Pad any leading dimensions with (0, 0) if the pad sequence is shorter
+  # than the number of dimensions.
+  for _ in range(num_dims - len(pad) // 2):
+    jax_pad_width.append((0, 0))
+
+  # Reverse the jax_pad_width list to match the dimension order
+  jax_pad_width.reverse()
+
+  if mode == "constant":
+    if value is None:
+      value = 0.0
+    return jnp.pad(
+        self, pad_width=jax_pad_width, mode="constant", constant_values=value)
+  elif mode == "reflect":
+    return jnp.pad(self, pad_width=jax_pad_width, mode="reflect")
+  elif mode == "edge":
+    return jnp.pad(self, pad_width=jax_pad_width, mode="edge")
+  else:
+    raise ValueError(
+        f"Unsupported padding mode: {mode}. Expected 'constant', 'reflect', or 'edge'."
+    )
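The `_aten_pad` lowering above translates PyTorch's flat, last-dimension-first pad list into the per-dimension `(low, high)` pairs expected by `jnp.pad`. A small sketch of just that translation, with an illustrative helper name:

import jax.numpy as jnp

def torch_pad_to_jax(pad, ndim):
  # torch order: [left_lastdim, right_lastdim, left_secondlast, right_secondlast, ...]
  width = [(pad[2 * i], pad[2 * i + 1]) for i in range(len(pad) // 2)]
  width += [(0, 0)] * (ndim - len(pad) // 2)  # untouched leading dims
  width.reverse()                             # jnp.pad wants dim 0 first
  return width

x = jnp.ones((2, 3))
print(torch_pad_to_jax([1, 2], x.ndim))                     # [(0, 0), (1, 2)]
print(jnp.pad(x, torch_pad_to_jax([1, 2], x.ndim)).shape)   # (2, 6)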
+
+
+@op(torch.ops.aten.is_nonzero)
+def _aten_is_nonzero(a):
+  a = jnp.squeeze(a)
+  if a.shape == (0,):
+    raise RuntimeError('bool value of Tensor with no values is ambiguous')
+  if a.ndim != 0:
+    raise RuntimeError(
+        'bool value of Tensor with more than one value is ambiguous')
+  return a.item() != 0
+
+
+@op(torch.ops.aten.logit)
+def _aten_logit(self: jax.Array, eps: float | None = None) -> jax.Array:
+  """
+  Computes the logit function of the input tensor.
+
+  logit(p) = log(p / (1 - p))
+
+  Args:
+    self: Input tensor.
+    eps: A small value to clip the input tensor to avoid log(0) or division by zero.
+      If None, no clipping is performed.
+
+  Returns:
+    A tensor with the logit of each element of the input.
+  """
+  if eps is not None:
+    self = jnp.clip(self, eps, 1.0 - eps)
+  res = jnp.log(self / (1.0 - self))
+  res = res.astype(mappings.t2j_dtype(torch.get_default_dtype()))
+  return res
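`_aten_logit` above evaluates log(p / (1 - p)), optionally clipping the input to [eps, 1 - eps] first. A quick sanity check of the same expression against `jax.scipy.special.logit` (used here purely for comparison):

import jax.numpy as jnp
from jax.scipy.special import logit

p = jnp.array([0.1, 0.5, 0.9])
manual = jnp.log(p / (1.0 - p))
print(manual)    # approx [-2.197, 0., 2.197]
print(logit(p))  # matches the manual computation
# With eps given, the input would first be clipped: jnp.clip(p, eps, 1 - eps)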
+
+
+@op(torch.ops.aten.floor_divide)
+def _aten_floor_divide(x, y):
+  res = jnp.floor_divide(x, y)
+  return res
+
+
+@op(torch.ops.aten._assert_tensor_metadata)
+def _aten__assert_tensor_metadata(*args, **kwargs):
+  pass
+
+
+mutation_ops_to_functional = {
+    torch.ops.aten.add_:
+        op_base.InplaceOp(torch.ops.aten.add),
+    torch.ops.aten.sub_:
+        op_base.InplaceOp(torch.ops.aten.sub),
+    torch.ops.aten.mul_:
+        op_base.InplaceOp(torch.ops.aten.mul),
+    torch.ops.aten.div_:
+        op_base.InplaceOp(torch.ops.aten.div),
+    torch.ops.aten.pow_:
+        op_base.InplaceOp(torch.ops.aten.pow),
+    torch.ops.aten.lt_:
+        op_base.InplaceOp(torch.ops.aten.lt),
+    torch.ops.aten.le_:
+        op_base.InplaceOp(torch.ops.aten.le),
+    torch.ops.aten.gt_:
+        op_base.InplaceOp(torch.ops.aten.gt),
+    torch.ops.aten.ge_:
+        op_base.InplaceOp(torch.ops.aten.ge),
+    torch.ops.aten.eq_:
+        op_base.InplaceOp(torch.ops.aten.eq),
+    torch.ops.aten.ne_:
+        op_base.InplaceOp(torch.ops.aten.ne),
+    torch.ops.aten.bernoulli_:
+        op_base.InplaceOp(torch.ops.aten.bernoulli.p),
+    torch.ops.aten.bernoulli_.float:
+        op_base.InplaceOp(_aten_bernoulli, is_jax_func=True),
+    torch.ops.aten.geometric_:
+        op_base.InplaceOp(torch.ops.aten.geometric),
+    torch.ops.aten.normal_:
+        op_base.InplaceOp(torch.ops.aten.normal),
+    torch.ops.aten.random_:
+        op_base.InplaceOp(torch.ops.aten.uniform),
+    torch.ops.aten.uniform_:
+        op_base.InplaceOp(torch.ops.aten.uniform),
+    torch.ops.aten.relu_:
+        op_base.InplaceOp(torch.ops.aten.relu),
+    # squeeze_ is expected to change tensor's shape. So replace with new value
+    torch.ops.aten.squeeze_:
+        op_base.InplaceOp(torch.ops.aten.squeeze, True),
+    torch.ops.aten.sqrt_:
+        op_base.InplaceOp(torch.ops.aten.sqrt),
+    torch.ops.aten.clamp_:
+        op_base.InplaceOp(torch.ops.aten.clamp),
+    torch.ops.aten.clamp_min_:
+        op_base.InplaceOp(torch.ops.aten.clamp_min),
+    torch.ops.aten.sigmoid_:
+        op_base.InplaceOp(torch.ops.aten.sigmoid),
+    torch.ops.aten.tanh_:
+        op_base.InplaceOp(torch.ops.aten.tanh),
+    torch.ops.aten.ceil_:
+        op_base.InplaceOp(torch.ops.aten.ceil),
+    torch.ops.aten.logical_not_:
+        op_base.InplaceOp(torch.ops.aten.logical_not),
+    torch.ops.aten.unsqueeze_:
+        op_base.InplaceOp(torch.ops.aten.unsqueeze),
+    torch.ops.aten.transpose_:
+        op_base.InplaceOp(torch.ops.aten.transpose),
+    torch.ops.aten.log_normal_:
+        op_base.InplaceOp(torch.ops.aten.log_normal),
+    torch.ops.aten.scatter_add_:
+        op_base.InplaceOp(torch.ops.aten.scatter_add),
+    torch.ops.aten.scatter_reduce_.two:
+        op_base.InplaceOp(torch.ops.aten.scatter_reduce),
+    torch.ops.aten.scatter_:
+        op_base.InplaceOp(torch.ops.aten.scatter),
+    torch.ops.aten.bitwise_or_:
+        op_base.InplaceOp(torch.ops.aten.bitwise_or),
+    torch.ops.aten.floor_divide_:
+        op_base.InplaceOp(torch.ops.aten.floor_divide),
+    torch.ops.aten.remainder_:
+        op_base.InplaceOp(torch.ops.aten.remainder),
+}
+
+# Note: tuple comparisons work intuitively, e.g. `_jax_version >= (0, 4, 32)`.
+_jax_version = tuple(int(v) for v in jax.version._version.split("."))
+
+mutation_needs_env = {
+    torch.ops.aten.bernoulli_,
+    torch.ops.aten.bernoulli_.float,
+}
+
+for operator, mutation in mutation_ops_to_functional.items():
+  ops_registry.register_torch_dispatch_op(
+      operator,
+      mutation,
+      is_jax_function=False,
+      is_view_op=True,
+      needs_env=(operator in mutation_needs_env))