warp-lang 1.9.0-py3-none-win_amd64.whl → 1.9.1-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of warp-lang has been flagged as potentially problematic.

warp/codegen.py CHANGED
@@ -1244,6 +1244,11 @@ class Adjoint:
             A line directive for the given statement, or None if no line directive is needed.
         """
 
+        if adj.filename == "unknown source file" or adj.fun_lineno == 0:
+            # Early return if function is not associated with a source file or is otherwise invalid
+            # TODO: Get line directives working with wp.map() functions
+            return None
+
         # lineinfo is enabled by default in debug mode regardless of the builder option, don't want to unnecessarily
         # emit line directives in generated code if it's not being compiled with line information
         build_mode = val if (val := adj.builder_options.get("mode")) is not None else warp.config.mode
warp/config.py CHANGED
@@ -15,7 +15,7 @@
 
 from typing import Optional
 
-version: str = "1.9.0"
+version: str = "1.9.1"
 """Warp version string"""
 
 verify_fp: bool = False
warp/context.py CHANGED
@@ -2244,21 +2244,7 @@ class Module:
         return self.hashers[block_dim].get_module_hash()
 
     def _use_ptx(self, device) -> bool:
-        # determine whether to use PTX or CUBIN
-        if device.is_cubin_supported:
-            # get user preference specified either per module or globally
-            preferred_cuda_output = self.options.get("cuda_output") or warp.config.cuda_output
-            if preferred_cuda_output is not None:
-                use_ptx = preferred_cuda_output == "ptx"
-            else:
-                # determine automatically: older drivers may not be able to handle PTX generated using newer
-                # CUDA Toolkits, in which case we fall back on generating CUBIN modules
-                use_ptx = runtime.driver_version >= runtime.toolkit_version
-        else:
-            # CUBIN not an option, must use PTX (e.g. CUDA Toolkit too old)
-            use_ptx = True
-
-        return use_ptx
+        return device.get_cuda_output_format(self.options.get("cuda_output")) == "ptx"
 
     def get_module_identifier(self) -> str:
         """Get an abbreviated module name to use for directories and files in the cache.
@@ -2278,19 +2264,7 @@ class Module:
         if device is None:
             device = runtime.get_device()
 
-        if device.is_cpu:
-            return None
-
-        if self._use_ptx(device):
-            # use the default PTX arch if the device supports it
-            if warp.config.ptx_target_arch is not None:
-                output_arch = min(device.arch, warp.config.ptx_target_arch)
-            else:
-                output_arch = min(device.arch, runtime.default_ptx_arch)
-        else:
-            output_arch = device.arch
-
-        return output_arch
+        return device.get_cuda_compile_arch()
 
     def get_compile_output_name(
         self, device: Device | None, output_arch: int | None = None, use_ptx: bool | None = None
@@ -3327,6 +3301,78 @@ class Device:
         else:
             return False
 
+    def get_cuda_output_format(self, preferred_cuda_output: str | None = None) -> str | None:
+        """Determine the CUDA output format to use for this device.
+
+        This method is intended for internal use by Warp's compilation system.
+        External users should not need to call this method directly.
+
+        It determines whether to use PTX or CUBIN output based on device capabilities,
+        caller preferences, and runtime constraints.
+
+        Args:
+            preferred_cuda_output: Caller's preferred format (``"ptx"``, ``"cubin"``, or ``None``).
+                If ``None``, falls back to global config or automatic determination.
+
+        Returns:
+            The output format to use: ``"ptx"``, ``"cubin"``, or ``None`` for CPU devices.
+        """
+
+        if self.is_cpu:
+            # CPU devices don't use CUDA compilation
+            return None
+
+        if not self.is_cubin_supported:
+            return "ptx"
+
+        # Use provided preference or fall back to global config
+        if preferred_cuda_output is None:
+            preferred_cuda_output = warp.config.cuda_output
+
+        if preferred_cuda_output is not None:
+            # Caller specified a preference, use it if supported
+            if preferred_cuda_output in ("ptx", "cubin"):
+                return preferred_cuda_output
+            else:
+                # Invalid preference, fall back to automatic determination
+                pass
+
+        # Determine automatically: Older drivers may not be able to handle PTX generated using newer CUDA Toolkits,
+        # in which case we fall back on generating CUBIN modules
+        return "ptx" if self.runtime.driver_version >= self.runtime.toolkit_version else "cubin"
+
+    def get_cuda_compile_arch(self) -> int | None:
+        """Get the CUDA architecture to use when compiling code for this device.
+
+        This method is intended for internal use by Warp's compilation system.
+        External users should not need to call this method directly.
+
+        Determines the appropriate compute capability version to use when compiling
+        CUDA kernels for this device. The architecture depends on the device's
+        CUDA output format preference and available target architectures.
+
+        For PTX output format, uses the minimum of the device's architecture and
+        the configured PTX target architecture to ensure compatibility.
+        For CUBIN output format, uses the device's exact architecture.
+
+        Returns:
+            The compute capability version (e.g., 75 for ``sm_75``) to use for compilation,
+            or ``None`` for CPU devices which don't use CUDA compilation.
+        """
+        if self.is_cpu:
+            return None
+
+        if self.get_cuda_output_format() == "ptx":
+            # use the default PTX arch if the device supports it
+            if warp.config.ptx_target_arch is not None:
+                output_arch = min(self.arch, warp.config.ptx_target_arch)
+            else:
+                output_arch = min(self.arch, runtime.default_ptx_arch)
+        else:
+            output_arch = self.arch
+
+        return output_arch
+
 
 """ Meta-type for arguments that can be resolved to a concrete Device.
 """
@@ -4036,6 +4082,8 @@ class Runtime:
         self.core.wp_cuda_graph_insert_if_else.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
+            ctypes.c_int,
+            ctypes.c_bool,
             ctypes.POINTER(ctypes.c_int),
             ctypes.POINTER(ctypes.c_void_p),
             ctypes.POINTER(ctypes.c_void_p),
@@ -4045,6 +4093,8 @@ class Runtime:
         self.core.wp_cuda_graph_insert_while.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
+            ctypes.c_int,
+            ctypes.c_bool,
             ctypes.POINTER(ctypes.c_int),
             ctypes.POINTER(ctypes.c_void_p),
             ctypes.POINTER(ctypes.c_uint64),
@@ -4054,6 +4104,8 @@ class Runtime:
        self.core.wp_cuda_graph_set_condition.argtypes = [
             ctypes.c_void_p,
             ctypes.c_void_p,
+            ctypes.c_int,
+            ctypes.c_bool,
             ctypes.POINTER(ctypes.c_int),
             ctypes.c_uint64,
         ]
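Together, these hunks wire the new Device helpers into the CUDA conditional-node bindings: the two extra ctypes arguments (an int architecture and a bool PTX flag) are exactly the values produced by Device.get_cuda_compile_arch() and Device.get_cuda_output_format(). Both helpers are documented as internal, but the decision flow they expose can be illustrated with a minimal sketch (assuming an initialized CUDA device and default configuration):

import warp as wp

wp.init()
device = wp.get_device("cuda:0")

# Internal helpers added in 1.9.1, shown here only to illustrate the decision flow.
fmt = device.get_cuda_output_format()   # "ptx" or "cubin"; "ptx" when the driver is at least as new as the toolkit
arch = device.get_cuda_compile_arch()   # e.g. 75 for sm_75; capped by warp.config.ptx_target_arch for PTX output
print(fmt, arch)

# A per-module preference can still be threaded through, as Module._use_ptx() now does:
assert device.get_cuda_output_format("cubin") in ("ptx", "cubin")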
@@ -7053,6 +7105,8 @@ def capture_if(
     if not runtime.core.wp_cuda_graph_insert_if_else(
         device.context,
         stream.cuda_stream,
+        device.get_cuda_compile_arch(),
+        device.get_cuda_output_format() == "ptx",
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
         None if on_true is None else ctypes.byref(graph_on_true),
         None if on_false is None else ctypes.byref(graph_on_false),
@@ -7117,7 +7171,9 @@ def capture_if(
         capture_resume(main_graph, stream=stream)
 
 
-def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph, stream: Stream = None, **kwargs):
+def capture_while(
+    condition: warp.array(dtype=int), while_body: Callable | Graph, stream: Stream | None = None, **kwargs
+):
     """Create a dynamic loop based on a condition.
 
     The condition value is retrieved from the first element of the ``condition`` array.
@@ -7185,6 +7241,8 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     if not runtime.core.wp_cuda_graph_insert_while(
         device.context,
         stream.cuda_stream,
+        device.get_cuda_compile_arch(),
+        device.get_cuda_output_format() == "ptx",
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
         ctypes.byref(body_graph),
         ctypes.byref(cond_handle),
@@ -7218,6 +7276,8 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph
     if not runtime.core.wp_cuda_graph_set_condition(
         device.context,
         stream.cuda_stream,
+        device.get_cuda_compile_arch(),
+        device.get_cuda_output_format() == "ptx",
         ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
         cond_handle,
     ):
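The extra arguments passed to wp_cuda_graph_insert_while() and wp_cuda_graph_set_condition() forward the device's compile architecture and output format to the native conditional-node code; the Python-facing API is unchanged. A minimal capture_while() sketch, assuming a driver recent enough to support CUDA conditional graph nodes:

import warp as wp

wp.init()

@wp.kernel
def countdown(counter: wp.array(dtype=int)):
    # decrement until zero; the captured loop exits once counter[0] == 0
    if counter[0] > 0:
        counter[0] = counter[0] - 1

counter = wp.array([10], dtype=int, device="cuda:0")

with wp.ScopedCapture("cuda:0") as capture:
    wp.capture_while(counter, lambda: wp.launch(countdown, dim=1, inputs=[counter], device="cuda:0"))

wp.capture_launch(capture.graph)
wp.synchronize_device("cuda:0")
print(counter.numpy())  # expected: [0]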
@@ -7748,6 +7808,7 @@ def export_stubs(file): # pragma: no cover
     print("from typing import Callable", file=file)
     print("from typing import TypeVar", file=file)
     print("from typing import Generic", file=file)
+    print("from typing import Sequence", file=file)
     print("from typing import overload as over", file=file)
     print(file=file)
 
@@ -7776,7 +7837,7 @@ def export_stubs(file): # pragma: no cover
     print(header, file=file)
     print(file=file)
 
-    def add_stub(f):
+    def add_builtin_function_stub(f):
         args = ", ".join(f"{k}: {type_str(v)}" for k, v in f.input_types.items())
 
         return_str = ""
@@ -7796,12 +7857,162 @@ def export_stubs(file): # pragma: no cover
         print('    """', file=file)
         print("    ...\n\n", file=file)
 
+    def add_vector_type_stub(cls, label):
+        cls_name = cls.__name__
+        scalar_type_name = cls._wp_scalar_type_.__name__
+
+        print(f"class {cls_name}:", file=file)
+
+        print("    @over", file=file)
+        print("    def __init__(self) -> None:", file=file)
+        print(f'        """Construct a zero-initialized {label}."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, other: {cls_name}) -> None:", file=file)
+        print(f'        """Construct a {label} by copy."""', file=file)
+        print("        ...\n\n", file=file)
+
+        args = ", ".join(f"{x}: {scalar_type_name}" for x in "xyzw"[: cls._length_])
+        print("    @over", file=file)
+        print(f"    def __init__(self, {args}) -> None:", file=file)
+        print(f'        """Construct a {label} from its component values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, args: Sequence[{scalar_type_name}]) -> None:", file=file)
+        print(f'        """Construct a {label} from a sequence of values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, value: {scalar_type_name}) -> None:", file=file)
+        print(f'        """Construct a {label} filled with a value."""', file=file)
+        print("        ...\n\n", file=file)
+
+    def add_matrix_type_stub(cls, label):
+        cls_name = cls.__name__
+        scalar_type_name = cls._wp_scalar_type_.__name__
+        scalar_short_name = warp.types.scalar_short_name(cls._wp_scalar_type_)
+
+        print(f"class {cls_name}:", file=file)
+
+        print("    @over", file=file)
+        print("    def __init__(self) -> None:", file=file)
+        print(f'        """Construct a zero-initialized {label}."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, other: {cls_name}) -> None:", file=file)
+        print(f'        """Construct a {label} by copy."""', file=file)
+        print("        ...\n\n", file=file)
+
+        args = ", ".join(f"m{i}{j}: {scalar_type_name}" for i in range(cls._shape_[0]) for j in range(cls._shape_[1]))
+        print("    @over", file=file)
+        print(f"    def __init__(self, {args}) -> None:", file=file)
+        print(f'        """Construct a {label} from its component values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        args = ", ".join(f"v{i}: vec{cls._shape_[0]}{scalar_short_name}" for i in range(cls._shape_[0]))
+        print("    @over", file=file)
+        print(f"    def __init__(self, {args}) -> None:", file=file)
+        print(f'        """Construct a {label} from its row vectors."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, args: Sequence[{scalar_type_name}]) -> None:", file=file)
+        print(f'        """Construct a {label} from a sequence of values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, value: {scalar_type_name}) -> None:", file=file)
+        print(f'        """Construct a {label} filled with a value."""', file=file)
+        print("        ...\n\n", file=file)
+
+    def add_transform_type_stub(cls, label):
+        cls_name = cls.__name__
+        scalar_type_name = cls._wp_scalar_type_.__name__
+        scalar_short_name = warp.types.scalar_short_name(cls._wp_scalar_type_)
+
+        print(f"class {cls_name}:", file=file)
+
+        print("    @over", file=file)
+        print("    def __init__(self) -> None:", file=file)
+        print(f'        """Construct a zero-initialized {label}."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, other: {cls_name}) -> None:", file=file)
+        print(f'        """Construct a {label} by copy."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, p: vec3{scalar_short_name}, q: quat{scalar_short_name}) -> None:", file=file)
+        print(f'        """Construct a {label} from its p and q components."""', file=file)
+        print("        ...\n\n", file=file)
+
+        args = ()
+        args += tuple(f"p{x}: {scalar_type_name}" for x in "xyz")
+        args += tuple(f"q{x}: {scalar_type_name}" for x in "xyzw")
+        args = ", ".join(args)
+        print("    @over", file=file)
+        print(f"    def __init__(self, {args}) -> None:", file=file)
+        print(f'        """Construct a {label} from its component values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(
+            f"    def __init__(self, p: Sequence[{scalar_type_name}], q: Sequence[{scalar_type_name}]) -> None:",
+            file=file,
+        )
+        print(f'        """Construct a {label} from two sequences of values."""', file=file)
+        print("        ...\n\n", file=file)
+
+        print("    @over", file=file)
+        print(f"    def __init__(self, value: {scalar_type_name}) -> None:", file=file)
+        print(f'        """Construct a {label} filled with a value."""', file=file)
+        print("        ...\n\n", file=file)
+
+    # Vector types.
+    suffixes = ("h", "f", "d", "b", "ub", "s", "us", "i", "ui", "l", "ul")
+    for length in (2, 3, 4):
+        for suffix in suffixes:
+            cls = getattr(warp.types, f"vec{length}{suffix}")
+            add_vector_type_stub(cls, "vector")
+
+        print(f"vec{length} = vec{length}f", file=file)
+
+    # Matrix types.
+    suffixes = ("h", "f", "d")
+    for length in (2, 3, 4):
+        shape = f"{length}{length}"
+        for suffix in suffixes:
+            cls = getattr(warp.types, f"mat{shape}{suffix}")
+            add_matrix_type_stub(cls, "matrix")
+
+        print(f"mat{shape} = mat{shape}f", file=file)
+
+    # Quaternion types.
+    suffixes = ("h", "f", "d")
+    for suffix in suffixes:
+        cls = getattr(warp.types, f"quat{suffix}")
+        add_vector_type_stub(cls, "quaternion")
+
+    print("quat = quatf", file=file)
+
+    # Transformation types.
+    suffixes = ("h", "f", "d")
+    for suffix in suffixes:
+        cls = getattr(warp.types, f"transform{suffix}")
+        add_transform_type_stub(cls, "transformation")
+
+    print("transform = transformf", file=file)
+
     for g in builtin_functions.values():
         if hasattr(g, "overloads"):
             for f in g.overloads:
-                add_stub(f)
+                add_builtin_function_stub(f)
         elif isinstance(g, Function):
-            add_stub(g)
+            add_builtin_function_stub(g)
 
 
 def export_builtins(file: io.TextIOBase): # pragma: no cover
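The stub generation above only affects the generated stubs file used by IDEs and type checkers; the constructor overloads it documents already exist at runtime. A few of the documented forms, for illustration:

import warp as wp

v0 = wp.vec3()                  # zero-initialized
v1 = wp.vec3(1.0, 2.0, 3.0)     # from component values
v2 = wp.vec3([1.0, 2.0, 3.0])   # from a sequence of values
v3 = wp.vec3(7.0)               # filled with a value

m = wp.mat33(
    wp.vec3(1.0, 0.0, 0.0),
    wp.vec3(0.0, 1.0, 0.0),
    wp.vec3(0.0, 0.0, 1.0),
)  # from row vectors

t = wp.transform(wp.vec3(0.0, 1.0, 0.0), wp.quat_identity())  # from p and q components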
@@ -45,7 +45,8 @@ def sincos_kernel(angle: wp.array(dtype=float), sin_out: wp.array(dtype=float),
 @wp.kernel
 def diagonal_kernel(output: wp.array(dtype=wp.mat33)):
     tid = wp.tid()
-    output[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0)
+    d = float(tid + 1)
+    output[tid] = wp.mat33(d, 0.0, 0.0, 0.0, d * 2.0, 0.0, 0.0, 0.0, d * 3.0)
 
 
 @wp.kernel
@@ -19,6 +19,7 @@ import warp as wp
 from warp.context import type_str
 from warp.jax import get_jax_device
 from warp.types import array_t, launch_bounds_t, strides_from_shape
+from warp.utils import warn
 
 _jax_warp_p = None
 
@@ -28,7 +29,7 @@ _registered_kernels = [None]
 _registered_kernel_to_id = {}
 
 
-def jax_kernel(kernel, launch_dims=None):
+def jax_kernel(kernel, launch_dims=None, quiet=False):
     """Create a Jax primitive from a Warp kernel.
 
     NOTE: This is an experimental feature under development.
@@ -38,6 +39,7 @@ def jax_kernel(kernel, launch_dims=None):
         launch_dims: Optional. Specify the kernel launch dimensions. If None,
             dimensions are inferred from the shape of the first argument.
             This option when set will specify the output dimensions.
+        quiet: Optional. If True, suppress deprecation warnings with newer JAX versions.
 
     Limitations:
         - All kernel arguments must be contiguous arrays.
@@ -46,6 +48,27 @@ def jax_kernel(kernel, launch_dims=None):
         - Only the CUDA backend is supported.
     """
 
+    import jax
+
+    # check if JAX version supports this
+    if jax.__version_info__ < (0, 4, 25) or jax.__version_info__ >= (0, 8, 0):
+        msg = (
+            "This version of jax_kernel() requires JAX version 0.4.25 - 0.7.x, "
+            f"but installed JAX version is {jax.__version_info__}."
+        )
+        if jax.__version_info__ >= (0, 8, 0):
+            msg += " Please use warp.jax_experimental.ffi.jax_kernel instead."
+        raise RuntimeError(msg)
+
+    # deprecation warning
+    if jax.__version_info__ >= (0, 5, 0) and not quiet:
+        warn(
+            "This version of jax_kernel() is deprecated and will not be supported with newer JAX versions. "
+            "Please use the newer FFI version instead (warp.jax_experimental.ffi.jax_kernel). "
+            "In Warp release 1.10, the FFI version will become the default implementation of jax_kernel().",
+            DeprecationWarning,
+        )
+
     if _jax_warp_p is None:
         # Create and register the primitive
         _create_jax_warp_primitive()
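The new quiet flag only silences the deprecation warning; the version guard still raises for JAX 0.8.0 and newer. A minimal sketch of the deprecated custom_call path (assuming a JAX install in the supported 0.4.25 - 0.7.x range and a CUDA device):

import jax.numpy as jnp
import warp as wp
from warp.jax_experimental import jax_kernel

@wp.kernel
def scale(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    tid = wp.tid()
    y[tid] = 2.0 * x[tid]

# quiet=True suppresses the DeprecationWarning emitted for JAX >= 0.5.0
jax_scale = jax_kernel(scale, quiet=True)
out = jax_scale(jnp.arange(16, dtype=jnp.float32))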
@@ -29,6 +29,18 @@ from warp.types import array_t, launch_bounds_t, strides_from_shape, type_to_war
 from .xla_ffi import *
 
 
+def check_jax_version():
+    # check if JAX version supports this
+    if jax.__version_info__ < (0, 5, 0):
+        msg = (
+            "This version of jax_kernel() requires JAX version 0.5.0 or higher, "
+            f"but installed JAX version is {jax.__version_info__}."
+        )
+        if jax.__version_info__ >= (0, 4, 25):
+            msg += " Please use warp.jax_experimental.custom_call.jax_kernel instead."
+        raise RuntimeError(msg)
+
+
 class GraphMode(IntEnum):
     NONE = 0  # don't capture a graph
     JAX = 1  # let JAX capture a graph
@@ -668,8 +680,12 @@ def jax_kernel(
         - There must be at least one output or input-output argument.
         - Only the CUDA backend is supported.
     """
+
+    check_jax_version()
+
     key = (
         kernel.func,
+        kernel.sig,
         num_outputs,
         vmap_method,
         tuple(launch_dims) if launch_dims else launch_dims,
@@ -726,6 +742,8 @@ def jax_callable(
         - Only the CUDA backend is supported.
     """
 
+    check_jax_version()
+
     if graph_compatible is not None:
         wp.utils.warn(
             "The `graph_compatible` argument is deprecated, use `graph_mode` instead.",
@@ -772,6 +790,8 @@ def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = Tr
         graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
     """
 
+    check_jax_version()
+
     # TODO check that the name is not already registered
 
     def ffi_callback(call_frame):
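check_jax_version() guards the FFI entry points that the deprecation message above steers users toward. For reference, a minimal sketch of the recommended FFI path (assuming JAX 0.5.0 or newer and a CUDA device; by default the trailing kernel argument is treated as the single output):

import jax
import jax.numpy as jnp
import warp as wp
from warp.jax_experimental.ffi import jax_kernel

@wp.kernel
def add_vec(a: wp.array(dtype=float), b: wp.array(dtype=float), out: wp.array(dtype=float)):
    tid = wp.tid()
    out[tid] = a[tid] + b[tid]

jax_add = jax_kernel(add_vec)  # num_outputs defaults to 1

@jax.jit
def f(a, b):
    return jax_add(a, b)

a = jnp.arange(16, dtype=jnp.float32)
print(f(a, a))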
@@ -475,17 +475,26 @@ _xla_data_type_to_constructor = {
     XLA_FFI_DataType.C64: jnp.complex64,
     XLA_FFI_DataType.C128: jnp.complex128,
     # XLA_FFI_DataType.TOKEN
-    XLA_FFI_DataType.F8E5M2: jnp.float8_e5m2,
-    XLA_FFI_DataType.F8E3M4: jnp.float8_e3m4,
-    XLA_FFI_DataType.F8E4M3: jnp.float8_e4m3,
-    XLA_FFI_DataType.F8E4M3FN: jnp.float8_e4m3fn,
-    XLA_FFI_DataType.F8E4M3B11FNUZ: jnp.float8_e4m3b11fnuz,
-    XLA_FFI_DataType.F8E5M2FNUZ: jnp.float8_e5m2fnuz,
-    XLA_FFI_DataType.F8E4M3FNUZ: jnp.float8_e4m3fnuz,
     # XLA_FFI_DataType.F4E2M1FN: jnp.float4_e2m1fn.dtype,
     # XLA_FFI_DataType.F8E8M0FNU: jnp.float8_e8m0fnu.dtype,
 }
 
+# newer types not supported by older versions
+if hasattr(jnp, "float8_e5m2"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E5M2] = jnp.float8_e5m2
+if hasattr(jnp, "float8_e3m4"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E3M4] = jnp.float8_e3m4
+if hasattr(jnp, "float8_e4m3"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3] = jnp.float8_e4m3
+if hasattr(jnp, "float8_e4m3fn"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3FN] = jnp.float8_e4m3fn
+if hasattr(jnp, "float8_e4m3b11fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3B11FNUZ] = jnp.float8_e4m3b11fnuz
+if hasattr(jnp, "float8_e5m2fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E5M2FNUZ] = jnp.float8_e5m2fnuz
+if hasattr(jnp, "float8_e4m3fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3FNUZ] = jnp.float8_e4m3fnuz
+
 
 ########################################################################
 # Helpers for translating between ctypes and python types
warp/native/builtin.h CHANGED
@@ -1093,8 +1093,8 @@ CUDA_CALLABLE inline T select(const C& cond, const T& a, const T& b)
     return (!!cond) ? b : a;
 }
 
-template <typename C, typename T>
-CUDA_CALLABLE inline void adj_select(const C& cond, const T& a, const T& b, C& adj_cond, T& adj_a, T& adj_b, const T& adj_ret)
+template <typename C, typename TA, typename TB, typename TRet>
+CUDA_CALLABLE inline void adj_select(const C& cond, const TA& a, const TB& b, C& adj_cond, TA& adj_a, TB& adj_b, const TRet& adj_ret)
 {
     // The double NOT operator !! casts to bool without compiler warnings.
     if (!!cond)
@@ -1110,8 +1110,8 @@ CUDA_CALLABLE inline T where(const C& cond, const T& a, const T& b)
     return (!!cond) ? a : b;
 }
 
-template <typename C, typename T>
-CUDA_CALLABLE inline void adj_where(const C& cond, const T& a, const T& b, C& adj_cond, T& adj_a, T& adj_b, const T& adj_ret)
+template <typename C, typename TA, typename TB, typename TRet>
+CUDA_CALLABLE inline void adj_where(const C& cond, const TA& a, const TB& b, C& adj_cond, TA& adj_a, TB& adj_b, const TRet& adj_ret)
 {
     // The double NOT operator !! casts to bool without compiler warnings.
     if (!!cond)
warp/native/sort.cu CHANGED
@@ -23,7 +23,7 @@
 
 #include <cub/cub.cuh>
 
-#include <map>
+#include <unordered_map>
 
 // temporary buffer for radix sort
 struct RadixSortTemp
@@ -32,8 +32,8 @@ struct RadixSortTemp
     size_t size = 0;
 };
 
-// map temp buffers to CUDA contexts
-static std::map<void*, RadixSortTemp> g_radix_sort_temp_map;
+// use unique temp buffers per CUDA stream to avoid race conditions
+static std::unordered_map<void*, RadixSortTemp> g_radix_sort_temp_map;
 
 
 template <typename KeyType>
@@ -44,6 +44,8 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
     cub::DoubleBuffer<KeyType> d_keys;
     cub::DoubleBuffer<int> d_values;
 
+    CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
+
     // compute temporary memory required
     size_t sort_temp_size;
     check_cuda(cub::DeviceRadixSort::SortPairs(
@@ -52,12 +54,9 @@ void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* s
         d_keys,
         d_values,
         n, 0, sizeof(KeyType)*8,
-        (cudaStream_t)wp_cuda_stream_get_current()));
-
-    if (!context)
-        context = wp_cuda_context_get_current();
+        stream));
 
-    RadixSortTemp& temp = g_radix_sort_temp_map[context];
+    RadixSortTemp& temp = g_radix_sort_temp_map[stream];
 
     if (sort_temp_size > temp.size)
     {
@@ -77,6 +76,17 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
     radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
 }
 
+void radix_sort_release(void* context, void* stream)
+{
+    // release temporary buffer for the given stream, if it exists
+    auto it = g_radix_sort_temp_map.find(stream);
+    if (it != g_radix_sort_temp_map.end())
+    {
+        wp_free_device(context, it->second.mem);
+        g_radix_sort_temp_map.erase(it);
+    }
+}
+
 template <typename KeyType>
 void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
 {
@@ -153,6 +163,8 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
     int* start_indices = NULL;
     int* end_indices = NULL;
 
+    CUstream stream = static_cast<CUstream>(wp_cuda_stream_get_current());
+
     // compute temporary memory required
     size_t sort_temp_size;
     check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
@@ -166,12 +178,9 @@ void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_o
         end_indices,
         0,
         32,
-        (cudaStream_t)wp_cuda_stream_get_current()));
-
-    if (!context)
-        context = wp_cuda_context_get_current();
+        stream));
 
-    RadixSortTemp& temp = g_radix_sort_temp_map[context];
+    RadixSortTemp& temp = g_radix_sort_temp_map[stream];
 
     if (sort_temp_size > temp.size)
     {
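Keying the scratch buffer by stream instead of by context means two sorts issued on different streams of the same device no longer share (and race on) a single temporary allocation, and radix_sort_release() presumably lets a stream's buffer be freed when that stream is destroyed. From Python, the situation this protects is roughly the following hedged sketch (warp.utils.radix_sort_pairs requires the arrays to have 2*n capacity for its double buffer):

import numpy as np
import warp as wp
import warp.utils

wp.init()

device = wp.get_device("cuda:0")
s1, s2 = wp.Stream(device), wp.Stream(device)
n = 1 << 16

def make_pairs():
    # allocate 2*n entries: the second half is scratch space for the sort
    keys = wp.array(np.random.randint(0, 1 << 20, 2 * n).astype(np.int32), dtype=int, device=device)
    vals = wp.array(np.arange(2 * n, dtype=np.int32), dtype=int, device=device)
    return keys, vals

k1, v1 = make_pairs()
k2, v2 = make_pairs()

# Each sort now reserves temp memory keyed by its stream, so these two calls
# no longer contend for one shared scratch buffer.
with wp.ScopedStream(s1):
    wp.utils.radix_sort_pairs(k1, v1, n)
with wp.ScopedStream(s2):
    wp.utils.radix_sort_pairs(k2, v2, n)

wp.synchronize_device(device)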
warp/native/sort.h CHANGED
@@ -20,6 +20,8 @@
 #include <stddef.h>
 
 void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
+void radix_sort_release(void* context, void* stream);
+
 void radix_sort_pairs_host(int* keys, int* values, int n);
 void radix_sort_pairs_host(float* keys, int* values, int n);
 void radix_sort_pairs_host(int64_t* keys, int* values, int n);