warp-lang 1.7.2-py3-none-manylinux_2_34_aarch64.whl → 1.8.0-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (180)
  1. warp/__init__.py +3 -1
  2. warp/__init__.pyi +3489 -1
  3. warp/autograd.py +45 -122
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +241 -252
  6. warp/build_dll.py +125 -26
  7. warp/builtins.py +1907 -384
  8. warp/codegen.py +257 -101
  9. warp/config.py +12 -1
  10. warp/constants.py +1 -1
  11. warp/context.py +657 -223
  12. warp/dlpack.py +1 -1
  13. warp/examples/benchmarks/benchmark_cloth.py +2 -2
  14. warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
  15. warp/examples/core/example_sample_mesh.py +1 -1
  16. warp/examples/core/example_spin_lock.py +93 -0
  17. warp/examples/core/example_work_queue.py +118 -0
  18. warp/examples/fem/example_adaptive_grid.py +5 -5
  19. warp/examples/fem/example_apic_fluid.py +1 -1
  20. warp/examples/fem/example_burgers.py +1 -1
  21. warp/examples/fem/example_convection_diffusion.py +9 -6
  22. warp/examples/fem/example_darcy_ls_optimization.py +489 -0
  23. warp/examples/fem/example_deformed_geometry.py +1 -1
  24. warp/examples/fem/example_diffusion.py +2 -2
  25. warp/examples/fem/example_diffusion_3d.py +1 -1
  26. warp/examples/fem/example_distortion_energy.py +1 -1
  27. warp/examples/fem/example_elastic_shape_optimization.py +387 -0
  28. warp/examples/fem/example_magnetostatics.py +5 -3
  29. warp/examples/fem/example_mixed_elasticity.py +5 -3
  30. warp/examples/fem/example_navier_stokes.py +11 -9
  31. warp/examples/fem/example_nonconforming_contact.py +5 -3
  32. warp/examples/fem/example_streamlines.py +8 -3
  33. warp/examples/fem/utils.py +9 -8
  34. warp/examples/interop/example_jax_ffi_callback.py +2 -2
  35. warp/examples/optim/example_drone.py +1 -1
  36. warp/examples/sim/example_cloth.py +1 -1
  37. warp/examples/sim/example_cloth_self_contact.py +48 -54
  38. warp/examples/tile/example_tile_block_cholesky.py +502 -0
  39. warp/examples/tile/example_tile_cholesky.py +2 -1
  40. warp/examples/tile/example_tile_convolution.py +1 -1
  41. warp/examples/tile/example_tile_filtering.py +1 -1
  42. warp/examples/tile/example_tile_matmul.py +1 -1
  43. warp/examples/tile/example_tile_mlp.py +2 -0
  44. warp/fabric.py +7 -7
  45. warp/fem/__init__.py +5 -0
  46. warp/fem/adaptivity.py +1 -1
  47. warp/fem/cache.py +152 -63
  48. warp/fem/dirichlet.py +2 -2
  49. warp/fem/domain.py +136 -6
  50. warp/fem/field/field.py +141 -99
  51. warp/fem/field/nodal_field.py +85 -39
  52. warp/fem/field/virtual.py +97 -52
  53. warp/fem/geometry/adaptive_nanogrid.py +91 -86
  54. warp/fem/geometry/closest_point.py +13 -0
  55. warp/fem/geometry/deformed_geometry.py +102 -40
  56. warp/fem/geometry/element.py +56 -2
  57. warp/fem/geometry/geometry.py +323 -22
  58. warp/fem/geometry/grid_2d.py +157 -62
  59. warp/fem/geometry/grid_3d.py +116 -20
  60. warp/fem/geometry/hexmesh.py +86 -20
  61. warp/fem/geometry/nanogrid.py +166 -86
  62. warp/fem/geometry/partition.py +59 -25
  63. warp/fem/geometry/quadmesh.py +86 -135
  64. warp/fem/geometry/tetmesh.py +47 -119
  65. warp/fem/geometry/trimesh.py +77 -270
  66. warp/fem/integrate.py +107 -52
  67. warp/fem/linalg.py +25 -58
  68. warp/fem/operator.py +124 -27
  69. warp/fem/quadrature/pic_quadrature.py +36 -14
  70. warp/fem/quadrature/quadrature.py +40 -16
  71. warp/fem/space/__init__.py +1 -1
  72. warp/fem/space/basis_function_space.py +66 -46
  73. warp/fem/space/basis_space.py +17 -4
  74. warp/fem/space/dof_mapper.py +1 -1
  75. warp/fem/space/function_space.py +2 -2
  76. warp/fem/space/grid_2d_function_space.py +4 -1
  77. warp/fem/space/hexmesh_function_space.py +4 -2
  78. warp/fem/space/nanogrid_function_space.py +3 -1
  79. warp/fem/space/partition.py +11 -2
  80. warp/fem/space/quadmesh_function_space.py +4 -1
  81. warp/fem/space/restriction.py +5 -2
  82. warp/fem/space/shape/__init__.py +10 -8
  83. warp/fem/space/tetmesh_function_space.py +4 -1
  84. warp/fem/space/topology.py +52 -21
  85. warp/fem/space/trimesh_function_space.py +4 -1
  86. warp/fem/utils.py +53 -8
  87. warp/jax.py +1 -2
  88. warp/jax_experimental/ffi.py +12 -17
  89. warp/jax_experimental/xla_ffi.py +37 -24
  90. warp/math.py +171 -1
  91. warp/native/array.h +99 -0
  92. warp/native/builtin.h +174 -31
  93. warp/native/coloring.cpp +1 -1
  94. warp/native/exports.h +118 -63
  95. warp/native/intersect.h +3 -3
  96. warp/native/mat.h +5 -10
  97. warp/native/mathdx.cpp +11 -5
  98. warp/native/matnn.h +1 -123
  99. warp/native/quat.h +28 -4
  100. warp/native/sparse.cpp +121 -258
  101. warp/native/sparse.cu +181 -274
  102. warp/native/spatial.h +305 -17
  103. warp/native/tile.h +583 -72
  104. warp/native/tile_radix_sort.h +1108 -0
  105. warp/native/tile_reduce.h +237 -2
  106. warp/native/tile_scan.h +240 -0
  107. warp/native/tuple.h +189 -0
  108. warp/native/vec.h +6 -16
  109. warp/native/warp.cpp +36 -4
  110. warp/native/warp.cu +574 -51
  111. warp/native/warp.h +47 -74
  112. warp/optim/linear.py +5 -1
  113. warp/paddle.py +7 -8
  114. warp/py.typed +0 -0
  115. warp/render/render_opengl.py +58 -29
  116. warp/render/render_usd.py +124 -61
  117. warp/sim/__init__.py +9 -0
  118. warp/sim/collide.py +252 -78
  119. warp/sim/graph_coloring.py +8 -1
  120. warp/sim/import_mjcf.py +4 -3
  121. warp/sim/import_usd.py +11 -7
  122. warp/sim/integrator.py +5 -2
  123. warp/sim/integrator_euler.py +1 -1
  124. warp/sim/integrator_featherstone.py +1 -1
  125. warp/sim/integrator_vbd.py +751 -320
  126. warp/sim/integrator_xpbd.py +1 -1
  127. warp/sim/model.py +265 -260
  128. warp/sim/utils.py +10 -7
  129. warp/sparse.py +303 -166
  130. warp/tape.py +52 -51
  131. warp/tests/cuda/test_conditional_captures.py +1046 -0
  132. warp/tests/cuda/test_streams.py +1 -1
  133. warp/tests/geometry/test_volume.py +2 -2
  134. warp/tests/interop/test_dlpack.py +9 -9
  135. warp/tests/interop/test_jax.py +0 -1
  136. warp/tests/run_coverage_serial.py +1 -1
  137. warp/tests/sim/disabled_kinematics.py +2 -2
  138. warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
  139. warp/tests/sim/test_collision.py +159 -51
  140. warp/tests/sim/test_coloring.py +15 -1
  141. warp/tests/test_array.py +254 -2
  142. warp/tests/test_array_reduce.py +2 -2
  143. warp/tests/test_atomic_cas.py +299 -0
  144. warp/tests/test_codegen.py +142 -19
  145. warp/tests/test_conditional.py +47 -1
  146. warp/tests/test_ctypes.py +0 -20
  147. warp/tests/test_devices.py +8 -0
  148. warp/tests/test_fabricarray.py +4 -2
  149. warp/tests/test_fem.py +58 -25
  150. warp/tests/test_func.py +42 -1
  151. warp/tests/test_grad.py +1 -1
  152. warp/tests/test_lerp.py +1 -3
  153. warp/tests/test_map.py +481 -0
  154. warp/tests/test_mat.py +1 -24
  155. warp/tests/test_quat.py +6 -15
  156. warp/tests/test_rounding.py +10 -38
  157. warp/tests/test_runlength_encode.py +7 -7
  158. warp/tests/test_smoothstep.py +1 -1
  159. warp/tests/test_sparse.py +51 -2
  160. warp/tests/test_spatial.py +507 -1
  161. warp/tests/test_struct.py +2 -2
  162. warp/tests/test_tuple.py +265 -0
  163. warp/tests/test_types.py +2 -2
  164. warp/tests/test_utils.py +24 -18
  165. warp/tests/tile/test_tile.py +420 -1
  166. warp/tests/tile/test_tile_mathdx.py +518 -14
  167. warp/tests/tile/test_tile_reduce.py +213 -0
  168. warp/tests/tile/test_tile_shared_memory.py +130 -1
  169. warp/tests/tile/test_tile_sort.py +117 -0
  170. warp/tests/unittest_suites.py +4 -6
  171. warp/types.py +462 -308
  172. warp/utils.py +647 -86
  173. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
  174. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/RECORD +177 -165
  175. warp/stubs.py +0 -3381
  176. warp/tests/sim/test_xpbd.py +0 -399
  177. warp/tests/test_mlp.py +0 -282
  178. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
  179. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
  180. {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/math.py CHANGED
@@ -22,11 +22,13 @@ Vector norm functions
 """
 
 __all__ = [
+    "norm_huber",
     "norm_l1",
     "norm_l2",
-    "norm_huber",
    "norm_pseudo_huber",
    "smooth_normalize",
+    "transform_compose",
+    "transform_decompose",
    "transform_from_matrix",
    "transform_to_matrix",
 ]
@@ -142,6 +144,19 @@ def create_transform_from_matrix_func(dtype):
         """
         Construct a transformation from a 4x4 matrix.
 
+        .. math::
+            M = \\begin{bmatrix}
+                    R_{00} & R_{01} & R_{02} & p_x \\\\
+                    R_{10} & R_{11} & R_{12} & p_y \\\\
+                    R_{20} & R_{21} & R_{22} & p_z \\\\
+                    0 & 0 & 0 & 1
+                \\end{bmatrix}
+
+        Where:
+
+        * :math:`R` is the 3x3 rotation matrix created from the orientation quaternion of the input transform.
+        * :math:`p` is the 3D position vector :math:`[p_x, p_y, p_z]` of the input transform.
+
         Args:
             mat (Matrix[4, 4, Float]): Matrix to convert.
 
@@ -177,6 +192,19 @@ def create_transform_to_matrix_func(dtype):
         """
         Convert a transformation to a 4x4 matrix.
 
+        .. math::
+            M = \\begin{bmatrix}
+                    R_{00} & R_{01} & R_{02} & p_x \\\\
+                    R_{10} & R_{11} & R_{12} & p_y \\\\
+                    R_{20} & R_{21} & R_{22} & p_z \\\\
+                    0 & 0 & 0 & 1
+                \\end{bmatrix}
+
+        Where:
+
+        * :math:`R` is the 3x3 rotation matrix created from the orientation quaternion of the input transform.
+        * :math:`p` is the 3D position vector :math:`[p_x, p_y, p_z]` of the input transform.
+
         Args:
             xform (Transformation[Float]): Transformation to convert.
 
@@ -212,6 +240,140 @@ wp.func(
 )
 
 
+def create_transform_compose_func(dtype):
+    mat44 = wp.types.matrix((4, 4), dtype)
+    quat = wp.types.quaternion(dtype)
+    vec3 = wp.types.vector(3, dtype)
+
+    def transform_compose(position: vec3, rotation: quat, scale: vec3):
+        """
+        Compose a 4x4 transformation matrix from a 3D position, quaternion orientation, and 3D scale.
+
+        .. math::
+            M = \\begin{bmatrix}
+                    s_x R_{00} & s_y R_{01} & s_z R_{02} & p_x \\\\
+                    s_x R_{10} & s_y R_{11} & s_z R_{12} & p_y \\\\
+                    s_x R_{20} & s_y R_{21} & s_z R_{22} & p_z \\\\
+                    0 & 0 & 0 & 1
+                \\end{bmatrix}
+
+        Where:
+
+        * :math:`R` is the 3x3 rotation matrix created from the orientation quaternion of the input transform.
+        * :math:`p` is the 3D position vector :math:`[p_x, p_y, p_z]` of the input transform.
+        * :math:`s` is the 3D scale vector :math:`[s_x, s_y, s_z]` of the input transform.
+
+        Args:
+            position (Vector[3, Float]): The 3D position vector.
+            rotation (Quaternion[Float]): The quaternion orientation.
+            scale (Vector[3, Float]): The 3D scale vector.
+
+        Returns:
+            Matrix[4, 4, Float]: The transformation matrix.
+        """
+        R = wp.quat_to_matrix(rotation)
+        # fmt: off
+        return mat44(
+            scale[0] * R[0,0], scale[1] * R[0,1], scale[2] * R[0,2], position[0],
+            scale[0] * R[1,0], scale[1] * R[1,1], scale[2] * R[1,2], position[1],
+            scale[0] * R[2,0], scale[1] * R[2,1], scale[2] * R[2,2], position[2],
+            dtype(0.0), dtype(0.0), dtype(0.0), dtype(1.0),
+        )
+        # fmt: on
+
+    return transform_compose
+
+
+transform_compose = wp.func(
+    create_transform_compose_func(wp.float32),
+    name="transform_compose",
+)
+wp.func(
+    create_transform_compose_func(wp.float16),
+    name="transform_compose",
+)
+wp.func(
+    create_transform_compose_func(wp.float64),
+    name="transform_compose",
+)
+
+
+def create_transform_decompose_func(dtype):
+    mat44 = wp.types.matrix((4, 4), dtype)
+    vec3 = wp.types.vector(3, dtype)
+    mat33 = wp.types.matrix((3, 3), dtype)
+    zero = dtype(0.0)
+
+    def transform_decompose(m: mat44):
+        """
+        Decompose a 4x4 transformation matrix into 3D position, quaternion orientation, and 3D scale.
+
+        .. math::
+            M = \\begin{bmatrix}
+                    s_x R_{00} & s_y R_{01} & s_z R_{02} & p_x \\\\
+                    s_x R_{10} & s_y R_{11} & s_z R_{12} & p_y \\\\
+                    s_x R_{20} & s_y R_{21} & s_z R_{22} & p_z \\\\
+                    0 & 0 & 0 & 1
+                \\end{bmatrix}
+
+        Where:
+
+        * :math:`R` is the 3x3 rotation matrix created from the orientation quaternion of the input transform.
+        * :math:`p` is the 3D position vector :math:`[p_x, p_y, p_z]` of the input transform.
+        * :math:`s` is the 3D scale vector :math:`[s_x, s_y, s_z]` of the input transform.
+
+        Args:
+            m (Matrix[4, 4, Float]): The matrix to decompose.
+
+        Returns:
+            Tuple[Vector[3, Float], Quaternion[Float], Vector[3, Float]]: A tuple containing the position vector, quaternion orientation, and scale vector.
+        """
+        # extract position
+        position = vec3(m[0, 3], m[1, 3], m[2, 3])
+        # extract rotation matrix components
+        r00, r01, r02 = m[0, 0], m[0, 1], m[0, 2]
+        r10, r11, r12 = m[1, 0], m[1, 1], m[1, 2]
+        r20, r21, r22 = m[2, 0], m[2, 1], m[2, 2]
+        # get scale magnitudes
+        sx = wp.sqrt(r00 * r00 + r10 * r10 + r20 * r20)
+        sy = wp.sqrt(r01 * r01 + r11 * r11 + r21 * r21)
+        sz = wp.sqrt(r02 * r02 + r12 * r12 + r22 * r22)
+        # normalize rotation matrix components
+        if sx != zero:
+            r00 /= sx
+            r10 /= sx
+            r20 /= sx
+        if sy != zero:
+            r01 /= sy
+            r11 /= sy
+            r21 /= sy
+        if sz != zero:
+            r02 /= sz
+            r12 /= sz
+            r22 /= sz
+        # extract rotation (quaternion)
+        rotation = wp.quat_from_matrix(mat33(r00, r01, r02, r10, r11, r12, r20, r21, r22))
+        # extract scale
+        scale = vec3(sx, sy, sz)
+        return position, rotation, scale
+
+    return transform_decompose
+
+
+transform_decompose = wp.func(
+    create_transform_decompose_func(wp.float32),
+    name="transform_decompose",
+)
+wp.func(
+    create_transform_decompose_func(wp.float16),
+    name="transform_decompose",
+)
+wp.func(
+    create_transform_decompose_func(wp.float64),
+    name="transform_decompose",
+)
+
+
 # register API functions so they appear in the documentation
 
 wp.context.register_api_function(
@@ -242,3 +404,11 @@ wp.context.register_api_function(
     transform_to_matrix,
     group="Transformations",
 )
+wp.context.register_api_function(
+    transform_compose,
+    group="Transformations",
+)
+wp.context.register_api_function(
+    transform_decompose,
+    group="Transformations",
+)
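
Note: the new transform_compose / transform_decompose helpers above are registered the same way as the existing transform_from_matrix / transform_to_matrix, so they should be callable from kernels. The sketch below is not taken from the release; the warp.math import path and the tuple unpacking of the decompose result are assumptions based on this diff (1.8.0 also adds tuple support, see warp/native/tuple.h and warp/tests/test_tuple.py in the file list).

import warp as wp
import warp.math as wpm

@wp.kernel
def roundtrip(out_pos: wp.array(dtype=wp.vec3),
              out_rot: wp.array(dtype=wp.quat),
              out_scale: wp.array(dtype=wp.vec3)):
    # build a 4x4 affine matrix from position, rotation, and non-uniform scale
    p = wp.vec3(1.0, 2.0, 3.0)
    q = wp.quat_from_axis_angle(wp.vec3(0.0, 0.0, 1.0), 0.5)
    s = wp.vec3(2.0, 0.5, 1.0)
    m = wpm.transform_compose(p, q, s)

    # recover (position, rotation, scale) from the matrix
    pos, rot, scale = wpm.transform_decompose(m)
    out_pos[0] = pos
    out_rot[0] = rot
    out_scale[0] = scale

out_pos = wp.zeros(1, dtype=wp.vec3)
out_rot = wp.zeros(1, dtype=wp.quat)
out_scale = wp.zeros(1, dtype=wp.vec3)
wp.launch(roundtrip, dim=1, inputs=[out_pos, out_rot, out_scale])
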
warp/native/array.h CHANGED
@@ -743,6 +743,24 @@ inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, T value)
 template<template<typename> class A, typename T>
 inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_max(&index(buf, i, j, k, l), value); }
 
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, T old_value, T new_value) { return atomic_cas(&index(buf, i), old_value, new_value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, T old_value, T new_value) { return atomic_cas(&index(buf, i, j), old_value, new_value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, int k, T old_value, T new_value) { return atomic_cas(&index(buf, i, j, k), old_value, new_value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, int k, int l, T old_value, T new_value) { return atomic_cas(&index(buf, i, j, k, l), old_value, new_value); }
+
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, T value) { return atomic_exch(&index(buf, i), value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, T value) { return atomic_exch(&index(buf, i, j), value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, int k, T value) { return atomic_exch(&index(buf, i, j, k), value); }
+template<template<typename> class A, typename T>
+inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_exch(&index(buf, i, j, k, l), value); }
+
 template<template<typename> class A, typename T>
 inline CUDA_CALLABLE T* address(const A<T>& buf, int i) { return &index(buf, i); }
 template<template<typename> class A, typename T>
@@ -1128,6 +1146,87 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k,
     FP_VERIFY_ADJ_4(value, adj_value)
 }
 
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, T compare, T value, const A2<T>& adj_buf, int adj_i, T& adj_compare, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_cas(&index(buf, i), compare, value, &index(adj_buf, i), adj_compare, adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_cas(&index(buf, i), compare, value, &index_grad(buf, i), adj_compare, adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_1(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_compare, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_cas(&index(buf, i, j), compare, value, &index(adj_buf, i, j), adj_compare, adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_cas(&index(buf, i, j), compare, value, &index_grad(buf, i, j), adj_compare, adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_2(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, int k, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_compare, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_cas(&index(buf, i, j, k), compare, value, &index(adj_buf, i, j, k), adj_compare, adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_cas(&index(buf, i, j, k), compare, value, &index_grad(buf, i, j, k), adj_compare, adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_3(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, int k, int l, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_compare, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_cas(&index(buf, i, j, k, l), compare, value, &index(adj_buf, i, j, k, l), adj_compare, adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_cas(&index(buf, i, j, k, l), compare, value, &index_grad(buf, i, j, k, l), adj_compare, adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_4(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_exch(&index(buf, i), value, &index(adj_buf, i), adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_exch(&index(buf, i), value, &index_grad(buf, i), adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_1(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_exch(&index(buf, i, j), value, &index(adj_buf, i, j), adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_exch(&index(buf, i, j), value, &index_grad(buf, i, j), adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_2(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_exch(&index(buf, i, j, k), value, &index(adj_buf, i, j, k), adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_exch(&index(buf, i, j, k), value, &index_grad(buf, i, j, k), adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_3(value, adj_value)
+}
+
+template<template<typename> class A1, template<typename> class A2, typename T>
+inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
+    if (adj_buf.data)
+        adj_atomic_exch(&index(buf, i, j, k, l), value, &index(adj_buf, i, j, k, l), adj_value, adj_ret);
+    else if (buf.grad)
+        adj_atomic_exch(&index(buf, i, j, k, l), value, &index_grad(buf, i, j, k, l), adj_value, adj_ret);
+
+    FP_VERIFY_ADJ_4(value, adj_value)
+}
+
+
 template<template<typename> class A, typename T>
 CUDA_CALLABLE inline int len(const A<T>& a)
 {
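
Note: these array overloads back the new compare-and-swap and exchange builtins exposed to kernels (see also the new warp/examples/core/example_spin_lock.py and warp/tests/test_atomic_cas.py in the file list). The following is a rough sketch of the intended usage, assuming Python-level signatures wp.atomic_cas(arr, i, compare, value) and wp.atomic_exch(arr, i, value), both returning the previous element value:

import warp as wp

@wp.kernel
def locked_accumulate(lock: wp.array(dtype=wp.int32),
                      total: wp.array(dtype=wp.float32),
                      values: wp.array(dtype=wp.float32)):
    tid = wp.tid()

    # acquire: keep trying to swap lock[0] from 0 (free) to 1 (held)
    prev = wp.atomic_cas(lock, 0, 0, 1)
    while prev == 1:
        prev = wp.atomic_cas(lock, 0, 0, 1)

    # critical section: an ordinary read-modify-write protected by the lock
    total[0] = total[0] + values[tid]

    # release: write 0 back, discarding the previous value
    wp.atomic_exch(lock, 0, 0)

As with any GPU spin lock, this pattern depends on threads being able to make independent forward progress and should be used sparingly; a plain wp.atomic_add remains the better choice when simple accumulation is all that is needed.
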
warp/native/builtin.h CHANGED
@@ -52,6 +52,11 @@
 __device__ void __debugbreak() {}
 #endif
 
+#if defined(__clang__) && defined(__CUDA__) && defined(__CUDA_ARCH__)
+// clang compiling CUDA code, device mode (NOTE: Used when building core library with Clang)
+#include <cuda_fp16.h>
+#endif
+
 namespace wp
 {
 
@@ -177,14 +182,14 @@ CUDA_CALLABLE inline float half_to_float(half x)
 #elif defined(__clang__)
 
 // _Float16 is Clang's native half-precision floating-point type
-inline half float_to_half(float x)
+CUDA_CALLABLE inline half float_to_half(float x)
 {
 
     _Float16 f16 = static_cast<_Float16>(x);
     return *reinterpret_cast<half*>(&f16);
 }
 
-inline float half_to_float(half h)
+CUDA_CALLABLE inline float half_to_float(half h)
 {
     _Float16 f16 = *reinterpret_cast<_Float16*>(&h);
     return static_cast<float>(f16);
@@ -1221,6 +1226,15 @@ inline CUDA_CALLABLE launch_coord_t launch_coord(size_t linear, const launch_bou
     return coord;
 }
 
+inline CUDA_CALLABLE int block_dim()
+{
+#if defined(__CUDA_ARCH__)
+    return blockDim.x;
+#else
+    return 1;
+#endif
+}
+
 inline CUDA_CALLABLE int tid(size_t index, const launch_bounds_t& bounds)
 {
     // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
@@ -1301,34 +1315,35 @@ inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value)
     float16 old = buf[0];
     buf[0] += value;
     return old;
-#elif defined(__clang__) // CUDA compiled by Clang
-    __half r = atomicAdd(reinterpret_cast<__half*>(buf), *reinterpret_cast<__half*>(&value));
-    return *reinterpret_cast<float16*>(&r);
 #else // CUDA compiled by NVRTC
-    //return atomicAdd(buf, value);
-
-    /* Define __PTR for atomicAdd prototypes below, undef after done */
-    #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
-    #define __PTR "l"
-    #else
-    #define __PTR "r"
-    #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
-
-    half r = 0.0;
-
 #if __CUDA_ARCH__ >= 700
-
-    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
-                  : "=h"(r.u)
-                  : __PTR(buf), "h"(value.u)
-                  : "memory");
+    #if defined(__clang__) // CUDA compiled by Clang
+    __half r = atomicAdd(reinterpret_cast<__half*>(buf), *reinterpret_cast<__half*>(&value));
+    return *reinterpret_cast<float16*>(&r);
+    #else // CUDA compiled by NVRTC
+    /* Define __PTR for atomicAdd prototypes below, undef after done */
+    #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+    #define __PTR "l"
+    #else
+    #define __PTR "r"
+    #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+    half r = 0.0;
+
+    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
+                  : "=h"(r.u)
+                  : __PTR(buf), "h"(value.u)
+                  : "memory");
+
+    return r;
+
+    #undef __PTR
+    #endif
+#else
+    // No native __half atomic support on compute capability < 7.0
+    return float16(0.0f);
 #endif
-
-    return r;
-
-    #undef __PTR
-
-#endif // CUDA compiled by NVRTC
+#endif
 }
 
 template<>
@@ -1508,6 +1523,129 @@ CUDA_CALLABLE inline void adj_atomic_minmax(uint64* buf, uint64* adj_buf, const
 CUDA_CALLABLE inline void adj_atomic_minmax(bool* buf, bool* adj_buf, const bool &value, bool &adj_value) { }
 
 
+template<typename T>
+inline CUDA_CALLABLE T atomic_cas(T* address, T compare, T val)
+{
+#if defined(__CUDA_ARCH__)
+    return atomicCAS(address, compare, val);
+#else
+    T old = *address;
+    if (old == compare)
+    {
+        *address = val;
+    }
+    return old;
+#endif
+}
+
+template<>
+inline CUDA_CALLABLE float atomic_cas(float* address, float compare, float val)
+{
+#if defined(__CUDA_ARCH__)
+    auto result = atomicCAS(reinterpret_cast<unsigned int*>(address),
+                            reinterpret_cast<unsigned int&>(compare),
+                            reinterpret_cast<unsigned int&>(val));
+    return reinterpret_cast<float&>(result);
+#else
+    float old = *address;
+    if (old == compare)
+    {
+        *address = val;
+    }
+    return old;
+#endif
+}
+
+template<>
+inline CUDA_CALLABLE double atomic_cas(double* address, double compare, double val)
+{
+#if defined(__CUDA_ARCH__)
+    auto result = atomicCAS(reinterpret_cast<unsigned long long int *>(address),
+                            reinterpret_cast<unsigned long long int &>(compare),
+                            reinterpret_cast<unsigned long long int &>(val));
+    return reinterpret_cast<double&>(result);
+#else
+    double old = *address;
+    if (old == compare)
+    {
+        *address = val;
+    }
+    return old;
+#endif
+}
+
+template<>
+inline CUDA_CALLABLE int64 atomic_cas(int64* address, int64 compare, int64 val)
+{
+#if defined(__CUDA_ARCH__)
+    auto result = atomicCAS(reinterpret_cast<unsigned long long int *>(address),
+                            reinterpret_cast<unsigned long long int &>(compare),
+                            reinterpret_cast<unsigned long long int &>(val));
+    return reinterpret_cast<int64&>(result);
+#else
+    int64 old = *address;
+    if (old == compare)
+    {
+        *address = val;
+    }
+    return old;
+#endif
+}
+
+template<typename T>
+inline CUDA_CALLABLE T atomic_exch(T* address, T val)
+{
+#if defined(__CUDA_ARCH__)
+    return atomicExch(address, val);
+#else
+    T old = *address;
+    *address = val;
+    return old;
+#endif
+}
+
+template<>
+inline CUDA_CALLABLE double atomic_exch(double* address, double val)
+{
+#if defined(__CUDA_ARCH__)
+    auto result = atomicExch(reinterpret_cast<unsigned long long int*>(address),
+                             reinterpret_cast<unsigned long long int&>(val));
+    return reinterpret_cast<double&>(result);
+#else
+    double old = *address;
+    *address = val;
+    return old;
+#endif
+}
+
+template<>
+inline CUDA_CALLABLE int64 atomic_exch(int64* address, int64 val)
+{
+#if defined(__CUDA_ARCH__)
+    auto result = atomicExch(reinterpret_cast<unsigned long long int*>(address),
+                             reinterpret_cast<unsigned long long int&>(val));
+    return reinterpret_cast<int64&>(result);
+#else
+    int64 old = *address;
+    *address = val;
+    return old;
+#endif
+}
+
+
+template<typename T>
+CUDA_CALLABLE inline void adj_atomic_cas(T* address, T compare, T val, T* adj_address, T& adj_compare, T& adj_val, T adj_ret)
+{
+    // Not implemented
+}
+
+template<typename T>
+CUDA_CALLABLE inline void adj_atomic_exch(T* address, T val, T* adj_address, T& adj_val, T adj_ret)
+{
+    // Not implemented
+}
+
+
 } // namespace wp
 
 
@@ -1778,8 +1916,9 @@ inline CUDA_CALLABLE void expect_near(const T& actual, const T& expected, const
     if (abs(actual - expected) > tolerance)
     {
         printf("Error, expect_near() failed with tolerance "); print(tolerance);
-        printf("\t Expected: "); print(expected);
-        printf("\t Actual: "); print(actual);
+        printf(" Expected: "); print(expected);
+        printf(" Actual: "); print(actual);
+        printf(" Absolute difference: "); print(abs(actual - expected));
     }
 }
 
@@ -1789,8 +1928,9 @@ inline CUDA_CALLABLE void expect_near(const vec3& actual, const vec3& expected,
     if (diff > tolerance)
     {
         printf("Error, expect_near() failed with tolerance "); print(tolerance);
-        printf("\t Expected: "); print(expected);
-        printf("\t Actual: "); print(actual);
+        printf(" Expected: "); print(expected);
+        printf(" Actual: "); print(actual);
+        printf(" Max absolute difference: "); print(diff);
     }
 }
 
@@ -1810,6 +1950,7 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect
 
 // include array.h so we have the print, isfinite functions for the inner array types defined
 #include "array.h"
+#include "tuple.h"
 #include "mesh.h"
 #include "bvh.h"
 #include "svd.h"
@@ -1823,4 +1964,6 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect
 #if !defined(WP_ENABLE_CUDA) // only include in kernels for now
 #include "tile.h"
 #include "tile_reduce.h"
+#include "tile_scan.h"
+#include "tile_radix_sort.h"
 #endif //!defined(WP_ENABLE_CUDA)
warp/native/coloring.cpp CHANGED
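Note: the single-line change below swaps the loop counter from size_t to int; with an unsigned counter the condition bucket_idx >= 0 is always true, so decrementing past zero would wrap around instead of terminating the countdown loop.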
@@ -372,7 +372,7 @@ public:
     // we need to update max_weight because weight_buckets[max_weight] became empty
     {
         int new_max_weight = 0;
-        for (size_t bucket_idx = max_weight - 1; bucket_idx >= 0; bucket_idx--)
+        for (int bucket_idx = max_weight - 1; bucket_idx >= 0; bucket_idx--)
         {
             if (weight_buckets[bucket_idx].size())
             {