warp-lang 1.2.1-py3-none-manylinux2014_aarch64.whl → 1.3.0-py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang has been flagged as potentially problematic.
- warp/__init__.py +8 -6
- warp/autograd.py +823 -0
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +6 -2
- warp/builtins.py +1410 -886
- warp/codegen.py +503 -166
- warp/config.py +48 -18
- warp/context.py +401 -199
- warp/dlpack.py +8 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +1 -1
- warp/examples/benchmarks/benchmark_interop_torch.py +158 -0
- warp/examples/benchmarks/benchmark_launches.py +1 -1
- warp/examples/core/example_cupy.py +78 -0
- warp/examples/fem/example_apic_fluid.py +17 -36
- warp/examples/fem/example_burgers.py +9 -18
- warp/examples/fem/example_convection_diffusion.py +7 -17
- warp/examples/fem/example_convection_diffusion_dg.py +27 -47
- warp/examples/fem/example_deformed_geometry.py +11 -22
- warp/examples/fem/example_diffusion.py +7 -18
- warp/examples/fem/example_diffusion_3d.py +24 -28
- warp/examples/fem/example_diffusion_mgpu.py +7 -14
- warp/examples/fem/example_magnetostatics.py +190 -0
- warp/examples/fem/example_mixed_elasticity.py +111 -80
- warp/examples/fem/example_navier_stokes.py +30 -34
- warp/examples/fem/example_nonconforming_contact.py +290 -0
- warp/examples/fem/example_stokes.py +17 -32
- warp/examples/fem/example_stokes_transfer.py +12 -21
- warp/examples/fem/example_streamlines.py +350 -0
- warp/examples/fem/utils.py +936 -0
- warp/fabric.py +5 -2
- warp/fem/__init__.py +13 -3
- warp/fem/cache.py +161 -11
- warp/fem/dirichlet.py +37 -28
- warp/fem/domain.py +105 -14
- warp/fem/field/__init__.py +14 -3
- warp/fem/field/field.py +454 -11
- warp/fem/field/nodal_field.py +33 -18
- warp/fem/geometry/deformed_geometry.py +50 -15
- warp/fem/geometry/hexmesh.py +12 -24
- warp/fem/geometry/nanogrid.py +106 -31
- warp/fem/geometry/quadmesh_2d.py +6 -11
- warp/fem/geometry/tetmesh.py +103 -61
- warp/fem/geometry/trimesh_2d.py +98 -47
- warp/fem/integrate.py +231 -186
- warp/fem/operator.py +14 -9
- warp/fem/quadrature/pic_quadrature.py +35 -9
- warp/fem/quadrature/quadrature.py +119 -32
- warp/fem/space/basis_space.py +98 -22
- warp/fem/space/collocated_function_space.py +3 -1
- warp/fem/space/function_space.py +7 -2
- warp/fem/space/grid_2d_function_space.py +3 -3
- warp/fem/space/grid_3d_function_space.py +4 -4
- warp/fem/space/hexmesh_function_space.py +3 -2
- warp/fem/space/nanogrid_function_space.py +12 -14
- warp/fem/space/partition.py +45 -47
- warp/fem/space/restriction.py +19 -16
- warp/fem/space/shape/cube_shape_function.py +91 -3
- warp/fem/space/shape/shape_function.py +7 -0
- warp/fem/space/shape/square_shape_function.py +32 -0
- warp/fem/space/shape/tet_shape_function.py +11 -7
- warp/fem/space/shape/triangle_shape_function.py +10 -1
- warp/fem/space/topology.py +116 -42
- warp/fem/types.py +8 -1
- warp/fem/utils.py +301 -83
- warp/native/array.h +16 -0
- warp/native/builtin.h +0 -15
- warp/native/cuda_util.cpp +14 -6
- warp/native/exports.h +1348 -1308
- warp/native/quat.h +79 -0
- warp/native/rand.h +27 -4
- warp/native/sparse.cpp +83 -81
- warp/native/sparse.cu +381 -453
- warp/native/vec.h +64 -0
- warp/native/volume.cpp +40 -49
- warp/native/volume_builder.cu +2 -3
- warp/native/volume_builder.h +12 -17
- warp/native/warp.cu +3 -3
- warp/native/warp.h +69 -59
- warp/render/render_opengl.py +17 -9
- warp/sim/articulation.py +117 -17
- warp/sim/collide.py +35 -29
- warp/sim/model.py +123 -18
- warp/sim/render.py +3 -1
- warp/sparse.py +867 -203
- warp/stubs.py +312 -541
- warp/tape.py +29 -1
- warp/tests/disabled_kinematics.py +1 -1
- warp/tests/test_adam.py +1 -1
- warp/tests/test_arithmetic.py +1 -1
- warp/tests/test_array.py +58 -1
- warp/tests/test_array_reduce.py +1 -1
- warp/tests/test_async.py +1 -1
- warp/tests/test_atomic.py +1 -1
- warp/tests/test_bool.py +1 -1
- warp/tests/test_builtins_resolution.py +1 -1
- warp/tests/test_bvh.py +6 -1
- warp/tests/test_closest_point_edge_edge.py +1 -1
- warp/tests/test_codegen.py +66 -1
- warp/tests/test_compile_consts.py +1 -1
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_copy.py +1 -1
- warp/tests/test_ctypes.py +1 -1
- warp/tests/test_dense.py +1 -1
- warp/tests/test_devices.py +1 -1
- warp/tests/test_dlpack.py +1 -1
- warp/tests/test_examples.py +33 -4
- warp/tests/test_fabricarray.py +5 -2
- warp/tests/test_fast_math.py +1 -1
- warp/tests/test_fem.py +213 -6
- warp/tests/test_fp16.py +1 -1
- warp/tests/test_func.py +1 -1
- warp/tests/test_future_annotations.py +90 -0
- warp/tests/test_generics.py +1 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_grad_customs.py +1 -1
- warp/tests/test_grad_debug.py +247 -0
- warp/tests/test_hash_grid.py +6 -1
- warp/tests/test_implicit_init.py +354 -0
- warp/tests/test_import.py +1 -1
- warp/tests/test_indexedarray.py +1 -1
- warp/tests/test_intersect.py +1 -1
- warp/tests/test_jax.py +1 -1
- warp/tests/test_large.py +1 -1
- warp/tests/test_launch.py +1 -1
- warp/tests/test_lerp.py +1 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_lvalue.py +1 -1
- warp/tests/test_marching_cubes.py +5 -2
- warp/tests/test_mat.py +34 -35
- warp/tests/test_mat_lite.py +2 -1
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_math.py +1 -1
- warp/tests/test_matmul.py +20 -16
- warp/tests/test_matmul_lite.py +1 -1
- warp/tests/test_mempool.py +1 -1
- warp/tests/test_mesh.py +5 -2
- warp/tests/test_mesh_query_aabb.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_mesh_query_ray.py +1 -1
- warp/tests/test_mlp.py +1 -1
- warp/tests/test_model.py +1 -1
- warp/tests/test_module_hashing.py +77 -1
- warp/tests/test_modules_lite.py +1 -1
- warp/tests/test_multigpu.py +1 -1
- warp/tests/test_noise.py +1 -1
- warp/tests/test_operators.py +1 -1
- warp/tests/test_options.py +1 -1
- warp/tests/test_overwrite.py +542 -0
- warp/tests/test_peer.py +1 -1
- warp/tests/test_pinned.py +1 -1
- warp/tests/test_print.py +1 -1
- warp/tests/test_quat.py +15 -1
- warp/tests/test_rand.py +1 -1
- warp/tests/test_reload.py +1 -1
- warp/tests/test_rounding.py +1 -1
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +95 -0
- warp/tests/test_sim_grad.py +1 -1
- warp/tests/test_sim_kinematics.py +1 -1
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +82 -15
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_special_values.py +2 -11
- warp/tests/test_streams.py +11 -1
- warp/tests/test_struct.py +1 -1
- warp/tests/test_tape.py +1 -1
- warp/tests/test_torch.py +194 -1
- warp/tests/test_transient_module.py +1 -1
- warp/tests/test_types.py +1 -1
- warp/tests/test_utils.py +1 -1
- warp/tests/test_vec.py +15 -63
- warp/tests/test_vec_lite.py +2 -1
- warp/tests/test_vec_scalar_ops.py +122 -39
- warp/tests/test_verify_fp.py +1 -1
- warp/tests/test_volume.py +28 -2
- warp/tests/test_volume_write.py +1 -1
- warp/tests/unittest_serial.py +1 -1
- warp/tests/unittest_suites.py +9 -1
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +2 -5
- warp/torch.py +103 -41
- warp/types.py +344 -227
- warp/utils.py +11 -2
- {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/METADATA +99 -46
- warp_lang-1.3.0.dist-info/RECORD +368 -0
- warp/examples/fem/bsr_utils.py +0 -378
- warp/examples/fem/mesh_utils.py +0 -133
- warp/examples/fem/plot_utils.py +0 -292
- warp_lang-1.2.1.dist-info/RECORD +0 -359
- {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.2.1.dist-info → warp_lang-1.3.0.dist-info}/top_level.txt +0 -0
warp/fem/utils.py
CHANGED
@@ -1,14 +1,10 @@
-from typing import Any, Tuple
+from typing import Any, Tuple, Union

 import numpy as np

 import warp as wp
-from warp.fem.cache import (
-    Temporary,
-    TemporaryStore,
-    borrow_temporary,
-    borrow_temporary_like,
-)
+import warp.fem.cache as cache
+from warp.fem.types import NULL_NODE_INDEX
 from warp.utils import array_scan, radix_sort_pairs, runlength_encode


@@ -115,121 +111,331 @@ def skew_part(x: wp.mat33):
     return wp.vec3(a, b, c)


+@wp.func
+def householder_qr_decomposition(A: Any):
+    """
+    QR decomposition of a square matrix using Householder reflections
+
+    Returns Q and R such that Q R = A, Q orthonormal (such that QQ^T = Id), R upper triangular
+    """
+
+    x = type(A[0])()
+    Q = wp.identity(n=type(x).length, dtype=A.dtype)
+
+    zero = x.dtype(0.0)
+    two = x.dtype(2.0)
+
+    for i in range(type(x).length):
+        for k in range(type(x).length):
+            x[k] = wp.select(k < i, A[k, i], zero)
+
+        alpha = wp.length(x) * wp.sign(x[i])
+        x[i] += alpha
+        two_over_x_sq = wp.select(alpha == zero, two / wp.length_sq(x), zero)
+
+        A -= wp.outer(two_over_x_sq * x, x * A)
+        Q -= wp.outer(Q * x, two_over_x_sq * x)
+
+    return Q, A
+
+
+@wp.func
+def householder_make_hessenberg(A: Any):
+    """Transforms a square matrix to Hessenberg form (single lower diagonal) using Householder reflections
+
+    Returns:
+        Q and H such that Q H Q^T = A, Q orthonormal, H under Hessenberg form
+        If A is symmetric, H will be tridiagonal
+    """
+
+    x = type(A[0])()
+    Q = wp.identity(n=type(x).length, dtype=A.dtype)
+
+    zero = x.dtype(0.0)
+    two = x.dtype(2.0)
+
+    for i in range(1, type(x).length):
+        for k in range(type(x).length):
+            x[k] = wp.select(k < i, A[k, i - 1], zero)
+
+        alpha = wp.length(x) * wp.sign(x[i])
+        x[i] += alpha
+        two_over_x_sq = wp.select(alpha == zero, two / wp.length_sq(x), zero)
+
+        # apply on both sides
+        A -= wp.outer(two_over_x_sq * x, x * A)
+        A -= wp.outer(A * x, two_over_x_sq * x)
+        Q -= wp.outer(Q * x, two_over_x_sq * x)
+
+    return Q, A
+
+
+@wp.func
+def solve_triangular(R: Any, b: Any):
+    """Solves for R x = b where R is an upper triangular matrix
+
+    Returns x
+    """
+    zero = b.dtype(0)
+    x = type(b)(b.dtype(0))
+    for i in range(b.length, 0, -1):
+        j = i - 1
+        r = b[j] - wp.dot(R[j], x)
+        x[j] = wp.select(R[j, j] == zero, r / R[j, j], zero)
+
+    return x
+
+
+@wp.func
+def inverse_qr(A: Any):
+    # Computes a square matrix inverse using QR factorization
+
+    Q, R = householder_qr_decomposition(A)
+
+    A_inv = type(A)()
+    for i in range(type(A[0]).length):
+        A_inv[i] = solve_triangular(R, Q[i])  # ith column of Q^T
+
+    return wp.transpose(A_inv)
+
+
+@wp.func
+def symmetric_eigenvalues_qr(A: Any, tol: Any):
+    """
+    Computes the eigenvalues and eigen vectors of a square symmetric matrix A using the QR algorithm
+
+    Args:
+        A: square symmetric matrix
+        tol: Tolerance for the diagonalization residual (squared L2 norm of off-diagonal terms)
+
+    Returns a tuple (D: vector of eigenvalues, P: matrix with one eigenvector per row) such that A = P^T D P
+    """
+
+    two = A.dtype(2.0)
+    zero = A.dtype(0.0)
+
+    # temp storage for matrix rows
+    ri = type(A[0])()
+    rn = type(ri)()
+
+    # tridiagonal storage for R
+    R_L = type(ri)()
+    R_L = type(ri)(zero)
+    R_U = type(ri)(zero)
+
+    # so that we can use the type length in expression
+    # this will prevent unrolling by warp, but should be ok for native code
+    m = int(0)
+    for _ in range(type(ri).length):
+        m += 1
+
+    # Put A under Hessenberg form (tridiagonal)
+    Q, H = householder_make_hessenberg(A)
+    Q = wp.transpose(Q)  # algorithm below works and transposed Q as rows are easier to index
+
+    for _ in range(16 * m):  # failsafe, usually converges faster than that
+        # Initialize R with current H
+        R_D = wp.get_diag(H)
+        for i in range(1, type(ri).length):
+            R_L[i - 1] = H[i, i - 1]
+            R_U[i - 1] = H[i - 1, i]
+
+        # compute QR decomposition, directly transform H and eigenvectors
+        for n in range(1, m):
+            i = n - 1
+
+            # compute reflection
+            xi = R_D[i]
+            xn = R_L[i]
+
+            xii = xi * xi
+            xnn = xn * xn
+            alpha = wp.sqrt(xii + xnn) * wp.sign(xi)
+
+            xi += alpha
+            xii = xi * xi
+            xin = xi * xn
+
+            two_over_x_sq = wp.select(alpha == zero, two / (xii + xnn), zero)
+            xii *= two_over_x_sq
+            xin *= two_over_x_sq
+            xnn *= two_over_x_sq
+
+            # Left-multiply R and Q, multiply H on both sides
+            # Note that R should get non-zero coefficients on the second upper diagonal,
+            # but those won't get read afterwards, so we can ignore them
+
+            R_D[n] -= R_U[i] * xin + R_D[n] * xnn
+            R_U[n] -= R_U[n] * xnn
+
+            ri = Q[i]
+            rn = Q[n]
+            Q[i] -= ri * xii + rn * xin
+            Q[n] -= ri * xin + rn * xnn
+
+            # H is multiplied on both sides, but stays tridiagonal except for moving buldge
+            # Note: we could reduce the stencil to for 4 columns qui we do below,
+            # but unlikely to be worth it for our small matrix sizes
+            ri = H[i]
+            rn = H[n]
+            H[i] -= ri * xii + rn * xin
+            H[n] -= ri * xin + rn * xnn
+
+            # multiply on right, manually. We just need to consider 4 rows
+            if i > 0:
+                ci = H[i - 1, i]
+                cn = H[i - 1, n]
+                H[i - 1, i] -= ci * xii + cn * xin
+                H[i - 1, n] -= ci * xin + cn * xnn
+
+            for k in range(2):
+                ci = H[i + k, i]
+                cn = H[i + k, n]
+                H[i + k, i] -= ci * xii + cn * xin
+                H[i + k, n] -= ci * xin + cn * xnn
+
+            if n + 1 < m:
+                ci = H[n + 1, i]
+                cn = H[n + 1, n]
+                H[n + 1, i] -= ci * xii + cn * xin
+                H[n + 1, n] -= ci * xin + cn * xnn
+
+        # Terminate if the upper diagonal of R is near zero
+        if wp.length_sq(R_U) < tol:
+            break
+
+    return wp.get_diag(H), Q
+
+
 def compress_node_indices(
-    node_count: int, node_indices: wp.array(dtype=int), temporary_store: TemporaryStore = None
-) -> Tuple[Temporary, Temporary, int, Temporary]:
+    node_count: int,
+    node_indices: wp.array(dtype=int),
+    return_unique_nodes=False,
+    temporary_store: cache.TemporaryStore = None,
+) -> Union[Tuple[cache.Temporary, cache.Temporary], Tuple[cache.Temporary, cache.Temporary, int, cache.Temporary]]:
     """
     Compress an unsorted list of node indices into:
      - a node_offsets array, giving for each node the start offset of corresponding indices in sorted_array_indices
      - a sorted_array_indices array, listing the indices in the input array corresponding to each node
+
+    Plus if `return_unique_nodes` is ``True``,
      - the number of unique node indices
      - a unique_node_indices array containing the sorted list of unique node indices (i.e. the list of indices i for which node_offsets[i] < node_offsets[i+1])
+
+    Node indices equal to NULL_NODE_INDEX will be ignored
     """

     index_count = node_indices.size
+    device = node_indices.device

-    sorted_node_indices_temp = borrow_temporary(
-        temporary_store, shape=2 * index_count, dtype=int
-    )
-    sorted_array_indices_temp = borrow_temporary_like(sorted_node_indices_temp, temporary_store)
+    with wp.ScopedDevice(device):
+        sorted_node_indices_temp = cache.borrow_temporary(temporary_store, shape=2 * index_count, dtype=int)
+        sorted_array_indices_temp = cache.borrow_temporary_like(sorted_node_indices_temp, temporary_store)

-    sorted_node_indices = sorted_node_indices_temp.array
-    sorted_array_indices = sorted_array_indices_temp.array
+        sorted_node_indices = sorted_node_indices_temp.array
+        sorted_array_indices = sorted_array_indices_temp.array

-    wp.copy(dest=sorted_node_indices, src=node_indices, count=index_count)
+        wp.copy(dest=sorted_node_indices, src=node_indices, count=index_count)

-    indices_per_element = 1 if node_indices.ndim == 1 else node_indices.shape[-1]
-    wp.launch(
-        kernel=_iota_kernel,
-        dim=index_count,
-        inputs=[sorted_array_indices, indices_per_element],
-        device=sorted_array_indices.device,
-    )
+        indices_per_element = 1 if node_indices.ndim == 1 else node_indices.shape[-1]
+        wp.launch(
+            kernel=_iota_kernel,
+            dim=index_count,
+            inputs=[sorted_array_indices, indices_per_element],
+        )

-    # Sort indices
-    radix_sort_pairs(sorted_node_indices, sorted_array_indices, count=index_count)
+        # Sort indices
+        radix_sort_pairs(sorted_node_indices, sorted_array_indices, count=index_count)

-    # Build prefix sum of number of elements per node
-    unique_node_indices_temp = borrow_temporary(
-        temporary_store, shape=index_count, dtype=int
-    )
-    node_element_counts_temp = borrow_temporary(
-        temporary_store, shape=index_count, dtype=int, device=node_indices.device
-    )
+        # Build prefix sum of number of elements per node
+        unique_node_indices_temp = cache.borrow_temporary(temporary_store, shape=index_count, dtype=int)
+        node_element_counts_temp = cache.borrow_temporary(temporary_store, shape=index_count, dtype=int)

-    unique_node_indices = unique_node_indices_temp.array
-    node_element_counts = node_element_counts_temp.array
+        unique_node_indices = unique_node_indices_temp.array
+        node_element_counts = node_element_counts_temp.array

-    unique_node_count_dev = borrow_temporary(temporary_store, shape=(1,), dtype=int, device=node_indices.device)
-    runlength_encode(
-        sorted_node_indices,
-        unique_node_indices,
-        node_element_counts,
-        value_count=index_count,
-        run_count=unique_node_count_dev.array,
-    )
+        unique_node_count_dev = cache.borrow_temporary(temporary_store, shape=(1,), dtype=int)

-    # Transfer run count back to host
-    if node_indices.device.is_cuda:
-        unique_node_count_host = borrow_temporary(temporary_store, shape=(1,), dtype=int, pinned=True, device="cpu")
-        wp.copy(src=unique_node_count_dev.array, dest=unique_node_count_host.array, count=1)
-        wp.synchronize_stream(wp.get_stream(node_indices.device))
-        unique_node_count_dev.release()
+        runlength_encode(
+            sorted_node_indices,
+            unique_node_indices,
+            node_element_counts,
+            value_count=index_count,
+            run_count=unique_node_count_dev.array,
+        )
+
+        # Scatter seen run counts to global array of element count per node
+        node_offsets_temp = cache.borrow_temporary(temporary_store, shape=(node_count + 1), dtype=int)
+        node_offsets = node_offsets_temp.array
+
+        node_offsets.zero_()
+        wp.launch(
+            kernel=_scatter_node_counts,
+            dim=node_count + 1,  # +1 to accommodate possible NULL node,
+            inputs=[node_element_counts, unique_node_indices, node_offsets, unique_node_count_dev.array],
+        )
+
+        if device.is_cuda and return_unique_nodes:
+            unique_node_count_host = cache.borrow_temporary(
+                temporary_store, shape=(1,), dtype=int, pinned=True, device="cpu"
+            )
+            wp.copy(src=unique_node_count_dev.array, dest=unique_node_count_host.array, count=1)
+            copy_event = cache.capture_event(device)
+
+        # Prefix sum of number of elements per node
+        array_scan(node_offsets, node_offsets, inclusive=True)
+
+        sorted_node_indices_temp.release()
+        node_element_counts_temp.release()
+
+        if not return_unique_nodes:
+            unique_node_count_dev.release()
+            return node_offsets_temp, sorted_array_indices_temp
+
+        if device.is_cuda:
+            cache.synchronize_event(copy_event)
+            unique_node_count_dev.release()
+        else:
+            unique_node_count_host = unique_node_count_dev
         unique_node_count = int(unique_node_count_host.array.numpy()[0])
         unique_node_count_host.release()
-    else:
-        unique_node_count = int(unique_node_count_dev.array.numpy()[0])
-        unique_node_count_dev.release()
+        return node_offsets_temp, sorted_array_indices_temp, unique_node_count, unique_node_indices_temp

-    # Scatter seen run counts to global array of element count per node
-    node_offsets_temp = borrow_temporary(
-        temporary_store, shape=(node_count + 1), device=node_element_counts.device, dtype=int
-    )
-    node_offsets = node_offsets_temp.array

-    node_offsets.zero_()
-    wp.launch(
-        kernel=_scatter_node_counts,
-        dim=unique_node_count,
-        inputs=[node_element_counts, unique_node_indices, node_offsets],
-        device=node_offsets.device,
-    )
+def host_read_at_index(array: wp.array, index: int = -1, temporary_store: cache.TemporaryStore = None) -> int:
+    """Returns the value of the array element at the given index on host"""

-    # Prefix sum of number of elements per node
-    array_scan(node_offsets, node_offsets, inclusive=True)
+    if index < 0:
+        index += array.shape[0]

-    sorted_node_indices_temp.release()
-    node_element_counts_temp.release()
+    if array.device.is_cuda:
+        temp = cache.borrow_temporary(temporary_store, shape=1, dtype=int, pinned=True, device="cpu")
+        wp.copy(dest=temp.array, src=array, src_offset=index, count=1)
+        wp.synchronize_stream(wp.get_stream(array.device))
+        return temp.array.numpy()[0]

-    return node_offsets_temp, sorted_array_indices_temp, unique_node_count, unique_node_indices_temp
+    return array.numpy()[index]


 def masked_indices(
-    mask: wp.array, missing_index=-1, temporary_store: TemporaryStore = None
-) -> Tuple[Temporary, Temporary]:
+    mask: wp.array, missing_index=-1, temporary_store: cache.TemporaryStore = None
+) -> Tuple[cache.Temporary, cache.Temporary]:
     """
     From an array of boolean masks (must be either 0 or 1), returns:
      - The list of indices for which the mask is 1
      - A map associating to each element of the input mask array its local index if non-zero, or missing_index if zero.
     """

-    offsets_temp = borrow_temporary_like(mask, temporary_store)
+    offsets_temp = cache.borrow_temporary_like(mask, temporary_store)
     offsets = offsets_temp.array

     wp.utils.array_scan(mask, offsets, inclusive=True)

     # Get back total counts on host
-    if mask.device.is_cuda:
-        masked_count_temp = borrow_temporary(temporary_store, shape=1, dtype=int, pinned=True, device="cpu")
-        wp.copy(dest=masked_count_temp.array, src=offsets, src_offset=offsets.shape[0] - 1, count=1)
-        wp.synchronize_stream(wp.get_stream(offsets.device))
-        masked_count = int(masked_count_temp.array.numpy()[0])
-        masked_count_temp.release()
-    else:
-        masked_count = int(offsets.numpy()[-1])
+    masked_count = int(host_read_at_index(offsets, temporary_store=temporary_store))

     # Convert counts to indices
-    indices_temp = borrow_temporary(temporary_store, shape=masked_count, device=mask.device, dtype=int)
+    indices_temp = cache.borrow_temporary(temporary_store, shape=masked_count, device=mask.device, dtype=int)

     wp.launch(
         kernel=_masked_indices_kernel,
@@ -262,10 +468,22 @@ def _iota_kernel(indices: wp.array(dtype=int), divisor: int):

 @wp.kernel
 def _scatter_node_counts(
-    unique_counts: wp.array(dtype=int), unique_node_indices: wp.array(dtype=int), node_counts: wp.array(dtype=int)
+    unique_counts: wp.array(dtype=int),
+    unique_node_indices: wp.array(dtype=int),
+    node_counts: wp.array(dtype=int),
+    unique_node_count: wp.array(dtype=int),
 ):
     i = wp.tid()
-    node_counts[1 + unique_node_indices[i]] = unique_counts[i]
+
+    if i >= unique_node_count[0]:
+        return
+
+    node_index = unique_node_indices[i]
+    if node_index == NULL_NODE_INDEX:
+        wp.atomic_sub(unique_node_count, 0, 1)
+        return
+
+    node_counts[1 + node_index] = unique_counts[i]


 @wp.kernel
@@ -467,7 +685,7 @@ def grid_to_hexes(Nx: int, Ny: int, Nz: int):
         Nz: Resolution of the grid along `z` dimension

     Returns:
-        Array of shape (Nx * Ny * Nz, 8) containing vertex indices for each
+        Array of shape (Nx * Ny * Nz, 8) containing vertex indices for each hexahedron
     """

     hex_vtx = np.array(
warp/native/array.h
CHANGED
@@ -207,6 +207,22 @@ struct array_t
         strides[3] = sizeof(T);
     }

+    CUDA_CALLABLE array_t(uint64 data, int size, uint64 grad=0)
+        : array_t((T*)(data), size, (T*)(grad))
+    {}
+
+    CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, uint64 grad=0)
+        : array_t((T*)(data), dim0, dim1, (T*)(grad))
+    {}
+
+    CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, int dim2, uint64 grad=0)
+        : array_t((T*)(data), dim0, dim1, dim2, (T*)(grad))
+    {}
+
+    CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, int dim2, int dim3, uint64 grad=0)
+        : array_t((T*)(data), dim0, dim1, dim2, dim3, (T*)(grad))
+    {}
+
     CUDA_CALLABLE inline bool empty() const { return !data; }

     T* data;
warp/native/builtin.h
CHANGED
@@ -1145,21 +1145,6 @@ struct launch_bounds_t
     size_t size; // total number of threads
 };

-#ifndef __CUDACC__
-static size_t s_threadIdx;
-#endif
-
-inline CUDA_CALLABLE size_t grid_index()
-{
-#ifdef __CUDACC__
-    // Need to cast at least one of the variables being multiplied so that type promotion happens before the multiplication
-    size_t grid_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
-    return grid_index;
-#else
-    return s_threadIdx;
-#endif
-}
-
 inline CUDA_CALLABLE int tid(size_t index)
 {
     // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
warp/native/cuda_util.cpp
CHANGED
@@ -24,14 +24,11 @@
 #include <stack>

 // the minimum CUDA version required from the driver
-#define WP_CUDA_DRIVER_VERSION
+#define WP_CUDA_DRIVER_VERSION 11040

 // the minimum CUDA Toolkit version required to build Warp
 #define WP_CUDA_TOOLKIT_VERSION 11050

-#define WP_CUDA_VERSION_MAJOR(version) (version / 1000)
-#define WP_CUDA_VERSION_MINOR(version) ((version % 1000) / 10)
-
 // check if the CUDA Toolkit is too old
 #if CUDA_VERSION < WP_CUDA_TOOLKIT_VERSION
 #error Building Warp requires CUDA Toolkit version 11.5 or higher
@@ -108,6 +105,17 @@ bool ContextGuard::always_restore = false;

 CudaTimingState* g_cuda_timing_state = NULL;

+
+static inline int get_major(int version)
+{
+    return version / 1000;
+}
+
+static inline int get_minor(int version)
+{
+    return (version % 1000) / 10;
+}
+
 static bool get_driver_entry_point(const char* name, void** pfn)
 {
     if (!pfn_cuGetProcAddress || !name || !pfn)
@@ -163,8 +171,8 @@ bool init_cuda_driver()
     if (driver_version < WP_CUDA_DRIVER_VERSION)
     {
         fprintf(stderr, "Warp CUDA error: Warp requires CUDA driver %d.%d or higher, but the current driver only supports CUDA %d.%d\n",
-            WP_CUDA_VERSION_MAJOR(WP_CUDA_DRIVER_VERSION), WP_CUDA_VERSION_MINOR(WP_CUDA_DRIVER_VERSION),
-            WP_CUDA_VERSION_MAJOR(driver_version), WP_CUDA_VERSION_MINOR(driver_version));
+            get_major(WP_CUDA_DRIVER_VERSION), get_minor(WP_CUDA_DRIVER_VERSION),
+            get_major(driver_version), get_minor(driver_version));
         return false;
     }
 }