PyPI - warp-lang - Versions diffs - 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (315) hide show

warp/__init__.py +15 -7
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +22 -443
warp/build_dll.py +384 -0
warp/builtins.py +998 -488
warp/codegen.py +1307 -739
warp/config.py +5 -3
warp/constants.py +6 -0
warp/context.py +1291 -548
warp/dlpack.py +31 -31
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +164 -55
warp/native/builtin.h +150 -174
warp/native/bvh.cpp +75 -328
warp/native/bvh.cu +406 -23
warp/native/bvh.h +37 -45
warp/native/clang/clang.cpp +136 -24
warp/native/crt.cpp +1 -76
warp/native/crt.h +111 -104
warp/native/cuda_crt.h +1049 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -949
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/initializer_array.h +2 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +13 -16
warp/native/marching.cu +157 -161
warp/native/mat.h +119 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +108 -83
warp/native/mesh.cu +243 -6
warp/native/mesh.h +1547 -458
warp/native/nanovdb/NanoVDB.h +1 -1
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +157 -0
warp/native/reduce.cu +348 -0
warp/native/runlength_encode.cpp +62 -0
warp/native/runlength_encode.cu +46 -0
warp/native/scan.cu +11 -13
warp/native/scan.h +1 -0
warp/native/solid_angle.h +442 -0
warp/native/sort.cpp +13 -0
warp/native/sort.cu +9 -1
warp/native/sparse.cpp +338 -0
warp/native/sparse.cu +545 -0
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +30 -0
warp/native/vec.h +126 -24
warp/native/volume.h +120 -0
warp/native/warp.cpp +658 -53
warp/native/warp.cu +660 -68
warp/native/warp.h +112 -12
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +392 -152
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +21 -8
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +161 -19
warp/sim/model.py +795 -291
warp/sim/optimizer.py +2 -6
warp/sim/render.py +65 -3
warp/sim/utils.py +3 -0
warp/sparse.py +1227 -0
warp/stubs.py +665 -223
warp/tape.py +66 -15
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/torus.usda +105 -105
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +1497 -211
warp/tests/test_array_reduce.py +150 -0
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +75 -43
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +233 -128
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +136 -108
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -74
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +180 -116
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +577 -24
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +251 -15
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +508 -2778
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +325 -34
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +190 -0
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +460 -0
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +331 -85
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -1987
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +72 -30
warp/types.py +1744 -713
warp/utils.py +360 -350
warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/bin/warp-clang.exp +0 -0
warp/bin/warp-clang.lib +0 -0
warp/bin/warp.exp +0 -0
warp/bin/warp.lib +0 -0
warp/tests/test_all.py +0 -215
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.9.0.dist-info/METADATA +0 -20
warp_lang-0.9.0.dist-info/RECORD +0 -177
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/fem/utils.py ADDED Viewed

@@ -0,0 +1,495 @@
+from typing import Any, Tuple
+import numpy as np
+import warp as wp
+from warp.fem.cache import (
+    Temporary,
+    TemporaryStore,
+    borrow_temporary,
+    borrow_temporary_like,
+)
+from warp.utils import array_scan, radix_sort_pairs, runlength_encode
+@wp.func
+def generalized_outer(x: Any, y: Any):
+    """Generalized outer product allowing for the first argument to be a scalar.
+    This generic overload forwards both arguments to `wp.outer`; the typed overloads
+    below handle the case where `x` is a `wp.float32`.
+    """
+    return wp.outer(x, y)
+# Scalar-first overload: with a float32 `x`, the result is `y` scaled by `x` (a vec2).
+@wp.func
+def generalized_outer(x: wp.float32, y: wp.vec2):
+    return x * y
+# Scalar-first overload for 3D vectors: the result is `y` scaled by `x` (a vec3).
+@wp.func
+def generalized_outer(x: wp.float32, y: wp.vec3):
+    return x * y
+@wp.func
+def generalized_inner(x: Any, y: Any):
+    """Generalized inner product allowing for the first argument to be a tensor.
+    This generic overload forwards both arguments to `wp.dot`; the typed overloads
+    below handle a square-matrix `x` combined with a vector `y`.
+    """
+    return wp.dot(x, y)
+# Matrix-first overload: sums `x[i] * y[i]` over i = 0, 1.
+# NOTE(review): presumably `x[i]` is row i of the matrix, making this the product transpose(x) * y -- confirm.
+@wp.func
+def generalized_inner(x: wp.mat22, y: wp.vec2):
+    return x[0] * y[0] + x[1] * y[1]
+# Same as the mat22 overload, for 3x3 matrices and 3D vectors (i = 0, 1, 2).
+@wp.func
+def generalized_inner(x: wp.mat33, y: wp.vec3):
+    return x[0] * y[0] + x[1] * y[1] + x[2] * y[2]
+@wp.func
+def apply_right(x: Any, y: Any):
+    """Performs x y multiplication with y a square matrix and x either a row-vector or a matrix.
+    Will be removed once native @ operator is implemented.
+    This generic overload simply evaluates `x * y`; the typed overloads below cover
+    the row-vector cases.
+    """
+    return x * y
+# Row-vector overload: sums `x[i] * y[i]` over i = 0, 1.
+# NOTE(review): presumably `y[i]` is row i of the matrix, so this is the row-vector/matrix product -- confirm.
+@wp.func
+def apply_right(x: wp.vec2, y: wp.mat22):
+    return x[0] * y[0] + x[1] * y[1]
+# Same as the vec2/mat22 overload, for 3D vectors and 3x3 matrices (i = 0, 1, 2).
+@wp.func
+def apply_right(x: wp.vec3, y: wp.mat33):
+    return x[0] * y[0] + x[1] * y[1] + x[2] * y[2]
+@wp.func
+def unit_element(template_type: Any, coord: int):
+    """Returns an instance of `template_type` with a single coordinate set to 1 in the canonical basis.
+    Only the type of `template_type` is used by this generic overload: a zero-initialized
+    value of that type is built and its entry `coord` is set to 1.
+    """
+    t = type(template_type)(0.0)
+    t[coord] = 1.0
+    return t
+# Scalar overload: the only unit element is 1, so `coord` is ignored.
+@wp.func
+def unit_element(template_type: wp.float32, coord: int):
+    return 1.0
+# 2x2 matrix overload: `coord` is a flat row-major index (row = coord // 2, col = remainder).
+@wp.func
+def unit_element(template_type: wp.mat22, coord: int):
+    t = wp.mat22(0.0)
+    row = coord // 2
+    col = coord - 2 * row
+    t[row, col] = 1.0
+    return t
+# 3x3 matrix overload: `coord` is a flat row-major index (row = coord // 3, col = remainder).
+@wp.func
+def unit_element(template_type: wp.mat33, coord: int):
+    t = wp.mat33(0.0)
+    row = coord // 3
+    col = coord - 3 * row
+    t[row, col] = 1.0
+    return t
+@wp.func
+def symmetric_part(x: Any):
+    """Symmetric part of a square tensor, computed as half the sum of `x` and its transpose"""
+    return 0.5 * (x + wp.transpose(x))
+@wp.func
+def skew_part(x: wp.mat22):
+    """Skew part of a 2x2 tensor as corresponding rotation angle.
+    Returns the scalar 0.5 * (x[1, 0] - x[0, 1]).
+    """
+    return 0.5 * (x[1, 0] - x[0, 1])
+@wp.func
+def skew_part(x: wp.mat33):
+    """Skew part of a 3x3 tensor as the corresponding rotation vector.
+    Each component is half the difference of one pair of off-diagonal coefficients.
+    """
+    # Components built from the (2,1)/(1,2), (0,2)/(2,0) and (1,0)/(0,1) off-diagonal pairs
+    a = 0.5 * (x[2, 1] - x[1, 2])
+    b = 0.5 * (x[0, 2] - x[2, 0])
+    c = 0.5 * (x[1, 0] - x[0, 1])
+    return wp.vec3(a, b, c)
+def compress_node_indices(
+    node_count: int, node_indices: wp.array(dtype=int), temporary_store: TemporaryStore = None
+) -> Tuple[Temporary, Temporary, int, Temporary]:
+    """
+    Compress an unsorted list of node indices into:
+     - a node_offsets array, giving for each node the start offset of corresponding indices in sorted_array_indices
+     - a sorted_array_indices array, listing the indices in the input array corresponding to each node
+     - the number of unique node indices
+     - a unique_node_indices array containing the sorted list of unique node indices (i.e. the list of indices i for which node_offsets[i] < node_offsets[i+1])
+    Args:
+        node_count: Total number of nodes; the returned node_offsets array has `node_count + 1` entries
+        node_indices: Array of node indices to compress
+        temporary_store: Store from which the temporary arrays are borrowed
+    Returns:
+        Tuple `(node_offsets, sorted_array_indices, unique_node_count, unique_node_indices)` where the
+        three arrays are returned as `Temporary` objects and the count as a host-side `int`.
+    """
+    index_count = node_indices.size
+    # The sort buffers are allocated with twice as many entries as there are indices
+    # NOTE(review): presumably `radix_sort_pairs` uses the second half as scratch space -- confirm
+    sorted_node_indices_temp = borrow_temporary(
+        temporary_store, shape=2 * index_count, dtype=int, device=node_indices.device
+    )
+    sorted_array_indices_temp = borrow_temporary_like(sorted_node_indices_temp, temporary_store)
+    sorted_node_indices = sorted_node_indices_temp.array
+    sorted_array_indices = sorted_array_indices_temp.array
+    # Sort keys: copy of the input node indices in the first `index_count` entries
+    wp.copy(dest=sorted_node_indices, src=node_indices, count=index_count)
+    # Sort values: flat entry i maps to `i // indices_per_element`, where the divisor is the
+    # size of the last dimension for multi-dimensional input and 1 otherwise
+    indices_per_element = 1 if node_indices.ndim == 1 else node_indices.shape[-1]
+    wp.launch(
+        kernel=_iota_kernel,
+        dim=index_count,
+        inputs=[sorted_array_indices, indices_per_element],
+        device=sorted_array_indices.device,
+    )
+    # Sort indices
+    radix_sort_pairs(sorted_node_indices, sorted_array_indices, count=index_count)
+    # Build prefix sum of number of elements per node
+    unique_node_indices_temp = borrow_temporary(
+        temporary_store, shape=index_count, dtype=int, device=node_indices.device
+    )
+    node_element_counts_temp = borrow_temporary(
+        temporary_store, shape=index_count, dtype=int, device=node_indices.device
+    )
+    unique_node_indices = unique_node_indices_temp.array
+    node_element_counts = node_element_counts_temp.array
+    # Single-entry array receiving the number of runs found by the run-length encoding
+    unique_node_count_dev = borrow_temporary(temporary_store, shape=(1,), dtype=int, device=sorted_node_indices.device)
+    runlength_encode(
+        sorted_node_indices,
+        unique_node_indices,
+        node_element_counts,
+        value_count=index_count,
+        run_count=unique_node_count_dev.array,
+    )
+    # Transfer unique node count to host
+    if node_indices.device.is_cuda:
+        # Copy through a pinned CPU buffer and wait for the device stream before reading the value
+        unique_node_count_host = borrow_temporary(temporary_store, shape=(1,), dtype=int, pinned=True, device="cpu")
+        wp.copy(src=unique_node_count_dev.array, dest=unique_node_count_host.array, count=1)
+        wp.synchronize_stream(wp.get_stream(node_indices.device))
+        unique_node_count_dev.release()
+        unique_node_count = int(unique_node_count_host.array.numpy()[0])
+        unique_node_count_host.release()
+    else:
+        unique_node_count = int(unique_node_count_dev.array.numpy()[0])
+        unique_node_count_dev.release()
+    # Scatter seen run counts to global array of element count per node
+    node_offsets_temp = borrow_temporary(
+        temporary_store, shape=(node_count + 1), device=node_element_counts.device, dtype=int
+    )
+    node_offsets = node_offsets_temp.array
+    # Nodes that never appear in the input keep a count of zero
+    node_offsets.zero_()
+    # The count of node k is written at position k + 1, so that the inclusive scan below
+    # leaves the start offset of node k at position k
+    wp.launch(
+        kernel=_scatter_node_counts,
+        dim=unique_node_count,
+        inputs=[node_element_counts, unique_node_indices, node_offsets],
+        device=node_offsets.device,
+    )
+    # Prefix sum of number of elements per node
+    array_scan(node_offsets, node_offsets, inclusive=True)
+    # Release the temporaries that are not handed back to the caller
+    sorted_node_indices_temp.release()
+    node_element_counts_temp.release()
+    return node_offsets_temp, sorted_array_indices_temp, unique_node_count, unique_node_indices_temp
+def masked_indices(
+    mask: wp.array, missing_index=-1, temporary_store: TemporaryStore = None
+) -> Tuple[Temporary, Temporary]:
+    """
+    From an array of boolean masks (must be either 0 or 1), returns:
+      - The list of indices for which the mask is 1
+      - A map associating to each element of the input mask array its local index if non-zero, or missing_index if zero.
+    Args:
+        mask: Array of 0/1 values
+        missing_index: Value stored in the map for elements whose mask is zero
+        temporary_store: Store from which the temporary arrays are borrowed
+    Returns:
+        Tuple `(indices, map)` of `Temporary` objects, in the order listed above
+    """
+    offsets_temp = borrow_temporary_like(mask, temporary_store)
+    offsets = offsets_temp.array
+    # Inclusive scan of the mask: the last entry is the number of non-zero mask values
+    wp.utils.array_scan(mask, offsets, inclusive=True)
+    # Get back total counts on host
+    # NOTE(review): both branches read the last entry of `offsets`; an empty mask is presumably not supported -- confirm
+    if offsets.device.is_cuda:
+        # Copy the last scan value through a pinned CPU buffer and wait for the device stream
+        masked_count_temp = borrow_temporary(temporary_store, shape=1, dtype=int, pinned=True, device="cpu")
+        wp.copy(dest=masked_count_temp.array, src=offsets, src_offset=offsets.shape[0] - 1, count=1)
+        wp.synchronize_stream(wp.get_stream(offsets.device))
+        masked_count = int(masked_count_temp.array.numpy()[0])
+        masked_count_temp.release()
+    else:
+        masked_count = int(offsets.numpy()[-1])
+    # Convert counts to indices
+    indices_temp = borrow_temporary(temporary_store, shape=masked_count, device=mask.device, dtype=int)
+    # `offsets` is passed both as the scan input and as the output map, so it is
+    # overwritten in place and returned below as the global-to-masked map
+    wp.launch(
+        kernel=_masked_indices_kernel,
+        dim=offsets.shape,
+        inputs=[missing_index, mask, offsets, indices_temp.array, offsets],
+        device=mask.device,
+    )
+    return indices_temp, offsets_temp
+def array_axpy(x: wp.array, y: wp.array, alpha: float = 1.0, beta: float = 1.0):
+    """Performs y = alpha*x + beta*y"""
+    dtype = wp.types.type_scalar_type(x.dtype)
+    alpha = dtype(alpha)
+    beta = dtype(beta)
+    if not wp.types.types_equal(x.dtype, y.dtype) or x.shape != y.shape or x.device != y.device:
+        raise ValueError("x and y arrays must have same dat atype, shape and device")
+    wp.launch(kernel=_array_axpy_kernel, dim=x.shape, device=x.device, inputs=[x, y, alpha, beta])
+# Fills `indices` so that entry i holds `i // divisor` (with a divisor of 1, entry i holds i).
+@wp.kernel
+def _iota_kernel(indices: wp.array(dtype=int), divisor: int):
+    indices[wp.tid()] = wp.tid() // divisor
+# For each run i, writes its count into `node_counts` at the position following the run's node index.
+# The one-entry shift leaves slot 0 untouched by this kernel.
+@wp.kernel
+def _scatter_node_counts(
+    unique_counts: wp.array(dtype=int), unique_node_indices: wp.array(dtype=int), node_counts: wp.array(dtype=int)
+):
+    i = wp.tid()
+    node_counts[1 + unique_node_indices[i]] = unique_counts[i]
+# Builds both index maps from a 0/1 mask and its inclusive scan `offsets`:
+#  - `global_to_masked[i]` receives the masked index of entry i, or `missing_index` when mask[i] is zero
+#  - `masked_to_global[k]` receives the global index of the k-th non-zero entry
+# Each thread reads `offsets[i]` before writing `global_to_masked[i]`, and touches no other entry of either.
+@wp.kernel
+def _masked_indices_kernel(
+    missing_index: int,
+    mask: wp.array(dtype=int),
+    offsets: wp.array(dtype=int),
+    masked_to_global: wp.array(dtype=int),
+    global_to_masked: wp.array(dtype=int),
+):
+    i = wp.tid()
+    if mask[i] == 0:
+        global_to_masked[i] = missing_index
+    else:
+        # The inclusive scan counts the current entry, hence the shift by one
+        masked_idx = offsets[i] - 1
+        global_to_masked[i] = masked_idx
+        masked_to_global[masked_idx] = i
+# Per-entry update y[i] = beta * y[i] + alpha * x[i]; `y` is modified in place.
+@wp.kernel
+def _array_axpy_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any), alpha: Any, beta: Any):
+    i = wp.tid()
+    y[i] = beta * y[i] + alpha * x[i]
+def grid_to_tris(Nx: int, Ny: int):
+    """Constructs a triangular mesh topology by dividing each cell of a dense 2D grid into two triangles.
+    The resulting triangles will be oriented counter-clockwise assuming that `y` is the fastest moving index direction
+    Args:
+        Nx: Resolution of the grid along `x` dimension
+        Ny: Resolution of the grid along `y` dimension
+    Returns:
+        Array of shape (2 * Nx * Ny, 3) containing vertex indices for each triangle
+    """
+    cx, cy = np.meshgrid(np.arange(Nx, dtype=int), np.arange(Ny, dtype=int), indexing="ij")
+    vidx = np.transpose(
+        np.array(
+            [
+                (Ny + 1) * cx + cy,
+                (Ny + 1) * (cx + 1) + cy,
+                (Ny + 1) * (cx + 1) + (cy + 1),
+                (Ny + 1) * cx + cy,
+                (Ny + 1) * (cx + 1) + (cy + 1),
+                (Ny + 1) * (cx) + (cy + 1),
+            ]
+        )
+    ).reshape((-1, 3))
+    return vidx
+def grid_to_tets(Nx: int, Ny: int, Nz: int):
+    """Constructs a tetrahedral mesh topology by dividing each cell of a dense 3D grid into five tetrahedrons
+    The resulting tets have positive volume assuming that `z` is the fastest moving index direction
+    Args:
+        Nx: Resolution of the grid along `x` dimension
+        Ny: Resolution of the grid along `y` dimension
+        Nz: Resolution of the grid along `z` dimension
+    Returns:
+        Array of shape (5 * Nx * Ny * Nz, 4) containing vertex indices for each tet
+    """
+    # Global node indices for each cell
+    cx, cy, cz = np.meshgrid(
+        np.arange(Nx, dtype=int), np.arange(Ny, dtype=int), np.arange(Nz, dtype=int), indexing="ij"
+    )
+    # Shape (8, Nx, Ny, Nz): corner k of a cell has local offsets (dx, dy, dz) with k = 4 * dx + 2 * dy + dz
+    grid_vidx = np.array(
+        [
+            (Ny + 1) * (Nz + 1) * cx + (Nz + 1) * cy + cz,
+            (Ny + 1) * (Nz + 1) * cx + (Nz + 1) * cy + cz + 1,
+            (Ny + 1) * (Nz + 1) * cx + (Nz + 1) * (cy + 1) + cz,
+            (Ny + 1) * (Nz + 1) * cx + (Nz + 1) * (cy + 1) + cz + 1,
+            (Ny + 1) * (Nz + 1) * (cx + 1) + (Nz + 1) * cy + cz,
+            (Ny + 1) * (Nz + 1) * (cx + 1) + (Nz + 1) * cy + cz + 1,
+            (Ny + 1) * (Nz + 1) * (cx + 1) + (Nz + 1) * (cy + 1) + cz,
+            (Ny + 1) * (Nz + 1) * (cx + 1) + (Nz + 1) * (cy + 1) + cz + 1,
+        ]
+    )
+    # decompose grid cells into 5 tets
+    # Each row lists the four cell-local corner indices of one tet
+    tet_vidx = np.array(
+        [
+            [0, 1, 2, 4],
+            [3, 2, 1, 7],
+            [5, 1, 7, 4],
+            [6, 7, 4, 2],
+            [4, 1, 2, 7],
+        ]
+    )
+    # Convert to 3d index coordinates
+    # Row k holds the (dx, dy, dz) offsets of local corner k, consistent with the numbering above
+    vidx_coords = np.array(
+        [
+            [0, 0, 0],
+            [0, 0, 1],
+            [0, 1, 0],
+            [0, 1, 1],
+            [1, 0, 0],
+            [1, 0, 1],
+            [1, 1, 0],
+            [1, 1, 1],
+        ]
+    )
+    # Shape (5, 4, 3): corner offsets of every vertex of every tet of the reference cell
+    tet_coords = vidx_coords[tet_vidx]
+    # Symmetry bits for each cell
+    # Parity of the cell index along each axis
+    ox, oy, oz = np.meshgrid(
+        np.arange(Nx, dtype=int) % 2, np.arange(Ny, dtype=int) % 2, np.arange(Nz, dtype=int) % 2, indexing="ij"
+    )
+    # Replicate the reference decomposition for every cell: shape (Nx, Ny, Nz, 5, 4, 3)
+    tet_coords = np.broadcast_to(tet_coords, shape=(*ox.shape, *tet_coords.shape))
+    # Flip coordinates according to symmetry
+    # XOR with the parity bit mirrors the local offsets along an axis for odd-indexed cells
+    # NOTE(review): presumably this makes the diagonals of neighbouring cells match -- confirm
+    ox_bk = np.broadcast_to(ox.reshape(*ox.shape, 1, 1), tet_coords.shape[:-1])
+    oy_bk = np.broadcast_to(oy.reshape(*oy.shape, 1, 1), tet_coords.shape[:-1])
+    oz_bk = np.broadcast_to(oz.reshape(*oz.shape, 1, 1), tet_coords.shape[:-1])
+    tet_coords_x = tet_coords[..., 0] ^ ox_bk
+    tet_coords_y = tet_coords[..., 1] ^ oy_bk
+    tet_coords_z = tet_coords[..., 2] ^ oz_bk
+    # Back to local vertex indices
+    corner_indices = 4 * tet_coords_x + 2 * tet_coords_y + tet_coords_z
+    # Now go from cell-local to global node indices
+    # There must be a nicer way than this, but for small grids this works
+    # One row of four local corner indices per tet
+    corner_indices = corner_indices.reshape(-1, 4)
+    # Repeat the eight corner node indices of each cell five times, once per tet of that cell
+    grid_vidx = grid_vidx.reshape((8, -1, 1))
+    grid_vidx = np.broadcast_to(grid_vidx, shape=(8, grid_vidx.shape[1], 5))
+    grid_vidx = grid_vidx.reshape((8, -1))
+    # Despite its name, this enumerates the tets (columns of the reshaped `grid_vidx`)
+    node_indices = np.arange(corner_indices.shape[0])
+    # For each tet vertex, look up the global node index of the selected local corner
+    tet_grid_vidx = np.transpose(
+        [
+            grid_vidx[corner_indices[:, 0], node_indices],
+            grid_vidx[corner_indices[:, 1], node_indices],
+            grid_vidx[corner_indices[:, 2], node_indices],
+            grid_vidx[corner_indices[:, 3], node_indices],
+        ]
+    )
+    return tet_grid_vidx
+def grid_to_quads(Nx: int, Ny: int):
+    """Constructs a quadrilateral mesh topology from a dense 2D grid
+    The resulting quads will be indexed counter-clockwise
+    Args:
+        Nx: Resolution of the grid along `x` dimension
+        Ny: Resolution of the grid along `y` dimension
+    Returns:
+        Array of shape (Nx * Ny, 4) containing vertex indices for each quadrilateral
+    """
+    quad_vtx = np.array(
+        [
+            [0, 0],
+            [1, 0],
+            [1, 1],
+            [0, 1],
+        ]
+    ).T
+    quads = np.stack(np.meshgrid(np.arange(0, Nx), np.arange(0, Ny), indexing="ij"))
+    quads_vtx_shape = (*quads.shape, quad_vtx.shape[1])
+    quads_vtx = np.broadcast_to(quads.reshape(*quads.shape, 1), quads_vtx_shape) + np.broadcast_to(
+        quad_vtx.reshape(2, 1, 1, quad_vtx.shape[1]), quads_vtx_shape
+    )
+    quad_vtx_indices = quads_vtx[0] * (Ny + 1) + quads_vtx[1]
+    return quad_vtx_indices.reshape(-1, 4)
+def grid_to_hexes(Nx: int, Ny: int, Nz: int):
+    """Constructs a hexahedral mesh topology from a dense 3D grid
+    The resulting hexes will be indexed following usual convention assuming that `z` is the fastest moving index direction
+    (counter-clockwise bottom vertices, then counter-clockwise top vertices)
+    Args:
+        Nx: Resolution of the grid along `x` dimension
+        Ny: Resolution of the grid along `y` dimension
+        Nz: Resolution of the grid along `z` dimension
+    Returns:
+        Array of shape (Nx * Ny * Nz, 8) containing vertex indices for each hexaedron
+    """
+    hex_vtx = np.array(
+        [
+            [0, 0, 0],
+            [1, 0, 0],
+            [1, 1, 0],
+            [0, 1, 0],
+            [0, 0, 1],
+            [1, 0, 1],
+            [1, 1, 1],
+            [0, 1, 1],
+        ]
+    ).T
+    hexes = np.stack(np.meshgrid(np.arange(0, Nx), np.arange(0, Ny), np.arange(0, Nz), indexing="ij"))
+    hexes_vtx_shape = (*hexes.shape, hex_vtx.shape[1])
+    hexes_vtx = np.broadcast_to(hexes.reshape(*hexes.shape, 1), hexes_vtx_shape) + np.broadcast_to(
+        hex_vtx.reshape(3, 1, 1, 1, hex_vtx.shape[1]), hexes_vtx_shape
+    )
+    hexes_vtx_indices = hexes_vtx[0] * (Nz + 1) * (Ny + 1) + hexes_vtx[1] * (Nz + 1) + hexes_vtx[2]
+    return hexes_vtx_indices.reshape(-1, 8)