PyPI - warp-lang - Versions diffs - 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (315)

warp/__init__.py +15 -7
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +22 -443
warp/build_dll.py +384 -0
warp/builtins.py +998 -488
warp/codegen.py +1307 -739
warp/config.py +5 -3
warp/constants.py +6 -0
warp/context.py +1291 -548
warp/dlpack.py +31 -31
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +164 -55
warp/native/builtin.h +150 -174
warp/native/bvh.cpp +75 -328
warp/native/bvh.cu +406 -23
warp/native/bvh.h +37 -45
warp/native/clang/clang.cpp +136 -24
warp/native/crt.cpp +1 -76
warp/native/crt.h +111 -104
warp/native/cuda_crt.h +1049 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -949
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/initializer_array.h +2 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +13 -16
warp/native/marching.cu +157 -161
warp/native/mat.h +119 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +108 -83
warp/native/mesh.cu +243 -6
warp/native/mesh.h +1547 -458
warp/native/nanovdb/NanoVDB.h +1 -1
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +157 -0
warp/native/reduce.cu +348 -0
warp/native/runlength_encode.cpp +62 -0
warp/native/runlength_encode.cu +46 -0
warp/native/scan.cu +11 -13
warp/native/scan.h +1 -0
warp/native/solid_angle.h +442 -0
warp/native/sort.cpp +13 -0
warp/native/sort.cu +9 -1
warp/native/sparse.cpp +338 -0
warp/native/sparse.cu +545 -0
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +30 -0
warp/native/vec.h +126 -24
warp/native/volume.h +120 -0
warp/native/warp.cpp +658 -53
warp/native/warp.cu +660 -68
warp/native/warp.h +112 -12
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +392 -152
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +21 -8
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +161 -19
warp/sim/model.py +795 -291
warp/sim/optimizer.py +2 -6
warp/sim/render.py +65 -3
warp/sim/utils.py +3 -0
warp/sparse.py +1227 -0
warp/stubs.py +665 -223
warp/tape.py +66 -15
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/torus.usda +105 -105
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +1497 -211
warp/tests/test_array_reduce.py +150 -0
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +75 -43
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +233 -128
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +136 -108
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -74
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +180 -116
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +577 -24
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +251 -15
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +508 -2778
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +325 -34
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +190 -0
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +460 -0
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +331 -85
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -1987
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +72 -30
warp/types.py +1744 -713
warp/utils.py +360 -350
warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/bin/warp-clang.exp +0 -0
warp/bin/warp-clang.lib +0 -0
warp/bin/warp.exp +0 -0
warp/bin/warp.lib +0 -0
warp/tests/test_all.py +0 -215
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.9.0.dist-info/METADATA +0 -20
warp_lang-0.9.0.dist-info/RECORD +0 -177
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/utils.py (changed)

@@ -5,215 +5,37 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
-import os
-import math
-import timeit
 import cProfile
+import sys
+import timeit
+import warnings
+from typing import Any
 import numpy as np
-from typing import Union, Tuple
 import warp as wp
+import warp.types
-def length(a):
-    return np.linalg.norm(a)
-def length_sq(a):
-    return np.dot(a, a)
-def cross(a, b):
-    return np.array((a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]), dtype=np.float32)
-# NumPy has no normalize() method..
-def normalize(v):
-    norm = np.linalg.norm(v)
-    if norm == 0.0:
-        return v
-    return v / norm
-def skew(v):
-    return np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
-# math utils
-# def quat(i, j, k, w):
-#     return np.array([i, j, k, w])
-def quat_identity():
-    return np.array((0.0, 0.0, 0.0, 1.0))
-def quat_inverse(q):
-    return np.array((-q[0], -q[1], -q[2], q[3]))
-def quat_from_axis_angle(axis, angle):
-    v = normalize(np.array(axis))
-    half = angle * 0.5
-    w = math.cos(half)
-    sin_theta_over_two = math.sin(half)
-    v *= sin_theta_over_two
-    return np.array((v[0], v[1], v[2], w))
-def quat_to_axis_angle(quat):
-    w2 = quat[3] * quat[3]
-    if w2 > 1 - 1e-7:
-        return np.zeros(3), 0.0
-    angle = 2 * np.arccos(quat[3])
-    xyz = quat[:3] / np.sqrt(1 - w2)
-    return xyz, angle
-# quat_rotate a vector
-def quat_rotate(q, x):
-    x = np.array(x)
-    axis = np.array((q[0], q[1], q[2]))
-    return x * (2.0 * q[3] * q[3] - 1.0) + np.cross(axis, x) * q[3] * 2.0 + axis * np.dot(axis, x) * 2.0
-# multiply two quats
-def quat_multiply(a, b):
-    return np.array(
-        (
-            a[3] * b[0] + b[3] * a[0] + a[1] * b[2] - b[1] * a[2],
-            a[3] * b[1] + b[3] * a[1] + a[2] * b[0] - b[2] * a[0],
-            a[3] * b[2] + b[3] * a[2] + a[0] * b[1] - b[0] * a[1],
-            a[3] * b[3] - a[0] * b[0] - a[1] * b[1] - a[2] * b[2],
-        )
-    )
-# convert to mat33
-def quat_to_matrix(q):
-    c1 = quat_rotate(q, np.array((1.0, 0.0, 0.0)))
-    c2 = quat_rotate(q, np.array((0.0, 1.0, 0.0)))
-    c3 = quat_rotate(q, np.array((0.0, 0.0, 1.0)))
-    return np.array([c1, c2, c3]).T
-def quat_rpy(roll, pitch, yaw):
-    cy = math.cos(yaw * 0.5)
-    sy = math.sin(yaw * 0.5)
-    cr = math.cos(roll * 0.5)
-    sr = math.sin(roll * 0.5)
-    cp = math.cos(pitch * 0.5)
-    sp = math.sin(pitch * 0.5)
-    w = cy * cr * cp + sy * sr * sp
-    x = cy * sr * cp - sy * cr * sp
-    y = cy * cr * sp + sy * sr * cp
-    z = sy * cr * cp - cy * sr * sp
-    return (x, y, z, w)
-def quat_from_matrix(m):
-    tr = m[0, 0] + m[1, 1] + m[2, 2]
-    h = 0.0
-    if tr >= 0.0:
-        h = math.sqrt(tr + 1.0)
-        w = 0.5 * h
-        h = 0.5 / h
-        x = (m[2, 1] - m[1, 2]) * h
-        y = (m[0, 2] - m[2, 0]) * h
-        z = (m[1, 0] - m[0, 1]) * h
-    else:
-        i = 0
-        if m[1, 1] > m[0, 0]:
-            i = 1
-        if m[2, 2] > m[i, i]:
-            i = 2
-        if i == 0:
-            h = math.sqrt((m[0, 0] - (m[1, 1] + m[2, 2])) + 1.0)
-            x = 0.5 * h
-            h = 0.5 / h
-            y = (m[0, 1] + m[1, 0]) * h
-            z = (m[2, 0] + m[0, 2]) * h
-            w = (m[2, 1] - m[1, 2]) * h
-        elif i == 1:
-            h = math.sqrt((m[1, 1] - (m[2, 2] + m[0, 0])) + 1.0)
-            y = 0.5 * h
-            h = 0.5 / h
-            z = (m[1, 2] + m[2, 1]) * h
-            x = (m[0, 1] + m[1, 0]) * h
-            w = (m[0, 2] - m[2, 0]) * h
-        elif i == 2:
-            h = math.sqrt((m[2, 2] - (m[0, 0] + m[1, 1])) + 1.0)
-            z = 0.5 * h
-            h = 0.5 / h
-            x = (m[2, 0] + m[0, 2]) * h
-            y = (m[1, 2] + m[2, 1]) * h
-            w = (m[1, 0] - m[0, 1]) * h
-    return normalize(np.array([x, y, z, w]))
-# rigid body transform
-# def transform(x, r):
-#     return (np.array(x), np.array(r))
-def transform_identity():
-    return wp.transform(np.array((0.0, 0.0, 0.0)), quat_identity())
-# se(3) -> SE(3), Park & Lynch pg. 105, screw in [w, v] normalized form
-def transform_exp(s, angle):
-    w = np.array(s[0:3])
-    v = np.array(s[3:6])
-    if length(w) < 1.0:
-        r = quat_identity()
-    else:
-        r = quat_from_axis_angle(w, angle)
-    t = v * angle + (1.0 - math.cos(angle)) * np.cross(w, v) + (angle - math.sin(angle)) * np.cross(w, np.cross(w, v))
-    return (t, r)
-def transform_inverse(t):
-    q_inv = quat_inverse(t.q)
-    return wp.transform(-quat_rotate(q_inv, t.p), q_inv)
-def transform_vector(t, v):
-    return quat_rotate(t.q, v)
+warnings_seen = set()
-def transform_point(t, p):
-    return np.array(t.p) + quat_rotate(t.q, p)
+def warp_showwarning(message, category, filename, lineno, file=None, line=None):
+    """Version of warnings.showwarning that always prints to sys.stdout."""
+    sys.stdout.write(warnings.formatwarning(message, category, filename, lineno, line=line))
-def transform_multiply(t, u):
-    return wp.transform(quat_rotate(t.q, u.p) + t.p, quat_multiply(t.q, u.q))
+def warn(message, category=None, stacklevel=1):
+    if (category, message) in warnings_seen:
+        return
+    with warnings.catch_warnings():
+        warnings.simplefilter("default")  # Change the filter in this process
+        warnings.showwarning = warp_showwarning
+        warnings.warn(message, category, stacklevel + 1)  # Increment stacklevel by 1 since we are in a wrapper
-# flatten an array of transforms (p,q) format to a 7-vector
-def transform_flatten(t):
-    return np.array([*t.p, *t.q])
+    if category is DeprecationWarning:
+        warnings_seen.add((category, message))
 # expand a 7-vec to a tuple of arrays
@@ -221,183 +43,368 @@ def transform_expand(t):
     return wp.transform(np.array(t[0:3]), np.array(t[3:7]))
-# convert array of transforms to a array of 7-vecs
-def transform_flatten_list(xforms):
-    exp = lambda t: transform_flatten(t)
-    return list(map(exp, xforms))
-def transform_expand_list(xforms):
-    exp = lambda t: transform_expand(t)
-    return list(map(exp, xforms))
-def transform_inertia(m, I, p, q):
+@wp.func
+def quat_between_vectors(a: wp.vec3, b: wp.vec3) -> wp.quat:
     """
-    Transforms the inertia tensor described by the given mass and 3x3 inertia
-    matrix to a new frame described by the given position and orientation.
+    Compute the quaternion that rotates vector a to vector b
     """
-    R = quat_to_matrix(q)
-    # Steiner's theorem
-    return R @ I @ R.T + m * (np.dot(p, p) * np.eye(3) - np.outer(p, p))
+    a = wp.normalize(a)
+    b = wp.normalize(b)
+    c = wp.cross(a, b)
+    d = wp.dot(a, b)
+    q = wp.quat(c[0], c[1], c[2], 1.0 + d)
+    return wp.normalize(q)
-# spatial operators
+def array_scan(in_array, out_array, inclusive=True):
+    if in_array.device != out_array.device:
+        raise RuntimeError("Array storage devices do not match")
-# AdT
-def spatial_adjoint(t):
-    R = quat_to_matrix(t.q)
-    w = skew(t.p)
+    if in_array.size != out_array.size:
+        raise RuntimeError("Array storage sizes do not match")
-    A = np.zeros((6, 6))
-    A[0:3, 0:3] = R
-    A[3:6, 0:3] = np.dot(w, R)
-    A[3:6, 3:6] = R
+    if in_array.dtype != out_array.dtype:
+        raise RuntimeError("Array data types do not match")
-    return A
+    if in_array.size == 0:
+        return
+    from warp.context import runtime
-# (AdT)^-T
-def spatial_adjoint_dual(t):
-    R = quat_to_matrix(t.q)
-    w = skew(t.p)
+    if in_array.device.is_cpu:
+        if in_array.dtype == wp.int32:
+            runtime.core.array_scan_int_host(in_array.ptr, out_array.ptr, in_array.size, inclusive)
+        elif in_array.dtype == wp.float32:
+            runtime.core.array_scan_float_host(in_array.ptr, out_array.ptr, in_array.size, inclusive)
+        else:
+            raise RuntimeError("Unsupported data type")
+    elif in_array.device.is_cuda:
+        if in_array.dtype == wp.int32:
+            runtime.core.array_scan_int_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
+        elif in_array.dtype == wp.float32:
+            runtime.core.array_scan_float_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
+        else:
+            raise RuntimeError("Unsupported data type")
-    A = np.zeros((6, 6))
-    A[0:3, 0:3] = R
-    A[0:3, 3:6] = np.dot(w, R)
-    A[3:6, 3:6] = R
-    return A
+def radix_sort_pairs(keys, values, count: int):
+    if keys.device != values.device:
+        raise RuntimeError("Array storage devices do not match")
+    if count == 0:
+        return
-# AdT*s
-def transform_twist(t_ab, s_b):
-    return np.dot(spatial_adjoint(t_ab), s_b)
+    if keys.size < 2 * count or values.size < 2 * count:
+        raise RuntimeError("Array storage must be large enough to contain 2*count elements")
+    from warp.context import runtime
-# AdT^{-T}*s
-def transform_wrench(t_ab, f_b):
-    return np.dot(spatial_adjoint_dual(t_ab), f_b)
+    if keys.device.is_cpu:
+        if keys.dtype == wp.int32 and values.dtype == wp.int32:
+            runtime.core.radix_sort_pairs_int_host(keys.ptr, values.ptr, count)
+        else:
+            raise RuntimeError("Unsupported data type")
+    elif keys.device.is_cuda:
+        if keys.dtype == wp.int32 and values.dtype == wp.int32:
+            runtime.core.radix_sort_pairs_int_device(keys.ptr, values.ptr, count)
+        else:
+            raise RuntimeError("Unsupported data type")
-# transform spatial inertia (6x6) in b frame to a frame
-def transform_spatial_inertia(t_ab, I_b):
-    t_ba = transform_inverse(t_ab)
+def runlength_encode(values, run_values, run_lengths, run_count=None, value_count=None):
+    if run_values.device != values.device or run_lengths.device != values.device:
+        raise RuntimeError("Array storage devices do not match")
-    # todo: write specialized method
-    I_a = np.dot(np.dot(spatial_adjoint(t_ba).T, I_b), spatial_adjoint(t_ba))
-    return I_a
+    if value_count is None:
+        value_count = values.size
+    if run_values.size < value_count or run_lengths.size < value_count:
+        raise RuntimeError("Output array storage sizes must be at least equal to value_count")
-def translate_twist(p_ab, s_b):
-    w = s_b[0:3]
-    v = np.cross(p_ab, s_b[0:3]) + s_b[3:6]
+    if values.dtype != run_values.dtype:
+        raise RuntimeError("values and run_values data types do not match")
-    return np.array((*w, *v))
+    if run_lengths.dtype != wp.int32:
+        raise RuntimeError("run_lengths array must be of type int32")
+    # User can provide a device output array for storing the number of runs
+    # For convenience, if no such array is provided, number of runs is returned on host
+    if run_count is None:
+        if value_count == 0:
+            return 0
+        run_count = wp.empty(shape=(1,), dtype=int, device=values.device)
+        host_return = True
+    else:
+        if run_count.device != values.device:
+            raise RuntimeError("run_count storage device does not match other arrays")
+        if run_count.dtype != wp.int32:
+            raise RuntimeError("run_count array must be of type int32")
+        if value_count == 0:
+            run_count.zero_()
+            return 0
+        host_return = False
-def translate_wrench(p_ab, s_b):
-    w = s_b[0:3] + np.cross(p_ab, s_b[3:6])
-    v = s_b[3:6]
+    from warp.context import runtime
-    return np.array((*w, *v))
+    if values.device.is_cpu:
+        if values.dtype == wp.int32:
+            runtime.core.runlength_encode_int_host(
+                values.ptr, run_values.ptr, run_lengths.ptr, run_count.ptr, value_count
+            )
+        else:
+            raise RuntimeError("Unsupported data type")
+    elif values.device.is_cuda:
+        if values.dtype == wp.int32:
+            runtime.core.runlength_encode_int_device(
+                values.ptr, run_values.ptr, run_lengths.ptr, run_count.ptr, value_count
+            )
+        else:
+            raise RuntimeError("Unsupported data type")
+    if host_return:
+        return int(run_count.numpy()[0])
-# def spatial_vector(v=(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)):
-#     return np.array(v)
+def array_sum(values, out=None, value_count=None, axis=None):
+    if value_count is None:
+        if axis is None:
+            value_count = values.size
+        else:
+            value_count = values.shape[axis]
-# ad_V pg. 289 L&P, pg. 25 Featherstone
-def spatial_cross(a, b):
-    w = np.cross(a[0:3], b[0:3])
-    v = np.cross(a[3:6], b[0:3]) + np.cross(a[0:3], b[3:6])
+    if axis is None:
+        output_shape = (1,)
+    else:
-    return np.array((*w, *v))
+        def output_dim(ax, dim):
+            return 1 if ax == axis else dim
+        output_shape = tuple(output_dim(ax, dim) for ax, dim in enumerate(values.shape))
-# ad_V^T pg. 290 L&P,  pg. 25 Featurestone, note this does not includes the sign flip in the definition
-def spatial_cross_dual(a, b):
-    w = np.cross(a[0:3], b[0:3]) + np.cross(a[3:6], b[3:6])
-    v = np.cross(a[0:3], b[3:6])
+    type_length = wp.types.type_length(values.dtype)
+    scalar_type = wp.types.type_scalar_type(values.dtype)
-    return np.array((*w, *v))
+    # User can provide a device output array for storing the number of runs
+    # For convenience, if no such array is provided, number of runs is returned on host
+    if out is None:
+        host_return = True
+        out = wp.empty(shape=output_shape, dtype=values.dtype, device=values.device)
+    else:
+        host_return = False
+        if out.device != values.device:
+            raise RuntimeError("out storage device should match values array")
+        if out.dtype != values.dtype:
+            raise RuntimeError(f"out array should have type {values.dtype.__name__}")
+        if out.shape != output_shape:
+            raise RuntimeError(f"out array should have shape {output_shape}")
+    if value_count == 0:
+        out.zero_()
+        if axis is None and host_return:
+            return out.numpy()[0]
+        return out
+    from warp.context import runtime
-def spatial_dot(a, b):
-    return np.dot(a, b)
+    if values.device.is_cpu:
+        if scalar_type == wp.float32:
+            native_func = runtime.core.array_sum_float_host
+        elif scalar_type == wp.float64:
+            native_func = runtime.core.array_sum_double_host
+        else:
+            raise RuntimeError("Unsupported data type")
+    elif values.device.is_cuda:
+        if scalar_type == wp.float32:
+            native_func = runtime.core.array_sum_float_device
+        elif scalar_type == wp.float64:
+            native_func = runtime.core.array_sum_double_device
+        else:
+            raise RuntimeError("Unsupported data type")
+    if axis is None:
+        stride = wp.types.type_size_in_bytes(values.dtype)
+        native_func(values.ptr, out.ptr, value_count, stride, type_length)
-def spatial_outer(a, b):
-    return np.outer(a, b)
+        if host_return:
+            return out.numpy()[0]
+    else:
+        stride = values.strides[axis]
+        for idx in np.ndindex(output_shape):
+            out_offset = sum(i * s for i, s in zip(idx, out.strides))
+            val_offset = sum(i * s for i, s in zip(idx, values.strides))
+            native_func(
+                values.ptr + val_offset,
+                out.ptr + out_offset,
+                value_count,
+                stride,
+                type_length,
+            )
-# def spatial_matrix():
-#     return np.zeros((6, 6))
+        if host_return:
+            return out
-def spatial_matrix_from_inertia(I, m):
-    G = spatial_matrix()
+def array_inner(a, b, out=None, count=None, axis=None):
+    """Compute the inner product of ``a`` and ``b`` using the native ``array_inner_*`` routines.
+
+    Only ``float32`` and ``float64`` scalar types are dispatched.
+
+    Args:
+        a: First input array.
+        b: Second input array; must match ``a`` in ``size``, ``device`` and ``dtype``.
+        out: Optional array receiving the result. It must live on the same device
+            as ``a``, use the scalar type of ``a.dtype`` as its dtype and have the
+            expected output shape. Allocated with ``wp.empty`` when omitted.
+        count: Element count passed to the native routine. Defaults to ``a.size``
+            when ``axis`` is None, otherwise to ``a.shape[axis]``.
+        axis: Axis to reduce along. With None the output shape is ``(1,)``;
+            otherwise it is the shape of ``a`` with dimension ``axis`` set to 1.
+
+    Returns:
+        When ``out`` is omitted: ``out.numpy()[0]`` (``0.0`` if ``count`` is 0) for
+        ``axis=None``, or the newly allocated array when ``axis`` is given.
+        When ``out`` is supplied nothing is returned explicitly, except on the
+        ``count == 0`` path, which returns the zeroed ``out``.
+
+    Raises:
+        RuntimeError: if ``a`` and ``b`` differ in size, device or dtype, if
+            ``out`` has the wrong device, dtype or shape, or if the scalar type
+            is neither ``float32`` nor ``float64``.
+    """
+    if a.size != b.size:
+        raise RuntimeError("Array storage sizes do not match")
-    G[0:3, 0:3] = I
-    G[3, 3] = m
-    G[4, 4] = m
-    G[5, 5] = m
+    if a.device != b.device:
+        raise RuntimeError("Array storage devices do not match")
-    return G
+    if a.dtype != b.dtype:
+        raise RuntimeError("Array data types do not match")
+    if count is None:
+        if axis is None:
+            count = a.size
+        else:
+            count = a.shape[axis]
-# solves x = I^(-1)b
-def spatial_solve(I, b):
-    return np.dot(np.linalg.inv(I), b)
+    if axis is None:
+        output_shape = (1,)
+    else:
+        # the reduced axis collapses to length 1, all other dimensions are kept
+        def output_dim(ax, dim):
+            return 1 if ax == axis else dim
-# helper to retrive body angular velocity from a twist v_s in se(3)
-def get_body_angular_velocity(v_s):
-    return v_s[0:3]
+        output_shape = tuple(output_dim(ax, dim) for ax, dim in enumerate(a.shape))
+    type_length = wp.types.type_length(a.dtype)
+    scalar_type = wp.types.type_scalar_type(a.dtype)
-# helper to compute velocity of a point p on a body given it's spatial twist v_s
-def get_body_linear_velocity(v_s, p):
-    dpdt = v_s[3:6] + np.cross(v_s[0:3], p)
-    return dpdt
+    # Caller may supply an output array on the same device to receive the result
+    # If none is supplied, one is allocated here and the result is handed back (host_return)
+    if out is None:
+        host_return = True
+        out = wp.empty(shape=output_shape, dtype=scalar_type, device=a.device)
+    else:
+        host_return = False
+        if out.device != a.device:
+            raise RuntimeError("out storage device should match values array")
+        if out.dtype != scalar_type:
+            raise RuntimeError(f"out array should have type {scalar_type.__name__}")
+        if out.shape != output_shape:
+            raise RuntimeError(f"out array should have shape {output_shape}")
+    # nothing to accumulate: skip the native call and report zero
+    if count == 0:
+        if axis is None and host_return:
+            return 0.0
+        out.zero_()
+        return out
+    from warp.context import runtime
-# helper to build a body twist given the angular and linear velocity of
-# the center of mass specified in the world frame, returns the body
-# twist with respect to the origin (v_s)
-def get_body_twist(w_m, v_m, p_m):
-    lin = v_m + np.cross(p_m, w_m)
-    return (*w_m, *lin)
+    # pick the native entry point matching the device kind and scalar type
+    if a.device.is_cpu:
+        if scalar_type == wp.float32:
+            native_func = runtime.core.array_inner_float_host
+        elif scalar_type == wp.float64:
+            native_func = runtime.core.array_inner_double_host
+        else:
+            raise RuntimeError("Unsupported data type")
+    elif a.device.is_cuda:
+        if scalar_type == wp.float32:
+            native_func = runtime.core.array_inner_float_device
+        elif scalar_type == wp.float64:
+            native_func = runtime.core.array_inner_double_device
+        else:
+            raise RuntimeError("Unsupported data type")
+    if axis is None:
+        # whole-array case: a single native call, stepping by the element size in bytes
+        stride_a = wp.types.type_size_in_bytes(a.dtype)
+        stride_b = wp.types.type_size_in_bytes(b.dtype)
+        native_func(a.ptr, b.ptr, out.ptr, count, stride_a, stride_b, type_length)
-def array_scan(in_array, out_array, inclusive=True):
+        if host_return:
+            return out.numpy()[0]
+    else:
+        stride_a = a.strides[axis]
+        stride_b = b.strides[axis]
+        # one native call per output element; idx is 0 along `axis` because the
+        # output dimension is 1 there, so each offset addresses the start of a slice
+        for idx in np.ndindex(output_shape):
+            out_offset = sum(i * s for i, s in zip(idx, out.strides))
+            a_offset = sum(i * s for i, s in zip(idx, a.strides))
+            b_offset = sum(i * s for i, s in zip(idx, b.strides))
+            native_func(
+                a.ptr + a_offset,
+                b.ptr + b_offset,
+                out.ptr + out_offset,
+                count,
+                stride_a,
+                stride_b,
+                type_length,
+            )
+        if host_return:
+            return out
+@wp.kernel
+def _array_cast_kernel(
+    dest: Any,
+    src: Any,
+):
+    # each thread converts the src element at its thread index to dest's dtype
+    tid = wp.tid()
+    dest[tid] = dest.dtype(src[tid])
+def array_cast(in_array, out_array, count=None):
+    """Copy ``in_array`` into ``out_array``, converting the element type if needed.
+
+    When the two arrays differ in number of dimensions or in the ``_shape_`` of
+    their dtypes, both are flattened and re-wrapped as non-owning 1D arrays of
+    their scalar types over the same memory, so the cast happens per scalar.
+
+    Args:
+        in_array: Source array.
+        out_array: Destination array; must be on the same device as ``in_array``.
+        count: Number of elements to process. Defaults to ``in_array.size``.
+            In the flattened case a given ``count`` is multiplied by the scalar
+            length of the source dtype.
+
+    Raises:
+        RuntimeError: if the arrays are on different devices, or if ``count`` is
+            smaller than ``in_array.size`` for an array with more than one dimension.
+    """
     if in_array.device != out_array.device:
         raise RuntimeError("Array storage devices do not match")
-    if in_array.size != out_array.size:
-        raise RuntimeError("Array storage sizes do not match")
+    in_array_data_shape = getattr(in_array.dtype, "_shape_", ())
+    out_array_data_shape = getattr(out_array.dtype, "_shape_", ())
+    if in_array.ndim != out_array.ndim or in_array_data_shape != out_array_data_shape:
+        # Number of dimensions or data type shape do not match.
+        # Flatten arrays and do cast at the scalar level
+        in_array = in_array.flatten()
+        out_array = out_array.flatten()
+        # use the same `wp` alias as the rest of this function for the type helpers
+        in_array_data_length = wp.types.type_length(in_array.dtype)
+        out_array_data_length = wp.types.type_length(out_array.dtype)
+        in_array_scalar_type = wp.types.type_scalar_type(in_array.dtype)
+        out_array_scalar_type = wp.types.type_scalar_type(out_array.dtype)
+        # non-owning scalar view over the source memory (owner=False, same ptr)
+        in_array = wp.array(
+            data=None,
+            ptr=in_array.ptr,
+            capacity=in_array.capacity,
+            owner=False,
+            device=in_array.device,
+            dtype=in_array_scalar_type,
+            shape=in_array.shape[0] * in_array_data_length,
+        )
-    if in_array.dtype != out_array.dtype:
-        raise RuntimeError("Array data types do not match")
+        # non-owning scalar view over the destination memory
+        out_array = wp.array(
+            data=None,
+            ptr=out_array.ptr,
+            capacity=out_array.capacity,
+            owner=False,
+            device=out_array.device,
+            dtype=out_array_scalar_type,
+            shape=out_array.shape[0] * out_array_data_length,
+        )
-    from warp.context import runtime
+        # the caller's count is in source elements; convert it to scalars
+        if count is not None:
+            count *= in_array_data_length
-    if in_array.device == "cpu":
-        if in_array.dtype == wp.int32:
-            runtime.core.array_scan_int_host(in_array.ptr, out_array.ptr, in_array.size, inclusive)
-        elif in_array.dtype == wp.float32:
-            runtime.core.array_scan_float_host(in_array.ptr, out_array.ptr, in_array.size, inclusive)
-        else:
-            raise RuntimeError("Unsupported data type")
-    elif in_array.device == "cuda":
-        if in_array.dtype == wp.int32:
-            runtime.core.array_scan_int_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
-        elif in_array.dtype == wp.float32:
-            runtime.core.array_scan_float_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
-        else:
-            raise RuntimeError("Unsupported data type")
+    if count is None:
+        count = in_array.size
+    # launch dimension: `count` threads for 1D input, the full shape otherwise
+    if in_array.ndim == 1:
+        dim = count
+    elif count < in_array.size:
+        raise RuntimeError("Partial cast is not supported for arrays with more than one dimension")
+    else:
+        dim = in_array.shape
+    if in_array.dtype == out_array.dtype:
+        # Same data type, can simply copy
+        wp.copy(dest=out_array, src=in_array, count=count)
+    else:
+        wp.launch(kernel=_array_cast_kernel, dim=dim, inputs=[out_array, in_array], device=out_array.device)
 # code snippet for invoking cProfile
@@ -411,6 +418,25 @@ def array_scan(in_array, out_array, inclusive=True):
 # exit(0)
+# helper kernels for initializing NVDB volumes from a dense array
+@wp.kernel
+def copy_dense_volume_to_nano_vdb_v(volume: wp.uint64, values: wp.array(dtype=wp.vec3, ndim=3)):
+    # store the vec3 at this thread's 3D index into the volume at the same coordinates
+    x, y, z = wp.tid()
+    wp.volume_store_v(volume, x, y, z, values[x, y, z])
+@wp.kernel
+def copy_dense_volume_to_nano_vdb_f(volume: wp.uint64, values: wp.array(dtype=wp.float32, ndim=3)):
+    # store the float32 at this thread's 3D index into the volume at the same coordinates
+    x, y, z = wp.tid()
+    wp.volume_store_f(volume, x, y, z, values[x, y, z])
+@wp.kernel
+def copy_dense_volume_to_nano_vdb_i(volume: wp.uint64, values: wp.array(dtype=wp.int32, ndim=3)):
+    # store the int32 at this thread's 3D index into the volume at the same coordinates
+    x, y, z = wp.tid()
+    wp.volume_store_i(volume, x, y, z, values[x, y, z])
 # represent an edge between v0, v1 with connected faces f0, f1, and opposite vertex o0, and o1
 # winding is such that first tri can be reconstructed as {v0, v1, o0}, and second tri as { v1, v0, o1 }
 class MeshEdge:
@@ -454,11 +480,8 @@ class MeshAdjacency:
         self.edges[key] = edge
-    def opposite_vertex(self, edge):
-        pass
-def mem_report():
+def mem_report(): #pragma: no cover
     def _mem_report(tensors, mem_type):
         """Print the selected tensors of type
         There are two major storage types in our major concern:
@@ -494,6 +517,7 @@ def mem_report():
         print("Type: %s Total Tensors: %d \tUsed Memory Space: %.2f MBytes" % (mem_type, total_numel, total_mem))
     import gc
     import torch
     gc.collect()
@@ -509,35 +533,6 @@ def mem_report():
     print("=" * LEN)
-def lame_parameters(E, nu):
-    l = (E * nu) / ((1.0 + nu) * (1.0 - 2.0 * nu))
-    mu = E / (2.0 * (1.0 + nu))
-    return (l, mu)
-# **Deprecated: use ScopedDevice instead
-# ensures that correct CUDA is set for the guards lifetime
-# restores the previous CUDA context on exit
-class ScopedCudaGuard:
-    def __init__(self):
-        import warnings
-        warnings.warn("ScopedCudaGuard is deprecated, use ScopedDevice instead")
-        if wp.context.runtime.cuda_devices:
-            self.device = wp.context.runtime.initial_cuda_device
-        else:
-            self.device = None
-    def __enter__(self):
-        if self.device is not None:
-            self.device.context_guard.__enter__()
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.device is not None:
-            self.device.context_guard.__exit__(exc_type, exc_value, traceback)
 class ScopedDevice:
     def __init__(self, device):
@@ -642,7 +637,8 @@ class ScopedTimer:
                 return
             self.start = timeit.default_timer()
-            ScopedTimer.indent += 1
+            if self.print:
+                ScopedTimer.indent += 1
             if self.detailed:
                 self.cp = cProfile.Profile()
@@ -679,3 +675,17 @@ class ScopedTimer:
                 print("{}{} took {:.2f} ms".format(indent, self.name, self.elapsed))
             ScopedTimer.indent -= 1
+# helper kernels for adj_matmul
+@wp.kernel
+def add_kernel_2d(x: wp.array2d(dtype=Any), acc: wp.array2d(dtype=Any), beta: Any):
+    # per-thread in-place update of one element: x <- x + beta * acc
+    row, col = wp.tid()
+    x[row, col] = x[row, col] + beta * acc[row, col]
+@wp.kernel
+def add_kernel_3d(x: wp.array3d(dtype=Any), acc: wp.array3d(dtype=Any), beta: Any):
+    # per-thread in-place update of one element: x <- x + beta * acc
+    b, row, col = wp.tid()
+    x[b, row, col] = x[b, row, col] + beta * acc[b, row, col]