warp-lang 1.5.0-py3-none-manylinux2014_aarch64.whl → 1.6.0-py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (132)
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1124 -497
  8. warp/codegen.py +261 -136
  9. warp/config.py +1 -1
  10. warp/context.py +357 -119
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth.py +3 -1
  27. warp/examples/sim/example_cloth_self_contact.py +260 -0
  28. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  29. warp/examples/sim/example_jacobian_ik.py +0 -2
  30. warp/examples/sim/example_quadruped.py +5 -2
  31. warp/examples/tile/example_tile_cholesky.py +79 -0
  32. warp/examples/tile/example_tile_convolution.py +2 -2
  33. warp/examples/tile/example_tile_fft.py +2 -2
  34. warp/examples/tile/example_tile_filtering.py +3 -3
  35. warp/examples/tile/example_tile_matmul.py +4 -4
  36. warp/examples/tile/example_tile_mlp.py +12 -12
  37. warp/examples/tile/example_tile_nbody.py +180 -0
  38. warp/examples/tile/example_tile_walker.py +319 -0
  39. warp/fem/geometry/geometry.py +0 -2
  40. warp/math.py +147 -0
  41. warp/native/array.h +12 -0
  42. warp/native/builtin.h +0 -1
  43. warp/native/bvh.cpp +149 -70
  44. warp/native/bvh.cu +287 -68
  45. warp/native/bvh.h +195 -85
  46. warp/native/clang/clang.cpp +5 -1
  47. warp/native/coloring.cpp +5 -1
  48. warp/native/cuda_util.cpp +91 -53
  49. warp/native/cuda_util.h +5 -0
  50. warp/native/exports.h +40 -40
  51. warp/native/intersect.h +17 -0
  52. warp/native/mat.h +41 -0
  53. warp/native/mathdx.cpp +19 -0
  54. warp/native/mesh.cpp +25 -8
  55. warp/native/mesh.cu +153 -101
  56. warp/native/mesh.h +482 -403
  57. warp/native/quat.h +40 -0
  58. warp/native/solid_angle.h +7 -0
  59. warp/native/sort.cpp +85 -0
  60. warp/native/sort.cu +34 -0
  61. warp/native/sort.h +3 -1
  62. warp/native/spatial.h +11 -0
  63. warp/native/tile.h +1187 -669
  64. warp/native/tile_reduce.h +8 -6
  65. warp/native/vec.h +41 -0
  66. warp/native/warp.cpp +8 -1
  67. warp/native/warp.cu +263 -40
  68. warp/native/warp.h +19 -5
  69. warp/optim/linear.py +22 -4
  70. warp/render/render_opengl.py +130 -64
  71. warp/sim/__init__.py +6 -1
  72. warp/sim/collide.py +270 -26
  73. warp/sim/import_urdf.py +8 -8
  74. warp/sim/integrator_euler.py +25 -7
  75. warp/sim/integrator_featherstone.py +154 -35
  76. warp/sim/integrator_vbd.py +842 -40
  77. warp/sim/model.py +134 -72
  78. warp/sparse.py +1 -1
  79. warp/stubs.py +265 -132
  80. warp/tape.py +28 -30
  81. warp/tests/aux_test_module_unload.py +15 -0
  82. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  83. warp/tests/test_array.py +74 -0
  84. warp/tests/test_assert.py +242 -0
  85. warp/tests/test_codegen.py +14 -61
  86. warp/tests/test_collision.py +2 -2
  87. warp/tests/test_coloring.py +12 -2
  88. warp/tests/test_examples.py +12 -1
  89. warp/tests/test_func.py +21 -4
  90. warp/tests/test_grad_debug.py +87 -2
  91. warp/tests/test_hash_grid.py +1 -1
  92. warp/tests/test_ipc.py +116 -0
  93. warp/tests/test_lerp.py +13 -87
  94. warp/tests/test_mat.py +138 -167
  95. warp/tests/test_math.py +47 -1
  96. warp/tests/test_matmul.py +17 -16
  97. warp/tests/test_matmul_lite.py +10 -15
  98. warp/tests/test_mesh.py +84 -60
  99. warp/tests/test_mesh_query_aabb.py +165 -0
  100. warp/tests/test_mesh_query_point.py +328 -286
  101. warp/tests/test_mesh_query_ray.py +134 -121
  102. warp/tests/test_mlp.py +2 -2
  103. warp/tests/test_operators.py +43 -0
  104. warp/tests/test_overwrite.py +47 -2
  105. warp/tests/test_quat.py +77 -0
  106. warp/tests/test_reload.py +29 -0
  107. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  108. warp/tests/test_smoothstep.py +17 -83
  109. warp/tests/test_static.py +19 -3
  110. warp/tests/test_tape.py +25 -0
  111. warp/tests/test_tile.py +178 -191
  112. warp/tests/test_tile_load.py +356 -0
  113. warp/tests/test_tile_mathdx.py +61 -8
  114. warp/tests/test_tile_mlp.py +17 -17
  115. warp/tests/test_tile_reduce.py +24 -18
  116. warp/tests/test_tile_shared_memory.py +66 -17
  117. warp/tests/test_tile_view.py +165 -0
  118. warp/tests/test_torch.py +35 -0
  119. warp/tests/test_utils.py +36 -24
  120. warp/tests/test_vec.py +110 -0
  121. warp/tests/unittest_suites.py +29 -4
  122. warp/tests/unittest_utils.py +30 -13
  123. warp/thirdparty/unittest_parallel.py +2 -2
  124. warp/types.py +411 -101
  125. warp/utils.py +10 -7
  126. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/METADATA +92 -69
  127. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/RECORD +130 -119
  128. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  129. warp/examples/benchmarks/benchmark_tile.py +0 -179
  130. warp/native/tile_gemm.h +0 -341
  131. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  132. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/tests/test_mat.py CHANGED
@@ -6,20 +6,14 @@
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 import unittest
+from typing import Any
 
 import numpy as np
 
 import warp as wp
 from warp.tests.unittest_utils import *
 
-np_signed_int_types = [
-    np.int8,
-    np.int16,
-    np.int32,
-    np.int64,
-    np.byte,
-]
-
+np_signed_int_types = [np.int8, np.int16, np.int32, np.int64, np.byte]
 np_float_types = [np.float16, np.float32, np.float64]
 
 
@@ -42,11 +36,7 @@ def getkernel(func, suffix=""):
 
 
 def get_select_kernel(dtype):
-    def output_select_kernel_fn(
-        input: wp.array(dtype=dtype),
-        index: int,
-        out: wp.array(dtype=dtype),
-    ):
+    def output_select_kernel_fn(input: wp.array(dtype=dtype), index: int, out: wp.array(dtype=dtype)):
        out[0] = input[index]
 
     return getkernel(output_select_kernel_fn, suffix=dtype.__name__)
@@ -61,33 +51,19 @@ def test_anon_constructor_error_shape_arg_missing(test, device):
         RuntimeError,
         r"the `shape` argument must be specified when initializing a matrix by value$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_anon_constructor_error_shape_mismatch(test, device):
     @wp.kernel
     def kernel():
-        wp.matrix(
-            wp.matrix(shape=(1, 2), dtype=float),
-            shape=(3, 4),
-            dtype=float,
-        )
+        wp.matrix(wp.matrix(shape=(1, 2), dtype=float), shape=(3, 4), dtype=float)
 
     with test.assertRaisesRegex(
         RuntimeError,
         r"incompatible matrix of shape \(3, 4\) given when copy constructing a matrix of shape \(1, 2\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_anon_constructor_error_type_mismatch(test, device):
@@ -99,12 +75,7 @@ def test_anon_constructor_error_type_mismatch(test, device):
         RuntimeError,
         r"the value used to fill this matrix is expected to be of the type `float16`$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_anon_constructor_error_invalid_arg_count(test, device):
@@ -116,12 +87,7 @@ def test_anon_constructor_error_invalid_arg_count(test, device):
         RuntimeError,
         r"incompatible number of values given \(3\) when constructing a matrix of shape \(2, 2\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_anon_xform_constructor_error_type_mismatch(test, device):
@@ -150,12 +116,7 @@ def test_tpl_constructor_error_incompatible_sizes(test, device):
         RuntimeError,
         r"incompatible matrix of shape \(3, 3\) given when copy constructing a matrix of shape \(2, 2\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_tpl_constructor_error_invalid_vector_count(test, device):
@@ -167,12 +128,7 @@ def test_tpl_constructor_error_invalid_vector_count(test, device):
         RuntimeError,
         r"incompatible number of column vectors given \(2\) when constructing a matrix of shape \(3, 3\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_tpl_constructor_error_invalid_vector_shape(test, device):
@@ -184,12 +140,7 @@ def test_tpl_constructor_error_invalid_vector_shape(test, device):
         RuntimeError,
         r"incompatible column vector lengths given when constructing a matrix of shape \(2, 2\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_tpl_constructor_error_invalid_arg_count(test, device):
@@ -201,12 +152,7 @@ def test_tpl_constructor_error_invalid_arg_count(test, device):
         RuntimeError,
         r"incompatible number of values given \(3\) when constructing a matrix of shape \(2, 2\)$",
     ):
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[], device=device)
 
 
 def test_py_arithmetic_ops(test, device, dtype):
@@ -541,16 +487,7 @@ def test_subtraction(test, device, dtype, register_kernels=False):
                     wp.launch(
                         kernel,
                         dim=1,
-                        inputs=[
-                            s2,
-                            s3,
-                            s4,
-                            s5,
-                            v2,
-                            v3,
-                            v4,
-                            v5,
-                        ],
+                        inputs=[s2, s3, s4, s5, v2, v3, v4, v5],
                         outputs=[outcomponents],
                         device=device,
                     )
@@ -558,11 +495,11 @@ def test_subtraction(test, device, dtype, register_kernels=False):
                         output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device
                     )
                 tape.backward(loss=out)
-                expectedresult = np.zeros((dim, dim), dtype=dtype)
-                expectedresult[i, j] = 2
-                assert_np_equal(tape.gradients[in2].numpy()[0], expectedresult, tol=10 * tol)
-                expectedresult[i, j] = -2
-                assert_np_equal(tape.gradients[in1].numpy()[0], expectedresult, tol=10 * tol)
+                expected_result = np.zeros((dim, dim), dtype=dtype)
+                expected_result[i, j] = 2
+                assert_np_equal(tape.gradients[in2].numpy()[0], expected_result, tol=10 * tol)
+                expected_result[i, j] = -2
+                assert_np_equal(tape.gradients[in1].numpy()[0], expected_result, tol=10 * tol)
                 tape.zero()
 
                 idx = idx + 1
@@ -608,21 +545,7 @@ def test_determinant(test, device, dtype, register_kernels=False):
 
     tape = wp.Tape()
     with tape:
-        wp.launch(
-            kernel,
-            dim=1,
-            inputs=[
-                v2,
-                v3,
-                v4,
-            ],
-            outputs=[
-                det2,
-                det3,
-                det4,
-            ],
-            device=device,
-        )
+        wp.launch(kernel, dim=1, inputs=[v2, v3, v4], outputs=[det2, det3, det4], device=device)
 
     if dtype in np_float_types:
         assert_np_equal(det2.numpy()[0], 2 * np.linalg.det(v2.numpy()[0].astype(np.float64)), tol=100 * tol)
@@ -658,16 +581,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device),
-                        v3,
-                        v4,
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device), v3, v4],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dplus = det2.numpy()[0]
@@ -675,16 +590,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device),
-                        v3,
-                        v4,
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[wp.array(v2test, dtype=v2.dtype, requires_grad=True, device=device), v3, v4],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dminus = det2.numpy()[0]
@@ -697,16 +604,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        v2,
-                        wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device),
-                        v4,
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[v2, wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device), v4],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dplus = det3.numpy()[0]
@@ -714,16 +613,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        v2,
-                        wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device),
-                        v4,
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[v2, wp.array(v3test, dtype=v3.dtype, requires_grad=True, device=device), v4],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dminus = det3.numpy()[0]
@@ -736,16 +627,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        v2,
-                        v3,
-                        wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device),
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[v2, v3, wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device)],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dplus = det4.numpy()[0]
@@ -753,16 +636,8 @@ def test_determinant(test, device, dtype, register_kernels=False):
                 wp.launch(
                     kernel,
                     dim=1,
-                    inputs=[
-                        v2,
-                        v3,
-                        wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device),
-                    ],
-                    outputs=[
-                        det2,
-                        det3,
-                        det4,
-                    ],
+                    inputs=[v2, v3, wp.array(v4test, dtype=v4.dtype, requires_grad=True, device=device)],
+                    outputs=[det2, det3, det4],
                     device=device,
                 )
                 dminus = det4.numpy()[0]
@@ -1722,8 +1597,9 @@ def test_matrix_mutation(expected: wp.types.matrix(shape=(10, 3), dtype=float)):
     wp.expect_eq(m, expected)
 
 
-CONSTANT_SHAPE_ROWS = wp.constant(10)
-CONSTANT_SHAPE_COLS = wp.constant(10)
+# NOTE: Compile time is highly sensitive to shape so we use small values now
+CONSTANT_SHAPE_ROWS = wp.constant(2)
+CONSTANT_SHAPE_COLS = wp.constant(2)
 
 
 # tests that we can use global constants in shape keyword argument
@@ -1737,6 +1613,106 @@ def test_constructors_constant_shape():
             m[i, j] = float(i * j)
 
 
+Mat23 = wp.mat((2, 3), dtype=wp.float16)
+
+
+@wp.kernel
+def matrix_len_kernel(
+    m1: wp.mat22, m2: wp.mat((3, 3), float), m3: wp.mat((Any, Any), float), m4: Mat23, out: wp.array(dtype=int)
+):
+    length = wp.static(len(m1))
+    wp.expect_eq(len(m1), 2)
+    out[0] = len(m1)
+
+    length = len(m2)
+    wp.expect_eq(wp.static(len(m2)), 3)
+    out[1] = len(m2)
+
+    length = len(m3)
+    wp.expect_eq(len(m3), 4)
+    out[2] = wp.static(len(m3))
+
+    length = wp.static(len(m4))
+    wp.expect_eq(wp.static(len(m4)), 2)
+    out[3] = wp.static(len(m4))
+
+    foo = wp.mat22()
+    length = len(foo)
+    wp.expect_eq(len(foo), 2)
+    out[4] = len(foo)
+
+
+def test_matrix_len(test, device):
+    m1 = wp.mat22()
+    m2 = wp.mat33()
+    m3 = wp.mat44()
+    m4 = Mat23()
+    out = wp.empty(5, dtype=int, device=device)
+    wp.launch(matrix_len_kernel, dim=(1,), inputs=(m1, m2, m3, m4), outputs=(out,), device=device)
+
+    test.assertEqual(out.numpy()[0], 2)
+    test.assertEqual(out.numpy()[1], 3)
+    test.assertEqual(out.numpy()[2], 4)
+    test.assertEqual(out.numpy()[3], 2)
+    test.assertEqual(out.numpy()[4], 2)
+
+    test.assertEqual(len(m1), 2)
+    test.assertEqual(len(m2), 3)
+    test.assertEqual(len(m3), 4)
+    test.assertEqual(len(m4), 2)
+
+
+@wp.kernel
+def matrix_augassign_kernel(
+    a: wp.array(dtype=wp.mat22), b: wp.array(dtype=wp.mat22), c: wp.array(dtype=wp.mat22), d: wp.array(dtype=wp.mat22)
+):
+    i = wp.tid()
+
+    m1 = wp.mat22()
+    m2 = b[i]
+
+    m1[0, 0] += m2[0, 0]
+    m1[0, 1] += m2[0, 1]
+    m1[1, 0] += m2[1, 0]
+    m1[1, 1] += m2[1, 1]
+
+    a[i] = m1
+
+    m3 = wp.mat22()
+    m4 = d[i]
+
+    m3[0, 0] -= m4[0, 0]
+    m3[0, 1] -= m4[0, 1]
+    m3[1, 0] -= m4[1, 0]
+    m3[1, 1] -= m4[1, 1]
+
+    c[i] = m3
+
+
+def test_matrix_augassign(test, device):
+    N = 3
+
+    a = wp.zeros(N, dtype=wp.mat22, requires_grad=True)
+    b = wp.ones(N, dtype=wp.mat22, requires_grad=True)
+
+    c = wp.zeros(N, dtype=wp.mat22, requires_grad=True)
+    d = wp.ones(N, dtype=wp.mat22, requires_grad=True)
+
+    tape = wp.Tape()
+    with tape:
+        wp.launch(matrix_augassign_kernel, N, inputs=[a, b, c, d])
+
+    tape.backward(grads={a: wp.ones_like(a), c: wp.ones_like(c)})
+
+    assert_np_equal(a.numpy(), wp.ones_like(a).numpy())
+    assert_np_equal(a.grad.numpy(), wp.ones_like(a).numpy())
+    assert_np_equal(b.grad.numpy(), wp.ones_like(a).numpy())
+
+    assert_np_equal(c.numpy(), -wp.ones_like(c).numpy())
+    assert_np_equal(c.grad.numpy(), wp.ones_like(c).numpy())
+    assert_np_equal(d.grad.numpy(), -wp.ones_like(d).numpy())
+
+
 devices = get_test_devices()
 
 
@@ -1797,16 +1773,10 @@ add_function_test(
     devices=devices,
 )
 add_function_test(
-    TestMat,
-    "test_anon_constructor_error_shape_mismatch",
-    test_anon_constructor_error_shape_mismatch,
-    devices=devices,
+    TestMat, "test_anon_constructor_error_shape_mismatch", test_anon_constructor_error_shape_mismatch, devices=devices
 )
 add_function_test(
-    TestMat,
-    "test_anon_constructor_error_type_mismatch",
-    test_anon_constructor_error_type_mismatch,
-    devices=devices,
+    TestMat, "test_anon_constructor_error_type_mismatch", test_anon_constructor_error_type_mismatch, devices=devices
 )
 add_function_test(
     TestMat,
@@ -1875,7 +1845,8 @@ for dtype in np_float_types:
         devices=devices,
         dtype=dtype,
     )
-
+    add_function_test(TestMat, "test_matrix_len", test_matrix_len, devices=devices)
+    add_function_test(TestMat, "test_matrix_augassign", test_matrix_augassign, devices=devices)
 
 if __name__ == "__main__":
     wp.clear_kernel_cache()
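
Note: beyond reflowing calls onto single lines, the test_mat.py changes above exercise two behaviors new in 1.6.0: len() on matrix types (also usable inside kernels, where wp.static() folds it to a compile-time constant) and differentiable augmented assignment on matrix components. A minimal sketch of the same features from user code, assuming only what the tests above demonstrate; the kernel and variable names here are illustrative, not part of the package:

    import warp as wp

    @wp.kernel
    def bump_diagonal(mats: wp.array(dtype=wp.mat33), out: wp.array(dtype=wp.mat33)):
        tid = wp.tid()
        m = mats[tid]
        # len() of a matrix is its row count; wp.static() evaluates it during code generation
        for i in range(wp.static(len(m))):
            m[i, i] += 1.0  # component-wise augmented assignment, tracked by the tape in 1.6.0
        out[tid] = m

    mats = wp.ones(4, dtype=wp.mat33, requires_grad=True)
    out = wp.zeros_like(mats)

    tape = wp.Tape()
    with tape:
        wp.launch(bump_diagonal, dim=4, inputs=[mats], outputs=[out])
    tape.backward(grads={out: wp.ones_like(out)})

    print(out.numpy()[0])        # ones everywhere except 2.0 on the diagonal
    print(mats.grad.numpy()[0])  # all ones: each output component depends on one input component
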
warp/tests/test_math.py CHANGED
@@ -6,7 +6,7 @@
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 import unittest
-from typing import NamedTuple
+from typing import Any, NamedTuple
 
 import numpy as np
 
@@ -50,6 +50,51 @@ def test_scalar_math(test, device):
         assert_np_equal(tape.gradients[x].numpy(), np.array([adj_float_results_expected[i]]), tol=1e-6)
 
 
+@wp.kernel
+def test_vec_norm_kernel(vs: wp.array(dtype=Any), out: wp.array(dtype=float, ndim=2)):
+    tid = wp.tid()
+    out[tid, 0] = wp.norm_l1(vs[tid])
+    out[tid, 1] = wp.norm_l2(vs[tid])
+    out[tid, 2] = wp.norm_huber(vs[tid])
+    out[tid, 3] = wp.norm_pseudo_huber(vs[tid])
+
+
+def test_vec_norm(test, device):
+    # ground-truth implementations from SciPy
+    def huber(delta, x):
+        if x <= delta:
+            return 0.5 * x**2
+        else:
+            return delta * (x - 0.5 * delta)
+
+    def pseudo_huber(delta, x):
+        return delta**2 * (np.sqrt(1 + (x / delta) ** 2) - 1)
+
+    v0 = wp.vec3(-2.0, -1.0, -3.0)
+    v1 = wp.vec3(2.0, 1.0, 3.0)
+    v2 = wp.vec3(0.0, 0.0, 0.0)
+
+    xs = wp.array([v0, v1, v2], dtype=wp.vec3, requires_grad=True, device=device)
+    out = wp.empty((len(xs), 4), dtype=wp.float32, requires_grad=True, device=device)
+
+    wp.launch(test_vec_norm_kernel, dim=len(xs), inputs=[xs], outputs=[out], device=device)
+
+    for i, x in enumerate([v0, v1, v2]):
+        assert_np_equal(
+            out.numpy()[i],
+            np.array(
+                [
+                    np.linalg.norm(x, ord=1),
+                    np.linalg.norm(x, ord=2),
+                    huber(1.0, wp.length(x)),
+                    # note SciPy defines the Pseudo-Huber loss slightly differently
+                    pseudo_huber(1.0, wp.length(x)) + 1.0,
+                ]
+            ),
+            tol=1e-6,
+        )
+
+
 devices = get_test_devices()
 
 
@@ -117,6 +162,7 @@ class TestMath(unittest.TestCase):
 
 
 add_function_test(TestMath, "test_scalar_math", test_scalar_math, devices=devices)
+add_function_test(TestMath, "test_vec_norm", test_vec_norm, devices=devices)
 
 
 if __name__ == "__main__":
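
Note: test_vec_norm above covers the vector-norm helpers added in this release (see warp/math.py, +147 lines): wp.norm_l1, wp.norm_l2, wp.norm_huber, and wp.norm_pseudo_huber. A minimal sketch of calling them outside the test harness; judging by the reference functions in the test, the Huber variants use delta = 1.0 by default, and the pseudo-Huber value sits +1 above SciPy's definition:

    import warp as wp

    @wp.kernel
    def norms_kernel(vs: wp.array(dtype=wp.vec3), out: wp.array(dtype=float, ndim=2)):
        tid = wp.tid()
        v = vs[tid]
        out[tid, 0] = wp.norm_l1(v)  # |x| + |y| + |z|
        out[tid, 1] = wp.norm_l2(v)  # Euclidean length, same value as wp.length(v)
        out[tid, 2] = wp.norm_huber(v)  # quadratic near zero, linear in the tails
        out[tid, 3] = wp.norm_pseudo_huber(v)  # smooth everywhere, approximates the Huber norm

    vs = wp.array([wp.vec3(3.0, 4.0, 0.0)], dtype=wp.vec3)
    out = wp.zeros((1, 4), dtype=float)
    wp.launch(norms_kernel, dim=1, inputs=[vs], outputs=[out])
    print(out.numpy())  # approx. [[7.0, 5.0, 4.5, 5.1]]
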
warp/tests/test_matmul.py CHANGED
@@ -5,6 +5,7 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import itertools
 import unittest
 from typing import Any
 
@@ -105,19 +106,15 @@ class gemm_test_bed_runner:
         assert_np_equal(C.grad.numpy(), adj_C_np)
 
     def run(self):
-        Ms = [64, 128, 256]
-        Ns = [64, 128, 256]
-        Ks = [64, 128, 256]
+        Ms = [16, 32, 64]
+        Ns = [16, 32, 64]
+        Ks = [16, 32, 64]
         batch_counts = [1, 4]
         betas = [0.0, 1.0]
         alpha = 1.0
 
-        for batch_count in batch_counts:
-            for m in Ms:
-                for n in Ns:
-                    for k in Ks:
-                        for beta in betas:
-                            self.run_and_verify(m, n, k, batch_count, alpha, beta)
+        for batch_count, m, n, k, beta in itertools.product(batch_counts, Ms, Ns, Ks, betas):
+            self.run_and_verify(m, n, k, batch_count, alpha, beta)
 
 
 class gemm_test_bed_runner_transpose:
@@ -488,13 +485,17 @@ class TestMatmul(unittest.TestCase):
 
 
 # add_function_test(TestMatmul, "test_f16", test_f16, devices=devices)
-add_function_test(TestMatmul, "test_f32", test_f32, devices=devices)
-add_function_test(TestMatmul, "test_f64", test_f64, devices=devices)
-add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
-add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
-add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices)
-add_function_test(TestMatmul, "test_adjoint_accumulation", test_adjoint_accumulation, devices=devices)
-add_function_test(TestMatmul, "test_cuda_graph_capture", test_cuda_graph_capture, devices=cuda_devices)
+add_function_test(TestMatmul, "test_f32", test_f32, devices=devices, check_output=False)
+add_function_test(TestMatmul, "test_f64", test_f64, devices=devices, check_output=False)
+add_function_test(TestMatmul, "test_tape", test_tape, devices=devices, check_output=False)
+add_function_test(TestMatmul, "test_operator", test_operator, devices=devices, check_output=False)
+add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices, check_output=False)
+add_function_test(
+    TestMatmul, "test_adjoint_accumulation", test_adjoint_accumulation, devices=devices, check_output=False
+)
+add_function_test(
+    TestMatmul, "test_cuda_graph_capture", test_cuda_graph_capture, devices=cuda_devices, check_output=False
+)
 
 
 if __name__ == "__main__":
warp/tests/test_matmul_lite.py CHANGED
@@ -102,19 +102,14 @@ class gemm_test_bed_runner:
         assert_np_equal(C.grad.numpy(), adj_C_np)
 
     def run(self):
-        Ms = [8]
-        Ns = [16]
-        Ks = [32]
-        batch_counts = [1]
-        betas = [1.0]
+        m = 8
+        n = 16
+        k = 32
+        batch_count = 1
+        beta = 1.0
         alpha = 1.0
 
-        for batch_count in batch_counts:
-            for m in Ms:
-                for n in Ns:
-                    for k in Ks:
-                        for beta in betas:
-                            self.run_and_verify(m, n, k, batch_count, alpha, beta)
+        self.run_and_verify(m, n, k, batch_count, alpha, beta)
 
 
 class gemm_test_bed_runner_transpose:
@@ -397,10 +392,10 @@ class TestMatmulLite(unittest.TestCase):
     pass
 
 
-add_function_test(TestMatmulLite, "test_f32", test_f32, devices=devices)
-add_function_test(TestMatmulLite, "test_tape", test_tape, devices=devices)
-add_function_test(TestMatmulLite, "test_operator", test_operator, devices=devices)
-add_function_test(TestMatmulLite, "test_large_batch_count", test_large_batch_count, devices=devices)
+add_function_test(TestMatmulLite, "test_f32", test_f32, devices=devices, check_output=False)
+add_function_test(TestMatmulLite, "test_tape", test_tape, devices=devices, check_output=False)
+add_function_test(TestMatmulLite, "test_operator", test_operator, devices=devices, check_output=False)
+add_function_test(TestMatmulLite, "test_large_batch_count", test_large_batch_count, devices=devices, check_output=False)
 
 
 if __name__ == "__main__":
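
Note: the runner change in both matmul test files is a pure refactor: itertools.product visits the same (batch_count, m, n, k, beta) combinations as the five nested loops, in the same order, with the rightmost sequence varying fastest. A self-contained sketch of the equivalence:

    import itertools

    batch_counts, Ms, betas = [1, 4], [16, 32, 64], [0.0, 1.0]

    nested = []
    for batch_count in batch_counts:
        for m in Ms:
            for beta in betas:
                nested.append((batch_count, m, beta))

    flat = list(itertools.product(batch_counts, Ms, betas))
    assert flat == nested  # same combinations, same order, one loop instead of five

The added check_output=False flags are forwarded by add_function_test to Warp's unittest utilities; presumably they let these tests emit output without failing the harness's captured-output check, while the smaller GEMM sizes (16-64 instead of 64-256) simply cut runtime.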