PyPI - warp-lang - Versions diffs - 1.0.0b2__py3-none-manylinux2014_x86_64.whl → 1.0.0b6__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 1.0.0b2__py3-none-manylinux2014_x86_64.whl → 1.0.0b6__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (269) hide show

docs/conf.py +17 -5
examples/env/env_ant.py +1 -1
examples/env/env_cartpole.py +1 -1
examples/env/env_humanoid.py +1 -1
examples/env/env_usd.py +4 -1
examples/env/environment.py +8 -9
examples/example_dem.py +34 -33
examples/example_diffray.py +364 -337
examples/example_fluid.py +32 -23
examples/example_jacobian_ik.py +97 -93
examples/example_marching_cubes.py +6 -16
examples/example_mesh.py +6 -16
examples/example_mesh_intersect.py +16 -14
examples/example_nvdb.py +14 -16
examples/example_raycast.py +14 -13
examples/example_raymarch.py +16 -23
examples/example_render_opengl.py +19 -10
examples/example_sim_cartpole.py +82 -78
examples/example_sim_cloth.py +45 -48
examples/example_sim_fk_grad.py +51 -44
examples/example_sim_fk_grad_torch.py +47 -40
examples/example_sim_grad_bounce.py +108 -133
examples/example_sim_grad_cloth.py +99 -113
examples/example_sim_granular.py +5 -6
examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
examples/example_sim_neo_hookean.py +51 -55
examples/example_sim_particle_chain.py +4 -4
examples/example_sim_quadruped.py +126 -81
examples/example_sim_rigid_chain.py +54 -61
examples/example_sim_rigid_contact.py +66 -70
examples/example_sim_rigid_fem.py +3 -3
examples/example_sim_rigid_force.py +1 -1
examples/example_sim_rigid_gyroscopic.py +3 -4
examples/example_sim_rigid_kinematics.py +28 -39
examples/example_sim_trajopt.py +112 -110
examples/example_sph.py +9 -8
examples/example_wave.py +7 -7
examples/fem/bsr_utils.py +30 -17
examples/fem/example_apic_fluid.py +85 -69
examples/fem/example_convection_diffusion.py +97 -93
examples/fem/example_convection_diffusion_dg.py +142 -149
examples/fem/example_convection_diffusion_dg0.py +141 -136
examples/fem/example_deformed_geometry.py +146 -0
examples/fem/example_diffusion.py +115 -84
examples/fem/example_diffusion_3d.py +116 -86
examples/fem/example_diffusion_mgpu.py +102 -79
examples/fem/example_mixed_elasticity.py +139 -100
examples/fem/example_navier_stokes.py +175 -162
examples/fem/example_stokes.py +143 -111
examples/fem/example_stokes_transfer.py +186 -157
examples/fem/mesh_utils.py +59 -97
examples/fem/plot_utils.py +138 -17
tools/ci/publishing/build_nodes_info.py +54 -0
warp/__init__.py +4 -3
warp/__init__.pyi +1 -0
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +5 -3
warp/build_dll.py +29 -9
warp/builtins.py +836 -492
warp/codegen.py +864 -553
warp/config.py +3 -1
warp/context.py +389 -172
warp/fem/__init__.py +24 -6
warp/fem/cache.py +318 -25
warp/fem/dirichlet.py +7 -3
warp/fem/domain.py +14 -0
warp/fem/field/__init__.py +30 -38
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +244 -138
warp/fem/field/restriction.py +8 -6
warp/fem/field/test.py +127 -59
warp/fem/field/trial.py +117 -60
warp/fem/geometry/__init__.py +5 -1
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +24 -1
warp/fem/geometry/geometry.py +86 -14
warp/fem/geometry/grid_2d.py +112 -54
warp/fem/geometry/grid_3d.py +134 -65
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +85 -33
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +451 -115
warp/fem/geometry/trimesh_2d.py +197 -92
warp/fem/integrate.py +534 -268
warp/fem/operator.py +58 -31
warp/fem/polynomial.py +11 -0
warp/fem/quadrature/__init__.py +1 -1
warp/fem/quadrature/pic_quadrature.py +150 -58
warp/fem/quadrature/quadrature.py +209 -57
warp/fem/space/__init__.py +230 -53
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +49 -2
warp/fem/space/function_space.py +90 -39
warp/fem/space/grid_2d_function_space.py +149 -496
warp/fem/space/grid_3d_function_space.py +173 -538
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +129 -76
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +46 -34
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +132 -1039
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +104 -742
warp/fem/types.py +13 -11
warp/fem/utils.py +335 -60
warp/native/array.h +120 -34
warp/native/builtin.h +101 -72
warp/native/bvh.cpp +73 -325
warp/native/bvh.cu +406 -23
warp/native/bvh.h +22 -40
warp/native/clang/clang.cpp +1 -0
warp/native/crt.h +2 -0
warp/native/cuda_util.cpp +8 -3
warp/native/cuda_util.h +1 -0
warp/native/exports.h +1522 -1243
warp/native/intersect.h +19 -4
warp/native/intersect_adj.h +8 -8
warp/native/mat.h +76 -17
warp/native/mesh.cpp +33 -108
warp/native/mesh.cu +114 -18
warp/native/mesh.h +395 -40
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +44 -34
warp/native/reduce.cpp +1 -1
warp/native/sparse.cpp +4 -4
warp/native/sparse.cu +163 -155
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +18 -14
warp/native/vec.h +103 -21
warp/native/warp.cpp +2 -1
warp/native/warp.cu +28 -3
warp/native/warp.h +4 -3
warp/render/render_opengl.py +261 -109
warp/sim/__init__.py +1 -2
warp/sim/articulation.py +385 -185
warp/sim/import_mjcf.py +59 -48
warp/sim/import_urdf.py +15 -15
warp/sim/import_usd.py +174 -102
warp/sim/inertia.py +17 -18
warp/sim/integrator_xpbd.py +4 -3
warp/sim/model.py +330 -250
warp/sim/render.py +1 -1
warp/sparse.py +625 -152
warp/stubs.py +341 -309
warp/tape.py +9 -6
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +94 -74
warp/tests/test_array.py +82 -101
warp/tests/test_array_reduce.py +57 -23
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +22 -12
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +18 -18
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +165 -134
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +75 -75
warp/tests/test_examples.py +237 -0
warp/tests/test_fabricarray.py +22 -24
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1034 -124
warp/tests/test_fp16.py +23 -16
warp/tests/test_func.py +187 -86
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +123 -181
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +35 -34
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +24 -25
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +14 -41
warp/tests/test_lerp.py +64 -65
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +517 -2898
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +304 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +60 -22
warp/tests/test_mesh_query_aabb.py +21 -25
warp/tests/test_mesh_query_point.py +111 -22
warp/tests/test_mesh_query_ray.py +12 -24
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +168 -20
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +261 -63
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +268 -63
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +90 -86
warp/tests/test_transient_module.py +10 -12
warp/tests/test_types.py +363 -0
warp/tests/test_utils.py +451 -0
warp/tests/test_vec.py +354 -2050
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +418 -376
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +291 -0
warp/tests/unittest_utils.py +342 -0
warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +589 -0
warp/types.py +622 -211
warp/utils.py +54 -393
warp_lang-1.0.0b6.dist-info/METADATA +238 -0
warp_lang-1.0.0b6.dist-info/RECORD +409 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
examples/example_cache_management.py +0 -40
examples/example_multigpu.py +0 -54
examples/example_struct.py +0 -65
examples/fem/example_stokes_transfer_3d.py +0 -210
warp/fem/field/discrete_field.py +0 -80
warp/fem/space/nodal_function_space.py +0 -233
warp/tests/test_all.py +0 -223
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-1.0.0b2.dist-info/METADATA +0 -26
warp_lang-1.0.0b2.dist-info/RECORD +0 -378
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0

warp/tests/test_matmul_lite.py ADDED Viewed

@@ -0,0 +1,410 @@
+# Copyright (c) 2023 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+import unittest
+import numpy as np
+import warp as wp
+from warp.tests.unittest_utils import *
+wp.init()
+from warp.context import runtime  # noqa: E402
+class gemm_test_bed_runner:
+    def __init__(self, dtype, device):
+        self.dtype = dtype
+        self.device = device
+    def alloc(self, m, n, k, batch_count):
+        rng = np.random.default_rng(42)
+        low = -4.5
+        high = 3.5
+        if batch_count == 1:
+            A = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+        else:
+            A = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+        return A, B, C, D
+    def run_and_verify(self, m, n, k, batch_count, alpha, beta):
+        A, B, C, D = self.alloc(m, n, k, batch_count)
+        ones = wp.zeros_like(D)
+        ones.fill_(1.0)
+        if batch_count == 1:
+            tape = wp.Tape()
+            with tape:
+                wp.matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape.backward(grads={D: ones})
+            D_np = alpha * (A.numpy() @ B.numpy()) + beta * C.numpy()
+            assert np.array_equal(D_np, D.numpy())
+            adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose())
+            adj_B_np = alpha * (A.numpy().transpose() @ ones.numpy())
+            adj_C_np = beta * ones.numpy()
+        else:
+            tape = wp.Tape()
+            with tape:
+                wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
+            tape.backward(grads={D: ones})
+            D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
+            assert np.array_equal(D_np, D.numpy())
+            adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose((0, 2, 1)))
+            adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones.numpy())
+            adj_C_np = beta * ones.numpy()
+        assert np.array_equal(adj_A_np, A.grad.numpy())
+        assert np.array_equal(adj_B_np, B.grad.numpy())
+        assert np.array_equal(adj_C_np, C.grad.numpy())
+    def run(self):
+        Ms = [8]
+        Ns = [16]
+        Ks = [32]
+        batch_counts = [1]
+        betas = [1.0]
+        alpha = 1.0
+        for batch_count in batch_counts:
+            for m in Ms:
+                for n in Ns:
+                    for k in Ks:
+                        for beta in betas:
+                            self.run_and_verify(m, n, k, batch_count, alpha, beta)
+class gemm_test_bed_runner_transpose:
+    def __init__(self, dtype, device):
+        self.dtype = dtype
+        self.device = device
+    def alloc(self, m, n, k, batch_count):
+        rng = np.random.default_rng(42)
+        low = -4.5
+        high = 3.5
+        if batch_count == 1:
+            A = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array2d(
+                np.ceil(rng.uniform(low=low, high=high, size=(m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array2d(np.zeros((m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array2d(A.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array2d(B.numpy().transpose([1, 0]), dtype=self.dtype, device=self.device, requires_grad=True)
+        else:
+            A = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            B = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            C = wp.array3d(
+                np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+                dtype=self.dtype,
+                device=self.device,
+                requires_grad=True,
+            )
+            D = wp.array3d(np.zeros((batch_count, m, n)), dtype=self.dtype, device=self.device, requires_grad=True)
+            AT = wp.array3d(A.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
+            BT = wp.array3d(B.numpy().transpose([0, 2, 1]), dtype=self.dtype, device=self.device, requires_grad=True)
+        return A, B, C, D, AT, BT
+    def run_and_verify(self, m, n, k, batch_count, alpha, beta):
+        A, B, C1, D1, AT1, BT1 = self.alloc(m, n, k, batch_count)
+        C2 = wp.clone(C1)
+        C3 = wp.clone(C1)
+        D2 = wp.clone(D1)
+        D3 = wp.clone(D1)
+        AT2 = wp.clone(AT1)
+        BT2 = wp.clone(BT1)
+        ones1 = wp.zeros_like(D1)
+        ones1.fill_(1.0)
+        ones2 = wp.zeros_like(D2)
+        ones2.fill_(1.0)
+        ones3 = wp.zeros_like(D3)
+        ones3.fill_(1.0)
+        if batch_count == 1:
+            ATT1 = AT1.transpose([1, 0])
+            BTT1 = BT1.transpose([1, 0])
+            ATT2 = AT2.transpose([1, 0])
+            BTT2 = BT2.transpose([1, 0])
+            tape = wp.Tape()
+            with tape:
+                wp.matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
+                wp.matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
+                wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+            D_np = alpha * (A.numpy() @ B.numpy()) + beta * C1.numpy()
+            assert np.array_equal(D_np, D1.numpy())
+            assert np.array_equal(D_np, D2.numpy())
+            assert np.array_equal(D_np, D3.numpy())
+            adj_A_np = alpha * (ones1.numpy() @ B.numpy().transpose())
+            adj_B_np = alpha * (A.numpy().transpose() @ ones1.numpy())
+            adj_C_np = beta * ones1.numpy()
+        else:
+            ATT1 = AT1.transpose([0, 2, 1])
+            BTT1 = BT1.transpose([0, 2, 1])
+            ATT2 = AT2.transpose([0, 2, 1])
+            BTT2 = BT2.transpose([0, 2, 1])
+            tape = wp.Tape()
+            with tape:
+                wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
+                wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
+                wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
+            D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C1.numpy()
+            assert np.array_equal(D_np, D1.numpy())
+            assert np.array_equal(D_np, D2.numpy())
+            assert np.array_equal(D_np, D3.numpy())
+            adj_A_np = alpha * np.matmul(ones1.numpy(), B.numpy().transpose((0, 2, 1)))
+            adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones1.numpy())
+            adj_C_np = beta * ones1.numpy()
+        assert np.array_equal(adj_A_np, A.grad.numpy())
+        assert np.array_equal(adj_A_np, ATT1.grad.numpy())
+        assert np.array_equal(adj_A_np, ATT2.grad.numpy())
+        assert np.array_equal(adj_B_np, B.grad.numpy())
+        assert np.array_equal(adj_B_np, BTT1.grad.numpy())
+        assert np.array_equal(adj_B_np, BTT2.grad.numpy())
+        assert np.array_equal(adj_C_np, C1.grad.numpy())
+        assert np.array_equal(adj_C_np, C2.grad.numpy())
+        assert np.array_equal(adj_C_np, C3.grad.numpy())
+    def run(self):
+        m = 8
+        n = 16
+        k = 32
+        batch_counts = [1, 4]
+        beta = 1.0
+        alpha = 1.0
+        for batch_count in batch_counts:
+            self.run_and_verify(m, n, k, batch_count, alpha, beta)
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_f32(test, device):
+    gemm_test_bed_runner(wp.float32, device).run()
+    gemm_test_bed_runner_transpose(wp.float32, device).run()
+@wp.kernel
+def matrix_sum_kernel(arr: wp.array2d(dtype=float), loss: wp.array(dtype=float)):
+    i, j = wp.tid()
+    wp.atomic_add(loss, 0, arr[i, j])
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_tape(test, device):
+    rng = np.random.default_rng(42)
+    low = -4.5
+    high = 3.5
+    m = 8
+    n = 16
+    k = 32
+    A = wp.array2d(
+        np.ceil(rng.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
+    )
+    B = wp.array2d(
+        np.ceil(rng.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
+    )
+    C = wp.array2d(
+        np.ceil(rng.uniform(low=low, high=high, size=(m, n))), dtype=float, device=device, requires_grad=True
+    )
+    D = wp.array2d(np.zeros((m, n)), dtype=float, device=device, requires_grad=True)
+    loss = wp.zeros(1, dtype=float, device=device, requires_grad=True)
+    # test tape
+    tape = wp.Tape()
+    with tape:
+        wp.matmul(A, B, C, D, device=device)
+        wp.launch(matrix_sum_kernel, dim=(m, n), inputs=[D, loss], device=device)
+    tape.backward(loss=loss)
+    A_grad = A.grad.numpy()
+    tape.reset()
+    # test adjoint
+    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
+    wp.adj_matmul(A, B, C, A.grad, B.grad, C.grad, D.grad, device=device)
+    assert_np_equal(A_grad, A.grad.numpy())
+    # test zero
+    tape.zero()
+    assert_array_equal(A.grad, wp.zeros_like(A))
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_operator(test, device):
+    rng = np.random.default_rng(42)
+    low = -4.5
+    high = 3.5
+    m = 8
+    n = 16
+    k = 32
+    A = wp.array2d(
+        np.ceil(rng.uniform(low=low, high=high, size=(m, k))), dtype=float, device=device, requires_grad=True
+    )
+    B = wp.array2d(
+        np.ceil(rng.uniform(low=low, high=high, size=(k, n))), dtype=float, device=device, requires_grad=True
+    )
+    loss = wp.zeros(1, dtype=float, device=device, requires_grad=True)
+    # test tape
+    tape = wp.Tape()
+    with tape:
+        D = A @ B
+        wp.launch(matrix_sum_kernel, dim=(m, n), inputs=[D, loss], device=device)
+    tape.backward(loss=loss)
+    # test adjoint
+    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
+    B_transpose = wp.array2d(B.transpose().numpy(), dtype=float, device=device)
+    adj_A = D.grad @ B_transpose
+    assert_array_equal(adj_A, A.grad)
+    # test zero
+    tape.zero()
+    assert_array_equal(A.grad, wp.zeros_like(A))
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_large_batch_count(test, device):
+    rng = np.random.default_rng(42)
+    low = -4.5
+    high = 3.5
+    m = 2
+    n = 3
+    k = 4
+    batch_count = 65535 * 2 + int(65535 / 2)
+    A = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, k))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    B = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, k, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    C = wp.array3d(
+        np.ceil(rng.uniform(low=low, high=high, size=(batch_count, m, n))),
+        dtype=float,
+        device=device,
+        requires_grad=True,
+    )
+    D = wp.array3d(np.zeros((batch_count, m, n)), dtype=float, device=device, requires_grad=True)
+    ones = wp.zeros_like(D)
+    ones.fill_(1.0)
+    alpha = 1.0
+    beta = 1.0
+    tape = wp.Tape()
+    with tape:
+        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False, device=device)
+    tape.backward(grads={D: ones})
+    D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
+    assert np.array_equal(D_np, D.numpy())
+    adj_A_np = alpha * np.matmul(ones.numpy(), B.numpy().transpose((0, 2, 1)))
+    adj_B_np = alpha * np.matmul(A.numpy().transpose((0, 2, 1)), ones.numpy())
+    adj_C_np = beta * ones.numpy()
+    assert np.array_equal(adj_A_np, A.grad.numpy())
+    assert np.array_equal(adj_B_np, B.grad.numpy())
+    assert np.array_equal(adj_C_np, C.grad.numpy())
+devices = get_test_devices()
+class TestMatmulLite(unittest.TestCase):
+    pass
+add_function_test(TestMatmulLite, "test_f32", test_f32, devices=devices)
+add_function_test(TestMatmulLite, "test_tape", test_tape, devices=devices)
+add_function_test(TestMatmulLite, "test_operator", test_operator, devices=devices)
+add_function_test(TestMatmulLite, "test_large_batch_count", test_large_batch_count, devices=devices)
+if __name__ == "__main__":
+    wp.build.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=False)

warp/tests/test_mesh.py CHANGED Viewed

@@ -10,8 +10,7 @@ import unittest
 import numpy as np
 import warp as wp
-from warp.tests.test_base import *
+from warp.tests.unittest_utils import *
 # fmt: off
@@ -223,9 +222,9 @@ def query_ray_kernel(
 def test_mesh_query_ray(test, device):
-    points = wp.array(POINT_POSITIONS, dtype=wp.vec3)
+    points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device=device)
-    indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int)
+    indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
     mesh = wp.Mesh(points=points, indices=indices)
     expected_sign = -1.0
     wp.launch(
@@ -235,9 +234,10 @@ def test_mesh_query_ray(test, device):
             mesh.id,
             expected_sign,
         ],
+        device=device,
     )
-    indices = wp.array(LEFT_HANDED_FACE_VERTEX_INDICES, dtype=int)
+    indices = wp.array(LEFT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
     mesh = wp.Mesh(points=points, indices=indices)
     expected_sign = 1.0
     wp.launch(
@@ -247,40 +247,78 @@ def test_mesh_query_ray(test, device):
             mesh.id,
             expected_sign,
         ],
+        device=device,
     )
 def test_mesh_refit_graph(test, device):
-    points = wp.array(POINT_POSITIONS, dtype=wp.vec3)
+    points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device=device)
-    indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int)
+    indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
     mesh = wp.Mesh(points=points, indices=indices)
-    wp.capture_begin()
-    mesh.refit()
-    graph = wp.capture_end()
+    wp.capture_begin(device, force_module_load=False)
+    try:
+        mesh.refit()
+    finally:
+        graph = wp.capture_end(device)
     # replay
     num_iters = 10
     for _ in range(num_iters):
         wp.capture_launch(graph)
+    wp.synchronize_device(device)
+def test_mesh_exceptions(test, device):
+    # points and indices must be on same device
+    with test.assertRaises(RuntimeError):
+        points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device="cpu")
+        indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
+        wp.Mesh(points=points, indices=indices)
+    # points must be vec3
+    with test.assertRaises(RuntimeError):
+        points = wp.array(POINT_POSITIONS, dtype=wp.vec3d, device=device)
+        indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
+        wp.Mesh(points=points, indices=indices)
+    # velocities must be vec3
+    with test.assertRaises(RuntimeError):
+        points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device=device)
+        velocities = wp.zeros(points.shape, dtype=wp.vec3d, device=device)
+        indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
+        wp.Mesh(points=points, indices=indices, velocities=velocities)
+    # indices must be int32
+    with test.assertRaises(RuntimeError):
+        points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device=device)
+        indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=wp.int64, device=device)
+        wp.Mesh(points=points, indices=indices)
+    # indices must be 1d
+    with test.assertRaises(RuntimeError):
+        points = wp.array(POINT_POSITIONS, dtype=wp.vec3, device=device)
+        indices = wp.array(RIGHT_HANDED_FACE_VERTEX_INDICES, dtype=int, device=device)
+        indices = indices.reshape((3, -1))
+        wp.Mesh(points=points, indices=indices)
+devices = get_test_devices()
-def register(parent):
-    devices = get_test_devices()
+class TestMesh(unittest.TestCase):
+    pass
-    class TestMesh(parent):
-        pass
-    add_function_test(TestMesh, "test_mesh_read_properties", test_mesh_read_properties, devices=devices)
-    add_function_test(TestMesh, "test_mesh_query_point", test_mesh_query_point, devices=devices)
-    add_function_test(TestMesh, "test_mesh_query_ray", test_mesh_query_ray, devices=devices)
-    add_function_test(TestMesh, "test_mesh_refit_graph", test_mesh_refit_graph, devices=wp.get_cuda_devices())
-    return TestMesh
+add_function_test(TestMesh, "test_mesh_read_properties", test_mesh_read_properties, devices=devices)
+add_function_test(TestMesh, "test_mesh_query_point", test_mesh_query_point, devices=devices)
+add_function_test(TestMesh, "test_mesh_query_ray", test_mesh_query_ray, devices=devices)
+add_function_test(TestMesh, "test_mesh_refit_graph", test_mesh_refit_graph, devices=get_unique_cuda_test_devices())
+add_function_test(TestMesh, "test_mesh_exceptions", test_mesh_exceptions, devices=get_unique_cuda_test_devices())
 if __name__ == "__main__":
-    _ = register(unittest.TestCase)
+    wp.build.clear_kernel_cache()
     unittest.main(verbosity=2)

warp/tests/test_mesh_query_aabb.py CHANGED Viewed

@@ -5,10 +5,12 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
+import unittest
 import numpy as np
 import warp as wp
-from warp.tests.test_base import *
+from warp.tests.unittest_utils import *
 wp.init()
@@ -96,7 +98,6 @@ def test_compute_bounds(test, device):
     lower_view = lowers.numpy()
     upper_view = uppers.numpy()
-    wp.synchronize()
     # Confirm the bounds of each triangle are correct.
     test.assertTrue(lower_view[0][0] == 0)
@@ -148,8 +149,6 @@ def test_mesh_query_aabb_count_overlap(test, device):
         device=device,
     )
-    wp.synchronize()
     view = counts.numpy()
     # 2 triangles that share a vertex having overlapping AABBs.
@@ -188,8 +187,6 @@ def test_mesh_query_aabb_count_nonoverlap(test, device):
         device=device,
     )
-    wp.synchronize()
     view = counts.numpy()
     # AABB query only returns one triangle at a time, the triangles are not close enough to overlap.
@@ -197,29 +194,28 @@ def test_mesh_query_aabb_count_nonoverlap(test, device):
         test.assertTrue(c == 1)
-def register(parent):
-    devices = get_test_devices()
+devices = get_test_devices()
-    class TestMeshQueryAABBMethods(parent):
-        pass
-    add_function_test(TestMeshQueryAABBMethods, "test_compute_bounds", test_compute_bounds, devices=devices)
-    add_function_test(
-        TestMeshQueryAABBMethods,
-        "test_mesh_query_aabb_count_overlap",
-        test_mesh_query_aabb_count_overlap,
-        devices=devices,
-    )
-    add_function_test(
-        TestMeshQueryAABBMethods,
-        "test_mesh_query_aabb_count_nonoverlap",
-        test_mesh_query_aabb_count_nonoverlap,
-        devices=devices,
-    )
+class TestMeshQueryAABBMethods(unittest.TestCase):
+    pass
-    return TestMeshQueryAABBMethods
+add_function_test(TestMeshQueryAABBMethods, "test_compute_bounds", test_compute_bounds, devices=devices)
+add_function_test(
+    TestMeshQueryAABBMethods,
+    "test_mesh_query_aabb_count_overlap",
+    test_mesh_query_aabb_count_overlap,
+    devices=devices,
+)
+add_function_test(
+    TestMeshQueryAABBMethods,
+    "test_mesh_query_aabb_count_nonoverlap",
+    test_mesh_query_aabb_count_nonoverlap,
+    devices=devices,
+)
 if __name__ == "__main__":
-    c = register(unittest.TestCase)
+    wp.build.clear_kernel_cache()
     unittest.main(verbosity=2)