PyPI - warp-lang - Versions diffs - 1.6.2__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl - Mend

warp-lang 1.6.2__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (179)

warp/__init__.py +7 -1
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +410 -0
warp/build_dll.py +6 -14
warp/builtins.py +452 -362
warp/codegen.py +179 -119
warp/config.py +42 -6
warp/context.py +490 -271
warp/dlpack.py +8 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +2 -2
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_magnetostatics.py +6 -6
warp/examples/fem/utils.py +9 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/tile/example_tile_matmul.py +2 -4
warp/fem/__init__.py +11 -1
warp/fem/adaptivity.py +4 -4
warp/fem/field/nodal_field.py +22 -68
warp/fem/field/virtual.py +62 -23
warp/fem/geometry/adaptive_nanogrid.py +9 -10
warp/fem/geometry/closest_point.py +1 -1
warp/fem/geometry/deformed_geometry.py +5 -2
warp/fem/geometry/geometry.py +5 -0
warp/fem/geometry/grid_2d.py +12 -12
warp/fem/geometry/grid_3d.py +12 -15
warp/fem/geometry/hexmesh.py +5 -7
warp/fem/geometry/nanogrid.py +9 -11
warp/fem/geometry/quadmesh.py +13 -13
warp/fem/geometry/tetmesh.py +3 -4
warp/fem/geometry/trimesh.py +3 -8
warp/fem/integrate.py +262 -93
warp/fem/linalg.py +5 -5
warp/fem/quadrature/pic_quadrature.py +37 -22
warp/fem/quadrature/quadrature.py +194 -25
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +4 -2
warp/fem/space/basis_space.py +25 -18
warp/fem/space/hexmesh_function_space.py +2 -2
warp/fem/space/partition.py +6 -2
warp/fem/space/quadmesh_function_space.py +8 -8
warp/fem/space/shape/cube_shape_function.py +23 -23
warp/fem/space/shape/square_shape_function.py +12 -12
warp/fem/space/shape/triangle_shape_function.py +1 -1
warp/fem/space/tetmesh_function_space.py +3 -3
warp/fem/space/trimesh_function_space.py +2 -2
warp/fem/utils.py +12 -6
warp/jax.py +14 -1
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +14 -27
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +89 -0
warp/native/array.h +13 -0
warp/native/builtin.h +29 -3
warp/native/bvh.cpp +3 -1
warp/native/bvh.cu +42 -14
warp/native/bvh.h +2 -1
warp/native/clang/clang.cpp +30 -3
warp/native/cuda_util.cpp +14 -0
warp/native/cuda_util.h +2 -0
warp/native/exports.h +68 -63
warp/native/intersect.h +26 -26
warp/native/intersect_adj.h +33 -33
warp/native/marching.cu +1 -1
warp/native/mat.h +513 -9
warp/native/mesh.h +10 -10
warp/native/quat.h +99 -11
warp/native/rand.h +6 -0
warp/native/sort.cpp +122 -59
warp/native/sort.cu +152 -15
warp/native/sort.h +8 -1
warp/native/sparse.cpp +43 -22
warp/native/sparse.cu +52 -17
warp/native/svd.h +116 -0
warp/native/tile.h +301 -105
warp/native/tile_reduce.h +46 -3
warp/native/vec.h +68 -7
warp/native/volume.cpp +85 -113
warp/native/volume_builder.cu +25 -10
warp/native/volume_builder.h +6 -0
warp/native/warp.cpp +5 -6
warp/native/warp.cu +99 -10
warp/native/warp.h +19 -10
warp/optim/linear.py +10 -10
warp/sim/articulation.py +4 -4
warp/sim/collide.py +21 -10
warp/sim/import_mjcf.py +449 -155
warp/sim/import_urdf.py +32 -12
warp/sim/integrator_euler.py +5 -5
warp/sim/integrator_featherstone.py +3 -10
warp/sim/integrator_vbd.py +207 -2
warp/sim/integrator_xpbd.py +5 -5
warp/sim/model.py +42 -13
warp/sim/utils.py +2 -2
warp/sparse.py +642 -555
warp/stubs.py +216 -19
warp/tests/__main__.py +0 -15
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
warp/tests/{test_collision.py → sim/test_collision.py} +2 -2
warp/tests/{test_model.py → sim/test_model.py} +40 -0
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_bool.py +1 -1
warp/tests/test_examples.py +28 -36
warp/tests/test_fem.py +23 -4
warp/tests/test_linear_solvers.py +0 -11
warp/tests/test_mat.py +233 -79
warp/tests/test_mat_scalar_ops.py +4 -4
warp/tests/test_overwrite.py +0 -60
warp/tests/test_quat.py +67 -46
warp/tests/test_rand.py +44 -37
warp/tests/test_sparse.py +47 -6
warp/tests/test_spatial.py +75 -0
warp/tests/test_static.py +1 -1
warp/tests/test_utils.py +84 -4
warp/tests/test_vec.py +46 -34
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +1 -1
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
warp/tests/unittest_serial.py +1 -0
warp/tests/unittest_suites.py +45 -59
warp/tests/unittest_utils.py +2 -1
warp/thirdparty/unittest_parallel.py +3 -1
warp/types.py +110 -658
warp/utils.py +137 -72
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/METADATA +29 -7
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/RECORD +172 -162
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp/examples/optim/example_walker.py +0 -317
warp/native/cutlass_gemm.cpp +0 -43
warp/native/cutlass_gemm.cu +0 -382
warp/tests/test_matmul.py +0 -511
warp/tests/test_matmul_lite.py +0 -411
warp/tests/test_vbd.py +0 -386
warp/tests/unused_test_misc.py +0 -77
/warp/tests/{test_async.py → cuda/test_async.py} +0 -0
/warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
/warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
/warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
/warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
/warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
/warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
/warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
/warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
/warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
/warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
/warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
/warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
/warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
/warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +0 -0
/warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
/warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
/warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info/licenses}/LICENSE.md +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/tests/test_examples.py CHANGED Viewed

@@ -25,15 +25,14 @@ Generally the test_options[_cpu,_cuda] dictionaries should be used to prevent
 graphical windows from being open by the example {"headless": True} and to
 override example defaults so the example can run in less than ten seconds.
-Use {"usd_required": True} and {"torch_required": True} to skip running the test
-if usd-core or torch are not found in the Python environment.
-Use "cutlass_required": True} to skip the test if Warp needs to be built with
-CUTLASS.
+To skip tests if the optional dependencies are not found, use the following keys:
+- {"usd_required": True} (requires usd-core)
+- {"torch_required": True} (requires torch)
+- {"pillow_required": True} (requires pillow)
 Use the "num_frames" and "train_iters" keys to control the number of steps.
-Use "test_timeout" to override the default test timeout threshold of 300 seconds.
+Use "test_timeout" to override the default test timeout threshold of 600 seconds.
 """
 import os
@@ -52,7 +51,7 @@ from warp.tests.unittest_utils import (
 )
 from warp.utils import check_p2p
-wp.init()  # For wp.context.runtime.core.is_cutlass_enabled()
+wp.init()  # For wp.context.runtime.core.is_debug_enabled()
 def _build_command_line_options(test_options: Dict[str, Any]) -> list:
@@ -119,9 +118,13 @@ def add_example_test(
         if usd_required and not USD_AVAILABLE:
             test.skipTest("Requires usd-core")
-        cutlass_required = options.pop("cutlass_required", False)
-        if cutlass_required and not wp.context.runtime.core.is_cutlass_enabled():
-            test.skipTest("Warp was not built with CUTLASS support")
+        # Mark the test as skipped if pillow is not installed but required
+        pillow_required = options.pop("pillow_required", False)
+        if pillow_required:
+            try:
+                import PIL  # noqa: F401
+            except ImportError:
+                test.skipTest("Requires pillow")
         # Find the current Warp cache
         warp_cache_path = wp.config.kernel_cache_dir
@@ -169,7 +172,7 @@ def add_example_test(
         command.extend(_build_command_line_options(options))
         # Set the test timeout in seconds
-        test_timeout = options.pop("test_timeout", 300)
+        test_timeout = options.pop("test_timeout", 600)
         # with wp.ScopedTimer(f"{name}_{sanitize_identifier(device)}"):
         # Run the script as a subprocess
@@ -242,19 +245,23 @@ add_example_test(
     devices=test_devices,
     test_options={"height": 512, "width": 1024, "headless": True},
 )
+add_example_test(
+    TestCoreExamples,
+    name="core.example_sample_mesh",
+    devices=test_devices,
+    test_options_cpu={"num_frames": 1},
+)
 add_example_test(
     TestCoreExamples,
     name="core.example_sph",
     devices=test_devices,
     test_options_cpu={"num_frames": 1},
-    test_options_cuda={"test_timeout": 600},
 )
 add_example_test(
     TestCoreExamples,
     name="core.example_torch",
     devices=test_devices,
     test_options={"headless": True, "num_frames": 1000, "torch_required": True},
-    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(TestCoreExamples, name="core.example_wave", devices=test_devices)
@@ -268,7 +275,6 @@ add_example_test(
     name="optim.example_bounce",
     devices=test_devices,
     test_options_cpu={"train_iters": 3},
-    test_options_cuda={"test_timeout": 600},
 )
 add_example_test(
     TestOptimExamples,
@@ -281,7 +287,6 @@ add_example_test(
     TestOptimExamples,
     name="optim.example_cloth_throw",
     devices=test_devices,
-    test_options={"test_timeout": 600},
     test_options_cpu={"train_iters": 3},
 )
 add_example_test(
@@ -291,6 +296,12 @@ add_example_test(
     test_options={"usd_required": True, "headless": True},
     test_options_cpu={"train_iters": 2},
 )
+add_example_test(
+    TestOptimExamples,
+    name="optim.example_fluid_checkpoint",
+    devices=cuda_test_devices,
+    test_options={"headless": True, "train_iters": 5, "num_frames": 300, "pillow_required": True},
+)
 add_example_test(TestOptimExamples, name="optim.example_inverse_kinematics", devices=test_devices)
 add_example_test(
     TestOptimExamples,
@@ -305,19 +316,6 @@ add_example_test(
     devices=test_devices,
     test_options={"headless": True, "train_iters": 50},
 )
-# NOTE: This example uses CUTLASS and will run orders of magnitude slower when Warp is built in debug mode
-add_example_test(
-    TestOptimExamples,
-    name="optim.example_walker",
-    devices=test_devices,
-    test_options={"usd_required": True},
-    test_options_cuda={
-        "train_iters": 1 if warp.context.runtime.core.is_debug_enabled() else 3,
-        "num_frames": 1 if warp.context.runtime.core.is_debug_enabled() else 60,
-        "cutlass_required": True,
-    },
-    test_options_cpu={"train_iters": 1, "num_frames": 30},
-)
 add_example_test(
     TestOptimExamples,
     name="optim.example_softbody_properties",
@@ -333,15 +331,13 @@ class TestSimExamples(unittest.TestCase):
     pass
-add_example_test(
-    TestSimExamples, name="sim.example_cartpole", devices=test_devices, test_options_cuda={"test_timeout": 600}
-)
+add_example_test(TestSimExamples, name="sim.example_cartpole", devices=test_devices)
 add_example_test(
     TestSimExamples,
     name="sim.example_cloth",
     devices=test_devices,
     test_options={"usd_required": True},
-    test_options_cpu={"num_frames": 10, "test_timeout": 600},
+    test_options_cpu={"num_frames": 10},
 )
 add_example_test(
     TestSimExamples, name="sim.example_granular", devices=test_devices, test_options_cpu={"num_frames": 10}
@@ -421,28 +417,24 @@ add_example_test(
     name="fem.example_convection_diffusion",
     devices=test_devices,
     test_options={"resolution": 20, "headless": True},
-    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples,
     name="fem.example_burgers",
     devices=test_devices,
     test_options={"resolution": 20, "num_frames": 25, "degree": 1, "headless": True},
-    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples,
     name="fem.example_convection_diffusion_dg",
     devices=test_devices,
     test_options={"resolution": 20, "num_frames": 25, "headless": True},
-    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples,
     name="fem.example_mixed_elasticity",
     devices=test_devices,
     test_options={"nonconforming_stresses": True, "mesh": "quad", "headless": True},
-    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples, name="fem.example_stokes_transfer", devices=test_devices, test_options={"headless": True}

warp/tests/test_fem.py CHANGED Viewed

@@ -33,6 +33,7 @@ from warp.fem.utils import (
     grid_to_tets,
     grid_to_tris,
 )
+from warp.sparse import bsr_zeros
 from warp.tests.unittest_utils import *
 vec6f = wp.vec(length=6, dtype=float)
@@ -147,11 +148,12 @@ def test_interpolate_gradient(test, device):
         scalar_space = fem.make_polynomial_space(geo, degree=2)
         # Point-based vector space
-        # So we can test gradient with respect to inteprolation point position
+        # So we can test gradient with respect to interpolation point position
         point_coords = wp.array([[[0.5, 0.5, 0.0]]], dtype=fem.Coords, requires_grad=True)
-        interpolation_nodes = fem.PointBasisSpace(
-            fem.ExplicitQuadrature(domain=fem.Cells(geo), points=point_coords, weights=wp.array([[1.0]], dtype=float))
+        point_quadrature = fem.ExplicitQuadrature(
+            domain=fem.Cells(geo), points=point_coords, weights=wp.array([[1.0]], dtype=float)
         )
+        interpolation_nodes = fem.PointBasisSpace(point_quadrature)
         vector_space = fem.make_collocated_function_space(interpolation_nodes, dtype=wp.vec2)
         # Initialize scalar field with known function
@@ -213,6 +215,23 @@ def test_interpolate_gradient(test, device):
         )
         assert_np_equal(point_coords.grad.numpy(), np.array([[[2.0, 0.0, 0.0]]]))
+        # Compare against jacobian
+        scalar_trial = fem.make_trial(scalar_space)
+        jacobian = bsr_zeros(
+            rows_of_blocks=point_quadrature.total_point_count(),
+            cols_of_blocks=scalar_space.node_count(),
+            block_type=wp.mat(shape=(2, 1), dtype=float),
+        )
+        fem.interpolate(
+            grad_field,
+            dest=jacobian,
+            quadrature=point_quadrature,
+            fields={"p": scalar_trial},
+            kernel_options={"enable_backward": False},
+        )
+        assert jacobian.nnz_sync() == 4  # one non-zero per edge center
+        assert_np_equal((jacobian @ scalar_field.dof_values.grad).numpy(), [[0.0, 0.5]])
 @integrand
 def vector_divergence_form(s: Sample, u: Field, q: Field):
@@ -1868,7 +1887,7 @@ def test_qr_eigenvalues():
     wp.expect_near(wp.ddot(Err4, Err4), 0.0, tol)
     # test robustness to low requested tolerance
-    Rank6 = mat66f(
+    Rank6 = wp.matrix_from_cols(
         vec6f(0.00171076, 0.0, 0.0, 0.0, 0.0, 0.0),
         vec6f(0.0, 0.00169935, 6.14367e-06, -3.52589e-05, 3.02397e-05, -1.53458e-11),
         vec6f(0.0, 6.14368e-06, 0.00172217, 2.03568e-05, 1.74589e-05, -2.92627e-05),

warp/tests/test_linear_solvers.py CHANGED Viewed

@@ -21,8 +21,6 @@ import warp as wp
 from warp.optim.linear import bicgstab, cg, cr, gmres, preconditioner
 from warp.tests.unittest_utils import *
-wp.init()  # For runtime.core.is_cutlass_enabled()
 def _check_linear_solve(test, A, b, func, *args, **kwargs):
     # test from zero
@@ -185,15 +183,6 @@ class TestLinearSolvers(unittest.TestCase):
 devices = get_test_devices()
-if not wp.context.runtime.core.is_cutlass_enabled():
-    devices = [d for d in devices if not d.is_cuda]
-    print("Skipping CUDA linear solver tests because CUTLASS is not supported in this build")
-if wp.context.runtime.core.is_debug_enabled():
-    # cutlass-based matmul is *very* slow in debug mode -- skip
-    devices = [d for d in devices if not d.is_cuda]
-    print("Skipping CUDA linear solver tests in debug mode")
 add_function_test(TestLinearSolvers, "test_cg", test_cg, devices=devices)
 add_function_test(TestLinearSolvers, "test_cr", test_cr, devices=devices)
 add_function_test(TestLinearSolvers, "test_bicgstab", test_bicgstab, devices=devices)

warp/tests/test_mat.py CHANGED Viewed

@@ -127,30 +127,6 @@ def test_tpl_constructor_error_incompatible_sizes(test, device):
         wp.launch(kernel, dim=1, inputs=[], device=device)
-def test_tpl_constructor_error_invalid_vector_count(test, device):
-    @wp.kernel
-    def kernel():
-        wp.mat33(wp.vec3(1.0, 2.0, 3.0), wp.vec3(1.0, 2.0, 3.0))
-    with test.assertRaisesRegex(
-        RuntimeError,
-        r"incompatible number of column vectors given \(2\) when constructing a matrix of shape \(3, 3\)$",
-    ):
-        wp.launch(kernel, dim=1, inputs=[], device=device)
-def test_tpl_constructor_error_invalid_vector_shape(test, device):
-    @wp.kernel
-    def kernel():
-        wp.mat22(wp.vec3(1.0, 2.0, 3.0), wp.vec3(4.0, 5.0, 6.0))
-    with test.assertRaisesRegex(
-        RuntimeError,
-        r"incompatible column vector lengths given when constructing a matrix of shape \(2, 2\)$",
-    ):
-        wp.launch(kernel, dim=1, inputs=[], device=device)
 def test_tpl_constructor_error_invalid_arg_count(test, device):
     @wp.kernel
     def kernel():
@@ -234,7 +210,7 @@ def test_quat_constructor(test, device, dtype, register_kernels=False):
         c0 = s[0][0] * R[0]
         c1 = s[0][1] * R[1]
         c2 = s[0][2] * R[2]
-        m_alt = mat44(
+        m_alt = wp.matrix_from_cols(
             vec4(c0[0], c0[1], c0[2], wptype(0.0)),
             vec4(c1[0], c1[1], c1[2], wptype(0.0)),
             vec4(c2[0], c2[1], c2[2], wptype(0.0)),
@@ -1066,6 +1042,124 @@ def test_svd(test, device, dtype, register_kernels=False):
                 assert_np_equal((plusval - minusval) / (2 * dx), m3grads[ii, jj], tol=fdtol)
+def test_svd_2D(test, device, dtype, register_kernels=False):
+    rng = np.random.default_rng(123)
+    tol = {
+        np.float16: 1.0e-3,
+        np.float32: 1.0e-6,
+        np.float64: 1.0e-12,
+    }.get(dtype, 0)
+    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
+    vec2 = wp.types.vector(length=2, dtype=wptype)
+    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
+    def check_mat_svd2(
+        m2: wp.array(dtype=mat22),
+        Uout: wp.array(dtype=mat22),
+        sigmaout: wp.array(dtype=vec2),
+        Vout: wp.array(dtype=mat22),
+        outcomponents: wp.array(dtype=wptype),
+    ):
+        U = mat22()
+        sigma = vec2()
+        V = mat22()
+        wp.svd2(m2[0], U, sigma, V)  # Assuming there's a 2D SVD kernel
+        Uout[0] = U
+        sigmaout[0] = sigma
+        Vout[0] = V
+        # multiply outputs by 2 so we've got something to backpropagate:
+        idx = 0
+        for i in range(2):
+            for j in range(2):
+                outcomponents[idx] = wptype(2) * U[i, j]
+                idx = idx + 1
+        for i in range(2):
+            outcomponents[idx] = wptype(2) * sigma[i]
+            idx = idx + 1
+        for i in range(2):
+            for j in range(2):
+                outcomponents[idx] = wptype(2) * V[i, j]
+                idx = idx + 1
+    kernel = getkernel(check_mat_svd2, suffix=dtype.__name__)
+    output_select_kernel = get_select_kernel(wptype)
+    if register_kernels:
+        return
+    m2 = wp.array(randvals(rng, [1, 2, 2], dtype) + np.eye(2), dtype=mat22, requires_grad=True, device=device)
+    outcomponents = wp.zeros(2 * 2 * 2 + 2, dtype=wptype, requires_grad=True, device=device)
+    Uout = wp.zeros(1, dtype=mat22, requires_grad=True, device=device)
+    sigmaout = wp.zeros(1, dtype=vec2, requires_grad=True, device=device)
+    Vout = wp.zeros(1, dtype=mat22, requires_grad=True, device=device)
+    wp.launch(kernel, dim=1, inputs=[m2], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
+    Uout_np = Uout.numpy()[0].astype(np.float64)
+    sigmaout_np = np.diag(sigmaout.numpy()[0].astype(np.float64))
+    Vout_np = Vout.numpy()[0].astype(np.float64)
+    assert_np_equal(
+        np.matmul(Uout_np, np.matmul(sigmaout_np, Vout_np.T)), m2.numpy()[0].astype(np.float64), tol=30 * tol
+    )
+    if dtype == np.float16:
+        # Skip gradient check for float16 due to rounding errors
+        return
+    # Check gradients:
+    out = wp.zeros(1, dtype=wptype, requires_grad=True, device=device)
+    idx = 0
+    for idx in range(2 * 2 + 2 + 2 * 2):
+        tape = wp.Tape()
+        with tape:
+            wp.launch(kernel, dim=1, inputs=[m2], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
+            wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
+        tape.backward(out)
+        m2grads = 1.0 * tape.gradients[m2].numpy()[0]
+        tape.zero()
+        dx = 0.0001
+        fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2
+        for ii in range(2):
+            for jj in range(2):
+                m2test = 1.0 * m2.numpy()
+                m2test[0, ii, jj] += dx
+                wp.launch(
+                    kernel,
+                    dim=1,
+                    inputs=[wp.array(m2test, dtype=mat22, device=device)],
+                    outputs=[Uout, sigmaout, Vout, outcomponents],
+                    device=device,
+                )
+                wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
+                plusval = out.numpy()[0]
+                m2test = 1.0 * m2.numpy()
+                m2test[0, ii, jj] -= dx
+                wp.launch(
+                    kernel,
+                    dim=1,
+                    inputs=[wp.array(m2test, dtype=mat22, device=device)],
+                    outputs=[Uout, sigmaout, Vout, outcomponents],
+                    device=device,
+                )
+                wp.launch(output_select_kernel, dim=1, inputs=[outcomponents, idx], outputs=[out], device=device)
+                minusval = out.numpy()[0]
+                assert_np_equal((plusval - minusval) / (2 * dx), m2grads[ii, jj], tol=fdtol)
 def test_qr(test, device, dtype, register_kernels=False):
     rng = np.random.default_rng(123)
@@ -1513,13 +1607,12 @@ def test_transform_vector(test, device, dtype, register_kernels=False):
             tape.zero()
-def test_mat_array_type_indexing(test, device, dtype, register_kernels=False):
+def test_matrix_assign_inplace(test, device, dtype, register_kernels=False):
     np_type = np.dtype(dtype)
     wp_type = wp.types.np_dtype_to_warp_type[np_type]
     vec2 = wp.types.vector(length=2, dtype=wp_type)
     mat22 = wp.types.matrix(shape=(2, 2), dtype=wp_type)
-    mat33 = wp.types.matrix(shape=(3, 3), dtype=wp_type)
     def mattest_read_write_store(x: wp.array(dtype=wp_type), a: wp.array(dtype=mat22)):
         tid = wp.tid()
@@ -1536,17 +1629,8 @@ def test_mat_array_type_indexing(test, device, dtype, register_kernels=False):
         a[1, 1] = wp_type(3.0)
         x[i, j] = a
-    def mattest_in_register_overwrite(x: wp.array2d(dtype=mat22), y: wp.array(dtype=vec2)):
-        i, j = wp.tid()
-        a = mat22(wp_type(0.0))
-        a[0] = y[i]
-        a[0, 1] = wp_type(3.0)
-        x[i, j] = a
     kernel_read_write_store = getkernel(mattest_read_write_store, suffix=dtype.__name__)
     kernel_in_register = getkernel(mattest_in_register, suffix=dtype.__name__)
-    kernel_in_register_overwrite = getkernel(mattest_in_register_overwrite, suffix=dtype.__name__)
     if register_kernels:
         return
@@ -1576,19 +1660,6 @@ def test_mat_array_type_indexing(test, device, dtype, register_kernels=False):
     assert_np_equal(x.numpy(), np.array([[[[1.0, 1.0], [0.0, 3.0]]]], dtype=np_type))
     assert_np_equal(y.grad.numpy(), np.array([[1.0, 1.0]], dtype=np_type))
-    tape.reset()
-    x = wp.zeros((1, 1), dtype=mat22, device=device, requires_grad=True)
-    y = wp.ones(1, dtype=vec2, device=device, requires_grad=True)
-    with tape:
-        wp.launch(kernel_in_register_overwrite, dim=(1, 1), inputs=[x, y], device=device)
-    tape.backward(grads={x: wp.ones_like(x, requires_grad=False)})
-    assert_np_equal(x.numpy(), np.array([[[[1.0, 3.0], [0.0, 0.0]]]], dtype=np_type))
-    assert_np_equal(y.grad.numpy(), np.array([[1.0, 0.0]], dtype=np_type))
 # Test matrix constructors using explicit type (float16)
 # note that these tests are specifically not using generics / closure
@@ -1623,10 +1694,61 @@ def test_matrix_constructor_value_func():
     c = mat32d()
     d = mat32d(c, shape=(3, 2))
     e = mat32d(wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0), wp.float64(1.0), wp.float64(2.0))
-    f = mat32d(
-        wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
-        wp.vec3d(wp.float64(1.0), wp.float64(2.0), wp.float64(3.0)),
+@wp.kernel
+def test_matrix_from_vecs():
+    m1 = wp.matrix_from_cols(
+        wp.vec3(1.0, 2.0, 3.0),
+        wp.vec3(4.0, 5.0, 6.0),
+        wp.vec3(7.0, 8.0, 9.0),
+    )
+    wp.expect_eq(m1[0, 0], 1.0)
+    wp.expect_eq(m1[0, 1], 4.0)
+    wp.expect_eq(m1[0, 2], 7.0)
+    wp.expect_eq(m1[1, 0], 2.0)
+    wp.expect_eq(m1[1, 1], 5.0)
+    wp.expect_eq(m1[1, 2], 8.0)
+    wp.expect_eq(m1[2, 0], 3.0)
+    wp.expect_eq(m1[2, 1], 6.0)
+    wp.expect_eq(m1[2, 2], 9.0)
+    m2 = wp.matrix_from_rows(
+        wp.vec3(1.0, 2.0, 3.0),
+        wp.vec3(4.0, 5.0, 6.0),
+        wp.vec3(7.0, 8.0, 9.0),
+    )
+    wp.expect_eq(m2[0, 0], 1.0)
+    wp.expect_eq(m2[0, 1], 2.0)
+    wp.expect_eq(m2[0, 2], 3.0)
+    wp.expect_eq(m2[1, 0], 4.0)
+    wp.expect_eq(m2[1, 1], 5.0)
+    wp.expect_eq(m2[1, 2], 6.0)
+    wp.expect_eq(m2[2, 0], 7.0)
+    wp.expect_eq(m2[2, 1], 8.0)
+    wp.expect_eq(m2[2, 2], 9.0)
+    m3 = wp.matrix_from_cols(
+        wp.vec3(1.0, 2.0, 3.0),
+        wp.vec3(4.0, 5.0, 6.0),
     )
+    wp.expect_eq(m3[0, 0], 1.0)
+    wp.expect_eq(m3[0, 1], 4.0)
+    wp.expect_eq(m3[1, 0], 2.0)
+    wp.expect_eq(m3[1, 1], 5.0)
+    wp.expect_eq(m3[2, 0], 3.0)
+    wp.expect_eq(m3[2, 1], 6.0)
+    m4 = wp.matrix_from_rows(
+        wp.vec3(1.0, 2.0, 3.0),
+        wp.vec3(4.0, 5.0, 6.0),
+    )
+    wp.expect_eq(m4[0, 0], 1.0)
+    wp.expect_eq(m4[0, 1], 2.0)
+    wp.expect_eq(m4[0, 2], 3.0)
+    wp.expect_eq(m4[1, 0], 4.0)
+    wp.expect_eq(m4[1, 1], 5.0)
+    wp.expect_eq(m4[1, 2], 6.0)
 # Same as above but with a default (float/int) type
@@ -1743,15 +1865,20 @@ def test_matrix_len(test, device):
 @wp.kernel
 def matrix_augassign_kernel(
-    a: wp.array(dtype=wp.mat22), b: wp.array(dtype=wp.mat22), c: wp.array(dtype=wp.mat22), d: wp.array(dtype=wp.mat22)
+    a: wp.array(dtype=wp.mat22),
+    b: wp.array(dtype=wp.mat22),
+    x: wp.array(dtype=wp.vec2),
+    c: wp.array(dtype=wp.mat22),
+    d: wp.array(dtype=wp.mat22),
+    y: wp.array(dtype=wp.vec2),
 ):
     i = wp.tid()
     m1 = wp.mat22()
     m2 = b[i]
+    v2 = x[i]
-    m1[0, 0] += m2[0, 0]
-    m1[0, 1] += m2[0, 1]
+    m1[0] += v2
     m1[1, 0] += m2[1, 0]
     m1[1, 1] += m2[1, 1]
@@ -1759,9 +1886,9 @@ def matrix_augassign_kernel(
     m3 = wp.mat22()
     m4 = d[i]
+    v4 = y[i]
-    m3[0, 0] -= m4[0, 0]
-    m3[0, 1] -= m4[0, 1]
+    m3[0] -= v4
     m3[1, 0] -= m4[1, 0]
     m3[1, 1] -= m4[1, 1]
@@ -1769,27 +1896,61 @@ def matrix_augassign_kernel(
 def test_matrix_augassign(test, device):
-    N = 3
+    N = 1
-    a = wp.zeros(N, dtype=wp.mat22, requires_grad=True)
-    b = wp.ones(N, dtype=wp.mat22, requires_grad=True)
+    a = wp.zeros(N, dtype=wp.mat22, requires_grad=True, device=device)
+    b = wp.ones(N, dtype=wp.mat22, requires_grad=True, device=device)
+    x = wp.ones(N, dtype=wp.vec2, requires_grad=True, device=device)
-    c = wp.zeros(N, dtype=wp.mat22, requires_grad=True)
-    d = wp.ones(N, dtype=wp.mat22, requires_grad=True)
+    c = wp.zeros(N, dtype=wp.mat22, requires_grad=True, device=device)
+    d = wp.ones(N, dtype=wp.mat22, requires_grad=True, device=device)
+    y = wp.ones(N, dtype=wp.vec2, requires_grad=True, device=device)
     tape = wp.Tape()
     with tape:
-        wp.launch(matrix_augassign_kernel, N, inputs=[a, b, c, d])
+        wp.launch(matrix_augassign_kernel, N, inputs=[a, b, x, c, d, y], device=device)
     tape.backward(grads={a: wp.ones_like(a), c: wp.ones_like(c)})
     assert_np_equal(a.numpy(), wp.ones_like(a).numpy())
     assert_np_equal(a.grad.numpy(), wp.ones_like(a).numpy())
-    assert_np_equal(b.grad.numpy(), wp.ones_like(a).numpy())
+    assert_np_equal(b.grad.numpy(), np.array([[[0, 0], [1, 1]]], dtype=float))
+    assert_np_equal(x.grad.numpy(), np.array([[1, 1]], dtype=float))
     assert_np_equal(c.numpy(), -wp.ones_like(c).numpy())
     assert_np_equal(c.grad.numpy(), wp.ones_like(c).numpy())
-    assert_np_equal(d.grad.numpy(), -wp.ones_like(d).numpy())
+    assert_np_equal(d.grad.numpy(), np.array([[[0, 0], [-1, -1]]], dtype=float))
+    assert_np_equal(y.grad.numpy(), np.array([[-1, -1]], dtype=float))
+def test_matrix_assign_copy(test, device):
+    saved_enable_vector_component_overwrites_setting = wp.config.enable_vector_component_overwrites
+    try:
+        wp.config.enable_vector_component_overwrites = True
+        @wp.kernel
+        def mat_in_register_overwrite(x: wp.array2d(dtype=wp.mat22), y: wp.array(dtype=wp.vec2)):
+            i, j = wp.tid()
+            a = wp.mat22()
+            a[0] = y[i]
+            a[0, 1] = 3.0
+            x[i, j] = a
+        x = wp.zeros((1, 1), dtype=wp.mat22, device=device, requires_grad=True)
+        y = wp.ones(1, dtype=wp.vec2, device=device, requires_grad=True)
+        tape = wp.Tape()
+        with tape:
+            wp.launch(mat_in_register_overwrite, dim=(1, 1), inputs=[x, y], device=device)
+        tape.backward(grads={x: wp.ones_like(x, requires_grad=False)})
+        assert_np_equal(x.numpy(), np.array([[[[1.0, 3.0], [0.0, 0.0]]]], dtype=float))
+        assert_np_equal(y.grad.numpy(), np.array([[1.0, 0.0]], dtype=float))
+    finally:
+        wp.config.enable_vector_component_overwrites = saved_enable_vector_component_overwrites_setting
 devices = get_test_devices()
@@ -1814,6 +1975,7 @@ add_kernel_test(TestMat, test_constructors_explicit_precision, dim=1, devices=de
 add_kernel_test(TestMat, test_constructors_default_precision, dim=1, devices=devices)
 add_kernel_test(TestMat, test_constructors_constant_shape, dim=1, devices=devices)
 add_kernel_test(TestMat, test_matrix_constructor_value_func, dim=1, devices=devices)
+add_kernel_test(TestMat, test_matrix_from_vecs, dim=1, devices=devices)
 mat103 = wp.types.matrix(shape=(10, 3), dtype=float)
 add_kernel_test(
@@ -1878,18 +2040,6 @@ add_function_test(
     test_tpl_constructor_error_incompatible_sizes,
     devices=devices,
 )
-add_function_test(
-    TestMat,
-    "test_tpl_constructor_error_invalid_vector_count",
-    test_tpl_constructor_error_invalid_vector_count,
-    devices=devices,
-)
-add_function_test(
-    TestMat,
-    "test_tpl_constructor_error_invalid_vector_shape",
-    test_tpl_constructor_error_invalid_vector_shape,
-    devices=devices,
-)
 add_function_test(
     TestMat,
     "test_tpl_constructor_error_invalid_arg_count",
@@ -1908,6 +2058,9 @@ for dtype in np_float_types:
         TestMat, f"test_inverse_{dtype.__name__}", test_inverse, devices=devices, dtype=dtype
     )
     add_function_test_register_kernel(TestMat, f"test_svd_{dtype.__name__}", test_svd, devices=devices, dtype=dtype)
+    add_function_test_register_kernel(
+        TestMat, f"test_svd_2D{dtype.__name__}", test_svd_2D, devices=devices, dtype=dtype
+    )
     add_function_test_register_kernel(TestMat, f"test_qr_{dtype.__name__}", test_qr, devices=devices, dtype=dtype)
     add_function_test_register_kernel(TestMat, f"test_eig_{dtype.__name__}", test_eig, devices=devices, dtype=dtype)
     add_function_test_register_kernel(
@@ -1922,13 +2075,14 @@ for dtype in np_float_types:
     add_function_test_register_kernel(TestMat, f"test_skew_{dtype.__name__}", test_skew, devices=devices, dtype=dtype)
     add_function_test_register_kernel(
         TestMat,
-        f"test_mat_array_type_indexing_{dtype.__name__}",
-        test_mat_array_type_indexing,
+        f"test_matrix_assign_inplace_{dtype.__name__}",
+        test_matrix_assign_inplace,
         devices=devices,
         dtype=dtype,
     )
 add_function_test(TestMat, "test_matrix_len", test_matrix_len, devices=devices)
 add_function_test(TestMat, "test_matrix_augassign", test_matrix_augassign, devices=devices)
+add_function_test(TestMat, "test_matrix_assign_copy", test_matrix_assign_copy, devices=devices)
 if __name__ == "__main__":
     wp.clear_kernel_cache()