warp-lang 1.7.2rc1-py3-none-manylinux_2_34_aarch64.whl → 1.8.0-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)
  1. warp/__init__.py +3 -1
  2. warp/__init__.pyi +3489 -1
  3. warp/autograd.py +45 -122
  4. warp/bin/warp-clang.so +0 -0
  5. warp/bin/warp.so +0 -0
  6. warp/build.py +241 -252
  7. warp/build_dll.py +125 -26
  8. warp/builtins.py +1907 -384
  9. warp/codegen.py +257 -101
  10. warp/config.py +12 -1
  11. warp/constants.py +1 -1
  12. warp/context.py +657 -223
  13. warp/dlpack.py +1 -1
  14. warp/examples/benchmarks/benchmark_cloth.py +2 -2
  15. warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
  16. warp/examples/core/example_sample_mesh.py +1 -1
  17. warp/examples/core/example_spin_lock.py +93 -0
  18. warp/examples/core/example_work_queue.py +118 -0
  19. warp/examples/fem/example_adaptive_grid.py +5 -5
  20. warp/examples/fem/example_apic_fluid.py +1 -1
  21. warp/examples/fem/example_burgers.py +1 -1
  22. warp/examples/fem/example_convection_diffusion.py +9 -6
  23. warp/examples/fem/example_darcy_ls_optimization.py +489 -0
  24. warp/examples/fem/example_deformed_geometry.py +1 -1
  25. warp/examples/fem/example_diffusion.py +2 -2
  26. warp/examples/fem/example_diffusion_3d.py +1 -1
  27. warp/examples/fem/example_distortion_energy.py +1 -1
  28. warp/examples/fem/example_elastic_shape_optimization.py +387 -0
  29. warp/examples/fem/example_magnetostatics.py +5 -3
  30. warp/examples/fem/example_mixed_elasticity.py +5 -3
  31. warp/examples/fem/example_navier_stokes.py +11 -9
  32. warp/examples/fem/example_nonconforming_contact.py +5 -3
  33. warp/examples/fem/example_streamlines.py +8 -3
  34. warp/examples/fem/utils.py +9 -8
  35. warp/examples/interop/example_jax_ffi_callback.py +2 -2
  36. warp/examples/optim/example_drone.py +1 -1
  37. warp/examples/sim/example_cloth.py +1 -1
  38. warp/examples/sim/example_cloth_self_contact.py +48 -54
  39. warp/examples/tile/example_tile_block_cholesky.py +502 -0
  40. warp/examples/tile/example_tile_cholesky.py +2 -1
  41. warp/examples/tile/example_tile_convolution.py +1 -1
  42. warp/examples/tile/example_tile_filtering.py +1 -1
  43. warp/examples/tile/example_tile_matmul.py +1 -1
  44. warp/examples/tile/example_tile_mlp.py +2 -0
  45. warp/fabric.py +7 -7
  46. warp/fem/__init__.py +5 -0
  47. warp/fem/adaptivity.py +1 -1
  48. warp/fem/cache.py +152 -63
  49. warp/fem/dirichlet.py +2 -2
  50. warp/fem/domain.py +136 -6
  51. warp/fem/field/field.py +141 -99
  52. warp/fem/field/nodal_field.py +85 -39
  53. warp/fem/field/virtual.py +97 -52
  54. warp/fem/geometry/adaptive_nanogrid.py +91 -86
  55. warp/fem/geometry/closest_point.py +13 -0
  56. warp/fem/geometry/deformed_geometry.py +102 -40
  57. warp/fem/geometry/element.py +56 -2
  58. warp/fem/geometry/geometry.py +323 -22
  59. warp/fem/geometry/grid_2d.py +157 -62
  60. warp/fem/geometry/grid_3d.py +116 -20
  61. warp/fem/geometry/hexmesh.py +86 -20
  62. warp/fem/geometry/nanogrid.py +166 -86
  63. warp/fem/geometry/partition.py +59 -25
  64. warp/fem/geometry/quadmesh.py +86 -135
  65. warp/fem/geometry/tetmesh.py +47 -119
  66. warp/fem/geometry/trimesh.py +77 -270
  67. warp/fem/integrate.py +107 -52
  68. warp/fem/linalg.py +25 -58
  69. warp/fem/operator.py +124 -27
  70. warp/fem/quadrature/pic_quadrature.py +36 -14
  71. warp/fem/quadrature/quadrature.py +40 -16
  72. warp/fem/space/__init__.py +1 -1
  73. warp/fem/space/basis_function_space.py +66 -46
  74. warp/fem/space/basis_space.py +17 -4
  75. warp/fem/space/dof_mapper.py +1 -1
  76. warp/fem/space/function_space.py +2 -2
  77. warp/fem/space/grid_2d_function_space.py +4 -1
  78. warp/fem/space/hexmesh_function_space.py +4 -2
  79. warp/fem/space/nanogrid_function_space.py +3 -1
  80. warp/fem/space/partition.py +11 -2
  81. warp/fem/space/quadmesh_function_space.py +4 -1
  82. warp/fem/space/restriction.py +5 -2
  83. warp/fem/space/shape/__init__.py +10 -8
  84. warp/fem/space/tetmesh_function_space.py +4 -1
  85. warp/fem/space/topology.py +52 -21
  86. warp/fem/space/trimesh_function_space.py +4 -1
  87. warp/fem/utils.py +53 -8
  88. warp/jax.py +1 -2
  89. warp/jax_experimental/ffi.py +12 -17
  90. warp/jax_experimental/xla_ffi.py +37 -24
  91. warp/math.py +171 -1
  92. warp/native/array.h +99 -0
  93. warp/native/builtin.h +174 -31
  94. warp/native/coloring.cpp +1 -1
  95. warp/native/exports.h +118 -63
  96. warp/native/intersect.h +3 -3
  97. warp/native/mat.h +5 -10
  98. warp/native/mathdx.cpp +11 -5
  99. warp/native/matnn.h +1 -123
  100. warp/native/quat.h +28 -4
  101. warp/native/sparse.cpp +121 -258
  102. warp/native/sparse.cu +181 -274
  103. warp/native/spatial.h +305 -17
  104. warp/native/tile.h +583 -72
  105. warp/native/tile_radix_sort.h +1108 -0
  106. warp/native/tile_reduce.h +237 -2
  107. warp/native/tile_scan.h +240 -0
  108. warp/native/tuple.h +189 -0
  109. warp/native/vec.h +6 -16
  110. warp/native/warp.cpp +36 -4
  111. warp/native/warp.cu +574 -51
  112. warp/native/warp.h +47 -74
  113. warp/optim/linear.py +5 -1
  114. warp/paddle.py +7 -8
  115. warp/py.typed +0 -0
  116. warp/render/render_opengl.py +58 -29
  117. warp/render/render_usd.py +124 -61
  118. warp/sim/__init__.py +9 -0
  119. warp/sim/collide.py +252 -78
  120. warp/sim/graph_coloring.py +8 -1
  121. warp/sim/import_mjcf.py +4 -3
  122. warp/sim/import_usd.py +11 -7
  123. warp/sim/integrator.py +5 -2
  124. warp/sim/integrator_euler.py +1 -1
  125. warp/sim/integrator_featherstone.py +1 -1
  126. warp/sim/integrator_vbd.py +751 -320
  127. warp/sim/integrator_xpbd.py +1 -1
  128. warp/sim/model.py +265 -260
  129. warp/sim/utils.py +10 -7
  130. warp/sparse.py +303 -166
  131. warp/tape.py +52 -51
  132. warp/tests/cuda/test_conditional_captures.py +1046 -0
  133. warp/tests/cuda/test_streams.py +1 -1
  134. warp/tests/geometry/test_volume.py +2 -2
  135. warp/tests/interop/test_dlpack.py +9 -9
  136. warp/tests/interop/test_jax.py +0 -1
  137. warp/tests/run_coverage_serial.py +1 -1
  138. warp/tests/sim/disabled_kinematics.py +2 -2
  139. warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
  140. warp/tests/sim/test_collision.py +159 -51
  141. warp/tests/sim/test_coloring.py +15 -1
  142. warp/tests/test_array.py +254 -2
  143. warp/tests/test_array_reduce.py +2 -2
  144. warp/tests/test_atomic_cas.py +299 -0
  145. warp/tests/test_codegen.py +142 -19
  146. warp/tests/test_conditional.py +47 -1
  147. warp/tests/test_ctypes.py +0 -20
  148. warp/tests/test_devices.py +8 -0
  149. warp/tests/test_fabricarray.py +4 -2
  150. warp/tests/test_fem.py +58 -25
  151. warp/tests/test_func.py +42 -1
  152. warp/tests/test_grad.py +1 -1
  153. warp/tests/test_lerp.py +1 -3
  154. warp/tests/test_map.py +481 -0
  155. warp/tests/test_mat.py +1 -24
  156. warp/tests/test_quat.py +6 -15
  157. warp/tests/test_rounding.py +10 -38
  158. warp/tests/test_runlength_encode.py +7 -7
  159. warp/tests/test_smoothstep.py +1 -1
  160. warp/tests/test_sparse.py +51 -2
  161. warp/tests/test_spatial.py +507 -1
  162. warp/tests/test_struct.py +2 -2
  163. warp/tests/test_tuple.py +265 -0
  164. warp/tests/test_types.py +2 -2
  165. warp/tests/test_utils.py +24 -18
  166. warp/tests/tile/test_tile.py +420 -1
  167. warp/tests/tile/test_tile_mathdx.py +518 -14
  168. warp/tests/tile/test_tile_reduce.py +213 -0
  169. warp/tests/tile/test_tile_shared_memory.py +130 -1
  170. warp/tests/tile/test_tile_sort.py +117 -0
  171. warp/tests/unittest_suites.py +4 -6
  172. warp/types.py +462 -308
  173. warp/utils.py +647 -86
  174. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
  175. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/RECORD +178 -166
  176. warp/stubs.py +0 -3381
  177. warp/tests/sim/test_xpbd.py +0 -399
  178. warp/tests/test_mlp.py +0 -282
  179. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
  180. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
  181. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/utils.py CHANGED
@@ -21,7 +21,8 @@ import os
 import sys
 import time
 import warnings
-from typing import Any, Callable, Dict, List, Optional, Union
+from types import ModuleType
+from typing import Any, Callable

 import numpy as np

@@ -29,6 +30,7 @@ import warp as wp
 import warp.context
 import warp.types
 from warp.context import Devicelike
+from warp.types import Array, DType, type_repr, types_equal

 warnings_seen = set()

@@ -52,7 +54,7 @@ def warp_showwarning(message, category, filename, lineno, file=None, line=None):

         if line:
             line = line.strip()
-            s += "  %s\n" % line
+            s += f"  {line}\n"
     else:
         # simple warning
         s = f"Warp {category.__name__}: {message}\n"
@@ -96,14 +98,31 @@ def quat_between_vectors(a: wp.vec3, b: wp.vec3) -> wp.quat:


 def array_scan(in_array, out_array, inclusive=True):
+    """Perform a scan (prefix sum) operation on an array.
+
+    This function computes the inclusive or exclusive scan of the input array and stores the result in the output array.
+    The scan operation computes a running sum of elements in the array.
+
+    Args:
+        in_array (wp.array): Input array to scan. Must be of type int32 or float32.
+        out_array (wp.array): Output array to store scan results. Must match input array type and size.
+        inclusive (bool, optional): If True, performs an inclusive scan (includes current element in sum).
+            If False, performs an exclusive scan (excludes current element). Defaults to True.
+
+    Raises:
+        RuntimeError: If array storage devices don't match, if storage size is insufficient, or if data types are unsupported.
+    """
+
     if in_array.device != out_array.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError(f"In and out array storage devices do not match ({in_array.device} vs {out_array.device})")

     if in_array.size != out_array.size:
-        raise RuntimeError("Array storage sizes do not match")
+        raise RuntimeError(f"In and out array storage sizes do not match ({in_array.size} vs {out_array.size})")

-    if in_array.dtype != out_array.dtype:
-        raise RuntimeError("Array data types do not match")
+    if not types_equal(in_array.dtype, out_array.dtype):
+        raise RuntimeError(
+            f"In and out array data types do not match ({type_repr(in_array.dtype)} vs {type_repr(out_array.dtype)})"
+        )

     if in_array.size == 0:
         return
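
The docstring added above implies usage along these lines; a minimal sketch (names and values are illustrative, and it assumes the runtime has been initialized with wp.init()):

    import warp as wp
    import warp.utils

    wp.init()

    values = wp.array([1, 2, 3, 4], dtype=wp.int32)
    result = wp.zeros(4, dtype=wp.int32)

    # inclusive scan keeps the current element: [1, 3, 6, 10]
    wp.utils.array_scan(values, result, inclusive=True)

    # exclusive scan shifts it out: [0, 1, 3, 6]
    wp.utils.array_scan(values, result, inclusive=False)
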
@@ -116,25 +135,39 @@ def array_scan(in_array, out_array, inclusive=True):
         elif in_array.dtype == wp.float32:
             runtime.core.array_scan_float_host(in_array.ptr, out_array.ptr, in_array.size, inclusive)
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(in_array.dtype)}")
     elif in_array.device.is_cuda:
         if in_array.dtype == wp.int32:
             runtime.core.array_scan_int_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
         elif in_array.dtype == wp.float32:
             runtime.core.array_scan_float_device(in_array.ptr, out_array.ptr, in_array.size, inclusive)
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(in_array.dtype)}")


 def radix_sort_pairs(keys, values, count: int):
+    """Sort key-value pairs using radix sort.
+
+    This function sorts pairs of arrays based on the keys array, maintaining the key-value
+    relationship. The sort is stable and operates in linear time.
+    The `keys` and `values` arrays must be large enough to accommodate 2*`count` elements.
+
+    Args:
+        keys (wp.array): Array of keys to sort. Must be of type int32, float32, or int64.
+        values (wp.array): Array of values to sort along with keys. Must be of type int32.
+        count (int): Number of elements to sort.
+
+    Raises:
+        RuntimeError: If array storage devices don't match, if storage size is insufficient, or if data types are unsupported.
+    """
     if keys.device != values.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError(f"Keys and values array storage devices do not match ({keys.device} vs {values.device})")

     if count == 0:
         return

     if keys.size < 2 * count or values.size < 2 * count:
-        raise RuntimeError("Array storage must be large enough to contain 2*count elements")
+        raise RuntimeError("Keys and values array storage must be large enough to contain 2*count elements")

     from warp.context import runtime

@@ -146,7 +179,9 @@ def radix_sort_pairs(keys, values, count: int):
         elif keys.dtype == wp.int64 and values.dtype == wp.int32:
             runtime.core.radix_sort_pairs_int64_host(keys.ptr, values.ptr, count)
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(
+                f"Unsupported keys and values data types: {type_repr(keys.dtype)}, {type_repr(values.dtype)}"
+            )
     elif keys.device.is_cuda:
         if keys.dtype == wp.int32 and values.dtype == wp.int32:
             runtime.core.radix_sort_pairs_int_device(keys.ptr, values.ptr, count)
@@ -155,7 +190,9 @@ def radix_sort_pairs(keys, values, count: int):
         elif keys.dtype == wp.int64 and values.dtype == wp.int32:
             runtime.core.radix_sort_pairs_int64_device(keys.ptr, values.ptr, count)
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(
+                f"Unsupported keys and values data types: {type_repr(keys.dtype)}, {type_repr(values.dtype)}"
+            )


 def segmented_sort_pairs(
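
The 2*count storage requirement documented above means callers allocate double-length buffers, with the second half used as scratch space; reading the sorted result back from the first count entries is how the sketch below assumes it works:

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    n = 4
    # capacity 2*n: the second half is scratch space for the sort
    keys = wp.array(np.array([3, 1, 2, 0] + [0] * n, dtype=np.int32))
    values = wp.array(np.array([0, 1, 2, 3] + [0] * n, dtype=np.int32))

    wp.utils.radix_sort_pairs(keys, values, count=n)
    print(keys.numpy()[:n])    # [0 1 2 3]
    print(values.numpy()[:n])  # [3 1 2 0]
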
@@ -169,6 +206,7 @@ def segmented_sort_pairs(

    This function performs a segmented sort of key-value pairs, where the sorting is done independently within each segment.
    The segments are defined by their start and optionally end indices.
+    The `keys` and `values` arrays must be large enough to accommodate 2*`count` elements.

    Args:
        keys: Array of keys to sort. Must be of type int32 or float32.
@@ -187,7 +225,7 @@ def segmented_sort_pairs(
            if segment_start_indices is not of type int32, or if data types are unsupported.
    """
    if keys.device != values.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError(f"Array storage devices do not match ({keys.device} vs {values.device})")

    if count == 0:
        return
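
A hedged sketch of the segmented variant; the full parameter list is elided in this diff, so passing the segment start indices as the fourth positional argument below is an assumption based on the docstring:

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    n = 6
    # two segments, [5, 2, 3] and [4, 0, 1], plus 2*n capacity for scratch space
    keys = wp.array(np.array([5, 2, 3, 4, 0, 1] + [0] * n, dtype=np.int32))
    values = wp.array(np.array([0, 1, 2, 3, 4, 5] + [0] * n, dtype=np.int32))
    segment_starts = wp.array(np.array([0, 3], dtype=np.int32))

    wp.utils.segmented_sort_pairs(keys, values, n, segment_starts)
    print(keys.numpy()[:n])  # each segment sorted independently: [2 3 5 0 1 4]
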
@@ -219,39 +257,80 @@ def segmented_sort_pairs(
     if keys.device.is_cpu:
         if keys.dtype == wp.int32 and values.dtype == wp.int32:
             runtime.core.segmented_sort_pairs_int_host(
-                keys.ptr, values.ptr, count, segment_start_indices_ptr, segment_end_indices_ptr, num_segments
+                keys.ptr,
+                values.ptr,
+                count,
+                segment_start_indices_ptr,
+                segment_end_indices_ptr,
+                num_segments,
             )
         elif keys.dtype == wp.float32 and values.dtype == wp.int32:
             runtime.core.segmented_sort_pairs_float_host(
-                keys.ptr, values.ptr, count, segment_start_indices_ptr, segment_end_indices_ptr, num_segments
+                keys.ptr,
+                values.ptr,
+                count,
+                segment_start_indices_ptr,
+                segment_end_indices_ptr,
+                num_segments,
             )
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(keys.dtype)}")
     elif keys.device.is_cuda:
         if keys.dtype == wp.int32 and values.dtype == wp.int32:
             runtime.core.segmented_sort_pairs_int_device(
-                keys.ptr, values.ptr, count, segment_start_indices_ptr, segment_end_indices_ptr, num_segments
+                keys.ptr,
+                values.ptr,
+                count,
+                segment_start_indices_ptr,
+                segment_end_indices_ptr,
+                num_segments,
            )
         elif keys.dtype == wp.float32 and values.dtype == wp.int32:
             runtime.core.segmented_sort_pairs_float_device(
-                keys.ptr, values.ptr, count, segment_start_indices_ptr, segment_end_indices_ptr, num_segments
+                keys.ptr,
+                values.ptr,
+                count,
+                segment_start_indices_ptr,
+                segment_end_indices_ptr,
+                num_segments,
             )
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(keys.dtype)}")


 def runlength_encode(values, run_values, run_lengths, run_count=None, value_count=None):
+    """Perform run-length encoding on an array.
+
+    This function compresses an array by replacing consecutive identical values with a single value
+    and its count. For example, [1,1,1,2,2,3] becomes values=[1,2,3] and lengths=[3,2,1].
+
+    Args:
+        values (wp.array): Input array to encode. Must be of type int32.
+        run_values (wp.array): Output array to store unique values. Must be at least value_count in size.
+        run_lengths (wp.array): Output array to store run lengths. Must be at least value_count in size.
+        run_count (wp.array, optional): Optional output array to store the number of runs.
+            If None, returns the count as an integer.
+        value_count (int, optional): Number of values to process. If None, processes entire array.
+
+    Returns:
+        int or wp.array: Number of runs if run_count is None, otherwise returns run_count array.
+
+    Raises:
+        RuntimeError: If array storage devices don't match, if storage size is insufficient, or if data types are unsupported.
+    """
     if run_values.device != values.device or run_lengths.device != values.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError("run_values, run_lengths and values storage devices do not match")

     if value_count is None:
         value_count = values.size

     if run_values.size < value_count or run_lengths.size < value_count:
-        raise RuntimeError("Output array storage sizes must be at least equal to value_count")
+        raise RuntimeError(f"Output array storage sizes must be at least equal to value_count ({value_count})")

-    if values.dtype != run_values.dtype:
-        raise RuntimeError("values and run_values data types do not match")
+    if not types_equal(values.dtype, run_values.dtype):
+        raise RuntimeError(
+            f"values and run_values data types do not match ({type_repr(values.dtype)} vs {type_repr(run_values.dtype)})"
+        )

     if run_lengths.dtype != wp.int32:
         raise RuntimeError("run_lengths array must be of type int32")
@@ -270,7 +349,7 @@ def runlength_encode(values, run_values, run_lengths, run_count=None, value_coun
             raise RuntimeError("run_count array must be of type int32")
         if value_count == 0:
             run_count.zero_()
-            return 0
+            return run_count
         host_return = False

     from warp.context import runtime
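
The [1,1,1,2,2,3] example from the new docstring, written out; a minimal sketch (with run_count left as None, the run count comes back as a host integer):

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    values = wp.array(np.array([1, 1, 1, 2, 2, 3], dtype=np.int32))
    run_values = wp.empty(values.size, dtype=wp.int32)
    run_lengths = wp.empty(values.size, dtype=wp.int32)

    num_runs = wp.utils.runlength_encode(values, run_values, run_lengths)
    print(num_runs)                        # 3
    print(run_values.numpy()[:num_runs])   # [1 2 3]
    print(run_lengths.numpy()[:num_runs])  # [3 2 1]
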
@@ -281,20 +360,39 @@ def runlength_encode(values, run_values, run_lengths, run_count=None, value_coun
                 values.ptr, run_values.ptr, run_lengths.ptr, run_count.ptr, value_count
             )
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(values.dtype)}")
     elif values.device.is_cuda:
         if values.dtype == wp.int32:
             runtime.core.runlength_encode_int_device(
                 values.ptr, run_values.ptr, run_lengths.ptr, run_count.ptr, value_count
             )
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(values.dtype)}")

     if host_return:
         return int(run_count.numpy()[0])
+    return run_count


 def array_sum(values, out=None, value_count=None, axis=None):
+    """Compute the sum of array elements.
+
+    This function computes the sum of array elements, optionally along a specified axis.
+    The operation can be performed on the entire array or along a specific dimension.
+
+    Args:
+        values (wp.array): Input array to sum. Must be of type float32 or float64.
+        out (wp.array, optional): Output array to store results. If None, a new array is created.
+        value_count (int, optional): Number of elements to process. If None, processes entire array.
+        axis (int, optional): Axis along which to compute sum. If None, computes sum of all elements.
+
+    Returns:
+        wp.array or float: The sum result. Returns a float if axis is None and out is None,
+            otherwise returns the output array.
+
+    Raises:
+        RuntimeError: If output array storage device or data type is incompatible with input array.
+    """
     if value_count is None:
         if axis is None:
             value_count = values.size
@@ -310,7 +408,7 @@ def array_sum(values, out=None, value_count=None, axis=None):

     output_shape = tuple(output_dim(ax, dim) for ax, dim in enumerate(values.shape))

-    type_length = wp.types.type_length(values.dtype)
+    type_size = wp.types.type_size(values.dtype)
     scalar_type = wp.types.type_scalar_type(values.dtype)

     # User can provide a device output array for storing the number of runs
@@ -341,48 +439,67 @@ def array_sum(values, out=None, value_count=None, axis=None):
         elif scalar_type == wp.float64:
             native_func = runtime.core.array_sum_double_host
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(values.dtype)}")
     elif values.device.is_cuda:
         if scalar_type == wp.float32:
             native_func = runtime.core.array_sum_float_device
         elif scalar_type == wp.float64:
             native_func = runtime.core.array_sum_double_device
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(values.dtype)}")

     if axis is None:
         stride = wp.types.type_size_in_bytes(values.dtype)
-        native_func(values.ptr, out.ptr, value_count, stride, type_length)
+        native_func(values.ptr, out.ptr, value_count, stride, type_size)

         if host_return:
             return out.numpy()[0]
-    else:
-        stride = values.strides[axis]
-        for idx in np.ndindex(output_shape):
-            out_offset = sum(i * s for i, s in zip(idx, out.strides))
-            val_offset = sum(i * s for i, s in zip(idx, values.strides))
-
-            native_func(
-                values.ptr + val_offset,
-                out.ptr + out_offset,
-                value_count,
-                stride,
-                type_length,
-            )
+        return out

-    if host_return:
-        return out
+    stride = values.strides[axis]
+    for idx in np.ndindex(output_shape):
+        out_offset = sum(i * s for i, s in zip(idx, out.strides))
+        val_offset = sum(i * s for i, s in zip(idx, values.strides))
+
+        native_func(
+            values.ptr + val_offset,
+            out.ptr + out_offset,
+            value_count,
+            stride,
+            type_size,
+        )
+
+    return out


 def array_inner(a, b, out=None, count=None, axis=None):
+    """Compute the inner product of two arrays.
+
+    This function computes the dot product between two arrays, optionally along a specified axis.
+    The operation can be performed on the entire arrays or along a specific dimension.
+
+    Args:
+        a (wp.array): First input array.
+        b (wp.array): Second input array. Must match shape and type of a.
+        out (wp.array, optional): Output array to store results. If None, a new array is created.
+        count (int, optional): Number of elements to process. If None, processes entire arrays.
+        axis (int, optional): Axis along which to compute inner product. If None, computes on flattened arrays.
+
+    Returns:
+        wp.array or float: The inner product result. Returns a float if axis is None and out is None,
+            otherwise returns the output array.
+
+    Raises:
+        RuntimeError: If array storage devices, sizes, or data types are incompatible.
+    """
     if a.size != b.size:
-        raise RuntimeError("Array storage sizes do not match")
+        raise RuntimeError(f"A and b array storage sizes do not match ({a.size} vs {b.size})")

     if a.device != b.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError(f"A and b array storage devices do not match ({a.device} vs {b.device})")

-    if a.dtype != b.dtype:
-        raise RuntimeError("Array data types do not match")
+    if not types_equal(a.dtype, b.dtype):
+        raise RuntimeError(f"A and b array data types do not match ({type_repr(a.dtype)} vs {type_repr(b.dtype)})")

     if count is None:
         if axis is None:
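
Note the control-flow change in this hunk: the axis branch now returns the output array unconditionally instead of only when host_return is set. A minimal sketch (judging from output_dim above, the reduced axis is kept with size 1):

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    values = wp.array(np.ones((3, 4), dtype=np.float32))

    total = wp.utils.array_sum(values)         # sum of all elements
    rows = wp.utils.array_sum(values, axis=1)  # per-row sums, shape (3, 1)
    print(total)                 # 12.0
    print(rows.numpy().ravel())  # [4. 4. 4.]
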
@@ -399,7 +516,7 @@ def array_inner(a, b, out=None, count=None, axis=None):

     output_shape = tuple(output_dim(ax, dim) for ax, dim in enumerate(a.shape))

-    type_length = wp.types.type_length(a.dtype)
+    type_size = wp.types.type_size(a.dtype)
     scalar_type = wp.types.type_scalar_type(a.dtype)

     # User can provide a device output array for storing the number of runs
@@ -430,43 +547,43 @@ def array_inner(a, b, out=None, count=None, axis=None):
         elif scalar_type == wp.float64:
             native_func = runtime.core.array_inner_double_host
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(a.dtype)}")
     elif a.device.is_cuda:
         if scalar_type == wp.float32:
             native_func = runtime.core.array_inner_float_device
         elif scalar_type == wp.float64:
             native_func = runtime.core.array_inner_double_device
         else:
-            raise RuntimeError("Unsupported data type")
+            raise RuntimeError(f"Unsupported data type: {type_repr(a.dtype)}")

     if axis is None:
         stride_a = wp.types.type_size_in_bytes(a.dtype)
         stride_b = wp.types.type_size_in_bytes(b.dtype)
-        native_func(a.ptr, b.ptr, out.ptr, count, stride_a, stride_b, type_length)
+        native_func(a.ptr, b.ptr, out.ptr, count, stride_a, stride_b, type_size)

         if host_return:
             return out.numpy()[0]
-    else:
-        stride_a = a.strides[axis]
-        stride_b = b.strides[axis]
-
-        for idx in np.ndindex(output_shape):
-            out_offset = sum(i * s for i, s in zip(idx, out.strides))
-            a_offset = sum(i * s for i, s in zip(idx, a.strides))
-            b_offset = sum(i * s for i, s in zip(idx, b.strides))
-
-            native_func(
-                a.ptr + a_offset,
-                b.ptr + b_offset,
-                out.ptr + out_offset,
-                count,
-                stride_a,
-                stride_b,
-                type_length,
-            )
+        return out

-    if host_return:
-        return out
+    stride_a = a.strides[axis]
+    stride_b = b.strides[axis]
+
+    for idx in np.ndindex(output_shape):
+        out_offset = sum(i * s for i, s in zip(idx, out.strides))
+        a_offset = sum(i * s for i, s in zip(idx, a.strides))
+        b_offset = sum(i * s for i, s in zip(idx, b.strides))
+
+        native_func(
+            a.ptr + a_offset,
+            b.ptr + b_offset,
+            out.ptr + out_offset,
+            count,
+            stride_a,
+            stride_b,
+            type_size,
+        )
+
+    return out


 @wp.kernel
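
array_inner follows the same pattern as array_sum, including the new unconditional return of out. A minimal sketch:

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    a = wp.array(np.array([1.0, 2.0, 3.0], dtype=np.float32))
    b = wp.array(np.array([4.0, 5.0, 6.0], dtype=np.float32))

    # 1*4 + 2*5 + 3*6
    print(wp.utils.array_inner(a, b))  # 32.0
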
@@ -479,8 +596,28 @@ def _array_cast_kernel(


 def array_cast(in_array, out_array, count=None):
+    """Cast elements from one array to another array with a different data type.
+
+    This function performs element-wise casting from the input array to the output array.
+    The arrays must have the same number of dimensions and data type shapes. If they don't match,
+    the arrays will be flattened and casting will be performed at the scalar level.
+
+    Args:
+        in_array (wp.array): Input array to cast from.
+        out_array (wp.array): Output array to cast to. Must have the same device as in_array.
+        count (int, optional): Number of elements to process. If None, processes entire array.
+            For multi-dimensional arrays, partial casting is not supported.
+
+    Raises:
+        RuntimeError: If arrays have different devices or if attempting partial casting
+            on multi-dimensional arrays.
+
+    Note:
+        If the input and output arrays have the same data type, this function will
+        simply copy the data without any conversion.
+    """
     if in_array.device != out_array.device:
-        raise RuntimeError("Array storage devices do not match")
+        raise RuntimeError(f"Array storage devices do not match ({in_array.device} vs {out_array.device})")

     in_array_data_shape = getattr(in_array.dtype, "_shape_", ())
     out_array_data_shape = getattr(out_array.dtype, "_shape_", ())
@@ -491,8 +628,8 @@ def array_cast(in_array, out_array, count=None):
         in_array = in_array.flatten()
         out_array = out_array.flatten()

-    in_array_data_length = warp.types.type_length(in_array.dtype)
-    out_array_data_length = warp.types.type_length(out_array.dtype)
+    in_array_data_length = warp.types.type_size(in_array.dtype)
+    out_array_data_length = warp.types.type_size(out_array.dtype)
     in_array_scalar_type = wp.types.type_scalar_type(in_array.dtype)
     out_array_scalar_type = wp.types.type_scalar_type(out_array.dtype)

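
The call sites switch from wp.types.type_length to wp.types.type_size; array_cast behavior is otherwise unchanged. A minimal sketch (integer conversion is expected to truncate, as with a C-style cast):

    import numpy as np
    import warp as wp
    import warp.utils

    wp.init()

    src = wp.array(np.array([1.5, 2.7, 3.1], dtype=np.float32))
    dst = wp.empty(3, dtype=wp.int32)

    wp.utils.array_cast(src, dst)  # float32 -> int32
    print(dst.numpy())  # [1 2 3]
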
@@ -534,6 +671,430 @@ def array_cast(in_array, out_array, count=None):
         wp.launch(kernel=_array_cast_kernel, dim=dim, inputs=[out_array, in_array], device=out_array.device)


+def create_warp_function(func: Callable) -> tuple[wp.Function, warp.context.Module]:
+    """Create a Warp function from a Python function.
+
+    Args:
+        func (Callable): A Python function to be converted to a Warp function.
+
+    Returns:
+        wp.Function: A Warp function created from the input function.
+    """
+
+    from .codegen import Adjoint, get_full_arg_spec
+
+    def unique_name(code: str):
+        return "func_" + hex(hash(code))[-8:]
+
+    # Create a Warp function from the input function
+    source = None
+    argspec = get_full_arg_spec(func)
+    key = getattr(func, "__name__", None)
+    if key is None:
+        source, _ = Adjoint.extract_function_source(func)
+        key = unique_name(source)
+    elif key == "<lambda>":
+        body = Adjoint.extract_lambda_source(func, only_body=True)
+        if body is None:
+            raise ValueError("Could not extract lambda source code")
+        key = unique_name(body)
+        source = f"def {key}({', '.join(argspec.args)}):\n    return {body}"
+    else:
+        # use the qualname of the function as the key
+        key = getattr(func, "__qualname__", key)
+        key = key.replace(".", "_").replace(" ", "_").replace("<", "").replace(">", "_")
+
+    module = warp.context.get_module(f"map_{key}")
+    func = wp.Function(
+        func,
+        namespace="",
+        module=module,
+        key=key,
+        source=source,
+        overloaded_annotations=dict.fromkeys(argspec.args, Any),
+    )
+    return func, module
+
+
+def broadcast_shapes(shapes: list[tuple[int]]) -> tuple[int]:
+    """Broadcast a list of shapes to a common shape.
+
+    Following the broadcasting rules of NumPy, two shapes are compatible when:
+    starting from the trailing dimension,
+    1. the two dimensions are equal, or
+    2. one of the dimensions is 1.
+
+    Example:
+        >>> broadcast_shapes([(3, 1, 4), (5, 4)])
+        (3, 5, 4)
+
+    Returns:
+        tuple[int]: The broadcasted shape.
+
+    Raises:
+        ValueError: If the shapes are not broadcastable.
+    """
+    ref = shapes[0]
+    for shape in shapes[1:]:
+        broad = []
+        for j in range(1, max(len(ref), len(shape)) + 1):
+            if j <= len(ref) and j <= len(shape):
+                s = shape[-j]
+                r = ref[-j]
+                if s == r:
+                    broad.append(s)
+                elif s == 1 or r == 1:
+                    broad.append(max(s, r))
+                else:
+                    raise ValueError(f"Shapes {ref} and {shape} are not broadcastable")
+            elif j <= len(ref):
+                broad.append(ref[-j])
+            else:
+                broad.append(shape[-j])
+        ref = tuple(reversed(broad))
+    return ref
+
+
+def map(
+    func: Callable | wp.Function,
+    *inputs: Array[DType] | Any,
+    out: Array[DType] | list[Array[DType]] | None = None,
+    return_kernel: bool = False,
+    block_dim=256,
+    device: Devicelike = None,
+) -> Array[DType] | list[Array[DType]] | wp.Kernel:
+    """
+    Map a function over the elements of one or more arrays.
+
+    You can use a Warp function, a regular Python function, or a lambda expression to map it to a set of arrays.
+
+    .. testcode::
+
+        a = wp.array([1, 2, 3], dtype=wp.float32)
+        b = wp.array([4, 5, 6], dtype=wp.float32)
+        c = wp.array([7, 8, 9], dtype=wp.float32)
+        result = wp.map(lambda x, y, z: x + 2.0 * y - z, a, b, c)
+        print(result)
+
+    .. testoutput::
+
+        [2. 4. 6.]
+
+    Clamp values in an array in place:
+
+    .. testcode::
+
+        xs = wp.array([-1.0, 0.0, 1.0], dtype=wp.float32)
+        wp.map(wp.clamp, xs, -0.5, 0.5, out=xs)
+        print(xs)
+
+    .. testoutput::
+
+        [-0.5  0.   0.5]
+
+    Note that only one of the inputs needs to be a Warp array. For example, it is possible
+    to vectorize the function :func:`warp.transform_point` over a collection of points
+    with a given input transform as follows:
+
+    .. code-block:: python
+
+        tf = wp.transform((1.0, 2.0, 3.0), wp.quat_rpy(0.2, -0.6, 0.1))
+        points = wp.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=wp.vec3)
+        transformed = wp.map(wp.transform_point, tf, points)
+
+    Besides regular Warp arrays, other array types, such as the ``indexedarray``, are supported as well:
+
+    .. testcode::
+
+        arr = wp.array(data=np.arange(10, dtype=np.float32))
+        indices = wp.array([1, 3, 5, 7, 9], dtype=int)
+        iarr = wp.indexedarray1d(arr, [indices])
+        out = wp.map(lambda x: x * 10.0, iarr)
+        print(out)
+
+    .. testoutput::
+
+        [10. 30. 50. 70. 90.]
+
+    If multiple arrays are provided, the
+    `NumPy broadcasting rules <https://numpy.org/doc/stable/user/basics.broadcasting.html>`_
+    are applied to determine the shape of the output array.
+    Two shapes are compatible when:
+    starting from the trailing dimension,
+
+    1. the two dimensions are equal, or
+    2. one of the dimensions is 1.
+
+    For example, given arrays of shapes ``(3, 1, 4)`` and ``(5, 4)``, the broadcasted
+    shape is ``(3, 5, 4)``.
+
+    If no array(s) are provided to the ``out`` argument, the output array(s) are created automatically.
+    The data type(s) of the output array(s) are determined by the type of the return value(s) of
+    the function. The ``requires_grad`` flag for an automatically created output array is set to ``True``
+    if any of the input arrays have it set to ``True`` and the respective output array's ``dtype`` is a type that
+    supports differentiation.
+
+    Args:
+        func (Callable | Function): The function to map over the arrays.
+        *inputs (array | Any): The input arrays or values to pass to the function.
+        out (array | list[array] | None): Optional output array(s) to store the result(s). If None, the output array(s) will be created automatically.
+        return_kernel (bool): If True, only return the generated kernel without performing the mapping operation.
+        block_dim (int): The block dimension for the kernel launch.
+        device (Devicelike): The device on which to run the kernel.
+
+    Returns:
+        array | list[array] | Kernel:
+            The resulting array(s) of the mapping. If ``return_kernel`` is True, only returns the kernel used for mapping.
+    """
+
+    import builtins
+
+    from .codegen import Adjoint, Struct, StructInstance
+    from .types import (
+        is_array,
+        type_is_matrix,
+        type_is_quaternion,
+        type_is_transformation,
+        type_is_vector,
+        type_repr,
+        type_to_warp,
+        types_equal,
+    )
+
+    # mapping from struct name to its Python definition
+    referenced_modules: dict[str, ModuleType] = {}
+
+    def type_to_code(wp_type) -> str:
+        """Returns the string representation of a given Warp type."""
+        if is_array(wp_type):
+            return f"warp.array(ndim={wp_type.ndim}, dtype={type_to_code(wp_type.dtype)})"
+        if isinstance(wp_type, Struct):
+            key = f"{wp_type.__module__}.{wp_type.key}"
+            module = sys.modules.get(wp_type.__module__, None)
+            if module is not None:
+                referenced_modules[wp_type.__module__] = module
+            return key
+        if type_is_transformation(wp_type):
+            return f"warp.types.transformation(dtype={type_to_code(wp_type._wp_scalar_type_)})"
+        if type_is_quaternion(wp_type):
+            return f"warp.types.quaternion(dtype={type_to_code(wp_type._wp_scalar_type_)})"
+        if type_is_vector(wp_type):
+            return f"warp.types.vector(length={wp_type._shape_[0]}, dtype={type_to_code(wp_type._wp_scalar_type_)})"
+        if type_is_matrix(wp_type):
+            return f"warp.types.matrix(shape=({wp_type._shape_[0]}, {wp_type._shape_[1]}), dtype={type_to_code(wp_type._wp_scalar_type_)})"
+        if wp_type == builtins.bool:
+            return "bool"
+        if wp_type == builtins.float:
+            return "float"
+        if wp_type == builtins.int:
+            return "int"
+
+        name = getattr(wp_type, "__name__", None)
+        if name is None:
+            return type_repr(wp_type)
+        name = getattr(wp_type, "__qualname__", name)
+        module = getattr(wp_type, "__module__", None)
+        if module is not None:
+            referenced_modules[wp_type.__module__] = module
+        return wp_type.__module__ + "." + name
+
+    def get_warp_type(value):
+        dtype = type(value)
+        if issubclass(dtype, StructInstance):
+            # a struct
+            return value._cls
+        return type_to_warp(dtype)
+
+    # gather the arrays in the inputs
+    array_shapes = [a.shape for a in inputs if is_array(a)]
+    if len(array_shapes) == 0:
+        raise ValueError("map requires at least one warp.array input")
+    # broadcast the shapes of the arrays
+    out_shape = broadcast_shapes(array_shapes)
+
+    module = None
+    out_dtypes = None
+    skip_arg_type_checks = False
+    if isinstance(func, wp.Function):
+        func_name = func.key
+        wp_func = func
+    else:
+        # check if op is a callable function
+        if not callable(func):
+            raise TypeError("func must be a callable function or a warp.Function")
+        wp_func, module = create_warp_function(func)
+        func_name = wp_func.key
+        # we created a generic function here (arg types are all Any)
+        skip_arg_type_checks = True
+    if module is None:
+        module = warp.context.get_module(f"map_{func_name}")
+
+    arg_names = list(wp_func.input_types.keys())
+    # determine output dtype
+    if wp_func.value_func is not None or wp_func.value_type is not None:
+        arg_types = {}
+        arg_values = {}
+        for i, arg_name in enumerate(arg_names):
+            if is_array(inputs[i]):
+                # we will pass an element of the array to the function
+                arg_types[arg_name] = inputs[i].dtype
+                if device is None:
+                    device = inputs[i].device
+            else:
+                # we pass the input value directly to the function
+                arg_types[arg_name] = get_warp_type(inputs[i])
+        func_or_none = wp_func.get_overload(list(arg_types.values()), {})
+        if func_or_none is None:
+            raise TypeError(
+                f"Function {func_name} does not support the provided argument types {', '.join(type_repr(t) for t in arg_types.values())}"
+            )
+        func = func_or_none
+        if func.value_func is not None:
+            out_dtype = func.value_func(arg_types, arg_values)
+        else:
+            out_dtype = func.value_type
+        if isinstance(out_dtype, tuple) or isinstance(out_dtype, list):
+            out_dtypes = out_dtype
+        else:
+            out_dtypes = (out_dtype,)
+    else:
+        # try to evaluate the function to determine the output type
+        args = []
+        arg_types = wp_func.input_types
+        if len(inputs) != len(arg_types):
+            raise TypeError(
+                f"Number of input arguments ({len(inputs)}) does not match expected number of function arguments ({len(arg_types)})"
+            )
+        for (arg_name, arg_type), input in zip(arg_types.items(), inputs):
+            if is_array(input):
+                if not skip_arg_type_checks and not types_equal(input.dtype, arg_type):
+                    raise TypeError(
+                        f'Incorrect input provided for argument "{arg_name}": received array of dtype {type_repr(input.dtype)}, expected {type_repr(arg_type)}'
+                    )
+                args.append(input.dtype())
+                if device is None:
+                    device = input.device
+            else:
+                if not skip_arg_type_checks and not types_equal(type(input), arg_type):
+                    raise TypeError(
+                        f'Incorrect input provided for argument "{arg_name}": received {type_repr(type(input))}, expected {type_repr(arg_type)}'
+                    )
+                args.append(input)
+        result = wp_func(*args)
+        if result is None:
+            raise TypeError("The provided function must return a value")
+        if isinstance(result, tuple) or isinstance(result, list):
+            out_dtypes = tuple(get_warp_type(r) for r in result)
+        else:
+            out_dtypes = (get_warp_type(result),)
+
+    if out_dtypes is None:
+        raise TypeError("Could not determine the output type of the function, make sure it returns a value")
+
+    if out is None:
+        requires_grad = any(getattr(a, "requires_grad", False) for a in inputs if is_array(a))
+        outputs = []
+        for dtype in out_dtypes:
+            rg = requires_grad and Adjoint.is_differentiable_value_type(dtype)
+            outputs.append(wp.empty(out_shape, dtype=dtype, requires_grad=rg, device=device))
+    elif len(out_dtypes) == 1 and is_array(out):
+        if not types_equal(out.dtype, out_dtypes[0]):
+            raise TypeError(
+                f"Output array dtype {type_repr(out.dtype)} does not match expected dtype {type_repr(out_dtypes[0])}"
+            )
+        if out.shape != out_shape:
+            raise TypeError(f"Output array shape {out.shape} does not match expected shape {out_shape}")
+        outputs = [out]
+    elif len(out_dtypes) > 1:
+        if isinstance(out, tuple) or isinstance(out, list):
+            if len(out) != len(out_dtypes):
+                raise TypeError(
+                    f"Number of provided output arrays ({len(out)}) does not match expected number of function outputs ({len(out_dtypes)})"
+                )
+            for i, a in enumerate(out):
+                if not types_equal(a.dtype, out_dtypes[i]):
+                    raise TypeError(
+                        f"Output array {i} dtype {type_repr(a.dtype)} does not match expected dtype {type_repr(out_dtypes[i])}"
+                    )
+                if a.shape != out_shape:
+                    raise TypeError(f"Output array {i} shape {a.shape} does not match expected shape {out_shape}")
+            outputs = list(out)
+        else:
+            raise TypeError(
+                f"Invalid output provided, expected {len(out_dtypes)} Warp arrays with shape {out_shape} and dtypes ({', '.join(type_repr(t) for t in out_dtypes)})"
+            )
+
+    # create code for a kernel
+    code = """def map_kernel({kernel_args}):
+    {tids} = wp.tid()
+    {load_args}
+    """
+    if len(outputs) == 1:
+        code += "__out_0[{tids}] = {func_name}({arg_names})"
+    else:
+        code += ", ".join(f"__o_{i}" for i in range(len(outputs)))
+        code += " = {func_name}({arg_names})\n"
+        for i in range(len(outputs)):
+            code += f"    __out_{i}" + "[{tids}]" + f" = __o_{i}\n"
+
+    tids = [f"__tid_{i}" for i in range(len(out_shape))]
+
+    load_args = []
+    kernel_args = []
+    for arg_name, input in zip(arg_names, inputs):
+        if is_array(input):
+            arr_name = f"{arg_name}_array"
+            array_type_name = type(input).__name__
+            kernel_args.append(
+                f"{arr_name}: wp.{array_type_name}(dtype={type_to_code(input.dtype)}, ndim={input.ndim})"
+            )
+            shape = input.shape
+            indices = []
+            for i in range(1, len(shape) + 1):
+                if shape[-i] == 1:
+                    indices.append("0")
+                else:
+                    indices.append(tids[-i])
+
+            load_args.append(f"{arg_name} = {arr_name}[{', '.join(reversed(indices))}]")
+        else:
+            kernel_args.append(f"{arg_name}: {type_to_code(type(input))}")
+    for i, o in enumerate(outputs):
+        array_type_name = type(o).__name__
+        kernel_args.append(f"__out_{i}: wp.{array_type_name}(dtype={type_to_code(o.dtype)}, ndim={o.ndim})")
+    code = code.format(
+        func_name=func_name,
+        kernel_args=", ".join(kernel_args),
+        arg_names=", ".join(arg_names),
+        tids=", ".join(tids),
+        load_args="\n    ".join(load_args),
+    )
+    namespace = {}
+    namespace.update({"wp": wp, "warp": wp, func_name: wp_func, "Any": Any})
+    namespace.update(referenced_modules)
+    exec(code, namespace)
+
+    kernel = wp.Kernel(namespace["map_kernel"], key="map_kernel", source=code, module=module)
+    if return_kernel:
+        return kernel
+
+    wp.launch(
+        kernel,
+        dim=out_shape,
+        inputs=inputs,
+        outputs=outputs,
+        block_dim=block_dim,
+        device=device,
+    )
+
+    if len(outputs) == 1:
+        o = outputs[0]
+    else:
+        o = outputs
+
+    return o
+
+
 # code snippet for invoking cProfile
 # cp = cProfile.Profile()
 # cp.enable()
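
The new wp.map applies NumPy-style broadcasting across all array inputs; a sketch of the (3, 1, 4) x (5, 4) case from the broadcast_shapes docstring above:

    import numpy as np
    import warp as wp

    wp.init()

    a = wp.array(np.ones((3, 1, 4), dtype=np.float32))
    b = wp.array(np.ones((5, 4), dtype=np.float32))

    # shapes (3, 1, 4) and (5, 4) broadcast to (3, 5, 4)
    result = wp.map(lambda x, y: x + y, a, b)
    print(result.shape)  # (3, 5, 4)

    # return_kernel=True skips the launch and returns the generated kernel for reuse
    kernel = wp.map(lambda x, y: x + y, a, b, return_kernel=True)
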
@@ -634,7 +1195,7 @@ def mem_report():  # pragma: no cover
             element_size = tensor.storage().element_size()
             mem = numel * element_size / 1024 / 1024  # 32bit=4Byte, MByte
             total_mem += mem
-        print("Type: %s Total Tensors: %d \tUsed Memory Space: %.2f MBytes" % (mem_type, total_numel, total_mem))
+        print(f"Type: {mem_type:<4} | Total Tensors: {total_numel:>8} | Used Memory: {total_mem:>8.2f} MB")

     import gc

@@ -712,7 +1273,7 @@ class ScopedStream:
        device (Device): The device associated with the stream.
    """

-    def __init__(self, stream: Optional[wp.Stream], sync_enter: bool = True, sync_exit: bool = False):
+    def __init__(self, stream: wp.Stream | None, sync_enter: bool = True, sync_exit: bool = False):
        """Initializes the context manager with a stream and synchronization options.

        Args:
@@ -765,12 +1326,12 @@ class ScopedTimer:
        active: bool = True,
        print: bool = True,
        detailed: bool = False,
-        dict: Optional[Dict[str, List[float]]] = None,
+        dict: dict[str, list[float]] | None = None,
        use_nvtx: bool = False,
-        color: Union[int, str] = "rapids",
+        color: int | str = "rapids",
        synchronize: bool = False,
        cuda_filter: int = 0,
-        report_func: Optional[Callable[[List[TimingResult], str], None]] = None,
+        report_func: Callable[[list[TimingResult], str], None] | None = None,
        skip_tape: bool = False,
    ):
        """Context manager object for a timer
@@ -792,7 +1353,7 @@ class ScopedTimer:
        Attributes:
            extra_msg (str): Can be set to a string that will be added to the printout at context exit.
            elapsed (float): The duration of the ``with`` block used with this object
-            timing_results (List[TimingResult]): The list of activity timing results, if collection was requested using ``cuda_filter``
+            timing_results (list[TimingResult]): The list of activity timing results, if collection was requested using ``cuda_filter``
        """
        self.name = name
        self.active = active and self.enabled
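
The ScopedTimer annotations move from typing.Optional/Dict/List to the builtin-generic and union syntax; usage itself is unchanged. A minimal sketch using the dict parameter documented above:

    import warp as wp

    wp.init()

    results = {}
    with wp.ScopedTimer("alloc", dict=results, synchronize=True):
        a = wp.zeros(1_000_000, dtype=wp.float32)

    print(results["alloc"])  # list of elapsed times in milliseconds
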
@@ -986,12 +1547,12 @@ def check_p2p():
 class timing_result_t(ctypes.Structure):
     """CUDA timing struct for fetching values from C++"""

-    _fields_ = [
+    _fields_ = (
         ("context", ctypes.c_void_p),
         ("name", ctypes.c_char_p),
         ("filter", ctypes.c_int),
         ("elapsed", ctypes.c_float),
-    ]
+    )


 class TimingResult:
@@ -1025,7 +1586,7 @@ def timing_begin(cuda_filter: int = TIMING_ALL, synchronize: bool = True) -> Non
     warp.context.runtime.core.cuda_timing_begin(cuda_filter)


-def timing_end(synchronize: bool = True) -> List[TimingResult]:
+def timing_end(synchronize: bool = True) -> list[TimingResult]:
     """End detailed activity timing.

     Parameters:
@@ -1071,7 +1632,7 @@ def timing_end(synchronize: bool = True) -> list[TimingResult]:
     return results


-def timing_print(results: List[TimingResult], indent: str = "") -> None:
+def timing_print(results: list[TimingResult], indent: str = "") -> None:
    """Print timing results.

    Parameters:
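
The timing helpers keep the same begin/end/print flow, now with builtin-generic annotations; a minimal sketch (requires a CUDA device, since the timing hooks are CUDA activity timers):

    import warp as wp
    from warp.utils import timing_begin, timing_end, timing_print

    wp.init()

    timing_begin()
    a = wp.zeros(1_000_000, dtype=wp.float32, device="cuda:0")
    wp.synchronize()
    results = timing_end()
    timing_print(results)
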