warp-lang 1.6.0-py3-none-manylinux2014_aarch64.whl → 1.6.1-py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic.
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/builtins.py +1 -1
- warp/codegen.py +10 -3
- warp/config.py +65 -21
- warp/context.py +202 -65
- warp/examples/core/example_marching_cubes.py +1 -1
- warp/examples/core/example_mesh.py +1 -1
- warp/examples/core/example_wave.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +81 -27
- warp/examples/tile/example_tile_nbody.py +26 -15
- warp/native/clang/clang.cpp +1 -1
- warp/native/crt.h +1 -0
- warp/native/mat.h +16 -3
- warp/native/tile.h +12 -8
- warp/render/render_opengl.py +23 -15
- warp/render/render_usd.py +10 -2
- warp/sim/collide.py +29 -16
- warp/sim/import_urdf.py +20 -5
- warp/sim/integrator_featherstone.py +4 -11
- warp/sim/model.py +62 -59
- warp/sim/render.py +2 -2
- warp/stubs.py +1 -1
- warp/tests/test_array.py +26 -0
- warp/tests/test_collision.py +6 -6
- warp/tests/test_examples.py +7 -1
- warp/tests/test_launch.py +77 -26
- warp/tests/test_mat.py +75 -1
- warp/tests/test_overwrite.py +4 -3
- warp/tests/test_tile_load.py +44 -1
- warp/thirdparty/unittest_parallel.py +3 -0
- warp/types.py +66 -68
- {warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/METADATA +34 -17
- {warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/RECORD +37 -37
- {warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
- {warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0
warp/bin/warp-clang.so
CHANGED
Binary file

warp/bin/warp.so
CHANGED
Binary file

warp/builtins.py
CHANGED
@@ -4173,7 +4173,7 @@ add_builtin(
     input_types={"state": uint32},
     value_type=int,
     group="Random",
-    doc="Return a random integer in the range [
+    doc="Return a random integer in the range [-2^31, 2^31).",
 )
 add_builtin(
     "randi",

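The corrected docstring above describes the single-argument overload of wp.randi(). A minimal usage sketch (this kernel is illustrative and not part of the package):

    import warp as wp

    @wp.kernel
    def sample_ints(seed: int, out: wp.array(dtype=int)):
        tid = wp.tid()
        state = wp.rand_init(seed, tid)
        # without explicit bounds, randi(state) returns a signed 32-bit value in [-2^31, 2^31)
        out[tid] = wp.randi(state)
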
warp/codegen.py
CHANGED
@@ -2278,15 +2278,22 @@ class Adjoint:
         out = adj.add_call(func, args, kwargs, type_args, min_outputs=min_outputs)
 
         if warp.config.verify_autograd_array_access:
+            # Extract the types and values passed as arguments to the function call.
+            arg_types = tuple(strip_reference(get_arg_type(x)) for x in args)
+            kwarg_types = {k: strip_reference(get_arg_type(v)) for k, v in kwargs.items()}
+
+            # Resolve the exact function signature among any existing overload.
+            resolved_func = adj.resolve_func(func, arg_types, kwarg_types, min_outputs)
+
             # update arg read/write states according to what happens to that arg in the called function
-            if hasattr(
+            if hasattr(resolved_func, "adj"):
                 for i, arg in enumerate(args):
-                    if
+                    if resolved_func.adj.args[i].is_write:
                         kernel_name = adj.fun_name
                         filename = adj.filename
                         lineno = adj.lineno + adj.fun_lineno
                         arg.mark_write(kernel_name=kernel_name, filename=filename, lineno=lineno)
-                    if
+                    if resolved_func.adj.args[i].is_read:
                         arg.mark_read()
 
         return out

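The change above resolves the exact overload of a called function before reading its per-argument read/write flags, which is only exercised when wp.config.verify_autograd_array_access is enabled. A minimal sketch of the user-facing setting (the function and kernel are illustrative):

    import warp as wp

    # opt in to array overwrite tracking before kernels are compiled
    wp.config.verify_autograd_array_access = True

    @wp.func
    def scale_in_place(x: wp.array(dtype=float), i: int, s: float):
        # this function writes to its array argument, so the verifier marks it as written
        x[i] = x[i] * s

    @wp.kernel
    def scale_kernel(x: wp.array(dtype=float), s: float):
        tid = wp.tid()
        scale_in_place(x, tid, s)
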
warp/config.py
CHANGED
@@ -7,64 +7,108 @@
 
 from typing import Optional
 
-version: str = "1.6.
+version: str = "1.6.1"
 """Warp version string"""
 
 verify_fp: bool = False
-"""
-
+"""Enable floating-point verification for inputs and outputs.
+
+When enabled, checks if all values are finite before and after operations.
+
+Note: Enabling this flag impacts performance.
 """
 
 verify_cuda: bool = False
-"""
-
+"""Enable CUDA error checking after kernel launches.
+
+This setting cannot be used during graph capture
+
+Note: Enabling this flag impacts performance
 """
 
 print_launches: bool = False
-"""
-
+"""Enable detailed kernel launch logging.
+
+Prints information about each kernel launch including:
+
+- Launch dimensions
+- Input/output parameters
+- Target device
+
+Note: Enabling this flag impacts performance.
 """
 
 mode: str = "release"
-"""
-
+"""Compilation mode for Warp kernels.
+
+Args:
+    mode: Either ``"release"`` or ``"debug"``.
+
+Note: Debug mode may impact performance.
 """
 
 verbose: bool = False
-"""
+"""Enable detailed logging during code generation and compilation."""
 
 verbose_warnings: bool = False
-"""
+"""Enable extended warning messages with source location information."""
 
 quiet: bool = False
-"""
+"""Disable Warp module initialization messages.
+
+Error messages and warnings remain unaffected.
+"""
 
 verify_autograd_array_access: bool = False
-"""
+"""Enable warnings for array overwrites that may affect gradient computation."""
+
+enable_vector_component_overwrites: bool = False
+"""Allow multiple writes to vector/matrix/quaternion components.
+
+Note: Enabling this may significantly increase kernel compilation time.
+"""
 
 cache_kernels: bool = True
-"""
+"""Enable kernel caching between application launches."""
 
 kernel_cache_dir: Optional[str] = None
-"""
+"""Directory path for storing compiled kernel cache.
+
+If ``None``, the path is determined in the following order:
+
+1. ``WARP_CACHE_PATH`` environment variable.
+2. System's user cache directory (via ``appdirs.user_cache_directory``).
+
+Note: Subdirectories prefixed with ``wp_`` will be created in this location.
+"""
 
 cuda_output: Optional[str] = None
-"""Preferred CUDA output format for
+"""Preferred CUDA output format for kernel compilation.
+
+Args:
+    cuda_output: One of {``None``, ``"ptx"``, ``"cubin"``}. If ``None``, format is auto-determined.
+"""
 
 ptx_target_arch: int = 75
-"""Target architecture for PTX generation
+"""Target architecture version for PTX generation.
+
+Defaults to minimum architecture version supporting all Warp features.
+"""
 
 enable_backward: bool = True
-"""
+"""Enable compilation of kernel backward passes."""
 
 llvm_cuda: bool = False
-"""Use Clang/LLVM instead of NVRTC
+"""Use Clang/LLVM compiler instead of NVRTC for CUDA compilation."""
 
 enable_graph_capture_module_load_by_default: bool = True
-"""
+"""Enable automatic module loading before graph capture.
+
+Only affects systems with CUDA driver versions below 12.3.
+"""
 
 enable_mempools_at_init: bool = True
-"""
+"""Enable CUDA memory pools during device initialization when supported."""
 
 max_unroll: int = 16
 """Maximum unroll factor for loops."""

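The flags documented above are plain module-level attributes of warp.config. A minimal sketch of how they are typically set before any kernels are compiled or loaded (the chosen values are illustrative):

    import warp as wp

    wp.config.mode = "debug"           # or "release" (the default)
    wp.config.verify_fp = True         # check that inputs/outputs are finite (slower)
    wp.config.print_launches = True    # log dimensions, parameters, and device per launch
    wp.config.kernel_cache_dir = None  # fall back to WARP_CACHE_PATH, then the user cache dir

    wp.init()
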
warp/context.py
CHANGED
@@ -34,6 +34,7 @@ import warp
 import warp.build
 import warp.codegen
 import warp.config
+from warp.types import launch_bounds_t
 
 # represents either a built-in or user-defined function
 
@@ -5187,8 +5188,23 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
 # represents all data required for a kernel launch
 # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
 class Launch:
+    """Represents all data required for a kernel launch so that launches can be replayed quickly.
+
+    Users should not directly instantiate this class, instead use
+    ``wp.launch(..., record_cmd=True)`` to record a launch.
+    """
+
     def __init__(
-        self,
+        self,
+        kernel,
+        device: Device,
+        hooks: Optional[KernelHooks] = None,
+        params: Optional[Sequence[Any]] = None,
+        params_addr: Optional[Sequence[ctypes.c_void_p]] = None,
+        bounds: Optional[launch_bounds_t] = None,
+        max_blocks: int = 0,
+        block_dim: int = 256,
+        adjoint: bool = False,
     ):
         # retain the module executable so it doesn't get unloaded
         self.module_exec = kernel.module.load(device)
@@ -5201,13 +5217,14 @@ class Launch:
 
         # if not specified set a zero bound
         if not bounds:
-            bounds =
+            bounds = launch_bounds_t(0)
 
         # if not specified then build a list of default value params for args
         if not params:
             params = []
             params.append(bounds)
 
+            # Pack forward parameters
             for a in kernel.adj.args:
                 if isinstance(a.type, warp.types.array):
                     params.append(a.type.__ctype__())
@@ -5216,6 +5233,18 @@ class Launch:
                 else:
                     params.append(pack_arg(kernel, a.type, a.label, 0, device, False))
 
+            # Pack adjoint parameters if adjoint=True
+            if adjoint:
+                for a in kernel.adj.args:
+                    if isinstance(a.type, warp.types.array):
+                        params.append(a.type.__ctype__())
+                    elif isinstance(a.type, warp.codegen.Struct):
+                        params.append(a.type().__ctype__())
+                    else:
+                        # For primitive types in adjoint mode, initialize with 0
+                        params.append(pack_arg(kernel, a.type, a.label, 0, device, True))
+
+        # Create array of parameter addresses
         kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
         kernel_params = (ctypes.c_void_p * len(kernel_args))(*kernel_args)
 
@@ -5225,13 +5254,30 @@ class Launch:
         self.hooks = hooks
         self.params = params
         self.params_addr = params_addr
-        self.device = device
-
-
-
+        self.device: Device = device
+        """The device to launch on.
+        This should not be changed after the launch object is created.
+        """
+
+        self.bounds: launch_bounds_t = bounds
+        """The launch bounds. Update with :meth:`set_dim`."""
+
+        self.max_blocks: int = max_blocks
+        """The maximum number of CUDA thread blocks to use."""
+
+        self.block_dim: int = block_dim
+        """The number of threads per block."""
 
-
-
+        self.adjoint: bool = adjoint
+        """Whether to run the adjoint kernel instead of the forward kernel."""
+
+    def set_dim(self, dim: Union[int, List[int], Tuple[int, ...]]):
+        """Set the launch dimensions.
+
+        Args:
+            dim: The dimensions of the launch.
+        """
+        self.bounds = launch_bounds_t(dim)
 
         # launch bounds always at index 0
         self.params[0] = self.bounds
@@ -5240,22 +5286,36 @@ class Launch:
         if self.params_addr:
             self.params_addr[0] = ctypes.c_void_p(ctypes.addressof(self.bounds))
 
-
-
+    def set_param_at_index(self, index: int, value: Any, adjoint: bool = False):
+        """Set a kernel parameter at an index.
+
+        Args:
+            index: The index of the param to set.
+            value: The value to set the param to.
+        """
         arg_type = self.kernel.adj.args[index].type
         arg_name = self.kernel.adj.args[index].label
 
-        carg = pack_arg(self.kernel, arg_type, arg_name, value, self.device,
+        carg = pack_arg(self.kernel, arg_type, arg_name, value, self.device, adjoint)
+
+        if adjoint:
+            params_index = index + len(self.kernel.adj.args) + 1
+        else:
+            params_index = index + 1
 
-        self.params[
+        self.params[params_index] = carg
 
         # for CUDA kernels we need to update the address to each arg
        if self.params_addr:
-            self.params_addr[
+            self.params_addr[params_index] = ctypes.c_void_p(ctypes.addressof(carg))
 
-
-
-
+    def set_param_at_index_from_ctype(self, index: int, value: Union[ctypes.Structure, int, float]):
+        """Set a kernel parameter at an index without any type conversion.
+
+        Args:
+            index: The index of the param to set.
+            value: The value to set the param to.
+        """
         if isinstance(value, ctypes.Structure):
             # not sure how to directly assign struct->struct without reallocating using ctypes
             self.params[index + 1] = value
@@ -5267,32 +5327,62 @@ class Launch:
         else:
             self.params[index + 1].__init__(value)
 
-
-
+    def set_param_by_name(self, name: str, value: Any, adjoint: bool = False):
+        """Set a kernel parameter by argument name.
+
+        Args:
+            name: The name of the argument to set.
+            value: The value to set the argument to.
+            adjoint: If ``True``, set the adjoint of this parameter instead of the forward parameter.
+        """
         for i, arg in enumerate(self.kernel.adj.args):
             if arg.label == name:
-                self.set_param_at_index(i, value)
+                self.set_param_at_index(i, value, adjoint)
+                return
+
+        raise ValueError(f"Argument '{name}' not found in kernel '{self.kernel.key}'")
 
-
-
+    def set_param_by_name_from_ctype(self, name: str, value: ctypes.Structure):
+        """Set a kernel parameter by argument name with no type conversions.
+
+        Args:
+            name: The name of the argument to set.
+            value: The value to set the argument to.
+        """
         # lookup argument index
         for i, arg in enumerate(self.kernel.adj.args):
             if arg.label == name:
                 self.set_param_at_index_from_ctype(i, value)
 
-
-
+    def set_params(self, values: Sequence[Any]):
+        """Set all parameters.
+
+        Args:
+            values: A list of values to set the params to.
+        """
         for i, v in enumerate(values):
             self.set_param_at_index(i, v)
 
-
-
+    def set_params_from_ctypes(self, values: Sequence[ctypes.Structure]):
+        """Set all parameters without performing type-conversions.
+
+        Args:
+            values: A list of ctypes or basic int / float types.
+        """
         for i, v in enumerate(values):
             self.set_param_at_index_from_ctype(i, v)
 
-    def launch(self, stream=None) ->
+    def launch(self, stream: Optional[Stream] = None) -> None:
+        """Launch the kernel.
+
+        Args:
+            stream: The stream to launch on.
+        """
         if self.device.is_cpu:
-            self.
+            if self.adjoint:
+                self.hooks.backward(*self.params)
+            else:
+                self.hooks.forward(*self.params)
         else:
             if stream is None:
                 stream = self.device.stream
@@ -5305,32 +5395,44 @@ class Launch:
             if graph is not None:
                 graph.retain_module_exec(self.module_exec)
 
-
-
-
-
-
-
-
-
-
-
+            if self.adjoint:
+                runtime.core.cuda_launch_kernel(
+                    self.device.context,
+                    self.hooks.backward,
+                    self.bounds.size,
+                    self.max_blocks,
+                    self.block_dim,
+                    self.hooks.backward_smem_bytes,
+                    self.params_addr,
+                    stream.cuda_stream,
+                )
+            else:
+                runtime.core.cuda_launch_kernel(
+                    self.device.context,
+                    self.hooks.forward,
+                    self.bounds.size,
+                    self.max_blocks,
+                    self.block_dim,
+                    self.hooks.forward_smem_bytes,
+                    self.params_addr,
+                    stream.cuda_stream,
+                )
 
 
 def launch(
     kernel,
-    dim:
+    dim: Union[int, Sequence[int]],
     inputs: Sequence = [],
     outputs: Sequence = [],
     adj_inputs: Sequence = [],
     adj_outputs: Sequence = [],
     device: Devicelike = None,
-    stream: Stream = None,
-    adjoint=False,
-    record_tape=True,
-    record_cmd=False,
-    max_blocks=0,
-    block_dim=256,
+    stream: Optional[Stream] = None,
+    adjoint: bool = False,
+    record_tape: bool = True,
+    record_cmd: bool = False,
+    max_blocks: int = 0,
+    block_dim: int = 256,
 ):
     """Launch a Warp kernel on the target device
 
@@ -5338,18 +5440,23 @@ def launch(
 
     Args:
         kernel: The name of a Warp kernel function, decorated with the ``@wp.kernel`` decorator
-        dim: The number of threads to launch the kernel, can be an integer
+        dim: The number of threads to launch the kernel, can be an integer or a
+            sequence of integers with a maximum of 4 dimensions.
         inputs: The input parameters to the kernel (optional)
         outputs: The output parameters (optional)
         adj_inputs: The adjoint inputs (optional)
         adj_outputs: The adjoint outputs (optional)
-        device: The device to launch on
-        stream: The stream to launch on
-        adjoint: Whether to run forward or backward pass (typically use False)
-        record_tape: When
-
-
-
+        device: The device to launch on.
+        stream: The stream to launch on.
+        adjoint: Whether to run forward or backward pass (typically use ``False``).
+        record_tape: When ``True``, the launch will be recorded the global
+            :class:`wp.Tape() <warp.Tape>` object when present.
+        record_cmd: When ``True``, the launch will return a :class:`Launch`
+            object. The launch will not occur until the user calls
+            :meth:`Launch.launch()`.
+        max_blocks: The maximum number of CUDA thread blocks to use.
+            Only has an effect for CUDA kernel launches.
+            If negative or zero, the maximum hardware value will be used.
        block_dim: The number of threads per block.
     """
 
@@ -5370,7 +5477,7 @@ def launch(
        print(f"kernel: {kernel.key} dim: {dim} inputs: {inputs} outputs: {outputs} device: {device}")
 
     # construct launch bounds
-    bounds =
+    bounds = launch_bounds_t(dim)
 
     if bounds.size > 0:
         # first param is the number of threads
@@ -5427,6 +5534,17 @@ def launch(
                    f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
                )
 
+            if record_cmd:
+                launch = Launch(
+                    kernel=kernel,
+                    hooks=hooks,
+                    params=params,
+                    params_addr=None,
+                    bounds=bounds,
+                    device=device,
+                    adjoint=adjoint,
+                )
+                return launch
             hooks.backward(*params)
 
         else:
@@ -5437,7 +5555,13 @@ def launch(
 
             if record_cmd:
                launch = Launch(
-                    kernel=kernel,
+                    kernel=kernel,
+                    hooks=hooks,
+                    params=params,
+                    params_addr=None,
+                    bounds=bounds,
+                    device=device,
+                    adjoint=adjoint,
                )
                return launch
            else:
@@ -5464,16 +5588,30 @@ def launch(
                        f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
                    )
 
-
-
-
-
-
-
-
-
-
-
+                if record_cmd:
+                    launch = Launch(
+                        kernel=kernel,
+                        hooks=hooks,
+                        params=params,
+                        params_addr=kernel_params,
+                        bounds=bounds,
+                        device=device,
+                        max_blocks=max_blocks,
+                        block_dim=block_dim,
+                        adjoint=adjoint,
+                    )
+                    return launch
+                else:
+                    runtime.core.cuda_launch_kernel(
+                        device.context,
+                        hooks.backward,
+                        bounds.size,
+                        max_blocks,
+                        block_dim,
+                        hooks.backward_smem_bytes,
+                        kernel_params,
+                        stream.cuda_stream,
+                    )
 
        else:
            if hooks.forward is None:
@@ -5493,7 +5631,6 @@ def launch(
                    block_dim=block_dim,
                )
                return launch
-
            else:
                # launch
                runtime.core.cuda_launch_kernel(

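The Launch object documented above is obtained by recording a launch with wp.launch(..., record_cmd=True). A minimal sketch of recording and replaying a launch using the methods shown in this diff (the kernel itself is illustrative):

    import warp as wp

    @wp.kernel
    def scale(x: wp.array(dtype=float), s: float):
        tid = wp.tid()
        x[tid] = x[tid] * s

    x = wp.zeros(1024, dtype=float)

    # record the launch instead of executing it immediately
    cmd = wp.launch(scale, dim=x.shape[0], inputs=[x, 2.0], record_cmd=True)

    # replay it, updating parameters between replays as needed
    cmd.launch()
    cmd.set_param_by_name("s", 4.0)
    cmd.launch()
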
warp/examples/core/example_mesh.py
CHANGED
@@ -138,7 +138,7 @@ class Example:
                 name="mesh",
                 points=self.mesh.points.numpy(),
                 indices=self.mesh.indices.numpy(),
-                colors=(
+                colors=(0.35, 0.55, 0.9),
             )
             self.renderer.render_points(
                 name="points", points=self.positions.numpy(), radius=self.sim_margin, colors=(0.8, 0.3, 0.2)

warp/examples/core/example_wave.py
CHANGED
@@ -223,7 +223,7 @@ class Example:
             vertices = self.sim_verts.numpy()
 
             self.renderer.begin_frame(self.sim_time)
-            self.renderer.render_mesh("surface", vertices, self.indices, colors=(
+            self.renderer.render_mesh("surface", vertices, self.indices, colors=(0.35, 0.55, 0.9))
             self.renderer.render_sphere(
                 "sphere",
                 (self.cx * self.grid_size, 0.0, self.cy * self.grid_size),