warp-lang 1.5.0__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

warp/bin/libwarp.dylib CHANGED
Binary file
warp/builtins.py CHANGED
@@ -399,11 +399,11 @@ def scalar_infer_type(arg_types: Mapping[str, type]):
 
     scalar_types = set()
     for t in arg_types:
-        t = strip_reference(t)
-        if hasattr(t, "_wp_scalar_type_"):
-            scalar_types.add(t._wp_scalar_type_)
-        elif t in scalar_and_bool_types:
-            scalar_types.add(t)
+        t_val = strip_reference(t)
+        if hasattr(t_val, "_wp_scalar_type_"):
+            scalar_types.add(t_val._wp_scalar_type_)
+        elif t_val in scalar_and_bool_types:
+            scalar_types.add(t_val)
 
     if len(scalar_types) > 1:
         raise RuntimeError(
@@ -1852,6 +1852,7 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st
     step = args[2]
 
     if start is None or stop is None or step is None:
+        print(args)
         raise RuntimeError("wp.tile_arange() arguments must be compile time constants")
 
     if "dtype" in arg_values:
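
Note: the added print(args) executes just before the error is raised; wp.tile_arange() itself still requires its bounds to be compile-time constants. A minimal sketch of a valid call (tile size chosen arbitrarily):

    import warp as wp

    @wp.kernel
    def make_range():
        # start/stop are literals, so they are known at compile time
        t = wp.tile_arange(64, 128, dtype=float)
        print(t)

    wp.launch_tiled(make_range, dim=[1], inputs=[], block_dim=64)
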
@@ -2083,7 +2084,7 @@ def tile_store_1d_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "i": int, "t": Any},
+    input_types={"a": array(dtype=Any), "i": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_store_1d_value_func,
     variadic=False,
     skip_replay=True,
@@ -2132,7 +2133,7 @@ def tile_store_2d_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Any},
+    input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_store_2d_value_func,
     variadic=False,
     skip_replay=True,
@@ -2177,7 +2178,7 @@ def tile_atomic_add_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any},
+    input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_atomic_add_value_func,
     variadic=True,
     skip_replay=True,
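
Note: with the Tile(dtype=Any, M=Any, N=Any) annotations above, passing a non-tile value for the t argument of tile_store()/tile_atomic_add() should now be rejected when the kernel is built rather than failing later in codegen. A minimal sketch of the intended usage (array and tile sizes are illustrative):

    import warp as wp

    @wp.kernel
    def store_ones(a: wp.array2d(dtype=float)):
        # build a 16x16 tile of ones and store it at block offset (0, 0)
        t = wp.tile_ones(dtype=float, m=16, n=16)
        wp.tile_store(a, 0, 0, t)

    out = wp.zeros((16, 16), dtype=float)
    wp.launch_tiled(store_ones, dim=[1], inputs=[out], block_dim=64)
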
@@ -2365,7 +2366,7 @@ def untile_value_func(arg_types, arg_values):
 
 add_builtin(
     "untile",
-    input_types={"a": Any},
+    input_types={"a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=untile_value_func,
     variadic=True,
     doc="""Convert a Tile back to per-thread values.
@@ -2390,7 +2391,7 @@ add_builtin(
             t = wp.tile(i)*2
 
             # convert back to per-thread values
-            s = wp.untile()
+            s = wp.untile(t)
 
             print(s)
 
@@ -2562,7 +2563,7 @@ add_builtin(
     variadic=True,
     doc="""Broadcast a tile.
 
-    This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
+    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
 
     :param a: Tile to broadcast
     :returns: Tile with broadcast ``shape=(m, n)``""",
@@ -2654,9 +2655,9 @@ add_builtin(
             t = wp.tile_ones(dtype=float, m=16, n=16)
             s = wp.tile_sum(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
@@ -2703,18 +2704,19 @@ add_builtin(
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
+            t = wp.tile_arange(64, 128)
             s = wp.tile_min(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[-10]]
+        tile(m=1, n=1, storage=register) = [[64 ]]
 
     """,
     group="Tile Primitives",
@@ -2755,18 +2757,18 @@ add_builtin(
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
-            s = wp.tile_min(t)
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_max(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[10]]
+        tile(m=1, n=1, storage=register) = [[127 ]]
 
     """,
     group="Tile Primitives",
@@ -2796,7 +2798,7 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any,
 
 add_builtin(
     "tile_reduce",
-    input_types={"op": Callable, "a": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_reduce_value_func,
     native_func="tile_reduce",
     doc="""Apply a custom reduction operator across the tile.
@@ -2819,7 +2821,7 @@ add_builtin(
 
             print(s)
 
-        wp.launch(factorial, dim=[16], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -2856,7 +2858,7 @@ def tile_unary_map_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_unary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -2881,7 +2883,7 @@ add_builtin(
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -2923,7 +2925,7 @@ def tile_binary_map_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Any, "b": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_binary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -2952,7 +2954,7 @@ add_builtin(
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -4665,6 +4667,19 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str,
     return arr_type.dtype
 
 
+def atomic_op_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]):
+    # as this is a codegen callback, we can mark the fact that this func writes to an array here
+    if warp.config.verify_autograd_array_access:
+        arr = args["arr"]
+        arr.mark_write()
+
+    func_args = tuple(args.values())
+    # we don't need to specify template arguments for atomic ops
+    template_args = ()
+
+    return (func_args, template_args)
+
+
 for array_type in array_types:
     # don't list indexed array operations explicitly in docs
     hidden = array_type == indexedarray
@@ -4675,6 +4690,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically add ``value`` onto ``arr[i]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4685,6 +4701,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically add ``value`` onto ``arr[i,j]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4695,6 +4712,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4705,6 +4723,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4716,6 +4735,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically subtract ``value`` onto ``arr[i]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4726,6 +4746,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4736,6 +4757,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4746,6 +4768,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value.",
         group="Utility",
         skip_replay=True,
@@ -4757,6 +4780,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4769,6 +4793,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4781,6 +4806,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4793,6 +4819,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4806,6 +4833,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4818,6 +4846,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4830,6 +4859,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
@@ -4842,6 +4872,7 @@ for array_type in array_types:
         input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any},
         constraint=atomic_op_constraint,
         value_func=atomic_op_value_func,
+        dispatch_func=atomic_op_dispatch_func,
         doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.
 
         The operation is only atomic on a per-component basis for vectors and matrices.""",
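
Note: routing every wp.atomic_add/atomic_sub/atomic_min/atomic_max overload through atomic_op_dispatch_func means those calls are now recorded as array writes when autograd array-access verification is enabled. A hedged sketch of turning the check on (the kernel body is illustrative):

    import warp as wp

    wp.config.verify_autograd_array_access = True

    @wp.kernel
    def accumulate(src: wp.array(dtype=float), dst: wp.array(dtype=float)):
        i = wp.tid()
        # the atomic add is now marked as a write to dst during code generation
        wp.atomic_add(dst, 0, src[i])
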
warp/codegen.py CHANGED
@@ -1175,25 +1175,25 @@ class Adjoint:
         left = adj.load(left)
         s = output.emit() + " = " + ("(" * len(comps)) + left.emit() + " "
 
-        prev_comp = None
+        prev_comp_var = None
 
         for op, comp in zip(op_strings, comps):
             comp_chainable = op_str_is_chainable(op)
-            if comp_chainable and prev_comp:
-                # We restrict chaining to operands of the same type
-                if prev_comp.type is comp.type:
-                    prev_comp = adj.load(prev_comp)
-                    comp = adj.load(comp)
-                    s += "&& (" + prev_comp.emit() + " " + op + " " + comp.emit() + ")) "
+            if comp_chainable and prev_comp_var:
+                # We restrict chaining to operands of the same type
+                if prev_comp_var.type is comp.type:
+                    prev_comp_var = adj.load(prev_comp_var)
+                    comp_var = adj.load(comp)
+                    s += "&& (" + prev_comp_var.emit() + " " + op + " " + comp_var.emit() + ")) "
                 else:
                     raise WarpCodegenTypeError(
-                        f"Cannot chain comparisons of unequal types: {prev_comp.type} {op} {comp.type}."
+                        f"Cannot chain comparisons of unequal types: {prev_comp_var.type} {op} {comp.type}."
                     )
             else:
-                comp = adj.load(comp)
-                s += op + " " + comp.emit() + ") "
+                comp_var = adj.load(comp)
+                s += op + " " + comp_var.emit() + ") "
 
-            prev_comp = comp
+            prev_comp_var = comp_var
 
         s = s.rstrip() + ";"
 
@@ -1366,13 +1366,15 @@ class Adjoint:
         fwd_args = []
         for func_arg in func_args:
             if not isinstance(func_arg, (Reference, warp.context.Function)):
-                func_arg = adj.load(func_arg)
+                func_arg_var = adj.load(func_arg)
+            else:
+                func_arg_var = func_arg
 
             # if the argument is a function (and not a builtin), then build it recursively
-            if isinstance(func_arg, warp.context.Function) and not func_arg.is_builtin():
-                adj.builder.build_function(func_arg)
+            if isinstance(func_arg_var, warp.context.Function) and not func_arg_var.is_builtin():
+                adj.builder.build_function(func_arg_var)
 
-            fwd_args.append(strip_reference(func_arg))
+            fwd_args.append(strip_reference(func_arg_var))
 
         if return_type is None:
             # handles expression (zero output) functions, e.g.: void do_something();
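
Note: the two hunks above only rename locals so that loop variables are no longer reused; chained comparisons in kernels still lower to a conjunction of pairwise tests over operands of the same type, e.g.:

    import warp as wp

    @wp.kernel
    def in_unit_range(values: wp.array(dtype=float)):
        i = wp.tid()
        # emitted as (0.0 < values[i]) && (values[i] < 1.0)
        if 0.0 < values[i] < 1.0:
            wp.printf("in range: %f\n", values[i])
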
@@ -2569,8 +2571,10 @@ class Adjoint:
         adj.return_var = ()
         for ret in var:
             if is_reference(ret.type):
-                ret = adj.add_builtin_call("copy", [ret])
-            adj.return_var += (ret,)
+                ret_var = adj.add_builtin_call("copy", [ret])
+            else:
+                ret_var = ret
+            adj.return_var += (ret_var,)
 
         adj.add_return(adj.return_var)
 
warp/config.py CHANGED
@@ -7,7 +7,7 @@
 
 from typing import Optional
 
-version: str = "1.5.0"
+version: str = "1.5.1"
 """Warp version string"""
 
 verify_fp: bool = False
warp/context.py CHANGED
@@ -7,6 +7,7 @@
 
 import ast
 import ctypes
+import errno
 import functools
 import hashlib
 import inspect
@@ -17,6 +18,7 @@ import operator
 import os
 import platform
 import sys
+import time
 import types
 import typing
 import weakref
@@ -238,24 +240,23 @@ class Function:
         # in a way that is compatible with Python's semantics.
         signature_params = []
         signature_default_param_kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
-        for param_name in self.input_types.keys():
-            if param_name.startswith("**"):
-                param_name = param_name[2:]
+        for raw_param_name in self.input_types.keys():
+            if raw_param_name.startswith("**"):
+                param_name = raw_param_name[2:]
                 param_kind = inspect.Parameter.VAR_KEYWORD
-            elif param_name.startswith("*"):
-                param_name = param_name[1:]
+            elif raw_param_name.startswith("*"):
+                param_name = raw_param_name[1:]
                 param_kind = inspect.Parameter.VAR_POSITIONAL
 
                 # Once a variadic argument like `*args` is found, any following
                 # arguments need to be passed using keywords.
                 signature_default_param_kind = inspect.Parameter.KEYWORD_ONLY
             else:
+                param_name = raw_param_name
                 param_kind = signature_default_param_kind
 
-            param = param = inspect.Parameter(
-                param_name,
-                param_kind,
-                default=self.defaults.get(param_name, inspect.Parameter.empty),
+            param = inspect.Parameter(
+                param_name, param_kind, default=self.defaults.get(param_name, inspect.Parameter.empty)
             )
             signature_params.append(param)
         self.signature = inspect.Signature(signature_params)
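
Note: the inspect.Signature built here, together with the overload changes in the next hunk, lets user-defined overloads be resolved when called from Python with keyword arguments (previously a RuntimeError). A hedged sketch, assuming the overloads are invoked from Python scope:

    import warp as wp

    @wp.func
    def scale(x: float, factor: float):
        return x * factor

    @wp.func
    def scale(v: wp.vec3, factor: float):
        return v * factor

    # keyword arguments are now bound against the overload signatures
    print(scale(x=2.0, factor=3.0))
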
@@ -294,22 +295,22 @@ class Function:
 
         if hasattr(self, "user_overloads") and len(self.user_overloads):
             # user-defined function with overloads
+            bound_args = self.signature.bind(*args, **kwargs)
+            if self.defaults:
+                warp.codegen.apply_defaults(bound_args, self.defaults)
 
-            if len(kwargs):
-                raise RuntimeError(
-                    f"Error calling function '{self.key}', keyword arguments are not supported for user-defined overloads."
-                )
+            arguments = tuple(bound_args.arguments.values())
 
             # try and find a matching overload
             for overload in self.user_overloads.values():
-                if len(overload.input_types) != len(args):
+                if len(overload.input_types) != len(arguments):
                     continue
                 template_types = list(overload.input_types.values())
                 arg_names = list(overload.input_types.keys())
                 try:
                     # attempt to unify argument types with function template types
-                    warp.types.infer_argument_types(args, template_types, arg_names)
-                    return overload.func(*args)
+                    warp.types.infer_argument_types(arguments, template_types, arg_names)
+                    return overload.func(*arguments)
                 except Exception:
                     continue
 
@@ -509,11 +510,10 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
             if elem_count != arg_type._length_:
                 return (False, None)
 
-            # Retrieve the element type of the sequence while ensuring
-            # that it's homogeneous.
+            # Retrieve the element type of the sequence while ensuring that it's homogeneous.
             elem_type = type(arr[0])
-            for i in range(1, elem_count):
-                if type(arr[i]) is not elem_type:
+            for array_index in range(1, elem_count):
+                if type(arr[array_index]) is not elem_type:
                     raise ValueError("All array elements must share the same type.")
 
             expected_elem_type = arg_type._wp_scalar_type_
@@ -543,10 +543,10 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
             c_param = arg_type()
             if warp.types.type_is_matrix(arg_type):
                 rows, cols = arg_type._shape_
-                for i in range(rows):
-                    idx_start = i * cols
+                for row_index in range(rows):
+                    idx_start = row_index * cols
                     idx_end = idx_start + cols
-                    c_param[i] = arr[idx_start:idx_end]
+                    c_param[row_index] = arr[idx_start:idx_end]
             else:
                 c_param[:] = arr
 
@@ -1239,16 +1239,16 @@ def add_builtin(
         typelists.append(l)
 
     for arg_types in itertools.product(*typelists):
-        arg_types = dict(zip(input_types.keys(), arg_types))
+        concrete_arg_types = dict(zip(input_types.keys(), arg_types))
 
         # Some of these argument lists won't work, eg if the function is mul(), we won't be
         # able to do a matrix vector multiplication for a mat22 and a vec3. The `constraint`
         # function determines which combinations are valid:
         if constraint:
-            if constraint(arg_types) is False:
+            if constraint(concrete_arg_types) is False:
                 continue
 
-        return_type = value_func(arg_types, None)
+        return_type = value_func(concrete_arg_types, None)
 
         # The return_type might just be vector_t(length=3,dtype=wp.float32), so we've got to match that
         # in the list of hard coded types so it knows it's returning one of them:
@@ -1266,7 +1266,7 @@ add_builtin(
         # finally we can generate a function call for these concrete types:
         add_builtin(
             key,
-            input_types=arg_types,
+            input_types=concrete_arg_types,
             value_type=return_type,
             value_func=value_func if return_type is Any else None,
             export_func=export_func,
@@ -2133,12 +2133,34 @@ class Module:
         # -----------------------------------------------------------
         # update cache
 
-        try:
-            # Copy process-specific build directory to a process-independent location
-            os.rename(build_dir, module_dir)
-        except (OSError, FileExistsError):
-            # another process likely updated the module dir first
-            pass
+        def safe_rename(src, dst, attempts=5, delay=0.1):
+            for i in range(attempts):
+                try:
+                    os.rename(src, dst)
+                    return
+                except FileExistsError:
+                    return
+                except OSError as e:
+                    if e.errno == errno.ENOTEMPTY:
+                        # if directory exists we assume another process
+                        # got there first, in which case we will copy
+                        # our output to the directory manually in second step
+                        return
+                    else:
+                        # otherwise assume directory creation failed e.g.: access denied
+                        # on Windows we see occasional failures to rename directories due to
+                        # some process holding a lock on a file to be moved to workaround
+                        # this we make multiple attempts to rename with some delay
+                        if i < attempts - 1:
+                            time.sleep(delay)
+                        else:
+                            print(
+                                f"Could not update Warp cache with module binaries, trying to rename {build_dir} to {module_dir}, error {e}"
+                            )
+                            raise e
+
+        # try to move process outputs to cache
+        safe_rename(build_dir, module_dir)
 
         if os.path.exists(module_dir):
             if not os.path.exists(binary_path):
@@ -4074,7 +4096,7 @@ def set_mempool_enabled(device: Devicelike, enable: bool) -> None:
     They should generally be enabled, but there is a rare caveat. Copying data between different GPUs
     may fail during graph capture if the memory was allocated using pooled allocators and memory pool
     access is not enabled between the two GPUs. This is an internal CUDA limitation that is not related
-    to Warp. The preferred solution is to enable memory pool access using `warp.set_mempool_access_enabled()`.
+    to Warp. The preferred solution is to enable memory pool access using :func:`set_mempool_access_enabled`.
     If peer access is not supported, then the default CUDA allocators must be used to pre-allocate the memory
     prior to graph capture.
     """
@@ -5272,6 +5294,8 @@ def launch(
             params_addr=kernel_params,
             bounds=bounds,
             device=device,
+            max_blocks=max_blocks,
+            block_dim=block_dim,
         )
         return launch
 
@@ -5355,7 +5379,7 @@ def launch_tiled(*args, **kwargs):
         kwargs["dim"] = dim + [kwargs["block_dim"]]
 
     # forward to original launch method
-    launch(*args, **kwargs)
+    return launch(*args, **kwargs)
 
 
 def synchronize():
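
Note: launch_tiled() now forwards the value returned by launch(), so a recorded launch can be captured and replayed; a hedged sketch, assuming record_cmd behaves as it does for wp.launch():

    import warp as wp

    @wp.kernel
    def compute():
        t = wp.tile_ones(dtype=float, m=16, n=16)
        s = wp.tile_sum(t)
        print(s)

    # the object returned by the recorded launch can now be captured
    cmd = wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64, record_cmd=True)
    cmd.launch()
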
@@ -100,7 +100,6 @@ class Example:
             tri_ka=1e4,
             tri_kd=1e-5,
             edge_ke=100,
-            color_particles=True,
         )
 
         usd_stage = Usd.Stage.Open(os.path.join(warp.examples.get_asset_directory(), "bunny.usd"))
@@ -122,6 +121,9 @@ class Example:
             kf=1.0e1,
         )
 
+        if self.integrator_type == IntegratorType.VBD:
+            builder.color()
+
         self.model = builder.finalize()
         self.model.ground = True
         self.model.soft_contact_ke = 1.0e4
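
Note: the cloth example now colors particles on the builder only when the VBD integrator is selected, replacing the removed color_particles flag. A condensed sketch of that ordering (builder setup is abbreviated):

    builder = wp.sim.ModelBuilder()
    # ... add cloth and collider geometry ...

    if integrator_type == IntegratorType.VBD:
        # VBD needs per-particle graph coloring, computed before finalize()
        builder.color()

    model = builder.finalize()
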
@@ -59,7 +59,6 @@ class Geometry:
     SideIndexArg: wp.codegen.Struct
     """Structure containing arguments to be passed to device functions for indexing sides"""
 
-    @staticmethod
     def cell_arg_value(self, device) -> "Geometry.CellArg":
         """Value of the arguments to be passed to cell-related device functions"""
         raise NotImplementedError
@@ -107,7 +106,6 @@ class Geometry:
         For elements with the same dimension as the embedding space, this will be zero."""
         raise NotImplementedError
 
-    @staticmethod
     def side_arg_value(self, device) -> "Geometry.SideArg":
         """Value of the arguments to be passed to side-related device functions"""
         raise NotImplementedError
warp/native/coloring.cpp CHANGED
@@ -590,7 +590,11 @@ extern "C"
         if (num_colors > 1) {
             std::vector<std::vector<int>> color_groups;
             convert_to_color_groups(num_colors, graph.node_colors, color_groups);
-            return balance_color_groups(target_max_min_ratio, graph, color_groups);
+
+            float max_min_ratio = balance_color_groups(target_max_min_ratio, graph, color_groups);
+            memcpy(node_colors.data, graph.node_colors.data(), num_nodes * sizeof(int));
+
+            return max_min_ratio;
         }
         else
         {