warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build_dll.py +5 -0
- warp/codegen.py +15 -3
- warp/config.py +1 -1
- warp/context.py +122 -24
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/fem/field/virtual.py +2 -0
- warp/fem/integrate.py +78 -47
- warp/jax_experimental/ffi.py +201 -53
- warp/native/array.h +4 -4
- warp/native/builtin.h +8 -4
- warp/native/coloring.cpp +5 -1
- warp/native/cuda_util.cpp +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +3 -3
- warp/native/mesh.h +1 -1
- warp/native/quat.h +6 -2
- warp/native/rand.h +7 -7
- warp/native/sparse.cu +1 -1
- warp/native/svd.h +23 -8
- warp/native/tile.h +20 -1
- warp/native/tile_radix_sort.h +5 -1
- warp/native/tile_reduce.h +16 -25
- warp/native/tuple.h +2 -2
- warp/native/vec.h +4 -4
- warp/native/warp.cpp +1 -1
- warp/native/warp.cu +15 -2
- warp/native/warp.h +1 -1
- warp/render/render_opengl.py +52 -51
- warp/render/render_usd.py +0 -1
- warp/sim/collide.py +1 -2
- warp/sim/integrator_vbd.py +10 -2
- warp/sparse.py +1 -1
- warp/tape.py +2 -0
- warp/tests/sim/test_cloth.py +89 -6
- warp/tests/sim/test_coloring.py +76 -1
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +127 -114
- warp/tests/test_mat.py +22 -0
- warp/tests/test_quat.py +22 -0
- warp/tests/test_sparse.py +32 -0
- warp/tests/test_static.py +48 -0
- warp/tests/test_tape.py +38 -0
- warp/tests/test_vec.py +38 -408
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +31 -143
- warp/tests/tile/test_tile_mathdx.py +2 -2
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_reduce.py +100 -11
- warp/tests/tile/test_tile_shared_memory.py +12 -12
- warp/tests/tile/test_tile_sort.py +59 -55
- warp/tests/unittest_suites.py +10 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/METADATA +4 -4
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/RECORD +59 -57
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
warp/bin/warp-clang.so
CHANGED
Binary file
warp/bin/warp.so
CHANGED
Binary file
warp/build_dll.py
CHANGED
@@ -227,6 +227,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[
             "-gencode=arch=compute_61,code=sm_61",
             "-gencode=arch=compute_70,code=sm_70",  # Volta
             "-gencode=arch=compute_75,code=sm_75",  # Turing
+            "-gencode=arch=compute_75,code=compute_75",  # Turing (PTX)
             "-gencode=arch=compute_80,code=sm_80",  # Ampere
             "-gencode=arch=compute_86,code=sm_86",
         ]
@@ -260,6 +261,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, arch, libs: Optional[
             "--cuda-gpu-arch=sm_87",  # Orin
         ]

+        if ctk_version >= (12, 8):
+            gencode_opts += ["-gencode=arch=compute_101,code=sm_101"]  # Thor (CUDA 12 numbering)
+            clang_arch_flags += ["--cuda-gpu-arch=sm_101"]
+
         if ctk_version >= (12, 8):
             # Support for Blackwell is available with CUDA Toolkit 12.8+
             gencode_opts += [
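The Thor-related gencode additions above only take effect when the wheel is compiled against CUDA Toolkit 12.8 or newer. As a quick sanity check, here is a minimal sketch (assuming a CUDA-capable machine and the public wp.get_cuda_devices() API) that prints which SM architecture Warp detects for each device, i.e. the same arch numbers the -gencode flags target:

    import warp as wp

    wp.init()

    # Print the SM architecture Warp detects for each CUDA device, e.g. 87 (Orin),
    # 101 (Thor under the CUDA 12.x numbering used above), 120 (Blackwell).
    for d in wp.get_cuda_devices():
        print(d.name, d.arch)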
warp/codegen.py
CHANGED
@@ -616,6 +616,8 @@ def compute_type_str(base_name, template_params):
     def param2str(p):
         if isinstance(p, int):
             return str(p)
+        elif hasattr(p, "_wp_generic_type_str_"):
+            return compute_type_str(f"wp::{p._wp_generic_type_str_}", p._wp_type_params_)
         elif hasattr(p, "_type_"):
             if p.__name__ == "bool":
                 return "bool"
@@ -967,6 +969,11 @@ class Adjoint:
             # this is to avoid registering false references to overshadowed modules
             adj.symbols[name] = arg

+        # Indicates whether there are unresolved static expressions in the function.
+        # These stem from wp.static() expressions that could not be evaluated at declaration time.
+        # This will signal to the module builder that this module needs to be rebuilt even if the module hash is unchanged.
+        adj.has_unresolved_static_expressions = False
+
         # try to replace static expressions by their constant result if the
         # expression can be evaluated at declaration time
         adj.static_expressions: dict[str, Any] = {}
@@ -2322,8 +2329,9 @@

         if adj.is_static_expression(func):
             # try to evaluate wp.static() expressions
-            obj,
+            obj, code = adj.evaluate_static_expression(node)
             if obj is not None:
+                adj.static_expressions[code] = obj
                 if isinstance(obj, warp.context.Function):
                     # special handling for wp.static() evaluating to a function
                     return obj
@@ -3109,6 +3117,7 @@

         # Since this is an expression, we can enforce it to be defined on a single line.
         static_code = static_code.replace("\n", "")
+        code_to_eval = static_code  # code to be evaluated

         vars_dict = adj.get_static_evaluation_context()
         # add constant variables to the static call context
@@ -3150,10 +3159,10 @@
                 loc = end

             new_static_code += static_code[len_value_locs[-1][2] :]
-
+            code_to_eval = new_static_code

         try:
-            value = eval(
+            value = eval(code_to_eval, vars_dict)
             if warp.config.verbose:
                 print(f"Evaluated static command: {static_code} = {value}")
         except NameError as e:
@@ -3206,6 +3215,9 @@
             # (and is therefore not executable and raises this exception), in which
             # case changing the constant, or the code affecting this constant, would lead to
             # a different module hash anyway.
+            # In any case, we mark this Adjoint to have unresolvable static expressions.
+            # This will trigger a code generation step even if the module hash is unchanged.
+            adj.has_unresolved_static_expressions = True
            pass

        return self.generic_visit(node)
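The new has_unresolved_static_expressions flag targets wp.static() expressions that raise a NameError at kernel declaration time and can only be resolved once the module is built. Below is a minimal sketch of that pattern; the kernel name add_offset and the constant OFFSET are illustrative only, chosen to show a symbol that is defined after the kernel is declared:

    import warp as wp


    @wp.kernel
    def add_offset(a: wp.array(dtype=float)):
        tid = wp.tid()
        # OFFSET is not defined yet when this kernel is declared, so the static
        # expression cannot be evaluated here; the new flag ensures code generation
        # still runs for this module even when the module hash is unchanged.
        a[tid] = a[tid] + wp.static(OFFSET)


    OFFSET = 2.0

    a = wp.zeros(8, dtype=float)
    wp.launch(add_offset, dim=a.size, inputs=[a])
    print(a.numpy())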
warp/config.py
CHANGED
warp/context.py
CHANGED
@@ -1692,7 +1692,7 @@ class ModuleHasher:
             ch.update(bytes(name, "utf-8"))
             ch.update(self.get_constant_bytes(value))

-        # hash wp.static() expressions
+        # hash wp.static() expressions
         for k, v in adj.static_expressions.items():
             ch.update(bytes(k, "utf-8"))
             if isinstance(v, Function):
@@ -2011,6 +2011,9 @@
         # is retained and later reloaded with the same hash.
         self.cpu_exec_id = 0

+        # Indicates whether the module has functions or kernels with unresolved static expressions.
+        self.has_unresolved_static_expressions = False
+
         self.options = {
             "max_unroll": warp.config.max_unroll,
             "enable_backward": warp.config.enable_backward,
@@ -2018,7 +2021,7 @@
             "fuse_fp": True,
             "lineinfo": warp.config.lineinfo,
             "cuda_output": None,  # supported values: "ptx", "cubin", or None (automatic)
-            "mode":
+            "mode": None,
             "block_dim": 256,
             "compile_time_trace": warp.config.compile_time_trace,
         }
@@ -2047,6 +2050,10 @@
         # track all kernel objects, even if they are duplicates
         self._live_kernels.add(kernel)

+        # Check for unresolved static expressions in the kernel.
+        if kernel.adj.has_unresolved_static_expressions:
+            self.has_unresolved_static_expressions = True
+
         self.find_references(kernel.adj)

         # for a reload of module on next launch
@@ -2106,6 +2113,10 @@
                     del func_existing.user_overloads[k]
                 func_existing.add_overload(func)

+        # Check for unresolved static expressions in the function.
+        if func.adj.has_unresolved_static_expressions:
+            self.has_unresolved_static_expressions = True
+
         self.find_references(func.adj)

         # for a reload of module on next launch
@@ -2165,7 +2176,7 @@
             self.hashers[block_dim] = ModuleHasher(self)
         return self.hashers[block_dim].get_module_hash()

-    def load(self, device, block_dim=None) -> ModuleExec:
+    def load(self, device, block_dim=None) -> ModuleExec | None:
         device = runtime.get_device(device)

         # update module options if launching with a new block dim
@@ -2174,6 +2185,20 @@

         active_block_dim = self.options["block_dim"]

+        if self.has_unresolved_static_expressions:
+            # The module hash currently does not account for unresolved static expressions
+            # (only static expressions evaluated at declaration time so far).
+            # We need to generate the code for the functions and kernels that have
+            # unresolved static expressions and then compute the module hash again.
+            builder_options = {
+                **self.options,
+                "output_arch": None,
+            }
+            # build functions, kernels to resolve static expressions
+            _ = ModuleBuilder(self, builder_options)
+
+            self.has_unresolved_static_expressions = False
+
         # compute the hash if needed
         if active_block_dim not in self.hashers:
             self.hashers[active_block_dim] = ModuleHasher(self)
@@ -2262,6 +2287,8 @@

             module_load_timer.extra_msg = " (compiled)"  # For wp.ScopedTimer informational purposes

+            mode = self.options["mode"] if self.options["mode"] is not None else warp.config.mode
+
             # build CPU
             if device.is_cpu:
                 # build
@@ -2281,7 +2308,7 @@
                     warp.build.build_cpu(
                         output_path,
                         source_code_path,
-                        mode=
+                        mode=mode,
                         fast_math=self.options["fast_math"],
                         verify_fp=warp.config.verify_fp,
                         fuse_fp=self.options["fuse_fp"],
@@ -2311,7 +2338,7 @@
                         source_code_path,
                         output_arch,
                         output_path,
-                        config=
+                        config=mode,
                         verify_fp=warp.config.verify_fp,
                         fast_math=self.options["fast_math"],
                         fuse_fp=self.options["fuse_fp"],
@@ -3759,6 +3786,7 @@ class Runtime:
         self.core.cuda_graph_end_capture.restype = ctypes.c_bool

         self.core.cuda_graph_create_exec.argtypes = [
+            ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.c_void_p,
             ctypes.POINTER(ctypes.c_void_p),
@@ -4066,9 +4094,14 @@
             # Update the default PTX architecture based on devices present in the system.
             # Use the lowest architecture among devices that meet the minimum architecture requirement.
             # Devices below the required minimum will use the highest architecture they support.
-
-
-
+            try:
+                self.default_ptx_arch = min(
+                    d.arch
+                    for d in self.cuda_devices
+                    if d.arch >= self.default_ptx_arch and d.arch in self.nvrtc_supported_archs
+                )
+            except ValueError:
+                pass  # no eligible NVRTC-supported arch ≥ default, retain existing
         else:
             # CUDA not available
             self.set_default_device("cpu")
@@ -6255,6 +6288,40 @@ def get_module_options(module: Any = None) -> dict[str, Any]:
     return get_module(m.__name__).options


+def _unregister_capture(device: Device, stream: Stream, graph: Graph):
+    """Unregister a graph capture from the device and runtime.
+
+    This should be called when a graph capture is no longer active, either because it completed or was paused.
+    The graph should only be registered while it is actively capturing.
+
+    Args:
+        device: The CUDA device the graph was being captured on
+        stream: The CUDA stream the graph was being captured on
+        graph: The Graph object that was being captured
+    """
+    del device.captures[stream]
+    del runtime.captures[graph.capture_id]
+
+
+def _register_capture(device: Device, stream: Stream, graph: Graph, capture_id: int):
+    """Register a graph capture with the device and runtime.
+
+    Makes the graph discoverable through its capture_id so that retain_module_exec() can be called
+    when launching kernels during graph capture. This ensures modules are retained until graph execution completes.
+
+    Args:
+        device: The CUDA device the graph is being captured on
+        stream: The CUDA stream the graph is being captured on
+        graph: The Graph object being captured
+        capture_id: Unique identifier for this graph capture
+    """
+    # add to ongoing captures on the device
+    device.captures[stream] = graph
+
+    # add to lookup table by globally unique capture id
+    runtime.captures[capture_id] = graph
+
+
 def capture_begin(
     device: Devicelike = None,
     stream: Stream | None = None,
@@ -6320,11 +6387,7 @@ def capture_begin(
     capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
     graph = Graph(device, capture_id)

-
-    device.captures[stream] = graph
-
-    # add to lookup table by globally unique capture id
-    runtime.captures[capture_id] = graph
+    _register_capture(device, stream, graph, capture_id)


 def capture_end(device: Devicelike = None, stream: Stream | None = None) -> Graph:
@@ -6352,8 +6415,7 @@ def capture_end(device: Devicelike = None, stream: Stream | None = None) -> Graph:
     if graph is None:
         raise RuntimeError("Graph capture is not active on this stream")

-
-    del runtime.captures[graph.capture_id]
+    _unregister_capture(device, stream, graph)

     # get the graph executable
     g = ctypes.c_void_p()
@@ -6393,7 +6455,7 @@ def assert_conditional_graph_support():
         raise RuntimeError("Conditional graph nodes require CUDA driver 12.4+")


-def capture_pause(device: Devicelike = None, stream: Stream | None = None) ->
+def capture_pause(device: Devicelike = None, stream: Stream | None = None) -> Graph:
     if stream is not None:
         device = stream.device
     else:
@@ -6402,14 +6464,24 @@ def capture_pause(device: Devicelike = None, stream: Stream | None = None) -> ct
             raise RuntimeError("Must be a CUDA device")
         stream = device.stream

-    graph
-
+    # get the graph being captured
+    graph = device.captures.get(stream)
+
+    if graph is None:
+        raise RuntimeError("Graph capture is not active on this stream")
+
+    _unregister_capture(device, stream, graph)
+
+    g = ctypes.c_void_p()
+    if not runtime.core.cuda_graph_pause_capture(device.context, stream.cuda_stream, ctypes.byref(g)):
         raise RuntimeError(runtime.get_error_string())

+    graph.graph = g
+
     return graph


-def capture_resume(graph:
+def capture_resume(graph: Graph, device: Devicelike = None, stream: Stream | None = None):
     if stream is not None:
         device = stream.device
     else:
@@ -6418,9 +6490,14 @@ def capture_resume(graph: ctypes.c_void_p, device: Devicelike = None, stream: St
             raise RuntimeError("Must be a CUDA device")
         stream = device.stream

-    if not runtime.core.cuda_graph_resume_capture(device.context, stream.cuda_stream, graph):
+    if not runtime.core.cuda_graph_resume_capture(device.context, stream.cuda_stream, graph.graph):
         raise RuntimeError(runtime.get_error_string())

+    capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
+    graph.capture_id = capture_id
+
+    _register_capture(device, stream, graph, capture_id)
+

 # reusable pinned readback buffer for conditions
 condition_host = None
@@ -6518,10 +6595,15 @@ def capture_if(

     # pause capturing parent graph
     main_graph = capture_pause(stream=stream)
+    # store the pointer to the cuda graph to restore it later
+    main_graph_ptr = main_graph.graph

     # capture if-graph
     if on_true is not None:
-
+        # temporarily repurpose the main_graph python object such that all dependencies
+        # added through retain_module_exec() end up in the correct python graph object
+        main_graph.graph = graph_on_true
+        capture_resume(main_graph, stream=stream)
         if isinstance(on_true, Callable):
             on_true(**kwargs)
         elif isinstance(on_true, Graph):
@@ -6541,7 +6623,10 @@

     # capture else-graph
     if on_false is not None:
-
+        # temporarily repurpose the main_graph python object such that all dependencies
+        # added through retain_module_exec() end up in the correct python graph object
+        main_graph.graph = graph_on_false
+        capture_resume(main_graph, stream=stream)
         if isinstance(on_false, Callable):
             on_false(**kwargs)
         elif isinstance(on_false, Graph):
@@ -6559,6 +6644,9 @@
             raise TypeError("on_false must be a Callable or a Graph")
        capture_pause(stream=stream)

+    # restore the main graph to its original state
+    main_graph.graph = main_graph_ptr
+
     # resume capturing parent graph
     capture_resume(main_graph, stream=stream)

@@ -6641,7 +6729,13 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph

     # pause capturing parent graph and start capturing child graph
     main_graph = capture_pause(stream=stream)
-
+    # store the pointer to the cuda graph to restore it later
+    main_graph_ptr = main_graph.graph
+
+    # temporarily repurpose the main_graph python object such that all dependencies
+    # added through retain_module_exec() end up in the correct python graph object
+    main_graph.graph = body_graph
+    capture_resume(main_graph, stream=stream)

     # capture while-body
     if isinstance(while_body, Callable):
@@ -6670,6 +6764,8 @@ def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph

     # stop capturing child graph and resume capturing parent graph
     capture_pause(stream=stream)
+    # restore the main graph to its original state
+    main_graph.graph = main_graph_ptr
     capture_resume(main_graph, stream=stream)


@@ -6691,7 +6787,9 @@ def capture_launch(graph: Graph, stream: Stream | None = None):

     if graph.graph_exec is None:
         g = ctypes.c_void_p()
-        result = runtime.core.cuda_graph_create_exec(
+        result = runtime.core.cuda_graph_create_exec(
+            graph.device.context, stream.cuda_stream, graph.graph, ctypes.byref(g)
+        )
         if not result:
             raise RuntimeError(f"Graph creation error: {runtime.get_error_string()}")
         graph.graph_exec = g
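The capture registration helpers and the Graph-typed capture_pause()/capture_resume() signatures above mainly serve wp.capture_if() and wp.capture_while(), which splice conditional child graphs into a parent capture (conditional graph nodes require CUDA driver 12.4+, per assert_conditional_graph_support()). Below is a minimal sketch of the public API built on top of these helpers; the kernel, the array names, and the lambda-based branch body are illustrative and assume a CUDA device is available:

    import warp as wp


    @wp.kernel
    def scale(a: wp.array(dtype=float), s: float):
        tid = wp.tid()
        a[tid] = a[tid] * s


    device = "cuda:0"
    cond = wp.ones(1, dtype=int, device=device)  # nonzero selects the on_true branch
    data = wp.ones(16, dtype=float, device=device)

    with wp.ScopedCapture(device=device) as capture:
        # capture_if() pauses the parent capture, records the branch into a child
        # graph, then resumes the parent; the bookkeeping above keeps the Graph
        # object's capture_id and retained modules consistent across pause/resume.
        wp.capture_if(
            cond,
            on_true=lambda: wp.launch(scale, dim=data.size, inputs=[data, 2.0], device=device),
        )

    wp.capture_launch(capture.graph)
    print(data.numpy())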
warp/examples/interop/example_jax_callable.py
CHANGED

@@ -42,7 +42,7 @@ def scale_vec_kernel(a: wp.array(dtype=wp.vec2), s: float, output: wp.array(dtyp

 # The Python function to call.
 # Note the argument annotations, just like Warp kernels.
-def
+def scale_func(
     # inputs
     a: wp.array(dtype=float),
     b: wp.array(dtype=wp.vec2),
@@ -55,8 +55,23 @@ def example_func(
     wp.launch(scale_vec_kernel, dim=b.shape, inputs=[b, s], outputs=[d])


+@wp.kernel
+def accum_kernel(a: wp.array(dtype=float), b: wp.array(dtype=float)):
+    tid = wp.tid()
+    b[tid] += a[tid]
+
+
+def in_out_func(
+    a: wp.array(dtype=float),  # input only
+    b: wp.array(dtype=float),  # input and output
+    c: wp.array(dtype=float),  # output only
+):
+    wp.launch(scale_kernel, dim=a.size, inputs=[a, 2.0], outputs=[c])
+    wp.launch(accum_kernel, dim=a.size, inputs=[a, b])  # modifies `b`
+
+
 def example1():
-    jax_func = jax_callable(
+    jax_func = jax_callable(scale_func, num_outputs=2)

     @jax.jit
     def f():
@@ -78,7 +93,7 @@ def example1():


 def example2():
-    jax_func = jax_callable(
+    jax_func = jax_callable(scale_func, num_outputs=2)

     # NOTE: scalar arguments must be static compile-time constants
     @partial(jax.jit, static_argnames=["s"])
@@ -100,11 +115,26 @@ def example2():
     print(r2)


+def example3():
+    # Using input-output arguments
+
+    jax_func = jax_callable(in_out_func, num_outputs=2, in_out_argnames=["b"])
+
+    f = jax.jit(jax_func)
+
+    a = jnp.ones(10, dtype=jnp.float32)
+    b = jnp.arange(10, dtype=jnp.float32)
+
+    b, c = f(a, b)
+    print(b)
+    print(c)
+
+
 def main():
     wp.init()
     wp.load_module(device=wp.get_device())

-    examples = [example1, example2]
+    examples = [example1, example2, example3]

     for example in examples:
         print("\n===========================================================================")
warp/examples/interop/example_jax_kernel.py
CHANGED

@@ -72,6 +72,17 @@ def scale_vec_kernel(a: wp.array(dtype=wp.vec2), s: float, output: wp.array(dtyp
     output[tid] = a[tid] * s


+@wp.kernel
+def in_out_kernel(
+    a: wp.array(dtype=float),  # input only
+    b: wp.array(dtype=float),  # input and output
+    c: wp.array(dtype=float),  # output only
+):
+    tid = wp.tid()
+    b[tid] += a[tid]
+    c[tid] = 2.0 * a[tid]
+
+
 def example1():
     # two inputs and one output
     jax_add = jax_kernel(add_kernel)
@@ -189,11 +200,26 @@ def example7():
     print(f())


+def example8():
+    # Using input-output arguments
+
+    jax_func = jax_kernel(in_out_kernel, num_outputs=2, in_out_argnames=["b"])
+
+    f = jax.jit(jax_func)
+
+    a = jnp.ones(10, dtype=jnp.float32)
+    b = jnp.arange(10, dtype=jnp.float32)
+
+    b, c = f(a, b)
+    print(b)
+    print(c)
+
+
 def main():
     wp.init()
     wp.load_module(device=wp.get_device())

-    examples = [example1, example2, example3, example4, example5, example6, example7]
+    examples = [example1, example2, example3, example4, example5, example6, example7, example8]

     for example in examples:
         print("\n===========================================================================")
warp/fem/field/virtual.py
CHANGED
@@ -365,6 +365,8 @@ class LocalAdjointField(SpaceField):
         self._TAYLOR_DOF_COUNTS = LocalAdjointField.DofOffsets(0)
         self.TAYLOR_DOF_COUNT = 0

+        cache.setup_dynamic_attributes(self)
+
     def notify_operator_usage(self, ops: Set[operator.Operator]):
         # Rebuild degrees-of-freedom offsets based on used operators