PyPI - warp-lang - Versions diffs - 1.5.0__py3-none-manylinux2014_x86_64.whl → 1.5.1__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 1.5.0__py3-none-manylinux2014_x86_64.whl → 1.5.1__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (34)

warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/builtins.py +59 -28
warp/codegen.py +21 -17
warp/config.py +1 -1
warp/context.py +59 -35
warp/examples/sim/example_cloth.py +3 -1
warp/fem/geometry/geometry.py +0 -2
warp/native/coloring.cpp +5 -1
warp/native/cuda_util.cpp +56 -53
warp/native/tile.h +2 -5
warp/render/render_opengl.py +7 -6
warp/sim/import_urdf.py +8 -8
warp/sim/model.py +23 -19
warp/sparse.py +1 -1
warp/stubs.py +23 -23
warp/tests/test_coloring.py +12 -2
warp/tests/test_examples.py +3 -1
warp/tests/test_func.py +21 -4
warp/tests/test_lerp.py +13 -87
warp/tests/test_matmul.py +6 -9
warp/tests/test_matmul_lite.py +6 -11
warp/tests/test_overwrite.py +45 -0
warp/tests/test_smoothstep.py +17 -83
warp/tests/test_static.py +3 -3
warp/tests/test_tile.py +44 -0
warp/tests/unittest_utils.py +0 -2
warp/types.py +2 -2
warp/utils.py +1 -2
{warp_lang-1.5.0.dist-info → warp_lang-1.5.1.dist-info}/METADATA +28 -29
{warp_lang-1.5.0.dist-info → warp_lang-1.5.1.dist-info}/RECORD +34 -34
{warp_lang-1.5.0.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.5.0.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +0 -0
{warp_lang-1.5.0.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0

warp/native/cuda_util.cpp CHANGED Viewed

@@ -120,15 +120,17 @@ static inline int get_minor(int version)
     return (version % 1000) / 10;
 }
-static bool get_driver_entry_point(const char* name, void** pfn)
+// Get versioned driver entry point. The version argument should match the function pointer type.
+// For example, to initialize PFN_cuCtxCreate_v3020 use version 3020.
+static bool get_driver_entry_point(const char* name, int version, void** pfn)
 {
     if (!pfn_cuGetProcAddress || !name || !pfn)
         return false;
 #if CUDA_VERSION < 12000
-    CUresult r = pfn_cuGetProcAddress(name, pfn, WP_CUDA_DRIVER_VERSION, CU_GET_PROC_ADDRESS_DEFAULT);
+    CUresult r = pfn_cuGetProcAddress(name, pfn, version, CU_GET_PROC_ADDRESS_DEFAULT);
 #else
-    CUresult r = pfn_cuGetProcAddress(name, pfn, WP_CUDA_DRIVER_VERSION, CU_GET_PROC_ADDRESS_DEFAULT, NULL);
+    CUresult r = pfn_cuGetProcAddress(name, pfn, version, CU_GET_PROC_ADDRESS_DEFAULT, NULL);
 #endif
     if (r != CUDA_SUCCESS)
@@ -170,7 +172,8 @@ bool init_cuda_driver()
     // check the CUDA driver version and report an error if it's too low
     int driver_version = 0;
-    if (get_driver_entry_point("cuDriverGetVersion", &(void*&)pfn_cuDriverGetVersion) && check_cu(pfn_cuDriverGetVersion(&driver_version)))
+    if (get_driver_entry_point("cuDriverGetVersion", 2020, &(void*&)pfn_cuDriverGetVersion) &&
+        check_cu(pfn_cuDriverGetVersion(&driver_version)))
     {
         if (driver_version < WP_CUDA_DRIVER_VERSION)
         {
@@ -186,55 +189,55 @@ bool init_cuda_driver()
     }
     // initialize driver entry points
-    get_driver_entry_point("cuGetErrorString", &(void*&)pfn_cuGetErrorString);
-    get_driver_entry_point("cuGetErrorName", &(void*&)pfn_cuGetErrorName);
-    get_driver_entry_point("cuInit", &(void*&)pfn_cuInit);
-    get_driver_entry_point("cuDeviceGet", &(void*&)pfn_cuDeviceGet);
-    get_driver_entry_point("cuDeviceGetCount", &(void*&)pfn_cuDeviceGetCount);
-    get_driver_entry_point("cuDeviceGetName", &(void*&)pfn_cuDeviceGetName);
-    get_driver_entry_point("cuDeviceGetAttribute", &(void*&)pfn_cuDeviceGetAttribute);
-    get_driver_entry_point("cuDeviceGetUuid", &(void*&)pfn_cuDeviceGetUuid);
-    get_driver_entry_point("cuDevicePrimaryCtxRetain", &(void*&)pfn_cuDevicePrimaryCtxRetain);
-    get_driver_entry_point("cuDevicePrimaryCtxRelease", &(void*&)pfn_cuDevicePrimaryCtxRelease);
-    get_driver_entry_point("cuDeviceCanAccessPeer", &(void*&)pfn_cuDeviceCanAccessPeer);
-    get_driver_entry_point("cuMemGetInfo", &(void*&)pfn_cuMemGetInfo);
-    get_driver_entry_point("cuCtxSetCurrent", &(void*&)pfn_cuCtxSetCurrent);
-    get_driver_entry_point("cuCtxGetCurrent", &(void*&)pfn_cuCtxGetCurrent);
-    get_driver_entry_point("cuCtxPushCurrent", &(void*&)pfn_cuCtxPushCurrent);
-    get_driver_entry_point("cuCtxPopCurrent", &(void*&)pfn_cuCtxPopCurrent);
-    get_driver_entry_point("cuCtxSynchronize", &(void*&)pfn_cuCtxSynchronize);
-    get_driver_entry_point("cuCtxGetDevice", &(void*&)pfn_cuCtxGetDevice);
-    get_driver_entry_point("cuCtxCreate", &(void*&)pfn_cuCtxCreate);
-    get_driver_entry_point("cuCtxDestroy", &(void*&)pfn_cuCtxDestroy);
-    get_driver_entry_point("cuCtxEnablePeerAccess", &(void*&)pfn_cuCtxEnablePeerAccess);
-    get_driver_entry_point("cuCtxDisablePeerAccess", &(void*&)pfn_cuCtxDisablePeerAccess);
-    get_driver_entry_point("cuStreamCreate", &(void*&)pfn_cuStreamCreate);
-    get_driver_entry_point("cuStreamDestroy", &(void*&)pfn_cuStreamDestroy);
-    get_driver_entry_point("cuStreamSynchronize", &(void*&)pfn_cuStreamSynchronize);
-    get_driver_entry_point("cuStreamWaitEvent", &(void*&)pfn_cuStreamWaitEvent);
-    get_driver_entry_point("cuStreamGetCtx", &(void*&)pfn_cuStreamGetCtx);
-    get_driver_entry_point("cuStreamGetCaptureInfo", &(void*&)pfn_cuStreamGetCaptureInfo);
-    get_driver_entry_point("cuStreamUpdateCaptureDependencies", &(void*&)pfn_cuStreamUpdateCaptureDependencies);
-    get_driver_entry_point("cuStreamCreateWithPriority", &(void*&)pfn_cuStreamCreateWithPriority);
-    get_driver_entry_point("cuStreamGetPriority", &(void*&)pfn_cuStreamGetPriority);
-    get_driver_entry_point("cuEventCreate", &(void*&)pfn_cuEventCreate);
-    get_driver_entry_point("cuEventDestroy", &(void*&)pfn_cuEventDestroy);
-    get_driver_entry_point("cuEventRecord", &(void*&)pfn_cuEventRecord);
-    get_driver_entry_point("cuEventRecordWithFlags", &(void*&)pfn_cuEventRecordWithFlags);
-    get_driver_entry_point("cuEventSynchronize", &(void*&)pfn_cuEventSynchronize);
-    get_driver_entry_point("cuModuleLoadDataEx", &(void*&)pfn_cuModuleLoadDataEx);
-    get_driver_entry_point("cuModuleUnload", &(void*&)pfn_cuModuleUnload);
-    get_driver_entry_point("cuModuleGetFunction", &(void*&)pfn_cuModuleGetFunction);
-    get_driver_entry_point("cuLaunchKernel", &(void*&)pfn_cuLaunchKernel);
-    get_driver_entry_point("cuMemcpyPeerAsync", &(void*&)pfn_cuMemcpyPeerAsync);
-    get_driver_entry_point("cuPointerGetAttribute", &(void*&)pfn_cuPointerGetAttribute);
-    get_driver_entry_point("cuGraphicsMapResources", &(void*&)pfn_cuGraphicsMapResources);
-    get_driver_entry_point("cuGraphicsUnmapResources", &(void*&)pfn_cuGraphicsUnmapResources);
-    get_driver_entry_point("cuGraphicsResourceGetMappedPointer", &(void*&)pfn_cuGraphicsResourceGetMappedPointer);
-    get_driver_entry_point("cuGraphicsGLRegisterBuffer", &(void*&)pfn_cuGraphicsGLRegisterBuffer);
-    get_driver_entry_point("cuGraphicsUnregisterResource", &(void*&)pfn_cuGraphicsUnregisterResource);
-    get_driver_entry_point("cuModuleGetGlobal", &(void*&)pfn_cuModuleGetGlobal);
-    get_driver_entry_point("cuFuncSetAttribute", &(void*&)pfn_cuFuncSetAttribute);
+    get_driver_entry_point("cuGetErrorString", 6000, &(void*&)pfn_cuGetErrorString);
+    get_driver_entry_point("cuGetErrorName", 6000, &(void*&)pfn_cuGetErrorName);
+    get_driver_entry_point("cuInit", 2000, &(void*&)pfn_cuInit);
+    get_driver_entry_point("cuDeviceGet", 2000, &(void*&)pfn_cuDeviceGet);
+    get_driver_entry_point("cuDeviceGetCount", 2000, &(void*&)pfn_cuDeviceGetCount);
+    get_driver_entry_point("cuDeviceGetName", 2000, &(void*&)pfn_cuDeviceGetName);
+    get_driver_entry_point("cuDeviceGetAttribute", 2000, &(void*&)pfn_cuDeviceGetAttribute);
+    get_driver_entry_point("cuDeviceGetUuid", 110400, &(void*&)pfn_cuDeviceGetUuid);
+    get_driver_entry_point("cuDevicePrimaryCtxRetain", 7000, &(void*&)pfn_cuDevicePrimaryCtxRetain);
+    get_driver_entry_point("cuDevicePrimaryCtxRelease", 11000, &(void*&)pfn_cuDevicePrimaryCtxRelease);
+    get_driver_entry_point("cuDeviceCanAccessPeer", 4000, &(void*&)pfn_cuDeviceCanAccessPeer);
+    get_driver_entry_point("cuMemGetInfo", 3020, &(void*&)pfn_cuMemGetInfo);
+    get_driver_entry_point("cuCtxSetCurrent", 4000, &(void*&)pfn_cuCtxSetCurrent);
+    get_driver_entry_point("cuCtxGetCurrent", 4000, &(void*&)pfn_cuCtxGetCurrent);
+    get_driver_entry_point("cuCtxPushCurrent", 4000, &(void*&)pfn_cuCtxPushCurrent);
+    get_driver_entry_point("cuCtxPopCurrent", 4000, &(void*&)pfn_cuCtxPopCurrent);
+    get_driver_entry_point("cuCtxSynchronize", 2000, &(void*&)pfn_cuCtxSynchronize);
+    get_driver_entry_point("cuCtxGetDevice", 2000, &(void*&)pfn_cuCtxGetDevice);
+    get_driver_entry_point("cuCtxCreate", 3020, &(void*&)pfn_cuCtxCreate);
+    get_driver_entry_point("cuCtxDestroy", 4000, &(void*&)pfn_cuCtxDestroy);
+    get_driver_entry_point("cuCtxEnablePeerAccess", 4000, &(void*&)pfn_cuCtxEnablePeerAccess);
+    get_driver_entry_point("cuCtxDisablePeerAccess", 4000, &(void*&)pfn_cuCtxDisablePeerAccess);
+    get_driver_entry_point("cuStreamCreate", 2000, &(void*&)pfn_cuStreamCreate);
+    get_driver_entry_point("cuStreamDestroy", 4000, &(void*&)pfn_cuStreamDestroy);
+    get_driver_entry_point("cuStreamSynchronize", 2000, &(void*&)pfn_cuStreamSynchronize);
+    get_driver_entry_point("cuStreamWaitEvent", 3020, &(void*&)pfn_cuStreamWaitEvent);
+    get_driver_entry_point("cuStreamGetCtx", 9020, &(void*&)pfn_cuStreamGetCtx);
+    get_driver_entry_point("cuStreamGetCaptureInfo", 11030, &(void*&)pfn_cuStreamGetCaptureInfo);
+    get_driver_entry_point("cuStreamUpdateCaptureDependencies", 11030, &(void*&)pfn_cuStreamUpdateCaptureDependencies);
+    get_driver_entry_point("cuStreamCreateWithPriority", 5050, &(void*&)pfn_cuStreamCreateWithPriority);
+    get_driver_entry_point("cuStreamGetPriority", 5050, &(void*&)pfn_cuStreamGetPriority);
+    get_driver_entry_point("cuEventCreate", 2000, &(void*&)pfn_cuEventCreate);
+    get_driver_entry_point("cuEventDestroy", 4000, &(void*&)pfn_cuEventDestroy);
+    get_driver_entry_point("cuEventRecord", 2000, &(void*&)pfn_cuEventRecord);
+    get_driver_entry_point("cuEventRecordWithFlags", 11010, &(void*&)pfn_cuEventRecordWithFlags);
+    get_driver_entry_point("cuEventSynchronize", 2000, &(void*&)pfn_cuEventSynchronize);
+    get_driver_entry_point("cuModuleLoadDataEx", 2010, &(void*&)pfn_cuModuleLoadDataEx);
+    get_driver_entry_point("cuModuleUnload", 2000, &(void*&)pfn_cuModuleUnload);
+    get_driver_entry_point("cuModuleGetFunction", 2000, &(void*&)pfn_cuModuleGetFunction);
+    get_driver_entry_point("cuLaunchKernel", 4000, &(void*&)pfn_cuLaunchKernel);
+    get_driver_entry_point("cuMemcpyPeerAsync", 4000, &(void*&)pfn_cuMemcpyPeerAsync);
+    get_driver_entry_point("cuPointerGetAttribute", 4000, &(void*&)pfn_cuPointerGetAttribute);
+    get_driver_entry_point("cuGraphicsMapResources", 3000, &(void*&)pfn_cuGraphicsMapResources);
+    get_driver_entry_point("cuGraphicsUnmapResources", 3000, &(void*&)pfn_cuGraphicsUnmapResources);
+    get_driver_entry_point("cuGraphicsResourceGetMappedPointer", 3020, &(void*&)pfn_cuGraphicsResourceGetMappedPointer);
+    get_driver_entry_point("cuGraphicsGLRegisterBuffer", 3000, &(void*&)pfn_cuGraphicsGLRegisterBuffer);
+    get_driver_entry_point("cuGraphicsUnregisterResource", 3000, &(void*&)pfn_cuGraphicsUnregisterResource);
+    get_driver_entry_point("cuModuleGetGlobal", 3020, &(void*&)pfn_cuModuleGetGlobal);
+    get_driver_entry_point("cuFuncSetAttribute", 9000, &(void*&)pfn_cuFuncSetAttribute);
     if (pfn_cuInit)
         cuda_driver_initialized = check_cu(pfn_cuInit(0));

warp/native/tile.h CHANGED Viewed

@@ -1125,8 +1125,6 @@ inline CUDA_CALLABLE auto untile(Tile& tile)
     }
 }
 template <typename Tile, typename Value>
 inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, Value& adj_ret)
 {
@@ -1156,7 +1154,7 @@ inline CUDA_CALLABLE auto tile_zeros()
     return T(0);
 }
-// zero initialized tile
+// one-initialized tile
 template <typename T, int M, int N>
 inline CUDA_CALLABLE auto tile_ones()
 {
@@ -1164,7 +1162,7 @@ inline CUDA_CALLABLE auto tile_ones()
     return T(1);
 }
-// zero initialized tile
+// tile with evenly spaced values
 template <typename T, int M, int N>
 inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step)
 {
@@ -1220,7 +1218,6 @@ inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, Tile& src)
     src.copy_to_global(dest, x, y);
 }
-// entry point for store
 template <typename T, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, int y, Tile& src)
 {

warp/render/render_opengl.py CHANGED Viewed

@@ -1040,7 +1040,7 @@ class OpenGLRenderer:
         self.render_depth = render_depth
         self.enable_backface_culling = enable_backface_culling
-        self._device = wp.get_cuda_device()
+        self._device = wp.get_preferred_device()
         self._title = title
         self.window = pyglet.window.Window(
@@ -2278,14 +2278,9 @@ Instances: {len(self._instances)}"""
         colors1 = np.array(colors1, dtype=np.float32)
         colors2 = np.array(colors2, dtype=np.float32)
-        # create buffer for checkerboard colors
-        self._instance_color1_buffer = gl.GLuint()
-        gl.glGenBuffers(1, self._instance_color1_buffer)
         gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._instance_color1_buffer)
         gl.glBufferData(gl.GL_ARRAY_BUFFER, colors1.nbytes, colors1.ctypes.data, gl.GL_STATIC_DRAW)
-        self._instance_color2_buffer = gl.GLuint()
-        gl.glGenBuffers(1, self._instance_color2_buffer)
         gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._instance_color2_buffer)
         gl.glBufferData(gl.GL_ARRAY_BUFFER, colors2.nbytes, colors2.ctypes.data, gl.GL_STATIC_DRAW)
@@ -2322,6 +2317,12 @@ Instances: {len(self._instances)}"""
             int(self._instance_transform_gl_buffer.value), self._device
         )
+        # create color buffers
+        self._instance_color1_buffer = gl.GLuint()
+        gl.glGenBuffers(1, self._instance_color1_buffer)
+        self._instance_color2_buffer = gl.GLuint()
+        gl.glGenBuffers(1, self._instance_color2_buffer)
         self.update_instance_colors()
         # set up instance attribute pointers

warp/sim/import_urdf.py CHANGED Viewed

@@ -211,14 +211,14 @@ def parse_urdf(
                 if hasattr(m, "geometry"):
                     # multiple meshes are contained in a scene
                     for geom in m.geometry.values():
-                        vertices = np.array(geom.vertices, dtype=np.float32) * scaling
-                        faces = np.array(geom.faces.flatten(), dtype=np.int32)
-                        mesh = Mesh(vertices, faces)
+                        geom_vertices = np.array(geom.vertices, dtype=np.float32) * scaling
+                        geom_faces = np.array(geom.faces.flatten(), dtype=np.int32)
+                        geom_mesh = Mesh(geom_vertices, geom_faces)
                         s = builder.add_shape_mesh(
                             body=link,
                             pos=wp.vec3(tf.p),
                             rot=wp.quat(tf.q),
-                            mesh=mesh,
+                            mesh=geom_mesh,
                             density=density,
                             is_visible=visible,
                             has_ground_collision=not just_visual,
@@ -228,14 +228,14 @@ def parse_urdf(
                         shapes.append(s)
                 else:
                     # a single mesh
-                    vertices = np.array(m.vertices, dtype=np.float32) * scaling
-                    faces = np.array(m.faces.flatten(), dtype=np.int32)
-                    mesh = Mesh(vertices, faces)
+                    m_vertices = np.array(m.vertices, dtype=np.float32) * scaling
+                    m_faces = np.array(m.faces.flatten(), dtype=np.int32)
+                    m_mesh = Mesh(m_vertices, m_faces)
                     s = builder.add_shape_mesh(
                         body=link,
                         pos=wp.vec3(tf.p),
                         rot=wp.quat(tf.q),
-                        mesh=mesh,
+                        mesh=m_mesh,
                         density=density,
                         is_visible=visible,
                         has_ground_collision=not just_visual,

warp/sim/model.py CHANGED Viewed

@@ -578,14 +578,14 @@ class Model:
                This setting is not supported by :class:`FeatherstoneIntegrator`.
-        joint_limit_lower (array): Joint lower position limits, shape [joint_count], float
-        joint_limit_upper (array): Joint upper position limits, shape [joint_count], float
-        joint_limit_ke (array): Joint position limit stiffness (used by the Euler integrators), shape [joint_count], float
-        joint_limit_kd (array): Joint position limit damping (used by the Euler integrators), shape [joint_count], float
+        joint_limit_lower (array): Joint lower position limits, shape [joint_axis_count], float
+        joint_limit_upper (array): Joint upper position limits, shape [joint_axis_count], float
+        joint_limit_ke (array): Joint position limit stiffness (used by the Euler integrators), shape [joint_axis_count], float
+        joint_limit_kd (array): Joint position limit damping (used by the Euler integrators), shape [joint_axis_count], float
         joint_twist_lower (array): Joint lower twist limit, shape [joint_count], float
         joint_twist_upper (array): Joint upper twist limit, shape [joint_count], float
-        joint_q_start (array): Start index of the first position coordinate per joint, shape [joint_count], int
-        joint_qd_start (array): Start index of the first velocity coordinate per joint, shape [joint_count], int
+        joint_q_start (array): Start index of the first position coordinate per joint (note the last value is an additional sentinel entry to allow for querying the q dimensionality of joint i via ``joint_q_start[i+1] - joint_q_start[i]``), shape [joint_count + 1], int
+        joint_qd_start (array): Start index of the first velocity coordinate per joint (note the last value is an additional sentinel entry to allow for querying the qd dimensionality of joint i via ``joint_qd_start[i+1] - joint_qd_start[i]``), shape [joint_count + 1], int
         articulation_start (array): Articulation start index, shape [articulation_count], int
         joint_name (list): Joint names, shape [joint_count], str
         joint_attach_ke (float): Joint attachment force stiffness (used by :class:`SemiImplicitIntegrator`)
@@ -1442,12 +1442,14 @@ class ModelBuilder:
             self.shape_collision_filter_pairs.add((i + shape_count, j + shape_count))
         for group, shapes in builder.shape_collision_group_map.items():
             if separate_collision_group:
-                group = self.last_collision_group + 1
+                extend_group = self.last_collision_group + 1
             else:
-                group = group + self.last_collision_group if group > -1 else -1
-            if group not in self.shape_collision_group_map:
-                self.shape_collision_group_map[group] = []
-            self.shape_collision_group_map[group].extend([s + shape_count for s in shapes])
+                extend_group = group + self.last_collision_group if group > -1 else -1
+            if extend_group not in self.shape_collision_group_map:
+                self.shape_collision_group_map[extend_group] = []
+            self.shape_collision_group_map[extend_group].extend([s + shape_count for s in shapes])
         # update last collision group counter
         if separate_collision_group:
@@ -2616,11 +2618,12 @@ class ModelBuilder:
             joint_remap[joint["original_id"]] = i
         # update articulation_start
         for i, old_i in enumerate(self.articulation_start):
-            while old_i not in joint_remap:
-                old_i += 1
-                if old_i >= self.joint_count:
+            start_i = old_i
+            while start_i not in joint_remap:
+                start_i += 1
+                if start_i >= self.joint_count:
                     break
-            self.articulation_start[i] = joint_remap.get(old_i, old_i)
+            self.articulation_start[i] = joint_remap.get(start_i, start_i)
         # remove empty articulation starts, i.e. where the start and end are the same
         self.articulation_start = list(set(self.articulation_start))
@@ -4269,8 +4272,7 @@ class ModelBuilder:
         pos = wp.vec3(pos[0], pos[1], pos[2])
         # add particles
         for v in vertices:
-            v = wp.vec3(v[0], v[1], v[2])
-            p = wp.quat_rotate(rot, v * scale) + pos
+            p = wp.quat_rotate(rot, wp.vec3(v[0], v[1], v[2]) * scale) + pos
             self.add_particle(p, vel, 0.0)
@@ -4402,16 +4404,18 @@ class ModelBuilder:
             balance_colors: Whether to apply the color balancing algorithm to balance the size of each color
             target_max_min_color_ratio: the color balancing algorithm will stop when the ratio between the largest color and
                 the smallest color reaches this value
-            algorithm: Value should an enum type of ColoringAlgorithm, otherwise it will raise an error. ColoringAlgorithm.mcs means using the MCS coloring algorithm,
+            algorithm: Value should be an enum type of ColoringAlgorithm, otherwise it will raise an error. ColoringAlgorithm.mcs means using the MCS coloring algorithm,
                 while ColoringAlgorithm.ordered_greedy means using the degree-ordered greedy algorithm. The MCS algorithm typically generates 30% to 50% fewer colors
                 compared to the ordered greedy algorithm, while maintaining the same linear complexity. Although MCS has a constant overhead that makes it about twice
                 as slow as the greedy algorithm, it produces significantly better coloring results. We recommend using MCS, especially if coloring is only part of the
-                preprocessing stage.e.
+                preprocessing.
         Note:
             References to the coloring algorithm:
             MCS: Pereira, F. M. Q., & Palsberg, J. (2005, November). Register allocation via coloring of chordal graphs. In Asian Symposium on Programming Languages and Systems (pp. 315-329). Berlin, Heidelberg: Springer Berlin Heidelberg.
             Ordered Greedy: Ton-That, Q. M., Kry, P. G., & Andrews, S. (2023). Parallel block Neo-Hookean XPBD using graph clustering. Computers & Graphics, 110, 1-10.
         """

warp/sparse.py CHANGED Viewed

@@ -8,7 +8,7 @@ from warp.types import Array, Cols, Rows, Scalar, Vector
 # typing hints
-_BlockType = TypeVar("BlockType")
+_BlockType = TypeVar("BlockType")  # noqa: PLC0132
 class _MatrixBlockType(Generic[Rows, Cols, Scalar]):

warp/stubs.py CHANGED Viewed

@@ -975,7 +975,7 @@ def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32, storage: st
 @over
-def tile_store(a: Array[Any], i: int32, t: Any):
+def tile_store(a: Array[Any], i: int32, t: Tile):
     """Stores a 1D tile to a global memory array.
     This method will cooperatively store a tile to global memory using all threads in the block.
@@ -988,7 +988,7 @@ def tile_store(a: Array[Any], i: int32, t: Any):
 @over
-def tile_store(a: Array[Any], i: int32, j: int32, t: Any):
+def tile_store(a: Array[Any], i: int32, j: int32, t: Tile):
     """Stores a tile to a global memory array.
     This method will cooperatively store a tile to global memory using all threads in the block.
@@ -1002,7 +1002,7 @@ def tile_store(a: Array[Any], i: int32, j: int32, t: Any):
 @over
-def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile:
+def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Tile) -> Tile:
     """Atomically add a tile to the array `a`, each element will be updated atomically.
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
@@ -1077,7 +1077,7 @@ def tile(x: Any) -> Tile:
 @over
-def untile(a: Any) -> Scalar:
+def untile(a: Tile) -> Scalar:
     """Convert a Tile back to per-thread values.
     This function converts a block-wide tile back to per-thread values.
@@ -1100,7 +1100,7 @@ def untile(a: Any) -> Scalar:
             t = wp.tile(i) * 2
             # convert back to per-thread values
-            s = wp.untile()
+            s = wp.untile(t)
             print(s)
@@ -1154,7 +1154,7 @@ def tile_transpose(a: Tile) -> Tile:
 def tile_broadcast(a: Tile, m: int32, n: int32) -> Tile:
     """Broadcast a tile.
-    This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
+    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
     :param a: Tile to broadcast
     :returns: Tile with broadcast ``shape=(m, n)``
@@ -1178,10 +1178,10 @@ def tile_sum(a: Tile) -> Tile:
             t = wp.tile_ones(dtype=float, m=16, n=16)
             s = wp.tile_sum(t)
-            print(t)
+            print(s)
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
     Prints:
@@ -1207,19 +1207,19 @@ def tile_min(a: Tile) -> Tile:
         @wp.kernel
         def compute():
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
+            t = wp.tile_arange(64, 128)
             s = wp.tile_min(t)
-            print(t)
+            print(s)
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
     Prints:
     .. code-block:: text
-        tile(m=1, n=1, storage=register) = [[-10]]
+        tile(m=1, n=1, storage=register) = [[64 ]]
     """
@@ -1239,19 +1239,19 @@ def tile_max(a: Tile) -> Tile:
         @wp.kernel
         def compute():
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
-            s = wp.tile_min(t)
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_max(t)
-            print(t)
+            print(s)
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
     Prints:
     .. code-block:: text
-        tile(m=1, n=1, storage=register) = [[10]]
+        tile(m=1, n=1, storage=register) = [[127 ]]
     """
@@ -1259,7 +1259,7 @@ def tile_max(a: Tile) -> Tile:
 @over
-def tile_reduce(op: Callable, a: Any) -> Tile:
+def tile_reduce(op: Callable, a: Tile) -> Tile:
     """Apply a custom reduction operator across the tile.
     This function cooperatively performs a reduction using the provided operator across the tile.
@@ -1280,7 +1280,7 @@ def tile_reduce(op: Callable, a: Any) -> Tile:
             print(s)
-        wp.launch(factorial, dim=[16], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
     Prints:
@@ -1293,7 +1293,7 @@ def tile_reduce(op: Callable, a: Any) -> Tile:
 @over
-def tile_map(op: Callable, a: Any) -> Tile:
+def tile_map(op: Callable, a: Tile) -> Tile:
     """Apply a unary function onto the tile.
     This function cooperatively applies a unary function to each element of the tile using all threads in the block.
@@ -1314,7 +1314,7 @@ def tile_map(op: Callable, a: Any) -> Tile:
             print(s)
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
     Prints:
@@ -1327,7 +1327,7 @@ def tile_map(op: Callable, a: Any) -> Tile:
 @over
-def tile_map(op: Callable, a: Any, b: Any) -> Tile:
+def tile_map(op: Callable, a: Tile, b: Tile) -> Tile:
     """Apply a binary function onto the tile.
     This function cooperatively applies a binary function to each element of the tiles using all threads in the block.
@@ -1352,7 +1352,7 @@ def tile_map(op: Callable, a: Any, b: Any) -> Tile:
             print(s)
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
     Prints:

warp/tests/test_coloring.py CHANGED Viewed

@@ -11,7 +11,12 @@ import numpy as np
 import warp as wp
 import warp.examples
 import warp.sim
-from warp.sim.graph_coloring import ColoringAlgorithm, construct_trimesh_graph_edges, validate_graph_coloring
+from warp.sim.graph_coloring import (
+    ColoringAlgorithm,
+    construct_trimesh_graph_edges,
+    convert_to_color_groups,
+    validate_graph_coloring,
+)
 from warp.tests.unittest_utils import *
@@ -120,7 +125,7 @@ def test_coloring_trimesh(test, device):
             ColoringAlgorithm.MCS.value,
             particle_colors.__ctype__(),
         )
-        wp.context.runtime.core.balance_coloring(
+        max_min_ratio = wp.context.runtime.core.balance_coloring(
             model.particle_count,
             edge_indices_cpu_with_bending.__ctype__(),
             num_colors_mcs,
@@ -134,6 +139,11 @@ def test_coloring_trimesh(test, device):
             device="cpu",
         )
+        color_categories_balanced = convert_to_color_groups(num_colors_mcs, particle_colors)
+        color_sizes = np.array([c.shape[0] for c in color_categories_balanced], dtype=np.float32)
+        test.assertTrue(np.max(color_sizes) / np.min(color_sizes) <= max_min_ratio)
 @unittest.skipUnless(USD_AVAILABLE, "Requires usd-core")
 def test_combine_coloring(test, device):

warp/tests/test_examples.py CHANGED Viewed

@@ -165,7 +165,9 @@ def add_example_test(
         # with wp.ScopedTimer(f"{name}_{sanitize_identifier(device)}"):
         # Run the script as a subprocess
-        result = subprocess.run(command, capture_output=True, text=True, env=env_vars, timeout=test_timeout)
+        result = subprocess.run(
+            command, capture_output=True, text=True, env=env_vars, timeout=test_timeout, check=False
+        )
         # Check the return code (0 is standard for success)
         test.assertEqual(

warp/tests/test_func.py CHANGED Viewed

@@ -162,7 +162,7 @@ def user_func_with_defaults(a: int = 123, b: int = 234) -> int:
 @wp.kernel
-def test_user_func_with_defaults():
+def user_func_with_defaults_kernel():
     a = user_func_with_defaults()
     wp.expect_eq(a, 357)
@@ -179,6 +179,25 @@ def test_user_func_with_defaults():
     wp.expect_eq(e, 234)
+def test_user_func_with_defaults(test, device):
+    wp.launch(user_func_with_defaults_kernel, dim=1, device=device)
+    a = user_func_with_defaults()
+    assert a == 357
+    b = user_func_with_defaults(111)
+    assert b == 345
+    c = user_func_with_defaults(111, 222)
+    assert c == 333
+    d = user_func_with_defaults(a=111)
+    assert d == 345
+    e = user_func_with_defaults(b=111)
+    assert e == 234
 @wp.func
 def user_func_return_multiple_values(a: int, b: float) -> Tuple[int, float]:
     return a + a, b * b
@@ -406,9 +425,7 @@ add_function_test(TestFunc, func=test_func_closure_capture, name="test_func_clos
 add_function_test(TestFunc, func=test_multi_valued_func, name="test_multi_valued_func", devices=devices)
 add_kernel_test(TestFunc, kernel=test_func_defaults, name="test_func_defaults", dim=1, devices=devices)
 add_kernel_test(TestFunc, kernel=test_builtin_shadowing, name="test_builtin_shadowing", dim=1, devices=devices)
-add_kernel_test(
-    TestFunc, kernel=test_user_func_with_defaults, name="test_user_func_with_defaults", dim=1, devices=devices
-)
+add_function_test(TestFunc, func=test_user_func_with_defaults, name="test_user_func_with_defaults", devices=devices)
 add_kernel_test(
     TestFunc,
     kernel=test_user_func_return_multiple_values,