warp-lang 1.6.1__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (401)
  1. warp/__init__.py +21 -7
  2. warp/autograd.py +14 -6
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +424 -6
  6. warp/build_dll.py +20 -20
  7. warp/builtins.py +467 -368
  8. warp/codegen.py +193 -125
  9. warp/config.py +56 -12
  10. warp/constants.py +14 -6
  11. warp/context.py +524 -277
  12. warp/dlpack.py +22 -12
  13. warp/examples/__init__.py +14 -6
  14. warp/examples/assets/nonuniform.usd +0 -0
  15. warp/examples/assets/nvidia_logo.png +0 -0
  16. warp/examples/benchmarks/benchmark_api.py +14 -6
  17. warp/examples/benchmarks/benchmark_cloth.py +14 -6
  18. warp/examples/benchmarks/benchmark_cloth_cupy.py +14 -6
  19. warp/examples/benchmarks/benchmark_cloth_jax.py +14 -6
  20. warp/examples/benchmarks/benchmark_cloth_numba.py +15 -0
  21. warp/examples/benchmarks/benchmark_cloth_numpy.py +14 -6
  22. warp/examples/benchmarks/benchmark_cloth_paddle.py +14 -6
  23. warp/examples/benchmarks/benchmark_cloth_pytorch.py +14 -6
  24. warp/examples/benchmarks/benchmark_cloth_taichi.py +14 -6
  25. warp/examples/benchmarks/benchmark_cloth_warp.py +14 -6
  26. warp/examples/benchmarks/benchmark_gemm.py +82 -48
  27. warp/examples/benchmarks/benchmark_interop_paddle.py +14 -6
  28. warp/examples/benchmarks/benchmark_interop_torch.py +14 -6
  29. warp/examples/benchmarks/benchmark_launches.py +14 -6
  30. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  31. warp/examples/browse.py +14 -6
  32. warp/examples/core/example_cupy.py +14 -6
  33. warp/examples/core/example_dem.py +14 -6
  34. warp/examples/core/example_fluid.py +14 -6
  35. warp/examples/core/example_graph_capture.py +14 -6
  36. warp/examples/core/example_marching_cubes.py +14 -6
  37. warp/examples/core/example_mesh.py +14 -6
  38. warp/examples/core/example_mesh_intersect.py +14 -6
  39. warp/examples/core/example_nvdb.py +14 -6
  40. warp/examples/core/example_raycast.py +14 -6
  41. warp/examples/core/example_raymarch.py +14 -6
  42. warp/examples/core/example_render_opengl.py +14 -6
  43. warp/examples/core/example_sample_mesh.py +300 -0
  44. warp/examples/core/example_sph.py +14 -6
  45. warp/examples/core/example_torch.py +14 -6
  46. warp/examples/core/example_wave.py +14 -6
  47. warp/examples/fem/example_adaptive_grid.py +14 -6
  48. warp/examples/fem/example_apic_fluid.py +15 -7
  49. warp/examples/fem/example_burgers.py +16 -8
  50. warp/examples/fem/example_convection_diffusion.py +14 -6
  51. warp/examples/fem/example_convection_diffusion_dg.py +14 -6
  52. warp/examples/fem/example_deformed_geometry.py +15 -7
  53. warp/examples/fem/example_diffusion.py +14 -6
  54. warp/examples/fem/example_diffusion_3d.py +14 -6
  55. warp/examples/fem/example_diffusion_mgpu.py +14 -6
  56. warp/examples/fem/example_distortion_energy.py +15 -7
  57. warp/examples/fem/example_magnetostatics.py +20 -12
  58. warp/examples/fem/example_mixed_elasticity.py +14 -6
  59. warp/examples/fem/example_navier_stokes.py +14 -6
  60. warp/examples/fem/example_nonconforming_contact.py +14 -6
  61. warp/examples/fem/example_stokes.py +14 -6
  62. warp/examples/fem/example_stokes_transfer.py +14 -6
  63. warp/examples/fem/example_streamlines.py +14 -6
  64. warp/examples/fem/utils.py +24 -3
  65. warp/examples/interop/example_jax_callable.py +116 -0
  66. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  67. warp/examples/interop/example_jax_kernel.py +205 -0
  68. warp/examples/optim/example_bounce.py +14 -6
  69. warp/examples/optim/example_cloth_throw.py +14 -6
  70. warp/examples/optim/example_diffray.py +14 -6
  71. warp/examples/optim/example_drone.py +14 -6
  72. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  73. warp/examples/optim/example_inverse_kinematics.py +14 -6
  74. warp/examples/optim/example_inverse_kinematics_torch.py +14 -6
  75. warp/examples/optim/example_softbody_properties.py +14 -6
  76. warp/examples/optim/example_spring_cage.py +14 -6
  77. warp/examples/optim/example_trajectory.py +14 -6
  78. warp/examples/sim/example_cartpole.py +14 -6
  79. warp/examples/sim/example_cloth.py +14 -6
  80. warp/examples/sim/example_cloth_self_contact.py +14 -6
  81. warp/examples/sim/example_granular.py +14 -6
  82. warp/examples/sim/example_granular_collision_sdf.py +14 -6
  83. warp/examples/sim/example_jacobian_ik.py +14 -6
  84. warp/examples/sim/example_particle_chain.py +14 -6
  85. warp/examples/sim/example_quadruped.py +14 -6
  86. warp/examples/sim/example_rigid_chain.py +14 -6
  87. warp/examples/sim/example_rigid_contact.py +14 -6
  88. warp/examples/sim/example_rigid_force.py +14 -6
  89. warp/examples/sim/example_rigid_gyroscopic.py +14 -6
  90. warp/examples/sim/example_rigid_soft_contact.py +14 -6
  91. warp/examples/sim/example_soft_body.py +14 -6
  92. warp/examples/tile/example_tile_cholesky.py +14 -6
  93. warp/examples/tile/example_tile_convolution.py +14 -6
  94. warp/examples/tile/example_tile_fft.py +14 -6
  95. warp/examples/tile/example_tile_filtering.py +14 -6
  96. warp/examples/tile/example_tile_matmul.py +16 -10
  97. warp/examples/tile/example_tile_mlp.py +14 -6
  98. warp/examples/tile/example_tile_nbody.py +14 -6
  99. warp/examples/tile/example_tile_walker.py +14 -6
  100. warp/fabric.py +15 -0
  101. warp/fem/__init__.py +26 -1
  102. warp/fem/adaptivity.py +19 -4
  103. warp/fem/cache.py +15 -0
  104. warp/fem/dirichlet.py +15 -0
  105. warp/fem/domain.py +15 -0
  106. warp/fem/field/__init__.py +15 -0
  107. warp/fem/field/field.py +15 -0
  108. warp/fem/field/nodal_field.py +37 -68
  109. warp/fem/field/restriction.py +15 -0
  110. warp/fem/field/virtual.py +77 -23
  111. warp/fem/geometry/__init__.py +15 -0
  112. warp/fem/geometry/adaptive_nanogrid.py +24 -10
  113. warp/fem/geometry/closest_point.py +16 -1
  114. warp/fem/geometry/deformed_geometry.py +20 -2
  115. warp/fem/geometry/element.py +15 -0
  116. warp/fem/geometry/geometry.py +20 -0
  117. warp/fem/geometry/grid_2d.py +27 -12
  118. warp/fem/geometry/grid_3d.py +27 -15
  119. warp/fem/geometry/hexmesh.py +20 -7
  120. warp/fem/geometry/nanogrid.py +24 -11
  121. warp/fem/geometry/partition.py +15 -0
  122. warp/fem/geometry/quadmesh.py +28 -13
  123. warp/fem/geometry/tetmesh.py +18 -4
  124. warp/fem/geometry/trimesh.py +18 -8
  125. warp/fem/integrate.py +277 -93
  126. warp/fem/linalg.py +20 -5
  127. warp/fem/operator.py +15 -0
  128. warp/fem/polynomial.py +15 -0
  129. warp/fem/quadrature/__init__.py +15 -0
  130. warp/fem/quadrature/pic_quadrature.py +52 -22
  131. warp/fem/quadrature/quadrature.py +209 -25
  132. warp/fem/space/__init__.py +16 -1
  133. warp/fem/space/basis_function_space.py +19 -2
  134. warp/fem/space/basis_space.py +40 -18
  135. warp/fem/space/dof_mapper.py +15 -0
  136. warp/fem/space/function_space.py +15 -0
  137. warp/fem/space/grid_2d_function_space.py +15 -0
  138. warp/fem/space/grid_3d_function_space.py +15 -0
  139. warp/fem/space/hexmesh_function_space.py +17 -2
  140. warp/fem/space/nanogrid_function_space.py +15 -0
  141. warp/fem/space/partition.py +21 -2
  142. warp/fem/space/quadmesh_function_space.py +23 -8
  143. warp/fem/space/restriction.py +15 -0
  144. warp/fem/space/shape/__init__.py +15 -0
  145. warp/fem/space/shape/cube_shape_function.py +38 -23
  146. warp/fem/space/shape/shape_function.py +15 -0
  147. warp/fem/space/shape/square_shape_function.py +27 -12
  148. warp/fem/space/shape/tet_shape_function.py +15 -0
  149. warp/fem/space/shape/triangle_shape_function.py +16 -1
  150. warp/fem/space/tetmesh_function_space.py +18 -3
  151. warp/fem/space/topology.py +15 -0
  152. warp/fem/space/trimesh_function_space.py +17 -2
  153. warp/fem/types.py +15 -0
  154. warp/fem/utils.py +27 -6
  155. warp/jax.py +28 -7
  156. warp/jax_experimental/__init__.py +16 -0
  157. warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -33
  158. warp/jax_experimental/ffi.py +698 -0
  159. warp/jax_experimental/xla_ffi.py +602 -0
  160. warp/math.py +103 -6
  161. warp/native/array.h +28 -6
  162. warp/native/builtin.h +44 -9
  163. warp/native/bvh.cpp +18 -7
  164. warp/native/bvh.cu +57 -20
  165. warp/native/bvh.h +17 -7
  166. warp/native/clang/clang.cpp +45 -9
  167. warp/native/coloring.cpp +15 -6
  168. warp/native/crt.cpp +15 -6
  169. warp/native/crt.h +15 -6
  170. warp/native/cuda_crt.h +15 -6
  171. warp/native/cuda_util.cpp +29 -6
  172. warp/native/cuda_util.h +17 -6
  173. warp/native/error.cpp +15 -6
  174. warp/native/error.h +15 -6
  175. warp/native/exports.h +85 -63
  176. warp/native/fabric.h +15 -6
  177. warp/native/hashgrid.cpp +15 -6
  178. warp/native/hashgrid.cu +15 -6
  179. warp/native/hashgrid.h +15 -6
  180. warp/native/initializer_array.h +15 -6
  181. warp/native/intersect.h +41 -32
  182. warp/native/intersect_adj.h +48 -39
  183. warp/native/intersect_tri.h +17 -0
  184. warp/native/marching.cpp +16 -0
  185. warp/native/marching.cu +16 -7
  186. warp/native/marching.h +17 -0
  187. warp/native/mat.h +528 -15
  188. warp/native/mathdx.cpp +15 -6
  189. warp/native/matnn.h +15 -6
  190. warp/native/mesh.cpp +15 -6
  191. warp/native/mesh.cu +15 -6
  192. warp/native/mesh.h +25 -16
  193. warp/native/noise.h +15 -6
  194. warp/native/quat.h +114 -17
  195. warp/native/rand.h +21 -6
  196. warp/native/range.h +15 -6
  197. warp/native/reduce.cpp +15 -6
  198. warp/native/reduce.cu +15 -6
  199. warp/native/runlength_encode.cpp +15 -6
  200. warp/native/runlength_encode.cu +15 -6
  201. warp/native/scan.cpp +15 -6
  202. warp/native/scan.cu +15 -6
  203. warp/native/scan.h +15 -6
  204. warp/native/solid_angle.h +17 -0
  205. warp/native/sort.cpp +137 -65
  206. warp/native/sort.cu +167 -21
  207. warp/native/sort.h +23 -7
  208. warp/native/sparse.cpp +58 -28
  209. warp/native/sparse.cu +67 -23
  210. warp/native/spatial.h +15 -6
  211. warp/native/svd.h +131 -6
  212. warp/native/temp_buffer.h +15 -6
  213. warp/native/tile.h +316 -111
  214. warp/native/tile_reduce.h +61 -9
  215. warp/native/vec.h +83 -13
  216. warp/native/volume.cpp +100 -119
  217. warp/native/volume.cu +15 -6
  218. warp/native/volume.h +15 -6
  219. warp/native/volume_builder.cu +40 -16
  220. warp/native/volume_builder.h +21 -6
  221. warp/native/volume_impl.h +15 -6
  222. warp/native/warp.cpp +20 -12
  223. warp/native/warp.cu +114 -16
  224. warp/native/warp.h +34 -16
  225. warp/optim/__init__.py +14 -6
  226. warp/optim/adam.py +14 -6
  227. warp/optim/linear.py +25 -10
  228. warp/optim/sgd.py +14 -6
  229. warp/paddle.py +14 -6
  230. warp/render/__init__.py +14 -6
  231. warp/render/render_opengl.py +14 -6
  232. warp/render/render_usd.py +14 -6
  233. warp/render/utils.py +14 -6
  234. warp/sim/__init__.py +14 -7
  235. warp/sim/articulation.py +18 -10
  236. warp/sim/collide.py +35 -16
  237. warp/sim/graph_coloring.py +14 -6
  238. warp/sim/import_mjcf.py +463 -162
  239. warp/sim/import_snu.py +14 -7
  240. warp/sim/import_urdf.py +46 -18
  241. warp/sim/import_usd.py +14 -7
  242. warp/sim/inertia.py +14 -6
  243. warp/sim/integrator.py +14 -6
  244. warp/sim/integrator_euler.py +19 -11
  245. warp/sim/integrator_featherstone.py +17 -16
  246. warp/sim/integrator_vbd.py +222 -8
  247. warp/sim/integrator_xpbd.py +19 -11
  248. warp/sim/model.py +56 -19
  249. warp/sim/particles.py +14 -6
  250. warp/sim/render.py +14 -6
  251. warp/sim/utils.py +17 -2
  252. warp/sparse.py +657 -555
  253. warp/stubs.py +231 -19
  254. warp/tape.py +14 -6
  255. warp/tests/aux_test_class_kernel.py +14 -6
  256. warp/tests/aux_test_compile_consts_dummy.py +14 -6
  257. warp/tests/aux_test_conditional_unequal_types_kernels.py +14 -6
  258. warp/tests/aux_test_dependent.py +14 -6
  259. warp/tests/aux_test_grad_customs.py +14 -6
  260. warp/tests/aux_test_instancing_gc.py +14 -6
  261. warp/tests/aux_test_module_unload.py +14 -6
  262. warp/tests/aux_test_name_clash1.py +14 -6
  263. warp/tests/aux_test_name_clash2.py +14 -6
  264. warp/tests/aux_test_unresolved_func.py +14 -6
  265. warp/tests/aux_test_unresolved_symbol.py +14 -6
  266. warp/tests/cuda/__init__.py +0 -0
  267. warp/tests/{test_async.py → cuda/test_async.py} +14 -6
  268. warp/tests/{test_ipc.py → cuda/test_ipc.py} +14 -6
  269. warp/tests/{test_mempool.py → cuda/test_mempool.py} +53 -6
  270. warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +14 -6
  271. warp/tests/{test_peer.py → cuda/test_peer.py} +14 -6
  272. warp/tests/{test_pinned.py → cuda/test_pinned.py} +14 -6
  273. warp/tests/{test_streams.py → cuda/test_streams.py} +85 -6
  274. warp/tests/geometry/__init__.py +0 -0
  275. warp/tests/{test_bvh.py → geometry/test_bvh.py} +14 -6
  276. warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +14 -6
  277. warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +14 -6
  278. warp/tests/{test_mesh.py → geometry/test_mesh.py} +14 -6
  279. warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +14 -6
  280. warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +80 -69
  281. warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +15 -7
  282. warp/tests/{test_volume.py → geometry/test_volume.py} +55 -12
  283. warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +14 -6
  284. warp/tests/interop/__init__.py +0 -0
  285. warp/tests/{test_dlpack.py → interop/test_dlpack.py} +42 -11
  286. warp/tests/{test_jax.py → interop/test_jax.py} +14 -6
  287. warp/tests/{test_paddle.py → interop/test_paddle.py} +14 -6
  288. warp/tests/{test_torch.py → interop/test_torch.py} +14 -6
  289. warp/tests/run_coverage_serial.py +14 -6
  290. warp/tests/sim/__init__.py +0 -0
  291. warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +23 -16
  292. warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +14 -6
  293. warp/tests/{test_collision.py → sim/test_collision.py} +16 -8
  294. warp/tests/{test_coloring.py → sim/test_coloring.py} +14 -7
  295. warp/tests/{test_model.py → sim/test_model.py} +55 -7
  296. warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +14 -6
  297. warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +16 -7
  298. warp/tests/sim/test_vbd.py +597 -0
  299. warp/tests/test_adam.py +14 -6
  300. warp/tests/test_arithmetic.py +14 -6
  301. warp/tests/test_array.py +14 -6
  302. warp/tests/test_array_reduce.py +14 -6
  303. warp/tests/test_assert.py +14 -6
  304. warp/tests/test_atomic.py +14 -6
  305. warp/tests/test_bool.py +15 -7
  306. warp/tests/test_builtins_resolution.py +14 -6
  307. warp/tests/test_closest_point_edge_edge.py +14 -6
  308. warp/tests/test_codegen.py +14 -6
  309. warp/tests/test_codegen_instancing.py +14 -6
  310. warp/tests/test_compile_consts.py +14 -6
  311. warp/tests/test_conditional.py +14 -6
  312. warp/tests/test_context.py +14 -6
  313. warp/tests/test_copy.py +14 -6
  314. warp/tests/test_ctypes.py +14 -6
  315. warp/tests/test_dense.py +14 -6
  316. warp/tests/test_devices.py +14 -6
  317. warp/tests/test_examples.py +42 -42
  318. warp/tests/test_fabricarray.py +14 -6
  319. warp/tests/test_fast_math.py +14 -6
  320. warp/tests/test_fem.py +37 -10
  321. warp/tests/test_fp16.py +14 -6
  322. warp/tests/test_func.py +14 -6
  323. warp/tests/test_future_annotations.py +14 -6
  324. warp/tests/test_generics.py +14 -6
  325. warp/tests/test_grad.py +14 -6
  326. warp/tests/test_grad_customs.py +14 -6
  327. warp/tests/test_grad_debug.py +14 -6
  328. warp/tests/test_implicit_init.py +14 -6
  329. warp/tests/test_import.py +14 -6
  330. warp/tests/test_indexedarray.py +14 -6
  331. warp/tests/test_intersect.py +14 -6
  332. warp/tests/test_iter.py +14 -6
  333. warp/tests/test_large.py +14 -6
  334. warp/tests/test_launch.py +14 -6
  335. warp/tests/test_lerp.py +14 -6
  336. warp/tests/test_linear_solvers.py +15 -11
  337. warp/tests/test_lvalue.py +14 -6
  338. warp/tests/test_mat.py +247 -85
  339. warp/tests/test_mat_lite.py +14 -6
  340. warp/tests/test_mat_scalar_ops.py +18 -10
  341. warp/tests/test_math.py +14 -6
  342. warp/tests/test_mlp.py +14 -6
  343. warp/tests/test_module_hashing.py +14 -6
  344. warp/tests/test_modules_lite.py +14 -6
  345. warp/tests/test_noise.py +14 -6
  346. warp/tests/test_operators.py +14 -6
  347. warp/tests/test_options.py +14 -6
  348. warp/tests/test_overwrite.py +15 -60
  349. warp/tests/test_print.py +14 -6
  350. warp/tests/test_quat.py +81 -52
  351. warp/tests/test_rand.py +58 -43
  352. warp/tests/test_reload.py +14 -6
  353. warp/tests/test_rounding.py +14 -6
  354. warp/tests/test_runlength_encode.py +14 -6
  355. warp/tests/test_scalar_ops.py +14 -6
  356. warp/tests/test_smoothstep.py +14 -6
  357. warp/tests/test_snippet.py +15 -0
  358. warp/tests/test_sparse.py +61 -12
  359. warp/tests/test_spatial.py +89 -6
  360. warp/tests/test_special_values.py +14 -6
  361. warp/tests/test_static.py +15 -7
  362. warp/tests/test_struct.py +14 -6
  363. warp/tests/test_tape.py +14 -6
  364. warp/tests/test_transient_module.py +14 -6
  365. warp/tests/test_triangle_closest_point.py +14 -6
  366. warp/tests/test_types.py +14 -6
  367. warp/tests/test_utils.py +98 -10
  368. warp/tests/test_vec.py +60 -40
  369. warp/tests/test_vec_lite.py +14 -6
  370. warp/tests/test_vec_scalar_ops.py +14 -6
  371. warp/tests/test_verify_fp.py +14 -6
  372. warp/tests/tile/__init__.py +0 -0
  373. warp/tests/{test_tile.py → tile/test_tile.py} +150 -57
  374. warp/tests/{test_tile_load.py → tile/test_tile_load.py} +15 -7
  375. warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +23 -12
  376. warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +39 -20
  377. warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +74 -7
  378. warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +14 -6
  379. warp/tests/{test_tile_view.py → tile/test_tile_view.py} +15 -7
  380. warp/tests/unittest_serial.py +15 -6
  381. warp/tests/unittest_suites.py +59 -65
  382. warp/tests/unittest_utils.py +16 -7
  383. warp/tests/walkthrough_debug.py +14 -6
  384. warp/thirdparty/unittest_parallel.py +15 -8
  385. warp/torch.py +14 -6
  386. warp/types.py +124 -664
  387. warp/utils.py +151 -78
  388. {warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/METADATA +39 -12
  389. warp_lang-1.7.0.dist-info/RECORD +429 -0
  390. {warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
  391. warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
  392. warp/examples/optim/example_walker.py +0 -309
  393. warp/native/cutlass_gemm.cpp +0 -34
  394. warp/native/cutlass_gemm.cu +0 -373
  395. warp/tests/test_matmul.py +0 -503
  396. warp/tests/test_matmul_lite.py +0 -403
  397. warp/tests/test_vbd.py +0 -378
  398. warp/tests/unused_test_misc.py +0 -69
  399. warp_lang-1.6.1.dist-info/LICENSE.md +0 -126
  400. warp_lang-1.6.1.dist-info/RECORD +0 -419
  401. {warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0
warp/native/sparse.cpp CHANGED
@@ -1,9 +1,18 @@
-/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto. Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "warp.h"
@@ -72,7 +81,8 @@ template <typename T> void bsr_dyn_block_transpose(const T* src, T* dest, int ro
 template <typename T>
 int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_block, const int row_count,
                                   const int nnz, const int* tpl_rows, const int* tpl_columns, const T* tpl_values,
-                                  const bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns, T* bsr_values)
+                                  const bool prune_numerical_zeros, const bool masked, int* bsr_offsets,
+                                  int* bsr_columns, T* bsr_values)
 {
 
     // get specialized accumulator for common block sizes (1,1), (1,2), (1,3),
@@ -115,14 +125,33 @@ int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_b
     std::iota(block_indices.begin(), block_indices.end(), 0);
 
     // remove zero blocks and invalid row indices
-    block_indices.erase(std::remove_if(block_indices.begin(), block_indices.end(),
-                                       [&](int i)
-                                       {
-                                           return tpl_rows[i] < 0 || tpl_rows[i] >= row_count ||
-                                                  (prune_numerical_zeros && tpl_values &&
-                                                   block_is_zero_func(tpl_values + i * block_size, block_size));
-                                       }),
-                        block_indices.end());
+
+    auto discard_block = [&](int i)
+    {
+        const int row = tpl_rows[i];
+        if (row < 0 || row >= row_count)
+        {
+            return true;
+        }
+
+        if (prune_numerical_zeros && tpl_values && block_is_zero_func(tpl_values + i * block_size, block_size))
+        {
+            return true;
+        }
+
+        if (!masked)
+        {
+            return false;
+        }
+
+        const int* beg = bsr_columns + bsr_offsets[row];
+        const int* end = bsr_columns + bsr_offsets[row + 1];
+        const int col = tpl_columns[i];
+        const int* block = std::lower_bound(beg, end, col);
+        return block == end || *block != col;
+    };
+
+    block_indices.erase(std::remove_if(block_indices.begin(), block_indices.end(), discard_block), block_indices.end());
 
     // sort block indices according to lexico order
     std::sort(block_indices.begin(), block_indices.end(), [tpl_rows, tpl_columns](int i, int j) -> bool
@@ -272,12 +301,12 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, i
 
 WP_API void bsr_matrix_from_triplets_float_host(int rows_per_block, int cols_per_block, int row_count, int nnz,
                                                 int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                                bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
-                                                void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
+                                                bool prune_numerical_zeros, bool masked, int* bsr_offsets,
+                                                int* bsr_columns, void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
     bsr_matrix_from_triplets_host<float>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns,
-                                         static_cast<const float*>(tpl_values), prune_numerical_zeros, bsr_offsets,
-                                         bsr_columns, static_cast<float*>(bsr_values));
+                                         static_cast<const float*>(tpl_values), prune_numerical_zeros, masked,
+                                         bsr_offsets, bsr_columns, static_cast<float*>(bsr_values));
     if (bsr_nnz)
     {
        *bsr_nnz = bsr_offsets[row_count];
@@ -286,12 +315,12 @@ WP_API void bsr_matrix_from_triplets_float_host(int rows_per_block, int cols_per
 
 WP_API void bsr_matrix_from_triplets_double_host(int rows_per_block, int cols_per_block, int row_count, int nnz,
                                                  int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                                 bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
-                                                 void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
+                                                 bool prune_numerical_zeros, bool masked, int* bsr_offsets,
+                                                 int* bsr_columns, void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
     bsr_matrix_from_triplets_host<double>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns,
-                                          static_cast<const double*>(tpl_values), prune_numerical_zeros, bsr_offsets,
-                                          bsr_columns, static_cast<double*>(bsr_values));
+                                          static_cast<const double*>(tpl_values), prune_numerical_zeros, masked,
+                                          bsr_offsets, bsr_columns, static_cast<double*>(bsr_values));
     if (bsr_nnz)
     {
        *bsr_nnz = bsr_offsets[row_count];
@@ -318,16 +347,17 @@ WP_API void bsr_transpose_double_host(int rows_per_block, int cols_per_block, in
 
 #if !WP_ENABLE_CUDA
 WP_API void bsr_matrix_from_triplets_float_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
-                                                  int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                                  bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
-                                                  void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
+                                                  int* tpl_rows, int* tpl_columns, void* tpl_values,
+                                                  bool prune_numerical_zeros, bool masked, int* bsr_offsets,
+                                                  int* bsr_columns, void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
 }
 
 WP_API void bsr_matrix_from_triplets_double_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
                                                    int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                                   bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
-                                                   void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
+                                                   bool prune_numerical_zeros, bool masked, int* bsr_offsets,
+                                                   int* bsr_columns, void* bsr_values, int* bsr_nnz,
+                                                   void* bsr_nnz_event)
 {
 }
 
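Note on the masked path above: when masked is true, bsr_matrix_from_triplets_host keeps only triplets whose (row, column) block already exists in the pattern described by bsr_offsets/bsr_columns; because each row's column indices are sorted, membership reduces to a std::lower_bound probe over that row's slice. A minimal standalone sketch of the same lookup (block_in_pattern is a hypothetical helper for illustration, not part of Warp):

    // Sketch: membership test against a fixed BSR sparsity pattern,
    // mirroring the std::lower_bound check in discard_block above.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Assumes CSR-style row offsets and per-row sorted column indices.
    bool block_in_pattern(const std::vector<int>& bsr_offsets,
                          const std::vector<int>& bsr_columns,
                          int row, int col)
    {
        const int* beg = bsr_columns.data() + bsr_offsets[row];
        const int* end = bsr_columns.data() + bsr_offsets[row + 1];
        const int* it = std::lower_bound(beg, end, col);
        return it != end && *it == col;
    }

    int main()
    {
        // 3-row pattern: row 0 -> {0, 2}, row 1 -> {1}, row 2 -> {}
        const std::vector<int> offsets = {0, 2, 3, 3};
        const std::vector<int> columns = {0, 2, 1};

        std::printf("%d\n", block_in_pattern(offsets, columns, 0, 2)); // 1: triplet kept
        std::printf("%d\n", block_in_pattern(offsets, columns, 1, 0)); // 0: discarded when masked
        std::printf("%d\n", block_in_pattern(offsets, columns, 2, 0)); // 0: empty row
        return 0;
    }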
warp/native/sparse.cu CHANGED
@@ -1,9 +1,18 @@
-/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto. Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "cuda_util.h"
@@ -52,10 +61,41 @@ template <typename T> struct BsrBlockIsNotZero
     }
 };
 
+struct BsrBlockInMask
+{
+    const int* bsr_offsets;
+    const int* bsr_columns;
+
+    CUDA_CALLABLE_DEVICE bool operator()(int row, int col) const
+    {
+        if (bsr_offsets == nullptr)
+            return true;
+
+        int lower = bsr_offsets[row];
+        int upper = bsr_offsets[row + 1] - 1;
+
+        while (lower < upper)
+        {
+            const int mid = lower + (upper - lower) / 2;
+
+            if (bsr_columns[mid] < col)
+            {
+                lower = mid + 1;
+            }
+            else
+            {
+                upper = mid;
+            }
+        }
+
+        return lower == upper && (bsr_columns[lower] == col);
+    }
+};
+
 template <typename T>
 __global__ void bsr_fill_triplet_key_values(const int nnz, const int nrow, const int* tpl_rows, const int* tpl_columns,
-                                            const BsrBlockIsNotZero<T> nonZero, uint32_t* block_indices,
-                                            BsrRowCol* tpl_row_col)
+                                            const BsrBlockIsNotZero<T> nonZero, const BsrBlockInMask mask,
+                                            uint32_t* block_indices, BsrRowCol* tpl_row_col)
 {
     int block = blockIdx.x * blockDim.x + threadIdx.x;
     if (block >= nnz)
@@ -65,7 +105,8 @@ __global__ void bsr_fill_triplet_key_values(const int nnz, const int nrow, const
     const int col = tpl_columns[block];
     const bool is_valid = row >= 0 && row < nrow;
 
-    const BsrRowCol row_col = is_valid && nonZero(block) ? bsr_combine_row_col(row, col) : PRUNED_ROWCOL;
+    const BsrRowCol row_col =
+        is_valid && nonZero(block) && mask(row, col) ? bsr_combine_row_col(row, col) : PRUNED_ROWCOL;
     tpl_row_col[block] = row_col;
     block_indices[block] = block;
 }
@@ -113,7 +154,7 @@ __global__ void bsr_find_row_offsets(uint32_t row_count, const T* d_nnz, const B
 }
 
 template <typename T>
-__global__ void bsr_merge_blocks(const uint32_t* d_nnz, int block_size, const uint32_t* block_offsets,
+__global__ void bsr_merge_blocks(const int* d_nnz, int block_size, const uint32_t* block_offsets,
                                  const uint32_t* sorted_block_indices, const BsrRowCol* unique_row_cols,
                                  const T* tpl_values, int* bsr_cols, T* bsr_values)
 
@@ -154,8 +195,8 @@ __global__ void bsr_merge_blocks(const uint32_t* d_nnz, int block_size, const ui
 template <typename T>
 void bsr_matrix_from_triplets_device(const int rows_per_block, const int cols_per_block, const int row_count,
                                      const int nnz, const int* tpl_rows, const int* tpl_columns, const T* tpl_values,
-                                     const bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
-                                     T* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
+                                     const bool prune_numerical_zeros, const bool masked, int* bsr_offsets,
+                                     int* bsr_columns, T* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
     const int block_size = rows_per_block * cols_per_block;
 
@@ -177,8 +218,9 @@ void bsr_matrix_from_triplets_device(const int rows_per_block, const int cols_pe
 
     // Combine rows and columns so we can sort on them both
     BsrBlockIsNotZero<T> isNotZero{block_size, prune_numerical_zeros ? tpl_values : nullptr};
+    BsrBlockInMask mask{masked ? bsr_offsets : nullptr, bsr_columns};
     wp_launch_device(WP_CURRENT_CONTEXT, bsr_fill_triplet_key_values, nnz,
-                     (nnz, row_count, tpl_rows, tpl_columns, isNotZero, d_keys.Current(), d_values.Current()));
+                     (nnz, row_count, tpl_rows, tpl_columns, isNotZero, mask, d_keys.Current(), d_values.Current()));
 
     // Sort
     {
@@ -205,7 +247,7 @@ void bsr_matrix_from_triplets_device(const int rows_per_block, const int cols_pe
 
     if (bsr_nnz)
     {
-        // Copy nnz to host, and record an event for the competed transfer if desired
+        // Copy nnz to host, and record an event for the completed transfer if desired
 
         memcpy_d2h(WP_CURRENT_CONTEXT, bsr_nnz, bsr_offsets + row_count, sizeof(int), stream);
 
@@ -227,7 +269,7 @@ void bsr_matrix_from_triplets_device(const int rows_per_block, const int cols_pe
 
     // Accumulate repeated blocks and set column indices
     wp_launch_device(WP_CURRENT_CONTEXT, bsr_merge_blocks, nnz,
-                     (unique_triplet_count, block_size, d_keys.Alternate(), d_keys.Current(), d_values.Alternate(),
+                     (bsr_offsets + row_count, block_size, d_keys.Alternate(), d_keys.Current(), d_values.Alternate(),
                       tpl_values, bsr_columns, bsr_values));
 }
 
@@ -443,22 +485,24 @@ void bsr_transpose_device(int rows_per_block, int cols_per_block, int row_count,
 
 void bsr_matrix_from_triplets_float_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
                                            int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                           bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                           bool prune_numerical_zeros, bool masked, int* bsr_offsets, int* bsr_columns,
                                            void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-    return bsr_matrix_from_triplets_device<float>(
-        rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns, static_cast<const float*>(tpl_values),
-        prune_numerical_zeros, bsr_offsets, bsr_columns, static_cast<float*>(bsr_values), bsr_nnz, bsr_nnz_event);
+    return bsr_matrix_from_triplets_device<float>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns,
+                                                  static_cast<const float*>(tpl_values), prune_numerical_zeros, masked,
+                                                  bsr_offsets, bsr_columns, static_cast<float*>(bsr_values), bsr_nnz,
+                                                  bsr_nnz_event);
 }
 
 void bsr_matrix_from_triplets_double_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
                                             int* tpl_rows, int* tpl_columns, void* tpl_values,
-                                            bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                            bool prune_numerical_zeros, bool masked, int* bsr_offsets, int* bsr_columns,
                                             void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-    return bsr_matrix_from_triplets_device<double>(
-        rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns, static_cast<const double*>(tpl_values),
-        prune_numerical_zeros, bsr_offsets, bsr_columns, static_cast<double*>(bsr_values), bsr_nnz, bsr_nnz_event);
+    return bsr_matrix_from_triplets_device<double>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows,
+                                                   tpl_columns, static_cast<const double*>(tpl_values),
+                                                   prune_numerical_zeros, masked, bsr_offsets, bsr_columns,
+                                                   static_cast<double*>(bsr_values), bsr_nnz, bsr_nnz_event);
}
 
 void bsr_transpose_float_device(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
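The CUDA path implements the same membership test in BsrBlockInMask with an explicit binary search (no std::lower_bound on device); a null bsr_offsets disables masking, so unmasked assembly pays only a pointer check per triplet. A host-side port of that loop, checked against std::lower_bound for agreement (in_mask is a hypothetical test harness, not Warp code):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Same search as BsrBlockInMask::operator(): find the first column >= col
    // in [offsets[row], offsets[row + 1]) and check that it equals col.
    bool in_mask(const std::vector<int>& offsets, const std::vector<int>& columns, int row, int col)
    {
        int lower = offsets[row];
        int upper = offsets[row + 1] - 1; // empty row => upper < lower, loop skipped
        while (lower < upper)
        {
            const int mid = lower + (upper - lower) / 2;
            if (columns[mid] < col)
                lower = mid + 1;
            else
                upper = mid;
        }
        return lower == upper && columns[lower] == col;
    }

    int main()
    {
        // Pattern: row 0 -> {1, 4, 7}, row 1 -> {}, row 2 -> {0, 2}
        const std::vector<int> offsets = {0, 3, 3, 5};
        const std::vector<int> columns = {1, 4, 7, 0, 2};
        for (int row = 0; row < 3; ++row)
            for (int col = 0; col < 8; ++col)
            {
                const int* beg = columns.data() + offsets[row];
                const int* end = columns.data() + offsets[row + 1];
                const int* it = std::lower_bound(beg, end, col);
                const bool expected = (it != end && *it == col);
                assert(in_mask(offsets, columns, row, col) == expected);
            }
        return 0;
    }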
warp/native/spatial.h CHANGED
@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto. Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #pragma once
warp/native/svd.h CHANGED
@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto. Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 // The MIT License (MIT)
@@ -423,6 +432,62 @@ void _svd(// input A
     );
 }
 
+
+template<typename Type>
+inline CUDA_CALLABLE
+void _svd_2(// input A
+            Type a11, Type a12,
+            Type a21, Type a22,
+            // output U
+            Type &u11, Type &u12,
+            Type &u21, Type &u22,
+            // output S
+            Type &s11, Type &s12,
+            Type &s21, Type &s22,
+            // output V
+            Type &v11, Type &v12,
+            Type &v21, Type &v22)
+{
+    // Step 1: Compute ATA
+    Type ATA11 = a11 * a11 + a21 * a21;
+    Type ATA12 = a11 * a12 + a21 * a22;
+    Type ATA22 = a12 * a12 + a22 * a22;
+
+    // Step 2: Eigenanalysis
+    Type trace = ATA11 + ATA22;
+    Type det = ATA11 * ATA22 - ATA12 * ATA12;
+    Type sqrt_term = sqrt(trace * trace - Type(4.0) * det);
+    Type lambda1 = (trace + sqrt_term) * Type(0.5);
+    Type lambda2 = (trace - sqrt_term) * Type(0.5);
+
+    // Step 3: Singular values
+    Type sigma1 = sqrt(lambda1);
+    Type sigma2 = sqrt(lambda2);
+
+    // Step 4: Eigenvectors (find V)
+    Type v1x = ATA12, v1y = lambda1 - ATA11; // For first eigenvector
+    Type v2x = ATA12, v2y = lambda2 - ATA11; // For second eigenvector
+    Type norm1 = sqrt(v1x * v1x + v1y * v1y);
+    Type norm2 = sqrt(v2x * v2x + v2y * v2y);
+
+    v11 = v1x / norm1; v12 = v2x / norm2;
+    v21 = v1y / norm1; v22 = v2y / norm2;
+
+    // Step 5: Compute U
+    Type inv_sigma1 = (sigma1 > Type(1e-6)) ? Type(1.0) / sigma1 : Type(0.0);
+    Type inv_sigma2 = (sigma2 > Type(1e-6)) ? Type(1.0) / sigma2 : Type(0.0);
+
+    u11 = (a11 * v11 + a12 * v21) * inv_sigma1;
+    u12 = (a11 * v12 + a12 * v22) * inv_sigma2;
+    u21 = (a21 * v11 + a22 * v21) * inv_sigma1;
+    u22 = (a21 * v12 + a22 * v22) * inv_sigma2;
+
+    // Step 6: Set S
+    s11 = sigma1; s12 = Type(0.0);
+    s21 = Type(0.0); s22 = sigma2;
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void svd3(const mat_t<3,3,Type>& A, mat_t<3,3,Type>& U, vec_t<3,Type>& sigma, mat_t<3,3,Type>& V) {
     Type s12, s13, s21, s23, s31, s32;
@@ -483,6 +548,66 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
     adj_A = adj_A + (u_term + v_term + sigma_term);
 }
 
+template<typename Type>
+inline CUDA_CALLABLE void svd2(const mat_t<2,2,Type>& A, mat_t<2,2,Type>& U, vec_t<2,Type>& sigma, mat_t<2,2,Type>& V) {
+    Type s12, s21;
+    _svd_2(A.data[0][0], A.data[0][1],
+           A.data[1][0], A.data[1][1],
+
+           U.data[0][0], U.data[0][1],
+           U.data[1][0], U.data[1][1],
+
+           sigma[0], s12,
+           s21, sigma[1],
+
+           V.data[0][0], V.data[0][1],
+           V.data[1][0], V.data[1][1]);
+}
+
+template<typename Type>
+inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
+                                   const mat_t<2,2,Type>& U,
+                                   const vec_t<2,Type>& sigma,
+                                   const mat_t<2,2,Type>& V,
+                                   mat_t<2,2,Type>& adj_A,
+                                   const mat_t<2,2,Type>& adj_U,
+                                   const vec_t<2,Type>& adj_sigma,
+                                   const mat_t<2,2,Type>& adj_V) {
+    Type s1_squared = sigma[0] * sigma[0];
+    Type s2_squared = sigma[1] * sigma[1];
+
+    // Compute inverse of (s1^2 - s2^2) if possible, use small epsilon to prevent division by zero
+    Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-1e-6f));
+
+    // Construct the matrix F for the adjoint
+    mat_t<2,2,Type> F = mat_t<2,2,Type>(0.0, F01,
+                                        -F01, 0.0);
+
+    // Create a matrix to handle the adjoint of the singular values (diagonal matrix)
+    mat_t<2,2,Type> adj_sigma_mat = mat_t<2,2,Type>(adj_sigma[0], 0.0,
+                                                    0.0, adj_sigma[1]);
+
+    // Matrix for handling singular values (diagonal matrix with sigma values)
+    mat_t<2,2,Type> s_mat = mat_t<2,2,Type>(sigma[0], 0.0,
+                                            0.0, sigma[1]);
+
+    // Compute the transpose of U and V
+    mat_t<2,2,Type> UT = transpose(U);
+    mat_t<2,2,Type> VT = transpose(V);
+
+    // Compute the term for sigma (diagonal matrix of adjoint singular values)
+    mat_t<2,2,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
+
+    // Compute the adjoint contributions for U (left singular vectors)
+    mat_t<2,2,Type> u_term = mul(mul(U, mul(cw_mul(F, (mul(UT, adj_U) - mul(transpose(adj_U), U))), s_mat)), VT);
+
+    // Compute the adjoint contributions for V (right singular vectors)
+    mat_t<2,2,Type> v_term = mul(U, mul(s_mat, mul(cw_mul(F, (mul(VT, adj_V) - mul(transpose(adj_V), V))), VT)));
+
+    // Combine the terms to compute the adjoint of A
+    adj_A = adj_A + (u_term + v_term + sigma_term);
+}
+
 
 template<typename Type>
 inline CUDA_CALLABLE void qr3(const mat_t<3,3,Type>& A, mat_t<3,3,Type>& Q, mat_t<3,3,Type>& R) {
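The new _svd_2 builds a 2x2 SVD from the eigen-decomposition of A^T A: the eigenvalues lambda1 >= lambda2 come from the trace/determinant quadratic, the singular values are sigma_i = sqrt(lambda_i), the right singular vectors solve (A^T A - lambda_i * I) v = 0, and the left singular vectors follow from u_i = A v_i / sigma_i. A standalone sketch of the same construction, verifying that U * diag(sigma) * V^T reconstructs A for one sample matrix (assumes A^T A has a nonzero off-diagonal, as the routine's eigenvector formula does; not Warp code):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Sample matrix A
        const double a11 = 2.0, a12 = 1.0, a21 = -1.0, a22 = 3.0;

        // Eigenanalysis of A^T A, mirroring steps 1-3 of _svd_2
        const double ata11 = a11 * a11 + a21 * a21;
        const double ata12 = a11 * a12 + a21 * a22;
        const double ata22 = a12 * a12 + a22 * a22;
        const double trace = ata11 + ata22;
        const double det = ata11 * ata22 - ata12 * ata12;
        const double root = std::sqrt(trace * trace - 4.0 * det);
        const double lambda1 = 0.5 * (trace + root), lambda2 = 0.5 * (trace - root);
        const double sigma1 = std::sqrt(lambda1), sigma2 = std::sqrt(lambda2);

        // Right singular vectors from (A^T A - lambda * I) v = 0 (step 4)
        double v1x = ata12, v1y = lambda1 - ata11;
        double v2x = ata12, v2y = lambda2 - ata11;
        const double n1 = std::hypot(v1x, v1y), n2 = std::hypot(v2x, v2y);
        v1x /= n1; v1y /= n1;
        v2x /= n2; v2y /= n2;

        // Left singular vectors u_i = A v_i / sigma_i (step 5)
        const double u1x = (a11 * v1x + a12 * v1y) / sigma1, u1y = (a21 * v1x + a22 * v1y) / sigma1;
        const double u2x = (a11 * v2x + a12 * v2y) / sigma2, u2y = (a21 * v2x + a22 * v2y) / sigma2;

        // Reconstruct A = sigma1 * u1 v1^T + sigma2 * u2 v2^T; error should be ~1e-15
        double err = 0.0;
        err = std::fmax(err, std::fabs(sigma1 * u1x * v1x + sigma2 * u2x * v2x - a11));
        err = std::fmax(err, std::fabs(sigma1 * u1x * v1y + sigma2 * u2x * v2y - a12));
        err = std::fmax(err, std::fabs(sigma1 * u1y * v1x + sigma2 * u2y * v2x - a21));
        err = std::fmax(err, std::fabs(sigma1 * u1y * v1y + sigma2 * u2y * v2y - a22));
        std::printf("max reconstruction error: %g\n", err);
        return 0;
    }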
warp/native/temp_buffer.h CHANGED
@@ -1,9 +1,18 @@
-/** Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto. Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #pragma once