PyPI - warp-lang - Versions diffs - 1.7.0__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.7.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (429) hide show

warp/__init__.py +139 -0
warp/__init__.pyi +1 -0
warp/autograd.py +1142 -0
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +557 -0
warp/build_dll.py +405 -0
warp/builtins.py +6855 -0
warp/codegen.py +3969 -0
warp/config.py +158 -0
warp/constants.py +57 -0
warp/context.py +6812 -0
warp/dlpack.py +462 -0
warp/examples/__init__.py +24 -0
warp/examples/assets/bear.usd +0 -0
warp/examples/assets/bunny.usd +0 -0
warp/examples/assets/cartpole.urdf +110 -0
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/cube.usd +0 -0
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nv_ant.xml +92 -0
warp/examples/assets/nv_humanoid.xml +183 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/assets/pixel.jpg +0 -0
warp/examples/assets/quadruped.urdf +268 -0
warp/examples/assets/rocks.nvdb +0 -0
warp/examples/assets/rocks.usd +0 -0
warp/examples/assets/sphere.usd +0 -0
warp/examples/assets/square_cloth.usd +0 -0
warp/examples/benchmarks/benchmark_api.py +389 -0
warp/examples/benchmarks/benchmark_cloth.py +296 -0
warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
warp/examples/benchmarks/benchmark_gemm.py +164 -0
warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
warp/examples/benchmarks/benchmark_launches.py +301 -0
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/browse.py +37 -0
warp/examples/core/example_cupy.py +86 -0
warp/examples/core/example_dem.py +241 -0
warp/examples/core/example_fluid.py +299 -0
warp/examples/core/example_graph_capture.py +150 -0
warp/examples/core/example_marching_cubes.py +194 -0
warp/examples/core/example_mesh.py +180 -0
warp/examples/core/example_mesh_intersect.py +211 -0
warp/examples/core/example_nvdb.py +182 -0
warp/examples/core/example_raycast.py +111 -0
warp/examples/core/example_raymarch.py +205 -0
warp/examples/core/example_render_opengl.py +193 -0
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/core/example_sph.py +411 -0
warp/examples/core/example_torch.py +211 -0
warp/examples/core/example_wave.py +269 -0
warp/examples/fem/example_adaptive_grid.py +286 -0
warp/examples/fem/example_apic_fluid.py +423 -0
warp/examples/fem/example_burgers.py +261 -0
warp/examples/fem/example_convection_diffusion.py +178 -0
warp/examples/fem/example_convection_diffusion_dg.py +204 -0
warp/examples/fem/example_deformed_geometry.py +172 -0
warp/examples/fem/example_diffusion.py +196 -0
warp/examples/fem/example_diffusion_3d.py +225 -0
warp/examples/fem/example_diffusion_mgpu.py +220 -0
warp/examples/fem/example_distortion_energy.py +228 -0
warp/examples/fem/example_magnetostatics.py +240 -0
warp/examples/fem/example_mixed_elasticity.py +291 -0
warp/examples/fem/example_navier_stokes.py +261 -0
warp/examples/fem/example_nonconforming_contact.py +298 -0
warp/examples/fem/example_stokes.py +213 -0
warp/examples/fem/example_stokes_transfer.py +262 -0
warp/examples/fem/example_streamlines.py +352 -0
warp/examples/fem/utils.py +1000 -0
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_bounce.py +266 -0
warp/examples/optim/example_cloth_throw.py +228 -0
warp/examples/optim/example_diffray.py +561 -0
warp/examples/optim/example_drone.py +870 -0
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/optim/example_inverse_kinematics.py +182 -0
warp/examples/optim/example_inverse_kinematics_torch.py +191 -0
warp/examples/optim/example_softbody_properties.py +400 -0
warp/examples/optim/example_spring_cage.py +245 -0
warp/examples/optim/example_trajectory.py +227 -0
warp/examples/sim/example_cartpole.py +143 -0
warp/examples/sim/example_cloth.py +225 -0
warp/examples/sim/example_cloth_self_contact.py +322 -0
warp/examples/sim/example_granular.py +130 -0
warp/examples/sim/example_granular_collision_sdf.py +202 -0
warp/examples/sim/example_jacobian_ik.py +244 -0
warp/examples/sim/example_particle_chain.py +124 -0
warp/examples/sim/example_quadruped.py +203 -0
warp/examples/sim/example_rigid_chain.py +203 -0
warp/examples/sim/example_rigid_contact.py +195 -0
warp/examples/sim/example_rigid_force.py +133 -0
warp/examples/sim/example_rigid_gyroscopic.py +115 -0
warp/examples/sim/example_rigid_soft_contact.py +140 -0
warp/examples/sim/example_soft_body.py +196 -0
warp/examples/tile/example_tile_cholesky.py +87 -0
warp/examples/tile/example_tile_convolution.py +66 -0
warp/examples/tile/example_tile_fft.py +55 -0
warp/examples/tile/example_tile_filtering.py +113 -0
warp/examples/tile/example_tile_matmul.py +85 -0
warp/examples/tile/example_tile_mlp.py +383 -0
warp/examples/tile/example_tile_nbody.py +199 -0
warp/examples/tile/example_tile_walker.py +327 -0
warp/fabric.py +355 -0
warp/fem/__init__.py +106 -0
warp/fem/adaptivity.py +508 -0
warp/fem/cache.py +572 -0
warp/fem/dirichlet.py +202 -0
warp/fem/domain.py +411 -0
warp/fem/field/__init__.py +125 -0
warp/fem/field/field.py +619 -0
warp/fem/field/nodal_field.py +326 -0
warp/fem/field/restriction.py +37 -0
warp/fem/field/virtual.py +848 -0
warp/fem/geometry/__init__.py +32 -0
warp/fem/geometry/adaptive_nanogrid.py +857 -0
warp/fem/geometry/closest_point.py +84 -0
warp/fem/geometry/deformed_geometry.py +221 -0
warp/fem/geometry/element.py +776 -0
warp/fem/geometry/geometry.py +362 -0
warp/fem/geometry/grid_2d.py +392 -0
warp/fem/geometry/grid_3d.py +452 -0
warp/fem/geometry/hexmesh.py +911 -0
warp/fem/geometry/nanogrid.py +571 -0
warp/fem/geometry/partition.py +389 -0
warp/fem/geometry/quadmesh.py +663 -0
warp/fem/geometry/tetmesh.py +855 -0
warp/fem/geometry/trimesh.py +806 -0
warp/fem/integrate.py +2335 -0
warp/fem/linalg.py +419 -0
warp/fem/operator.py +293 -0
warp/fem/polynomial.py +229 -0
warp/fem/quadrature/__init__.py +17 -0
warp/fem/quadrature/pic_quadrature.py +299 -0
warp/fem/quadrature/quadrature.py +591 -0
warp/fem/space/__init__.py +228 -0
warp/fem/space/basis_function_space.py +468 -0
warp/fem/space/basis_space.py +667 -0
warp/fem/space/dof_mapper.py +251 -0
warp/fem/space/function_space.py +309 -0
warp/fem/space/grid_2d_function_space.py +177 -0
warp/fem/space/grid_3d_function_space.py +227 -0
warp/fem/space/hexmesh_function_space.py +257 -0
warp/fem/space/nanogrid_function_space.py +201 -0
warp/fem/space/partition.py +367 -0
warp/fem/space/quadmesh_function_space.py +223 -0
warp/fem/space/restriction.py +179 -0
warp/fem/space/shape/__init__.py +143 -0
warp/fem/space/shape/cube_shape_function.py +1105 -0
warp/fem/space/shape/shape_function.py +133 -0
warp/fem/space/shape/square_shape_function.py +926 -0
warp/fem/space/shape/tet_shape_function.py +834 -0
warp/fem/space/shape/triangle_shape_function.py +672 -0
warp/fem/space/tetmesh_function_space.py +271 -0
warp/fem/space/topology.py +424 -0
warp/fem/space/trimesh_function_space.py +194 -0
warp/fem/types.py +99 -0
warp/fem/utils.py +420 -0
warp/jax.py +187 -0
warp/jax_experimental/__init__.py +16 -0
warp/jax_experimental/custom_call.py +351 -0
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +244 -0
warp/native/array.h +1145 -0
warp/native/builtin.h +1800 -0
warp/native/bvh.cpp +492 -0
warp/native/bvh.cu +791 -0
warp/native/bvh.h +554 -0
warp/native/clang/clang.cpp +536 -0
warp/native/coloring.cpp +613 -0
warp/native/crt.cpp +51 -0
warp/native/crt.h +362 -0
warp/native/cuda_crt.h +1058 -0
warp/native/cuda_util.cpp +646 -0
warp/native/cuda_util.h +307 -0
warp/native/error.cpp +77 -0
warp/native/error.h +36 -0
warp/native/exports.h +1878 -0
warp/native/fabric.h +245 -0
warp/native/hashgrid.cpp +311 -0
warp/native/hashgrid.cu +87 -0
warp/native/hashgrid.h +240 -0
warp/native/initializer_array.h +41 -0
warp/native/intersect.h +1230 -0
warp/native/intersect_adj.h +375 -0
warp/native/intersect_tri.h +339 -0
warp/native/marching.cpp +19 -0
warp/native/marching.cu +514 -0
warp/native/marching.h +19 -0
warp/native/mat.h +2220 -0
warp/native/mathdx.cpp +87 -0
warp/native/matnn.h +343 -0
warp/native/mesh.cpp +266 -0
warp/native/mesh.cu +404 -0
warp/native/mesh.h +1980 -0
warp/native/nanovdb/GridHandle.h +366 -0
warp/native/nanovdb/HostBuffer.h +590 -0
warp/native/nanovdb/NanoVDB.h +6624 -0
warp/native/nanovdb/PNanoVDB.h +3390 -0
warp/native/noise.h +859 -0
warp/native/quat.h +1371 -0
warp/native/rand.h +342 -0
warp/native/range.h +139 -0
warp/native/reduce.cpp +174 -0
warp/native/reduce.cu +364 -0
warp/native/runlength_encode.cpp +79 -0
warp/native/runlength_encode.cu +61 -0
warp/native/scan.cpp +47 -0
warp/native/scan.cu +53 -0
warp/native/scan.h +23 -0
warp/native/solid_angle.h +466 -0
warp/native/sort.cpp +251 -0
warp/native/sort.cu +277 -0
warp/native/sort.h +33 -0
warp/native/sparse.cpp +378 -0
warp/native/sparse.cu +524 -0
warp/native/spatial.h +657 -0
warp/native/svd.h +702 -0
warp/native/temp_buffer.h +46 -0
warp/native/tile.h +2584 -0
warp/native/tile_reduce.h +264 -0
warp/native/vec.h +1426 -0
warp/native/volume.cpp +501 -0
warp/native/volume.cu +67 -0
warp/native/volume.h +969 -0
warp/native/volume_builder.cu +477 -0
warp/native/volume_builder.h +52 -0
warp/native/volume_impl.h +70 -0
warp/native/warp.cpp +1082 -0
warp/native/warp.cu +3636 -0
warp/native/warp.h +381 -0
warp/optim/__init__.py +17 -0
warp/optim/adam.py +163 -0
warp/optim/linear.py +1137 -0
warp/optim/sgd.py +112 -0
warp/paddle.py +407 -0
warp/render/__init__.py +18 -0
warp/render/render_opengl.py +3518 -0
warp/render/render_usd.py +784 -0
warp/render/utils.py +160 -0
warp/sim/__init__.py +65 -0
warp/sim/articulation.py +793 -0
warp/sim/collide.py +2395 -0
warp/sim/graph_coloring.py +300 -0
warp/sim/import_mjcf.py +790 -0
warp/sim/import_snu.py +227 -0
warp/sim/import_urdf.py +579 -0
warp/sim/import_usd.py +894 -0
warp/sim/inertia.py +324 -0
warp/sim/integrator.py +242 -0
warp/sim/integrator_euler.py +1997 -0
warp/sim/integrator_featherstone.py +2101 -0
warp/sim/integrator_vbd.py +2048 -0
warp/sim/integrator_xpbd.py +3292 -0
warp/sim/model.py +4791 -0
warp/sim/particles.py +121 -0
warp/sim/render.py +427 -0
warp/sim/utils.py +428 -0
warp/sparse.py +2057 -0
warp/stubs.py +3333 -0
warp/tape.py +1203 -0
warp/tests/__init__.py +1 -0
warp/tests/__main__.py +4 -0
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/mlp_golden.npy +0 -0
warp/tests/assets/pixel.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/spiky.usd +0 -0
warp/tests/assets/test_grid.nvdb +0 -0
warp/tests/assets/test_index_grid.nvdb +0 -0
warp/tests/assets/test_int32_grid.nvdb +0 -0
warp/tests/assets/test_vec_grid.nvdb +0 -0
warp/tests/assets/torus.nvdb +0 -0
warp/tests/assets/torus.usda +105 -0
warp/tests/aux_test_class_kernel.py +34 -0
warp/tests/aux_test_compile_consts_dummy.py +18 -0
warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
warp/tests/aux_test_dependent.py +29 -0
warp/tests/aux_test_grad_customs.py +29 -0
warp/tests/aux_test_instancing_gc.py +26 -0
warp/tests/aux_test_module_unload.py +23 -0
warp/tests/aux_test_name_clash1.py +40 -0
warp/tests/aux_test_name_clash2.py +40 -0
warp/tests/aux_test_reference.py +9 -0
warp/tests/aux_test_reference_reference.py +8 -0
warp/tests/aux_test_square.py +16 -0
warp/tests/aux_test_unresolved_func.py +22 -0
warp/tests/aux_test_unresolved_symbol.py +22 -0
warp/tests/cuda/__init__.py +0 -0
warp/tests/cuda/test_async.py +676 -0
warp/tests/cuda/test_ipc.py +124 -0
warp/tests/cuda/test_mempool.py +233 -0
warp/tests/cuda/test_multigpu.py +169 -0
warp/tests/cuda/test_peer.py +139 -0
warp/tests/cuda/test_pinned.py +84 -0
warp/tests/cuda/test_streams.py +634 -0
warp/tests/geometry/__init__.py +0 -0
warp/tests/geometry/test_bvh.py +200 -0
warp/tests/geometry/test_hash_grid.py +221 -0
warp/tests/geometry/test_marching_cubes.py +74 -0
warp/tests/geometry/test_mesh.py +316 -0
warp/tests/geometry/test_mesh_query_aabb.py +399 -0
warp/tests/geometry/test_mesh_query_point.py +932 -0
warp/tests/geometry/test_mesh_query_ray.py +311 -0
warp/tests/geometry/test_volume.py +1103 -0
warp/tests/geometry/test_volume_write.py +346 -0
warp/tests/interop/__init__.py +0 -0
warp/tests/interop/test_dlpack.py +729 -0
warp/tests/interop/test_jax.py +371 -0
warp/tests/interop/test_paddle.py +800 -0
warp/tests/interop/test_torch.py +1001 -0
warp/tests/run_coverage_serial.py +39 -0
warp/tests/sim/__init__.py +0 -0
warp/tests/sim/disabled_kinematics.py +244 -0
warp/tests/sim/flaky_test_sim_grad.py +290 -0
warp/tests/sim/test_collision.py +604 -0
warp/tests/sim/test_coloring.py +258 -0
warp/tests/sim/test_model.py +224 -0
warp/tests/sim/test_sim_grad_bounce_linear.py +212 -0
warp/tests/sim/test_sim_kinematics.py +98 -0
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_adam.py +163 -0
warp/tests/test_arithmetic.py +1096 -0
warp/tests/test_array.py +2972 -0
warp/tests/test_array_reduce.py +156 -0
warp/tests/test_assert.py +250 -0
warp/tests/test_atomic.py +153 -0
warp/tests/test_bool.py +220 -0
warp/tests/test_builtins_resolution.py +1298 -0
warp/tests/test_closest_point_edge_edge.py +327 -0
warp/tests/test_codegen.py +810 -0
warp/tests/test_codegen_instancing.py +1495 -0
warp/tests/test_compile_consts.py +215 -0
warp/tests/test_conditional.py +252 -0
warp/tests/test_context.py +42 -0
warp/tests/test_copy.py +238 -0
warp/tests/test_ctypes.py +638 -0
warp/tests/test_dense.py +73 -0
warp/tests/test_devices.py +97 -0
warp/tests/test_examples.py +482 -0
warp/tests/test_fabricarray.py +996 -0
warp/tests/test_fast_math.py +74 -0
warp/tests/test_fem.py +2003 -0
warp/tests/test_fp16.py +136 -0
warp/tests/test_func.py +454 -0
warp/tests/test_future_annotations.py +98 -0
warp/tests/test_generics.py +656 -0
warp/tests/test_grad.py +893 -0
warp/tests/test_grad_customs.py +339 -0
warp/tests/test_grad_debug.py +341 -0
warp/tests/test_implicit_init.py +411 -0
warp/tests/test_import.py +45 -0
warp/tests/test_indexedarray.py +1140 -0
warp/tests/test_intersect.py +73 -0
warp/tests/test_iter.py +76 -0
warp/tests/test_large.py +177 -0
warp/tests/test_launch.py +411 -0
warp/tests/test_lerp.py +151 -0
warp/tests/test_linear_solvers.py +193 -0
warp/tests/test_lvalue.py +427 -0
warp/tests/test_mat.py +2089 -0
warp/tests/test_mat_lite.py +122 -0
warp/tests/test_mat_scalar_ops.py +2913 -0
warp/tests/test_math.py +178 -0
warp/tests/test_mlp.py +282 -0
warp/tests/test_module_hashing.py +258 -0
warp/tests/test_modules_lite.py +44 -0
warp/tests/test_noise.py +252 -0
warp/tests/test_operators.py +299 -0
warp/tests/test_options.py +129 -0
warp/tests/test_overwrite.py +551 -0
warp/tests/test_print.py +339 -0
warp/tests/test_quat.py +2315 -0
warp/tests/test_rand.py +339 -0
warp/tests/test_reload.py +302 -0
warp/tests/test_rounding.py +185 -0
warp/tests/test_runlength_encode.py +196 -0
warp/tests/test_scalar_ops.py +105 -0
warp/tests/test_smoothstep.py +108 -0
warp/tests/test_snippet.py +318 -0
warp/tests/test_sparse.py +582 -0
warp/tests/test_spatial.py +2229 -0
warp/tests/test_special_values.py +361 -0
warp/tests/test_static.py +592 -0
warp/tests/test_struct.py +734 -0
warp/tests/test_tape.py +204 -0
warp/tests/test_transient_module.py +93 -0
warp/tests/test_triangle_closest_point.py +145 -0
warp/tests/test_types.py +562 -0
warp/tests/test_utils.py +588 -0
warp/tests/test_vec.py +1487 -0
warp/tests/test_vec_lite.py +80 -0
warp/tests/test_vec_scalar_ops.py +2327 -0
warp/tests/test_verify_fp.py +100 -0
warp/tests/tile/__init__.py +0 -0
warp/tests/tile/test_tile.py +780 -0
warp/tests/tile/test_tile_load.py +407 -0
warp/tests/tile/test_tile_mathdx.py +208 -0
warp/tests/tile/test_tile_mlp.py +402 -0
warp/tests/tile/test_tile_reduce.py +447 -0
warp/tests/tile/test_tile_shared_memory.py +247 -0
warp/tests/tile/test_tile_view.py +173 -0
warp/tests/unittest_serial.py +47 -0
warp/tests/unittest_suites.py +427 -0
warp/tests/unittest_utils.py +468 -0
warp/tests/walkthrough_debug.py +93 -0
warp/thirdparty/__init__.py +0 -0
warp/thirdparty/appdirs.py +598 -0
warp/thirdparty/dlpack.py +145 -0
warp/thirdparty/unittest_parallel.py +570 -0
warp/torch.py +391 -0
warp/types.py +5230 -0
warp/utils.py +1137 -0
warp_lang-1.7.0.dist-info/METADATA +516 -0
warp_lang-1.7.0.dist-info/RECORD +429 -0
warp_lang-1.7.0.dist-info/WHEEL +5 -0
warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
warp_lang-1.7.0.dist-info/top_level.txt +1 -0

warp/tests/tile/test_tile_reduce.py ADDED Viewed

@@ -0,0 +1,447 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import warp as wp
+from warp.tests.unittest_utils import *
+# Tile extents wrapped as Warp constants. TILE_M x TILE_N is the 2D tile shape
+# loaded by tile_grouped_sum_kernel; TILE_K is not referenced in the code shown here.
+TILE_M = wp.constant(8)
+TILE_N = wp.constant(4)
+TILE_K = wp.constant(8)
+# num threads per-tile; also used below as the 1D tile length and as block_dim in launches
+TILE_DIM = 64
+# Per-row scaled sum: tile index `i` walks row `i` of `input` in TILE_DIM-wide
+# chunks, accumulates 0.5 * (chunk sum) for each chunk, and stores the total at output[i].
+@wp.kernel
+def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+    n = input.shape[1]
+    # number of TILE_DIM-wide chunks visited in the row
+    count = int(n / TILE_DIM)
+    # one-element accumulator tile, starts at zero
+    s = wp.tile_zeros(shape=1, dtype=float)
+    for j in range(count):
+        a = wp.tile_load(input[i], shape=TILE_DIM, offset=j * TILE_DIM)
+        # the 0.5 scale matches the expected value and gradient checked by test_tile_reduce_sum
+        s += wp.tile_sum(a) * 0.5
+    wp.tile_store(output, s, offset=i)
+def test_tile_reduce_sum(test, device):
+    batch_count = 56
+    N = TILE_DIM * 3
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+    sum_wp = output_wp.numpy()
+    for i in range(batch_count):
+        sum_np = np.sum(input[i]) * 0.5
+        test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+    output_wp.grad.fill_(1.0)
+    tape.backward()
+    assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+# Per-row minimum: loads the first TILE_DIM entries of row `i` as one tile,
+# reduces them with wp.tile_min, and stores the result at output[i].
+@wp.kernel
+def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+    a = wp.tile_load(input[i], shape=TILE_DIM)
+    m = wp.tile_min(a)
+    wp.tile_store(output, m, offset=i)
+def test_tile_reduce_min(test, device):
+    batch_count = 56
+    N = TILE_DIM
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_min_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+    min_wp = output_wp.numpy()
+    for i in range(batch_count):
+        min_np = np.min(input[i])
+        test.assertAlmostEqual(min_wp[i], min_np, places=4)
+# Per-row maximum: loads the first TILE_DIM entries of row `i` as one tile,
+# reduces them with wp.tile_max, and stores the result at output[i].
+@wp.kernel
+def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+    a = wp.tile_load(input[i], shape=TILE_DIM)
+    m = wp.tile_max(a)
+    wp.tile_store(output, m, offset=i)
+def test_tile_reduce_max(test, device):
+    batch_count = 56
+    N = TILE_DIM
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_max_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+    max_wp = output_wp.numpy()
+    for i in range(batch_count):
+        max_np = np.max(input[i])
+        test.assertAlmostEqual(max_wp[i], max_np, places=4)
+# Per-row product: reduces the first TILE_DIM entries of row `i` with wp.mul
+# via the generic wp.tile_reduce and stores the result at output[i].
+@wp.kernel
+def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+    a = wp.tile_load(input[i], shape=TILE_DIM)
+    # wp.mul is the binary operator folded over the tile
+    m = wp.tile_reduce(wp.mul, a)
+    wp.tile_store(output, m, offset=i)
+def test_tile_reduce_custom(test, device):
+    batch_count = 56
+    N = TILE_DIM
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_reduce_custom_kernel,
+            dim=[batch_count],
+            inputs=[input_wp, output_wp],
+            block_dim=TILE_DIM,
+            device=device,
+        )
+    prod_wp = output_wp.numpy()
+    for i in range(batch_count):
+        prod_np = np.prod(input[i])
+        test.assertAlmostEqual(prod_wp[i], prod_np, places=4)
+# Key/value pair reduced by kv_max. initialize_key_value fills `key` with an
+# element's column index and `value` with the float stored at that position.
+@wp.struct
+class KeyValue:
+    key: wp.int32
+    value: wp.float32
+# Binary reduction operator over KeyValue: selects between `a` and `b` by comparing
+# their `value` fields; used as the operator passed to wp.tile_reduce.
+@wp.func
+def kv_max(a: KeyValue, b: KeyValue) -> KeyValue:
+    # presumably yields b when a.value < b.value and a otherwise (so ties keep a) —
+    # confirm against wp.where's argument order
+    return wp.where(a.value < b.value, b, a)
+# Builds the KeyValue grid: entry [batch, idx] gets key=idx (the column index)
+# and value=values[batch, idx]. Launched over a 2D (batch, idx) grid.
+@wp.kernel
+def initialize_key_value(values: wp.array2d(dtype=wp.float32), keyvalues: wp.array2d(dtype=KeyValue)):
+    batch, idx = wp.tid()
+    keyvalues[batch, idx] = KeyValue(idx, values[batch, idx])
+# Reduces row `i` of a KeyValue grid with kv_max and stores the selected
+# KeyValue at res[i]. Backward code generation is disabled for this kernel.
+@wp.kernel(enable_backward=False)
+def tile_reduce_custom_struct_kernel(values: wp.array2d(dtype=KeyValue), res: wp.array(dtype=KeyValue)):
+    # output tile index
+    i = wp.tid()
+    # load row i as a (1, TILE_DIM) tile starting at column 0
+    t = wp.tile_load(values, shape=(1, TILE_DIM), offset=(i, 0))
+    max_el = wp.tile_reduce(kv_max, t)
+    wp.tile_store(res, max_el, offset=i)
+def test_tile_reduce_custom_struct(test, device):
+    batch_count = 56
+    N = TILE_DIM
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+    input_wp = wp.array(input, dtype=wp.float32, device=device)
+    keyvalues_wp = wp.empty(input_wp.shape, dtype=KeyValue, device=device)
+    wp.launch(initialize_key_value, dim=[batch_count, N], inputs=[input_wp], outputs=[keyvalues_wp], device=device)
+    output_wp = wp.empty(batch_count, dtype=KeyValue, device=device)
+    wp.launch_tiled(
+        tile_reduce_custom_struct_kernel,
+        dim=[batch_count],
+        inputs=[keyvalues_wp],
+        outputs=[output_wp],
+        block_dim=TILE_DIM,
+        device=device,
+    )
+    prod_wp = np.array([k for k, v in output_wp.numpy()])
+    expected = np.argmax(input, axis=1)
+    assert_np_equal(prod_wp, expected)
+# Per-slice scaled sum: loads slice `i` of the 3D input as one (TILE_M, TILE_N)
+# tile, sums it, scales by 0.5, and stores the result at output[i].
+# NOTE(review): no launch in the code shown here references this kernel.
+@wp.kernel
+def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+    a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
+    s = wp.tile_sum(a) * 0.5
+    wp.tile_store(output, s, offset=i)
+def test_tile_reduce_grouped_sum(test, device):
+    batch_count = 56
+    M = TILE_M
+    N = TILE_N
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, M, N), dtype=np.float32)
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+    sum_wp = output_wp.numpy()
+    for i in range(batch_count):
+        sum_np = np.sum(input[i]) * 0.5
+        test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+    output_wp.grad.fill_(1.0)
+    tape.backward()
+    assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+# Sums thread indices: each thread contributes its own index to a block-wide
+# tile, the tile is summed, and the block's sum is atomically added to `output`.
+@wp.kernel
+def tile_reduce_simt_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+    t = wp.tile(i)  # convert to block wide tile
+    s = wp.tile_sum(t)  # sum over block
+    # update global sum
+    wp.tile_atomic_add(output, s)
+def test_tile_reduce_simt(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+    output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch(tile_reduce_simt_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+    test.assertEqual(output.numpy()[0], np.sum(np.arange(N)))
+# Tile/untile round trip: each thread wraps its index in a block-wide tile,
+# doubles the tile, converts it back to a per-thread value, and writes it to output[i].
+@wp.kernel
+def tile_untile_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+    # convert to block wide tile
+    t = wp.tile(i) * 2
+    # back to a per-thread value
+    s = wp.untile(t)
+    output[i] = s
+def test_tile_untile(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+    output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+    assert_np_equal(output.numpy(), np.arange(N) * 2)
+# Scalar tile/untile round trip; the body is identical to tile_untile_kernel:
+# wrap the thread index in a tile, double it, untile, and write to output[i].
+@wp.kernel
+def tile_untile_scalar_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+    # convert to block wide tile
+    t = wp.tile(i) * 2
+    # back to a per-thread value
+    s = wp.untile(t)
+    output[i] = s
+def test_tile_untile_scalar(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+    output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+    assert_np_equal(output.numpy(), np.arange(N) * 2)
+# vec3 tile/untile round trip: halves input[i], passes it through wp.tile and
+# wp.untile, then doubles it, so output[i] should equal input[i].
+@wp.kernel
+def test_untile_vector_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(dtype=wp.vec3)):
+    i = wp.tid()
+    v = input[i] * 0.5
+    t = wp.tile(v)
+    u = wp.untile(t)
+    # undo the 0.5 scale applied before tiling
+    output[i] = u * 2.0
+def test_tile_untile_vector(test, device):
+    input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True, device=device)
+    output = wp.zeros_like(input, device=device)
+    with wp.Tape() as tape:
+        wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], block_dim=16, device=device)
+    output.grad = wp.ones_like(output, device=device)
+    tape.backward()
+    assert_np_equal(output.numpy(), input.numpy())
+    assert_np_equal(input.grad.numpy(), np.ones((16, 3)))
+# Creates a 16x16 tile of ones, sums it, and stores the result in `out`
+# (test_tile_ones expects 256.0).
+@wp.kernel
+def tile_ones_kernel(out: wp.array(dtype=float)):
+    # tile index; not used below
+    i = wp.tid()
+    t = wp.tile_ones(dtype=float, shape=(16, 16))
+    s = wp.tile_sum(t)
+    wp.tile_store(out, s)
+def test_tile_ones(test, device):
+    output = wp.zeros(1, dtype=float, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(tile_ones_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+    test.assertAlmostEqual(output.numpy()[0], 256.0)
+# Exercises the wp.tile_arange overloads (stop / start, stop / start, stop, step)
+# and stores each resulting tile into its own row of `out`.
+@wp.kernel
+def tile_arange_kernel(out: wp.array2d(dtype=int)):
+    # tile index; not used below
+    i = wp.tid()
+    a = wp.tile_arange(17, dtype=int)
+    # NOTE(review): assuming NumPy-like half-open ranges, (5, 23) spans 18 values while
+    # test_tile_arange allocates 17-wide rows and expects np.arange(5, 22) — confirm intent
+    b = wp.tile_arange(5, 23, dtype=int)
+    c = wp.tile_arange(0, 34, 2, dtype=int)
+    d = wp.tile_arange(-1, 16, dtype=int)
+    # negative step: counts down from 17
+    e = wp.tile_arange(17, 0, -1, dtype=int)
+    wp.tile_store(out[0], a)
+    wp.tile_store(out[1], b)
+    wp.tile_store(out[2], c)
+    wp.tile_store(out[3], d)
+    wp.tile_store(out[4], e)
+def test_tile_arange(test, device):
+    N = 17
+    output = wp.zeros(shape=(5, N), dtype=int, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+    assert_np_equal(output.numpy()[0], np.arange(17))
+    assert_np_equal(output.numpy()[1], np.arange(5, 22))
+    assert_np_equal(output.numpy()[2], np.arange(0, 34, 2))
+    assert_np_equal(output.numpy()[3], np.arange(-1, 16))
+    assert_np_equal(output.numpy()[4], np.arange(17, 0, -1))
+devices = get_test_devices()
+class TestTileReduce(unittest.TestCase):
+    pass
+add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_min", test_tile_reduce_min, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_max", test_tile_reduce_max, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_custom", test_tile_reduce_custom, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_custom_struct", test_tile_reduce_custom_struct, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_grouped_sum", test_tile_reduce_sum, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices)
+add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices)
+add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices)
+add_function_test(TestTileReduce, "test_tile_untile_scalar", test_tile_untile_scalar, devices=devices)
+add_function_test(TestTileReduce, "test_tile_untile_vector", test_tile_untile_vector, devices=devices)
+# Script entry point: clear Warp's kernel cache, then run this module's tests
+# verbosely, stopping at the first failure.
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=True)

warp/tests/tile/test_tile_shared_memory.py ADDED Viewed

@@ -0,0 +1,247 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import warp as wp
+from warp.tests.unittest_utils import *
+# checks that we can configure shared memory to the expected size
+def test_tile_shared_mem_size(test, device):
+    DIM_M = 32
+    DIM_N = 32
+    BLOCK_DIM = 256
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
+        c = a + b
+        wp.tile_store(out, c)
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = expected_forward_bytes * 2
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+# checks that we can configure shared memory > 48kb default
+def test_tile_shared_mem_large(test, device):
+    # set dimensions that require 64kb for the forward kernel
+    DIM_M = 64
+    DIM_N = 128
+    BLOCK_DIM = 256
+    # we disable backward kernel gen since 128k is not supported on most architectures
+    @wp.kernel(enable_backward=False)
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
+        c = a + b
+        wp.tile_store(out, c)
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = 0
+    assert expected_forward_bytes == 2**16
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+# checks that we can configure dynamic shared memory during graph capture
+def test_tile_shared_mem_graph(test, device):
+    DIM_M = 32
+    DIM_N = 32
+    BLOCK_DIM = 256
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
+        c = a + b
+        wp.tile_store(out, c)
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+    wp.load_module(device=device)
+    wp.capture_begin(device, force_module_load=False)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    graph = wp.capture_end(device)
+    wp.capture_launch(graph)
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = expected_forward_bytes * 2
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+# checks that stack allocations work for user functions
+def test_tile_shared_mem_func(test, device):
+    DIM_M = 64
+    DIM_N = 64
+    SMALL_DIM_M = 64 // 4
+    SMALL_DIM_N = 64 // 4
+    BLOCK_DIM = 256
+    @wp.func
+    def add_tile_small():
+        a = wp.tile_ones(shape=(SMALL_DIM_M, SMALL_DIM_N), dtype=float, storage="shared")
+        b = wp.tile_ones(shape=(SMALL_DIM_M, SMALL_DIM_N), dtype=float, storage="shared") * 2.0
+        return a + b
+    @wp.func
+    def add_tile_big():
+        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
+        return a + b
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        s = add_tile_small()
+        b = add_tile_big()
+        wp.tile_store(out, b)
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+    # ensure that total required dynamic shared is the larger of the two tiles
+    expected_required_shared = 64 * 64 * 4 * 2
+    assert hooks.forward_smem_bytes == expected_required_shared
+    assert hooks.backward_smem_bytes == expected_required_shared * 2
+def round_up(a, b):
+    return b * ((a + b - 1) // b)
+# checks that tile sizes that are not a multiple of 16 bytes work
+def test_tile_shared_non_aligned(test, device):
+    # Tile size = 4 (float) * 1 * 3 = 12B % 16 != 0
+    DIM_M = 1
+    DIM_N = 3
+    BLOCK_DIM = 256
+    @wp.func
+    def foo():
+        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 3.0
+        return a + b
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        # This test the logic in the stack allocator, which should increment and
+        # decrement the stack pointer each time foo() is called
+        # Failing to do so correct will make b out of bounds and corrupt the results
+        for _ in range(4096):
+            foo()
+        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
+        wp.tile_store(out, b)
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N), dtype=float))
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+    # ensure that total required dynamic shared is the larger of the two tiles
+    expected_required_shared = 3 * round_up(DIM_M * DIM_N * 4, 16)
+    assert hooks.forward_smem_bytes == expected_required_shared
+    assert hooks.backward_smem_bytes == expected_required_shared * 2
+devices = get_cuda_test_devices()  # presumably CUDA devices only, going by the helper's name
+class TestTileSharedMemory(unittest.TestCase):  # empty shell; tests are attached below via add_function_test
+    pass
+add_function_test(
+    TestTileSharedMemory, "test_tile_shared_mem_size", test_tile_shared_mem_size, devices=devices, check_output=False
+)
+add_function_test(
+    TestTileSharedMemory, "test_tile_shared_mem_large", test_tile_shared_mem_large, devices=devices, check_output=False
+)
+add_function_test(TestTileSharedMemory, "test_tile_shared_mem_graph", test_tile_shared_mem_graph, devices=devices)
+add_function_test(TestTileSharedMemory, "test_tile_shared_mem_func", test_tile_shared_mem_func, devices=devices)
+add_function_test(TestTileSharedMemory, "test_tile_shared_non_aligned", test_tile_shared_non_aligned, devices=devices)
+if __name__ == "__main__":  # only when this file is run directly as a script
+    wp.clear_kernel_cache()  # clear the kernel cache before the test run
+    unittest.main(verbosity=2, failfast=True)  # failfast: stop at the first failing test