PyPI - warp-lang - Versions diffs - 1.6.1__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.6.1__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (401) hide show

warp/__init__.py +21 -7
warp/autograd.py +14 -6
warp/bin/libwarp-clang.dylib +0 -0
warp/bin/libwarp.dylib +0 -0
warp/build.py +424 -6
warp/build_dll.py +20 -20
warp/builtins.py +467 -368
warp/codegen.py +193 -125
warp/config.py +56 -12
warp/constants.py +14 -6
warp/context.py +524 -277
warp/dlpack.py +22 -12
warp/examples/__init__.py +14 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_api.py +14 -6
warp/examples/benchmarks/benchmark_cloth.py +14 -6
warp/examples/benchmarks/benchmark_cloth_cupy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_jax.py +14 -6
warp/examples/benchmarks/benchmark_cloth_numba.py +15 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_paddle.py +14 -6
warp/examples/benchmarks/benchmark_cloth_pytorch.py +14 -6
warp/examples/benchmarks/benchmark_cloth_taichi.py +14 -6
warp/examples/benchmarks/benchmark_cloth_warp.py +14 -6
warp/examples/benchmarks/benchmark_gemm.py +82 -48
warp/examples/benchmarks/benchmark_interop_paddle.py +14 -6
warp/examples/benchmarks/benchmark_interop_torch.py +14 -6
warp/examples/benchmarks/benchmark_launches.py +14 -6
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/browse.py +14 -6
warp/examples/core/example_cupy.py +14 -6
warp/examples/core/example_dem.py +14 -6
warp/examples/core/example_fluid.py +14 -6
warp/examples/core/example_graph_capture.py +14 -6
warp/examples/core/example_marching_cubes.py +14 -6
warp/examples/core/example_mesh.py +14 -6
warp/examples/core/example_mesh_intersect.py +14 -6
warp/examples/core/example_nvdb.py +14 -6
warp/examples/core/example_raycast.py +14 -6
warp/examples/core/example_raymarch.py +14 -6
warp/examples/core/example_render_opengl.py +14 -6
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/core/example_sph.py +14 -6
warp/examples/core/example_torch.py +14 -6
warp/examples/core/example_wave.py +14 -6
warp/examples/fem/example_adaptive_grid.py +14 -6
warp/examples/fem/example_apic_fluid.py +15 -7
warp/examples/fem/example_burgers.py +16 -8
warp/examples/fem/example_convection_diffusion.py +14 -6
warp/examples/fem/example_convection_diffusion_dg.py +14 -6
warp/examples/fem/example_deformed_geometry.py +15 -7
warp/examples/fem/example_diffusion.py +14 -6
warp/examples/fem/example_diffusion_3d.py +14 -6
warp/examples/fem/example_diffusion_mgpu.py +14 -6
warp/examples/fem/example_distortion_energy.py +15 -7
warp/examples/fem/example_magnetostatics.py +20 -12
warp/examples/fem/example_mixed_elasticity.py +14 -6
warp/examples/fem/example_navier_stokes.py +14 -6
warp/examples/fem/example_nonconforming_contact.py +14 -6
warp/examples/fem/example_stokes.py +14 -6
warp/examples/fem/example_stokes_transfer.py +14 -6
warp/examples/fem/example_streamlines.py +14 -6
warp/examples/fem/utils.py +24 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_bounce.py +14 -6
warp/examples/optim/example_cloth_throw.py +14 -6
warp/examples/optim/example_diffray.py +14 -6
warp/examples/optim/example_drone.py +14 -6
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/optim/example_inverse_kinematics.py +14 -6
warp/examples/optim/example_inverse_kinematics_torch.py +14 -6
warp/examples/optim/example_softbody_properties.py +14 -6
warp/examples/optim/example_spring_cage.py +14 -6
warp/examples/optim/example_trajectory.py +14 -6
warp/examples/sim/example_cartpole.py +14 -6
warp/examples/sim/example_cloth.py +14 -6
warp/examples/sim/example_cloth_self_contact.py +14 -6
warp/examples/sim/example_granular.py +14 -6
warp/examples/sim/example_granular_collision_sdf.py +14 -6
warp/examples/sim/example_jacobian_ik.py +14 -6
warp/examples/sim/example_particle_chain.py +14 -6
warp/examples/sim/example_quadruped.py +14 -6
warp/examples/sim/example_rigid_chain.py +14 -6
warp/examples/sim/example_rigid_contact.py +14 -6
warp/examples/sim/example_rigid_force.py +14 -6
warp/examples/sim/example_rigid_gyroscopic.py +14 -6
warp/examples/sim/example_rigid_soft_contact.py +14 -6
warp/examples/sim/example_soft_body.py +14 -6
warp/examples/tile/example_tile_cholesky.py +14 -6
warp/examples/tile/example_tile_convolution.py +14 -6
warp/examples/tile/example_tile_fft.py +14 -6
warp/examples/tile/example_tile_filtering.py +14 -6
warp/examples/tile/example_tile_matmul.py +16 -10
warp/examples/tile/example_tile_mlp.py +14 -6
warp/examples/tile/example_tile_nbody.py +14 -6
warp/examples/tile/example_tile_walker.py +14 -6
warp/fabric.py +15 -0
warp/fem/__init__.py +26 -1
warp/fem/adaptivity.py +19 -4
warp/fem/cache.py +15 -0
warp/fem/dirichlet.py +15 -0
warp/fem/domain.py +15 -0
warp/fem/field/__init__.py +15 -0
warp/fem/field/field.py +15 -0
warp/fem/field/nodal_field.py +37 -68
warp/fem/field/restriction.py +15 -0
warp/fem/field/virtual.py +77 -23
warp/fem/geometry/__init__.py +15 -0
warp/fem/geometry/adaptive_nanogrid.py +24 -10
warp/fem/geometry/closest_point.py +16 -1
warp/fem/geometry/deformed_geometry.py +20 -2
warp/fem/geometry/element.py +15 -0
warp/fem/geometry/geometry.py +20 -0
warp/fem/geometry/grid_2d.py +27 -12
warp/fem/geometry/grid_3d.py +27 -15
warp/fem/geometry/hexmesh.py +20 -7
warp/fem/geometry/nanogrid.py +24 -11
warp/fem/geometry/partition.py +15 -0
warp/fem/geometry/quadmesh.py +28 -13
warp/fem/geometry/tetmesh.py +18 -4
warp/fem/geometry/trimesh.py +18 -8
warp/fem/integrate.py +277 -93
warp/fem/linalg.py +20 -5
warp/fem/operator.py +15 -0
warp/fem/polynomial.py +15 -0
warp/fem/quadrature/__init__.py +15 -0
warp/fem/quadrature/pic_quadrature.py +52 -22
warp/fem/quadrature/quadrature.py +209 -25
warp/fem/space/__init__.py +16 -1
warp/fem/space/basis_function_space.py +19 -2
warp/fem/space/basis_space.py +40 -18
warp/fem/space/dof_mapper.py +15 -0
warp/fem/space/function_space.py +15 -0
warp/fem/space/grid_2d_function_space.py +15 -0
warp/fem/space/grid_3d_function_space.py +15 -0
warp/fem/space/hexmesh_function_space.py +17 -2
warp/fem/space/nanogrid_function_space.py +15 -0
warp/fem/space/partition.py +21 -2
warp/fem/space/quadmesh_function_space.py +23 -8
warp/fem/space/restriction.py +15 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +38 -23
warp/fem/space/shape/shape_function.py +15 -0
warp/fem/space/shape/square_shape_function.py +27 -12
warp/fem/space/shape/tet_shape_function.py +15 -0
warp/fem/space/shape/triangle_shape_function.py +16 -1
warp/fem/space/tetmesh_function_space.py +18 -3
warp/fem/space/topology.py +15 -0
warp/fem/space/trimesh_function_space.py +17 -2
warp/fem/types.py +15 -0
warp/fem/utils.py +27 -6
warp/jax.py +28 -7
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -33
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +103 -6
warp/native/array.h +28 -6
warp/native/builtin.h +44 -9
warp/native/bvh.cpp +18 -7
warp/native/bvh.cu +57 -20
warp/native/bvh.h +17 -7
warp/native/clang/clang.cpp +45 -9
warp/native/coloring.cpp +15 -6
warp/native/crt.cpp +15 -6
warp/native/crt.h +15 -6
warp/native/cuda_crt.h +15 -6
warp/native/cuda_util.cpp +29 -6
warp/native/cuda_util.h +17 -6
warp/native/error.cpp +15 -6
warp/native/error.h +15 -6
warp/native/exports.h +85 -63
warp/native/fabric.h +15 -6
warp/native/hashgrid.cpp +15 -6
warp/native/hashgrid.cu +15 -6
warp/native/hashgrid.h +15 -6
warp/native/initializer_array.h +15 -6
warp/native/intersect.h +41 -32
warp/native/intersect_adj.h +48 -39
warp/native/intersect_tri.h +17 -0
warp/native/marching.cpp +16 -0
warp/native/marching.cu +16 -7
warp/native/marching.h +17 -0
warp/native/mat.h +528 -15
warp/native/mathdx.cpp +15 -6
warp/native/matnn.h +15 -6
warp/native/mesh.cpp +15 -6
warp/native/mesh.cu +15 -6
warp/native/mesh.h +25 -16
warp/native/noise.h +15 -6
warp/native/quat.h +114 -17
warp/native/rand.h +21 -6
warp/native/range.h +15 -6
warp/native/reduce.cpp +15 -6
warp/native/reduce.cu +15 -6
warp/native/runlength_encode.cpp +15 -6
warp/native/runlength_encode.cu +15 -6
warp/native/scan.cpp +15 -6
warp/native/scan.cu +15 -6
warp/native/scan.h +15 -6
warp/native/solid_angle.h +17 -0
warp/native/sort.cpp +137 -65
warp/native/sort.cu +167 -21
warp/native/sort.h +23 -7
warp/native/sparse.cpp +58 -28
warp/native/sparse.cu +67 -23
warp/native/spatial.h +15 -6
warp/native/svd.h +131 -6
warp/native/temp_buffer.h +15 -6
warp/native/tile.h +316 -111
warp/native/tile_reduce.h +61 -9
warp/native/vec.h +83 -13
warp/native/volume.cpp +100 -119
warp/native/volume.cu +15 -6
warp/native/volume.h +15 -6
warp/native/volume_builder.cu +40 -16
warp/native/volume_builder.h +21 -6
warp/native/volume_impl.h +15 -6
warp/native/warp.cpp +20 -12
warp/native/warp.cu +114 -16
warp/native/warp.h +34 -16
warp/optim/__init__.py +14 -6
warp/optim/adam.py +14 -6
warp/optim/linear.py +25 -10
warp/optim/sgd.py +14 -6
warp/paddle.py +14 -6
warp/render/__init__.py +14 -6
warp/render/render_opengl.py +14 -6
warp/render/render_usd.py +14 -6
warp/render/utils.py +14 -6
warp/sim/__init__.py +14 -7
warp/sim/articulation.py +18 -10
warp/sim/collide.py +35 -16
warp/sim/graph_coloring.py +14 -6
warp/sim/import_mjcf.py +463 -162
warp/sim/import_snu.py +14 -7
warp/sim/import_urdf.py +46 -18
warp/sim/import_usd.py +14 -7
warp/sim/inertia.py +14 -6
warp/sim/integrator.py +14 -6
warp/sim/integrator_euler.py +19 -11
warp/sim/integrator_featherstone.py +17 -16
warp/sim/integrator_vbd.py +222 -8
warp/sim/integrator_xpbd.py +19 -11
warp/sim/model.py +56 -19
warp/sim/particles.py +14 -6
warp/sim/render.py +14 -6
warp/sim/utils.py +17 -2
warp/sparse.py +657 -555
warp/stubs.py +231 -19
warp/tape.py +14 -6
warp/tests/aux_test_class_kernel.py +14 -6
warp/tests/aux_test_compile_consts_dummy.py +14 -6
warp/tests/aux_test_conditional_unequal_types_kernels.py +14 -6
warp/tests/aux_test_dependent.py +14 -6
warp/tests/aux_test_grad_customs.py +14 -6
warp/tests/aux_test_instancing_gc.py +14 -6
warp/tests/aux_test_module_unload.py +14 -6
warp/tests/aux_test_name_clash1.py +14 -6
warp/tests/aux_test_name_clash2.py +14 -6
warp/tests/aux_test_unresolved_func.py +14 -6
warp/tests/aux_test_unresolved_symbol.py +14 -6
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_async.py → cuda/test_async.py} +14 -6
warp/tests/{test_ipc.py → cuda/test_ipc.py} +14 -6
warp/tests/{test_mempool.py → cuda/test_mempool.py} +53 -6
warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +14 -6
warp/tests/{test_peer.py → cuda/test_peer.py} +14 -6
warp/tests/{test_pinned.py → cuda/test_pinned.py} +14 -6
warp/tests/{test_streams.py → cuda/test_streams.py} +85 -6
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_bvh.py → geometry/test_bvh.py} +14 -6
warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +14 -6
warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +14 -6
warp/tests/{test_mesh.py → geometry/test_mesh.py} +14 -6
warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +14 -6
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +80 -69
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +15 -7
warp/tests/{test_volume.py → geometry/test_volume.py} +55 -12
warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +14 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +42 -11
warp/tests/{test_jax.py → interop/test_jax.py} +14 -6
warp/tests/{test_paddle.py → interop/test_paddle.py} +14 -6
warp/tests/{test_torch.py → interop/test_torch.py} +14 -6
warp/tests/run_coverage_serial.py +14 -6
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +23 -16
warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +14 -6
warp/tests/{test_collision.py → sim/test_collision.py} +16 -8
warp/tests/{test_coloring.py → sim/test_coloring.py} +14 -7
warp/tests/{test_model.py → sim/test_model.py} +55 -7
warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +14 -6
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +16 -7
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_adam.py +14 -6
warp/tests/test_arithmetic.py +14 -6
warp/tests/test_array.py +14 -6
warp/tests/test_array_reduce.py +14 -6
warp/tests/test_assert.py +14 -6
warp/tests/test_atomic.py +14 -6
warp/tests/test_bool.py +15 -7
warp/tests/test_builtins_resolution.py +14 -6
warp/tests/test_closest_point_edge_edge.py +14 -6
warp/tests/test_codegen.py +14 -6
warp/tests/test_codegen_instancing.py +14 -6
warp/tests/test_compile_consts.py +14 -6
warp/tests/test_conditional.py +14 -6
warp/tests/test_context.py +14 -6
warp/tests/test_copy.py +14 -6
warp/tests/test_ctypes.py +14 -6
warp/tests/test_dense.py +14 -6
warp/tests/test_devices.py +14 -6
warp/tests/test_examples.py +42 -42
warp/tests/test_fabricarray.py +14 -6
warp/tests/test_fast_math.py +14 -6
warp/tests/test_fem.py +37 -10
warp/tests/test_fp16.py +14 -6
warp/tests/test_func.py +14 -6
warp/tests/test_future_annotations.py +14 -6
warp/tests/test_generics.py +14 -6
warp/tests/test_grad.py +14 -6
warp/tests/test_grad_customs.py +14 -6
warp/tests/test_grad_debug.py +14 -6
warp/tests/test_implicit_init.py +14 -6
warp/tests/test_import.py +14 -6
warp/tests/test_indexedarray.py +14 -6
warp/tests/test_intersect.py +14 -6
warp/tests/test_iter.py +14 -6
warp/tests/test_large.py +14 -6
warp/tests/test_launch.py +14 -6
warp/tests/test_lerp.py +14 -6
warp/tests/test_linear_solvers.py +15 -11
warp/tests/test_lvalue.py +14 -6
warp/tests/test_mat.py +247 -85
warp/tests/test_mat_lite.py +14 -6
warp/tests/test_mat_scalar_ops.py +18 -10
warp/tests/test_math.py +14 -6
warp/tests/test_mlp.py +14 -6
warp/tests/test_module_hashing.py +14 -6
warp/tests/test_modules_lite.py +14 -6
warp/tests/test_noise.py +14 -6
warp/tests/test_operators.py +14 -6
warp/tests/test_options.py +14 -6
warp/tests/test_overwrite.py +15 -60
warp/tests/test_print.py +14 -6
warp/tests/test_quat.py +81 -52
warp/tests/test_rand.py +58 -43
warp/tests/test_reload.py +14 -6
warp/tests/test_rounding.py +14 -6
warp/tests/test_runlength_encode.py +14 -6
warp/tests/test_scalar_ops.py +14 -6
warp/tests/test_smoothstep.py +14 -6
warp/tests/test_snippet.py +15 -0
warp/tests/test_sparse.py +61 -12
warp/tests/test_spatial.py +89 -6
warp/tests/test_special_values.py +14 -6
warp/tests/test_static.py +15 -7
warp/tests/test_struct.py +14 -6
warp/tests/test_tape.py +14 -6
warp/tests/test_transient_module.py +14 -6
warp/tests/test_triangle_closest_point.py +14 -6
warp/tests/test_types.py +14 -6
warp/tests/test_utils.py +98 -10
warp/tests/test_vec.py +60 -40
warp/tests/test_vec_lite.py +14 -6
warp/tests/test_vec_scalar_ops.py +14 -6
warp/tests/test_verify_fp.py +14 -6
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +150 -57
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +15 -7
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +23 -12
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +39 -20
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +74 -7
warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +14 -6
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +15 -7
warp/tests/unittest_serial.py +15 -6
warp/tests/unittest_suites.py +59 -65
warp/tests/unittest_utils.py +16 -7
warp/tests/walkthrough_debug.py +14 -6
warp/thirdparty/unittest_parallel.py +15 -8
warp/torch.py +14 -6
warp/types.py +124 -664
warp/utils.py +151 -78
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/METADATA +39 -12
warp_lang-1.7.0.dist-info/RECORD +429 -0
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
warp/examples/optim/example_walker.py +0 -309
warp/native/cutlass_gemm.cpp +0 -34
warp/native/cutlass_gemm.cu +0 -373
warp/tests/test_matmul.py +0 -503
warp/tests/test_matmul_lite.py +0 -403
warp/tests/test_vbd.py +0 -378
warp/tests/unused_test_misc.py +0 -69
warp_lang-1.6.1.dist-info/LICENSE.md +0 -126
warp_lang-1.6.1.dist-info/RECORD +0 -419
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/native/tile_reduce.h CHANGED Viewed

@@ -1,9 +1,18 @@
-/** Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #pragma once
@@ -15,6 +24,8 @@
 namespace wp
 {
+#if defined(__CUDA_ARCH__)
 template <typename T>
 inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask)
 {
@@ -148,7 +159,39 @@ auto tile_reduce_impl(Op f, Tile& t)
     return output;
 }
-void adj_tile_reduce_impl()
+#else
+// CPU implementation
+template <typename Tile, typename Op>
+auto tile_reduce_impl(Op f, Tile& t)
+{
+   using T = typename Tile::Type;
+    auto input = t.copy_to_register();
+    auto output = tile_register_t<T, tile_layout_register_t<tile_shape_t<1>>>();
+   using Layout = typename decltype(input)::Layout;
+   T sum = input.data[0];
+    WP_PRAGMA_UNROLL
+    for (int i=1; i < Layout::NumRegs; ++i)
+    {
+        int linear = Layout::linear_from_register(i);
+        if (!Layout::valid(linear))
+            break;
+        sum = f(sum, input.data[i]);
+    }
+    output.data[0] = sum;
+    return output;
+}
+#endif // !defined(__CUDA_ARCH__)
+inline void adj_tile_reduce_impl()
 {
     // todo: general purpose reduction gradients not implemented
 }
@@ -171,16 +214,25 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
     using T = typename Tile::Type;
+#if !defined(__CUDA_ARCH__)
+    for (int i=0; i < Tile::Layout::Size; ++i)
+    {
+        adj_t(i) += adj_ret.data[0];
+    }
+#else
     // broadcast incoming adjoint to block
     WP_TILE_SHARED T scratch;
-    if (threadIdx.x == 0)
+    if (WP_TILE_THREAD_IDX == 0)
         scratch = adj_ret.data[0];
     WP_TILE_SYNC();
     // broadcast scalar across input dimensions (note zero strides)
-    auto adj_ret_reg = tile_shared_t<T, tile_layout_strided_t<typename Tile::Layout::Shape, tile_stride_t<0, 0>>>(&scratch, NULL).copy_to_register();
+    auto adj_ret_reg = tile_shared_t<T, tile_layout_strided_t<typename Tile::Layout::Shape, tile_stride_t<0, 0>>, false>(&scratch, nullptr).copy_to_register();
     adj_t.grad_add(adj_ret_reg);
+#endif
 }
 template <typename Tile>

warp/native/vec.h CHANGED Viewed

@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #pragma once
@@ -497,37 +506,98 @@ inline CUDA_CALLABLE void adj_indexref(vec_t<Length, Type>* v, int idx,
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void augassign_add(vec_t<Length, Type>& v, int idx, Type value)
+inline CUDA_CALLABLE void add_inplace(vec_t<Length, Type>& v, int idx, Type value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
     v[idx] += value;
 }
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_augassign_add(vec_t<Length, Type>& v, int idx, Type value,
+inline CUDA_CALLABLE void adj_add_inplace(vec_t<Length, Type>& v, int idx, Type value,
                                         vec_t<Length, Type>& adj_v, int adj_idx, Type& adj_value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
     adj_value += adj_v[idx];
 }
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void augassign_sub(vec_t<Length, Type>& v, int idx, Type value)
+inline CUDA_CALLABLE void sub_inplace(vec_t<Length, Type>& v, int idx, Type value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
     v[idx] -= value;
 }
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_augassign_sub(vec_t<Length, Type>& v, int idx, Type value,
+inline CUDA_CALLABLE void adj_sub_inplace(vec_t<Length, Type>& v, int idx, Type value,
                                         vec_t<Length, Type>& adj_v, int adj_idx, Type& adj_value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
     adj_value -= adj_v[idx];
 }
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> assign(vec_t<Length, Type>& v, int idx, Type value)
+inline CUDA_CALLABLE void assign_inplace(vec_t<Length, Type>& v, int idx, Type value)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+    v[idx] = value;
+}
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_assign_inplace(vec_t<Length, Type>& v, int idx, Type value, vec_t<Length, Type>& adj_v, int& adj_idx, Type& adj_value)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+    adj_value += adj_v[idx];
+}
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> assign_copy(vec_t<Length, Type>& v, int idx, Type value)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx >= Length)
@@ -543,7 +613,7 @@ inline CUDA_CALLABLE vec_t<Length, Type> assign(vec_t<Length, Type>& v, int idx,
 }
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_assign(vec_t<Length, Type>& v, int idx, Type value, vec_t<Length, Type>& adj_v, int& adj_idx, Type& adj_value, const vec_t<Length, Type>& adj_ret)
+inline CUDA_CALLABLE void adj_assign_copy(vec_t<Length, Type>& v, int idx, Type value, vec_t<Length, Type>& adj_v, int& adj_idx, Type& adj_value, const vec_t<Length, Type>& adj_ret)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx >= Length)
@@ -765,7 +835,7 @@ inline CUDA_CALLABLE vec_t<Length,Type> sign(vec_t<Length,Type> v)
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void expect_near(const vec_t<Length, Type>& actual, const vec_t<Length, Type>& expected, const Type& tolerance)
 {
-    const Type diff(0);
+    Type diff(0);
     for(size_t i=0; i<Length; ++i)
     {
         diff = max(diff,abs(actual[i] - expected[i]));