PyPI - warp-lang - Versions diffs - 1.0.0b2__py3-none-win_amd64.whl → 1.0.0b6__py3-none-win_amd64.whl - Mend

warp-lang 1.0.0b2__py3-none-win_amd64.whl → 1.0.0b6__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic; the original page links to more details.

Files changed (271):

docs/conf.py +17 -5
examples/env/env_ant.py +1 -1
examples/env/env_cartpole.py +1 -1
examples/env/env_humanoid.py +1 -1
examples/env/env_usd.py +4 -1
examples/env/environment.py +8 -9
examples/example_dem.py +34 -33
examples/example_diffray.py +364 -337
examples/example_fluid.py +32 -23
examples/example_jacobian_ik.py +97 -93
examples/example_marching_cubes.py +6 -16
examples/example_mesh.py +6 -16
examples/example_mesh_intersect.py +16 -14
examples/example_nvdb.py +14 -16
examples/example_raycast.py +14 -13
examples/example_raymarch.py +16 -23
examples/example_render_opengl.py +19 -10
examples/example_sim_cartpole.py +82 -78
examples/example_sim_cloth.py +45 -48
examples/example_sim_fk_grad.py +51 -44
examples/example_sim_fk_grad_torch.py +47 -40
examples/example_sim_grad_bounce.py +108 -133
examples/example_sim_grad_cloth.py +99 -113
examples/example_sim_granular.py +5 -6
examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
examples/example_sim_neo_hookean.py +51 -55
examples/example_sim_particle_chain.py +4 -4
examples/example_sim_quadruped.py +126 -81
examples/example_sim_rigid_chain.py +54 -61
examples/example_sim_rigid_contact.py +66 -70
examples/example_sim_rigid_fem.py +3 -3
examples/example_sim_rigid_force.py +1 -1
examples/example_sim_rigid_gyroscopic.py +3 -4
examples/example_sim_rigid_kinematics.py +28 -39
examples/example_sim_trajopt.py +112 -110
examples/example_sph.py +9 -8
examples/example_wave.py +7 -7
examples/fem/bsr_utils.py +30 -17
examples/fem/example_apic_fluid.py +85 -69
examples/fem/example_convection_diffusion.py +97 -93
examples/fem/example_convection_diffusion_dg.py +142 -149
examples/fem/example_convection_diffusion_dg0.py +141 -136
examples/fem/example_deformed_geometry.py +146 -0
examples/fem/example_diffusion.py +115 -84
examples/fem/example_diffusion_3d.py +116 -86
examples/fem/example_diffusion_mgpu.py +102 -79
examples/fem/example_mixed_elasticity.py +139 -100
examples/fem/example_navier_stokes.py +175 -162
examples/fem/example_stokes.py +143 -111
examples/fem/example_stokes_transfer.py +186 -157
examples/fem/mesh_utils.py +59 -97
examples/fem/plot_utils.py +138 -17
tools/ci/publishing/build_nodes_info.py +54 -0
warp/__init__.py +4 -3
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +5 -3
warp/build_dll.py +29 -9
warp/builtins.py +836 -492
warp/codegen.py +864 -553
warp/config.py +3 -1
warp/context.py +389 -172
warp/fem/__init__.py +24 -6
warp/fem/cache.py +318 -25
warp/fem/dirichlet.py +7 -3
warp/fem/domain.py +14 -0
warp/fem/field/__init__.py +30 -38
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +244 -138
warp/fem/field/restriction.py +8 -6
warp/fem/field/test.py +127 -59
warp/fem/field/trial.py +117 -60
warp/fem/geometry/__init__.py +5 -1
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +24 -1
warp/fem/geometry/geometry.py +86 -14
warp/fem/geometry/grid_2d.py +112 -54
warp/fem/geometry/grid_3d.py +134 -65
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +85 -33
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +451 -115
warp/fem/geometry/trimesh_2d.py +197 -92
warp/fem/integrate.py +534 -268
warp/fem/operator.py +58 -31
warp/fem/polynomial.py +11 -0
warp/fem/quadrature/__init__.py +1 -1
warp/fem/quadrature/pic_quadrature.py +150 -58
warp/fem/quadrature/quadrature.py +209 -57
warp/fem/space/__init__.py +230 -53
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +49 -2
warp/fem/space/function_space.py +90 -39
warp/fem/space/grid_2d_function_space.py +149 -496
warp/fem/space/grid_3d_function_space.py +173 -538
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +129 -76
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +46 -34
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +132 -1039
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +104 -742
warp/fem/types.py +13 -11
warp/fem/utils.py +335 -60
warp/native/array.h +120 -34
warp/native/builtin.h +101 -72
warp/native/bvh.cpp +73 -325
warp/native/bvh.cu +406 -23
warp/native/bvh.h +22 -40
warp/native/clang/clang.cpp +1 -0
warp/native/crt.h +2 -0
warp/native/cuda_util.cpp +8 -3
warp/native/cuda_util.h +1 -0
warp/native/exports.h +1522 -1243
warp/native/intersect.h +19 -4
warp/native/intersect_adj.h +8 -8
warp/native/mat.h +76 -17
warp/native/mesh.cpp +33 -108
warp/native/mesh.cu +114 -18
warp/native/mesh.h +395 -40
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +44 -34
warp/native/reduce.cpp +1 -1
warp/native/sparse.cpp +4 -4
warp/native/sparse.cu +163 -155
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +18 -14
warp/native/vec.h +103 -21
warp/native/warp.cpp +2 -1
warp/native/warp.cu +28 -3
warp/native/warp.h +4 -3
warp/render/render_opengl.py +261 -109
warp/sim/__init__.py +1 -2
warp/sim/articulation.py +385 -185
warp/sim/import_mjcf.py +59 -48
warp/sim/import_urdf.py +15 -15
warp/sim/import_usd.py +174 -102
warp/sim/inertia.py +17 -18
warp/sim/integrator_xpbd.py +4 -3
warp/sim/model.py +330 -250
warp/sim/render.py +1 -1
warp/sparse.py +625 -152
warp/stubs.py +341 -309
warp/tape.py +9 -6
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +94 -74
warp/tests/test_array.py +82 -101
warp/tests/test_array_reduce.py +57 -23
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +22 -12
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +18 -18
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +165 -134
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +75 -75
warp/tests/test_examples.py +237 -0
warp/tests/test_fabricarray.py +22 -24
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1034 -124
warp/tests/test_fp16.py +23 -16
warp/tests/test_func.py +187 -86
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +123 -181
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +35 -34
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +24 -25
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +14 -41
warp/tests/test_lerp.py +64 -65
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +517 -2898
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +304 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +60 -22
warp/tests/test_mesh_query_aabb.py +21 -25
warp/tests/test_mesh_query_point.py +111 -22
warp/tests/test_mesh_query_ray.py +12 -24
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +168 -20
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +261 -63
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +268 -63
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +90 -86
warp/tests/test_transient_module.py +10 -12
warp/tests/test_types.py +363 -0
warp/tests/test_utils.py +451 -0
warp/tests/test_vec.py +354 -2050
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +418 -376
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +291 -0
warp/tests/unittest_utils.py +342 -0
warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +589 -0
warp/types.py +622 -211
warp/utils.py +54 -393
warp_lang-1.0.0b6.dist-info/METADATA +238 -0
warp_lang-1.0.0b6.dist-info/RECORD +409 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
examples/example_cache_management.py +0 -40
examples/example_multigpu.py +0 -54
examples/example_struct.py +0 -65
examples/fem/example_stokes_transfer_3d.py +0 -210
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/fem/field/discrete_field.py +0 -80
warp/fem/space/nodal_function_space.py +0 -233
warp/tests/test_all.py +0 -223
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-1.0.0b2.dist-info/METADATA +0 -26
warp_lang-1.0.0b2.dist-info/RECORD +0 -380
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
{warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0

warp/native/array.h (changed)

@@ -19,6 +19,12 @@ namespace wp
     printf(")\n"); \
     assert(0); \
+#define FP_VERIFY_FWD(value) \
+    if (!isfinite(value)) { \
+        printf("%s:%d - %s(addr", __FILE__, __LINE__, __FUNCTION__); \
+        FP_ASSERT_FWD(value) \
+    } \
 #define FP_VERIFY_FWD_1(value) \
     if (!isfinite(value)) { \
         printf("%s:%d - %s(arr, %d) ", __FILE__, __LINE__, __FUNCTION__, i); \
@@ -43,6 +49,13 @@ namespace wp
         FP_ASSERT_FWD(value) \
     } \
+#define FP_VERIFY_ADJ(value, adj_value) \
+    if (!isfinite(value) || !isfinite(adj_value)) \
+    { \
+        printf("%s:%d - %s(addr",  __FILE__, __LINE__, __FUNCTION__); \
+        FP_ASSERT_ADJ(value, adj_value); \
+    } \
 #define FP_VERIFY_ADJ_1(value, adj_value) \
     if (!isfinite(value) || !isfinite(adj_value)) \
     { \
@@ -74,11 +87,13 @@ namespace wp
 #else
+#define FP_VERIFY_FWD(value) {}
 #define FP_VERIFY_FWD_1(value) {}
 #define FP_VERIFY_FWD_2(value) {}
 #define FP_VERIFY_FWD_3(value) {}
 #define FP_VERIFY_FWD_4(value) {}
+#define FP_VERIFY_ADJ(value, adj_value) {}
 #define FP_VERIFY_ADJ_1(value, adj_value) {}
 #define FP_VERIFY_ADJ_2(value, adj_value) {}
 #define FP_VERIFY_ADJ_3(value, adj_value) {}
@@ -113,12 +128,12 @@ struct shape_t
     }
 };
-CUDA_CALLABLE inline int index(const shape_t& s, int i)
+CUDA_CALLABLE inline int extract(const shape_t& s, int i)
 {
     return s.dims[i];
 }
-CUDA_CALLABLE inline void adj_index(const shape_t& s, int i, const shape_t& adj_s, int adj_i, int adj_ret) {}
+CUDA_CALLABLE inline void adj_extract(const shape_t& s, int i, const shape_t& adj_s, int adj_i, int adj_ret) {}
 inline CUDA_CALLABLE void print(shape_t s)
 {
@@ -670,43 +685,60 @@ template<template<typename> class A, typename T>
 inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_max(&index(buf, i, j, k, l), value); }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE T load(const A<T>& buf, int i) { return index(buf, i); }
+inline CUDA_CALLABLE T* address(const A<T>& buf, int i) { return &index(buf, i); }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE T load(const A<T>& buf, int i, int j) { return index(buf, i, j); }
+inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j) { return &index(buf, i, j); }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE T load(const A<T>& buf, int i, int j, int k) { return index(buf, i, j, k); }
+inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k) { return &index(buf, i, j, k); }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE T load(const A<T>& buf, int i, int j, int k, int l) { return index(buf, i, j, k, l); }
+inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k, int l) { return &index(buf, i, j, k, l); }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE void store(const A<T>& buf, int i, T value)
+inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, T value)
 {
     FP_VERIFY_FWD_1(value)
     index(buf, i) = value;
 }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE void store(const A<T>& buf, int i, int j, T value)
+inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, T value)
 {
     FP_VERIFY_FWD_2(value)
     index(buf, i, j) = value;
 }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE void store(const A<T>& buf, int i, int j, int k, T value)
+inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, T value)
 {
     FP_VERIFY_FWD_3(value)
     index(buf, i, j, k) = value;
 }
 template<template<typename> class A, typename T>
-inline CUDA_CALLABLE void store(const A<T>& buf, int i, int j, int k, int l, T value)
+inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, int l, T value)
 {
     FP_VERIFY_FWD_4(value)
     index(buf, i, j, k, l) = value;
 }
+template<typename T>
+inline CUDA_CALLABLE void store(T* address, T value)
+{
+    FP_VERIFY_FWD(value)
+    *address = value;
+}
+template<typename T>
+inline CUDA_CALLABLE T load(T* address)
+{
+    T value = *address;
+    FP_VERIFY_FWD(value)
+    return value;
+}
 // select operator to check for array being null
 template <typename T1, typename T2>
 CUDA_CALLABLE inline T2 select(const array_t<T1>& arr, const T2& a, const T2& b) { return arr.data?b:a; }
@@ -744,32 +776,32 @@ CUDA_CALLABLE inline void adj_atomic_add(bool* buf, bool value) { }
 // only generate gradients for T types
 template<typename T>
-inline CUDA_CALLABLE void adj_load(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int& adj_i, const T& adj_output)
+inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int& adj_i, const T& adj_output)
 {
     if (buf.grad)
         adj_atomic_add(&index_grad(buf, i), adj_output);
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_load(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output)
+inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output)
 {
     if (buf.grad)
         adj_atomic_add(&index_grad(buf, i, j), adj_output);
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_load(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output)
+inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output)
 {
     if (buf.grad)
         adj_atomic_add(&index_grad(buf, i, j, k), adj_output);
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_load(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output)
+inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output)
 {
     if (buf.grad)
         adj_atomic_add(&index_grad(buf, i, j, k, l), adj_output);
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value)
+inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value)
 {
     if (buf.grad)
         adj_value += index_grad(buf, i);
@@ -777,7 +809,7 @@ inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, T value, const
     FP_VERIFY_ADJ_1(value, adj_value)
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value)
+inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value)
 {
     if (buf.grad)
         adj_value += index_grad(buf, i, j);
@@ -786,7 +818,7 @@ inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, T value
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value)
+inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value)
 {
     if (buf.grad)
         adj_value += index_grad(buf, i, j, k);
@@ -794,7 +826,7 @@ inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, int k,
     FP_VERIFY_ADJ_3(value, adj_value)
 }
 template<typename T>
-inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value)
+inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value)
 {
     if (buf.grad)
         adj_value += index_grad(buf, i, j, k, l);
@@ -802,6 +834,19 @@ inline CUDA_CALLABLE void adj_store(const array_t<T>& buf, int i, int j, int k,
     FP_VERIFY_ADJ_4(value, adj_value)
 }
+template<typename T>
+inline CUDA_CALLABLE void adj_store(const T* address, T value, const T& adj_address, T& adj_value)
+{
+	// nop; generic store() operations are not differentiable, only array_store() is
+    FP_VERIFY_ADJ(value, adj_value)
+}
+template<typename T>
+inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& adj_value)
+{
+    // nop; generic load() operations are not differentiable
+}
 template<typename T>
 inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret)
 {
@@ -871,22 +916,22 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, in
 // generic array types that do not support gradient computation (indexedarray, etc.)
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_load(const A1<T>& buf, int i, const A2<T>& adj_buf, int& adj_i, const T& adj_output) {}
+inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int& adj_i, const T& adj_output) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_load(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output) {}
+inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int& adj_i, int& adj_j, const T& adj_output) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_load(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) {}
+inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_load(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) {}
+inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value) {}
+inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value) {}
+inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) {}
+inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) {}
+inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) {}
 template<template<typename> class A1, template<typename> class A2, typename T>
 inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
@@ -906,23 +951,64 @@ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k,
 template<template<typename> class A1, template<typename> class A2, typename T>
 inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
+// generic handler for scalar values
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);
+    FP_VERIFY_ADJ_1(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);
+    FP_VERIFY_ADJ_2(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);
+    FP_VERIFY_ADJ_3(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);
+    FP_VERIFY_ADJ_4(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);
+    FP_VERIFY_ADJ_1(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);
+    FP_VERIFY_ADJ_2(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);
+    FP_VERIFY_ADJ_3(value, adj_value)
+}
 template<template<typename> class A1, template<typename> class A2, typename T>
-inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {}
+inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {
+    if (buf.grad)
+        adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);
+    FP_VERIFY_ADJ_4(value, adj_value)
+}
 } // namespace wp

warp/native/builtin.h (changed)

@@ -251,8 +251,6 @@ CUDA_CALLABLE inline void adj_int8(T, T&, int8) {}
 template <typename T>
 CUDA_CALLABLE inline void adj_uint8(T, T&, uint8) {}
 template <typename T>
-CUDA_CALLABLE inline void adj_bool(T, T&, bool) {}
-template <typename T>
 CUDA_CALLABLE inline void adj_int16(T, T&, int16) {}
 template <typename T>
 CUDA_CALLABLE inline void adj_uint16(T, T&, uint16) {}
@@ -297,7 +295,7 @@ inline CUDA_CALLABLE T rshift(T a, T b) { return a>>b; } \
 inline CUDA_CALLABLE T invert(T x) { return ~x; } \
 inline CUDA_CALLABLE bool isfinite(T x) { return true; } \
 inline CUDA_CALLABLE void adj_mul(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
-inline CUDA_CALLABLE void adj_div(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
+inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret) { } \
 inline CUDA_CALLABLE void adj_add(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
 inline CUDA_CALLABLE void adj_sub(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
 inline CUDA_CALLABLE void adj_mod(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
@@ -435,11 +433,6 @@ inline CUDA_CALLABLE void adj_clamp(T x, T a, T b, T& adj_x, T& adj_a, T& adj_b,
     else\
         adj_x += adj_ret;\
 }\
-inline CUDA_CALLABLE void adj_round(T x, T& adj_x, T adj_ret){ }\
-inline CUDA_CALLABLE void adj_rint(T x, T& adj_x, T adj_ret){ }\
-inline CUDA_CALLABLE void adj_trunc(T x, T& adj_x, T adj_ret){ }\
-inline CUDA_CALLABLE void adj_floor(T x, T& adj_x, T adj_ret){ }\
-inline CUDA_CALLABLE void adj_ceil(T x, T& adj_x, T adj_ret){ }\
 inline CUDA_CALLABLE T div(T a, T b)\
 {\
     DO_IF_FPCHECK(\
@@ -450,10 +443,10 @@ inline CUDA_CALLABLE T div(T a, T b)\
     })\
     return a/b;\
 }\
-inline CUDA_CALLABLE void adj_div(T a, T b, T& adj_a, T& adj_b, T adj_ret)\
+inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
 {\
     adj_a += adj_ret/b;\
-    adj_b -= adj_ret*(a/b)/b;\
+    adj_b -= adj_ret*(ret)/b;\
     DO_IF_FPCHECK(\
     if (!isfinite(adj_a) || !isfinite(adj_b))\
     {\
@@ -792,6 +785,10 @@ inline CUDA_CALLABLE half sqrt(half x)
     return ::sqrtf(float(x));
 }
+inline CUDA_CALLABLE float cbrt(float x) { return ::cbrtf(x); }
+inline CUDA_CALLABLE double cbrt(double x) { return ::cbrt(x); }
+inline CUDA_CALLABLE half cbrt(half x) { return ::cbrtf(float(x)); }
 inline CUDA_CALLABLE float tan(float x) { return ::tanf(x); }
 inline CUDA_CALLABLE float sinh(float x) { return ::sinhf(x);}
 inline CUDA_CALLABLE float cosh(float x) { return ::coshf(x);}
@@ -818,6 +815,21 @@ inline CUDA_CALLABLE float rint(float x) { return ::rintf(x); }
 inline CUDA_CALLABLE float trunc(float x) { return ::truncf(x); }
 inline CUDA_CALLABLE float floor(float x) { return ::floorf(x); }
 inline CUDA_CALLABLE float ceil(float x) { return ::ceilf(x); }
+inline CUDA_CALLABLE float frac(float x) { return x - trunc(x); }
+inline CUDA_CALLABLE double round(double x) { return ::round(x); }
+inline CUDA_CALLABLE double rint(double x) { return ::rint(x); }
+inline CUDA_CALLABLE double trunc(double x) { return ::trunc(x); }
+inline CUDA_CALLABLE double floor(double x) { return ::floor(x); }
+inline CUDA_CALLABLE double ceil(double x) { return ::ceil(x); }
+inline CUDA_CALLABLE double frac(double x) { return x - trunc(x); }
+inline CUDA_CALLABLE half round(half x) { return ::roundf(float(x)); }
+inline CUDA_CALLABLE half rint(half x) { return ::rintf(float(x)); }
+inline CUDA_CALLABLE half trunc(half x) { return ::truncf(float(x)); }
+inline CUDA_CALLABLE half floor(half x) { return ::floorf(float(x)); }
+inline CUDA_CALLABLE half ceil(half x) { return ::ceilf(float(x)); }
+inline CUDA_CALLABLE half frac(half x) { return float(x) - trunc(float(x)); }
 #define DECLARE_ADJOINTS(T)\
 inline CUDA_CALLABLE void adj_log(T a, T& adj_a, T adj_ret)\
@@ -847,11 +859,11 @@ inline CUDA_CALLABLE void adj_log10(T a, T& adj_a, T adj_ret)\
         assert(0);\
     })\
 }\
-inline CUDA_CALLABLE void adj_exp(T a, T& adj_a, T adj_ret) { adj_a += exp(a)*adj_ret; }\
-inline CUDA_CALLABLE void adj_pow(T a, T b, T& adj_a, T& adj_b, T adj_ret)\
+inline CUDA_CALLABLE void adj_exp(T a, T ret, T& adj_a, T adj_ret) { adj_a += ret*adj_ret; }\
+inline CUDA_CALLABLE void adj_pow(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
 { \
     adj_a += b*pow(a, b-T(1))*adj_ret;\
-    adj_b += log(a)*pow(a, b)*adj_ret;\
+    adj_b += log(a)*ret*adj_ret;\
     DO_IF_FPCHECK(if (!isfinite(adj_a) || !isfinite(adj_b))\
     {\
         printf("%s:%d - adj_pow(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(a), float(b), float(adj_a), float(adj_b), float(adj_ret));\
@@ -950,20 +962,28 @@ inline CUDA_CALLABLE void adj_cosh(T x, T& adj_x, T adj_ret)\
 {\
     adj_x += sinh(x)*adj_ret;\
 }\
-inline CUDA_CALLABLE void adj_tanh(T x, T& adj_x, T adj_ret)\
+inline CUDA_CALLABLE void adj_tanh(T x, T ret, T& adj_x, T adj_ret)\
 { /* Adjoint of tanh: accumulates into adj_x; x itself is no longer read in the new version. */\
-    T tanh_x = tanh(x);\
-    adj_x += (T(1) - tanh_x*tanh_x)*adj_ret;\
+    adj_x += (T(1) - ret*ret)*adj_ret; /* d/dx tanh(x) = 1 - tanh(x)^2; ret replaces the recomputed tanh(x) of the removed lines */\
 }\
-inline CUDA_CALLABLE void adj_sqrt(T x, T& adj_x, T adj_ret)\
+inline CUDA_CALLABLE void adj_sqrt(T x, T ret, T& adj_x, T adj_ret)\
 { /* Adjoint of sqrt: accumulates into adj_x; x is only used for the diagnostic message below. */\
-    adj_x += T(0.5)*(T(1)/sqrt(x))*adj_ret;\
+    adj_x += T(0.5)*(T(1)/ret)*adj_ret; /* d/dx sqrt(x) = 1/(2*sqrt(x)); ret replaces the recomputed sqrt(x); divides by zero when ret == 0 */\
     DO_IF_FPCHECK(if (!isfinite(adj_x)) /* report a non-finite gradient and abort via assert */\
     {\
         printf("%s:%d - adj_sqrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
         assert(0);\
     })\
 }\
+inline CUDA_CALLABLE void adj_cbrt(T x, T ret, T& adj_x, T adj_ret)\
+{ /* Adjoint of cbrt: accumulates into adj_x; x is only used for the diagnostic message below. */\
+    adj_x += (T(1)/T(3))*(T(1)/(ret*ret))*adj_ret; /* d/dx cbrt(x) = 1/(3*cbrt(x)^2), assuming ret holds cbrt(x); divides by zero when ret == 0 */\
+    DO_IF_FPCHECK(if (!isfinite(adj_x)) /* report a non-finite gradient and abort via assert */\
+    {\
+        printf("%s:%d - adj_cbrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
+        assert(0);\
+    })\
+}\
 inline CUDA_CALLABLE void adj_degrees(T x, T& adj_x, T adj_ret)\
 {\
     adj_x += RAD_TO_DEG * adj_ret;\
@@ -971,7 +991,13 @@ inline CUDA_CALLABLE void adj_degrees(T x, T& adj_x, T adj_ret)\
 inline CUDA_CALLABLE void adj_radians(T x, T& adj_x, T adj_ret)\
 {\
     adj_x += DEG_TO_RAD * adj_ret; /* accumulate the incoming gradient scaled by DEG_TO_RAD */\
-}
+}\
+inline CUDA_CALLABLE void adj_round(T x, T& adj_x, T adj_ret){ } /* empty body: adj_x is left untouched */\
+inline CUDA_CALLABLE void adj_rint(T x, T& adj_x, T adj_ret){ } /* empty body: adj_x is left untouched */\
+inline CUDA_CALLABLE void adj_trunc(T x, T& adj_x, T adj_ret){ } /* empty body: adj_x is left untouched */\
+inline CUDA_CALLABLE void adj_floor(T x, T& adj_x, T adj_ret){ } /* empty body: adj_x is left untouched */\
+inline CUDA_CALLABLE void adj_ceil(T x, T& adj_x, T adj_ret){ } /* empty body: adj_x is left untouched */\
+inline CUDA_CALLABLE void adj_frac(T x, T& adj_x, T adj_ret){ } /* empty body as well. NOTE(review): frac(x) = x - trunc(x) has slope 1 between integers, so confirm that dropping the gradient here is intended */
 DECLARE_ADJOINTS(float16)
 DECLARE_ADJOINTS(float32)
@@ -995,17 +1021,31 @@ CUDA_CALLABLE inline void adj_select(const C& cond, const T& a, const T& b, C& a
 }
 // copy(): returns src by value. The removed two-argument signature wrote into a dest
 // reference instead; that in-place form now appears under the name assign().
 template <typename T>
-CUDA_CALLABLE inline void copy(T& dest, const T& src)
+CUDA_CALLABLE inline T copy(const T& src)
+{
+    return src;
+}
+template <typename T>
 // Adjoint of copy(): hands the destination adjoint over to the source, then clears it.
 // src is not read.
+CUDA_CALLABLE inline void adj_copy(const T& src, T& adj_src, T& adj_dest)
+{
+    adj_src = adj_dest;  // plain assignment: whatever adj_src held before is overwritten, not accumulated
+    adj_dest = T{};      // reset the destination adjoint to a value-initialized T
+}
+template <typename T>
 // assign(): in-place store, overwrites dest with src using T's assignment operator.
+CUDA_CALLABLE inline void assign(T& dest, const T& src)
 {
     dest = src;
 }
 // Adjoint of assign(): same body as the adjoint of copy, but keeps the (dest, src)
 // parameters of the forward call; neither dest nor src is read.
 template <typename T>
-CUDA_CALLABLE inline void adj_copy(T& dest, const T& src, T& adj_dest, T& adj_src)
+CUDA_CALLABLE inline void adj_assign(T& dest, const T& src, T& adj_dest, T& adj_src)
 {
-    // nop, this is non-differentiable operation since it violates SSA
+    // this is generally a non-differentiable operation since it violates SSA,
+    // except in read-modify-write statements which are reversible through backpropagation
     adj_src = adj_dest;  // overwrite (not accumulate) the source adjoint
-    adj_dest = T(0);
+    adj_dest = T{};  // value-initialization replaces T(0), so T no longer has to be constructible from 0
 }
@@ -1050,34 +1090,8 @@ struct launch_bounds_t
     size_t size;                // total number of threads
 };
-#ifdef __CUDACC__
-// store launch bounds in shared memory so
-// we can access them from any user func
-// this is to avoid having to explicitly
-// set another piece of __constant__ memory
-// from the host
-__shared__ launch_bounds_t s_launchBounds;
-__device__ inline void set_launch_bounds(const launch_bounds_t& b)
-{
-    if (threadIdx.x == 0)
-        s_launchBounds = b;
-    __syncthreads();
-}
-#else
-// for single-threaded CPU we store launch
-// bounds in static memory to share globally
-static launch_bounds_t s_launchBounds;
+#ifndef __CUDACC__
 // Only builds without __CUDACC__ keep file-level state here. The s_launchBounds global and
 // set_launch_bounds() are removed for both build flavours.
 static size_t s_threadIdx;  // NOTE(review): presumably the current work-item index on the CPU path — confirm against grid_index()
-inline void set_launch_bounds(const launch_bounds_t& b)
-{
-    s_launchBounds = b;
-}
 #endif
 inline CUDA_CALLABLE size_t grid_index()
@@ -1091,10 +1105,8 @@ inline CUDA_CALLABLE size_t grid_index()
 #endif
 }
-inline CUDA_CALLABLE int tid()
+inline CUDA_CALLABLE int tid(size_t index)
 {
-    const size_t index = grid_index();
     // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
     // Only do this in _DEBUG when called from device to avoid excessive register allocation
 #if defined(_DEBUG) || !defined(__CUDA_ARCH__)
@@ -1105,23 +1117,19 @@ inline CUDA_CALLABLE int tid()
     return static_cast<int>(index);
 }
-inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j)
+inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds)
 // 2-D overload: splits the linear index into (i, j) with j varying fastest.
 // The new signature receives index and launch_bounds from the caller; the removed lines
 // obtained them from grid_index() and the s_launchBounds global.
 {
-    const size_t index = grid_index();
-    const size_t n = s_launchBounds.shape[1];
+    const size_t n = launch_bounds.shape[1];  // extent of the second launch dimension
     // convert to work item
     i = index/n;  // size_t quotient is narrowed to int on assignment
     j = index%n;  // remainder is always < n
 }
-inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k)
+inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds)
 {
-    const size_t index = grid_index();
-    const size_t n = s_launchBounds.shape[1];
-    const size_t o = s_launchBounds.shape[2];
+    const size_t n = launch_bounds.shape[1];
+    const size_t o = launch_bounds.shape[2];
     // convert to work item
     i = index/(n*o);
@@ -1129,13 +1137,11 @@ inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k)
     k = index%o;
 }
-inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l)
+inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds)
 {
-    const size_t index = grid_index();
-    const size_t n = s_launchBounds.shape[1];
-    const size_t o = s_launchBounds.shape[2];
-    const size_t p = s_launchBounds.shape[3];
+    const size_t n = launch_bounds.shape[1];
+    const size_t o = launch_bounds.shape[2];
+    const size_t p = launch_bounds.shape[3];
     // convert to work item
     i = index/(n*o*p);
@@ -1265,9 +1271,36 @@ inline CUDA_CALLABLE int atomic_min(int* address, int val)
 #endif
 }
+// default behavior for adjoint of atomic min/max operation that accumulates gradients for all elements matching the min/max value
+template <typename T>
+CUDA_CALLABLE inline void adj_atomic_minmax(T *addr, T *adj_addr, const T &value, T &adj_value)
+{
+    if (value == *addr)  // exact equality test against the value currently stored at addr
+        adj_value += *adj_addr;  // accumulate the stored adjoint into this contributor's adjoint
+}
+// for integral types we do not accumulate gradients
 // (these non-template overloads have empty bodies and are selected over the template above
 // whenever the arguments match one of the listed types exactly)
+CUDA_CALLABLE inline void adj_atomic_minmax(int8* buf, int8* adj_buf, const int8 &value, int8 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(uint8* buf, uint8* adj_buf, const uint8 &value, uint8 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(int16* buf, int16* adj_buf, const int16 &value, int16 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(uint16* buf, uint16* adj_buf, const uint16 &value, uint16 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(int32* buf, int32* adj_buf, const int32 &value, int32 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(uint32* buf, uint32* adj_buf, const uint32 &value, uint32 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(int64* buf, int64* adj_buf, const int64 &value, int64 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(uint64* buf, uint64* adj_buf, const uint64 &value, uint64 &adj_value) { }
+CUDA_CALLABLE inline void adj_atomic_minmax(bool* buf, bool* adj_buf, const bool &value, bool &adj_value) { }
 } // namespace wp
+// bool and printf are defined outside of the wp namespace in crt.h, hence
+// their adjoint counterparts are also defined in the global namespace.
+template <typename T>
+CUDA_CALLABLE inline void adj_bool(T, T&, bool) {}  // empty body: all three parameters are unnamed and ignored
+inline CUDA_CALLABLE void adj_printf(const char* fmt, ...) {}  // empty body: accepts any printf-style argument list and does nothing
 #include "vec.h"
 #include "mat.h"
 #include "quat.h"
@@ -1432,10 +1465,6 @@ inline CUDA_CALLABLE void adj_print(transform_t<Type> t, transform_t<Type>& adj_
 inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {}
-// printf defined globally in crt.h
-inline CUDA_CALLABLE void adj_printf(const char* fmt, ...) {}
 template <typename T>
 inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected)
 {