warp-lang 1.0.1__py3-none-manylinux2014_aarch64.whl → 1.1.0__py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +108 -97
- warp/__init__.pyi +1 -1
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +115 -113
- warp/build_dll.py +383 -375
- warp/builtins.py +3425 -3354
- warp/codegen.py +2878 -2792
- warp/config.py +40 -36
- warp/constants.py +45 -45
- warp/context.py +5194 -5102
- warp/dlpack.py +442 -442
- warp/examples/__init__.py +16 -16
- warp/examples/assets/bear.usd +0 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/assets/cartpole.urdf +110 -110
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/cube.usd +0 -0
- warp/examples/assets/nv_ant.xml +92 -92
- warp/examples/assets/nv_humanoid.xml +183 -183
- warp/examples/assets/quadruped.urdf +267 -267
- warp/examples/assets/rocks.nvdb +0 -0
- warp/examples/assets/rocks.usd +0 -0
- warp/examples/assets/sphere.usd +0 -0
- warp/examples/benchmarks/benchmark_api.py +383 -383
- warp/examples/benchmarks/benchmark_cloth.py +278 -279
- warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -88
- warp/examples/benchmarks/benchmark_cloth_jax.py +97 -100
- warp/examples/benchmarks/benchmark_cloth_numba.py +146 -142
- warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -77
- warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -86
- warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -112
- warp/examples/benchmarks/benchmark_cloth_warp.py +146 -146
- warp/examples/benchmarks/benchmark_launches.py +295 -295
- warp/examples/browse.py +29 -28
- warp/examples/core/example_dem.py +234 -221
- warp/examples/core/example_fluid.py +293 -267
- warp/examples/core/example_graph_capture.py +144 -129
- warp/examples/core/example_marching_cubes.py +188 -176
- warp/examples/core/example_mesh.py +174 -154
- warp/examples/core/example_mesh_intersect.py +205 -193
- warp/examples/core/example_nvdb.py +176 -169
- warp/examples/core/example_raycast.py +105 -89
- warp/examples/core/example_raymarch.py +199 -178
- warp/examples/core/example_render_opengl.py +185 -141
- warp/examples/core/example_sph.py +405 -389
- warp/examples/core/example_torch.py +222 -181
- warp/examples/core/example_wave.py +263 -249
- warp/examples/fem/bsr_utils.py +378 -380
- warp/examples/fem/example_apic_fluid.py +407 -391
- warp/examples/fem/example_convection_diffusion.py +182 -168
- warp/examples/fem/example_convection_diffusion_dg.py +219 -209
- warp/examples/fem/example_convection_diffusion_dg0.py +204 -194
- warp/examples/fem/example_deformed_geometry.py +177 -159
- warp/examples/fem/example_diffusion.py +201 -173
- warp/examples/fem/example_diffusion_3d.py +177 -152
- warp/examples/fem/example_diffusion_mgpu.py +221 -214
- warp/examples/fem/example_mixed_elasticity.py +244 -222
- warp/examples/fem/example_navier_stokes.py +259 -243
- warp/examples/fem/example_stokes.py +220 -192
- warp/examples/fem/example_stokes_transfer.py +265 -249
- warp/examples/fem/mesh_utils.py +133 -109
- warp/examples/fem/plot_utils.py +292 -287
- warp/examples/optim/example_bounce.py +260 -248
- warp/examples/optim/example_cloth_throw.py +222 -210
- warp/examples/optim/example_diffray.py +566 -535
- warp/examples/optim/example_drone.py +864 -835
- warp/examples/optim/example_inverse_kinematics.py +176 -169
- warp/examples/optim/example_inverse_kinematics_torch.py +185 -170
- warp/examples/optim/example_spring_cage.py +239 -234
- warp/examples/optim/example_trajectory.py +223 -201
- warp/examples/optim/example_walker.py +306 -292
- warp/examples/sim/example_cartpole.py +139 -128
- warp/examples/sim/example_cloth.py +196 -184
- warp/examples/sim/example_granular.py +124 -113
- warp/examples/sim/example_granular_collision_sdf.py +197 -185
- warp/examples/sim/example_jacobian_ik.py +236 -213
- warp/examples/sim/example_particle_chain.py +118 -106
- warp/examples/sim/example_quadruped.py +193 -179
- warp/examples/sim/example_rigid_chain.py +197 -189
- warp/examples/sim/example_rigid_contact.py +189 -176
- warp/examples/sim/example_rigid_force.py +127 -126
- warp/examples/sim/example_rigid_gyroscopic.py +109 -97
- warp/examples/sim/example_rigid_soft_contact.py +134 -124
- warp/examples/sim/example_soft_body.py +190 -178
- warp/fabric.py +337 -335
- warp/fem/__init__.py +60 -27
- warp/fem/cache.py +401 -388
- warp/fem/dirichlet.py +178 -179
- warp/fem/domain.py +262 -263
- warp/fem/field/__init__.py +100 -101
- warp/fem/field/field.py +148 -149
- warp/fem/field/nodal_field.py +298 -299
- warp/fem/field/restriction.py +22 -21
- warp/fem/field/test.py +180 -181
- warp/fem/field/trial.py +183 -183
- warp/fem/geometry/__init__.py +15 -19
- warp/fem/geometry/closest_point.py +69 -70
- warp/fem/geometry/deformed_geometry.py +270 -271
- warp/fem/geometry/element.py +744 -744
- warp/fem/geometry/geometry.py +184 -186
- warp/fem/geometry/grid_2d.py +380 -373
- warp/fem/geometry/grid_3d.py +441 -435
- warp/fem/geometry/hexmesh.py +953 -953
- warp/fem/geometry/partition.py +374 -376
- warp/fem/geometry/quadmesh_2d.py +532 -532
- warp/fem/geometry/tetmesh.py +840 -840
- warp/fem/geometry/trimesh_2d.py +577 -577
- warp/fem/integrate.py +1630 -1615
- warp/fem/operator.py +190 -191
- warp/fem/polynomial.py +214 -213
- warp/fem/quadrature/__init__.py +2 -2
- warp/fem/quadrature/pic_quadrature.py +243 -245
- warp/fem/quadrature/quadrature.py +295 -294
- warp/fem/space/__init__.py +294 -292
- warp/fem/space/basis_space.py +488 -489
- warp/fem/space/collocated_function_space.py +100 -105
- warp/fem/space/dof_mapper.py +236 -236
- warp/fem/space/function_space.py +148 -145
- warp/fem/space/grid_2d_function_space.py +267 -267
- warp/fem/space/grid_3d_function_space.py +305 -306
- warp/fem/space/hexmesh_function_space.py +350 -352
- warp/fem/space/partition.py +350 -350
- warp/fem/space/quadmesh_2d_function_space.py +368 -369
- warp/fem/space/restriction.py +158 -160
- warp/fem/space/shape/__init__.py +13 -15
- warp/fem/space/shape/cube_shape_function.py +738 -738
- warp/fem/space/shape/shape_function.py +102 -103
- warp/fem/space/shape/square_shape_function.py +611 -611
- warp/fem/space/shape/tet_shape_function.py +565 -567
- warp/fem/space/shape/triangle_shape_function.py +429 -429
- warp/fem/space/tetmesh_function_space.py +294 -292
- warp/fem/space/topology.py +297 -295
- warp/fem/space/trimesh_2d_function_space.py +223 -221
- warp/fem/types.py +77 -77
- warp/fem/utils.py +495 -495
- warp/jax.py +166 -141
- warp/jax_experimental.py +341 -339
- warp/native/array.h +1072 -1025
- warp/native/builtin.h +1560 -1560
- warp/native/bvh.cpp +398 -398
- warp/native/bvh.cu +525 -525
- warp/native/bvh.h +429 -429
- warp/native/clang/clang.cpp +495 -464
- warp/native/crt.cpp +31 -31
- warp/native/crt.h +334 -334
- warp/native/cuda_crt.h +1049 -1049
- warp/native/cuda_util.cpp +549 -540
- warp/native/cuda_util.h +288 -203
- warp/native/cutlass_gemm.cpp +34 -34
- warp/native/cutlass_gemm.cu +372 -372
- warp/native/error.cpp +66 -66
- warp/native/error.h +27 -27
- warp/native/fabric.h +228 -228
- warp/native/hashgrid.cpp +301 -278
- warp/native/hashgrid.cu +78 -77
- warp/native/hashgrid.h +227 -227
- warp/native/initializer_array.h +32 -32
- warp/native/intersect.h +1204 -1204
- warp/native/intersect_adj.h +365 -365
- warp/native/intersect_tri.h +322 -322
- warp/native/marching.cpp +2 -2
- warp/native/marching.cu +497 -497
- warp/native/marching.h +2 -2
- warp/native/mat.h +1498 -1498
- warp/native/matnn.h +333 -333
- warp/native/mesh.cpp +203 -203
- warp/native/mesh.cu +293 -293
- warp/native/mesh.h +1887 -1887
- warp/native/nanovdb/NanoVDB.h +4782 -4782
- warp/native/nanovdb/PNanoVDB.h +2553 -2553
- warp/native/nanovdb/PNanoVDBWrite.h +294 -294
- warp/native/noise.h +850 -850
- warp/native/quat.h +1084 -1084
- warp/native/rand.h +299 -299
- warp/native/range.h +108 -108
- warp/native/reduce.cpp +156 -156
- warp/native/reduce.cu +348 -348
- warp/native/runlength_encode.cpp +61 -61
- warp/native/runlength_encode.cu +46 -46
- warp/native/scan.cpp +30 -30
- warp/native/scan.cu +36 -36
- warp/native/scan.h +7 -7
- warp/native/solid_angle.h +442 -442
- warp/native/sort.cpp +94 -94
- warp/native/sort.cu +97 -97
- warp/native/sort.h +14 -14
- warp/native/sparse.cpp +337 -337
- warp/native/sparse.cu +544 -544
- warp/native/spatial.h +630 -630
- warp/native/svd.h +562 -562
- warp/native/temp_buffer.h +30 -30
- warp/native/vec.h +1132 -1132
- warp/native/volume.cpp +297 -297
- warp/native/volume.cu +32 -32
- warp/native/volume.h +538 -538
- warp/native/volume_builder.cu +425 -425
- warp/native/volume_builder.h +19 -19
- warp/native/warp.cpp +1057 -1052
- warp/native/warp.cu +2943 -2828
- warp/native/warp.h +313 -305
- warp/optim/__init__.py +9 -9
- warp/optim/adam.py +120 -120
- warp/optim/linear.py +1104 -939
- warp/optim/sgd.py +104 -92
- warp/render/__init__.py +10 -10
- warp/render/render_opengl.py +3217 -3204
- warp/render/render_usd.py +768 -749
- warp/render/utils.py +152 -150
- warp/sim/__init__.py +52 -59
- warp/sim/articulation.py +685 -685
- warp/sim/collide.py +1594 -1590
- warp/sim/import_mjcf.py +489 -481
- warp/sim/import_snu.py +220 -221
- warp/sim/import_urdf.py +536 -516
- warp/sim/import_usd.py +887 -881
- warp/sim/inertia.py +316 -317
- warp/sim/integrator.py +234 -233
- warp/sim/integrator_euler.py +1956 -1956
- warp/sim/integrator_featherstone.py +1910 -1991
- warp/sim/integrator_xpbd.py +3294 -3312
- warp/sim/model.py +4473 -4314
- warp/sim/particles.py +113 -112
- warp/sim/render.py +417 -403
- warp/sim/utils.py +413 -410
- warp/sparse.py +1227 -1227
- warp/stubs.py +2109 -2469
- warp/tape.py +1162 -225
- warp/tests/__init__.py +1 -1
- warp/tests/__main__.py +4 -4
- warp/tests/assets/torus.usda +105 -105
- warp/tests/aux_test_class_kernel.py +26 -26
- warp/tests/aux_test_compile_consts_dummy.py +10 -10
- warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -21
- warp/tests/aux_test_dependent.py +22 -22
- warp/tests/aux_test_grad_customs.py +23 -23
- warp/tests/aux_test_reference.py +11 -11
- warp/tests/aux_test_reference_reference.py +10 -10
- warp/tests/aux_test_square.py +17 -17
- warp/tests/aux_test_unresolved_func.py +14 -14
- warp/tests/aux_test_unresolved_symbol.py +14 -14
- warp/tests/disabled_kinematics.py +239 -239
- warp/tests/run_coverage_serial.py +31 -31
- warp/tests/test_adam.py +157 -157
- warp/tests/test_arithmetic.py +1124 -1124
- warp/tests/test_array.py +2417 -2326
- warp/tests/test_array_reduce.py +150 -150
- warp/tests/test_async.py +668 -656
- warp/tests/test_atomic.py +141 -141
- warp/tests/test_bool.py +204 -149
- warp/tests/test_builtins_resolution.py +1292 -1292
- warp/tests/test_bvh.py +164 -171
- warp/tests/test_closest_point_edge_edge.py +228 -228
- warp/tests/test_codegen.py +566 -553
- warp/tests/test_compile_consts.py +97 -101
- warp/tests/test_conditional.py +246 -246
- warp/tests/test_copy.py +232 -215
- warp/tests/test_ctypes.py +632 -632
- warp/tests/test_dense.py +67 -67
- warp/tests/test_devices.py +91 -98
- warp/tests/test_dlpack.py +530 -529
- warp/tests/test_examples.py +400 -378
- warp/tests/test_fabricarray.py +955 -955
- warp/tests/test_fast_math.py +62 -54
- warp/tests/test_fem.py +1277 -1278
- warp/tests/test_fp16.py +130 -130
- warp/tests/test_func.py +338 -337
- warp/tests/test_generics.py +571 -571
- warp/tests/test_grad.py +746 -640
- warp/tests/test_grad_customs.py +333 -336
- warp/tests/test_hash_grid.py +210 -164
- warp/tests/test_import.py +39 -39
- warp/tests/test_indexedarray.py +1134 -1134
- warp/tests/test_intersect.py +67 -67
- warp/tests/test_jax.py +307 -307
- warp/tests/test_large.py +167 -164
- warp/tests/test_launch.py +354 -354
- warp/tests/test_lerp.py +261 -261
- warp/tests/test_linear_solvers.py +191 -171
- warp/tests/test_lvalue.py +421 -493
- warp/tests/test_marching_cubes.py +65 -65
- warp/tests/test_mat.py +1801 -1827
- warp/tests/test_mat_lite.py +115 -115
- warp/tests/test_mat_scalar_ops.py +2907 -2889
- warp/tests/test_math.py +126 -193
- warp/tests/test_matmul.py +500 -499
- warp/tests/test_matmul_lite.py +410 -410
- warp/tests/test_mempool.py +188 -190
- warp/tests/test_mesh.py +284 -324
- warp/tests/test_mesh_query_aabb.py +228 -241
- warp/tests/test_mesh_query_point.py +692 -702
- warp/tests/test_mesh_query_ray.py +292 -303
- warp/tests/test_mlp.py +276 -276
- warp/tests/test_model.py +110 -110
- warp/tests/test_modules_lite.py +39 -39
- warp/tests/test_multigpu.py +163 -163
- warp/tests/test_noise.py +248 -248
- warp/tests/test_operators.py +250 -250
- warp/tests/test_options.py +123 -125
- warp/tests/test_peer.py +133 -137
- warp/tests/test_pinned.py +78 -78
- warp/tests/test_print.py +54 -54
- warp/tests/test_quat.py +2086 -2086
- warp/tests/test_rand.py +288 -288
- warp/tests/test_reload.py +217 -217
- warp/tests/test_rounding.py +179 -179
- warp/tests/test_runlength_encode.py +190 -190
- warp/tests/test_sim_grad.py +243 -0
- warp/tests/test_sim_kinematics.py +91 -97
- warp/tests/test_smoothstep.py +168 -168
- warp/tests/test_snippet.py +305 -266
- warp/tests/test_sparse.py +468 -460
- warp/tests/test_spatial.py +2148 -2148
- warp/tests/test_streams.py +486 -473
- warp/tests/test_struct.py +710 -675
- warp/tests/test_tape.py +173 -148
- warp/tests/test_torch.py +743 -743
- warp/tests/test_transient_module.py +87 -87
- warp/tests/test_types.py +556 -659
- warp/tests/test_utils.py +490 -499
- warp/tests/test_vec.py +1264 -1268
- warp/tests/test_vec_lite.py +73 -73
- warp/tests/test_vec_scalar_ops.py +2099 -2099
- warp/tests/test_verify_fp.py +94 -94
- warp/tests/test_volume.py +737 -736
- warp/tests/test_volume_write.py +255 -265
- warp/tests/unittest_serial.py +37 -37
- warp/tests/unittest_suites.py +363 -359
- warp/tests/unittest_utils.py +603 -578
- warp/tests/unused_test_misc.py +71 -71
- warp/tests/walkthrough_debug.py +85 -85
- warp/thirdparty/appdirs.py +598 -598
- warp/thirdparty/dlpack.py +143 -143
- warp/thirdparty/unittest_parallel.py +566 -561
- warp/torch.py +321 -295
- warp/types.py +4504 -4450
- warp/utils.py +1008 -821
- {warp_lang-1.0.1.dist-info → warp_lang-1.1.0.dist-info}/LICENSE.md +126 -126
- {warp_lang-1.0.1.dist-info → warp_lang-1.1.0.dist-info}/METADATA +338 -400
- warp_lang-1.1.0.dist-info/RECORD +352 -0
- warp/examples/assets/cube.usda +0 -42
- warp/examples/assets/sphere.usda +0 -56
- warp/examples/assets/torus.usda +0 -105
- warp_lang-1.0.1.dist-info/RECORD +0 -352
- {warp_lang-1.0.1.dist-info → warp_lang-1.1.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.0.1.dist-info → warp_lang-1.1.0.dist-info}/top_level.txt +0 -0
warp/native/builtin.h
CHANGED
|
@@ -1,1560 +1,1560 @@
|
|
|
1
|
-
/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
-
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
-
* and proprietary rights in and to this software, related documentation
|
|
4
|
-
* and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
-
* distribution of this software and related documentation without an express
|
|
6
|
-
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
-
*/
|
|
8
|
-
|
|
9
|
-
#pragma once
|
|
10
|
-
|
|
11
|
-
// All built-in types and functions. To be compatible with runtime NVRTC compilation
|
|
12
|
-
// this header must be independently compilable (i.e.: without external SDK headers)
|
|
13
|
-
// to achieve this we redefine a subset of CRT functions (printf, pow, sin, cos, etc)
|
|
14
|
-
|
|
15
|
-
#include "crt.h"
|
|
16
|
-
|
|
17
|
-
#ifdef _WIN32
|
|
18
|
-
#define __restrict__ __restrict
|
|
19
|
-
#endif
|
|
20
|
-
|
|
21
|
-
#if !defined(__CUDACC__)
|
|
22
|
-
#define CUDA_CALLABLE
|
|
23
|
-
#define CUDA_CALLABLE_DEVICE
|
|
24
|
-
#else
|
|
25
|
-
#define CUDA_CALLABLE __host__ __device__
|
|
26
|
-
#define CUDA_CALLABLE_DEVICE __device__
|
|
27
|
-
#endif
|
|
28
|
-
|
|
29
|
-
#ifdef WP_VERIFY_FP
|
|
30
|
-
#define FP_CHECK 1
|
|
31
|
-
#define DO_IF_FPCHECK(X) {X}
|
|
32
|
-
#define DO_IF_NO_FPCHECK(X)
|
|
33
|
-
#else
|
|
34
|
-
#define FP_CHECK 0
|
|
35
|
-
#define DO_IF_FPCHECK(X)
|
|
36
|
-
#define DO_IF_NO_FPCHECK(X) {X}
|
|
37
|
-
#endif
|
|
38
|
-
|
|
39
|
-
#define RAD_TO_DEG 57.29577951308232087679
|
|
40
|
-
#define DEG_TO_RAD 0.01745329251994329577
|
|
41
|
-
|
|
42
|
-
#if defined(__CUDACC__) && !defined(_MSC_VER)
|
|
43
|
-
__device__ void __debugbreak() {}
|
|
44
|
-
#endif
|
|
45
|
-
|
|
46
|
-
namespace wp
|
|
47
|
-
{
|
|
48
|
-
|
|
49
|
-
// numeric types (used from generated kernels)
|
|
50
|
-
typedef float float32;
|
|
51
|
-
typedef double float64;
|
|
52
|
-
|
|
53
|
-
typedef int8_t int8;
|
|
54
|
-
typedef uint8_t uint8;
|
|
55
|
-
|
|
56
|
-
typedef int16_t int16;
|
|
57
|
-
typedef uint16_t uint16;
|
|
58
|
-
|
|
59
|
-
typedef int32_t int32;
|
|
60
|
-
typedef uint32_t uint32;
|
|
61
|
-
|
|
62
|
-
typedef int64_t int64;
|
|
63
|
-
typedef uint64_t uint64;
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
// matches Python string type for constant strings
|
|
67
|
-
typedef const char* str;
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
struct half;
|
|
72
|
-
|
|
73
|
-
CUDA_CALLABLE half float_to_half(float x);
|
|
74
|
-
CUDA_CALLABLE float half_to_float(half x);
|
|
75
|
-
|
|
76
|
-
struct half
|
|
77
|
-
{
|
|
78
|
-
CUDA_CALLABLE inline half() : u(0) {}
|
|
79
|
-
|
|
80
|
-
CUDA_CALLABLE inline half(float f)
|
|
81
|
-
{
|
|
82
|
-
*this = float_to_half(f);
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
unsigned short u;
|
|
86
|
-
|
|
87
|
-
CUDA_CALLABLE inline bool operator==(const half& h) const { return u == h.u; }
|
|
88
|
-
CUDA_CALLABLE inline bool operator!=(const half& h) const { return u != h.u; }
|
|
89
|
-
CUDA_CALLABLE inline bool operator>(const half& h) const { return half_to_float(*this) > half_to_float(h); }
|
|
90
|
-
CUDA_CALLABLE inline bool operator>=(const half& h) const { return half_to_float(*this) >= half_to_float(h); }
|
|
91
|
-
CUDA_CALLABLE inline bool operator<(const half& h) const { return half_to_float(*this) < half_to_float(h); }
|
|
92
|
-
CUDA_CALLABLE inline bool operator<=(const half& h) const { return half_to_float(*this) <= half_to_float(h); }
|
|
93
|
-
|
|
94
|
-
CUDA_CALLABLE inline bool operator!() const
|
|
95
|
-
{
|
|
96
|
-
return float32(*this) == 0;
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
CUDA_CALLABLE inline half operator*=(const half& h)
|
|
100
|
-
{
|
|
101
|
-
half prod = half(float32(*this) * float32(h));
|
|
102
|
-
this->u = prod.u;
|
|
103
|
-
return *this;
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
CUDA_CALLABLE inline half operator/=(const half& h)
|
|
107
|
-
{
|
|
108
|
-
half quot = half(float32(*this) / float32(h));
|
|
109
|
-
this->u = quot.u;
|
|
110
|
-
return *this;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
CUDA_CALLABLE inline half operator+=(const half& h)
|
|
114
|
-
{
|
|
115
|
-
half sum = half(float32(*this) + float32(h));
|
|
116
|
-
this->u = sum.u;
|
|
117
|
-
return *this;
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
CUDA_CALLABLE inline half operator-=(const half& h)
|
|
121
|
-
{
|
|
122
|
-
half diff = half(float32(*this) - float32(h));
|
|
123
|
-
this->u = diff.u;
|
|
124
|
-
return *this;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
CUDA_CALLABLE inline operator float32() const { return float32(half_to_float(*this)); }
|
|
128
|
-
CUDA_CALLABLE inline operator float64() const { return float64(half_to_float(*this)); }
|
|
129
|
-
CUDA_CALLABLE inline operator int8() const { return int8(half_to_float(*this)); }
|
|
130
|
-
CUDA_CALLABLE inline operator uint8() const { return uint8(half_to_float(*this)); }
|
|
131
|
-
CUDA_CALLABLE inline operator int16() const { return int16(half_to_float(*this)); }
|
|
132
|
-
CUDA_CALLABLE inline operator uint16() const { return uint16(half_to_float(*this)); }
|
|
133
|
-
CUDA_CALLABLE inline operator int32() const { return int32(half_to_float(*this)); }
|
|
134
|
-
CUDA_CALLABLE inline operator uint32() const { return uint32(half_to_float(*this)); }
|
|
135
|
-
CUDA_CALLABLE inline operator int64() const { return int64(half_to_float(*this)); }
|
|
136
|
-
CUDA_CALLABLE inline operator uint64() const { return uint64(half_to_float(*this)); }
|
|
137
|
-
};
|
|
138
|
-
|
|
139
|
-
static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
|
|
140
|
-
|
|
141
|
-
typedef half float16;
|
|
142
|
-
|
|
143
|
-
#if defined(__CUDA_ARCH__)
|
|
144
|
-
|
|
145
|
-
CUDA_CALLABLE inline half float_to_half(float x)
|
|
146
|
-
{
|
|
147
|
-
half h;
|
|
148
|
-
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(h.u) : "f"(x));
|
|
149
|
-
return h;
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
CUDA_CALLABLE inline float half_to_float(half x)
|
|
153
|
-
{
|
|
154
|
-
float val;
|
|
155
|
-
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(x.u));
|
|
156
|
-
return val;
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
#elif defined(__clang__)
|
|
160
|
-
|
|
161
|
-
// _Float16 is Clang's native half-precision floating-point type
|
|
162
|
-
inline half float_to_half(float x)
|
|
163
|
-
{
|
|
164
|
-
|
|
165
|
-
_Float16 f16 = static_cast<_Float16>(x);
|
|
166
|
-
return *reinterpret_cast<half*>(&f16);
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
inline float half_to_float(half h)
|
|
170
|
-
{
|
|
171
|
-
_Float16 f16 = *reinterpret_cast<_Float16*>(&h);
|
|
172
|
-
return static_cast<float>(f16);
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
#else // Native C++ for Warp builtins outside of kernels
|
|
176
|
-
|
|
177
|
-
extern "C" WP_API uint16_t float_to_half_bits(float x);
|
|
178
|
-
extern "C" WP_API float half_bits_to_float(uint16_t u);
|
|
179
|
-
|
|
180
|
-
inline half float_to_half(float x)
|
|
181
|
-
{
|
|
182
|
-
half h;
|
|
183
|
-
h.u = float_to_half_bits(x);
|
|
184
|
-
return h;
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
inline float half_to_float(half h)
|
|
188
|
-
{
|
|
189
|
-
return half_bits_to_float(h.u);
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
#endif
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
// BAD operator implementations for fp16 arithmetic...
|
|
196
|
-
|
|
197
|
-
// negation:
|
|
198
|
-
inline CUDA_CALLABLE half operator - (half a)
|
|
199
|
-
{
|
|
200
|
-
return float_to_half( -half_to_float(a) );
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
inline CUDA_CALLABLE half operator + (half a,half b)
|
|
204
|
-
{
|
|
205
|
-
return float_to_half( half_to_float(a) + half_to_float(b) );
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
inline CUDA_CALLABLE half operator - (half a,half b)
|
|
209
|
-
{
|
|
210
|
-
return float_to_half( half_to_float(a) - half_to_float(b) );
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
inline CUDA_CALLABLE half operator * (half a,half b)
|
|
214
|
-
{
|
|
215
|
-
return float_to_half( half_to_float(a) * half_to_float(b) );
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
inline CUDA_CALLABLE half operator * (half a,double b)
|
|
219
|
-
{
|
|
220
|
-
return float_to_half( half_to_float(a) * b );
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
inline CUDA_CALLABLE half operator * (double a,half b)
|
|
224
|
-
{
|
|
225
|
-
return float_to_half( a * half_to_float(b) );
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
inline CUDA_CALLABLE half operator / (half a,half b)
|
|
229
|
-
{
|
|
230
|
-
return float_to_half( half_to_float(a) / half_to_float(b) );
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
template <typename T>
|
|
238
|
-
CUDA_CALLABLE float cast_float(T x) { return (float)(x); }
|
|
239
|
-
|
|
240
|
-
template <typename T>
|
|
241
|
-
CUDA_CALLABLE int cast_int(T x) { return (int)(x); }
|
|
242
|
-
|
|
243
|
-
template <typename T>
|
|
244
|
-
CUDA_CALLABLE void adj_cast_float(T x, T& adj_x, float adj_ret) { adj_x += T(adj_ret); }
|
|
245
|
-
|
|
246
|
-
template <typename T>
|
|
247
|
-
CUDA_CALLABLE void adj_cast_int(T x, T& adj_x, int adj_ret) { adj_x += adj_ret; }
|
|
248
|
-
|
|
249
|
-
template <typename T>
|
|
250
|
-
CUDA_CALLABLE inline void adj_int8(T, T&, int8) {}
|
|
251
|
-
template <typename T>
|
|
252
|
-
CUDA_CALLABLE inline void adj_uint8(T, T&, uint8) {}
|
|
253
|
-
template <typename T>
|
|
254
|
-
CUDA_CALLABLE inline void adj_int16(T, T&, int16) {}
|
|
255
|
-
template <typename T>
|
|
256
|
-
CUDA_CALLABLE inline void adj_uint16(T, T&, uint16) {}
|
|
257
|
-
template <typename T>
|
|
258
|
-
CUDA_CALLABLE inline void adj_int32(T, T&, int32) {}
|
|
259
|
-
template <typename T>
|
|
260
|
-
CUDA_CALLABLE inline void adj_uint32(T, T&, uint32) {}
|
|
261
|
-
template <typename T>
|
|
262
|
-
CUDA_CALLABLE inline void adj_int64(T, T&, int64) {}
|
|
263
|
-
template <typename T>
|
|
264
|
-
CUDA_CALLABLE inline void adj_uint64(T, T&, uint64) {}
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
template <typename T>
|
|
268
|
-
CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float16 adj_ret) { adj_x += T(adj_ret); }
|
|
269
|
-
template <typename T>
|
|
270
|
-
CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret) { adj_x += T(adj_ret); }
|
|
271
|
-
template <typename T>
|
|
272
|
-
CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret) { adj_x += T(adj_ret); }
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
#define kEps 0.0f
|
|
276
|
-
|
|
277
|
-
// basic ops for integer types
|
|
278
|
-
#define DECLARE_INT_OPS(T) \
|
|
279
|
-
inline CUDA_CALLABLE T mul(T a, T b) { return a*b; } \
|
|
280
|
-
inline CUDA_CALLABLE T div(T a, T b) { return a/b; } \
|
|
281
|
-
inline CUDA_CALLABLE T add(T a, T b) { return a+b; } \
|
|
282
|
-
inline CUDA_CALLABLE T sub(T a, T b) { return a-b; } \
|
|
283
|
-
inline CUDA_CALLABLE T mod(T a, T b) { return a%b; } \
|
|
284
|
-
inline CUDA_CALLABLE T min(T a, T b) { return a<b?a:b; } \
|
|
285
|
-
inline CUDA_CALLABLE T max(T a, T b) { return a>b?a:b; } \
|
|
286
|
-
inline CUDA_CALLABLE T clamp(T x, T a, T b) { return min(max(a, x), b); } \
|
|
287
|
-
inline CUDA_CALLABLE T floordiv(T a, T b) { return a/b; } \
|
|
288
|
-
inline CUDA_CALLABLE T nonzero(T x) { return x == T(0) ? T(0) : T(1); } \
|
|
289
|
-
inline CUDA_CALLABLE T sqrt(T x) { return 0; } \
|
|
290
|
-
inline CUDA_CALLABLE T bit_and(T a, T b) { return a&b; } \
|
|
291
|
-
inline CUDA_CALLABLE T bit_or(T a, T b) { return a|b; } \
|
|
292
|
-
inline CUDA_CALLABLE T bit_xor(T a, T b) { return a^b; } \
|
|
293
|
-
inline CUDA_CALLABLE T lshift(T a, T b) { return a<<b; } \
|
|
294
|
-
inline CUDA_CALLABLE T rshift(T a, T b) { return a>>b; } \
|
|
295
|
-
inline CUDA_CALLABLE T invert(T x) { return ~x; } \
|
|
296
|
-
inline CUDA_CALLABLE bool isfinite(T x) { return true; } \
|
|
297
|
-
inline CUDA_CALLABLE void adj_mul(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
298
|
-
inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
299
|
-
inline CUDA_CALLABLE void adj_add(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
300
|
-
inline CUDA_CALLABLE void adj_sub(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
301
|
-
inline CUDA_CALLABLE void adj_mod(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
302
|
-
inline CUDA_CALLABLE void adj_min(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
303
|
-
inline CUDA_CALLABLE void adj_max(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
304
|
-
inline CUDA_CALLABLE void adj_abs(T x, T adj_x, T& adj_ret) { } \
|
|
305
|
-
inline CUDA_CALLABLE void adj_sign(T x, T adj_x, T& adj_ret) { } \
|
|
306
|
-
inline CUDA_CALLABLE void adj_clamp(T x, T a, T b, T& adj_x, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
307
|
-
inline CUDA_CALLABLE void adj_floordiv(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
308
|
-
inline CUDA_CALLABLE void adj_step(T x, T& adj_x, T adj_ret) { } \
|
|
309
|
-
inline CUDA_CALLABLE void adj_nonzero(T x, T& adj_x, T adj_ret) { } \
|
|
310
|
-
inline CUDA_CALLABLE void adj_sqrt(T x, T adj_x, T& adj_ret) { } \
|
|
311
|
-
inline CUDA_CALLABLE void adj_bit_and(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
312
|
-
inline CUDA_CALLABLE void adj_bit_or(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
313
|
-
inline CUDA_CALLABLE void adj_bit_xor(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
314
|
-
inline CUDA_CALLABLE void adj_lshift(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
315
|
-
inline CUDA_CALLABLE void adj_rshift(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
316
|
-
inline CUDA_CALLABLE void adj_invert(T x, T adj_x, T& adj_ret) { }
|
|
317
|
-
|
|
318
|
-
inline CUDA_CALLABLE int8 abs(int8 x) { return ::abs(x); }
|
|
319
|
-
inline CUDA_CALLABLE int16 abs(int16 x) { return ::abs(x); }
|
|
320
|
-
inline CUDA_CALLABLE int32 abs(int32 x) { return ::abs(x); }
|
|
321
|
-
inline CUDA_CALLABLE int64 abs(int64 x) { return ::llabs(x); }
|
|
322
|
-
inline CUDA_CALLABLE uint8 abs(uint8 x) { return x; }
|
|
323
|
-
inline CUDA_CALLABLE uint16 abs(uint16 x) { return x; }
|
|
324
|
-
inline CUDA_CALLABLE uint32 abs(uint32 x) { return x; }
|
|
325
|
-
inline CUDA_CALLABLE uint64 abs(uint64 x) { return x; }
|
|
326
|
-
|
|
327
|
-
DECLARE_INT_OPS(int8)
|
|
328
|
-
DECLARE_INT_OPS(int16)
|
|
329
|
-
DECLARE_INT_OPS(int32)
|
|
330
|
-
DECLARE_INT_OPS(int64)
|
|
331
|
-
DECLARE_INT_OPS(uint8)
|
|
332
|
-
DECLARE_INT_OPS(uint16)
|
|
333
|
-
DECLARE_INT_OPS(uint32)
|
|
334
|
-
DECLARE_INT_OPS(uint64)
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
inline CUDA_CALLABLE int8 step(int8 x) { return x < 0 ? 1 : 0; }
|
|
338
|
-
inline CUDA_CALLABLE int16 step(int16 x) { return x < 0 ? 1 : 0; }
|
|
339
|
-
inline CUDA_CALLABLE int32 step(int32 x) { return x < 0 ? 1 : 0; }
|
|
340
|
-
inline CUDA_CALLABLE int64 step(int64 x) { return x < 0 ? 1 : 0; }
|
|
341
|
-
inline CUDA_CALLABLE uint8 step(uint8 x) { return 0; }
|
|
342
|
-
inline CUDA_CALLABLE uint16 step(uint16 x) { return 0; }
|
|
343
|
-
inline CUDA_CALLABLE uint32 step(uint32 x) { return 0; }
|
|
344
|
-
inline CUDA_CALLABLE uint64 step(uint64 x) { return 0; }
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
inline CUDA_CALLABLE int8 sign(int8 x) { return x < 0 ? -1 : 1; }
|
|
348
|
-
inline CUDA_CALLABLE int8 sign(int16 x) { return x < 0 ? -1 : 1; }
|
|
349
|
-
inline CUDA_CALLABLE int8 sign(int32 x) { return x < 0 ? -1 : 1; }
|
|
350
|
-
inline CUDA_CALLABLE int8 sign(int64 x) { return x < 0 ? -1 : 1; }
|
|
351
|
-
inline CUDA_CALLABLE uint8 sign(uint8 x) { return 1; }
|
|
352
|
-
inline CUDA_CALLABLE uint16 sign(uint16 x) { return 1; }
|
|
353
|
-
inline CUDA_CALLABLE uint32 sign(uint32 x) { return 1; }
|
|
354
|
-
inline CUDA_CALLABLE uint64 sign(uint64 x) { return 1; }
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
// Catch-all for non-float types
|
|
358
|
-
template<typename T>
|
|
359
|
-
inline bool CUDA_CALLABLE isfinite(const T&)
|
|
360
|
-
{
|
|
361
|
-
return true;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
inline bool CUDA_CALLABLE isfinite(half x)
|
|
365
|
-
{
|
|
366
|
-
return ::isfinite(float(x));
|
|
367
|
-
}
|
|
368
|
-
inline bool CUDA_CALLABLE isfinite(float x)
|
|
369
|
-
{
|
|
370
|
-
return ::isfinite(x);
|
|
371
|
-
}
|
|
372
|
-
inline bool CUDA_CALLABLE isfinite(double x)
|
|
373
|
-
{
|
|
374
|
-
return ::isfinite(x);
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
template<typename T>
|
|
378
|
-
inline CUDA_CALLABLE void print(const T&)
|
|
379
|
-
{
|
|
380
|
-
printf("<type without print implementation>\n");
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
inline CUDA_CALLABLE void print(float16 f)
|
|
384
|
-
{
|
|
385
|
-
printf("%g\n", half_to_float(f));
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
inline CUDA_CALLABLE void print(float f)
|
|
389
|
-
{
|
|
390
|
-
printf("%g\n", f);
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
inline CUDA_CALLABLE void print(double f)
|
|
394
|
-
{
|
|
395
|
-
printf("%g\n", f);
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
// basic ops for float types
//
// DECLARE_FLOAT_OPS(T) stamps out elementary arithmetic (mul/add/sub/min/max,
// sign/step/nonzero/clamp, checked div) and their reverse-mode adjoints
// (adj_*) for a floating point type T. Adjoints accumulate into adj_* output
// parameters. div()/adj_div() abort under FP_CHECK (via DO_IF_FPCHECK) on
// non-finite operands, division by zero, or non-finite gradients.
// NOTE: no comments may appear inside the macro body - a '//' would swallow
// the trailing line-continuation backslash.
#define DECLARE_FLOAT_OPS(T) \
inline CUDA_CALLABLE T mul(T a, T b) { return a*b; } \
inline CUDA_CALLABLE T add(T a, T b) { return a+b; } \
inline CUDA_CALLABLE T sub(T a, T b) { return a-b; } \
inline CUDA_CALLABLE T min(T a, T b) { return a<b?a:b; } \
inline CUDA_CALLABLE T max(T a, T b) { return a>b?a:b; } \
inline CUDA_CALLABLE T sign(T x) { return x < T(0) ? -1 : 1; } \
inline CUDA_CALLABLE T step(T x) { return x < T(0) ? T(1) : T(0); }\
inline CUDA_CALLABLE T nonzero(T x) { return x == T(0) ? T(0) : T(1); }\
inline CUDA_CALLABLE T clamp(T x, T a, T b) { return min(max(a, x), b); }\
inline CUDA_CALLABLE void adj_abs(T x, T& adj_x, T adj_ret) \
{\
    if (x < T(0))\
        adj_x -= adj_ret;\
    else\
        adj_x += adj_ret;\
}\
inline CUDA_CALLABLE void adj_mul(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += b*adj_ret; adj_b += a*adj_ret; } \
inline CUDA_CALLABLE void adj_add(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += adj_ret; adj_b += adj_ret; } \
inline CUDA_CALLABLE void adj_sub(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += adj_ret; adj_b -= adj_ret; } \
inline CUDA_CALLABLE void adj_min(T a, T b, T& adj_a, T& adj_b, T adj_ret) \
{ \
    if (a < b) \
        adj_a += adj_ret; \
    else \
        adj_b += adj_ret; \
} \
inline CUDA_CALLABLE void adj_max(T a, T b, T& adj_a, T& adj_b, T adj_ret) \
{ \
    if (a > b) \
        adj_a += adj_ret; \
    else \
        adj_b += adj_ret; \
} \
inline CUDA_CALLABLE void adj_floordiv(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
inline CUDA_CALLABLE void adj_mod(T a, T b, T& adj_a, T& adj_b, T adj_ret){ adj_a += adj_ret; }\
inline CUDA_CALLABLE void adj_sign(T x, T adj_x, T& adj_ret) { }\
inline CUDA_CALLABLE void adj_step(T x, T& adj_x, T adj_ret) { }\
inline CUDA_CALLABLE void adj_nonzero(T x, T& adj_x, T adj_ret) { }\
inline CUDA_CALLABLE void adj_clamp(T x, T a, T b, T& adj_x, T& adj_a, T& adj_b, T adj_ret)\
{\
    if (x < a)\
        adj_a += adj_ret;\
    else if (x > b)\
        adj_b += adj_ret;\
    else\
        adj_x += adj_ret;\
}\
inline CUDA_CALLABLE T div(T a, T b)\
{\
    DO_IF_FPCHECK(\
    if (!isfinite(a) || !isfinite(b) || b == T(0))\
    {\
        printf("%s:%d div(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));\
        assert(0);\
    })\
    return a/b;\
}\
inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
{\
    adj_a += adj_ret/b;\
    adj_b -= adj_ret*(ret)/b;\
    DO_IF_FPCHECK(\
    if (!isfinite(adj_a) || !isfinite(adj_b))\
    {\
        printf("%s:%d - adj_div(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(a), float(b), float(adj_a), float(adj_b), float(adj_ret));\
        assert(0);\
    })\
}\

// Instantiate the ops for all supported float widths.
DECLARE_FLOAT_OPS(float16)
DECLARE_FLOAT_OPS(float32)
DECLARE_FLOAT_OPS(float64)
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
// basic ops for float types
// mod(a, b): floating point remainder (fmod semantics: result has the sign
// of a). Under FP_CHECK builds, aborts on non-finite operands or b == 0.
inline CUDA_CALLABLE float16 mod(float16 a, float16 b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
    {
        printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
        assert(0);
    }
#endif
    // computed in float precision, narrowed back to float16 on return
    return fmodf(float(a), float(b));
}

inline CUDA_CALLABLE float32 mod(float32 a, float32 b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || b == 0.0f)
    {
        printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return fmodf(a, b);
}

inline CUDA_CALLABLE double mod(double a, double b)
{
#if FP_CHECK
    // NOTE: b is compared against the float literal 0.0f; it is promoted to
    // double for the comparison, so this is equivalent to b == 0.0
    if (!isfinite(a) || !isfinite(b) || b == 0.0f)
    {
        printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return fmod(a, b);
}
|
|
512
|
-
|
|
513
|
-
// Natural/base-2/base-10 logarithms for half, float and double.
// Under FP_CHECK builds each overload aborts on non-finite or negative input.
// half overloads compute in float precision and narrow on return.
inline CUDA_CALLABLE half log(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif
    // relies on the implicit half -> float conversion for the ::logf argument
    return ::logf(a);
}

inline CUDA_CALLABLE float log(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif
    return ::logf(a);
}

inline CUDA_CALLABLE double log(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif
    return ::log(a);
}

inline CUDA_CALLABLE half log2(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif

    return ::log2f(float(a));
}

inline CUDA_CALLABLE float log2(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log2f(a);
}

inline CUDA_CALLABLE double log2(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log2(a);
}

inline CUDA_CALLABLE half log10(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif

    return ::log10f(float(a));
}

inline CUDA_CALLABLE float log10(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log10f(a);
}

inline CUDA_CALLABLE double log10(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log10(a);
}
|
|
626
|
-
|
|
627
|
-
// exp(a): e raised to the power a. The result is computed first so that
// FP_CHECK builds can also verify the result did not overflow to infinity.
inline CUDA_CALLABLE half exp(half a)
{
    // computed in float precision, narrowed to half before the check
    half result = ::expf(float(a));
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, float(a), float(result));
        assert(0);
    }
#endif
    return result;
}
inline CUDA_CALLABLE float exp(float a)
{
    float result = ::expf(a);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result);
        assert(0);
    }
#endif
    return result;
}
inline CUDA_CALLABLE double exp(double a)
{
    double result = ::exp(a);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result);
        assert(0);
    }
#endif
    return result;
}

// pow(a, b): a raised to the power b; FP_CHECK builds abort on non-finite
// inputs or results (e.g. negative base with fractional exponent).
inline CUDA_CALLABLE half pow(half a, half b)
{
    // computed in float precision; narrowed to half on return
    float result = ::powf(float(a), float(b));
#if FP_CHECK
    if (!isfinite(float(a)) || !isfinite(float(b)) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, float(a), float(b), result);
        assert(0);
    }
#endif
    return result;
}

inline CUDA_CALLABLE float pow(float a, float b)
{
    float result = ::powf(a, b);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result);
        assert(0);
    }
#endif
    return result;
}

inline CUDA_CALLABLE double pow(double a, double b)
{
    double result = ::pow(a, b);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result);
        assert(0);
    }
#endif
    return result;
}
|
|
702
|
-
|
|
703
|
-
// floordiv(a, b): division with the quotient rounded toward negative
// infinity. Under FP_CHECK builds, aborts on non-finite operands or b == 0.
// Fix: the FP_CHECK diagnostics previously printed "mod(...)" (copied from
// the mod() overloads); they now correctly name floordiv.
inline CUDA_CALLABLE half floordiv(half a, half b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
    {
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
        assert(0);
    }
#endif
    // half division, floored in float precision, narrowed back on return
    return floorf(float(a/b));
}
inline CUDA_CALLABLE float floordiv(float a, float b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || b == 0.0f)
    {
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return floorf(a/b);
}
inline CUDA_CALLABLE double floordiv(double a, double b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || b == 0.0)
    {
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return ::floor(a/b);
}
|
|
736
|
-
|
|
737
|
-
// leaky_min/leaky_max: forward pass is identical to min/max; the r parameter
// is unused here and only affects the adjoint (adj_leaky_min/adj_leaky_max),
// where a fraction r of the gradient flows to the non-selected argument.
inline CUDA_CALLABLE float leaky_min(float a, float b, float r) { return min(a, b); }
inline CUDA_CALLABLE float leaky_max(float a, float b, float r) { return max(a, b); }

// abs(): absolute value; the half overload computes in float precision.
inline CUDA_CALLABLE half abs(half x) { return ::fabsf(float(x)); }
inline CUDA_CALLABLE float abs(float x) { return ::fabsf(x); }
inline CUDA_CALLABLE double abs(double x) { return ::fabs(x); }
|
|
743
|
-
|
|
744
|
-
// Trigonometric functions for float, double and half.
// acos/asin clamp the argument to [-1, 1] first so that inputs slightly
// outside the domain (e.g. from rounding error) do not produce NaN.
inline CUDA_CALLABLE float acos(float x){ return ::acosf(min(max(x, -1.0f), 1.0f)); }
inline CUDA_CALLABLE float asin(float x){ return ::asinf(min(max(x, -1.0f), 1.0f)); }
inline CUDA_CALLABLE float atan(float x) { return ::atanf(x); }
inline CUDA_CALLABLE float atan2(float y, float x) { return ::atan2f(y, x); }
inline CUDA_CALLABLE float sin(float x) { return ::sinf(x); }
inline CUDA_CALLABLE float cos(float x) { return ::cosf(x); }

inline CUDA_CALLABLE double acos(double x){ return ::acos(min(max(x, -1.0), 1.0)); }
inline CUDA_CALLABLE double asin(double x){ return ::asin(min(max(x, -1.0), 1.0)); }
inline CUDA_CALLABLE double atan(double x) { return ::atan(x); }
inline CUDA_CALLABLE double atan2(double y, double x) { return ::atan2(y, x); }
inline CUDA_CALLABLE double sin(double x) { return ::sin(x); }
inline CUDA_CALLABLE double cos(double x) { return ::cos(x); }

// half overloads compute in float precision and narrow on return.
inline CUDA_CALLABLE half acos(half x){ return ::acosf(min(max(float(x), -1.0f), 1.0f)); }
inline CUDA_CALLABLE half asin(half x){ return ::asinf(min(max(float(x), -1.0f), 1.0f)); }
inline CUDA_CALLABLE half atan(half x) { return ::atanf(float(x)); }
inline CUDA_CALLABLE half atan2(half y, half x) { return ::atan2f(float(y), float(x)); }
inline CUDA_CALLABLE half sin(half x) { return ::sinf(float(x)); }
inline CUDA_CALLABLE half cos(half x) { return ::cosf(float(x)); }
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
// sqrt(): square root; FP_CHECK builds abort on negative input
// (which would otherwise yield NaN).
inline CUDA_CALLABLE float sqrt(float x)
{
#if FP_CHECK
    if (x < 0.0f)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x);
        assert(0);
    }
#endif
    return ::sqrtf(x);
}
inline CUDA_CALLABLE double sqrt(double x)
{
#if FP_CHECK
    if (x < 0.0)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x);
        assert(0);
    }
#endif
    return ::sqrt(x);
}
inline CUDA_CALLABLE half sqrt(half x)
{
#if FP_CHECK
    if (float(x) < 0.0f)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, float(x));
        assert(0);
    }
#endif
    // computed in float precision, narrowed to half on return
    return ::sqrtf(float(x));
}

// cbrt(): cube root (defined for negative input, so no FP_CHECK needed).
inline CUDA_CALLABLE float cbrt(float x) { return ::cbrtf(x); }
inline CUDA_CALLABLE double cbrt(double x) { return ::cbrt(x); }
inline CUDA_CALLABLE half cbrt(half x) { return ::cbrtf(float(x)); }
|
|
803
|
-
|
|
804
|
-
// tan/hyperbolic functions and degree/radian conversion, for float, double
// and half. half overloads compute in float precision and narrow on return.
inline CUDA_CALLABLE float tan(float x) { return ::tanf(x); }
inline CUDA_CALLABLE float sinh(float x) { return ::sinhf(x);}
inline CUDA_CALLABLE float cosh(float x) { return ::coshf(x);}
inline CUDA_CALLABLE float tanh(float x) { return ::tanhf(x);}
inline CUDA_CALLABLE float degrees(float x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE float radians(float x) { return x * DEG_TO_RAD;}

inline CUDA_CALLABLE double tan(double x) { return ::tan(x); }
inline CUDA_CALLABLE double sinh(double x) { return ::sinh(x);}
inline CUDA_CALLABLE double cosh(double x) { return ::cosh(x);}
inline CUDA_CALLABLE double tanh(double x) { return ::tanh(x);}
inline CUDA_CALLABLE double degrees(double x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE double radians(double x) { return x * DEG_TO_RAD;}

inline CUDA_CALLABLE half tan(half x) { return ::tanf(float(x)); }
inline CUDA_CALLABLE half sinh(half x) { return ::sinhf(float(x));}
inline CUDA_CALLABLE half cosh(half x) { return ::coshf(float(x));}
inline CUDA_CALLABLE half tanh(half x) { return ::tanhf(float(x));}
inline CUDA_CALLABLE half degrees(half x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE half radians(half x) { return x * DEG_TO_RAD;}

// Rounding family:
//   round - nearest, ties away from zero; rint - nearest, current rounding
//   mode (ties-to-even); trunc - toward zero; floor/ceil - toward -/+inf;
//   frac - fractional part, x - trunc(x) (keeps the sign of x).
inline CUDA_CALLABLE float round(float x) { return ::roundf(x); }
inline CUDA_CALLABLE float rint(float x) { return ::rintf(x); }
inline CUDA_CALLABLE float trunc(float x) { return ::truncf(x); }
inline CUDA_CALLABLE float floor(float x) { return ::floorf(x); }
inline CUDA_CALLABLE float ceil(float x) { return ::ceilf(x); }
inline CUDA_CALLABLE float frac(float x) { return x - trunc(x); }

inline CUDA_CALLABLE double round(double x) { return ::round(x); }
inline CUDA_CALLABLE double rint(double x) { return ::rint(x); }
inline CUDA_CALLABLE double trunc(double x) { return ::trunc(x); }
inline CUDA_CALLABLE double floor(double x) { return ::floor(x); }
inline CUDA_CALLABLE double ceil(double x) { return ::ceil(x); }
inline CUDA_CALLABLE double frac(double x) { return x - trunc(x); }

inline CUDA_CALLABLE half round(half x) { return ::roundf(float(x)); }
inline CUDA_CALLABLE half rint(half x) { return ::rintf(float(x)); }
inline CUDA_CALLABLE half trunc(half x) { return ::truncf(float(x)); }
inline CUDA_CALLABLE half floor(half x) { return ::floorf(float(x)); }
inline CUDA_CALLABLE half ceil(half x) { return ::ceilf(float(x)); }
inline CUDA_CALLABLE half frac(half x) { return float(x) - trunc(float(x)); }
|
|
845
|
-
|
|
846
|
-
// DECLARE_ADJOINTS(T) stamps out the reverse-mode adjoints of the
// transcendental and rounding functions above for a floating point type T.
// Gradients accumulate into the adj_* output parameters. Adjoints that take
// a 'ret' parameter reuse the forward result (e.g. d/dx exp = exp(x) = ret).
// Under FP_CHECK builds (DO_IF_FPCHECK) a non-finite gradient aborts with a
// diagnostic; otherwise (DO_IF_NO_FPCHECK) singular points (|x| == 1 for
// acos/asin, cos(x) == 0 for tan, origin for atan2) are skipped silently.
// The rounding adjoints (round/rint/trunc/floor/ceil/frac) are zero almost
// everywhere and intentionally propagate no gradient.
// NOTE: no comments may appear inside the macro body - a '//' would swallow
// the trailing line-continuation backslash.
#define DECLARE_ADJOINTS(T)\
inline CUDA_CALLABLE void adj_log(T a, T& adj_a, T adj_ret)\
{\
    adj_a += (T(1)/a)*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_log2(T a, T& adj_a, T adj_ret)\
{ \
    adj_a += (T(1)/a)*(T(1)/log(T(2)))*adj_ret; \
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log2(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    }) \
}\
inline CUDA_CALLABLE void adj_log10(T a, T& adj_a, T adj_ret)\
{\
    adj_a += (T(1)/a)*(T(1)/log(T(10)))*adj_ret; \
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log10(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_exp(T a, T ret, T& adj_a, T adj_ret) { adj_a += ret*adj_ret; }\
inline CUDA_CALLABLE void adj_pow(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
{ \
    adj_a += b*pow(a, b-T(1))*adj_ret;\
    adj_b += log(a)*ret*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_a) || !isfinite(adj_b))\
    {\
        printf("%s:%d - adj_pow(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(a), float(b), float(adj_a), float(adj_b), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_leaky_min(T a, T b, T r, T& adj_a, T& adj_b, T& adj_r, T adj_ret)\
{\
    if (a < b)\
        adj_a += adj_ret;\
    else\
    {\
        adj_a += r*adj_ret;\
        adj_b += adj_ret;\
    }\
}\
inline CUDA_CALLABLE void adj_leaky_max(T a, T b, T r, T& adj_a, T& adj_b, T& adj_r, T adj_ret)\
{\
    if (a > b)\
        adj_a += adj_ret;\
    else\
    {\
        adj_a += r*adj_ret;\
        adj_b += adj_ret;\
    }\
}\
inline CUDA_CALLABLE void adj_acos(T x, T& adj_x, T adj_ret)\
{\
    T d = sqrt(T(1)-x*x);\
    DO_IF_FPCHECK(adj_x -= (T(1)/d)*adj_ret;\
    if (!isfinite(d) || !isfinite(adj_x))\
    {\
        printf("%s:%d - adj_acos(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret)); \
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
        adj_x -= (T(1)/d)*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_asin(T x, T& adj_x, T adj_ret)\
{\
    T d = sqrt(T(1)-x*x);\
    DO_IF_FPCHECK(adj_x += (T(1)/d)*adj_ret;\
    if (!isfinite(d) || !isfinite(adj_x))\
    {\
        printf("%s:%d - adj_asin(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret)); \
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
        adj_x += (T(1)/d)*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_tan(T x, T& adj_x, T adj_ret)\
{\
    T cos_x = cos(x);\
    DO_IF_FPCHECK(adj_x += (T(1)/(cos_x*cos_x))*adj_ret;\
    if (!isfinite(adj_x) || cos_x == T(0))\
    {\
        printf("%s:%d - adj_tan(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (cos_x != T(0))\
        adj_x += (T(1)/(cos_x*cos_x))*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_atan(T x, T& adj_x, T adj_ret)\
{\
    adj_x += adj_ret /(x*x + T(1));\
}\
inline CUDA_CALLABLE void adj_atan2(T y, T x, T& adj_y, T& adj_x, T adj_ret)\
{\
    T d = x*x + y*y;\
    DO_IF_FPCHECK(adj_x -= y/d*adj_ret;\
    adj_y += x/d*adj_ret;\
    if (!isfinite(adj_x) || !isfinite(adj_y) || d == T(0))\
    {\
        printf("%s:%d - adj_atan2(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(y), float(x), float(adj_y), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
    {\
        adj_x -= (y/d)*adj_ret;\
        adj_y += (x/d)*adj_ret;\
    })\
}\
inline CUDA_CALLABLE void adj_sin(T x, T& adj_x, T adj_ret)\
{\
    adj_x += cos(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_cos(T x, T& adj_x, T adj_ret)\
{\
    adj_x -= sin(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_sinh(T x, T& adj_x, T adj_ret)\
{\
    adj_x += cosh(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_cosh(T x, T& adj_x, T adj_ret)\
{\
    adj_x += sinh(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_tanh(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += (T(1) - ret*ret)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_sqrt(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += T(0.5)*(T(1)/ret)*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_x))\
    {\
        printf("%s:%d - adj_sqrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_cbrt(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += (T(1)/T(3))*(T(1)/(ret*ret))*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_x))\
    {\
        printf("%s:%d - adj_cbrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_degrees(T x, T& adj_x, T adj_ret)\
{\
    adj_x += RAD_TO_DEG * adj_ret;\
}\
inline CUDA_CALLABLE void adj_radians(T x, T& adj_x, T adj_ret)\
{\
    adj_x += DEG_TO_RAD * adj_ret;\
}\
inline CUDA_CALLABLE void adj_round(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_rint(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_trunc(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_floor(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_ceil(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_frac(T x, T& adj_x, T adj_ret){ }

// Instantiate the adjoints for all supported float widths.
DECLARE_ADJOINTS(float16)
DECLARE_ADJOINTS(float32)
DECLARE_ADJOINTS(float64)
|
|
1017
|
-
|
|
1018
|
-
// select(cond, a, b): returns b when cond is truthy, a otherwise.
// NOTE: the ordering is intentional - a true condition selects the *second*
// value argument.
template <typename C, typename T>
CUDA_CALLABLE inline T select(const C& cond, const T& a, const T& b)
{
    // The double NOT operator !! casts to bool without compiler warnings.
    return (!!cond) ? b : a;
}

// Adjoint of select(): routes the incoming gradient to whichever value
// argument was selected; the condition receives no gradient.
template <typename C, typename T>
CUDA_CALLABLE inline void adj_select(const C& cond, const T& a, const T& b, C& adj_cond, T& adj_a, T& adj_b, const T& adj_ret)
{
    // The double NOT operator !! casts to bool without compiler warnings.
    if (!!cond)
        adj_b += adj_ret;
    else
        adj_a += adj_ret;
}
|
|
1034
|
-
|
|
1035
|
-
template <typename T>
|
|
1036
|
-
CUDA_CALLABLE inline T copy(const T& src)
|
|
1037
|
-
{
|
|
1038
|
-
return src;
|
|
1039
|
-
}
|
|
1040
|
-
|
|
1041
|
-
template <typename T>
|
|
1042
|
-
CUDA_CALLABLE inline void adj_copy(const T& src, T& adj_src, T& adj_dest)
|
|
1043
|
-
{
|
|
1044
|
-
adj_src
|
|
1045
|
-
adj_dest = T{};
|
|
1046
|
-
}
|
|
1047
|
-
|
|
1048
|
-
// assign(): in-place store of src into dest.
template <typename T>
CUDA_CALLABLE inline void assign(T& dest, const T& src)
{
    dest = src;
}
|
|
1053
|
-
|
|
1054
|
-
// Adjoint of assign(): transfers the destination's gradient to the source
// and clears the destination's gradient.
template <typename T>
CUDA_CALLABLE inline void adj_assign(T& dest, const T& src, T& adj_dest, T& adj_src)
{
    // this is generally a non-differentiable operation since it violates SSA,
    // except in read-modify-write statements which are reversible through backpropagation
    adj_src = adj_dest;
    adj_dest = T{};
}
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
// some helpful operator overloads (just for C++ use, these are not adjointed)
// All four forward to the add()/sub() overloads declared for each type.

template <typename T>
CUDA_CALLABLE inline T& operator += (T& a, const T& b) { a = add(a, b); return a; }

template <typename T>
CUDA_CALLABLE inline T& operator -= (T& a, const T& b) { a = sub(a, b); return a; }

template <typename T>
CUDA_CALLABLE inline T operator+(const T& a, const T& b) { return add(a, b); }

template <typename T>
CUDA_CALLABLE inline T operator-(const T& a, const T& b) { return sub(a, b); }
|
|
1077
|
-
|
|
1078
|
-
// pos(): unary plus, returns x unchanged; the adjoint passes the gradient
// straight through.
template <typename T>
CUDA_CALLABLE inline T pos(const T& x) { return x; }
template <typename T>
CUDA_CALLABLE inline void adj_pos(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(adj_ret); }

// neg(): unary negation implemented as subtraction from zero (T(0.0) - x);
// the adjoint accumulates the negated incoming gradient.
template <typename T>
CUDA_CALLABLE inline T neg(const T& x) { return T(0.0) - x; }
template <typename T>
CUDA_CALLABLE inline void adj_neg(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(-adj_ret); }
|
|
1089
|
-
|
|
1090
|
-
// unary boolean negation
template <typename T>
CUDA_CALLABLE inline bool unot(const T& b) { return !b; }
// not differentiable: the boolean result carries no gradient
template <typename T>
CUDA_CALLABLE inline void adj_unot(const T& b, T& adj_b, const bool& adj_ret) { }
|
|
1095
|
-
|
|
1096
|
-
const int LAUNCH_MAX_DIMS = 4; // should match types.py

// Describes the logical launch grid of a kernel; used by the tid() overloads
// below to decompose a linear thread index into multi-dimensional indices.
struct launch_bounds_t
{
    int shape[LAUNCH_MAX_DIMS]; // size of each dimension
    int ndim;                   // number of valid dimensions
    size_t size;                // total number of threads
};

// On non-CUDA (host) builds there is no hardware thread index; the current
// work-item index is stored in this file-local variable instead (read by
// grid_index() below).
#ifndef __CUDACC__
static size_t s_threadIdx;
#endif
|
|
1108
|
-
|
|
1109
|
-
// Returns the global linear index of the current work item: computed from
// the CUDA block/thread indices on device, read from s_threadIdx on host.
inline CUDA_CALLABLE size_t grid_index()
{
#ifdef __CUDACC__
    // Need to cast at least one of the variables being multiplied so that type promotion happens before the multiplication
    size_t grid_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
    return grid_index;
#else
    return s_threadIdx;
#endif
}
|
|
1119
|
-
|
|
1120
|
-
// 1-D thread id: truncates the 64-bit linear index to a signed int.
inline CUDA_CALLABLE int tid(size_t index)
{
    // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
    // Only do this in _DEBUG when called from device to avoid excessive register allocation
#if defined(_DEBUG) || !defined(__CUDA_ARCH__)
    if (index > 2147483647) { // 2147483647 == INT_MAX
        printf("Warp warning: tid() is returning an overflowed int\n");
    }
#endif
    return static_cast<int>(index);
}
|
|
1131
|
-
|
|
1132
|
-
// 2-D thread id: row-major decomposition of the linear index over shape[1].
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];

    // convert to work item
    i = index/n;
    j = index%n;
}

// 3-D thread id: row-major decomposition over shape[1] and shape[2].
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];
    const size_t o = launch_bounds.shape[2];

    // convert to work item
    i = index/(n*o);
    j = index%(n*o)/o;
    k = index%o;
}

// 4-D thread id: row-major decomposition over shape[1], shape[2], shape[3].
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];
    const size_t o = launch_bounds.shape[2];
    const size_t p = launch_bounds.shape[3];

    // convert to work item
    i = index/(n*o*p);
    j = index%(n*o*p)/(o*p);
    k = index%(o*p)/p;
    l = index%p;
}
|
|
1164
|
-
|
|
1165
|
-
// atomic_add(): fetch-and-add, returning the value previously stored at buf.
template<typename T>
inline CUDA_CALLABLE T atomic_add(T* buf, T value)
{
#if !defined(__CUDA_ARCH__)
    // host path: a plain read-modify-write (no atomicity is applied here)
    T old = buf[0];
    buf[0] += value;
    return old;
#else
    // device path: hardware atomic
    return atomicAdd(buf, value);
#endif
}
|
|
1176
|
-
|
|
1177
|
-
// float16 specialization of atomic_add(). Three paths:
//   - host: plain (non-atomic) read-modify-write, as in the generic template;
//   - CUDA via Clang: reinterpret the bits as __half and use atomicAdd;
//   - CUDA via NVRTC: hand-written PTX atom.add.noftz.f16.
// NOTE: on the NVRTC path with __CUDA_ARCH__ < 700 the asm is compiled out,
// so no add is performed and the function returns 0 (r's initial value).
template<>
inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value)
{
#if !defined(__CUDA_ARCH__)
    float16 old = buf[0];
    buf[0] += value;
    return old;
#elif defined(__clang__) // CUDA compiled by Clang
    __half r = atomicAdd(reinterpret_cast<__half*>(buf), *reinterpret_cast<__half*>(&value));
    return *reinterpret_cast<float16*>(&r);
#else // CUDA compiled by NVRTC
    //return atomicAdd(buf, value);

    /* Define __PTR for atomicAdd prototypes below, undef after done */
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __PTR "l"
#else
#define __PTR "r"
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/

    half r = 0.0;

#if __CUDA_ARCH__ >= 700

    // atom.add.noftz.f16: f16 atomic add without flush-to-zero; operates on
    // the raw bit representation (.u) of the half values
    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
                  : "=h"(r.u)
                  : __PTR(buf), "h"(value.u)
                  : "memory");
#endif

    return r;

#undef __PTR

#endif // CUDA compiled by NVRTC

}
|
|
1214
|
-
|
|
1215
|
-
// emulate atomic float max
|
|
1216
|
-
inline CUDA_CALLABLE float atomic_max(float* address, float val)
|
|
1217
|
-
{
|
|
1218
|
-
#if defined(__CUDA_ARCH__)
|
|
1219
|
-
int *address_as_int = (int*)address;
|
|
1220
|
-
int old = *address_as_int, assumed;
|
|
1221
|
-
|
|
1222
|
-
while (val > __int_as_float(old))
|
|
1223
|
-
{
|
|
1224
|
-
assumed = old;
|
|
1225
|
-
old = atomicCAS(address_as_int, assumed,
|
|
1226
|
-
__float_as_int(val));
|
|
1227
|
-
}
|
|
1228
|
-
|
|
1229
|
-
return __int_as_float(old);
|
|
1230
|
-
|
|
1231
|
-
#else
|
|
1232
|
-
float old = *address;
|
|
1233
|
-
*address = max(old, val);
|
|
1234
|
-
return old;
|
|
1235
|
-
#endif
|
|
1236
|
-
}
|
|
1237
|
-
|
|
1238
|
-
// emulate atomic float min/max with atomicCAS()
|
|
1239
|
-
inline CUDA_CALLABLE float atomic_min(float* address, float val)
|
|
1240
|
-
{
|
|
1241
|
-
#if defined(__CUDA_ARCH__)
|
|
1242
|
-
int *address_as_int = (int*)address;
|
|
1243
|
-
int old = *address_as_int, assumed;
|
|
1244
|
-
|
|
1245
|
-
while (val < __int_as_float(old))
|
|
1246
|
-
{
|
|
1247
|
-
assumed = old;
|
|
1248
|
-
old = atomicCAS(address_as_int, assumed,
|
|
1249
|
-
__float_as_int(val));
|
|
1250
|
-
}
|
|
1251
|
-
|
|
1252
|
-
return __int_as_float(old);
|
|
1253
|
-
|
|
1254
|
-
#else
|
|
1255
|
-
float old = *address;
|
|
1256
|
-
*address = min(old, val);
|
|
1257
|
-
return old;
|
|
1258
|
-
#endif
|
|
1259
|
-
}
|
|
1260
|
-
|
|
1261
|
-
inline CUDA_CALLABLE int atomic_max(int* address, int val)
|
|
1262
|
-
{
|
|
1263
|
-
#if defined(__CUDA_ARCH__)
|
|
1264
|
-
return atomicMax(address, val);
|
|
1265
|
-
|
|
1266
|
-
#else
|
|
1267
|
-
int old = *address;
|
|
1268
|
-
*address = max(old, val);
|
|
1269
|
-
return old;
|
|
1270
|
-
#endif
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
// atomic int min
|
|
1274
|
-
inline CUDA_CALLABLE int atomic_min(int* address, int val)
|
|
1275
|
-
{
|
|
1276
|
-
#if defined(__CUDA_ARCH__)
|
|
1277
|
-
return atomicMin(address, val);
|
|
1278
|
-
|
|
1279
|
-
#else
|
|
1280
|
-
int old = *address;
|
|
1281
|
-
*address = min(old, val);
|
|
1282
|
-
return old;
|
|
1283
|
-
#endif
|
|
1284
|
-
}
|
|
1285
|
-
|
|
1286
|
-
// default behavior for adjoint of atomic min/max operation that accumulates gradients for all elements matching the min/max value
|
|
1287
|
-
template <typename T>
|
|
1288
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(T *addr, T *adj_addr, const T &value, T &adj_value)
|
|
1289
|
-
{
|
|
1290
|
-
if (value == *addr)
|
|
1291
|
-
adj_value += *adj_addr;
|
|
1292
|
-
}
|
|
1293
|
-
|
|
1294
|
-
// for integral types we do not accumulate gradients
|
|
1295
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(int8* buf, int8* adj_buf, const int8 &value, int8 &adj_value) { }
|
|
1296
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(uint8* buf, uint8* adj_buf, const uint8 &value, uint8 &adj_value) { }
|
|
1297
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(int16* buf, int16* adj_buf, const int16 &value, int16 &adj_value) { }
|
|
1298
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(uint16* buf, uint16* adj_buf, const uint16 &value, uint16 &adj_value) { }
|
|
1299
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(int32* buf, int32* adj_buf, const int32 &value, int32 &adj_value) { }
|
|
1300
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(uint32* buf, uint32* adj_buf, const uint32 &value, uint32 &adj_value) { }
|
|
1301
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(int64* buf, int64* adj_buf, const int64 &value, int64 &adj_value) { }
|
|
1302
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(uint64* buf, uint64* adj_buf, const uint64 &value, uint64 &adj_value) { }
|
|
1303
|
-
CUDA_CALLABLE inline void adj_atomic_minmax(bool* buf, bool* adj_buf, const bool &value, bool &adj_value) { }
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
} // namespace wp
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
// bool and printf are defined outside of the wp namespace in crt.h, hence
|
|
1310
|
-
// their adjoint counterparts are also defined in the global namespace.
|
|
1311
|
-
template <typename T>
|
|
1312
|
-
CUDA_CALLABLE inline void adj_bool(T, T&, bool) {}
|
|
1313
|
-
inline CUDA_CALLABLE void adj_printf(const char* fmt, ...) {}
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
#include "vec.h"
|
|
1317
|
-
#include "mat.h"
|
|
1318
|
-
#include "quat.h"
|
|
1319
|
-
#include "spatial.h"
|
|
1320
|
-
#include "intersect.h"
|
|
1321
|
-
#include "intersect_adj.h"
|
|
1322
|
-
|
|
1323
|
-
//--------------
|
|
1324
|
-
namespace wp
|
|
1325
|
-
{
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
// dot for scalar types just to make some templates compile for scalar/vector
|
|
1329
|
-
inline CUDA_CALLABLE float dot(float a, float b) { return mul(a, b); }
|
|
1330
|
-
inline CUDA_CALLABLE void adj_dot(float a, float b, float& adj_a, float& adj_b, float adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }
|
|
1331
|
-
inline CUDA_CALLABLE float tensordot(float a, float b) { return mul(a, b); }
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
#define DECLARE_INTERP_FUNCS(T) \
|
|
1335
|
-
CUDA_CALLABLE inline T smoothstep(T edge0, T edge1, T x)\
|
|
1336
|
-
{\
|
|
1337
|
-
x = clamp((x - edge0) / (edge1 - edge0), T(0), T(1));\
|
|
1338
|
-
return x * x * (T(3) - T(2) * x);\
|
|
1339
|
-
}\
|
|
1340
|
-
CUDA_CALLABLE inline void adj_smoothstep(T edge0, T edge1, T x, T& adj_edge0, T& adj_edge1, T& adj_x, T adj_ret)\
|
|
1341
|
-
{\
|
|
1342
|
-
T ab = edge0 - edge1;\
|
|
1343
|
-
T ax = edge0 - x;\
|
|
1344
|
-
T bx = edge1 - x;\
|
|
1345
|
-
T xb = x - edge1;\
|
|
1346
|
-
\
|
|
1347
|
-
if (bx / ab >= T(0) || ax / ab <= T(0))\
|
|
1348
|
-
{\
|
|
1349
|
-
return;\
|
|
1350
|
-
}\
|
|
1351
|
-
\
|
|
1352
|
-
T ab3 = ab * ab * ab;\
|
|
1353
|
-
T ab4 = ab3 * ab;\
|
|
1354
|
-
adj_edge0 += adj_ret * ((T(6) * ax * bx * bx) / ab4);\
|
|
1355
|
-
adj_edge1 += adj_ret * ((T(6) * ax * ax * xb) / ab4);\
|
|
1356
|
-
adj_x += adj_ret * ((T(6) * ax * bx ) / ab3);\
|
|
1357
|
-
}\
|
|
1358
|
-
CUDA_CALLABLE inline T lerp(const T& a, const T& b, T t)\
|
|
1359
|
-
{\
|
|
1360
|
-
return a*(T(1)-t) + b*t;\
|
|
1361
|
-
}\
|
|
1362
|
-
CUDA_CALLABLE inline void adj_lerp(const T& a, const T& b, T t, T& adj_a, T& adj_b, T& adj_t, const T& adj_ret)\
|
|
1363
|
-
{\
|
|
1364
|
-
adj_a += adj_ret*(T(1)-t);\
|
|
1365
|
-
adj_b += adj_ret*t;\
|
|
1366
|
-
adj_t += b*adj_ret - a*adj_ret;\
|
|
1367
|
-
}
|
|
1368
|
-
|
|
1369
|
-
DECLARE_INTERP_FUNCS(float16)
|
|
1370
|
-
DECLARE_INTERP_FUNCS(float32)
|
|
1371
|
-
DECLARE_INTERP_FUNCS(float64)
|
|
1372
|
-
|
|
1373
|
-
inline CUDA_CALLABLE void print(const str s)
|
|
1374
|
-
{
|
|
1375
|
-
printf("%s\n", s);
|
|
1376
|
-
}
|
|
1377
|
-
|
|
1378
|
-
inline CUDA_CALLABLE void print(int i)
|
|
1379
|
-
{
|
|
1380
|
-
printf("%d\n", i);
|
|
1381
|
-
}
|
|
1382
|
-
|
|
1383
|
-
inline CUDA_CALLABLE void print(short i)
|
|
1384
|
-
{
|
|
1385
|
-
printf("%hd\n", i);
|
|
1386
|
-
}
|
|
1387
|
-
|
|
1388
|
-
inline CUDA_CALLABLE void print(long i)
|
|
1389
|
-
{
|
|
1390
|
-
printf("%ld\n", i);
|
|
1391
|
-
}
|
|
1392
|
-
|
|
1393
|
-
inline CUDA_CALLABLE void print(long long i)
|
|
1394
|
-
{
|
|
1395
|
-
printf("%lld\n", i);
|
|
1396
|
-
}
|
|
1397
|
-
|
|
1398
|
-
inline CUDA_CALLABLE void print(unsigned i)
|
|
1399
|
-
{
|
|
1400
|
-
printf("%u\n", i);
|
|
1401
|
-
}
|
|
1402
|
-
|
|
1403
|
-
inline CUDA_CALLABLE void print(unsigned short i)
|
|
1404
|
-
{
|
|
1405
|
-
printf("%hu\n", i);
|
|
1406
|
-
}
|
|
1407
|
-
|
|
1408
|
-
inline CUDA_CALLABLE void print(unsigned long i)
|
|
1409
|
-
{
|
|
1410
|
-
printf("%lu\n", i);
|
|
1411
|
-
}
|
|
1412
|
-
|
|
1413
|
-
inline CUDA_CALLABLE void print(unsigned long long i)
|
|
1414
|
-
{
|
|
1415
|
-
printf("%llu\n", i);
|
|
1416
|
-
}
|
|
1417
|
-
|
|
1418
|
-
template<unsigned Length, typename Type>
|
|
1419
|
-
inline CUDA_CALLABLE void print(vec_t<Length, Type> v)
|
|
1420
|
-
{
|
|
1421
|
-
for( unsigned i=0; i < Length; ++i )
|
|
1422
|
-
{
|
|
1423
|
-
printf("%g ", float(v[i]));
|
|
1424
|
-
}
|
|
1425
|
-
printf("\n");
|
|
1426
|
-
}
|
|
1427
|
-
|
|
1428
|
-
template<typename Type>
|
|
1429
|
-
inline CUDA_CALLABLE void print(quat_t<Type> i)
|
|
1430
|
-
{
|
|
1431
|
-
printf("%g %g %g %g\n", float(i.x), float(i.y), float(i.z), float(i.w));
|
|
1432
|
-
}
|
|
1433
|
-
|
|
1434
|
-
template<unsigned Rows,unsigned Cols,typename Type>
|
|
1435
|
-
inline CUDA_CALLABLE void print(const mat_t<Rows,Cols,Type> &m)
|
|
1436
|
-
{
|
|
1437
|
-
for( unsigned i=0; i< Rows; ++i )
|
|
1438
|
-
{
|
|
1439
|
-
for( unsigned j=0; j< Cols; ++j )
|
|
1440
|
-
{
|
|
1441
|
-
printf("%g ",float(m.data[i][j]));
|
|
1442
|
-
}
|
|
1443
|
-
printf("\n");
|
|
1444
|
-
}
|
|
1445
|
-
}
|
|
1446
|
-
|
|
1447
|
-
template<typename Type>
|
|
1448
|
-
inline CUDA_CALLABLE void print(transform_t<Type> t)
|
|
1449
|
-
{
|
|
1450
|
-
printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w));
|
|
1451
|
-
}
|
|
1452
|
-
|
|
1453
|
-
inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); }
|
|
1454
|
-
inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); }
|
|
1455
|
-
inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); }
|
|
1456
|
-
inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); }
|
|
1457
|
-
inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); }
|
|
1458
|
-
inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); }
|
|
1459
|
-
inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); }
|
|
1460
|
-
inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); }
|
|
1461
|
-
inline CUDA_CALLABLE void adj_print(unsigned long long f, unsigned long long adj_f) { printf("%llu adj: %llu\n", f, adj_f); }
|
|
1462
|
-
inline CUDA_CALLABLE void adj_print(half h, half adj_h) { printf("%g adj: %g\n", half_to_float(h), half_to_float(adj_h)); }
|
|
1463
|
-
inline CUDA_CALLABLE void adj_print(double f, double adj_f) { printf("%g adj: %g\n", f, adj_f); }
|
|
1464
|
-
|
|
1465
|
-
template<unsigned Length, typename Type>
|
|
1466
|
-
inline CUDA_CALLABLE void adj_print(vec_t<Length, Type> v, vec_t<Length, Type>& adj_v) { printf("%g %g adj: %g %g \n", v[0], v[1], adj_v[0], adj_v[1]); }
|
|
1467
|
-
|
|
1468
|
-
template<unsigned Rows, unsigned Cols, typename Type>
|
|
1469
|
-
inline CUDA_CALLABLE void adj_print(mat_t<Rows, Cols, Type> m, mat_t<Rows, Cols, Type>& adj_m) { }
|
|
1470
|
-
|
|
1471
|
-
template<typename Type>
|
|
1472
|
-
inline CUDA_CALLABLE void adj_print(quat_t<Type> q, quat_t<Type>& adj_q) { printf("%g %g %g %g adj: %g %g %g %g\n", q.x, q.y, q.z, q.w, adj_q.x, adj_q.y, adj_q.z, adj_q.w); }
|
|
1473
|
-
|
|
1474
|
-
template<typename Type>
|
|
1475
|
-
inline CUDA_CALLABLE void adj_print(transform_t<Type> t, transform_t<Type>& adj_t) {}
|
|
1476
|
-
|
|
1477
|
-
inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {}
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
template <typename T>
|
|
1481
|
-
inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected)
|
|
1482
|
-
{
|
|
1483
|
-
if (!(actual == expected))
|
|
1484
|
-
{
|
|
1485
|
-
printf("Error, expect_eq() failed:\n");
|
|
1486
|
-
printf("\t Expected: "); print(expected);
|
|
1487
|
-
printf("\t Actual: "); print(actual);
|
|
1488
|
-
}
|
|
1489
|
-
}
|
|
1490
|
-
|
|
1491
|
-
template <typename T>
|
|
1492
|
-
inline CUDA_CALLABLE void adj_expect_eq(const T& a, const T& b, T& adj_a, T& adj_b)
|
|
1493
|
-
{
|
|
1494
|
-
// nop
|
|
1495
|
-
}
|
|
1496
|
-
|
|
1497
|
-
template <typename T>
|
|
1498
|
-
inline CUDA_CALLABLE void expect_neq(const T& actual, const T& expected)
|
|
1499
|
-
{
|
|
1500
|
-
if (actual == expected)
|
|
1501
|
-
{
|
|
1502
|
-
printf("Error, expect_neq() failed:\n");
|
|
1503
|
-
printf("\t Expected: "); print(expected);
|
|
1504
|
-
printf("\t Actual: "); print(actual);
|
|
1505
|
-
}
|
|
1506
|
-
}
|
|
1507
|
-
|
|
1508
|
-
template <typename T>
|
|
1509
|
-
inline CUDA_CALLABLE void adj_expect_neq(const T& a, const T& b, T& adj_a, T& adj_b)
|
|
1510
|
-
{
|
|
1511
|
-
// nop
|
|
1512
|
-
}
|
|
1513
|
-
|
|
1514
|
-
template <typename T>
|
|
1515
|
-
inline CUDA_CALLABLE void expect_near(const T& actual, const T& expected, const T& tolerance)
|
|
1516
|
-
{
|
|
1517
|
-
if (abs(actual - expected) > tolerance)
|
|
1518
|
-
{
|
|
1519
|
-
printf("Error, expect_near() failed with tolerance "); print(tolerance);
|
|
1520
|
-
printf("\t Expected: "); print(expected);
|
|
1521
|
-
printf("\t Actual: "); print(actual);
|
|
1522
|
-
}
|
|
1523
|
-
}
|
|
1524
|
-
|
|
1525
|
-
inline CUDA_CALLABLE void expect_near(const vec3& actual, const vec3& expected, const float& tolerance)
|
|
1526
|
-
{
|
|
1527
|
-
const float diff = max(max(abs(actual[0] - expected[0]), abs(actual[1] - expected[1])), abs(actual[2] - expected[2]));
|
|
1528
|
-
if (diff > tolerance)
|
|
1529
|
-
{
|
|
1530
|
-
printf("Error, expect_near() failed with tolerance "); print(tolerance);
|
|
1531
|
-
printf("\t Expected: "); print(expected);
|
|
1532
|
-
printf("\t Actual: "); print(actual);
|
|
1533
|
-
}
|
|
1534
|
-
}
|
|
1535
|
-
|
|
1536
|
-
template <typename T>
|
|
1537
|
-
inline CUDA_CALLABLE void adj_expect_near(const T& actual, const T& expected, const T& tolerance, T& adj_actual, T& adj_expected, T& adj_tolerance)
|
|
1538
|
-
{
|
|
1539
|
-
// nop
|
|
1540
|
-
}
|
|
1541
|
-
|
|
1542
|
-
inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expected, float tolerance, vec3& adj_actual, vec3& adj_expected, float adj_tolerance)
|
|
1543
|
-
{
|
|
1544
|
-
// nop
|
|
1545
|
-
}
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
} // namespace wp
|
|
1549
|
-
|
|
1550
|
-
// include array.h so we have the print, isfinite functions for the inner array types defined
|
|
1551
|
-
#include "array.h"
|
|
1552
|
-
#include "mesh.h"
|
|
1553
|
-
#include "bvh.h"
|
|
1554
|
-
#include "svd.h"
|
|
1555
|
-
#include "hashgrid.h"
|
|
1556
|
-
#include "volume.h"
|
|
1557
|
-
#include "range.h"
|
|
1558
|
-
#include "rand.h"
|
|
1559
|
-
#include "noise.h"
|
|
1560
|
-
#include "matnn.h"
|
|
1
|
+
/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
* and proprietary rights in and to this software, related documentation
|
|
4
|
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
* distribution of this software and related documentation without an express
|
|
6
|
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#pragma once
|
|
10
|
+
|
|
11
|
+
// All built-in types and functions. To be compatible with runtime NVRTC compilation
|
|
12
|
+
// this header must be independently compilable (i.e.: without external SDK headers)
|
|
13
|
+
// to achieve this we redefine a subset of CRT functions (printf, pow, sin, cos, etc)
|
|
14
|
+
|
|
15
|
+
#include "crt.h"
|
|
16
|
+
|
|
17
|
+
#ifdef _WIN32
|
|
18
|
+
#define __restrict__ __restrict
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#if !defined(__CUDACC__)
|
|
22
|
+
#define CUDA_CALLABLE
|
|
23
|
+
#define CUDA_CALLABLE_DEVICE
|
|
24
|
+
#else
|
|
25
|
+
#define CUDA_CALLABLE __host__ __device__
|
|
26
|
+
#define CUDA_CALLABLE_DEVICE __device__
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
#ifdef WP_VERIFY_FP
|
|
30
|
+
#define FP_CHECK 1
|
|
31
|
+
#define DO_IF_FPCHECK(X) {X}
|
|
32
|
+
#define DO_IF_NO_FPCHECK(X)
|
|
33
|
+
#else
|
|
34
|
+
#define FP_CHECK 0
|
|
35
|
+
#define DO_IF_FPCHECK(X)
|
|
36
|
+
#define DO_IF_NO_FPCHECK(X) {X}
|
|
37
|
+
#endif
|
|
38
|
+
|
|
39
|
+
#define RAD_TO_DEG 57.29577951308232087679
|
|
40
|
+
#define DEG_TO_RAD 0.01745329251994329577
|
|
41
|
+
|
|
42
|
+
#if defined(__CUDACC__) && !defined(_MSC_VER)
|
|
43
|
+
__device__ void __debugbreak() {}
|
|
44
|
+
#endif
|
|
45
|
+
|
|
46
|
+
namespace wp
|
|
47
|
+
{
|
|
48
|
+
|
|
49
|
+
// numeric types (used from generated kernels)
|
|
50
|
+
typedef float float32;
|
|
51
|
+
typedef double float64;
|
|
52
|
+
|
|
53
|
+
typedef int8_t int8;
|
|
54
|
+
typedef uint8_t uint8;
|
|
55
|
+
|
|
56
|
+
typedef int16_t int16;
|
|
57
|
+
typedef uint16_t uint16;
|
|
58
|
+
|
|
59
|
+
typedef int32_t int32;
|
|
60
|
+
typedef uint32_t uint32;
|
|
61
|
+
|
|
62
|
+
typedef int64_t int64;
|
|
63
|
+
typedef uint64_t uint64;
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
// matches Python string type for constant strings
|
|
67
|
+
typedef const char* str;
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
struct half;
|
|
72
|
+
|
|
73
|
+
CUDA_CALLABLE half float_to_half(float x);
|
|
74
|
+
CUDA_CALLABLE float half_to_float(half x);
|
|
75
|
+
|
|
76
|
+
struct half
|
|
77
|
+
{
|
|
78
|
+
CUDA_CALLABLE inline half() : u(0) {}
|
|
79
|
+
|
|
80
|
+
CUDA_CALLABLE inline half(float f)
|
|
81
|
+
{
|
|
82
|
+
*this = float_to_half(f);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
unsigned short u;
|
|
86
|
+
|
|
87
|
+
CUDA_CALLABLE inline bool operator==(const half& h) const { return u == h.u; }
|
|
88
|
+
CUDA_CALLABLE inline bool operator!=(const half& h) const { return u != h.u; }
|
|
89
|
+
CUDA_CALLABLE inline bool operator>(const half& h) const { return half_to_float(*this) > half_to_float(h); }
|
|
90
|
+
CUDA_CALLABLE inline bool operator>=(const half& h) const { return half_to_float(*this) >= half_to_float(h); }
|
|
91
|
+
CUDA_CALLABLE inline bool operator<(const half& h) const { return half_to_float(*this) < half_to_float(h); }
|
|
92
|
+
CUDA_CALLABLE inline bool operator<=(const half& h) const { return half_to_float(*this) <= half_to_float(h); }
|
|
93
|
+
|
|
94
|
+
CUDA_CALLABLE inline bool operator!() const
|
|
95
|
+
{
|
|
96
|
+
return float32(*this) == 0;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
CUDA_CALLABLE inline half operator*=(const half& h)
|
|
100
|
+
{
|
|
101
|
+
half prod = half(float32(*this) * float32(h));
|
|
102
|
+
this->u = prod.u;
|
|
103
|
+
return *this;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
CUDA_CALLABLE inline half operator/=(const half& h)
|
|
107
|
+
{
|
|
108
|
+
half quot = half(float32(*this) / float32(h));
|
|
109
|
+
this->u = quot.u;
|
|
110
|
+
return *this;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
CUDA_CALLABLE inline half operator+=(const half& h)
|
|
114
|
+
{
|
|
115
|
+
half sum = half(float32(*this) + float32(h));
|
|
116
|
+
this->u = sum.u;
|
|
117
|
+
return *this;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
CUDA_CALLABLE inline half operator-=(const half& h)
|
|
121
|
+
{
|
|
122
|
+
half diff = half(float32(*this) - float32(h));
|
|
123
|
+
this->u = diff.u;
|
|
124
|
+
return *this;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
CUDA_CALLABLE inline operator float32() const { return float32(half_to_float(*this)); }
|
|
128
|
+
CUDA_CALLABLE inline operator float64() const { return float64(half_to_float(*this)); }
|
|
129
|
+
CUDA_CALLABLE inline operator int8() const { return int8(half_to_float(*this)); }
|
|
130
|
+
CUDA_CALLABLE inline operator uint8() const { return uint8(half_to_float(*this)); }
|
|
131
|
+
CUDA_CALLABLE inline operator int16() const { return int16(half_to_float(*this)); }
|
|
132
|
+
CUDA_CALLABLE inline operator uint16() const { return uint16(half_to_float(*this)); }
|
|
133
|
+
CUDA_CALLABLE inline operator int32() const { return int32(half_to_float(*this)); }
|
|
134
|
+
CUDA_CALLABLE inline operator uint32() const { return uint32(half_to_float(*this)); }
|
|
135
|
+
CUDA_CALLABLE inline operator int64() const { return int64(half_to_float(*this)); }
|
|
136
|
+
CUDA_CALLABLE inline operator uint64() const { return uint64(half_to_float(*this)); }
|
|
137
|
+
};
|
|
138
|
+
|
|
139
|
+
static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
|
|
140
|
+
|
|
141
|
+
typedef half float16;
|
|
142
|
+
|
|
143
|
+
#if defined(__CUDA_ARCH__)
|
|
144
|
+
|
|
145
|
+
CUDA_CALLABLE inline half float_to_half(float x)
|
|
146
|
+
{
|
|
147
|
+
half h;
|
|
148
|
+
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(h.u) : "f"(x));
|
|
149
|
+
return h;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
CUDA_CALLABLE inline float half_to_float(half x)
|
|
153
|
+
{
|
|
154
|
+
float val;
|
|
155
|
+
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(x.u));
|
|
156
|
+
return val;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
#elif defined(__clang__)
|
|
160
|
+
|
|
161
|
+
// _Float16 is Clang's native half-precision floating-point type
|
|
162
|
+
inline half float_to_half(float x)
|
|
163
|
+
{
|
|
164
|
+
|
|
165
|
+
_Float16 f16 = static_cast<_Float16>(x);
|
|
166
|
+
return *reinterpret_cast<half*>(&f16);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
inline float half_to_float(half h)
|
|
170
|
+
{
|
|
171
|
+
_Float16 f16 = *reinterpret_cast<_Float16*>(&h);
|
|
172
|
+
return static_cast<float>(f16);
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
#else // Native C++ for Warp builtins outside of kernels
|
|
176
|
+
|
|
177
|
+
extern "C" WP_API uint16_t float_to_half_bits(float x);
|
|
178
|
+
extern "C" WP_API float half_bits_to_float(uint16_t u);
|
|
179
|
+
|
|
180
|
+
inline half float_to_half(float x)
|
|
181
|
+
{
|
|
182
|
+
half h;
|
|
183
|
+
h.u = float_to_half_bits(x);
|
|
184
|
+
return h;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
inline float half_to_float(half h)
|
|
188
|
+
{
|
|
189
|
+
return half_bits_to_float(h.u);
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
#endif
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
// BAD operator implementations for fp16 arithmetic...
|
|
196
|
+
|
|
197
|
+
// negation:
|
|
198
|
+
inline CUDA_CALLABLE half operator - (half a)
|
|
199
|
+
{
|
|
200
|
+
return float_to_half( -half_to_float(a) );
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
inline CUDA_CALLABLE half operator + (half a,half b)
|
|
204
|
+
{
|
|
205
|
+
return float_to_half( half_to_float(a) + half_to_float(b) );
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
inline CUDA_CALLABLE half operator - (half a,half b)
|
|
209
|
+
{
|
|
210
|
+
return float_to_half( half_to_float(a) - half_to_float(b) );
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
inline CUDA_CALLABLE half operator * (half a,half b)
|
|
214
|
+
{
|
|
215
|
+
return float_to_half( half_to_float(a) * half_to_float(b) );
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
inline CUDA_CALLABLE half operator * (half a,double b)
|
|
219
|
+
{
|
|
220
|
+
return float_to_half( half_to_float(a) * b );
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
inline CUDA_CALLABLE half operator * (double a,half b)
|
|
224
|
+
{
|
|
225
|
+
return float_to_half( a * half_to_float(b) );
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
inline CUDA_CALLABLE half operator / (half a,half b)
|
|
229
|
+
{
|
|
230
|
+
return float_to_half( half_to_float(a) / half_to_float(b) );
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
template <typename T>
|
|
238
|
+
CUDA_CALLABLE float cast_float(T x) { return (float)(x); }
|
|
239
|
+
|
|
240
|
+
template <typename T>
|
|
241
|
+
CUDA_CALLABLE int cast_int(T x) { return (int)(x); }
|
|
242
|
+
|
|
243
|
+
template <typename T>
|
|
244
|
+
CUDA_CALLABLE void adj_cast_float(T x, T& adj_x, float adj_ret) { adj_x += T(adj_ret); }
|
|
245
|
+
|
|
246
|
+
template <typename T>
|
|
247
|
+
CUDA_CALLABLE void adj_cast_int(T x, T& adj_x, int adj_ret) { adj_x += adj_ret; }
|
|
248
|
+
|
|
249
|
+
template <typename T>
|
|
250
|
+
CUDA_CALLABLE inline void adj_int8(T, T&, int8) {}
|
|
251
|
+
template <typename T>
|
|
252
|
+
CUDA_CALLABLE inline void adj_uint8(T, T&, uint8) {}
|
|
253
|
+
template <typename T>
|
|
254
|
+
CUDA_CALLABLE inline void adj_int16(T, T&, int16) {}
|
|
255
|
+
template <typename T>
|
|
256
|
+
CUDA_CALLABLE inline void adj_uint16(T, T&, uint16) {}
|
|
257
|
+
template <typename T>
|
|
258
|
+
CUDA_CALLABLE inline void adj_int32(T, T&, int32) {}
|
|
259
|
+
template <typename T>
|
|
260
|
+
CUDA_CALLABLE inline void adj_uint32(T, T&, uint32) {}
|
|
261
|
+
template <typename T>
|
|
262
|
+
CUDA_CALLABLE inline void adj_int64(T, T&, int64) {}
|
|
263
|
+
template <typename T>
|
|
264
|
+
CUDA_CALLABLE inline void adj_uint64(T, T&, uint64) {}
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
template <typename T>
|
|
268
|
+
CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float16 adj_ret) { adj_x += T(adj_ret); }
|
|
269
|
+
template <typename T>
|
|
270
|
+
CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret) { adj_x += T(adj_ret); }
|
|
271
|
+
template <typename T>
|
|
272
|
+
CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret) { adj_x += T(adj_ret); }
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
#define kEps 0.0f
|
|
276
|
+
|
|
277
|
+
// basic ops for integer types
|
|
278
|
+
#define DECLARE_INT_OPS(T) \
|
|
279
|
+
inline CUDA_CALLABLE T mul(T a, T b) { return a*b; } \
|
|
280
|
+
inline CUDA_CALLABLE T div(T a, T b) { return a/b; } \
|
|
281
|
+
inline CUDA_CALLABLE T add(T a, T b) { return a+b; } \
|
|
282
|
+
inline CUDA_CALLABLE T sub(T a, T b) { return a-b; } \
|
|
283
|
+
inline CUDA_CALLABLE T mod(T a, T b) { return a%b; } \
|
|
284
|
+
inline CUDA_CALLABLE T min(T a, T b) { return a<b?a:b; } \
|
|
285
|
+
inline CUDA_CALLABLE T max(T a, T b) { return a>b?a:b; } \
|
|
286
|
+
inline CUDA_CALLABLE T clamp(T x, T a, T b) { return min(max(a, x), b); } \
|
|
287
|
+
inline CUDA_CALLABLE T floordiv(T a, T b) { return a/b; } \
|
|
288
|
+
inline CUDA_CALLABLE T nonzero(T x) { return x == T(0) ? T(0) : T(1); } \
|
|
289
|
+
inline CUDA_CALLABLE T sqrt(T x) { return 0; } \
|
|
290
|
+
inline CUDA_CALLABLE T bit_and(T a, T b) { return a&b; } \
|
|
291
|
+
inline CUDA_CALLABLE T bit_or(T a, T b) { return a|b; } \
|
|
292
|
+
inline CUDA_CALLABLE T bit_xor(T a, T b) { return a^b; } \
|
|
293
|
+
inline CUDA_CALLABLE T lshift(T a, T b) { return a<<b; } \
|
|
294
|
+
inline CUDA_CALLABLE T rshift(T a, T b) { return a>>b; } \
|
|
295
|
+
inline CUDA_CALLABLE T invert(T x) { return ~x; } \
|
|
296
|
+
inline CUDA_CALLABLE bool isfinite(T x) { return true; } \
|
|
297
|
+
inline CUDA_CALLABLE void adj_mul(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
298
|
+
inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
299
|
+
inline CUDA_CALLABLE void adj_add(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
300
|
+
inline CUDA_CALLABLE void adj_sub(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
301
|
+
inline CUDA_CALLABLE void adj_mod(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
302
|
+
inline CUDA_CALLABLE void adj_min(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
303
|
+
inline CUDA_CALLABLE void adj_max(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
304
|
+
inline CUDA_CALLABLE void adj_abs(T x, T adj_x, T& adj_ret) { } \
|
|
305
|
+
inline CUDA_CALLABLE void adj_sign(T x, T adj_x, T& adj_ret) { } \
|
|
306
|
+
inline CUDA_CALLABLE void adj_clamp(T x, T a, T b, T& adj_x, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
307
|
+
inline CUDA_CALLABLE void adj_floordiv(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
308
|
+
inline CUDA_CALLABLE void adj_step(T x, T& adj_x, T adj_ret) { } \
|
|
309
|
+
inline CUDA_CALLABLE void adj_nonzero(T x, T& adj_x, T adj_ret) { } \
|
|
310
|
+
inline CUDA_CALLABLE void adj_sqrt(T x, T adj_x, T& adj_ret) { } \
|
|
311
|
+
inline CUDA_CALLABLE void adj_bit_and(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
312
|
+
inline CUDA_CALLABLE void adj_bit_or(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
313
|
+
inline CUDA_CALLABLE void adj_bit_xor(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
314
|
+
inline CUDA_CALLABLE void adj_lshift(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
315
|
+
inline CUDA_CALLABLE void adj_rshift(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
316
|
+
inline CUDA_CALLABLE void adj_invert(T x, T adj_x, T& adj_ret) { }
|
|
317
|
+
|
|
318
|
+
inline CUDA_CALLABLE int8 abs(int8 x) { return ::abs(x); }
|
|
319
|
+
inline CUDA_CALLABLE int16 abs(int16 x) { return ::abs(x); }
|
|
320
|
+
inline CUDA_CALLABLE int32 abs(int32 x) { return ::abs(x); }
|
|
321
|
+
inline CUDA_CALLABLE int64 abs(int64 x) { return ::llabs(x); }
|
|
322
|
+
inline CUDA_CALLABLE uint8 abs(uint8 x) { return x; }
|
|
323
|
+
inline CUDA_CALLABLE uint16 abs(uint16 x) { return x; }
|
|
324
|
+
inline CUDA_CALLABLE uint32 abs(uint32 x) { return x; }
|
|
325
|
+
inline CUDA_CALLABLE uint64 abs(uint64 x) { return x; }
|
|
326
|
+
|
|
327
|
+
DECLARE_INT_OPS(int8)
|
|
328
|
+
DECLARE_INT_OPS(int16)
|
|
329
|
+
DECLARE_INT_OPS(int32)
|
|
330
|
+
DECLARE_INT_OPS(int64)
|
|
331
|
+
DECLARE_INT_OPS(uint8)
|
|
332
|
+
DECLARE_INT_OPS(uint16)
|
|
333
|
+
DECLARE_INT_OPS(uint32)
|
|
334
|
+
DECLARE_INT_OPS(uint64)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
inline CUDA_CALLABLE int8 step(int8 x) { return x < 0 ? 1 : 0; }
|
|
338
|
+
inline CUDA_CALLABLE int16 step(int16 x) { return x < 0 ? 1 : 0; }
|
|
339
|
+
inline CUDA_CALLABLE int32 step(int32 x) { return x < 0 ? 1 : 0; }
|
|
340
|
+
inline CUDA_CALLABLE int64 step(int64 x) { return x < 0 ? 1 : 0; }
|
|
341
|
+
inline CUDA_CALLABLE uint8 step(uint8 x) { return 0; }
|
|
342
|
+
inline CUDA_CALLABLE uint16 step(uint16 x) { return 0; }
|
|
343
|
+
inline CUDA_CALLABLE uint32 step(uint32 x) { return 0; }
|
|
344
|
+
inline CUDA_CALLABLE uint64 step(uint64 x) { return 0; }
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
inline CUDA_CALLABLE int8 sign(int8 x) { return x < 0 ? -1 : 1; }
|
|
348
|
+
inline CUDA_CALLABLE int8 sign(int16 x) { return x < 0 ? -1 : 1; }
|
|
349
|
+
inline CUDA_CALLABLE int8 sign(int32 x) { return x < 0 ? -1 : 1; }
|
|
350
|
+
inline CUDA_CALLABLE int8 sign(int64 x) { return x < 0 ? -1 : 1; }
|
|
351
|
+
inline CUDA_CALLABLE uint8 sign(uint8 x) { return 1; }
|
|
352
|
+
inline CUDA_CALLABLE uint16 sign(uint16 x) { return 1; }
|
|
353
|
+
inline CUDA_CALLABLE uint32 sign(uint32 x) { return 1; }
|
|
354
|
+
inline CUDA_CALLABLE uint64 sign(uint64 x) { return 1; }
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
// Catch-all for non-float types
|
|
358
|
+
template<typename T>
|
|
359
|
+
inline bool CUDA_CALLABLE isfinite(const T&)
|
|
360
|
+
{
|
|
361
|
+
return true;
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
inline bool CUDA_CALLABLE isfinite(half x)
|
|
365
|
+
{
|
|
366
|
+
return ::isfinite(float(x));
|
|
367
|
+
}
|
|
368
|
+
inline bool CUDA_CALLABLE isfinite(float x)
|
|
369
|
+
{
|
|
370
|
+
return ::isfinite(x);
|
|
371
|
+
}
|
|
372
|
+
inline bool CUDA_CALLABLE isfinite(double x)
|
|
373
|
+
{
|
|
374
|
+
return ::isfinite(x);
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
template<typename T>
|
|
378
|
+
inline CUDA_CALLABLE void print(const T&)
|
|
379
|
+
{
|
|
380
|
+
printf("<type without print implementation>\n");
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
inline CUDA_CALLABLE void print(float16 f)
|
|
384
|
+
{
|
|
385
|
+
printf("%g\n", half_to_float(f));
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
inline CUDA_CALLABLE void print(float f)
|
|
389
|
+
{
|
|
390
|
+
printf("%g\n", f);
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
inline CUDA_CALLABLE void print(double f)
|
|
394
|
+
{
|
|
395
|
+
printf("%g\n", f);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
// basic ops for float types
|
|
400
|
+
#define DECLARE_FLOAT_OPS(T) \
|
|
401
|
+
inline CUDA_CALLABLE T mul(T a, T b) { return a*b; } \
|
|
402
|
+
inline CUDA_CALLABLE T add(T a, T b) { return a+b; } \
|
|
403
|
+
inline CUDA_CALLABLE T sub(T a, T b) { return a-b; } \
|
|
404
|
+
inline CUDA_CALLABLE T min(T a, T b) { return a<b?a:b; } \
|
|
405
|
+
inline CUDA_CALLABLE T max(T a, T b) { return a>b?a:b; } \
|
|
406
|
+
inline CUDA_CALLABLE T sign(T x) { return x < T(0) ? -1 : 1; } \
|
|
407
|
+
inline CUDA_CALLABLE T step(T x) { return x < T(0) ? T(1) : T(0); }\
|
|
408
|
+
inline CUDA_CALLABLE T nonzero(T x) { return x == T(0) ? T(0) : T(1); }\
|
|
409
|
+
inline CUDA_CALLABLE T clamp(T x, T a, T b) { return min(max(a, x), b); }\
|
|
410
|
+
inline CUDA_CALLABLE void adj_abs(T x, T& adj_x, T adj_ret) \
|
|
411
|
+
{\
|
|
412
|
+
if (x < T(0))\
|
|
413
|
+
adj_x -= adj_ret;\
|
|
414
|
+
else\
|
|
415
|
+
adj_x += adj_ret;\
|
|
416
|
+
}\
|
|
417
|
+
inline CUDA_CALLABLE void adj_mul(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += b*adj_ret; adj_b += a*adj_ret; } \
|
|
418
|
+
inline CUDA_CALLABLE void adj_add(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += adj_ret; adj_b += adj_ret; } \
|
|
419
|
+
inline CUDA_CALLABLE void adj_sub(T a, T b, T& adj_a, T& adj_b, T adj_ret) { adj_a += adj_ret; adj_b -= adj_ret; } \
|
|
420
|
+
inline CUDA_CALLABLE void adj_min(T a, T b, T& adj_a, T& adj_b, T adj_ret) \
|
|
421
|
+
{ \
|
|
422
|
+
if (a < b) \
|
|
423
|
+
adj_a += adj_ret; \
|
|
424
|
+
else \
|
|
425
|
+
adj_b += adj_ret; \
|
|
426
|
+
} \
|
|
427
|
+
inline CUDA_CALLABLE void adj_max(T a, T b, T& adj_a, T& adj_b, T adj_ret) \
|
|
428
|
+
{ \
|
|
429
|
+
if (a > b) \
|
|
430
|
+
adj_a += adj_ret; \
|
|
431
|
+
else \
|
|
432
|
+
adj_b += adj_ret; \
|
|
433
|
+
} \
|
|
434
|
+
inline CUDA_CALLABLE void adj_floordiv(T a, T b, T& adj_a, T& adj_b, T adj_ret) { } \
|
|
435
|
+
inline CUDA_CALLABLE void adj_mod(T a, T b, T& adj_a, T& adj_b, T adj_ret){ adj_a += adj_ret; }\
|
|
436
|
+
inline CUDA_CALLABLE void adj_sign(T x, T adj_x, T& adj_ret) { }\
|
|
437
|
+
inline CUDA_CALLABLE void adj_step(T x, T& adj_x, T adj_ret) { }\
|
|
438
|
+
inline CUDA_CALLABLE void adj_nonzero(T x, T& adj_x, T adj_ret) { }\
|
|
439
|
+
inline CUDA_CALLABLE void adj_clamp(T x, T a, T b, T& adj_x, T& adj_a, T& adj_b, T adj_ret)\
|
|
440
|
+
{\
|
|
441
|
+
if (x < a)\
|
|
442
|
+
adj_a += adj_ret;\
|
|
443
|
+
else if (x > b)\
|
|
444
|
+
adj_b += adj_ret;\
|
|
445
|
+
else\
|
|
446
|
+
adj_x += adj_ret;\
|
|
447
|
+
}\
|
|
448
|
+
inline CUDA_CALLABLE T div(T a, T b)\
|
|
449
|
+
{\
|
|
450
|
+
DO_IF_FPCHECK(\
|
|
451
|
+
if (!isfinite(a) || !isfinite(b) || b == T(0))\
|
|
452
|
+
{\
|
|
453
|
+
printf("%s:%d div(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));\
|
|
454
|
+
assert(0);\
|
|
455
|
+
})\
|
|
456
|
+
return a/b;\
|
|
457
|
+
}\
|
|
458
|
+
inline CUDA_CALLABLE void adj_div(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
|
|
459
|
+
{\
|
|
460
|
+
adj_a += adj_ret/b;\
|
|
461
|
+
adj_b -= adj_ret*(ret)/b;\
|
|
462
|
+
DO_IF_FPCHECK(\
|
|
463
|
+
if (!isfinite(adj_a) || !isfinite(adj_b))\
|
|
464
|
+
{\
|
|
465
|
+
printf("%s:%d - adj_div(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(a), float(b), float(adj_a), float(adj_b), float(adj_ret));\
|
|
466
|
+
assert(0);\
|
|
467
|
+
})\
|
|
468
|
+
}\
|
|
469
|
+
|
|
470
|
+
DECLARE_FLOAT_OPS(float16)
|
|
471
|
+
DECLARE_FLOAT_OPS(float32)
|
|
472
|
+
DECLARE_FLOAT_OPS(float64)
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
// basic ops for float types
|
|
477
|
+
inline CUDA_CALLABLE float16 mod(float16 a, float16 b)
|
|
478
|
+
{
|
|
479
|
+
#if FP_CHECK
|
|
480
|
+
if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
|
|
481
|
+
{
|
|
482
|
+
printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
|
|
483
|
+
assert(0);
|
|
484
|
+
}
|
|
485
|
+
#endif
|
|
486
|
+
return fmodf(float(a), float(b));
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
inline CUDA_CALLABLE float32 mod(float32 a, float32 b)
|
|
490
|
+
{
|
|
491
|
+
#if FP_CHECK
|
|
492
|
+
if (!isfinite(a) || !isfinite(b) || b == 0.0f)
|
|
493
|
+
{
|
|
494
|
+
printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
|
|
495
|
+
assert(0);
|
|
496
|
+
}
|
|
497
|
+
#endif
|
|
498
|
+
return fmodf(a, b);
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
inline CUDA_CALLABLE double mod(double a, double b)
|
|
502
|
+
{
|
|
503
|
+
#if FP_CHECK
|
|
504
|
+
if (!isfinite(a) || !isfinite(b) || b == 0.0f)
|
|
505
|
+
{
|
|
506
|
+
printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b);
|
|
507
|
+
assert(0);
|
|
508
|
+
}
|
|
509
|
+
#endif
|
|
510
|
+
return fmod(a, b);
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
// Natural logarithm for float types. Under FP_CHECK, non-finite or negative
// inputs are trapped (log of a negative value would produce NaN).
inline CUDA_CALLABLE half log(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif
    // widen to float explicitly, consistent with the other half overloads
    // (previously relied on an implicit half->float conversion)
    return ::logf(float(a));
}

inline CUDA_CALLABLE float log(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif
    return ::logf(a);
}

inline CUDA_CALLABLE double log(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif
    return ::log(a);
}
|
|
548
|
+
|
|
549
|
+
// Base-2 and base-10 logarithms. The half overloads compute in float
// precision and narrow on return. Under FP_CHECK, non-finite or negative
// inputs are trapped with an assert.
inline CUDA_CALLABLE half log2(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif

    return ::log2f(float(a));
}

inline CUDA_CALLABLE float log2(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log2f(a);
}

inline CUDA_CALLABLE double log2(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log2(a);
}

inline CUDA_CALLABLE half log10(half a)
{
#if FP_CHECK
    if (!isfinite(a) || float(a) < 0.0f)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, float(a));
        assert(0);
    }
#endif

    return ::log10f(float(a));
}

inline CUDA_CALLABLE float log10(float a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0f)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log10f(a);
}

inline CUDA_CALLABLE double log10(double a)
{
#if FP_CHECK
    if (!isfinite(a) || a < 0.0)
    {
        printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a);
        assert(0);
    }
#endif

    return ::log10(a);
}
|
|
626
|
+
|
|
627
|
+
// exp() and pow(). Under FP_CHECK the result is validated after the call so
// overflow to inf (e.g. exp of a large value, pow with a large exponent) is
// trapped as well as non-finite inputs.
inline CUDA_CALLABLE half exp(half a)
{
    // computed in float precision; the half conversion happens on assignment
    half result = ::expf(float(a));
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, float(a), float(result));
        assert(0);
    }
#endif
    return result;
}
inline CUDA_CALLABLE float exp(float a)
{
    float result = ::expf(a);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result);
        assert(0);
    }
#endif
    return result;
}
inline CUDA_CALLABLE double exp(double a)
{
    double result = ::exp(a);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(result))
    {
        printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result);
        assert(0);
    }
#endif
    return result;
}

inline CUDA_CALLABLE half pow(half a, half b)
{
    // computed in float precision; narrowed to half on return
    float result = ::powf(float(a), float(b));
#if FP_CHECK
    if (!isfinite(float(a)) || !isfinite(float(b)) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, float(a), float(b), result);
        assert(0);
    }
#endif
    return result;
}

inline CUDA_CALLABLE float pow(float a, float b)
{
    float result = ::powf(a, b);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result);
        assert(0);
    }
#endif
    return result;
}

inline CUDA_CALLABLE double pow(double a, double b)
{
    double result = ::pow(a, b);
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || !isfinite(result))
    {
        printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result);
        assert(0);
    }
#endif
    return result;
}
|
|
702
|
+
|
|
703
|
+
// floordiv() = floor(a/b) for float types. Under FP_CHECK, non-finite
// operands and a zero divisor are trapped with an assert.
inline CUDA_CALLABLE half floordiv(half a, half b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f)
    {
        // report the actual function name (previously printed "mod",
        // a copy-paste from the mod() overloads above)
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, float(a), float(b));
        assert(0);
    }
#endif
    return floorf(float(a/b));
}
inline CUDA_CALLABLE float floordiv(float a, float b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || b == 0.0f)
    {
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return floorf(a/b);
}
inline CUDA_CALLABLE double floordiv(double a, double b)
{
#if FP_CHECK
    if (!isfinite(a) || !isfinite(b) || b == 0.0)
    {
        printf("%s:%d floordiv(%f, %f)\n", __FILE__, __LINE__, a, b);
        assert(0);
    }
#endif
    return ::floor(a/b);
}
|
|
736
|
+
|
|
737
|
+
// leaky_min/leaky_max: forward pass is identical to min()/max(); the leak
// coefficient r is unused here and only affects the adjoints (see the
// adj_leaky_* functions in DECLARE_ADJOINTS).
inline CUDA_CALLABLE float leaky_min(float a, float b, float r) { return min(a, b); }
inline CUDA_CALLABLE float leaky_max(float a, float b, float r) { return max(a, b); }

// absolute value; half is widened to float since there is no half fabs
inline CUDA_CALLABLE half abs(half x) { return ::fabsf(float(x)); }
inline CUDA_CALLABLE float abs(float x) { return ::fabsf(x); }
inline CUDA_CALLABLE double abs(double x) { return ::fabs(x); }
|
|
743
|
+
|
|
744
|
+
// Trigonometric functions.
// acos/asin clamp their argument into [-1, 1] so values that drift slightly
// outside the mathematical domain due to rounding do not yield NaN.
inline CUDA_CALLABLE float acos(float x){ return ::acosf(min(max(x, -1.0f), 1.0f)); }
inline CUDA_CALLABLE float asin(float x){ return ::asinf(min(max(x, -1.0f), 1.0f)); }
inline CUDA_CALLABLE float atan(float x) { return ::atanf(x); }
inline CUDA_CALLABLE float atan2(float y, float x) { return ::atan2f(y, x); }
inline CUDA_CALLABLE float sin(float x) { return ::sinf(x); }
inline CUDA_CALLABLE float cos(float x) { return ::cosf(x); }

inline CUDA_CALLABLE double acos(double x){ return ::acos(min(max(x, -1.0), 1.0)); }
inline CUDA_CALLABLE double asin(double x){ return ::asin(min(max(x, -1.0), 1.0)); }
inline CUDA_CALLABLE double atan(double x) { return ::atan(x); }
inline CUDA_CALLABLE double atan2(double y, double x) { return ::atan2(y, x); }
inline CUDA_CALLABLE double sin(double x) { return ::sin(x); }
inline CUDA_CALLABLE double cos(double x) { return ::cos(x); }

// half overloads compute in float precision and narrow on return
inline CUDA_CALLABLE half acos(half x){ return ::acosf(min(max(float(x), -1.0f), 1.0f)); }
inline CUDA_CALLABLE half asin(half x){ return ::asinf(min(max(float(x), -1.0f), 1.0f)); }
inline CUDA_CALLABLE half atan(half x) { return ::atanf(float(x)); }
inline CUDA_CALLABLE half atan2(half y, half x) { return ::atan2f(float(y), float(x)); }
inline CUDA_CALLABLE half sin(half x) { return ::sinf(float(x)); }
inline CUDA_CALLABLE half cos(half x) { return ::cosf(float(x)); }
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
// Square root. Under FP_CHECK, negative inputs (which would produce NaN)
// are trapped with an assert.
inline CUDA_CALLABLE float sqrt(float x)
{
#if FP_CHECK
    if (x < 0.0f)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x);
        assert(0);
    }
#endif
    return ::sqrtf(x);
}
inline CUDA_CALLABLE double sqrt(double x)
{
#if FP_CHECK
    if (x < 0.0)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x);
        assert(0);
    }
#endif
    return ::sqrt(x);
}
inline CUDA_CALLABLE half sqrt(half x)
{
#if FP_CHECK
    if (float(x) < 0.0f)
    {
        printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, float(x));
        assert(0);
    }
#endif
    return ::sqrtf(float(x));
}

// Cube root — defined for negative inputs, so no FP_CHECK needed.
inline CUDA_CALLABLE float cbrt(float x) { return ::cbrtf(x); }
inline CUDA_CALLABLE double cbrt(double x) { return ::cbrt(x); }
inline CUDA_CALLABLE half cbrt(half x) { return ::cbrtf(float(x)); }
|
|
803
|
+
|
|
804
|
+
// tan, hyperbolic functions, and degree/radian conversions.
// RAD_TO_DEG / DEG_TO_RAD are conversion constants defined elsewhere in
// this header.
inline CUDA_CALLABLE float tan(float x) { return ::tanf(x); }
inline CUDA_CALLABLE float sinh(float x) { return ::sinhf(x);}
inline CUDA_CALLABLE float cosh(float x) { return ::coshf(x);}
inline CUDA_CALLABLE float tanh(float x) { return ::tanhf(x);}
inline CUDA_CALLABLE float degrees(float x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE float radians(float x) { return x * DEG_TO_RAD;}

inline CUDA_CALLABLE double tan(double x) { return ::tan(x); }
inline CUDA_CALLABLE double sinh(double x) { return ::sinh(x);}
inline CUDA_CALLABLE double cosh(double x) { return ::cosh(x);}
inline CUDA_CALLABLE double tanh(double x) { return ::tanh(x);}
inline CUDA_CALLABLE double degrees(double x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE double radians(double x) { return x * DEG_TO_RAD;}

// half overloads compute in float precision and narrow on return
inline CUDA_CALLABLE half tan(half x) { return ::tanf(float(x)); }
inline CUDA_CALLABLE half sinh(half x) { return ::sinhf(float(x));}
inline CUDA_CALLABLE half cosh(half x) { return ::coshf(float(x));}
inline CUDA_CALLABLE half tanh(half x) { return ::tanhf(float(x));}
inline CUDA_CALLABLE half degrees(half x) { return x * RAD_TO_DEG;}
inline CUDA_CALLABLE half radians(half x) { return x * DEG_TO_RAD;}
|
|
824
|
+
|
|
825
|
+
// Rounding family. Per the C math library: round() rounds halfway cases
// away from zero, rint() rounds using the current rounding mode (typically
// to nearest even), trunc() rounds toward zero, floor()/ceil() toward
// -inf/+inf. frac(x) = x - trunc(x) keeps the sign of x.
inline CUDA_CALLABLE float round(float x) { return ::roundf(x); }
inline CUDA_CALLABLE float rint(float x) { return ::rintf(x); }
inline CUDA_CALLABLE float trunc(float x) { return ::truncf(x); }
inline CUDA_CALLABLE float floor(float x) { return ::floorf(x); }
inline CUDA_CALLABLE float ceil(float x) { return ::ceilf(x); }
inline CUDA_CALLABLE float frac(float x) { return x - trunc(x); }

inline CUDA_CALLABLE double round(double x) { return ::round(x); }
inline CUDA_CALLABLE double rint(double x) { return ::rint(x); }
inline CUDA_CALLABLE double trunc(double x) { return ::trunc(x); }
inline CUDA_CALLABLE double floor(double x) { return ::floor(x); }
inline CUDA_CALLABLE double ceil(double x) { return ::ceil(x); }
inline CUDA_CALLABLE double frac(double x) { return x - trunc(x); }

// half overloads compute in float precision and narrow on return
inline CUDA_CALLABLE half round(half x) { return ::roundf(float(x)); }
inline CUDA_CALLABLE half rint(half x) { return ::rintf(float(x)); }
inline CUDA_CALLABLE half trunc(half x) { return ::truncf(float(x)); }
inline CUDA_CALLABLE half floor(half x) { return ::floorf(float(x)); }
inline CUDA_CALLABLE half ceil(half x) { return ::ceilf(float(x)); }
inline CUDA_CALLABLE half frac(half x) { return float(x) - trunc(float(x)); }
|
|
845
|
+
|
|
846
|
+
// Reverse-mode adjoints for the transcendental/rounding builtins above,
// declared per float type T. All adj_* functions accumulate into their
// adj_* output parameters. Where the derivative is singular (acos/asin at
// |x| == 1, tan at cos(x) == 0, atan2 at the origin, sqrt/cbrt at 0):
// with FP_CHECK the gradient is computed unconditionally and a non-finite
// result triggers an assert; without FP_CHECK the singular case is guarded
// and the gradient contribution silently skipped. Rounding functions have
// zero gradient almost everywhere, so their adjoints are empty.
#define DECLARE_ADJOINTS(T)\
inline CUDA_CALLABLE void adj_log(T a, T& adj_a, T adj_ret)\
{\
    adj_a += (T(1)/a)*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_log2(T a, T& adj_a, T adj_ret)\
{ \
    adj_a += (T(1)/a)*(T(1)/log(T(2)))*adj_ret; \
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log2(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    }) \
}\
inline CUDA_CALLABLE void adj_log10(T a, T& adj_a, T adj_ret)\
{\
    adj_a += (T(1)/a)*(T(1)/log(T(10)))*adj_ret; \
    DO_IF_FPCHECK(if (!isfinite(adj_a))\
    {\
        printf("%s:%d - adj_log10(%f, %f, %f)\n", __FILE__, __LINE__, float(a), float(adj_a), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_exp(T a, T ret, T& adj_a, T adj_ret) { adj_a += ret*adj_ret; }\
inline CUDA_CALLABLE void adj_pow(T a, T b, T ret, T& adj_a, T& adj_b, T adj_ret)\
{ \
    adj_a += b*pow(a, b-T(1))*adj_ret;\
    adj_b += log(a)*ret*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_a) || !isfinite(adj_b))\
    {\
        printf("%s:%d - adj_pow(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(a), float(b), float(adj_a), float(adj_b), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_leaky_min(T a, T b, T r, T& adj_a, T& adj_b, T& adj_r, T adj_ret)\
{\
    if (a < b)\
        adj_a += adj_ret;\
    else\
    {\
        adj_a += r*adj_ret;\
        adj_b += adj_ret;\
    }\
}\
inline CUDA_CALLABLE void adj_leaky_max(T a, T b, T r, T& adj_a, T& adj_b, T& adj_r, T adj_ret)\
{\
    if (a > b)\
        adj_a += adj_ret;\
    else\
    {\
        adj_a += r*adj_ret;\
        adj_b += adj_ret;\
    }\
}\
inline CUDA_CALLABLE void adj_acos(T x, T& adj_x, T adj_ret)\
{\
    T d = sqrt(T(1)-x*x);\
    DO_IF_FPCHECK(adj_x -= (T(1)/d)*adj_ret;\
    if (!isfinite(d) || !isfinite(adj_x))\
    {\
        printf("%s:%d - adj_acos(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret)); \
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
        adj_x -= (T(1)/d)*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_asin(T x, T& adj_x, T adj_ret)\
{\
    T d = sqrt(T(1)-x*x);\
    DO_IF_FPCHECK(adj_x += (T(1)/d)*adj_ret;\
    if (!isfinite(d) || !isfinite(adj_x))\
    {\
        printf("%s:%d - adj_asin(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret)); \
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
        adj_x += (T(1)/d)*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_tan(T x, T& adj_x, T adj_ret)\
{\
    T cos_x = cos(x);\
    DO_IF_FPCHECK(adj_x += (T(1)/(cos_x*cos_x))*adj_ret;\
    if (!isfinite(adj_x) || cos_x == T(0))\
    {\
        printf("%s:%d - adj_tan(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (cos_x != T(0))\
        adj_x += (T(1)/(cos_x*cos_x))*adj_ret;)\
}\
inline CUDA_CALLABLE void adj_atan(T x, T& adj_x, T adj_ret)\
{\
    adj_x += adj_ret /(x*x + T(1));\
}\
inline CUDA_CALLABLE void adj_atan2(T y, T x, T& adj_y, T& adj_x, T adj_ret)\
{\
    T d = x*x + y*y;\
    DO_IF_FPCHECK(adj_x -= y/d*adj_ret;\
    adj_y += x/d*adj_ret;\
    if (!isfinite(adj_x) || !isfinite(adj_y) || d == T(0))\
    {\
        printf("%s:%d - adj_atan2(%f, %f, %f, %f, %f)\n", __FILE__, __LINE__, float(y), float(x), float(adj_y), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
    DO_IF_NO_FPCHECK(if (d > T(0))\
    {\
        adj_x -= (y/d)*adj_ret;\
        adj_y += (x/d)*adj_ret;\
    })\
}\
inline CUDA_CALLABLE void adj_sin(T x, T& adj_x, T adj_ret)\
{\
    adj_x += cos(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_cos(T x, T& adj_x, T adj_ret)\
{\
    adj_x -= sin(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_sinh(T x, T& adj_x, T adj_ret)\
{\
    adj_x += cosh(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_cosh(T x, T& adj_x, T adj_ret)\
{\
    adj_x += sinh(x)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_tanh(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += (T(1) - ret*ret)*adj_ret;\
}\
inline CUDA_CALLABLE void adj_sqrt(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += T(0.5)*(T(1)/ret)*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_x))\
    {\
        printf("%s:%d - adj_sqrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_cbrt(T x, T ret, T& adj_x, T adj_ret)\
{\
    adj_x += (T(1)/T(3))*(T(1)/(ret*ret))*adj_ret;\
    DO_IF_FPCHECK(if (!isfinite(adj_x))\
    {\
        printf("%s:%d - adj_cbrt(%f, %f, %f)\n", __FILE__, __LINE__, float(x), float(adj_x), float(adj_ret));\
        assert(0);\
    })\
}\
inline CUDA_CALLABLE void adj_degrees(T x, T& adj_x, T adj_ret)\
{\
    adj_x += RAD_TO_DEG * adj_ret;\
}\
inline CUDA_CALLABLE void adj_radians(T x, T& adj_x, T adj_ret)\
{\
    adj_x += DEG_TO_RAD * adj_ret;\
}\
inline CUDA_CALLABLE void adj_round(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_rint(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_trunc(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_floor(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_ceil(T x, T& adj_x, T adj_ret){ }\
inline CUDA_CALLABLE void adj_frac(T x, T& adj_x, T adj_ret){ }

DECLARE_ADJOINTS(float16)
DECLARE_ADJOINTS(float32)
DECLARE_ADJOINTS(float64)
|
|
1017
|
+
|
|
1018
|
+
// select(cond, a, b): returns b when cond is non-zero, a otherwise.
// Note the argument order — the TRUE branch yields the *second* value.
template <typename C, typename T>
CUDA_CALLABLE inline T select(const C& cond, const T& a, const T& b)
{
    // The double NOT operator !! casts to bool without compiler warnings.
    return (!!cond) ? b : a;
}

// Adjoint of select(): routes the incoming gradient to whichever branch
// was taken; the condition itself receives no gradient.
template <typename C, typename T>
CUDA_CALLABLE inline void adj_select(const C& cond, const T& a, const T& b, C& adj_cond, T& adj_a, T& adj_b, const T& adj_ret)
{
    // The double NOT operator !! casts to bool without compiler warnings.
    if (!!cond)
        adj_b += adj_ret;
    else
        adj_a += adj_ret;
}
|
|
1034
|
+
|
|
1035
|
+
// copy(): value copy builtin; the adjoint transfers the destination's
// gradient to the source and then clears it.
template <typename T>
CUDA_CALLABLE inline T copy(const T& src)
{
    return src;
}

template <typename T>
CUDA_CALLABLE inline void adj_copy(const T& src, T& adj_src, T& adj_dest)
{
    adj_src += adj_dest;
    // reset so later reverse-pass steps don't double count this gradient
    adj_dest = T{};
}

// assign(): in-place assignment builtin.
template <typename T>
CUDA_CALLABLE inline void assign(T& dest, const T& src)
{
    dest = src;
}

template <typename T>
CUDA_CALLABLE inline void adj_assign(T& dest, const T& src, T& adj_dest, T& adj_src)
{
    // this is generally a non-differentiable operation since it violates SSA,
    // except in read-modify-write statements which are reversible through backpropagation
    adj_src = adj_dest;
    adj_dest = T{};
}
|
|
1062
|
+
|
|
1063
|
+
|
|
1064
|
+
// some helpful operator overloads (just for C++ use, these are not adjointed)
// They forward to the add()/sub() builtins so any type that defines those
// gets the operators for free.

template <typename T>
CUDA_CALLABLE inline T& operator += (T& a, const T& b) { a = add(a, b); return a; }

template <typename T>
CUDA_CALLABLE inline T& operator -= (T& a, const T& b) { a = sub(a, b); return a; }

template <typename T>
CUDA_CALLABLE inline T operator+(const T& a, const T& b) { return add(a, b); }

template <typename T>
CUDA_CALLABLE inline T operator-(const T& a, const T& b) { return sub(a, b); }

// unary plus: identity; its adjoint passes the gradient straight through
template <typename T>
CUDA_CALLABLE inline T pos(const T& x) { return x; }
template <typename T>
CUDA_CALLABLE inline void adj_pos(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(adj_ret); }

// unary negation implemented as negative multiply, not sure the fp implications of this
// may be better as 0.0 - x?
template <typename T>
CUDA_CALLABLE inline T neg(const T& x) { return T(0.0) - x; }
template <typename T>
CUDA_CALLABLE inline void adj_neg(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(-adj_ret); }

// unary boolean negation; non-differentiable, so the adjoint is empty
template <typename T>
CUDA_CALLABLE inline bool unot(const T& b) { return !b; }
template <typename T>
CUDA_CALLABLE inline void adj_unot(const T& b, T& adj_b, const bool& adj_ret) { }
|
|
1095
|
+
|
|
1096
|
+
const int LAUNCH_MAX_DIMS = 4; // should match types.py

// Describes a kernel launch grid; filled in by the host-side launch code.
struct launch_bounds_t
{
    int shape[LAUNCH_MAX_DIMS]; // size of each dimension
    int ndim;                   // number of valid dimension
    size_t size;                // total number of threads
};

#ifndef __CUDACC__
// CPU builds emulate the thread index with this variable
// (presumably advanced by the host-side kernel execution loop — not visible here)
static size_t s_threadIdx;
#endif
|
|
1108
|
+
|
|
1109
|
+
// Returns this thread's global linear index: the CUDA grid position on
// device, or the emulated s_threadIdx on CPU.
inline CUDA_CALLABLE size_t grid_index()
{
#ifdef __CUDACC__
    // Need to cast at least one of the variables being multiplied so that type promotion happens before the multiplication
    size_t grid_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x);
    return grid_index;
#else
    return s_threadIdx;
#endif
}

// 1-D thread id: narrows the size_t linear index to int.
inline CUDA_CALLABLE int tid(size_t index)
{
    // For the 1-D tid() we need to warn the user if we're about to provide a truncated index
    // Only do this in _DEBUG when called from device to avoid excessive register allocation
#if defined(_DEBUG) || !defined(__CUDA_ARCH__)
    // 2147483647 == INT_MAX; anything larger cannot be represented in the int return
    if (index > 2147483647) {
        printf("Warp warning: tid() is returning an overflowed int\n");
    }
#endif
    return static_cast<int>(index);
}
|
|
1131
|
+
|
|
1132
|
+
// Multi-dimensional tid() overloads: decompose the linear thread index into
// 2/3/4-D coordinates, row-major with respect to launch_bounds.shape
// (the last dimension varies fastest). shape[0] is not needed — it is
// implied by the total index range.
inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];

    // convert to work item
    i = index/n;
    j = index%n;
}

inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];
    const size_t o = launch_bounds.shape[2];

    // convert to work item
    i = index/(n*o);
    j = index%(n*o)/o;
    k = index%o;
}

inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds)
{
    const size_t n = launch_bounds.shape[1];
    const size_t o = launch_bounds.shape[2];
    const size_t p = launch_bounds.shape[3];

    // convert to work item
    i = index/(n*o*p);
    j = index%(n*o*p)/(o*p);
    k = index%(o*p)/p;
    l = index%p;
}
|
|
1164
|
+
|
|
1165
|
+
// atomic_add(): returns the value stored at *buf BEFORE the addition.
// On CPU this is a plain (non-atomic) read-modify-write — host kernels are
// presumably executed without concurrent writers to the same address
// (TODO confirm against the CPU launch path).
template<typename T>
inline CUDA_CALLABLE T atomic_add(T* buf, T value)
{
#if !defined(__CUDA_ARCH__)
    T old = buf[0];
    buf[0] += value;
    return old;
#else
    return atomicAdd(buf, value);
#endif
}

// float16 specialization: CUDA's atomicAdd(__half*) cannot be used directly
// with warp's float16 wrapper, so the device paths go through a
// reinterpret-cast (Clang) or inline PTX (NVRTC).
template<>
inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value)
{
#if !defined(__CUDA_ARCH__)
    float16 old = buf[0];
    buf[0] += value;
    return old;
#elif defined(__clang__) // CUDA compiled by Clang
    __half r = atomicAdd(reinterpret_cast<__half*>(buf), *reinterpret_cast<__half*>(&value));
    return *reinterpret_cast<float16*>(&r);
#else // CUDA compiled by NVRTC
    //return atomicAdd(buf, value);

    /* Define __PTR for atomicAdd prototypes below, undef after done */
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __PTR   "l"
#else
#define __PTR   "r"
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/

    half r = 0.0;

    // atom.add.noftz.f16 requires sm_70+; on older architectures the asm is
    // not emitted, so r stays 0 and no addition is performed
#if __CUDA_ARCH__ >= 700

    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
                  : "=h"(r.u)
                  : __PTR(buf), "h"(value.u)
                  : "memory");
#endif

    return r;

#undef __PTR

#endif // CUDA compiled by NVRTC

}
|
|
1214
|
+
|
|
1215
|
+
// emulate atomic float max
// CUDA has no native float atomicMax, so the device path emulates it with an
// atomicCAS loop on the value's bit pattern.
inline CUDA_CALLABLE float atomic_max(float* address, float val)
{
#if defined(__CUDA_ARCH__)
    // Reinterpret the float storage as int so atomicCAS can operate on it.
    int *address_as_int = (int*)address;
    int old = *address_as_int, assumed;

    // Retry while val still exceeds the currently observed value; another
    // thread may have stored a larger value between iterations, in which
    // case the loop exits without writing.
    while (val > __int_as_float(old))
    {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val));
    }

    // Return the last value observed at the address.
    return __int_as_float(old);

#else
    // CPU path: plain (non-atomic) compare-and-store.
    float old = *address;
    *address = max(old, val);
    return old;
#endif
}
|
|
1237
|
+
|
|
1238
|
+
// emulate atomic float min/max with atomicCAS()
// Mirror image of atomic_max(float*): emulate float atomicMin on the device
// with a CAS loop over the value's bit pattern.
inline CUDA_CALLABLE float atomic_min(float* address, float val)
{
#if defined(__CUDA_ARCH__)
    // Reinterpret the float storage as int so atomicCAS can operate on it.
    int *address_as_int = (int*)address;
    int old = *address_as_int, assumed;

    // Retry while val is still smaller than the currently observed value.
    while (val < __int_as_float(old))
    {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val));
    }

    // Return the last value observed at the address.
    return __int_as_float(old);

#else
    // CPU path: plain (non-atomic) compare-and-store.
    float old = *address;
    *address = min(old, val);
    return old;
#endif
}
|
|
1260
|
+
|
|
1261
|
+
// Atomic integer max: hardware atomicMax on the device, plain
// read-compare-store on the host. Returns the previous value.
inline CUDA_CALLABLE int atomic_max(int* address, int val)
{
#if defined(__CUDA_ARCH__)
    return atomicMax(address, val);

#else
    const int prev = *address;
    if (val > prev)
        *address = val;
    return prev;
#endif
}
|
|
1272
|
+
|
|
1273
|
+
// atomic int min
|
|
1274
|
+
inline CUDA_CALLABLE int atomic_min(int* address, int val)
|
|
1275
|
+
{
|
|
1276
|
+
#if defined(__CUDA_ARCH__)
|
|
1277
|
+
return atomicMin(address, val);
|
|
1278
|
+
|
|
1279
|
+
#else
|
|
1280
|
+
int old = *address;
|
|
1281
|
+
*address = min(old, val);
|
|
1282
|
+
return old;
|
|
1283
|
+
#endif
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
// default behavior for adjoint of atomic min/max operation that accumulates gradients for all elements matching the min/max value
|
|
1287
|
+
template <typename T>
|
|
1288
|
+
CUDA_CALLABLE inline void adj_atomic_minmax(T *addr, T *adj_addr, const T &value, T &adj_value)
|
|
1289
|
+
{
|
|
1290
|
+
if (value == *addr)
|
|
1291
|
+
adj_value += *adj_addr;
|
|
1292
|
+
}
|
|
1293
|
+
|
|
1294
|
+
// for integral types we do not accumulate gradients
// (min/max over integers and bools is not differentiated, so each adjoint
// overload below is an intentional no-op)
CUDA_CALLABLE inline void adj_atomic_minmax(int8* buf, int8* adj_buf, const int8 &value, int8 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(uint8* buf, uint8* adj_buf, const uint8 &value, uint8 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(int16* buf, int16* adj_buf, const int16 &value, int16 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(uint16* buf, uint16* adj_buf, const uint16 &value, uint16 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(int32* buf, int32* adj_buf, const int32 &value, int32 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(uint32* buf, uint32* adj_buf, const uint32 &value, uint32 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(int64* buf, int64* adj_buf, const int64 &value, int64 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(uint64* buf, uint64* adj_buf, const uint64 &value, uint64 &adj_value) { }
CUDA_CALLABLE inline void adj_atomic_minmax(bool* buf, bool* adj_buf, const bool &value, bool &adj_value) { }
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
} // namespace wp
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
// bool and printf are defined outside of the wp namespace in crt.h, hence
// their adjoint counterparts are also defined in the global namespace.
// Both adjoints are intentional no-ops: neither the bool() conversion nor
// printf() propagates any gradient.
template <typename T>
CUDA_CALLABLE inline void adj_bool(T, T&, bool) {}
inline CUDA_CALLABLE void adj_printf(const char* fmt, ...) {}
|
|
1314
|
+
|
|
1315
|
+
|
|
1316
|
+
#include "vec.h"
|
|
1317
|
+
#include "mat.h"
|
|
1318
|
+
#include "quat.h"
|
|
1319
|
+
#include "spatial.h"
|
|
1320
|
+
#include "intersect.h"
|
|
1321
|
+
#include "intersect_adj.h"
|
|
1322
|
+
|
|
1323
|
+
//--------------
|
|
1324
|
+
namespace wp
|
|
1325
|
+
{
|
|
1326
|
+
|
|
1327
|
+
|
|
1328
|
+
// dot for scalar types just to make some templates compile for scalar/vector
|
|
1329
|
+
inline CUDA_CALLABLE float dot(float a, float b) { return mul(a, b); }
|
|
1330
|
+
inline CUDA_CALLABLE void adj_dot(float a, float b, float& adj_a, float& adj_b, float adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }
|
|
1331
|
+
inline CUDA_CALLABLE float tensordot(float a, float b) { return mul(a, b); }
|
|
1332
|
+
|
|
1333
|
+
|
|
1334
|
+
// DECLARE_INTERP_FUNCS(T) stamps out interpolation helpers and their
// adjoints for scalar type T:
//  - smoothstep: Hermite interpolation, input clamped to [0, 1] and mapped
//    through x*x*(3 - 2x).
//  - adj_smoothstep: analytic gradients; returns early (no accumulation)
//    when x lies outside the open (edge0, edge1) ramp, where the
//    derivative is zero.
//  - lerp / adj_lerp: linear interpolation a*(1-t) + b*t and its adjoint.
// NOTE: comments cannot be placed inside the macro body itself because of
// the backslash line continuations.
#define DECLARE_INTERP_FUNCS(T) \
CUDA_CALLABLE inline T smoothstep(T edge0, T edge1, T x)\
{\
    x = clamp((x - edge0) / (edge1 - edge0), T(0), T(1));\
    return x * x * (T(3) - T(2) * x);\
}\
CUDA_CALLABLE inline void adj_smoothstep(T edge0, T edge1, T x, T& adj_edge0, T& adj_edge1, T& adj_x, T adj_ret)\
{\
    T ab = edge0 - edge1;\
    T ax = edge0 - x;\
    T bx = edge1 - x;\
    T xb = x - edge1;\
    \
    if (bx / ab >= T(0) || ax / ab <= T(0))\
    {\
        return;\
    }\
    \
    T ab3 = ab * ab * ab;\
    T ab4 = ab3 * ab;\
    adj_edge0 += adj_ret * ((T(6) * ax * bx * bx) / ab4);\
    adj_edge1 += adj_ret * ((T(6) * ax * ax * xb) / ab4);\
    adj_x += adj_ret * ((T(6) * ax * bx ) / ab3);\
}\
CUDA_CALLABLE inline T lerp(const T& a, const T& b, T t)\
{\
    return a*(T(1)-t) + b*t;\
}\
CUDA_CALLABLE inline void adj_lerp(const T& a, const T& b, T t, T& adj_a, T& adj_b, T& adj_t, const T& adj_ret)\
{\
    adj_a += adj_ret*(T(1)-t);\
    adj_b += adj_ret*t;\
    adj_t += b*adj_ret - a*adj_ret;\
}

// instantiate for all floating-point scalar widths used by kernels
DECLARE_INTERP_FUNCS(float16)
DECLARE_INTERP_FUNCS(float32)
DECLARE_INTERP_FUNCS(float64)
|
|
1372
|
+
|
|
1373
|
+
// print() overloads for built-in scalar types; each writes the value
// followed by a newline using the matching printf format specifier.
inline CUDA_CALLABLE void print(const str s)
{
    printf("%s\n", s);
}

inline CUDA_CALLABLE void print(int i)
{
    printf("%d\n", i);
}

inline CUDA_CALLABLE void print(short i)
{
    printf("%hd\n", i);
}

inline CUDA_CALLABLE void print(long i)
{
    printf("%ld\n", i);
}

inline CUDA_CALLABLE void print(long long i)
{
    printf("%lld\n", i);
}

inline CUDA_CALLABLE void print(unsigned i)
{
    printf("%u\n", i);
}

inline CUDA_CALLABLE void print(unsigned short i)
{
    printf("%hu\n", i);
}

inline CUDA_CALLABLE void print(unsigned long i)
{
    printf("%lu\n", i);
}

inline CUDA_CALLABLE void print(unsigned long long i)
{
    printf("%llu\n", i);
}
|
|
1417
|
+
|
|
1418
|
+
template<unsigned Length, typename Type>
|
|
1419
|
+
inline CUDA_CALLABLE void print(vec_t<Length, Type> v)
|
|
1420
|
+
{
|
|
1421
|
+
for( unsigned i=0; i < Length; ++i )
|
|
1422
|
+
{
|
|
1423
|
+
printf("%g ", float(v[i]));
|
|
1424
|
+
}
|
|
1425
|
+
printf("\n");
|
|
1426
|
+
}
|
|
1427
|
+
|
|
1428
|
+
// Print a quaternion as "x y z w" followed by a newline; components are
// widened to float for the %g format specifier.
template<typename Type>
inline CUDA_CALLABLE void print(quat_t<Type> i)
{
    printf("%g %g %g %g\n", float(i.x), float(i.y), float(i.z), float(i.w));
}
|
|
1433
|
+
|
|
1434
|
+
template<unsigned Rows,unsigned Cols,typename Type>
|
|
1435
|
+
inline CUDA_CALLABLE void print(const mat_t<Rows,Cols,Type> &m)
|
|
1436
|
+
{
|
|
1437
|
+
for( unsigned i=0; i< Rows; ++i )
|
|
1438
|
+
{
|
|
1439
|
+
for( unsigned j=0; j< Cols; ++j )
|
|
1440
|
+
{
|
|
1441
|
+
printf("%g ",float(m.data[i][j]));
|
|
1442
|
+
}
|
|
1443
|
+
printf("\n");
|
|
1444
|
+
}
|
|
1445
|
+
}
|
|
1446
|
+
|
|
1447
|
+
// Print a rigid transform as "(px py pz) (qx qy qz qw)" followed by a
// newline; components are widened to float for the %g format specifier.
template<typename Type>
inline CUDA_CALLABLE void print(transform_t<Type> t)
{
    printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w));
}
|
|
1452
|
+
|
|
1453
|
+
// adj_print() overloads for scalar types: print the primal value and its
// adjoint on one line. The half overload converts to float first since
// printf has no half-precision format specifier.
inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); }
inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(unsigned long long f, unsigned long long adj_f) { printf("%llu adj: %llu\n", f, adj_f); }
inline CUDA_CALLABLE void adj_print(half h, half adj_h) { printf("%g adj: %g\n", half_to_float(h), half_to_float(adj_h)); }
inline CUDA_CALLABLE void adj_print(double f, double adj_f) { printf("%g adj: %g\n", f, adj_f); }
|
|
1464
|
+
|
|
1465
|
+
template<unsigned Length, typename Type>
|
|
1466
|
+
inline CUDA_CALLABLE void adj_print(vec_t<Length, Type> v, vec_t<Length, Type>& adj_v) { printf("%g %g adj: %g %g \n", v[0], v[1], adj_v[0], adj_v[1]); }
|
|
1467
|
+
|
|
1468
|
+
// adj_print() overloads for composite types. The matrix, transform, and
// string overloads are intentional no-ops.
template<unsigned Rows, unsigned Cols, typename Type>
inline CUDA_CALLABLE void adj_print(mat_t<Rows, Cols, Type> m, mat_t<Rows, Cols, Type>& adj_m) { }

// NOTE(review): this passes Type components directly through the %g varargs
// slot, which is only well-defined when Type promotes to double — confirm
// for non-float quaternion types.
template<typename Type>
inline CUDA_CALLABLE void adj_print(quat_t<Type> q, quat_t<Type>& adj_q) { printf("%g %g %g %g adj: %g %g %g %g\n", q.x, q.y, q.z, q.w, adj_q.x, adj_q.y, adj_q.z, adj_q.w); }

template<typename Type>
inline CUDA_CALLABLE void adj_print(transform_t<Type> t, transform_t<Type>& adj_t) {}

inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {}
|
|
1478
|
+
|
|
1479
|
+
|
|
1480
|
+
template <typename T>
|
|
1481
|
+
inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected)
|
|
1482
|
+
{
|
|
1483
|
+
if (!(actual == expected))
|
|
1484
|
+
{
|
|
1485
|
+
printf("Error, expect_eq() failed:\n");
|
|
1486
|
+
printf("\t Expected: "); print(expected);
|
|
1487
|
+
printf("\t Actual: "); print(actual);
|
|
1488
|
+
}
|
|
1489
|
+
}
|
|
1490
|
+
|
|
1491
|
+
// Adjoint of expect_eq: checks propagate no gradients.
template <typename T>
inline CUDA_CALLABLE void adj_expect_eq(const T& a, const T& b, T& adj_a, T& adj_b)
{
    // nop
}
|
|
1496
|
+
|
|
1497
|
+
template <typename T>
|
|
1498
|
+
inline CUDA_CALLABLE void expect_neq(const T& actual, const T& expected)
|
|
1499
|
+
{
|
|
1500
|
+
if (actual == expected)
|
|
1501
|
+
{
|
|
1502
|
+
printf("Error, expect_neq() failed:\n");
|
|
1503
|
+
printf("\t Expected: "); print(expected);
|
|
1504
|
+
printf("\t Actual: "); print(actual);
|
|
1505
|
+
}
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
// Adjoint of expect_neq: checks propagate no gradients.
template <typename T>
inline CUDA_CALLABLE void adj_expect_neq(const T& a, const T& b, T& adj_a, T& adj_b)
{
    // nop
}
|
|
1513
|
+
|
|
1514
|
+
template <typename T>
|
|
1515
|
+
inline CUDA_CALLABLE void expect_near(const T& actual, const T& expected, const T& tolerance)
|
|
1516
|
+
{
|
|
1517
|
+
if (abs(actual - expected) > tolerance)
|
|
1518
|
+
{
|
|
1519
|
+
printf("Error, expect_near() failed with tolerance "); print(tolerance);
|
|
1520
|
+
printf("\t Expected: "); print(expected);
|
|
1521
|
+
printf("\t Actual: "); print(actual);
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1525
|
+
// vec3 overload of expect_near: compares the largest component-wise
// absolute difference against a scalar tolerance.
inline CUDA_CALLABLE void expect_near(const vec3& actual, const vec3& expected, const float& tolerance)
{
    float worst = abs(actual[0] - expected[0]);
    worst = max(worst, abs(actual[1] - expected[1]));
    worst = max(worst, abs(actual[2] - expected[2]));

    if (worst > tolerance)
    {
        printf("Error, expect_near() failed with tolerance ");
        print(tolerance);
        printf("\t Expected: ");
        print(expected);
        printf("\t Actual: ");
        print(actual);
    }
}
|
|
1535
|
+
|
|
1536
|
+
// Adjoints of expect_near: checks propagate no gradients.
template <typename T>
inline CUDA_CALLABLE void adj_expect_near(const T& actual, const T& expected, const T& tolerance, T& adj_actual, T& adj_expected, T& adj_tolerance)
{
    // nop
}

inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expected, float tolerance, vec3& adj_actual, vec3& adj_expected, float adj_tolerance)
{
    // nop
}
|
|
1546
|
+
|
|
1547
|
+
|
|
1548
|
+
} // namespace wp
|
|
1549
|
+
|
|
1550
|
+
// include array.h so we have the print, isfinite functions for the inner array types defined
|
|
1551
|
+
#include "array.h"
|
|
1552
|
+
#include "mesh.h"
|
|
1553
|
+
#include "bvh.h"
|
|
1554
|
+
#include "svd.h"
|
|
1555
|
+
#include "hashgrid.h"
|
|
1556
|
+
#include "volume.h"
|
|
1557
|
+
#include "range.h"
|
|
1558
|
+
#include "rand.h"
|
|
1559
|
+
#include "noise.h"
|
|
1560
|
+
#include "matnn.h"
|