warp_lang-1.10.0-py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic.

Files changed (468)
  1. warp/__init__.py +334 -0
  2. warp/__init__.pyi +5856 -0
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1077 -0
  5. warp/_src/build.py +620 -0
  6. warp/_src/build_dll.py +642 -0
  7. warp/_src/builtins.py +10555 -0
  8. warp/_src/codegen.py +4361 -0
  9. warp/_src/config.py +178 -0
  10. warp/_src/constants.py +59 -0
  11. warp/_src/context.py +8352 -0
  12. warp/_src/dlpack.py +464 -0
  13. warp/_src/fabric.py +362 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +510 -0
  16. warp/_src/fem/cache.py +689 -0
  17. warp/_src/fem/dirichlet.py +190 -0
  18. warp/_src/fem/domain.py +553 -0
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +703 -0
  21. warp/_src/fem/field/nodal_field.py +403 -0
  22. warp/_src/fem/field/restriction.py +39 -0
  23. warp/_src/fem/field/virtual.py +1021 -0
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/_src/fem/geometry/adaptive_nanogrid.py +782 -0
  26. warp/_src/fem/geometry/closest_point.py +99 -0
  27. warp/_src/fem/geometry/deformed_geometry.py +277 -0
  28. warp/_src/fem/geometry/element.py +854 -0
  29. warp/_src/fem/geometry/geometry.py +693 -0
  30. warp/_src/fem/geometry/grid_2d.py +478 -0
  31. warp/_src/fem/geometry/grid_3d.py +539 -0
  32. warp/_src/fem/geometry/hexmesh.py +956 -0
  33. warp/_src/fem/geometry/nanogrid.py +660 -0
  34. warp/_src/fem/geometry/partition.py +483 -0
  35. warp/_src/fem/geometry/quadmesh.py +597 -0
  36. warp/_src/fem/geometry/tetmesh.py +762 -0
  37. warp/_src/fem/geometry/trimesh.py +588 -0
  38. warp/_src/fem/integrate.py +2507 -0
  39. warp/_src/fem/linalg.py +385 -0
  40. warp/_src/fem/operator.py +398 -0
  41. warp/_src/fem/polynomial.py +231 -0
  42. warp/_src/fem/quadrature/__init__.py +17 -0
  43. warp/_src/fem/quadrature/pic_quadrature.py +318 -0
  44. warp/_src/fem/quadrature/quadrature.py +665 -0
  45. warp/_src/fem/space/__init__.py +248 -0
  46. warp/_src/fem/space/basis_function_space.py +499 -0
  47. warp/_src/fem/space/basis_space.py +681 -0
  48. warp/_src/fem/space/dof_mapper.py +253 -0
  49. warp/_src/fem/space/function_space.py +312 -0
  50. warp/_src/fem/space/grid_2d_function_space.py +179 -0
  51. warp/_src/fem/space/grid_3d_function_space.py +229 -0
  52. warp/_src/fem/space/hexmesh_function_space.py +255 -0
  53. warp/_src/fem/space/nanogrid_function_space.py +199 -0
  54. warp/_src/fem/space/partition.py +435 -0
  55. warp/_src/fem/space/quadmesh_function_space.py +222 -0
  56. warp/_src/fem/space/restriction.py +221 -0
  57. warp/_src/fem/space/shape/__init__.py +152 -0
  58. warp/_src/fem/space/shape/cube_shape_function.py +1107 -0
  59. warp/_src/fem/space/shape/shape_function.py +134 -0
  60. warp/_src/fem/space/shape/square_shape_function.py +928 -0
  61. warp/_src/fem/space/shape/tet_shape_function.py +829 -0
  62. warp/_src/fem/space/shape/triangle_shape_function.py +674 -0
  63. warp/_src/fem/space/tetmesh_function_space.py +270 -0
  64. warp/_src/fem/space/topology.py +461 -0
  65. warp/_src/fem/space/trimesh_function_space.py +193 -0
  66. warp/_src/fem/types.py +114 -0
  67. warp/_src/fem/utils.py +488 -0
  68. warp/_src/jax.py +188 -0
  69. warp/_src/jax_experimental/__init__.py +14 -0
  70. warp/_src/jax_experimental/custom_call.py +389 -0
  71. warp/_src/jax_experimental/ffi.py +1286 -0
  72. warp/_src/jax_experimental/xla_ffi.py +658 -0
  73. warp/_src/marching_cubes.py +710 -0
  74. warp/_src/math.py +416 -0
  75. warp/_src/optim/__init__.py +14 -0
  76. warp/_src/optim/adam.py +165 -0
  77. warp/_src/optim/linear.py +1608 -0
  78. warp/_src/optim/sgd.py +114 -0
  79. warp/_src/paddle.py +408 -0
  80. warp/_src/render/__init__.py +14 -0
  81. warp/_src/render/imgui_manager.py +291 -0
  82. warp/_src/render/render_opengl.py +3638 -0
  83. warp/_src/render/render_usd.py +939 -0
  84. warp/_src/render/utils.py +162 -0
  85. warp/_src/sparse.py +2718 -0
  86. warp/_src/tape.py +1208 -0
  87. warp/_src/thirdparty/__init__.py +0 -0
  88. warp/_src/thirdparty/appdirs.py +598 -0
  89. warp/_src/thirdparty/dlpack.py +145 -0
  90. warp/_src/thirdparty/unittest_parallel.py +676 -0
  91. warp/_src/torch.py +393 -0
  92. warp/_src/types.py +5888 -0
  93. warp/_src/utils.py +1695 -0
  94. warp/autograd.py +33 -0
  95. warp/bin/libwarp-clang.dylib +0 -0
  96. warp/bin/libwarp.dylib +0 -0
  97. warp/build.py +29 -0
  98. warp/build_dll.py +24 -0
  99. warp/codegen.py +24 -0
  100. warp/constants.py +24 -0
  101. warp/context.py +33 -0
  102. warp/dlpack.py +24 -0
  103. warp/examples/__init__.py +24 -0
  104. warp/examples/assets/bear.usd +0 -0
  105. warp/examples/assets/bunny.usd +0 -0
  106. warp/examples/assets/cube.usd +0 -0
  107. warp/examples/assets/nonuniform.usd +0 -0
  108. warp/examples/assets/nvidia_logo.png +0 -0
  109. warp/examples/assets/pixel.jpg +0 -0
  110. warp/examples/assets/rocks.nvdb +0 -0
  111. warp/examples/assets/rocks.usd +0 -0
  112. warp/examples/assets/sphere.usd +0 -0
  113. warp/examples/assets/square_cloth.usd +0 -0
  114. warp/examples/benchmarks/benchmark_api.py +389 -0
  115. warp/examples/benchmarks/benchmark_cloth.py +296 -0
  116. warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
  117. warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
  118. warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
  119. warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
  120. warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
  121. warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
  122. warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
  123. warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
  124. warp/examples/benchmarks/benchmark_gemm.py +164 -0
  125. warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
  126. warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
  127. warp/examples/benchmarks/benchmark_launches.py +301 -0
  128. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  129. warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
  130. warp/examples/browse.py +37 -0
  131. warp/examples/core/example_cupy.py +86 -0
  132. warp/examples/core/example_dem.py +241 -0
  133. warp/examples/core/example_fluid.py +299 -0
  134. warp/examples/core/example_graph_capture.py +150 -0
  135. warp/examples/core/example_marching_cubes.py +195 -0
  136. warp/examples/core/example_mesh.py +180 -0
  137. warp/examples/core/example_mesh_intersect.py +211 -0
  138. warp/examples/core/example_nvdb.py +182 -0
  139. warp/examples/core/example_raycast.py +111 -0
  140. warp/examples/core/example_raymarch.py +205 -0
  141. warp/examples/core/example_render_opengl.py +290 -0
  142. warp/examples/core/example_sample_mesh.py +300 -0
  143. warp/examples/core/example_sph.py +411 -0
  144. warp/examples/core/example_spin_lock.py +93 -0
  145. warp/examples/core/example_torch.py +211 -0
  146. warp/examples/core/example_wave.py +269 -0
  147. warp/examples/core/example_work_queue.py +118 -0
  148. warp/examples/distributed/example_jacobi_mpi.py +506 -0
  149. warp/examples/fem/example_adaptive_grid.py +286 -0
  150. warp/examples/fem/example_apic_fluid.py +469 -0
  151. warp/examples/fem/example_burgers.py +261 -0
  152. warp/examples/fem/example_convection_diffusion.py +181 -0
  153. warp/examples/fem/example_convection_diffusion_dg.py +225 -0
  154. warp/examples/fem/example_darcy_ls_optimization.py +489 -0
  155. warp/examples/fem/example_deformed_geometry.py +172 -0
  156. warp/examples/fem/example_diffusion.py +196 -0
  157. warp/examples/fem/example_diffusion_3d.py +225 -0
  158. warp/examples/fem/example_diffusion_mgpu.py +225 -0
  159. warp/examples/fem/example_distortion_energy.py +228 -0
  160. warp/examples/fem/example_elastic_shape_optimization.py +387 -0
  161. warp/examples/fem/example_magnetostatics.py +242 -0
  162. warp/examples/fem/example_mixed_elasticity.py +293 -0
  163. warp/examples/fem/example_navier_stokes.py +263 -0
  164. warp/examples/fem/example_nonconforming_contact.py +300 -0
  165. warp/examples/fem/example_stokes.py +213 -0
  166. warp/examples/fem/example_stokes_transfer.py +262 -0
  167. warp/examples/fem/example_streamlines.py +357 -0
  168. warp/examples/fem/utils.py +1047 -0
  169. warp/examples/interop/example_jax_callable.py +146 -0
  170. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  171. warp/examples/interop/example_jax_kernel.py +232 -0
  172. warp/examples/optim/example_diffray.py +561 -0
  173. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  174. warp/examples/tile/example_tile_block_cholesky.py +502 -0
  175. warp/examples/tile/example_tile_cholesky.py +88 -0
  176. warp/examples/tile/example_tile_convolution.py +66 -0
  177. warp/examples/tile/example_tile_fft.py +55 -0
  178. warp/examples/tile/example_tile_filtering.py +113 -0
  179. warp/examples/tile/example_tile_matmul.py +85 -0
  180. warp/examples/tile/example_tile_mcgp.py +191 -0
  181. warp/examples/tile/example_tile_mlp.py +385 -0
  182. warp/examples/tile/example_tile_nbody.py +199 -0
  183. warp/fabric.py +24 -0
  184. warp/fem/__init__.py +173 -0
  185. warp/fem/adaptivity.py +26 -0
  186. warp/fem/cache.py +30 -0
  187. warp/fem/dirichlet.py +24 -0
  188. warp/fem/field/__init__.py +24 -0
  189. warp/fem/field/field.py +26 -0
  190. warp/fem/geometry/__init__.py +21 -0
  191. warp/fem/geometry/closest_point.py +31 -0
  192. warp/fem/linalg.py +38 -0
  193. warp/fem/operator.py +32 -0
  194. warp/fem/polynomial.py +29 -0
  195. warp/fem/space/__init__.py +22 -0
  196. warp/fem/space/basis_space.py +24 -0
  197. warp/fem/space/shape/__init__.py +68 -0
  198. warp/fem/space/topology.py +24 -0
  199. warp/fem/types.py +24 -0
  200. warp/fem/utils.py +32 -0
  201. warp/jax.py +29 -0
  202. warp/jax_experimental/__init__.py +29 -0
  203. warp/jax_experimental/custom_call.py +29 -0
  204. warp/jax_experimental/ffi.py +39 -0
  205. warp/jax_experimental/xla_ffi.py +24 -0
  206. warp/marching_cubes.py +24 -0
  207. warp/math.py +37 -0
  208. warp/native/array.h +1687 -0
  209. warp/native/builtin.h +2327 -0
  210. warp/native/bvh.cpp +562 -0
  211. warp/native/bvh.cu +826 -0
  212. warp/native/bvh.h +555 -0
  213. warp/native/clang/clang.cpp +541 -0
  214. warp/native/coloring.cpp +622 -0
  215. warp/native/crt.cpp +51 -0
  216. warp/native/crt.h +568 -0
  217. warp/native/cuda_crt.h +1058 -0
  218. warp/native/cuda_util.cpp +677 -0
  219. warp/native/cuda_util.h +313 -0
  220. warp/native/error.cpp +77 -0
  221. warp/native/error.h +36 -0
  222. warp/native/exports.h +2023 -0
  223. warp/native/fabric.h +246 -0
  224. warp/native/hashgrid.cpp +311 -0
  225. warp/native/hashgrid.cu +89 -0
  226. warp/native/hashgrid.h +240 -0
  227. warp/native/initializer_array.h +41 -0
  228. warp/native/intersect.h +1253 -0
  229. warp/native/intersect_adj.h +375 -0
  230. warp/native/intersect_tri.h +348 -0
  231. warp/native/mat.h +5189 -0
  232. warp/native/mathdx.cpp +93 -0
  233. warp/native/matnn.h +221 -0
  234. warp/native/mesh.cpp +266 -0
  235. warp/native/mesh.cu +406 -0
  236. warp/native/mesh.h +2097 -0
  237. warp/native/nanovdb/GridHandle.h +533 -0
  238. warp/native/nanovdb/HostBuffer.h +591 -0
  239. warp/native/nanovdb/NanoVDB.h +6246 -0
  240. warp/native/nanovdb/NodeManager.h +323 -0
  241. warp/native/nanovdb/PNanoVDB.h +3390 -0
  242. warp/native/noise.h +859 -0
  243. warp/native/quat.h +1664 -0
  244. warp/native/rand.h +342 -0
  245. warp/native/range.h +145 -0
  246. warp/native/reduce.cpp +174 -0
  247. warp/native/reduce.cu +363 -0
  248. warp/native/runlength_encode.cpp +79 -0
  249. warp/native/runlength_encode.cu +61 -0
  250. warp/native/scan.cpp +47 -0
  251. warp/native/scan.cu +55 -0
  252. warp/native/scan.h +23 -0
  253. warp/native/solid_angle.h +466 -0
  254. warp/native/sort.cpp +251 -0
  255. warp/native/sort.cu +286 -0
  256. warp/native/sort.h +35 -0
  257. warp/native/sparse.cpp +241 -0
  258. warp/native/sparse.cu +435 -0
  259. warp/native/spatial.h +1306 -0
  260. warp/native/svd.h +727 -0
  261. warp/native/temp_buffer.h +46 -0
  262. warp/native/tile.h +4124 -0
  263. warp/native/tile_radix_sort.h +1112 -0
  264. warp/native/tile_reduce.h +838 -0
  265. warp/native/tile_scan.h +240 -0
  266. warp/native/tuple.h +189 -0
  267. warp/native/vec.h +2199 -0
  268. warp/native/version.h +23 -0
  269. warp/native/volume.cpp +501 -0
  270. warp/native/volume.cu +68 -0
  271. warp/native/volume.h +970 -0
  272. warp/native/volume_builder.cu +483 -0
  273. warp/native/volume_builder.h +52 -0
  274. warp/native/volume_impl.h +70 -0
  275. warp/native/warp.cpp +1143 -0
  276. warp/native/warp.cu +4604 -0
  277. warp/native/warp.h +358 -0
  278. warp/optim/__init__.py +20 -0
  279. warp/optim/adam.py +24 -0
  280. warp/optim/linear.py +35 -0
  281. warp/optim/sgd.py +24 -0
  282. warp/paddle.py +24 -0
  283. warp/py.typed +0 -0
  284. warp/render/__init__.py +22 -0
  285. warp/render/imgui_manager.py +29 -0
  286. warp/render/render_opengl.py +24 -0
  287. warp/render/render_usd.py +24 -0
  288. warp/render/utils.py +24 -0
  289. warp/sparse.py +51 -0
  290. warp/tape.py +24 -0
  291. warp/tests/__init__.py +1 -0
  292. warp/tests/__main__.py +4 -0
  293. warp/tests/assets/curlnoise_golden.npy +0 -0
  294. warp/tests/assets/mlp_golden.npy +0 -0
  295. warp/tests/assets/pixel.npy +0 -0
  296. warp/tests/assets/pnoise_golden.npy +0 -0
  297. warp/tests/assets/spiky.usd +0 -0
  298. warp/tests/assets/test_grid.nvdb +0 -0
  299. warp/tests/assets/test_index_grid.nvdb +0 -0
  300. warp/tests/assets/test_int32_grid.nvdb +0 -0
  301. warp/tests/assets/test_vec_grid.nvdb +0 -0
  302. warp/tests/assets/torus.nvdb +0 -0
  303. warp/tests/assets/torus.usda +105 -0
  304. warp/tests/aux_test_class_kernel.py +34 -0
  305. warp/tests/aux_test_compile_consts_dummy.py +18 -0
  306. warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
  307. warp/tests/aux_test_dependent.py +29 -0
  308. warp/tests/aux_test_grad_customs.py +29 -0
  309. warp/tests/aux_test_instancing_gc.py +26 -0
  310. warp/tests/aux_test_module_aot.py +7 -0
  311. warp/tests/aux_test_module_unload.py +23 -0
  312. warp/tests/aux_test_name_clash1.py +40 -0
  313. warp/tests/aux_test_name_clash2.py +40 -0
  314. warp/tests/aux_test_reference.py +9 -0
  315. warp/tests/aux_test_reference_reference.py +8 -0
  316. warp/tests/aux_test_square.py +16 -0
  317. warp/tests/aux_test_unresolved_func.py +22 -0
  318. warp/tests/aux_test_unresolved_symbol.py +22 -0
  319. warp/tests/cuda/__init__.py +0 -0
  320. warp/tests/cuda/test_async.py +676 -0
  321. warp/tests/cuda/test_conditional_captures.py +1147 -0
  322. warp/tests/cuda/test_ipc.py +124 -0
  323. warp/tests/cuda/test_mempool.py +233 -0
  324. warp/tests/cuda/test_multigpu.py +169 -0
  325. warp/tests/cuda/test_peer.py +139 -0
  326. warp/tests/cuda/test_pinned.py +84 -0
  327. warp/tests/cuda/test_streams.py +691 -0
  328. warp/tests/geometry/__init__.py +0 -0
  329. warp/tests/geometry/test_bvh.py +335 -0
  330. warp/tests/geometry/test_hash_grid.py +259 -0
  331. warp/tests/geometry/test_marching_cubes.py +294 -0
  332. warp/tests/geometry/test_mesh.py +318 -0
  333. warp/tests/geometry/test_mesh_query_aabb.py +392 -0
  334. warp/tests/geometry/test_mesh_query_point.py +935 -0
  335. warp/tests/geometry/test_mesh_query_ray.py +323 -0
  336. warp/tests/geometry/test_volume.py +1103 -0
  337. warp/tests/geometry/test_volume_write.py +346 -0
  338. warp/tests/interop/__init__.py +0 -0
  339. warp/tests/interop/test_dlpack.py +730 -0
  340. warp/tests/interop/test_jax.py +1673 -0
  341. warp/tests/interop/test_paddle.py +800 -0
  342. warp/tests/interop/test_torch.py +1001 -0
  343. warp/tests/run_coverage_serial.py +39 -0
  344. warp/tests/test_adam.py +162 -0
  345. warp/tests/test_arithmetic.py +1096 -0
  346. warp/tests/test_array.py +3756 -0
  347. warp/tests/test_array_reduce.py +156 -0
  348. warp/tests/test_assert.py +303 -0
  349. warp/tests/test_atomic.py +336 -0
  350. warp/tests/test_atomic_bitwise.py +209 -0
  351. warp/tests/test_atomic_cas.py +312 -0
  352. warp/tests/test_bool.py +220 -0
  353. warp/tests/test_builtins_resolution.py +732 -0
  354. warp/tests/test_closest_point_edge_edge.py +327 -0
  355. warp/tests/test_codegen.py +974 -0
  356. warp/tests/test_codegen_instancing.py +1495 -0
  357. warp/tests/test_compile_consts.py +215 -0
  358. warp/tests/test_conditional.py +298 -0
  359. warp/tests/test_context.py +35 -0
  360. warp/tests/test_copy.py +319 -0
  361. warp/tests/test_ctypes.py +618 -0
  362. warp/tests/test_dense.py +73 -0
  363. warp/tests/test_devices.py +127 -0
  364. warp/tests/test_enum.py +136 -0
  365. warp/tests/test_examples.py +424 -0
  366. warp/tests/test_fabricarray.py +998 -0
  367. warp/tests/test_fast_math.py +72 -0
  368. warp/tests/test_fem.py +2204 -0
  369. warp/tests/test_fixedarray.py +229 -0
  370. warp/tests/test_fp16.py +136 -0
  371. warp/tests/test_func.py +501 -0
  372. warp/tests/test_future_annotations.py +100 -0
  373. warp/tests/test_generics.py +656 -0
  374. warp/tests/test_grad.py +893 -0
  375. warp/tests/test_grad_customs.py +339 -0
  376. warp/tests/test_grad_debug.py +341 -0
  377. warp/tests/test_implicit_init.py +411 -0
  378. warp/tests/test_import.py +45 -0
  379. warp/tests/test_indexedarray.py +1140 -0
  380. warp/tests/test_intersect.py +103 -0
  381. warp/tests/test_iter.py +76 -0
  382. warp/tests/test_large.py +177 -0
  383. warp/tests/test_launch.py +411 -0
  384. warp/tests/test_lerp.py +151 -0
  385. warp/tests/test_linear_solvers.py +223 -0
  386. warp/tests/test_lvalue.py +427 -0
  387. warp/tests/test_map.py +526 -0
  388. warp/tests/test_mat.py +3515 -0
  389. warp/tests/test_mat_assign_copy.py +178 -0
  390. warp/tests/test_mat_constructors.py +573 -0
  391. warp/tests/test_mat_lite.py +122 -0
  392. warp/tests/test_mat_scalar_ops.py +2913 -0
  393. warp/tests/test_math.py +212 -0
  394. warp/tests/test_module_aot.py +287 -0
  395. warp/tests/test_module_hashing.py +258 -0
  396. warp/tests/test_modules_lite.py +70 -0
  397. warp/tests/test_noise.py +252 -0
  398. warp/tests/test_operators.py +299 -0
  399. warp/tests/test_options.py +129 -0
  400. warp/tests/test_overwrite.py +551 -0
  401. warp/tests/test_print.py +408 -0
  402. warp/tests/test_quat.py +2653 -0
  403. warp/tests/test_quat_assign_copy.py +145 -0
  404. warp/tests/test_rand.py +339 -0
  405. warp/tests/test_reload.py +303 -0
  406. warp/tests/test_rounding.py +157 -0
  407. warp/tests/test_runlength_encode.py +196 -0
  408. warp/tests/test_scalar_ops.py +133 -0
  409. warp/tests/test_smoothstep.py +108 -0
  410. warp/tests/test_snippet.py +318 -0
  411. warp/tests/test_sparse.py +845 -0
  412. warp/tests/test_spatial.py +2859 -0
  413. warp/tests/test_spatial_assign_copy.py +160 -0
  414. warp/tests/test_special_values.py +361 -0
  415. warp/tests/test_static.py +640 -0
  416. warp/tests/test_struct.py +901 -0
  417. warp/tests/test_tape.py +242 -0
  418. warp/tests/test_transient_module.py +93 -0
  419. warp/tests/test_triangle_closest_point.py +192 -0
  420. warp/tests/test_tuple.py +361 -0
  421. warp/tests/test_types.py +615 -0
  422. warp/tests/test_utils.py +594 -0
  423. warp/tests/test_vec.py +1408 -0
  424. warp/tests/test_vec_assign_copy.py +143 -0
  425. warp/tests/test_vec_constructors.py +325 -0
  426. warp/tests/test_vec_lite.py +80 -0
  427. warp/tests/test_vec_scalar_ops.py +2327 -0
  428. warp/tests/test_verify_fp.py +100 -0
  429. warp/tests/test_version.py +75 -0
  430. warp/tests/tile/__init__.py +0 -0
  431. warp/tests/tile/test_tile.py +1519 -0
  432. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  433. warp/tests/tile/test_tile_cholesky.py +608 -0
  434. warp/tests/tile/test_tile_load.py +724 -0
  435. warp/tests/tile/test_tile_mathdx.py +156 -0
  436. warp/tests/tile/test_tile_matmul.py +179 -0
  437. warp/tests/tile/test_tile_mlp.py +400 -0
  438. warp/tests/tile/test_tile_reduce.py +950 -0
  439. warp/tests/tile/test_tile_shared_memory.py +376 -0
  440. warp/tests/tile/test_tile_sort.py +121 -0
  441. warp/tests/tile/test_tile_view.py +173 -0
  442. warp/tests/unittest_serial.py +47 -0
  443. warp/tests/unittest_suites.py +430 -0
  444. warp/tests/unittest_utils.py +469 -0
  445. warp/tests/walkthrough_debug.py +95 -0
  446. warp/torch.py +24 -0
  447. warp/types.py +51 -0
  448. warp/utils.py +31 -0
  449. warp_lang-1.10.0.dist-info/METADATA +459 -0
  450. warp_lang-1.10.0.dist-info/RECORD +468 -0
  451. warp_lang-1.10.0.dist-info/WHEEL +5 -0
  452. warp_lang-1.10.0.dist-info/licenses/LICENSE.md +176 -0
  453. warp_lang-1.10.0.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  454. warp_lang-1.10.0.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  455. warp_lang-1.10.0.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  456. warp_lang-1.10.0.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  457. warp_lang-1.10.0.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  458. warp_lang-1.10.0.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  459. warp_lang-1.10.0.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  460. warp_lang-1.10.0.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  461. warp_lang-1.10.0.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  462. warp_lang-1.10.0.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  463. warp_lang-1.10.0.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  464. warp_lang-1.10.0.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  465. warp_lang-1.10.0.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  466. warp_lang-1.10.0.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  467. warp_lang-1.10.0.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  468. warp_lang-1.10.0.dist-info/top_level.txt +1 -0
warp/native/array.h ADDED
@@ -0,0 +1,1687 @@
1
+ /*
2
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: Apache-2.0
4
+ *
5
+ * Licensed under the Apache License, Version 2.0 (the "License");
6
+ * you may not use this file except in compliance with the License.
7
+ * You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ */
17
+
18
+ #pragma once
19
+
20
+ #include "builtin.h"
21
+
22
+ namespace wp
23
+ {
24
+
25
+ #if FP_CHECK
26
+
27
+ #define FP_ASSERT_FWD(value) \
28
+ print(value); \
29
+ printf(")\n"); \
30
+ assert(0); \
31
+
32
+ #define FP_ASSERT_ADJ(value, adj_value) \
33
+ print(value); \
34
+ printf(", "); \
35
+ print(adj_value); \
36
+ printf(")\n"); \
37
+ assert(0); \
38
+
39
+ #define FP_VERIFY_FWD(value) \
40
+ if (!isfinite(value)) { \
41
+ printf("%s:%d - %s(addr", __FILE__, __LINE__, __FUNCTION__); \
42
+ FP_ASSERT_FWD(value) \
43
+ } \
44
+
45
+ #define FP_VERIFY_FWD_1(value) \
46
+ if (!isfinite(value)) { \
47
+ printf("%s:%d - %s(arr, %d) ", __FILE__, __LINE__, __FUNCTION__, i); \
48
+ FP_ASSERT_FWD(value) \
49
+ } \
50
+
51
+ #define FP_VERIFY_FWD_2(value) \
52
+ if (!isfinite(value)) { \
53
+ printf("%s:%d - %s(arr, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j); \
54
+ FP_ASSERT_FWD(value) \
55
+ } \
56
+
57
+ #define FP_VERIFY_FWD_3(value) \
58
+ if (!isfinite(value)) { \
59
+ printf("%s:%d - %s(arr, %d, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j, k); \
60
+ FP_ASSERT_FWD(value) \
61
+ } \
62
+
63
+ #define FP_VERIFY_FWD_4(value) \
64
+ if (!isfinite(value)) { \
65
+ printf("%s:%d - %s(arr, %d, %d, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j, k, l); \
66
+ FP_ASSERT_FWD(value) \
67
+ } \
68
+
69
+ #define FP_VERIFY_ADJ(value, adj_value) \
70
+ if (!isfinite(value) || !isfinite(adj_value)) \
71
+ { \
72
+ printf("%s:%d - %s(addr", __FILE__, __LINE__, __FUNCTION__); \
73
+ FP_ASSERT_ADJ(value, adj_value); \
74
+ } \
75
+
76
+ #define FP_VERIFY_ADJ_1(value, adj_value) \
77
+ if (!isfinite(value) || !isfinite(adj_value)) \
78
+ { \
79
+ printf("%s:%d - %s(arr, %d) ", __FILE__, __LINE__, __FUNCTION__, i); \
80
+ FP_ASSERT_ADJ(value, adj_value); \
81
+ } \
82
+
83
+ #define FP_VERIFY_ADJ_2(value, adj_value) \
84
+ if (!isfinite(value) || !isfinite(adj_value)) \
85
+ { \
86
+ printf("%s:%d - %s(arr, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j); \
87
+ FP_ASSERT_ADJ(value, adj_value); \
88
+ } \
89
+
90
+ #define FP_VERIFY_ADJ_3(value, adj_value) \
91
+ if (!isfinite(value) || !isfinite(adj_value)) \
92
+ { \
93
+ printf("%s:%d - %s(arr, %d, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j, k); \
94
+ FP_ASSERT_ADJ(value, adj_value); \
95
+ } \
96
+
97
+ #define FP_VERIFY_ADJ_4(value, adj_value) \
98
+ if (!isfinite(value) || !isfinite(adj_value)) \
99
+ { \
100
+ printf("%s:%d - %s(arr, %d, %d, %d, %d) ", __FILE__, __LINE__, __FUNCTION__, i, j, k, l); \
101
+ FP_ASSERT_ADJ(value, adj_value); \
102
+ } \
103
+
104
+
105
+ #else
106
+
107
+ #define FP_VERIFY_FWD(value) {}
108
+ #define FP_VERIFY_FWD_1(value) {}
109
+ #define FP_VERIFY_FWD_2(value) {}
110
+ #define FP_VERIFY_FWD_3(value) {}
111
+ #define FP_VERIFY_FWD_4(value) {}
112
+
113
+ #define FP_VERIFY_ADJ(value, adj_value) {}
114
+ #define FP_VERIFY_ADJ_1(value, adj_value) {}
115
+ #define FP_VERIFY_ADJ_2(value, adj_value) {}
116
+ #define FP_VERIFY_ADJ_3(value, adj_value) {}
117
+ #define FP_VERIFY_ADJ_4(value, adj_value) {}
118
+
119
+ #endif // WP_FP_CHECK
120
+
121
+
122
+ template<size_t... Is>
123
+ struct index_sequence {};
124
+
125
+ template<size_t N, size_t... Is>
126
+ struct make_index_sequence_impl : make_index_sequence_impl<N-1, N-1, Is...> {};
127
+
128
+ template<size_t... Is>
129
+ struct make_index_sequence_impl<0, Is...>
130
+ {
131
+ using type = index_sequence<Is...>;
132
+ };
133
+
134
+ template<size_t N>
135
+ using make_index_sequence = typename make_index_sequence_impl<N>::type;
136
+
137
+
138
+ const int ARRAY_MAX_DIMS = 4; // must match constant in types.py
139
+
140
+ // must match constants in types.py
141
+ const int ARRAY_TYPE_REGULAR = 0;
142
+ const int ARRAY_TYPE_INDEXED = 1;
143
+ const int ARRAY_TYPE_FABRIC = 2;
144
+ const int ARRAY_TYPE_FABRIC_INDEXED = 3;
145
+
146
+ struct shape_t
147
+ {
148
+ int dims[ARRAY_MAX_DIMS];
149
+
150
+ CUDA_CALLABLE inline shape_t()
151
+ : dims()
152
+ {}
153
+
154
+ CUDA_CALLABLE inline int operator[](int i) const
155
+ {
156
+ assert(i < ARRAY_MAX_DIMS);
157
+ return dims[i];
158
+ }
159
+
160
+ CUDA_CALLABLE inline int& operator[](int i)
161
+ {
162
+ assert(i < ARRAY_MAX_DIMS);
163
+ return dims[i];
164
+ }
165
+ };
166
+
167
+ CUDA_CALLABLE inline int extract(const shape_t& s, int i)
168
+ {
169
+ return s.dims[i];
170
+ }
171
+
172
+ CUDA_CALLABLE inline void adj_extract(const shape_t& s, int i, const shape_t& adj_s, int adj_i, int adj_ret) {}
173
+
174
+ inline CUDA_CALLABLE void print(shape_t s)
175
+ {
176
+ // todo: only print valid dims, currently shape has a fixed size
177
+ // but we don't know how many dims are valid (e.g.: 1d, 2d, etc)
178
+ // should probably store ndim with shape
179
+ printf("(%d, %d, %d, %d)\n", s.dims[0], s.dims[1], s.dims[2], s.dims[3]);
180
+ }
181
+ inline CUDA_CALLABLE void adj_print(shape_t s, shape_t& adj_s) {}
182
+
183
+
184
+ template <typename T>
185
+ struct array_t
186
+ {
187
+ CUDA_CALLABLE inline array_t()
188
+ : data(nullptr),
189
+ grad(nullptr),
190
+ shape(),
191
+ strides(),
192
+ ndim(0)
193
+ {}
194
+
195
+ CUDA_CALLABLE array_t(T* data, int size, T* grad=nullptr) : data(data), grad(grad) {
196
+ // constructor for 1d array
197
+ shape.dims[0] = size;
198
+ shape.dims[1] = 0;
199
+ shape.dims[2] = 0;
200
+ shape.dims[3] = 0;
201
+ ndim = 1;
202
+ strides[0] = sizeof(T);
203
+ strides[1] = 0;
204
+ strides[2] = 0;
205
+ strides[3] = 0;
206
+ }
207
+ CUDA_CALLABLE array_t(T* data, int dim0, int dim1, T* grad=nullptr) : data(data), grad(grad) {
208
+ // constructor for 2d array
209
+ shape.dims[0] = dim0;
210
+ shape.dims[1] = dim1;
211
+ shape.dims[2] = 0;
212
+ shape.dims[3] = 0;
213
+ ndim = 2;
214
+ strides[0] = dim1 * sizeof(T);
215
+ strides[1] = sizeof(T);
216
+ strides[2] = 0;
217
+ strides[3] = 0;
218
+ }
219
+ CUDA_CALLABLE array_t(T* data, int dim0, int dim1, int dim2, T* grad=nullptr) : data(data), grad(grad) {
220
+ // constructor for 3d array
221
+ shape.dims[0] = dim0;
222
+ shape.dims[1] = dim1;
223
+ shape.dims[2] = dim2;
224
+ shape.dims[3] = 0;
225
+ ndim = 3;
226
+ strides[0] = dim1 * dim2 * sizeof(T);
227
+ strides[1] = dim2 * sizeof(T);
228
+ strides[2] = sizeof(T);
229
+ strides[3] = 0;
230
+ }
231
+ CUDA_CALLABLE array_t(T* data, int dim0, int dim1, int dim2, int dim3, T* grad=nullptr) : data(data), grad(grad) {
232
+ // constructor for 4d array
233
+ shape.dims[0] = dim0;
234
+ shape.dims[1] = dim1;
235
+ shape.dims[2] = dim2;
236
+ shape.dims[3] = dim3;
237
+ ndim = 4;
238
+ strides[0] = dim1 * dim2 * dim3 * sizeof(T);
239
+ strides[1] = dim2 * dim3 * sizeof(T);
240
+ strides[2] = dim3 * sizeof(T);
241
+ strides[3] = sizeof(T);
242
+ }
243
+
244
+ CUDA_CALLABLE array_t(uint64 data, int size, uint64 grad=0)
245
+ : array_t((T*)(data), size, (T*)(grad))
246
+ {}
247
+
248
+ CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, uint64 grad=0)
249
+ : array_t((T*)(data), dim0, dim1, (T*)(grad))
250
+ {}
251
+
252
+ CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, int dim2, uint64 grad=0)
253
+ : array_t((T*)(data), dim0, dim1, dim2, (T*)(grad))
254
+ {}
255
+
256
+ CUDA_CALLABLE array_t(uint64 data, int dim0, int dim1, int dim2, int dim3, uint64 grad=0)
257
+ : array_t((T*)(data), dim0, dim1, dim2, dim3, (T*)(grad))
258
+ {}
259
+
260
+ CUDA_CALLABLE inline bool empty() const { return !data; }
261
+
262
+ T* data;
263
+ T* grad;
264
+ shape_t shape;
265
+ int strides[ARRAY_MAX_DIMS];
266
+ int ndim;
267
+
268
+ CUDA_CALLABLE inline operator T*() const { return data; }
269
+ };
270
+
271
+
272
+ // Required when compiling adjoints.
273
+ template <typename T>
274
+ inline CUDA_CALLABLE array_t<T> add(
275
+ const array_t<T>& a, const array_t<T>& b
276
+ )
277
+ {
278
+ return array_t<T>();
279
+ }
280
+
281
+
282
+ // Stack-allocated counterpart to `array_t<T>`.
283
+ // Useful for small buffers that have their shape known at compile-time,
284
+ // and that gain from having array semantics instead of vectors.
285
+ template <int Size, typename T>
286
+ struct fixedarray_t : array_t<T>
287
+ {
288
+ using Base = array_t<T>;
289
+
290
+ static_assert(Size > 0, "Expected Size > 0");
291
+
292
+ CUDA_CALLABLE inline fixedarray_t()
293
+ : Base(storage, Size), storage()
294
+ {}
295
+
296
+ CUDA_CALLABLE fixedarray_t(int dim0, T* grad=nullptr)
297
+ : Base(storage, dim0, grad), storage()
298
+ {
299
+ assert(Size == dim0);
300
+ }
301
+
302
+ CUDA_CALLABLE fixedarray_t(int dim0, int dim1, T* grad=nullptr)
303
+ : Base(storage, dim0, dim1, grad), storage()
304
+ {
305
+ assert(Size == dim0 * dim1);
306
+ }
307
+
308
+ CUDA_CALLABLE fixedarray_t(int dim0, int dim1, int dim2, T* grad=nullptr)
309
+ : Base(storage, dim0, dim1, dim2, grad), storage()
310
+ {
311
+ assert(Size == dim0 * dim1 * dim2);
312
+ }
313
+
314
+ CUDA_CALLABLE fixedarray_t(int dim0, int dim1, int dim2, int dim3, T* grad=nullptr)
315
+ : Base(storage, dim0, dim1, dim2, dim3, grad), storage()
316
+ {
317
+ assert(Size == dim0 * dim1 * dim2 * dim3);
318
+ }
319
+
320
+ CUDA_CALLABLE fixedarray_t<Size, T>& operator=(const fixedarray_t<Size, T>& other)
321
+ {
322
+ for (unsigned int i = 0; i < Size; ++i)
323
+ {
324
+ this->storage[i] = other.storage[i];
325
+ }
326
+
327
+ this->data = this->storage;
328
+ this->grad = nullptr;
329
+ this->shape = other.shape;
330
+
331
+ for (unsigned int i = 0; i < ARRAY_MAX_DIMS; ++i)
332
+ {
333
+ this->strides[i] = other.strides[i];
334
+ }
335
+
336
+ this->ndim = other.ndim;
337
+
338
+ return *this;
339
+ }
340
+
341
+ T storage[Size];
342
+ };
343
+
344
+
345
+ // Required when compiling adjoints.
346
+ template <int Size, typename T>
347
+ inline CUDA_CALLABLE fixedarray_t<Size, T> add(
348
+ const fixedarray_t<Size, T>& a, const fixedarray_t<Size, T>& b
349
+ )
350
+ {
351
+ return fixedarray_t<Size, T>();
352
+ }
353
+
354
+
355
+ // TODO:
356
+ // - templated index type?
357
+ // - templated dimensionality? (also for array_t to save space when passing arrays to kernels)
358
+ template <typename T>
359
+ struct indexedarray_t
360
+ {
361
+ CUDA_CALLABLE inline indexedarray_t()
362
+ : arr(),
363
+ indices(),
364
+ shape()
365
+ {}
366
+
367
+ CUDA_CALLABLE inline bool empty() const { return !arr.data; }
368
+
369
+ array_t<T> arr;
370
+ int* indices[ARRAY_MAX_DIMS]; // index array per dimension (can be NULL)
371
+ shape_t shape; // element count per dimension (num. indices if indexed, array dim if not)
372
+ };
373
+
374
+
375
+ // return stride (in bytes) of the given index
376
+ template <typename T>
377
+ CUDA_CALLABLE inline size_t stride(const array_t<T>& a, int dim)
378
+ {
379
+ return size_t(a.strides[dim]);
380
+ }
381
+
382
+ template <typename T>
383
+ CUDA_CALLABLE inline T* data_at_byte_offset(const array_t<T>& a, size_t byte_offset)
384
+ {
385
+ return reinterpret_cast<T*>(reinterpret_cast<char*>(a.data) + byte_offset);
386
+ }
387
+
388
+ template <typename T>
389
+ CUDA_CALLABLE inline T* grad_at_byte_offset(const array_t<T>& a, size_t byte_offset)
390
+ {
391
+ return reinterpret_cast<T*>(reinterpret_cast<char*>(a.grad) + byte_offset);
392
+ }
393
+
394
+ template <typename T>
395
+ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i)
396
+ {
397
+ assert(i >= 0 && i < arr.shape[0]);
398
+
399
+ return i*stride(arr, 0);
400
+ }
401
+
402
+ template <typename T>
403
+ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j)
404
+ {
405
+ // if (i < 0 || i >= arr.shape[0])
406
+ // printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]);
407
+
408
+ // if (j < 0 || j >= arr.shape[1])
409
+ // printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]);
410
+
411
+
412
+ assert(i >= 0 && i < arr.shape[0]);
413
+ assert(j >= 0 && j < arr.shape[1]);
414
+
415
+ return i*stride(arr, 0) + j*stride(arr, 1);
416
+ }
417
+
418
+ template <typename T>
419
+ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j, int k)
420
+ {
421
+ assert(i >= 0 && i < arr.shape[0]);
422
+ assert(j >= 0 && j < arr.shape[1]);
423
+ assert(k >= 0 && k < arr.shape[2]);
424
+
425
+ return i*stride(arr, 0) + j*stride(arr, 1) + k*stride(arr, 2);
426
+ }
427
+
428
+ template <typename T>
429
+ CUDA_CALLABLE inline size_t byte_offset(const array_t<T>& arr, int i, int j, int k, int l)
430
+ {
431
+ assert(i >= 0 && i < arr.shape[0]);
432
+ assert(j >= 0 && j < arr.shape[1]);
433
+ assert(k >= 0 && k < arr.shape[2]);
434
+ assert(l >= 0 && l < arr.shape[3]);
435
+
436
+ return i*stride(arr, 0) + j*stride(arr, 1) + k*stride(arr, 2) + l*stride(arr, 3);
437
+ }
438
+
439
+ template <typename T>
440
+ CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i)
441
+ {
442
+ assert(arr.ndim == 1);
443
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
444
+
445
+ if (i < 0)
446
+ {
447
+ i += arr.shape[0];
448
+ }
449
+
450
+ T& result = *data_at_byte_offset(arr, byte_offset(arr, i));
451
+ FP_VERIFY_FWD_1(result)
452
+
453
+ return result;
454
+ }
455
+
456
+ template <typename T>
457
+ CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j)
458
+ {
459
+ assert(arr.ndim == 2);
460
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
461
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
462
+
463
+ if (i < 0)
464
+ {
465
+ i += arr.shape[0];
466
+ }
467
+ if (j < 0)
468
+ {
469
+ j += arr.shape[1];
470
+ }
471
+
472
+ T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j));
473
+ FP_VERIFY_FWD_2(result)
474
+
475
+ return result;
476
+ }
477
+
478
+ template <typename T>
479
+ CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j, int k)
480
+ {
481
+ assert(arr.ndim == 3);
482
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
483
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
484
+ assert(k >= -arr.shape[2] && k < arr.shape[2]);
485
+
486
+ if (i < 0)
487
+ {
488
+ i += arr.shape[0];
489
+ }
490
+ if (j < 0)
491
+ {
492
+ j += arr.shape[1];
493
+ }
494
+ if (k < 0)
495
+ {
496
+ k += arr.shape[2];
497
+ }
498
+
499
+ T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j, k));
500
+ FP_VERIFY_FWD_3(result)
501
+
502
+ return result;
503
+ }
504
+
505
+ template <typename T>
506
+ CUDA_CALLABLE inline T& index(const array_t<T>& arr, int i, int j, int k, int l)
507
+ {
508
+ assert(arr.ndim == 4);
509
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
510
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
511
+ assert(k >= -arr.shape[2] && k < arr.shape[2]);
512
+ assert(l >= -arr.shape[3] && l < arr.shape[3]);
513
+
514
+ if (i < 0)
515
+ {
516
+ i += arr.shape[0];
517
+ }
518
+ if (j < 0)
519
+ {
520
+ j += arr.shape[1];
521
+ }
522
+ if (k < 0)
523
+ {
524
+ k += arr.shape[2];
525
+ }
526
+ if (l < 0)
527
+ {
528
+ l += arr.shape[3];
529
+ }
530
+
531
+ T& result = *data_at_byte_offset(arr, byte_offset(arr, i, j, k, l));
532
+ FP_VERIFY_FWD_4(result)
533
+
534
+ return result;
535
+ }
536
+
537
+ template <typename T>
538
+ CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i)
539
+ {
540
+ assert(arr.ndim == 1);
541
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
542
+
543
+ if (i < 0)
544
+ {
545
+ i += arr.shape[0];
546
+ }
547
+
548
+ T& result = *grad_at_byte_offset(arr, byte_offset(arr, i));
549
+ FP_VERIFY_FWD_1(result)
550
+
551
+ return result;
552
+ }
553
+
554
+ template <typename T>
555
+ CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j)
556
+ {
557
+ assert(arr.ndim == 2);
558
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
559
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
560
+
561
+ if (i < 0)
562
+ {
563
+ i += arr.shape[0];
564
+ }
565
+ if (j < 0)
566
+ {
567
+ j += arr.shape[1];
568
+ }
569
+
570
+ T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j));
571
+ FP_VERIFY_FWD_2(result)
572
+
573
+ return result;
574
+ }
575
+
576
+ template <typename T>
577
+ CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j, int k)
578
+ {
579
+ assert(arr.ndim == 3);
580
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
581
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
582
+ assert(k >= -arr.shape[2] && k < arr.shape[2]);
583
+
584
+ if (i < 0)
585
+ {
586
+ i += arr.shape[0];
587
+ }
588
+ if (j < 0)
589
+ {
590
+ j += arr.shape[1];
591
+ }
592
+ if (k < 0)
593
+ {
594
+ k += arr.shape[2];
595
+ }
596
+
597
+ T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j, k));
598
+ FP_VERIFY_FWD_3(result)
599
+
600
+ return result;
601
+ }
602
+
603
+ template <typename T>
604
+ CUDA_CALLABLE inline T& index_grad(const array_t<T>& arr, int i, int j, int k, int l)
605
+ {
606
+ assert(arr.ndim == 4);
607
+ assert(i >= -arr.shape[0] && i < arr.shape[0]);
608
+ assert(j >= -arr.shape[1] && j < arr.shape[1]);
609
+ assert(k >= -arr.shape[2] && k < arr.shape[2]);
610
+ assert(l >= -arr.shape[3] && l < arr.shape[3]);
611
+
612
+ if (i < 0)
613
+ {
614
+ i += arr.shape[0];
615
+ }
616
+ if (j < 0)
617
+ {
618
+ j += arr.shape[1];
619
+ }
620
+ if (k < 0)
621
+ {
622
+ k += arr.shape[2];
623
+ }
624
+ if (l < 0)
625
+ {
626
+ l += arr.shape[3];
627
+ }
628
+
629
+ T& result = *grad_at_byte_offset(arr, byte_offset(arr, i, j, k, l));
630
+ FP_VERIFY_FWD_4(result)
631
+
632
+ return result;
633
+ }
634
+
635
+
636
+ template <typename T>
637
+ CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i)
638
+ {
639
+ assert(iarr.arr.ndim == 1);
640
+ assert(i >= -iarr.shape[0] && i < iarr.shape[0]);
641
+
642
+ if (i < 0)
643
+ {
644
+ i += iarr.shape[0];
645
+ }
646
+
647
+ if (iarr.indices[0])
648
+ {
649
+ i = iarr.indices[0][i];
650
+ assert(i >= 0 && i < iarr.arr.shape[0]);
651
+ }
652
+
653
+ T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i));
654
+ FP_VERIFY_FWD_1(result)
655
+
656
+ return result;
657
+ }
658
+
659
+ template <typename T>
660
+ CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j)
661
+ {
662
+ assert(iarr.arr.ndim == 2);
663
+ assert(i >= -iarr.shape[0] && i < iarr.shape[0]);
664
+ assert(j >= -iarr.shape[1] && j < iarr.shape[1]);
665
+
666
+ if (i < 0)
667
+ {
668
+ i += iarr.shape[0];
669
+ }
670
+ if (j < 0)
671
+ {
672
+ j += iarr.shape[1];
673
+ }
674
+
675
+ if (iarr.indices[0])
676
+ {
677
+ i = iarr.indices[0][i];
678
+ assert(i >= 0 && i < iarr.arr.shape[0]);
679
+ }
680
+ if (iarr.indices[1])
681
+ {
682
+ j = iarr.indices[1][j];
683
+ assert(j >= 0 && j < iarr.arr.shape[1]);
684
+ }
685
+
686
+ T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j));
687
+ FP_VERIFY_FWD_1(result)
688
+
689
+ return result;
690
+ }
691
+
692
+ template <typename T>
693
+ CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j, int k)
694
+ {
695
+ assert(iarr.arr.ndim == 3);
696
+ assert(i >= -iarr.shape[0] && i < iarr.shape[0]);
697
+ assert(j >= -iarr.shape[1] && j < iarr.shape[1]);
698
+ assert(k >= -iarr.shape[2] && k < iarr.shape[2]);
699
+
700
+ if (i < 0)
701
+ {
702
+ i += iarr.shape[0];
703
+ }
704
+ if (j < 0)
705
+ {
706
+ j += iarr.shape[1];
707
+ }
708
+ if (k < 0)
709
+ {
710
+ k += iarr.shape[2];
711
+ }
712
+
713
+ if (iarr.indices[0])
714
+ {
715
+ i = iarr.indices[0][i];
716
+ assert(i >= 0 && i < iarr.arr.shape[0]);
717
+ }
718
+ if (iarr.indices[1])
719
+ {
720
+ j = iarr.indices[1][j];
721
+ assert(j >= 0 && j < iarr.arr.shape[1]);
722
+ }
723
+ if (iarr.indices[2])
724
+ {
725
+ k = iarr.indices[2][k];
726
+ assert(k >= 0 && k < iarr.arr.shape[2]);
727
+ }
728
+
729
+ T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j, k));
730
+ FP_VERIFY_FWD_1(result)
731
+
732
+ return result;
733
+ }
734
+
735
+ template <typename T>
736
+ CUDA_CALLABLE inline T& index(const indexedarray_t<T>& iarr, int i, int j, int k, int l)
737
+ {
738
+ assert(iarr.arr.ndim == 4);
739
+ assert(i >= -iarr.shape[0] && i < iarr.shape[0]);
740
+ assert(j >= -iarr.shape[1] && j < iarr.shape[1]);
741
+ assert(k >= -iarr.shape[2] && k < iarr.shape[2]);
742
+ assert(l >= -iarr.shape[3] && l < iarr.shape[3]);
743
+
744
+ if (i < 0)
745
+ {
746
+ i += iarr.shape[0];
747
+ }
748
+ if (j < 0)
749
+ {
750
+ j += iarr.shape[1];
751
+ }
752
+ if (k < 0)
753
+ {
754
+ k += iarr.shape[2];
755
+ }
756
+ if (l < 0)
757
+ {
758
+ l += iarr.shape[3];
759
+ }
760
+
761
+ if (iarr.indices[0])
762
+ {
763
+ i = iarr.indices[0][i];
764
+ assert(i >= 0 && i < iarr.arr.shape[0]);
765
+ }
766
+ if (iarr.indices[1])
767
+ {
768
+ j = iarr.indices[1][j];
769
+ assert(j >= 0 && j < iarr.arr.shape[1]);
770
+ }
771
+ if (iarr.indices[2])
772
+ {
773
+ k = iarr.indices[2][k];
774
+ assert(k >= 0 && k < iarr.arr.shape[2]);
775
+ }
776
+ if (iarr.indices[3])
777
+ {
778
+ l = iarr.indices[3][l];
779
+ assert(l >= 0 && l < iarr.arr.shape[3]);
780
+ }
781
+
782
+ T& result = *data_at_byte_offset(iarr.arr, byte_offset(iarr.arr, i, j, k, l));
783
+ FP_VERIFY_FWD_1(result)
784
+
785
+ return result;
786
+ }
787
+
788
+
789
+ template <typename T>
790
+ CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i)
791
+ {
792
+ assert(src.ndim > 1);
793
+ assert(i >= -src.shape[0] && i < src.shape[0]);
794
+
795
+ if (i < 0)
796
+ {
797
+ i += src.shape[0];
798
+ }
799
+
800
+ array_t<T> a;
801
+ size_t offset = byte_offset(src, i);
802
+ a.data = data_at_byte_offset(src, offset);
803
+ if (src.grad)
804
+ a.grad = grad_at_byte_offset(src, offset);
805
+ a.shape[0] = src.shape[1];
806
+ a.shape[1] = src.shape[2];
807
+ a.shape[2] = src.shape[3];
808
+ a.strides[0] = src.strides[1];
809
+ a.strides[1] = src.strides[2];
810
+ a.strides[2] = src.strides[3];
811
+ a.ndim = src.ndim-1;
812
+
813
+ return a;
814
+ }
815
+
816
+ template <typename T>
817
+ CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i, int j)
818
+ {
819
+ assert(src.ndim > 2);
820
+ assert(i >= -src.shape[0] && i < src.shape[0]);
821
+ assert(j >= -src.shape[1] && j < src.shape[1]);
822
+
823
+ if (i < 0)
824
+ {
825
+ i += src.shape[0];
826
+ }
827
+ if (j < 0)
828
+ {
829
+ j += src.shape[1];
830
+ }
831
+
832
+ array_t<T> a;
833
+ size_t offset = byte_offset(src, i, j);
834
+ a.data = data_at_byte_offset(src, offset);
835
+ if (src.grad)
836
+ a.grad = grad_at_byte_offset(src, offset);
837
+ a.shape[0] = src.shape[2];
838
+ a.shape[1] = src.shape[3];
839
+ a.strides[0] = src.strides[2];
840
+ a.strides[1] = src.strides[3];
841
+ a.ndim = src.ndim-2;
842
+
843
+ return a;
844
+ }
845
+
846
+ template <typename T>
847
+ CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, int i, int j, int k)
848
+ {
849
+ assert(src.ndim > 3);
850
+ assert(i >= -src.shape[0] && i < src.shape[0]);
851
+ assert(j >= -src.shape[1] && j < src.shape[1]);
852
+ assert(k >= -src.shape[2] && k < src.shape[2]);
853
+
854
+ if (i < 0)
855
+ {
856
+ i += src.shape[0];
857
+ }
858
+ if (j < 0)
859
+ {
860
+ j += src.shape[1];
861
+ }
862
+ if (k < 0)
863
+ {
864
+ k += src.shape[2];
865
+ }
866
+
867
+ array_t<T> a;
868
+ size_t offset = byte_offset(src, i, j, k);
869
+ a.data = data_at_byte_offset(src, offset);
870
+ if (src.grad)
871
+ a.grad = grad_at_byte_offset(src, offset);
872
+ a.shape[0] = src.shape[3];
873
+ a.strides[0] = src.strides[3];
874
+ a.ndim = src.ndim-3;
875
+
876
+ return a;
877
+ }
878
+
879
+
880
+ template <typename T, size_t... Idxs>
881
+ size_t byte_offset_helper(
882
+ array_t<T>& src,
883
+ const slice_t (&slices)[sizeof...(Idxs)],
884
+ index_sequence<Idxs...>
885
+ )
886
+ {
887
+ return byte_offset(src, slices[Idxs].start...);
888
+ }
889
+
890
+
891
+ template <typename T, typename... Slices>
892
+ CUDA_CALLABLE inline array_t<T> view(array_t<T>& src, const Slices&... slice_args)
893
+ {
894
+ constexpr int N = sizeof...(Slices);
895
+ static_assert(N >= 1 && N <= 4, "view supports 1 to 4 slices");
896
+ assert(src.ndim >= N);
897
+
898
+ slice_t slices[N] = { slice_args... };
899
+ int slice_idxs[N];
900
+ int slice_count = 0;
901
+
902
+ for (int i = 0; i < N; ++i)
903
+ {
904
+ if (slices[i].step == 0)
905
+ {
906
+ // We have a slice representing an integer index.
907
+ if (slices[i].start < 0)
908
+ {
909
+ slices[i].start += src.shape[i];
910
+ }
911
+ }
912
+ else
913
+ {
914
+ slices[i] = slice_adjust_indices(slices[i], src.shape[i]);
915
+ slice_idxs[slice_count] = i;
916
+ ++slice_count;
917
+ }
918
+ }
919
+
920
+ size_t offset = byte_offset_helper(src, slices, make_index_sequence<N>{});
921
+
922
+ array_t<T> out;
923
+
924
+ out.data = data_at_byte_offset(src, offset);
925
+ if (src.grad)
926
+ {
927
+ out.grad = grad_at_byte_offset(src, offset);
928
+ }
929
+
930
+ int dim = 0;
931
+ for (; dim < slice_count; ++dim)
932
+ {
933
+ int idx = slice_idxs[dim];
934
+ out.shape[dim] = slice_get_length(slices[idx]);
935
+ out.strides[dim] = src.strides[idx] * slices[idx].step;
936
+ }
937
+ for (; dim < slice_count + 4 - N; ++dim)
938
+ {
939
+ out.shape[dim] = src.shape[dim - slice_count + N];
940
+ out.strides[dim] = src.strides[dim - slice_count + N];
941
+ }
942
+ for (; dim < 4; ++dim)
943
+ {
944
+ out.shape[dim] = 0;
945
+ out.strides[dim] = 0;
946
+ }
947
+
948
+ out.ndim = src.ndim + slice_count - N;
949
+ return out;
950
+ }
951
+
952
+ template <typename T>
953
+ CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i)
954
+ {
955
+ assert(src.arr.ndim > 1);
956
+
957
+ if (src.indices[0])
958
+ {
959
+ assert(i >= -src.shape[0] && i < src.shape[0]);
960
+ if (i < 0)
961
+ {
962
+ i += src.shape[0];
963
+ }
964
+ i = src.indices[0][i];
965
+ }
966
+
967
+ indexedarray_t<T> a;
968
+ a.arr = view(src.arr, i);
969
+ a.indices[0] = src.indices[1];
970
+ a.indices[1] = src.indices[2];
971
+ a.indices[2] = src.indices[3];
972
+ a.shape[0] = src.shape[1];
973
+ a.shape[1] = src.shape[2];
974
+ a.shape[2] = src.shape[3];
975
+
976
+ return a;
977
+ }
978
+
979
+ template <typename T>
980
+ CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i, int j)
981
+ {
982
+ assert(src.arr.ndim > 2);
983
+
984
+ if (src.indices[0])
985
+ {
986
+ assert(i >= -src.shape[0] && i < src.shape[0]);
987
+ if (i < 0)
988
+ {
989
+ i += src.shape[0];
990
+ }
991
+ i = src.indices[0][i];
992
+ }
993
+ if (src.indices[1])
994
+ {
995
+ assert(j >= -src.shape[1] && j < src.shape[1]);
996
+ if (j < 0)
997
+ {
998
+ j += src.shape[1];
999
+ }
1000
+ j = src.indices[1][j];
1001
+ }
1002
+
1003
+ indexedarray_t<T> a;
1004
+ a.arr = view(src.arr, i, j);
1005
+ a.indices[0] = src.indices[2];
1006
+ a.indices[1] = src.indices[3];
1007
+ a.shape[0] = src.shape[2];
1008
+ a.shape[1] = src.shape[3];
1009
+
1010
+ return a;
1011
+ }
1012
+
1013
+ template <typename T>
1014
+ CUDA_CALLABLE inline indexedarray_t<T> view(indexedarray_t<T>& src, int i, int j, int k)
1015
+ {
1016
+ assert(src.arr.ndim > 3);
1017
+
1018
+ if (src.indices[0])
1019
+ {
1020
+ assert(i >= -src.shape[0] && i < src.shape[0]);
1021
+ if (i < 0)
1022
+ {
1023
+ i += src.shape[0];
1024
+ }
1025
+ i = src.indices[0][i];
1026
+ }
1027
+ if (src.indices[1])
1028
+ {
1029
+ assert(j >= -src.shape[1] && j < src.shape[1]);
1030
+ if (j < 0)
1031
+ {
1032
+ j += src.shape[1];
1033
+ }
1034
+ j = src.indices[1][j];
1035
+ }
1036
+ if (src.indices[2])
1037
+ {
1038
+ assert(k >= -src.shape[2] && k < src.shape[2]);
1039
+ if (k < 0)
1040
+ {
1041
+ k += src.shape[2];
1042
+ }
1043
+ k = src.indices[2][k];
1044
+ }
1045
+
1046
+ indexedarray_t<T> a;
1047
+ a.arr = view(src.arr, i, j, k);
1048
+ a.indices[0] = src.indices[3];
1049
+ a.shape[0] = src.shape[3];
1050
+
1051
+ return a;
1052
+ }
1053
+
1054
+ template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
1055
+ inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, A2<T>& adj_src, int adj_i, A3<T>& adj_ret) {}
1056
+ template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
1057
+ inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, int j, A2<T>& adj_src, int adj_i, int adj_j, A3<T>& adj_ret) {}
1058
+ template<template<typename> class A1, template<typename> class A2, template<typename> class A3, typename T>
1059
+ inline CUDA_CALLABLE void adj_view(A1<T>& src, int i, int j, int k, A2<T>& adj_src, int adj_i, int adj_j, int adj_k, A3<T>& adj_ret) {}
1060
+
1061
+ template <typename... Args>
1062
+ CUDA_CALLABLE inline void adj_view(Args&&...) { }
1063
+
1064
+ // TODO: lower_bound() for indexed arrays?
1065
+
1066
+ template <typename T>
1067
+ CUDA_CALLABLE inline int lower_bound(const array_t<T>& arr, int arr_begin, int arr_end, T value)
1068
+ {
1069
+ assert(arr.ndim == 1);
1070
+
1071
+ int lower = arr_begin;
1072
+ int upper = arr_end - 1;
1073
+
1074
+ while(lower < upper)
1075
+ {
1076
+ int mid = lower + (upper - lower) / 2;
1077
+
1078
+ if (arr[mid] < value)
1079
+ {
1080
+ lower = mid + 1;
1081
+ }
1082
+ else
1083
+ {
1084
+ upper = mid;
1085
+ }
1086
+ }
1087
+
1088
+ return lower;
1089
+ }
1090
+
1091
+ template <typename T>
1092
+ CUDA_CALLABLE inline int lower_bound(const array_t<T>& arr, T value)
1093
+ {
1094
+ return lower_bound(arr, 0, arr.shape[0], value);
1095
+ }
1096
+
1097
+ template <typename T> inline CUDA_CALLABLE void adj_lower_bound(const array_t<T>& arr, T value, array_t<T> adj_arr, T adj_value, int adj_ret) {}
1098
+ template <typename T> inline CUDA_CALLABLE void adj_lower_bound(const array_t<T>& arr, int arr_begin, int arr_end, T value, array_t<T> adj_arr, int adj_arr_begin, int adj_arr_end, T adj_value, int adj_ret) {}
1099
+
1100
+ template<template<typename> class A, typename T>
1101
+ inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, T value) { return atomic_add(&index(buf, i), value); }
1102
+ template<template<typename> class A, typename T>
1103
+ inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, T value) { return atomic_add(&index(buf, i, j), value); }
1104
+ template<template<typename> class A, typename T>
1105
+ inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, int k, T value) { return atomic_add(&index(buf, i, j, k), value); }
1106
+ template<template<typename> class A, typename T>
1107
+ inline CUDA_CALLABLE T atomic_add(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_add(&index(buf, i, j, k, l), value); }
1108
+
1109
+ template<template<typename> class A, typename T>
1110
+ inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, T value) { return atomic_add(&index(buf, i), -value); }
1111
+ template<template<typename> class A, typename T>
1112
+ inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, T value) { return atomic_add(&index(buf, i, j), -value); }
1113
+ template<template<typename> class A, typename T>
1114
+ inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, int k, T value) { return atomic_add(&index(buf, i, j, k), -value); }
1115
+ template<template<typename> class A, typename T>
1116
+ inline CUDA_CALLABLE T atomic_sub(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_add(&index(buf, i, j, k, l), -value); }
1117
+
1118
+ template<template<typename> class A, typename T>
1119
+ inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, T value) { return atomic_min(&index(buf, i), value); }
1120
+ template<template<typename> class A, typename T>
1121
+ inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, T value) { return atomic_min(&index(buf, i, j), value); }
1122
+ template<template<typename> class A, typename T>
1123
+ inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, int k, T value) { return atomic_min(&index(buf, i, j, k), value); }
1124
+ template<template<typename> class A, typename T>
1125
+ inline CUDA_CALLABLE T atomic_min(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_min(&index(buf, i, j, k, l), value); }
1126
+
1127
+ template<template<typename> class A, typename T>
1128
+ inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, T value) { return atomic_max(&index(buf, i), value); }
1129
+ template<template<typename> class A, typename T>
1130
+ inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, T value) { return atomic_max(&index(buf, i, j), value); }
1131
+ template<template<typename> class A, typename T>
1132
+ inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, T value) { return atomic_max(&index(buf, i, j, k), value); }
1133
+ template<template<typename> class A, typename T>
1134
+ inline CUDA_CALLABLE T atomic_max(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_max(&index(buf, i, j, k, l), value); }
1135
+
1136
+ template<template<typename> class A, typename T>
1137
+ inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, T old_value, T new_value) { return atomic_cas(&index(buf, i), old_value, new_value); }
1138
+ template<template<typename> class A, typename T>
1139
+ inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, T old_value, T new_value) { return atomic_cas(&index(buf, i, j), old_value, new_value); }
1140
+ template<template<typename> class A, typename T>
1141
+ inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, int k, T old_value, T new_value) { return atomic_cas(&index(buf, i, j, k), old_value, new_value); }
1142
+ template<template<typename> class A, typename T>
1143
+ inline CUDA_CALLABLE T atomic_cas(const A<T>& buf, int i, int j, int k, int l, T old_value, T new_value) { return atomic_cas(&index(buf, i, j, k, l), old_value, new_value); }
1144
+
1145
+ template<template<typename> class A, typename T>
1146
+ inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, T value) { return atomic_exch(&index(buf, i), value); }
1147
+ template<template<typename> class A, typename T>
1148
+ inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, T value) { return atomic_exch(&index(buf, i, j), value); }
1149
+ template<template<typename> class A, typename T>
1150
+ inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, int k, T value) { return atomic_exch(&index(buf, i, j, k), value); }
1151
+ template<template<typename> class A, typename T>
1152
+ inline CUDA_CALLABLE T atomic_exch(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_exch(&index(buf, i, j, k, l), value); }
1153
+
1154
+ template<template<typename> class A, typename T>
1155
+ inline CUDA_CALLABLE T atomic_and(const A<T>& buf, int i, T value) { return atomic_and(&index(buf, i), value); }
1156
+ template<template<typename> class A, typename T>
1157
+ inline CUDA_CALLABLE T atomic_and(const A<T>& buf, int i, int j, T value) { return atomic_and(&index(buf, i, j), value); }
1158
+ template<template<typename> class A, typename T>
1159
+ inline CUDA_CALLABLE T atomic_and(const A<T>& buf, int i, int j, int k, T value) { return atomic_and(&index(buf, i, j, k), value); }
1160
+ template<template<typename> class A, typename T>
1161
+ inline CUDA_CALLABLE T atomic_and(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_and(&index(buf, i, j, k, l), value); }
1162
+
1163
+ template<template<typename> class A, typename T>
1164
+ inline CUDA_CALLABLE T atomic_or(const A<T>& buf, int i, T value) { return atomic_or(&index(buf, i), value); }
1165
+ template<template<typename> class A, typename T>
1166
+ inline CUDA_CALLABLE T atomic_or(const A<T>& buf, int i, int j, T value) { return atomic_or(&index(buf, i, j), value); }
1167
+ template<template<typename> class A, typename T>
1168
+ inline CUDA_CALLABLE T atomic_or(const A<T>& buf, int i, int j, int k, T value) { return atomic_or(&index(buf, i, j, k), value); }
1169
+ template<template<typename> class A, typename T>
1170
+ inline CUDA_CALLABLE T atomic_or(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_or(&index(buf, i, j, k, l), value); }
1171
+
1172
+ template<template<typename> class A, typename T>
1173
+ inline CUDA_CALLABLE T atomic_xor(const A<T>& buf, int i, T value) { return atomic_xor(&index(buf, i), value); }
1174
+ template<template<typename> class A, typename T>
1175
+ inline CUDA_CALLABLE T atomic_xor(const A<T>& buf, int i, int j, T value) { return atomic_xor(&index(buf, i, j), value); }
1176
+ template<template<typename> class A, typename T>
1177
+ inline CUDA_CALLABLE T atomic_xor(const A<T>& buf, int i, int j, int k, T value) { return atomic_xor(&index(buf, i, j, k), value); }
1178
+ template<template<typename> class A, typename T>
1179
+ inline CUDA_CALLABLE T atomic_xor(const A<T>& buf, int i, int j, int k, int l, T value) { return atomic_xor(&index(buf, i, j, k, l), value); }
1180
+
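// A minimal usage sketch (hypothetical helper, not part of the original source),
// assuming only array_t and the wrappers declared above: each array-level atomic
// resolves the element via index() and forwards to the pointer-based atomic.
template<typename T>
inline CUDA_CALLABLE void example_track_extrema(const array_t<T>& values, int tid,
                                                const array_t<T>& lo, const array_t<T>& hi)
{
    T v = index(values, tid);
    atomic_min(lo, 0, v);   // forwards to atomic_min(&index(lo, 0), v)
    atomic_max(hi, 0, v);   // forwards to atomic_max(&index(hi, 0), v)
}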
1181
+ template<template<typename> class A, typename T>
1182
+ inline CUDA_CALLABLE T* address(const A<T>& buf, int i)
1183
+ {
1184
+ return &index(buf, i); // cppcheck-suppress returnDanglingLifetime
1185
+ }
1186
+ template<template<typename> class A, typename T>
1187
+ inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j)
1188
+ {
1189
+ return &index(buf, i, j); // cppcheck-suppress returnDanglingLifetime
1190
+ }
1191
+ template<template<typename> class A, typename T>
1192
+ inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k)
1193
+ {
1194
+ return &index(buf, i, j, k); // cppcheck-suppress returnDanglingLifetime
1195
+ }
1196
+ template<template<typename> class A, typename T>
1197
+ inline CUDA_CALLABLE T* address(const A<T>& buf, int i, int j, int k, int l)
1198
+ {
1199
+ return &index(buf, i, j, k, l); // cppcheck-suppress returnDanglingLifetime
1200
+ }
1201
+
1202
+ template<template<typename> class A, typename T>
1203
+ inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, T value)
1204
+ {
1205
+ FP_VERIFY_FWD_1(value)
1206
+
1207
+ index(buf, i) = value;
1208
+ }
1209
+ template<template<typename> class A, typename T>
1210
+ inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, T value)
1211
+ {
1212
+ FP_VERIFY_FWD_2(value)
1213
+
1214
+ index(buf, i, j) = value;
1215
+ }
1216
+ template<template<typename> class A, typename T>
1217
+ inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, T value)
1218
+ {
1219
+ FP_VERIFY_FWD_3(value)
1220
+
1221
+ index(buf, i, j, k) = value;
1222
+ }
1223
+ template<template<typename> class A, typename T>
1224
+ inline CUDA_CALLABLE void array_store(const A<T>& buf, int i, int j, int k, int l, T value)
1225
+ {
1226
+ FP_VERIFY_FWD_4(value)
1227
+
1228
+ index(buf, i, j, k, l) = value;
1229
+ }
1230
+
1231
+ template<typename T>
1232
+ inline CUDA_CALLABLE void store(T* address, T value)
1233
+ {
1234
+ FP_VERIFY_FWD(value)
1235
+
1236
+ *address = value;
1237
+ }
1238
+
1239
+ template<typename T>
1240
+ inline CUDA_CALLABLE T load(T* address)
1241
+ {
1242
+ T value = *address;
1243
+ FP_VERIFY_FWD(value)
1244
+
1245
+ return value;
1246
+ }
1247
+
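// A minimal sketch (hypothetical helper, not part of the original source) showing
// how address(), load(), store() and array_store() compose for a single element:
template<typename T>
inline CUDA_CALLABLE void example_copy_element(const array_t<T>& src, const array_t<T>& dst, int i)
{
    T* p = address(src, i);      // &index(src, i)
    T value = load(p);           // applies FP_VERIFY_FWD to the loaded value
    array_store(dst, i, value);  // differentiable write, unlike raw store()
}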
1248
+ // where() overload for array condition - returns a if array.data is non-null, otherwise returns b
1249
+ template <typename T1, typename T2>
1250
+ CUDA_CALLABLE inline T2 where(const array_t<T1>& arr, const T2& a, const T2& b) { return arr.data?a:b; }
1251
+
1252
+ template <typename T1, typename T2>
1253
+ CUDA_CALLABLE inline void adj_where(const array_t<T1>& arr, const T2& a, const T2& b, const array_t<T1>& adj_cond, T2& adj_a, T2& adj_b, const T2& adj_ret)
1254
+ {
1255
+ if (arr.data)
1256
+ adj_a += adj_ret;
1257
+ else
1258
+ adj_b += adj_ret;
1259
+ }
1260
+
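// A minimal sketch (hypothetical helper, not part of the original source): the array
// overload of where() above treats "does this optional array hold data?" as the
// condition, and adj_where() routes the incoming gradient to whichever branch was taken.
inline CUDA_CALLABLE float example_pick_scale(const array_t<float>& opt_scales, float fallback)
{
    // returns 2.0f only when opt_scales.data is non-null, otherwise fallback
    return where(opt_scales, 2.0f, fallback);
}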
1261
+ // stub for the case where we have a nested array inside a struct and
1262
+ // atomically add the whole struct onto an array (e.g. during the backward pass)
1263
+ template <typename T>
1264
+ CUDA_CALLABLE inline void atomic_add(array_t<T>*, array_t<T>) {}
1265
+
1266
+ // for float and vector types this is just an alias for an atomic add
1267
+ template <typename T>
1268
+ CUDA_CALLABLE inline void adj_atomic_add(T* buf, T value) { atomic_add(buf, value); }
1269
+
1270
+
1271
+ // for integral types we do not accumulate gradients
1272
+ CUDA_CALLABLE inline void adj_atomic_add(int8* buf, int8 value) { }
1273
+ CUDA_CALLABLE inline void adj_atomic_add(uint8* buf, uint8 value) { }
1274
+ CUDA_CALLABLE inline void adj_atomic_add(int16* buf, int16 value) { }
1275
+ CUDA_CALLABLE inline void adj_atomic_add(uint16* buf, uint16 value) { }
1276
+ CUDA_CALLABLE inline void adj_atomic_add(int32* buf, int32 value) { }
1277
+ CUDA_CALLABLE inline void adj_atomic_add(uint32* buf, uint32 value) { }
1278
+ CUDA_CALLABLE inline void adj_atomic_add(int64* buf, int64 value) { }
1279
+ CUDA_CALLABLE inline void adj_atomic_add(uint64* buf, uint64 value) { }
1280
+
1281
+ CUDA_CALLABLE inline void adj_atomic_add(bool* buf, bool value) { }
1282
+
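// A minimal sketch (hypothetical helper, not part of the original source), assuming the
// pointer-based atomic_add overloads defined earlier: the no-op overloads above let the
// same generated adjoint code compile for both differentiable and integral element types.
inline CUDA_CALLABLE void example_adj_accumulate(float* f_grad, int32* i_grad)
{
    adj_atomic_add(f_grad, 1.0f);     // accumulates via atomic_add(f_grad, 1.0f)
    adj_atomic_add(i_grad, int32(1)); // no-op: integral types carry no gradient
}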
1283
+ // only generate gradients for dense array_t<T> arrays (generic array types fall through to the no-op stubs further below)
1284
+ template<typename T>
1285
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, const array_t<T>& adj_buf, int adj_i, const T& adj_output)
1286
+ {
1287
+ if (adj_buf.data)
1288
+ adj_atomic_add(&index(adj_buf, i), adj_output);
1289
+ else if (buf.grad)
1290
+ adj_atomic_add(&index_grad(buf, i), adj_output);
1291
+ }
1292
+ template<typename T>
1293
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, const array_t<T>& adj_buf, int adj_i, int adj_j, const T& adj_output)
1294
+ {
1295
+ if (adj_buf.data)
1296
+ adj_atomic_add(&index(adj_buf, i, j), adj_output);
1297
+ else if (buf.grad)
1298
+ adj_atomic_add(&index_grad(buf, i, j), adj_output);
1299
+ }
1300
+ template<typename T>
1301
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output)
1302
+ {
1303
+ if (adj_buf.data)
1304
+ adj_atomic_add(&index(adj_buf, i, j, k), adj_output);
1305
+ else if (buf.grad)
1306
+ adj_atomic_add(&index_grad(buf, i, j, k), adj_output);
1307
+ }
1308
+ template<typename T>
1309
+ inline CUDA_CALLABLE void adj_address(const array_t<T>& buf, int i, int j, int k, int l, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output)
1310
+ {
1311
+ if (adj_buf.data)
1312
+ adj_atomic_add(&index(adj_buf, i, j, k, l), adj_output);
1313
+ else if (buf.grad)
1314
+ adj_atomic_add(&index_grad(buf, i, j, k, l), adj_output);
1315
+ }
1316
+
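// A minimal sketch (hypothetical helper, not part of the original source) of the call
// shape for the 1d case: adj_address() accumulates into the explicit adjoint array when
// adj_buf.data is set, otherwise into the .grad buffer attached to the primal array.
template<typename T>
inline CUDA_CALLABLE void example_backprop_read(const array_t<T>& buf, int i,
                                                const array_t<T>& adj_buf, const T& adj_out)
{
    adj_address(buf, i, adj_buf, i, adj_out);
}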
1317
+ template<typename T>
1318
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value)
1319
+ {
1320
+ if (adj_buf.data)
1321
+ adj_value += index(adj_buf, i);
1322
+ else if (buf.grad)
1323
+ adj_value += index_grad(buf, i);
1324
+
1325
+ FP_VERIFY_ADJ_1(value, adj_value)
1326
+ }
1327
+ template<typename T>
1328
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value)
1329
+ {
1330
+ if (adj_buf.data)
1331
+ adj_value += index(adj_buf, i, j);
1332
+ else if (buf.grad)
1333
+ adj_value += index_grad(buf, i, j);
1334
+
1335
+ FP_VERIFY_ADJ_2(value, adj_value)
1336
+ }
1337
+ template<typename T>
1338
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value)
1339
+ {
1340
+ if (adj_buf.data)
1341
+ adj_value += index(adj_buf, i, j, k);
1342
+ else if (buf.grad)
1343
+ adj_value += index_grad(buf, i, j, k);
1344
+
1345
+ FP_VERIFY_ADJ_3(value, adj_value)
1346
+ }
1347
+ template<typename T>
1348
+ inline CUDA_CALLABLE void adj_array_store(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value)
1349
+ {
1350
+ if (adj_buf.data)
1351
+ adj_value += index(adj_buf, i, j, k, l);
1352
+ else if (buf.grad)
1353
+ adj_value += index_grad(buf, i, j, k, l);
1354
+
1355
+ FP_VERIFY_ADJ_4(value, adj_value)
1356
+ }
1357
+
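// A minimal sketch (hypothetical helper, not part of the original source): the reverse of
// array_store(buf, i, value) reads the stored element's gradient back into adj_value,
// again preferring the explicit adjoint array and falling back to buf.grad.
template<typename T>
inline CUDA_CALLABLE void example_backprop_write(const array_t<T>& buf, int i, T value,
                                                 const array_t<T>& adj_buf, T& adj_value)
{
    adj_array_store(buf, i, value, adj_buf, i, adj_value);
}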
1358
+ template<typename T>
1359
+ inline CUDA_CALLABLE void adj_store(const T* address, T value, const T& adj_address, T& adj_value)
1360
+ {
1361
+ // nop; generic store() operations are not differentiable, only array_store() is
1362
+ FP_VERIFY_ADJ(value, adj_value)
1363
+ }
1364
+
1365
+ template<typename T>
1366
+ inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& adj_value)
1367
+ {
1368
+ // nop; generic load() operations are not differentiable
1369
+ }
1370
+
1371
+ template<typename T>
1372
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
1373
+ {
1374
+ if (adj_buf.data)
1375
+ adj_value += index(adj_buf, i);
1376
+ else if (buf.grad)
1377
+ adj_value += index_grad(buf, i);
1378
+
1379
+ FP_VERIFY_ADJ_1(value, adj_value)
1380
+ }
1381
+ template<typename T>
1382
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
1383
+ {
1384
+ if (adj_buf.data)
1385
+ adj_value += index(adj_buf, i, j);
1386
+ else if (buf.grad)
1387
+ adj_value += index_grad(buf, i, j);
1388
+
1389
+ FP_VERIFY_ADJ_2(value, adj_value)
1390
+ }
1391
+ template<typename T>
1392
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
1393
+ {
1394
+ if (adj_buf.data)
1395
+ adj_value += index(adj_buf, i, j, k);
1396
+ else if (buf.grad)
1397
+ adj_value += index_grad(buf, i, j, k);
1398
+
1399
+ FP_VERIFY_ADJ_3(value, adj_value)
1400
+ }
1401
+ template<typename T>
1402
+ inline CUDA_CALLABLE void adj_atomic_add(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
1403
+ {
1404
+ if (adj_buf.data)
1405
+ adj_value += index(adj_buf, i, j, k, l);
1406
+ else if (buf.grad)
1407
+ adj_value += index_grad(buf, i, j, k, l);
1408
+
1409
+ FP_VERIFY_ADJ_4(value, adj_value)
1410
+ }
1411
+
1412
+ template<typename T>
1413
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, T value, const array_t<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret)
1414
+ {
1415
+ if (adj_buf.data)
1416
+ adj_value -= index(adj_buf, i);
1417
+ else if (buf.grad)
1418
+ adj_value -= index_grad(buf, i);
1419
+
1420
+ FP_VERIFY_ADJ_1(value, adj_value)
1421
+ }
1422
+ template<typename T>
1423
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret)
1424
+ {
1425
+ if (adj_buf.data)
1426
+ adj_value -= index(adj_buf, i, j);
1427
+ else if (buf.grad)
1428
+ adj_value -= index_grad(buf, i, j);
1429
+
1430
+ FP_VERIFY_ADJ_2(value, adj_value)
1431
+ }
1432
+ template<typename T>
1433
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret)
1434
+ {
1435
+ if (adj_buf.data)
1436
+ adj_value -= index(adj_buf, i, j, k);
1437
+ else if (buf.grad)
1438
+ adj_value -= index_grad(buf, i, j, k);
1439
+
1440
+ FP_VERIFY_ADJ_3(value, adj_value)
1441
+ }
1442
+ template<typename T>
1443
+ inline CUDA_CALLABLE void adj_atomic_sub(const array_t<T>& buf, int i, int j, int k, int l, T value, const array_t<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret)
1444
+ {
1445
+ if (adj_buf.data)
1446
+ adj_value -= index(adj_buf, i, j, k, l);
1447
+ else if (buf.grad)
1448
+ adj_value -= index_grad(buf, i, j, k, l);
1449
+
1450
+ FP_VERIFY_ADJ_4(value, adj_value)
1451
+ }
1452
+
1453
+ // generic array types that do not support gradient computation (indexedarray, etc.)
1454
+ template<template<typename> class A1, template<typename> class A2, typename T>
1455
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, const A2<T>& adj_buf, int adj_i, const T& adj_output) {}
1456
+ template<template<typename> class A1, template<typename> class A2, typename T>
1457
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, const A2<T>& adj_buf, int adj_i, int adj_j, const T& adj_output) {}
1458
+ template<template<typename> class A1, template<typename> class A2, typename T>
1459
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output) {}
1460
+ template<template<typename> class A1, template<typename> class A2, typename T>
1461
+ inline CUDA_CALLABLE void adj_address(const A1<T>& buf, int i, int j, int k, int l, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output) {}
1462
+
1463
+ template<template<typename> class A1, template<typename> class A2, typename T>
1464
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value) {}
1465
+ template<template<typename> class A1, template<typename> class A2, typename T>
1466
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value) {}
1467
+ template<template<typename> class A1, template<typename> class A2, typename T>
1468
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value) {}
1469
+ template<template<typename> class A1, template<typename> class A2, typename T>
1470
+ inline CUDA_CALLABLE void adj_array_store(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value) {}
1471
+
1472
+ template<template<typename> class A1, template<typename> class A2, typename T>
1473
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1474
+ template<template<typename> class A1, template<typename> class A2, typename T>
1475
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1476
+ template<template<typename> class A1, template<typename> class A2, typename T>
1477
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1478
+ template<template<typename> class A1, template<typename> class A2, typename T>
1479
+ inline CUDA_CALLABLE void adj_atomic_add(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1480
+
1481
+ template<template<typename> class A1, template<typename> class A2, typename T>
1482
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1483
+ template<template<typename> class A1, template<typename> class A2, typename T>
1484
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1485
+ template<template<typename> class A1, template<typename> class A2, typename T>
1486
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1487
+ template<template<typename> class A1, template<typename> class A2, typename T>
1488
+ inline CUDA_CALLABLE void adj_atomic_sub(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1489
+
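// Note (added commentary, not part of the original source): the empty overloads above let
// generated backward code call adj_address / adj_array_store / adj_atomic_* uniformly even
// when the array argument is a non-differentiable view type such as the indexed arrays
// mentioned above; for those types the adjoint is simply dropped rather than failing to compile.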
1490
+ // generic handler for scalar values
1491
+ template<template<typename> class A1, template<typename> class A2, typename T>
1492
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
1493
+ if (adj_buf.data)
1494
+ adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
1495
+ else if (buf.grad)
1496
+ adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);
1497
+
1498
+ FP_VERIFY_ADJ_1(value, adj_value)
1499
+ }
1500
+ template<template<typename> class A1, template<typename> class A2, typename T>
1501
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
1502
+ if (adj_buf.data)
1503
+ adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
1504
+ else if (buf.grad)
1505
+ adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);
1506
+
1507
+ FP_VERIFY_ADJ_2(value, adj_value)
1508
+ }
1509
+ template<template<typename> class A1, template<typename> class A2, typename T>
1510
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
1511
+ if (adj_buf.data)
1512
+ adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
1513
+ else if (buf.grad)
1514
+ adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);
1515
+
1516
+ FP_VERIFY_ADJ_3(value, adj_value)
1517
+ }
1518
+ template<template<typename> class A1, template<typename> class A2, typename T>
1519
+ inline CUDA_CALLABLE void adj_atomic_min(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
1520
+ if (adj_buf.data)
1521
+ adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
1522
+ else if (buf.grad)
1523
+ adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);
1524
+
1525
+ FP_VERIFY_ADJ_4(value, adj_value)
1526
+ }
1527
+
1528
+ template<template<typename> class A1, template<typename> class A2, typename T>
1529
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
1530
+ if (adj_buf.data)
1531
+ adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value);
1532
+ else if (buf.grad)
1533
+ adj_atomic_minmax(&index(buf, i), &index_grad(buf, i), value, adj_value);
1534
+
1535
+ FP_VERIFY_ADJ_1(value, adj_value)
1536
+ }
1537
+ template<template<typename> class A1, template<typename> class A2, typename T>
1538
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
1539
+ if (adj_buf.data)
1540
+ adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value);
1541
+ else if (buf.grad)
1542
+ adj_atomic_minmax(&index(buf, i, j), &index_grad(buf, i, j), value, adj_value);
1543
+
1544
+ FP_VERIFY_ADJ_2(value, adj_value)
1545
+ }
1546
+ template<template<typename> class A1, template<typename> class A2, typename T>
1547
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
1548
+ if (adj_buf.data)
1549
+ adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value);
1550
+ else if (buf.grad)
1551
+ adj_atomic_minmax(&index(buf, i, j, k), &index_grad(buf, i, j, k), value, adj_value);
1552
+
1553
+ FP_VERIFY_ADJ_3(value, adj_value)
1554
+ }
1555
+ template<template<typename> class A1, template<typename> class A2, typename T>
1556
+ inline CUDA_CALLABLE void adj_atomic_max(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
1557
+ if (adj_buf.data)
1558
+ adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value);
1559
+ else if (buf.grad)
1560
+ adj_atomic_minmax(&index(buf, i, j, k, l), &index_grad(buf, i, j, k, l), value, adj_value);
1561
+
1562
+ FP_VERIFY_ADJ_4(value, adj_value)
1563
+ }
1564
+
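// Note (added commentary, not part of the original source): the min/max adjoints above defer
// to adj_atomic_minmax(), which Warp declares before this point, passing both the stored
// element and its gradient slot; as elsewhere, the gradient destination is the explicit
// adjoint array when present, otherwise buf.grad.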
1565
+ template<template<typename> class A1, template<typename> class A2, typename T>
1566
+ inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, T compare, T value, const A2<T>& adj_buf, int adj_i, T& adj_compare, T& adj_value, const T& adj_ret) {
1567
+ if (adj_buf.data)
1568
+ adj_atomic_cas(&index(buf, i), compare, value, &index(adj_buf, i), adj_compare, adj_value, adj_ret);
1569
+ else if (buf.grad)
1570
+ adj_atomic_cas(&index(buf, i), compare, value, &index_grad(buf, i), adj_compare, adj_value, adj_ret);
1571
+
1572
+ FP_VERIFY_ADJ_1(value, adj_value)
1573
+ }
1574
+
1575
+ template<template<typename> class A1, template<typename> class A2, typename T>
1576
+ inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_compare, T& adj_value, const T& adj_ret) {
1577
+ if (adj_buf.data)
1578
+ adj_atomic_cas(&index(buf, i, j), compare, value, &index(adj_buf, i, j), adj_compare, adj_value, adj_ret);
1579
+ else if (buf.grad)
1580
+ adj_atomic_cas(&index(buf, i, j), compare, value, &index_grad(buf, i, j), adj_compare, adj_value, adj_ret);
1581
+
1582
+ FP_VERIFY_ADJ_2(value, adj_value)
1583
+ }
1584
+
1585
+ template<template<typename> class A1, template<typename> class A2, typename T>
1586
+ inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, int k, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_compare, T& adj_value, const T& adj_ret) {
1587
+ if (adj_buf.data)
1588
+ adj_atomic_cas(&index(buf, i, j, k), compare, value, &index(adj_buf, i, j, k), adj_compare, adj_value, adj_ret);
1589
+ else if (buf.grad)
1590
+ adj_atomic_cas(&index(buf, i, j, k), compare, value, &index_grad(buf, i, j, k), adj_compare, adj_value, adj_ret);
1591
+
1592
+ FP_VERIFY_ADJ_3(value, adj_value)
1593
+ }
1594
+
1595
+ template<template<typename> class A1, template<typename> class A2, typename T>
1596
+ inline CUDA_CALLABLE void adj_atomic_cas(const A1<T>& buf, int i, int j, int k, int l, T compare, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_compare, T& adj_value, const T& adj_ret) {
1597
+ if (adj_buf.data)
1598
+ adj_atomic_cas(&index(buf, i, j, k, l), compare, value, &index(adj_buf, i, j, k, l), adj_compare, adj_value, adj_ret);
1599
+ else if (buf.grad)
1600
+ adj_atomic_cas(&index(buf, i, j, k, l), compare, value, &index_grad(buf, i, j, k, l), adj_compare, adj_value, adj_ret);
1601
+
1602
+ FP_VERIFY_ADJ_4(value, adj_value)
1603
+ }
1604
+
1605
+ template<template<typename> class A1, template<typename> class A2, typename T>
1606
+ inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {
1607
+ if (adj_buf.data)
1608
+ adj_atomic_exch(&index(buf, i), value, &index(adj_buf, i), adj_value, adj_ret);
1609
+ else if (buf.grad)
1610
+ adj_atomic_exch(&index(buf, i), value, &index_grad(buf, i), adj_value, adj_ret);
1611
+
1612
+ FP_VERIFY_ADJ_1(value, adj_value)
1613
+ }
1614
+
1615
+ template<template<typename> class A1, template<typename> class A2, typename T>
1616
+ inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {
1617
+ if (adj_buf.data)
1618
+ adj_atomic_exch(&index(buf, i, j), value, &index(adj_buf, i, j), adj_value, adj_ret);
1619
+ else if (buf.grad)
1620
+ adj_atomic_exch(&index(buf, i, j), value, &index_grad(buf, i, j), adj_value, adj_ret);
1621
+
1622
+ FP_VERIFY_ADJ_2(value, adj_value)
1623
+ }
1624
+
1625
+ template<template<typename> class A1, template<typename> class A2, typename T>
1626
+ inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {
1627
+ if (adj_buf.data)
1628
+ adj_atomic_exch(&index(buf, i, j, k), value, &index(adj_buf, i, j, k), adj_value, adj_ret);
1629
+ else if (buf.grad)
1630
+ adj_atomic_exch(&index(buf, i, j, k), value, &index_grad(buf, i, j, k), adj_value, adj_ret);
1631
+
1632
+ FP_VERIFY_ADJ_3(value, adj_value)
1633
+ }
1634
+
1635
+ template<template<typename> class A1, template<typename> class A2, typename T>
1636
+ inline CUDA_CALLABLE void adj_atomic_exch(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {
1637
+ if (adj_buf.data)
1638
+ adj_atomic_exch(&index(buf, i, j, k, l), value, &index(adj_buf, i, j, k, l), adj_value, adj_ret);
1639
+ else if (buf.grad)
1640
+ adj_atomic_exch(&index(buf, i, j, k, l), value, &index_grad(buf, i, j, k, l), adj_value, adj_ret);
1641
+
1642
+ FP_VERIFY_ADJ_4(value, adj_value)
1643
+ }
1644
+
1645
+ // for bitwise operations we do not accumulate gradients
1646
+ template<template<typename> class A1, template<typename> class A2, typename T>
1647
+ inline CUDA_CALLABLE void adj_atomic_and(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1648
+ template<template<typename> class A1, template<typename> class A2, typename T>
1649
+ inline CUDA_CALLABLE void adj_atomic_and(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1650
+ template<template<typename> class A1, template<typename> class A2, typename T>
1651
+ inline CUDA_CALLABLE void adj_atomic_and(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1652
+ template<template<typename> class A1, template<typename> class A2, typename T>
1653
+ inline CUDA_CALLABLE void adj_atomic_and(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1654
+
1655
+ template<template<typename> class A1, template<typename> class A2, typename T>
1656
+ inline CUDA_CALLABLE void adj_atomic_or(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1657
+ template<template<typename> class A1, template<typename> class A2, typename T>
1658
+ inline CUDA_CALLABLE void adj_atomic_or(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1659
+ template<template<typename> class A1, template<typename> class A2, typename T>
1660
+ inline CUDA_CALLABLE void adj_atomic_or(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1661
+ template<template<typename> class A1, template<typename> class A2, typename T>
1662
+ inline CUDA_CALLABLE void adj_atomic_or(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1663
+
1664
+ template<template<typename> class A1, template<typename> class A2, typename T>
1665
+ inline CUDA_CALLABLE void adj_atomic_xor(const A1<T>& buf, int i, T value, const A2<T>& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {}
1666
+ template<template<typename> class A1, template<typename> class A2, typename T>
1667
+ inline CUDA_CALLABLE void adj_atomic_xor(const A1<T>& buf, int i, int j, T value, const A2<T>& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {}
1668
+ template<template<typename> class A1, template<typename> class A2, typename T>
1669
+ inline CUDA_CALLABLE void adj_atomic_xor(const A1<T>& buf, int i, int j, int k, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {}
1670
+ template<template<typename> class A1, template<typename> class A2, typename T>
1671
+ inline CUDA_CALLABLE void adj_atomic_xor(const A1<T>& buf, int i, int j, int k, int l, T value, const A2<T>& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {}
1672
+
1673
+
1674
+ template<template<typename> class A, typename T>
1675
+ CUDA_CALLABLE inline int len(const A<T>& a)
1676
+ {
1677
+ return a.shape[0];
1678
+ }
1679
+
1680
+ template<template<typename> class A, typename T>
1681
+ CUDA_CALLABLE inline void adj_len(const A<T>& a, A<T>& adj_a, int& adj_ret)
1682
+ {
1683
+ }
1684
+
1685
+ } // namespace wp
1686
+
1687
+ #include "fabric.h"