warp-lang 1.10.0__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +334 -0
- warp/__init__.pyi +5856 -0
- warp/_src/__init__.py +14 -0
- warp/_src/autograd.py +1077 -0
- warp/_src/build.py +620 -0
- warp/_src/build_dll.py +642 -0
- warp/_src/builtins.py +10555 -0
- warp/_src/codegen.py +4361 -0
- warp/_src/config.py +178 -0
- warp/_src/constants.py +59 -0
- warp/_src/context.py +8352 -0
- warp/_src/dlpack.py +464 -0
- warp/_src/fabric.py +362 -0
- warp/_src/fem/__init__.py +14 -0
- warp/_src/fem/adaptivity.py +510 -0
- warp/_src/fem/cache.py +689 -0
- warp/_src/fem/dirichlet.py +190 -0
- warp/_src/fem/domain.py +553 -0
- warp/_src/fem/field/__init__.py +131 -0
- warp/_src/fem/field/field.py +703 -0
- warp/_src/fem/field/nodal_field.py +403 -0
- warp/_src/fem/field/restriction.py +39 -0
- warp/_src/fem/field/virtual.py +1021 -0
- warp/_src/fem/geometry/__init__.py +32 -0
- warp/_src/fem/geometry/adaptive_nanogrid.py +782 -0
- warp/_src/fem/geometry/closest_point.py +99 -0
- warp/_src/fem/geometry/deformed_geometry.py +277 -0
- warp/_src/fem/geometry/element.py +854 -0
- warp/_src/fem/geometry/geometry.py +693 -0
- warp/_src/fem/geometry/grid_2d.py +478 -0
- warp/_src/fem/geometry/grid_3d.py +539 -0
- warp/_src/fem/geometry/hexmesh.py +956 -0
- warp/_src/fem/geometry/nanogrid.py +660 -0
- warp/_src/fem/geometry/partition.py +483 -0
- warp/_src/fem/geometry/quadmesh.py +597 -0
- warp/_src/fem/geometry/tetmesh.py +762 -0
- warp/_src/fem/geometry/trimesh.py +588 -0
- warp/_src/fem/integrate.py +2507 -0
- warp/_src/fem/linalg.py +385 -0
- warp/_src/fem/operator.py +398 -0
- warp/_src/fem/polynomial.py +231 -0
- warp/_src/fem/quadrature/__init__.py +17 -0
- warp/_src/fem/quadrature/pic_quadrature.py +318 -0
- warp/_src/fem/quadrature/quadrature.py +665 -0
- warp/_src/fem/space/__init__.py +248 -0
- warp/_src/fem/space/basis_function_space.py +499 -0
- warp/_src/fem/space/basis_space.py +681 -0
- warp/_src/fem/space/dof_mapper.py +253 -0
- warp/_src/fem/space/function_space.py +312 -0
- warp/_src/fem/space/grid_2d_function_space.py +179 -0
- warp/_src/fem/space/grid_3d_function_space.py +229 -0
- warp/_src/fem/space/hexmesh_function_space.py +255 -0
- warp/_src/fem/space/nanogrid_function_space.py +199 -0
- warp/_src/fem/space/partition.py +435 -0
- warp/_src/fem/space/quadmesh_function_space.py +222 -0
- warp/_src/fem/space/restriction.py +221 -0
- warp/_src/fem/space/shape/__init__.py +152 -0
- warp/_src/fem/space/shape/cube_shape_function.py +1107 -0
- warp/_src/fem/space/shape/shape_function.py +134 -0
- warp/_src/fem/space/shape/square_shape_function.py +928 -0
- warp/_src/fem/space/shape/tet_shape_function.py +829 -0
- warp/_src/fem/space/shape/triangle_shape_function.py +674 -0
- warp/_src/fem/space/tetmesh_function_space.py +270 -0
- warp/_src/fem/space/topology.py +461 -0
- warp/_src/fem/space/trimesh_function_space.py +193 -0
- warp/_src/fem/types.py +114 -0
- warp/_src/fem/utils.py +488 -0
- warp/_src/jax.py +188 -0
- warp/_src/jax_experimental/__init__.py +14 -0
- warp/_src/jax_experimental/custom_call.py +389 -0
- warp/_src/jax_experimental/ffi.py +1286 -0
- warp/_src/jax_experimental/xla_ffi.py +658 -0
- warp/_src/marching_cubes.py +710 -0
- warp/_src/math.py +416 -0
- warp/_src/optim/__init__.py +14 -0
- warp/_src/optim/adam.py +165 -0
- warp/_src/optim/linear.py +1608 -0
- warp/_src/optim/sgd.py +114 -0
- warp/_src/paddle.py +408 -0
- warp/_src/render/__init__.py +14 -0
- warp/_src/render/imgui_manager.py +291 -0
- warp/_src/render/render_opengl.py +3638 -0
- warp/_src/render/render_usd.py +939 -0
- warp/_src/render/utils.py +162 -0
- warp/_src/sparse.py +2718 -0
- warp/_src/tape.py +1208 -0
- warp/_src/thirdparty/__init__.py +0 -0
- warp/_src/thirdparty/appdirs.py +598 -0
- warp/_src/thirdparty/dlpack.py +145 -0
- warp/_src/thirdparty/unittest_parallel.py +676 -0
- warp/_src/torch.py +393 -0
- warp/_src/types.py +5888 -0
- warp/_src/utils.py +1695 -0
- warp/autograd.py +33 -0
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +29 -0
- warp/build_dll.py +24 -0
- warp/codegen.py +24 -0
- warp/constants.py +24 -0
- warp/context.py +33 -0
- warp/dlpack.py +24 -0
- warp/examples/__init__.py +24 -0
- warp/examples/assets/bear.usd +0 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/assets/cube.usd +0 -0
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/assets/rocks.nvdb +0 -0
- warp/examples/assets/rocks.usd +0 -0
- warp/examples/assets/sphere.usd +0 -0
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_api.py +389 -0
- warp/examples/benchmarks/benchmark_cloth.py +296 -0
- warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
- warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
- warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
- warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
- warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
- warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
- warp/examples/benchmarks/benchmark_gemm.py +164 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
- warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
- warp/examples/benchmarks/benchmark_launches.py +301 -0
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/browse.py +37 -0
- warp/examples/core/example_cupy.py +86 -0
- warp/examples/core/example_dem.py +241 -0
- warp/examples/core/example_fluid.py +299 -0
- warp/examples/core/example_graph_capture.py +150 -0
- warp/examples/core/example_marching_cubes.py +195 -0
- warp/examples/core/example_mesh.py +180 -0
- warp/examples/core/example_mesh_intersect.py +211 -0
- warp/examples/core/example_nvdb.py +182 -0
- warp/examples/core/example_raycast.py +111 -0
- warp/examples/core/example_raymarch.py +205 -0
- warp/examples/core/example_render_opengl.py +290 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/core/example_sph.py +411 -0
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_torch.py +211 -0
- warp/examples/core/example_wave.py +269 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/distributed/example_jacobi_mpi.py +506 -0
- warp/examples/fem/example_adaptive_grid.py +286 -0
- warp/examples/fem/example_apic_fluid.py +469 -0
- warp/examples/fem/example_burgers.py +261 -0
- warp/examples/fem/example_convection_diffusion.py +181 -0
- warp/examples/fem/example_convection_diffusion_dg.py +225 -0
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +172 -0
- warp/examples/fem/example_diffusion.py +196 -0
- warp/examples/fem/example_diffusion_3d.py +225 -0
- warp/examples/fem/example_diffusion_mgpu.py +225 -0
- warp/examples/fem/example_distortion_energy.py +228 -0
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +242 -0
- warp/examples/fem/example_mixed_elasticity.py +293 -0
- warp/examples/fem/example_navier_stokes.py +263 -0
- warp/examples/fem/example_nonconforming_contact.py +300 -0
- warp/examples/fem/example_stokes.py +213 -0
- warp/examples/fem/example_stokes_transfer.py +262 -0
- warp/examples/fem/example_streamlines.py +357 -0
- warp/examples/fem/utils.py +1047 -0
- warp/examples/interop/example_jax_callable.py +146 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +232 -0
- warp/examples/optim/example_diffray.py +561 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +88 -0
- warp/examples/tile/example_tile_convolution.py +66 -0
- warp/examples/tile/example_tile_fft.py +55 -0
- warp/examples/tile/example_tile_filtering.py +113 -0
- warp/examples/tile/example_tile_matmul.py +85 -0
- warp/examples/tile/example_tile_mcgp.py +191 -0
- warp/examples/tile/example_tile_mlp.py +385 -0
- warp/examples/tile/example_tile_nbody.py +199 -0
- warp/fabric.py +24 -0
- warp/fem/__init__.py +173 -0
- warp/fem/adaptivity.py +26 -0
- warp/fem/cache.py +30 -0
- warp/fem/dirichlet.py +24 -0
- warp/fem/field/__init__.py +24 -0
- warp/fem/field/field.py +26 -0
- warp/fem/geometry/__init__.py +21 -0
- warp/fem/geometry/closest_point.py +31 -0
- warp/fem/linalg.py +38 -0
- warp/fem/operator.py +32 -0
- warp/fem/polynomial.py +29 -0
- warp/fem/space/__init__.py +22 -0
- warp/fem/space/basis_space.py +24 -0
- warp/fem/space/shape/__init__.py +68 -0
- warp/fem/space/topology.py +24 -0
- warp/fem/types.py +24 -0
- warp/fem/utils.py +32 -0
- warp/jax.py +29 -0
- warp/jax_experimental/__init__.py +29 -0
- warp/jax_experimental/custom_call.py +29 -0
- warp/jax_experimental/ffi.py +39 -0
- warp/jax_experimental/xla_ffi.py +24 -0
- warp/marching_cubes.py +24 -0
- warp/math.py +37 -0
- warp/native/array.h +1687 -0
- warp/native/builtin.h +2327 -0
- warp/native/bvh.cpp +562 -0
- warp/native/bvh.cu +826 -0
- warp/native/bvh.h +555 -0
- warp/native/clang/clang.cpp +541 -0
- warp/native/coloring.cpp +622 -0
- warp/native/crt.cpp +51 -0
- warp/native/crt.h +568 -0
- warp/native/cuda_crt.h +1058 -0
- warp/native/cuda_util.cpp +677 -0
- warp/native/cuda_util.h +313 -0
- warp/native/error.cpp +77 -0
- warp/native/error.h +36 -0
- warp/native/exports.h +2023 -0
- warp/native/fabric.h +246 -0
- warp/native/hashgrid.cpp +311 -0
- warp/native/hashgrid.cu +89 -0
- warp/native/hashgrid.h +240 -0
- warp/native/initializer_array.h +41 -0
- warp/native/intersect.h +1253 -0
- warp/native/intersect_adj.h +375 -0
- warp/native/intersect_tri.h +348 -0
- warp/native/mat.h +5189 -0
- warp/native/mathdx.cpp +93 -0
- warp/native/matnn.h +221 -0
- warp/native/mesh.cpp +266 -0
- warp/native/mesh.cu +406 -0
- warp/native/mesh.h +2097 -0
- warp/native/nanovdb/GridHandle.h +533 -0
- warp/native/nanovdb/HostBuffer.h +591 -0
- warp/native/nanovdb/NanoVDB.h +6246 -0
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +3390 -0
- warp/native/noise.h +859 -0
- warp/native/quat.h +1664 -0
- warp/native/rand.h +342 -0
- warp/native/range.h +145 -0
- warp/native/reduce.cpp +174 -0
- warp/native/reduce.cu +363 -0
- warp/native/runlength_encode.cpp +79 -0
- warp/native/runlength_encode.cu +61 -0
- warp/native/scan.cpp +47 -0
- warp/native/scan.cu +55 -0
- warp/native/scan.h +23 -0
- warp/native/solid_angle.h +466 -0
- warp/native/sort.cpp +251 -0
- warp/native/sort.cu +286 -0
- warp/native/sort.h +35 -0
- warp/native/sparse.cpp +241 -0
- warp/native/sparse.cu +435 -0
- warp/native/spatial.h +1306 -0
- warp/native/svd.h +727 -0
- warp/native/temp_buffer.h +46 -0
- warp/native/tile.h +4124 -0
- warp/native/tile_radix_sort.h +1112 -0
- warp/native/tile_reduce.h +838 -0
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +2199 -0
- warp/native/version.h +23 -0
- warp/native/volume.cpp +501 -0
- warp/native/volume.cu +68 -0
- warp/native/volume.h +970 -0
- warp/native/volume_builder.cu +483 -0
- warp/native/volume_builder.h +52 -0
- warp/native/volume_impl.h +70 -0
- warp/native/warp.cpp +1143 -0
- warp/native/warp.cu +4604 -0
- warp/native/warp.h +358 -0
- warp/optim/__init__.py +20 -0
- warp/optim/adam.py +24 -0
- warp/optim/linear.py +35 -0
- warp/optim/sgd.py +24 -0
- warp/paddle.py +24 -0
- warp/py.typed +0 -0
- warp/render/__init__.py +22 -0
- warp/render/imgui_manager.py +29 -0
- warp/render/render_opengl.py +24 -0
- warp/render/render_usd.py +24 -0
- warp/render/utils.py +24 -0
- warp/sparse.py +51 -0
- warp/tape.py +24 -0
- warp/tests/__init__.py +1 -0
- warp/tests/__main__.py +4 -0
- warp/tests/assets/curlnoise_golden.npy +0 -0
- warp/tests/assets/mlp_golden.npy +0 -0
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/assets/pnoise_golden.npy +0 -0
- warp/tests/assets/spiky.usd +0 -0
- warp/tests/assets/test_grid.nvdb +0 -0
- warp/tests/assets/test_index_grid.nvdb +0 -0
- warp/tests/assets/test_int32_grid.nvdb +0 -0
- warp/tests/assets/test_vec_grid.nvdb +0 -0
- warp/tests/assets/torus.nvdb +0 -0
- warp/tests/assets/torus.usda +105 -0
- warp/tests/aux_test_class_kernel.py +34 -0
- warp/tests/aux_test_compile_consts_dummy.py +18 -0
- warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
- warp/tests/aux_test_dependent.py +29 -0
- warp/tests/aux_test_grad_customs.py +29 -0
- warp/tests/aux_test_instancing_gc.py +26 -0
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/aux_test_module_unload.py +23 -0
- warp/tests/aux_test_name_clash1.py +40 -0
- warp/tests/aux_test_name_clash2.py +40 -0
- warp/tests/aux_test_reference.py +9 -0
- warp/tests/aux_test_reference_reference.py +8 -0
- warp/tests/aux_test_square.py +16 -0
- warp/tests/aux_test_unresolved_func.py +22 -0
- warp/tests/aux_test_unresolved_symbol.py +22 -0
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/cuda/test_async.py +676 -0
- warp/tests/cuda/test_conditional_captures.py +1147 -0
- warp/tests/cuda/test_ipc.py +124 -0
- warp/tests/cuda/test_mempool.py +233 -0
- warp/tests/cuda/test_multigpu.py +169 -0
- warp/tests/cuda/test_peer.py +139 -0
- warp/tests/cuda/test_pinned.py +84 -0
- warp/tests/cuda/test_streams.py +691 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/geometry/test_bvh.py +335 -0
- warp/tests/geometry/test_hash_grid.py +259 -0
- warp/tests/geometry/test_marching_cubes.py +294 -0
- warp/tests/geometry/test_mesh.py +318 -0
- warp/tests/geometry/test_mesh_query_aabb.py +392 -0
- warp/tests/geometry/test_mesh_query_point.py +935 -0
- warp/tests/geometry/test_mesh_query_ray.py +323 -0
- warp/tests/geometry/test_volume.py +1103 -0
- warp/tests/geometry/test_volume_write.py +346 -0
- warp/tests/interop/__init__.py +0 -0
- warp/tests/interop/test_dlpack.py +730 -0
- warp/tests/interop/test_jax.py +1673 -0
- warp/tests/interop/test_paddle.py +800 -0
- warp/tests/interop/test_torch.py +1001 -0
- warp/tests/run_coverage_serial.py +39 -0
- warp/tests/test_adam.py +162 -0
- warp/tests/test_arithmetic.py +1096 -0
- warp/tests/test_array.py +3756 -0
- warp/tests/test_array_reduce.py +156 -0
- warp/tests/test_assert.py +303 -0
- warp/tests/test_atomic.py +336 -0
- warp/tests/test_atomic_bitwise.py +209 -0
- warp/tests/test_atomic_cas.py +312 -0
- warp/tests/test_bool.py +220 -0
- warp/tests/test_builtins_resolution.py +732 -0
- warp/tests/test_closest_point_edge_edge.py +327 -0
- warp/tests/test_codegen.py +974 -0
- warp/tests/test_codegen_instancing.py +1495 -0
- warp/tests/test_compile_consts.py +215 -0
- warp/tests/test_conditional.py +298 -0
- warp/tests/test_context.py +35 -0
- warp/tests/test_copy.py +319 -0
- warp/tests/test_ctypes.py +618 -0
- warp/tests/test_dense.py +73 -0
- warp/tests/test_devices.py +127 -0
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +424 -0
- warp/tests/test_fabricarray.py +998 -0
- warp/tests/test_fast_math.py +72 -0
- warp/tests/test_fem.py +2204 -0
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_fp16.py +136 -0
- warp/tests/test_func.py +501 -0
- warp/tests/test_future_annotations.py +100 -0
- warp/tests/test_generics.py +656 -0
- warp/tests/test_grad.py +893 -0
- warp/tests/test_grad_customs.py +339 -0
- warp/tests/test_grad_debug.py +341 -0
- warp/tests/test_implicit_init.py +411 -0
- warp/tests/test_import.py +45 -0
- warp/tests/test_indexedarray.py +1140 -0
- warp/tests/test_intersect.py +103 -0
- warp/tests/test_iter.py +76 -0
- warp/tests/test_large.py +177 -0
- warp/tests/test_launch.py +411 -0
- warp/tests/test_lerp.py +151 -0
- warp/tests/test_linear_solvers.py +223 -0
- warp/tests/test_lvalue.py +427 -0
- warp/tests/test_map.py +526 -0
- warp/tests/test_mat.py +3515 -0
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +573 -0
- warp/tests/test_mat_lite.py +122 -0
- warp/tests/test_mat_scalar_ops.py +2913 -0
- warp/tests/test_math.py +212 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_module_hashing.py +258 -0
- warp/tests/test_modules_lite.py +70 -0
- warp/tests/test_noise.py +252 -0
- warp/tests/test_operators.py +299 -0
- warp/tests/test_options.py +129 -0
- warp/tests/test_overwrite.py +551 -0
- warp/tests/test_print.py +408 -0
- warp/tests/test_quat.py +2653 -0
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_rand.py +339 -0
- warp/tests/test_reload.py +303 -0
- warp/tests/test_rounding.py +157 -0
- warp/tests/test_runlength_encode.py +196 -0
- warp/tests/test_scalar_ops.py +133 -0
- warp/tests/test_smoothstep.py +108 -0
- warp/tests/test_snippet.py +318 -0
- warp/tests/test_sparse.py +845 -0
- warp/tests/test_spatial.py +2859 -0
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_special_values.py +361 -0
- warp/tests/test_static.py +640 -0
- warp/tests/test_struct.py +901 -0
- warp/tests/test_tape.py +242 -0
- warp/tests/test_transient_module.py +93 -0
- warp/tests/test_triangle_closest_point.py +192 -0
- warp/tests/test_tuple.py +361 -0
- warp/tests/test_types.py +615 -0
- warp/tests/test_utils.py +594 -0
- warp/tests/test_vec.py +1408 -0
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/test_vec_lite.py +80 -0
- warp/tests/test_vec_scalar_ops.py +2327 -0
- warp/tests/test_verify_fp.py +100 -0
- warp/tests/test_version.py +75 -0
- warp/tests/tile/__init__.py +0 -0
- warp/tests/tile/test_tile.py +1519 -0
- warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
- warp/tests/tile/test_tile_cholesky.py +608 -0
- warp/tests/tile/test_tile_load.py +724 -0
- warp/tests/tile/test_tile_mathdx.py +156 -0
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_mlp.py +400 -0
- warp/tests/tile/test_tile_reduce.py +950 -0
- warp/tests/tile/test_tile_shared_memory.py +376 -0
- warp/tests/tile/test_tile_sort.py +121 -0
- warp/tests/tile/test_tile_view.py +173 -0
- warp/tests/unittest_serial.py +47 -0
- warp/tests/unittest_suites.py +430 -0
- warp/tests/unittest_utils.py +469 -0
- warp/tests/walkthrough_debug.py +95 -0
- warp/torch.py +24 -0
- warp/types.py +51 -0
- warp/utils.py +31 -0
- warp_lang-1.10.0.dist-info/METADATA +459 -0
- warp_lang-1.10.0.dist-info/RECORD +468 -0
- warp_lang-1.10.0.dist-info/WHEEL +5 -0
- warp_lang-1.10.0.dist-info/licenses/LICENSE.md +176 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
- warp_lang-1.10.0.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
- warp_lang-1.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import unittest
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
import warp as wp
|
|
21
|
+
from warp.tests.unittest_utils import *
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# checks that we can configure shared memory to the expected size
def test_tile_shared_mem_size(test, device):
    """Check that a kernel using two shared tiles reports the expected shared-memory byte counts."""
    DIM_M = 32
    DIM_N = 32

    BLOCK_DIM = 256

    @wp.kernel(module="unique")
    def compute(out: wp.array2d(dtype=float)):
        # two explicitly shared-storage tiles of identical shape
        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0

        c = a + b
        wp.tile_store(out, c)

    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)

    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)

    # check output
    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)

    # check required shared memory
    # 2 float32 tiles -> elements * 4 bytes * 2 tiles
    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
    # backward pass doubles the requirement (adjoint tiles)
    expected_backward_bytes = expected_forward_bytes * 2

    # check shared memory for kernel on the device
    module_exec = compute.module.load(device, BLOCK_DIM)
    hooks = module_exec.get_kernel_hooks(compute)

    assert hooks.forward_smem_bytes == expected_forward_bytes
    assert hooks.backward_smem_bytes == expected_backward_bytes
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# checks that we can configure shared memory > 48kb default
def test_tile_shared_mem_large(test, device):
    """Check that a kernel can use more shared memory (64 KiB) than the 48 KiB default."""
    # set dimensions that require 64kb for the forward kernel
    DIM_M = 64
    DIM_N = 128

    BLOCK_DIM = 256

    # we disable backward kernel gen since 128k is not supported on most architectures
    @wp.kernel(enable_backward=False, module="unique")
    def compute(out: wp.array2d(dtype=float)):
        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0

        c = a + b
        wp.tile_store(out, c)

    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)

    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)

    # check output
    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)

    # check required shared memory
    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
    # backward kernel generation is disabled, so no adjoint shared memory
    expected_backward_bytes = 0

    # sanity check: dimensions were chosen to require exactly 64 KiB forward
    assert expected_forward_bytes == 2**16

    # check shared memory for kernel on the device
    module_exec = compute.module.load(device, BLOCK_DIM)
    hooks = module_exec.get_kernel_hooks(compute)

    assert hooks.forward_smem_bytes == expected_forward_bytes
    assert hooks.backward_smem_bytes == expected_backward_bytes
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# checks that we can configure dynamic shared memory during graph capture
def test_tile_shared_mem_graph(test, device):
    """Check that shared-memory sizing works when the launch happens inside a CUDA graph capture."""
    DIM_M = 32
    DIM_N = 32

    BLOCK_DIM = 256

    @wp.kernel(module="unique")
    def compute(out: wp.array2d(dtype=float)):
        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0

        c = a + b
        wp.tile_store(out, c)

    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)

    # preload the unique module
    # (module loading cannot happen during capture, hence force_module_load=False below)
    wp.load_module(compute.module, device=device, block_dim=BLOCK_DIM)

    with wp.ScopedCapture(device, force_module_load=False) as capture:
        wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)

    wp.capture_launch(capture.graph)

    # check output
    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)

    # check required shared memory
    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
    expected_backward_bytes = expected_forward_bytes * 2

    # check shared memory for kernel on the device
    module_exec = compute.module.load(device, BLOCK_DIM)
    hooks = module_exec.get_kernel_hooks(compute)

    assert hooks.forward_smem_bytes == expected_forward_bytes
    assert hooks.backward_smem_bytes == expected_backward_bytes
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# checks that stack allocations work for user functions
def test_tile_shared_mem_func(test, device):
    """Check that shared-memory stack allocations in user @wp.func calls are sized by the
    largest allocation, not the sum of all call sites."""
    DIM_M = 64
    DIM_N = 64

    SMALL_DIM_M = 64 // 4
    SMALL_DIM_N = 64 // 4

    BLOCK_DIM = 256

    @wp.func
    def add_tile_small():
        # small pair of shared tiles (1/16th the elements of the big pair)
        a = wp.tile_ones(shape=(SMALL_DIM_M, SMALL_DIM_N), dtype=float, storage="shared")
        b = wp.tile_ones(shape=(SMALL_DIM_M, SMALL_DIM_N), dtype=float, storage="shared") * 2.0

        return a + b

    @wp.func
    def add_tile_big():
        # large pair of shared tiles; dominates the shared-memory requirement
        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0

        return a + b

    @wp.kernel(module="unique")
    def compute(out: wp.array2d(dtype=float)):
        s = add_tile_small()
        b = add_tile_big()

        wp.tile_store(out, b)

    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)

    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)

    # check shared memory for kernel on the device
    module_exec = compute.module.load(device, BLOCK_DIM)
    hooks = module_exec.get_kernel_hooks(compute)

    # ensure that total required dynamic shared is the larger of the two tiles
    expected_required_shared = 64 * 64 * 4 * 2

    assert hooks.forward_smem_bytes == expected_required_shared
    assert hooks.backward_smem_bytes == expected_required_shared * 2
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def round_up(a, b):
    """Round *a* up to the nearest multiple of *b* (integer arithmetic)."""
    num_chunks = (a + b - 1) // b
    return num_chunks * b
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# checks that using non-16B aligned sizes work
def test_tile_shared_non_aligned(test, device):
    """Check that shared tiles whose byte size is not a multiple of 16 are padded correctly
    by the shared-memory stack allocator."""
    # Tile size = 4 (float) * 1 * 3 = 12B % 16 != 0
    DIM_M = 1
    DIM_N = 3

    BLOCK_DIM = 256

    @wp.func
    def foo():
        a = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 2.0
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared") * 3.0
        return a + b

    @wp.kernel(module="unique")
    def compute(out: wp.array2d(dtype=float)):
        # This test the logic in the stack allocator, which should increment and
        # decrement the stack pointer each time foo() is called
        # Failing to do so correct will make b out of bounds and corrupt the results
        for _ in range(4096):
            foo()
        b = wp.tile_ones(shape=(DIM_M, DIM_N), dtype=float, storage="shared")
        wp.tile_store(out, b)

    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)

    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)

    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N), dtype=float))

    # check shared memory for kernel on the device
    module_exec = compute.module.load(device, BLOCK_DIM)
    hooks = module_exec.get_kernel_hooks(compute)

    # ensure that total required dynamic shared is the larger of the two tiles
    # (3 live 12B tiles, each padded up to a 16B boundary)
    expected_required_shared = 3 * round_up(DIM_M * DIM_N * 4, 16)

    assert hooks.forward_smem_bytes == expected_required_shared
    assert hooks.backward_smem_bytes == expected_required_shared * 2
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_tile_shared_vec_accumulation(test, device):
    """Check forward values and gradients when threads accumulate indexed vec3 components
    into a shared tile via per-element += operations."""
    BLOCK_DIM = 256

    @wp.kernel(module="unique")
    def compute(indices: wp.array(dtype=int), vecs: wp.array(dtype=wp.vec3), output: wp.array2d(dtype=float)):
        i, j = wp.tid()

        # each block loads its own slice of indices; each thread picks one index
        idx_tile = wp.tile_load(indices, shape=BLOCK_DIM, offset=i * BLOCK_DIM)
        idx = idx_tile[j]

        # shared accumulator: all BLOCK_DIM threads += into the same (1, 3) tile
        s = wp.tile_zeros(shape=(1, 3), dtype=float)

        s[0, 0] += vecs[idx].x
        s[0, 1] += vecs[idx].y
        s[0, 2] += vecs[idx].z

        wp.tile_store(output, s, offset=(i, 0))

    N = BLOCK_DIM * 3

    basis_vecs = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
    vecs = wp.array(basis_vecs, dtype=wp.vec3, requires_grad=True, device=device)

    # fixed seed so expected counts below are reproducible
    rng = np.random.default_rng(42)
    indices_np = rng.integers(0, 3, size=N)

    indices = wp.array(indices_np, dtype=int, requires_grad=True, device=device)

    output = wp.zeros(shape=(3, 3), dtype=float, requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch_tiled(compute, dim=3, inputs=[indices, vecs, output], block_dim=BLOCK_DIM, device=device)

    output.grad = wp.ones_like(output)

    tape.backward()

    # gradient of each basis vector equals how often its index was selected (over all blocks)
    n0 = np.count_nonzero(indices_np == 0)
    n1 = np.count_nonzero(indices_np == 1)
    n2 = np.count_nonzero(indices_np == 2)
    true_grads = np.array([[n0, n0, n0], [n1, n1, n1], [n2, n2, n2]])

    # reshape to per-block rows to compute expected forward output
    indices_np = indices_np.reshape((3, BLOCK_DIM))

    def compute_row(idx):
        # expected accumulation for block `idx`: count of each basis index in its slice
        n0 = np.count_nonzero(indices_np[idx, :] == 0)
        n1 = np.count_nonzero(indices_np[idx, :] == 1)
        n2 = np.count_nonzero(indices_np[idx, :] == 2)
        return np.array([1, 0, 0]) * n0 + np.array([0, 1, 0]) * n1 + np.array([0, 0, 1]) * n2

    row_0 = compute_row(0)
    row_1 = compute_row(1)
    row_2 = compute_row(2)

    true_vecs = np.stack([row_0, row_1, row_2])

    assert_np_equal(output.numpy(), true_vecs)
    assert_np_equal(vecs.grad.numpy(), true_grads)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def test_tile_shared_simple_reduction_add(test, device):
    # Tree-reduce each 256-element block of the input into a single value
    # using in-place additions on a shared tile, then compare the sum of
    # the per-block results against the sum of the whole input.
    BLOCK_DIM = 256

    @wp.kernel(module="unique")
    def compute(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i, j = wp.tid()

        # stage this block's slice of the input into a tile
        vals = wp.tile_load(x, shape=BLOCK_DIM, offset=BLOCK_DIM * i)

        # classic pairwise tree reduction over the tile
        stride = BLOCK_DIM // 2
        while stride > 0:
            if j < stride:
                vals[j] += vals[j + stride]
            stride //= 2

        # element 0 now holds the block sum; write it out
        wp.tile_store(y, wp.tile_view(vals, offset=(0,), shape=(1,)), i)

    num_elements = BLOCK_DIM * 4
    input_np = np.arange(num_elements, dtype=np.float32)
    input_wp = wp.array(input_np, dtype=float, device=device)
    block_sums = wp.zeros(4, dtype=float, device=device)

    wp.launch_tiled(compute, dim=4, inputs=[input_wp], outputs=[block_sums], block_dim=BLOCK_DIM, device=device)

    # sum of the per-block sums must equal the total input sum
    assert_np_equal(np.sum(block_sums.numpy()), np.sum(input_np))
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def test_tile_shared_simple_reduction_sub(test, device):
    # Same tree-reduction shape as the add variant, but with in-place
    # subtraction; for this arange input the block results sum to zero.
    BLOCK_DIM = 256

    @wp.kernel(module="unique")
    def compute(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i, j = wp.tid()

        # stage this block's slice of the input into a tile
        data = wp.tile_load(x, shape=BLOCK_DIM, offset=BLOCK_DIM * i)

        # pairwise tree reduction using subtraction instead of addition
        half = BLOCK_DIM // 2
        while half > 0:
            if j < half:
                data[j] -= data[j + half]
            half //= 2

        # element 0 holds the block result
        wp.tile_store(y, wp.tile_view(data, offset=(0,), shape=(1,)), i)

    total = BLOCK_DIM * 4
    host_vals = np.arange(total, dtype=np.float32)
    device_vals = wp.array(host_vals, dtype=float, device=device)
    results = wp.zeros(4, dtype=float, device=device)

    wp.launch_tiled(compute, dim=4, inputs=[device_vals], outputs=[results], block_dim=BLOCK_DIM, device=device)

    # the four block results cancel to zero for this input
    assert_np_equal(np.sum(results.numpy()), 0.0)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# Shared-memory tile tests require a CUDA device.
devices = get_cuda_test_devices()


class TestTileSharedMemory(unittest.TestCase):
    # Empty container class; test methods are attached dynamically below
    # via add_function_test.
    pass


# check_output=False: these two tests emit diagnostic output that would
# otherwise trip the captured-output check.
add_function_test(
    TestTileSharedMemory, "test_tile_shared_mem_size", test_tile_shared_mem_size, devices=devices, check_output=False
)
add_function_test(
    TestTileSharedMemory, "test_tile_shared_mem_large", test_tile_shared_mem_large, devices=devices, check_output=False
)
add_function_test(TestTileSharedMemory, "test_tile_shared_mem_graph", test_tile_shared_mem_graph, devices=devices)
add_function_test(TestTileSharedMemory, "test_tile_shared_mem_func", test_tile_shared_mem_func, devices=devices)
add_function_test(TestTileSharedMemory, "test_tile_shared_non_aligned", test_tile_shared_non_aligned, devices=devices)
add_function_test(
    TestTileSharedMemory, "test_tile_shared_vec_accumulation", test_tile_shared_vec_accumulation, devices=devices
)
add_function_test(
    TestTileSharedMemory,
    "test_tile_shared_simple_reduction_add",
    test_tile_shared_simple_reduction_add,
    devices=devices,
)
add_function_test(
    TestTileSharedMemory,
    "test_tile_shared_simple_reduction_sub",
    test_tile_shared_simple_reduction_sub,
    devices=devices,
)

if __name__ == "__main__":
    # Clear cached kernels so this run compiles fresh code.
    wp.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=True)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import unittest
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
import warp as wp
|
|
21
|
+
from warp.tests.unittest_utils import *
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create_sort_kernel(KEY_TYPE, MAX_SORT_LENGTH):
    """Build a kernel that sorts MAX_SORT_LENGTH key/value pairs in shared memory.

    KEY_TYPE is the warp dtype of the keys; values are int32. The returned
    kernel loads both arrays into shared tiles, sorts them in place with
    wp.tile_sort, and writes the results to the output arrays.
    """

    @wp.kernel
    def tile_sort_kernel(
        input_keys: wp.array(dtype=KEY_TYPE),
        input_values: wp.array(dtype=wp.int32),
        output_keys: wp.array(dtype=KEY_TYPE),
        output_values: wp.array(dtype=wp.int32),
    ):
        # stage both arrays into shared-memory tiles
        key_tile = wp.tile_load(input_keys, shape=MAX_SORT_LENGTH, storage="shared")
        val_tile = wp.tile_load(input_values, shape=MAX_SORT_LENGTH, storage="shared")

        # sort keys in place, permuting the values alongside
        wp.tile_sort(key_tile, val_tile)

        # write the sorted tiles back to global memory
        wp.tile_store(output_keys, key_tile)
        wp.tile_store(output_values, val_tile)

    return tile_sort_kernel
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_tile_sort(test, device):
    """Validate wp.tile_sort against np.argsort across key types, lengths, and block sizes."""
    # Instantiate every kernel variant up front for more efficient compilation;
    # lengths are 2**i + 1 to exercise non-power-of-two sizes.
    kernels = {
        (dtype, 2**i + 1): create_sort_kernel(dtype, 2**i + 1) for dtype in [int, float] for i in range(0, 11)
    }

    for (dtype, length), kernel in kernels.items():
        for exponent in range(5, 10):
            TILE_DIM = 2**exponent

            rng = np.random.default_rng(42)  # fixed seed for reproducibility

            # integer keys are drawn without replacement so they are unique
            # and the resulting value permutation is unambiguous
            if dtype == int:
                np_keys = rng.choice(1000000000, size=length, replace=False)
            else:  # dtype == float
                np_keys = rng.uniform(0, 1000000000, size=length).astype(dtype)

            # iota indexer so sorted values reveal the applied permutation
            np_values = np.arange(length)

            # device-side inputs plus zeroed outputs
            input_keys = wp.array(np_keys, dtype=dtype, device=device)
            input_values = wp.array(np_values, dtype=int, device=device)
            output_keys = wp.zeros_like(input_keys, device=device)
            output_values = wp.zeros_like(input_values, device=device)

            # run the sort with the current block size
            wp.launch_tiled(
                kernel,
                dim=1,
                inputs=[input_keys, input_values, output_keys, output_values],
                block_dim=TILE_DIM,
                device=device,
            )
            wp.synchronize()

            # reference result from NumPy
            order = np.argsort(np_keys)
            np_sorted_keys = np_keys[order]
            np_sorted_values = np_values[order]

            if dtype == int:
                keys_match = np.array_equal(output_keys.numpy(), np_sorted_keys)
            else:  # dtype == float
                keys_match = np.allclose(output_keys.numpy(), np_sorted_keys, atol=1e-6)  # Use tolerance for floats

            values_match = np.array_equal(output_values.numpy(), np_sorted_values)

            # dump actual vs. expected on mismatch to aid debugging
            if not keys_match or not values_match:
                print(f"Test failed for dtype={dtype}, TILE_DIM={TILE_DIM}, length={length}")
                print("")
                print(output_keys.numpy())
                print(np_sorted_keys)
                print("")
                print(output_values.numpy())
                print(np_sorted_values)
                print("")

            # Validate results
            test.assertTrue(keys_match, f"Key sorting mismatch for dtype={dtype}!")
            test.assertTrue(values_match, f"Value sorting mismatch for dtype={dtype}!")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Run against every available test device (CPU and CUDA).
devices = get_test_devices()


class TestTileSort(unittest.TestCase):
    # Empty container class; the test is attached dynamically below
    # via add_function_test.
    pass


add_function_test(TestTileSort, "test_tile_sort", test_tile_sort, devices=devices)

if __name__ == "__main__":
    # Clear cached kernels so this run compiles fresh code.
    wp.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=True)
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import unittest
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
import warp as wp
|
|
21
|
+
from warp.tests.unittest_utils import *
|
|
22
|
+
|
|
23
|
+
# Tile shape constants shared by the kernels in this module.
TILE_DIM = 64  # NOTE(review): not referenced by the tests below — confirm intended use
TILE_M = 16  # first tile dimension (rows)
TILE_N = 32  # second tile dimension (columns)
TILE_O = 8  # third tile dimension, used by the 3D assignment test
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@wp.kernel
def test_tile_view_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
    # Copies a (TILE_M, TILE_N) array to dst one row at a time, exercising
    # the 1D row views produced by indexing a 2D tile.
    # load whole source into local memory
    a = wp.tile_load(src, shape=(TILE_M, TILE_N))

    # copy the source array row by row
    for i in range(TILE_M):
        # a[i] is a 1D view of row i of the tile; store it into row i of dst
        row = a[i]
        wp.tile_store(dst[i], row)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_tile_view(test, device):
    # Checks the forward copy and gradient flow when storing per-row tile views.
    generator = np.random.default_rng(42)

    src = wp.array(generator.random((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)
    dst = wp.array(np.zeros((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch_tiled(test_tile_view_kernel, dim=[1], inputs=[src, dst], block_dim=32, device=device)

    # forward: dst must be an exact copy of src
    assert_np_equal(dst.numpy(), src.numpy())

    # backward: the copy is an identity map, so unit output gradients
    # must flow through to the input unchanged
    dst.grad = wp.ones_like(dst, device=device)
    tape.backward()

    assert_np_equal(src.grad.numpy(), np.ones_like(src.numpy()))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@wp.kernel
def test_tile_assign_1d_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
    # Copies a (TILE_M, TILE_N) array through a zero-initialized scratch
    # tile, one row at a time, via wp.tile_assign between 1D row views.
    # load whole source into local memory
    a = wp.tile_load(src, shape=(TILE_M, TILE_N))
    b = wp.tile_zeros(dtype=float, shape=(TILE_M, TILE_N))

    # copy the source array row by row
    # (TILE_M is already an int — the redundant int() cast was dropped
    # for consistency with the other kernels in this file)
    for i in range(TILE_M):
        # create views onto source and dest rows
        row_src = a[i]
        row_dst = b[i]

        # copy onto dest row
        wp.tile_assign(row_dst, row_src)

    # write the assembled scratch tile to the output array
    wp.tile_store(dst, b)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_tile_assign_1d(test, device):
    # Checks the forward copy and gradient flow through row-wise tile_assign.
    generator = np.random.default_rng(42)

    source = wp.array(generator.random((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)
    target = wp.array(np.zeros((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch_tiled(test_tile_assign_1d_kernel, dim=[1], inputs=[source, target], block_dim=32, device=device)

    # forward: target must be an exact copy of source
    assert_np_equal(target.numpy(), source.numpy())

    # backward: identity copy, so unit output gradients propagate unchanged
    target.grad = wp.ones_like(target, device=device)
    tape.backward()

    assert_np_equal(source.grad.numpy(), np.ones_like(source.numpy()))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@wp.kernel
def test_tile_assign_2d_kernel(src: wp.array3d(dtype=float), dst: wp.array3d(dtype=float)):
    # Copies a (TILE_M, TILE_N, TILE_O) array through a zero-initialized
    # scratch tile, one 2D slice at a time, via wp.tile_assign.
    # load whole source into local memory
    a = wp.tile_load(src, shape=(TILE_M, TILE_N, TILE_O))
    b = wp.tile_zeros(dtype=float, shape=(TILE_M, TILE_N, TILE_O))

    # copy the source array slice by slice
    for i in range(TILE_M):
        # indexing a 3D tile yields a 2D view of slice i
        row_src = a[i]
        row_dst = b[i]

        # copy onto dest slice
        wp.tile_assign(row_dst, row_src)

    # write the assembled scratch tile to the output array
    wp.tile_store(dst, b)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_tile_assign_2d(test, device):
    # Checks the forward copy and gradient flow through slice-wise
    # tile_assign on 3D tiles.
    generator = np.random.default_rng(42)

    source = wp.array(generator.random((TILE_M, TILE_N, TILE_O), dtype=np.float32), requires_grad=True, device=device)
    target = wp.array(np.zeros((TILE_M, TILE_N, TILE_O), dtype=np.float32), requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch_tiled(test_tile_assign_2d_kernel, dim=[1], inputs=[source, target], block_dim=32, device=device)

    # forward: target must be an exact copy of source
    assert_np_equal(target.numpy(), source.numpy())

    # backward: identity copy, so unit output gradients propagate unchanged
    target.grad = wp.ones_like(target, device=device)
    tape.backward()

    assert_np_equal(source.grad.numpy(), np.ones_like(source.numpy()))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@wp.kernel
def test_tile_view_offset_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
    # Copies a (TILE_M, TILE_N) array through a zero-initialized scratch
    # tile in 4-row chunks, exercising offset views (wp.tile_view) and
    # offset assignment (wp.tile_assign with offset=).
    # load whole source into local memory
    a = wp.tile_load(src, shape=(TILE_M, TILE_N))
    b = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=float)

    # copy the source array slice by slice
    for i in range(TILE_M // 4):
        # create views onto source and dest slice 4 rows at a time
        v = wp.tile_view(a, offset=(i * 4, 0), shape=(4, TILE_N))

        # copy onto dest slice at the matching row offset
        wp.tile_assign(b, v, offset=(i * 4, 0))

    # write the assembled scratch tile to the output array
    wp.tile_store(dst, b)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_tile_view_offset(test, device):
    # Checks the forward copy and gradient flow for offset tile views
    # assigned in 4-row chunks.
    generator = np.random.default_rng(42)

    src = wp.array(generator.random((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)
    dst = wp.array(np.zeros((TILE_M, TILE_N), dtype=np.float32), requires_grad=True, device=device)

    tape = wp.Tape()
    with tape:
        wp.launch_tiled(test_tile_view_offset_kernel, dim=[1], inputs=[src, dst], block_dim=32, device=device)

    # forward: dst must be an exact copy of src
    assert_np_equal(dst.numpy(), src.numpy())

    # backward: identity copy, so unit output gradients propagate unchanged
    dst.grad = wp.ones_like(dst, device=device)
    tape.backward()

    assert_np_equal(src.grad.numpy(), np.ones_like(src.numpy()))
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Run against every available test device (CPU and CUDA).
devices = get_test_devices()


class TestTileView(unittest.TestCase):
    # Empty container class; tests are attached dynamically below
    # via add_function_test.
    pass


add_function_test(TestTileView, "test_tile_view", test_tile_view, devices=devices)
add_function_test(TestTileView, "test_tile_view_offset", test_tile_view_offset, devices=devices)
add_function_test(TestTileView, "test_tile_assign_1d", test_tile_assign_1d, devices=devices)
add_function_test(TestTileView, "test_tile_assign_2d", test_tile_assign_2d, devices=devices)


if __name__ == "__main__":
    # Clear cached kernels so this run compiles fresh code.
    wp.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=True)
|