warp-lang 1.7.0__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +139 -0
- warp/__init__.pyi +1 -0
- warp/autograd.py +1142 -0
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +557 -0
- warp/build_dll.py +405 -0
- warp/builtins.py +6855 -0
- warp/codegen.py +3969 -0
- warp/config.py +158 -0
- warp/constants.py +57 -0
- warp/context.py +6812 -0
- warp/dlpack.py +462 -0
- warp/examples/__init__.py +24 -0
- warp/examples/assets/bear.usd +0 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/assets/cartpole.urdf +110 -0
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/cube.usd +0 -0
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nv_ant.xml +92 -0
- warp/examples/assets/nv_humanoid.xml +183 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/assets/quadruped.urdf +268 -0
- warp/examples/assets/rocks.nvdb +0 -0
- warp/examples/assets/rocks.usd +0 -0
- warp/examples/assets/sphere.usd +0 -0
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_api.py +389 -0
- warp/examples/benchmarks/benchmark_cloth.py +296 -0
- warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
- warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
- warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
- warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
- warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
- warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
- warp/examples/benchmarks/benchmark_gemm.py +164 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
- warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
- warp/examples/benchmarks/benchmark_launches.py +301 -0
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/browse.py +37 -0
- warp/examples/core/example_cupy.py +86 -0
- warp/examples/core/example_dem.py +241 -0
- warp/examples/core/example_fluid.py +299 -0
- warp/examples/core/example_graph_capture.py +150 -0
- warp/examples/core/example_marching_cubes.py +194 -0
- warp/examples/core/example_mesh.py +180 -0
- warp/examples/core/example_mesh_intersect.py +211 -0
- warp/examples/core/example_nvdb.py +182 -0
- warp/examples/core/example_raycast.py +111 -0
- warp/examples/core/example_raymarch.py +205 -0
- warp/examples/core/example_render_opengl.py +193 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/core/example_sph.py +411 -0
- warp/examples/core/example_torch.py +211 -0
- warp/examples/core/example_wave.py +269 -0
- warp/examples/fem/example_adaptive_grid.py +286 -0
- warp/examples/fem/example_apic_fluid.py +423 -0
- warp/examples/fem/example_burgers.py +261 -0
- warp/examples/fem/example_convection_diffusion.py +178 -0
- warp/examples/fem/example_convection_diffusion_dg.py +204 -0
- warp/examples/fem/example_deformed_geometry.py +172 -0
- warp/examples/fem/example_diffusion.py +196 -0
- warp/examples/fem/example_diffusion_3d.py +225 -0
- warp/examples/fem/example_diffusion_mgpu.py +220 -0
- warp/examples/fem/example_distortion_energy.py +228 -0
- warp/examples/fem/example_magnetostatics.py +240 -0
- warp/examples/fem/example_mixed_elasticity.py +291 -0
- warp/examples/fem/example_navier_stokes.py +261 -0
- warp/examples/fem/example_nonconforming_contact.py +298 -0
- warp/examples/fem/example_stokes.py +213 -0
- warp/examples/fem/example_stokes_transfer.py +262 -0
- warp/examples/fem/example_streamlines.py +352 -0
- warp/examples/fem/utils.py +1000 -0
- warp/examples/interop/example_jax_callable.py +116 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +205 -0
- warp/examples/optim/example_bounce.py +266 -0
- warp/examples/optim/example_cloth_throw.py +228 -0
- warp/examples/optim/example_diffray.py +561 -0
- warp/examples/optim/example_drone.py +870 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/optim/example_inverse_kinematics.py +182 -0
- warp/examples/optim/example_inverse_kinematics_torch.py +191 -0
- warp/examples/optim/example_softbody_properties.py +400 -0
- warp/examples/optim/example_spring_cage.py +245 -0
- warp/examples/optim/example_trajectory.py +227 -0
- warp/examples/sim/example_cartpole.py +143 -0
- warp/examples/sim/example_cloth.py +225 -0
- warp/examples/sim/example_cloth_self_contact.py +322 -0
- warp/examples/sim/example_granular.py +130 -0
- warp/examples/sim/example_granular_collision_sdf.py +202 -0
- warp/examples/sim/example_jacobian_ik.py +244 -0
- warp/examples/sim/example_particle_chain.py +124 -0
- warp/examples/sim/example_quadruped.py +203 -0
- warp/examples/sim/example_rigid_chain.py +203 -0
- warp/examples/sim/example_rigid_contact.py +195 -0
- warp/examples/sim/example_rigid_force.py +133 -0
- warp/examples/sim/example_rigid_gyroscopic.py +115 -0
- warp/examples/sim/example_rigid_soft_contact.py +140 -0
- warp/examples/sim/example_soft_body.py +196 -0
- warp/examples/tile/example_tile_cholesky.py +87 -0
- warp/examples/tile/example_tile_convolution.py +66 -0
- warp/examples/tile/example_tile_fft.py +55 -0
- warp/examples/tile/example_tile_filtering.py +113 -0
- warp/examples/tile/example_tile_matmul.py +85 -0
- warp/examples/tile/example_tile_mlp.py +383 -0
- warp/examples/tile/example_tile_nbody.py +199 -0
- warp/examples/tile/example_tile_walker.py +327 -0
- warp/fabric.py +355 -0
- warp/fem/__init__.py +106 -0
- warp/fem/adaptivity.py +508 -0
- warp/fem/cache.py +572 -0
- warp/fem/dirichlet.py +202 -0
- warp/fem/domain.py +411 -0
- warp/fem/field/__init__.py +125 -0
- warp/fem/field/field.py +619 -0
- warp/fem/field/nodal_field.py +326 -0
- warp/fem/field/restriction.py +37 -0
- warp/fem/field/virtual.py +848 -0
- warp/fem/geometry/__init__.py +32 -0
- warp/fem/geometry/adaptive_nanogrid.py +857 -0
- warp/fem/geometry/closest_point.py +84 -0
- warp/fem/geometry/deformed_geometry.py +221 -0
- warp/fem/geometry/element.py +776 -0
- warp/fem/geometry/geometry.py +362 -0
- warp/fem/geometry/grid_2d.py +392 -0
- warp/fem/geometry/grid_3d.py +452 -0
- warp/fem/geometry/hexmesh.py +911 -0
- warp/fem/geometry/nanogrid.py +571 -0
- warp/fem/geometry/partition.py +389 -0
- warp/fem/geometry/quadmesh.py +663 -0
- warp/fem/geometry/tetmesh.py +855 -0
- warp/fem/geometry/trimesh.py +806 -0
- warp/fem/integrate.py +2335 -0
- warp/fem/linalg.py +419 -0
- warp/fem/operator.py +293 -0
- warp/fem/polynomial.py +229 -0
- warp/fem/quadrature/__init__.py +17 -0
- warp/fem/quadrature/pic_quadrature.py +299 -0
- warp/fem/quadrature/quadrature.py +591 -0
- warp/fem/space/__init__.py +228 -0
- warp/fem/space/basis_function_space.py +468 -0
- warp/fem/space/basis_space.py +667 -0
- warp/fem/space/dof_mapper.py +251 -0
- warp/fem/space/function_space.py +309 -0
- warp/fem/space/grid_2d_function_space.py +177 -0
- warp/fem/space/grid_3d_function_space.py +227 -0
- warp/fem/space/hexmesh_function_space.py +257 -0
- warp/fem/space/nanogrid_function_space.py +201 -0
- warp/fem/space/partition.py +367 -0
- warp/fem/space/quadmesh_function_space.py +223 -0
- warp/fem/space/restriction.py +179 -0
- warp/fem/space/shape/__init__.py +143 -0
- warp/fem/space/shape/cube_shape_function.py +1105 -0
- warp/fem/space/shape/shape_function.py +133 -0
- warp/fem/space/shape/square_shape_function.py +926 -0
- warp/fem/space/shape/tet_shape_function.py +834 -0
- warp/fem/space/shape/triangle_shape_function.py +672 -0
- warp/fem/space/tetmesh_function_space.py +271 -0
- warp/fem/space/topology.py +424 -0
- warp/fem/space/trimesh_function_space.py +194 -0
- warp/fem/types.py +99 -0
- warp/fem/utils.py +420 -0
- warp/jax.py +187 -0
- warp/jax_experimental/__init__.py +16 -0
- warp/jax_experimental/custom_call.py +351 -0
- warp/jax_experimental/ffi.py +698 -0
- warp/jax_experimental/xla_ffi.py +602 -0
- warp/math.py +244 -0
- warp/native/array.h +1145 -0
- warp/native/builtin.h +1800 -0
- warp/native/bvh.cpp +492 -0
- warp/native/bvh.cu +791 -0
- warp/native/bvh.h +554 -0
- warp/native/clang/clang.cpp +536 -0
- warp/native/coloring.cpp +613 -0
- warp/native/crt.cpp +51 -0
- warp/native/crt.h +362 -0
- warp/native/cuda_crt.h +1058 -0
- warp/native/cuda_util.cpp +646 -0
- warp/native/cuda_util.h +307 -0
- warp/native/error.cpp +77 -0
- warp/native/error.h +36 -0
- warp/native/exports.h +1878 -0
- warp/native/fabric.h +245 -0
- warp/native/hashgrid.cpp +311 -0
- warp/native/hashgrid.cu +87 -0
- warp/native/hashgrid.h +240 -0
- warp/native/initializer_array.h +41 -0
- warp/native/intersect.h +1230 -0
- warp/native/intersect_adj.h +375 -0
- warp/native/intersect_tri.h +339 -0
- warp/native/marching.cpp +19 -0
- warp/native/marching.cu +514 -0
- warp/native/marching.h +19 -0
- warp/native/mat.h +2220 -0
- warp/native/mathdx.cpp +87 -0
- warp/native/matnn.h +343 -0
- warp/native/mesh.cpp +266 -0
- warp/native/mesh.cu +404 -0
- warp/native/mesh.h +1980 -0
- warp/native/nanovdb/GridHandle.h +366 -0
- warp/native/nanovdb/HostBuffer.h +590 -0
- warp/native/nanovdb/NanoVDB.h +6624 -0
- warp/native/nanovdb/PNanoVDB.h +3390 -0
- warp/native/noise.h +859 -0
- warp/native/quat.h +1371 -0
- warp/native/rand.h +342 -0
- warp/native/range.h +139 -0
- warp/native/reduce.cpp +174 -0
- warp/native/reduce.cu +364 -0
- warp/native/runlength_encode.cpp +79 -0
- warp/native/runlength_encode.cu +61 -0
- warp/native/scan.cpp +47 -0
- warp/native/scan.cu +53 -0
- warp/native/scan.h +23 -0
- warp/native/solid_angle.h +466 -0
- warp/native/sort.cpp +251 -0
- warp/native/sort.cu +277 -0
- warp/native/sort.h +33 -0
- warp/native/sparse.cpp +378 -0
- warp/native/sparse.cu +524 -0
- warp/native/spatial.h +657 -0
- warp/native/svd.h +702 -0
- warp/native/temp_buffer.h +46 -0
- warp/native/tile.h +2584 -0
- warp/native/tile_reduce.h +264 -0
- warp/native/vec.h +1426 -0
- warp/native/volume.cpp +501 -0
- warp/native/volume.cu +67 -0
- warp/native/volume.h +969 -0
- warp/native/volume_builder.cu +477 -0
- warp/native/volume_builder.h +52 -0
- warp/native/volume_impl.h +70 -0
- warp/native/warp.cpp +1082 -0
- warp/native/warp.cu +3636 -0
- warp/native/warp.h +381 -0
- warp/optim/__init__.py +17 -0
- warp/optim/adam.py +163 -0
- warp/optim/linear.py +1137 -0
- warp/optim/sgd.py +112 -0
- warp/paddle.py +407 -0
- warp/render/__init__.py +18 -0
- warp/render/render_opengl.py +3518 -0
- warp/render/render_usd.py +784 -0
- warp/render/utils.py +160 -0
- warp/sim/__init__.py +65 -0
- warp/sim/articulation.py +793 -0
- warp/sim/collide.py +2395 -0
- warp/sim/graph_coloring.py +300 -0
- warp/sim/import_mjcf.py +790 -0
- warp/sim/import_snu.py +227 -0
- warp/sim/import_urdf.py +579 -0
- warp/sim/import_usd.py +894 -0
- warp/sim/inertia.py +324 -0
- warp/sim/integrator.py +242 -0
- warp/sim/integrator_euler.py +1997 -0
- warp/sim/integrator_featherstone.py +2101 -0
- warp/sim/integrator_vbd.py +2048 -0
- warp/sim/integrator_xpbd.py +3292 -0
- warp/sim/model.py +4791 -0
- warp/sim/particles.py +121 -0
- warp/sim/render.py +427 -0
- warp/sim/utils.py +428 -0
- warp/sparse.py +2057 -0
- warp/stubs.py +3333 -0
- warp/tape.py +1203 -0
- warp/tests/__init__.py +1 -0
- warp/tests/__main__.py +4 -0
- warp/tests/assets/curlnoise_golden.npy +0 -0
- warp/tests/assets/mlp_golden.npy +0 -0
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/assets/pnoise_golden.npy +0 -0
- warp/tests/assets/spiky.usd +0 -0
- warp/tests/assets/test_grid.nvdb +0 -0
- warp/tests/assets/test_index_grid.nvdb +0 -0
- warp/tests/assets/test_int32_grid.nvdb +0 -0
- warp/tests/assets/test_vec_grid.nvdb +0 -0
- warp/tests/assets/torus.nvdb +0 -0
- warp/tests/assets/torus.usda +105 -0
- warp/tests/aux_test_class_kernel.py +34 -0
- warp/tests/aux_test_compile_consts_dummy.py +18 -0
- warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
- warp/tests/aux_test_dependent.py +29 -0
- warp/tests/aux_test_grad_customs.py +29 -0
- warp/tests/aux_test_instancing_gc.py +26 -0
- warp/tests/aux_test_module_unload.py +23 -0
- warp/tests/aux_test_name_clash1.py +40 -0
- warp/tests/aux_test_name_clash2.py +40 -0
- warp/tests/aux_test_reference.py +9 -0
- warp/tests/aux_test_reference_reference.py +8 -0
- warp/tests/aux_test_square.py +16 -0
- warp/tests/aux_test_unresolved_func.py +22 -0
- warp/tests/aux_test_unresolved_symbol.py +22 -0
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/cuda/test_async.py +676 -0
- warp/tests/cuda/test_ipc.py +124 -0
- warp/tests/cuda/test_mempool.py +233 -0
- warp/tests/cuda/test_multigpu.py +169 -0
- warp/tests/cuda/test_peer.py +139 -0
- warp/tests/cuda/test_pinned.py +84 -0
- warp/tests/cuda/test_streams.py +634 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/geometry/test_bvh.py +200 -0
- warp/tests/geometry/test_hash_grid.py +221 -0
- warp/tests/geometry/test_marching_cubes.py +74 -0
- warp/tests/geometry/test_mesh.py +316 -0
- warp/tests/geometry/test_mesh_query_aabb.py +399 -0
- warp/tests/geometry/test_mesh_query_point.py +932 -0
- warp/tests/geometry/test_mesh_query_ray.py +311 -0
- warp/tests/geometry/test_volume.py +1103 -0
- warp/tests/geometry/test_volume_write.py +346 -0
- warp/tests/interop/__init__.py +0 -0
- warp/tests/interop/test_dlpack.py +729 -0
- warp/tests/interop/test_jax.py +371 -0
- warp/tests/interop/test_paddle.py +800 -0
- warp/tests/interop/test_torch.py +1001 -0
- warp/tests/run_coverage_serial.py +39 -0
- warp/tests/sim/__init__.py +0 -0
- warp/tests/sim/disabled_kinematics.py +244 -0
- warp/tests/sim/flaky_test_sim_grad.py +290 -0
- warp/tests/sim/test_collision.py +604 -0
- warp/tests/sim/test_coloring.py +258 -0
- warp/tests/sim/test_model.py +224 -0
- warp/tests/sim/test_sim_grad_bounce_linear.py +212 -0
- warp/tests/sim/test_sim_kinematics.py +98 -0
- warp/tests/sim/test_vbd.py +597 -0
- warp/tests/test_adam.py +163 -0
- warp/tests/test_arithmetic.py +1096 -0
- warp/tests/test_array.py +2972 -0
- warp/tests/test_array_reduce.py +156 -0
- warp/tests/test_assert.py +250 -0
- warp/tests/test_atomic.py +153 -0
- warp/tests/test_bool.py +220 -0
- warp/tests/test_builtins_resolution.py +1298 -0
- warp/tests/test_closest_point_edge_edge.py +327 -0
- warp/tests/test_codegen.py +810 -0
- warp/tests/test_codegen_instancing.py +1495 -0
- warp/tests/test_compile_consts.py +215 -0
- warp/tests/test_conditional.py +252 -0
- warp/tests/test_context.py +42 -0
- warp/tests/test_copy.py +238 -0
- warp/tests/test_ctypes.py +638 -0
- warp/tests/test_dense.py +73 -0
- warp/tests/test_devices.py +97 -0
- warp/tests/test_examples.py +482 -0
- warp/tests/test_fabricarray.py +996 -0
- warp/tests/test_fast_math.py +74 -0
- warp/tests/test_fem.py +2003 -0
- warp/tests/test_fp16.py +136 -0
- warp/tests/test_func.py +454 -0
- warp/tests/test_future_annotations.py +98 -0
- warp/tests/test_generics.py +656 -0
- warp/tests/test_grad.py +893 -0
- warp/tests/test_grad_customs.py +339 -0
- warp/tests/test_grad_debug.py +341 -0
- warp/tests/test_implicit_init.py +411 -0
- warp/tests/test_import.py +45 -0
- warp/tests/test_indexedarray.py +1140 -0
- warp/tests/test_intersect.py +73 -0
- warp/tests/test_iter.py +76 -0
- warp/tests/test_large.py +177 -0
- warp/tests/test_launch.py +411 -0
- warp/tests/test_lerp.py +151 -0
- warp/tests/test_linear_solvers.py +193 -0
- warp/tests/test_lvalue.py +427 -0
- warp/tests/test_mat.py +2089 -0
- warp/tests/test_mat_lite.py +122 -0
- warp/tests/test_mat_scalar_ops.py +2913 -0
- warp/tests/test_math.py +178 -0
- warp/tests/test_mlp.py +282 -0
- warp/tests/test_module_hashing.py +258 -0
- warp/tests/test_modules_lite.py +44 -0
- warp/tests/test_noise.py +252 -0
- warp/tests/test_operators.py +299 -0
- warp/tests/test_options.py +129 -0
- warp/tests/test_overwrite.py +551 -0
- warp/tests/test_print.py +339 -0
- warp/tests/test_quat.py +2315 -0
- warp/tests/test_rand.py +339 -0
- warp/tests/test_reload.py +302 -0
- warp/tests/test_rounding.py +185 -0
- warp/tests/test_runlength_encode.py +196 -0
- warp/tests/test_scalar_ops.py +105 -0
- warp/tests/test_smoothstep.py +108 -0
- warp/tests/test_snippet.py +318 -0
- warp/tests/test_sparse.py +582 -0
- warp/tests/test_spatial.py +2229 -0
- warp/tests/test_special_values.py +361 -0
- warp/tests/test_static.py +592 -0
- warp/tests/test_struct.py +734 -0
- warp/tests/test_tape.py +204 -0
- warp/tests/test_transient_module.py +93 -0
- warp/tests/test_triangle_closest_point.py +145 -0
- warp/tests/test_types.py +562 -0
- warp/tests/test_utils.py +588 -0
- warp/tests/test_vec.py +1487 -0
- warp/tests/test_vec_lite.py +80 -0
- warp/tests/test_vec_scalar_ops.py +2327 -0
- warp/tests/test_verify_fp.py +100 -0
- warp/tests/tile/__init__.py +0 -0
- warp/tests/tile/test_tile.py +780 -0
- warp/tests/tile/test_tile_load.py +407 -0
- warp/tests/tile/test_tile_mathdx.py +208 -0
- warp/tests/tile/test_tile_mlp.py +402 -0
- warp/tests/tile/test_tile_reduce.py +447 -0
- warp/tests/tile/test_tile_shared_memory.py +247 -0
- warp/tests/tile/test_tile_view.py +173 -0
- warp/tests/unittest_serial.py +47 -0
- warp/tests/unittest_suites.py +427 -0
- warp/tests/unittest_utils.py +468 -0
- warp/tests/walkthrough_debug.py +93 -0
- warp/thirdparty/__init__.py +0 -0
- warp/thirdparty/appdirs.py +598 -0
- warp/thirdparty/dlpack.py +145 -0
- warp/thirdparty/unittest_parallel.py +570 -0
- warp/torch.py +391 -0
- warp/types.py +5230 -0
- warp/utils.py +1137 -0
- warp_lang-1.7.0.dist-info/METADATA +516 -0
- warp_lang-1.7.0.dist-info/RECORD +429 -0
- warp_lang-1.7.0.dist-info/WHEEL +5 -0
- warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
- warp_lang-1.7.0.dist-info/top_level.txt +1 -0
warp/native/rand.h
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*
|
|
5
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
* you may not use this file except in compliance with the License.
|
|
7
|
+
* You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
# pragma once
|
|
19
|
+
#include "array.h"
|
|
20
|
+
|
|
21
|
+
#ifndef M_PI_F
|
|
22
|
+
#define M_PI_F 3.14159265358979323846f
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
/*
|
|
26
|
+
* Please first read the randf comment. randf returns values uniformly distributed in the range [0.f, 1.f - 2.^-24] in equal intervals of size 2.^-24.
|
|
27
|
+
* randn computes sqrt(-2.f * log(x)). For this to return a real value, log(x) < 0.f (we exclude 0.f as a precaution) and therefore x < 1.f.
|
|
28
|
+
* For it to be finite, x > 0.f. So x must be in (0.f, 1.f). We define RANDN_EPSILON to be 2^-24 truncated to 5.96e-8f and add it to the range of randf,
|
|
29
|
+
* giving the domain [RANDN_EPSILON, 1.f - 2.^-24 + RANDN_EPSILON] which satisfies the requirement that x is in (0.f, 1.f).
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
#ifndef RANDN_EPSILON
|
|
33
|
+
#define RANDN_EPSILON 5.96e-8f
|
|
34
|
+
#endif
|
|
35
|
+
|
|
36
|
+
namespace wp
|
|
37
|
+
{
|
|
38
|
+
|
|
39
|
+
/*
|
|
40
|
+
* Mark Jarzynski and Marc Olano, Hash Functions for GPU Rendering, Journal of Computer
|
|
41
|
+
* Graphics Techniques (JCGT), vol. 9, no. 3, 20–38, 2020
|
|
42
|
+
*/
|
|
43
|
+
inline CUDA_CALLABLE uint32 rand_pcg(uint32 state)
|
|
44
|
+
{
|
|
45
|
+
uint32 b = state * 747796405u + 2891336453u;
|
|
46
|
+
uint32 c = ((b >> ((b >> 28u) + 4u)) ^ b) * 277803737u;
|
|
47
|
+
return (c >> 22u) ^ c;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
inline CUDA_CALLABLE uint32 rand_init(int seed) { return rand_pcg(uint32(seed)); }
|
|
51
|
+
inline CUDA_CALLABLE uint32 rand_init(int seed, int offset) { return rand_pcg(uint32(seed) + rand_pcg(uint32(offset))); }
|
|
52
|
+
|
|
53
|
+
inline CUDA_CALLABLE int randi(uint32& state) { state = rand_pcg(state); return int(state); }
|
|
54
|
+
inline CUDA_CALLABLE int randi(uint32& state, int min, int max) { state = rand_pcg(state); return state % (max - min) + min; }
|
|
55
|
+
|
|
56
|
+
inline CUDA_CALLABLE uint32 randu(uint32& state) { state = rand_pcg(state); return state; }
|
|
57
|
+
inline CUDA_CALLABLE uint32 randu(uint32& state, uint32 min, uint32 max) { state = rand_pcg(state); return state % (max - min) + min; }
|
|
58
|
+
|
|
59
|
+
/*
|
|
60
|
+
* We want to ensure randf adheres to a uniform distribution over [0,1). The set of all possible float32 (IEEE 754 standard) values is not uniformly distributed however.
|
|
61
|
+
* On the other hand, for a given sign and exponent, the mantissa of the float32 representation is uniformly distributed.
|
|
62
|
+
* Fixing an exponent of -1, we can craft a uniform distribution using the sign bit and 23-bit mantissa that spans the domain [0, 1) in 2^24 equal intervals.
|
|
63
|
+
* We can map 2^24 unique unsigned integers to these 2^24 intervals, so if our random number generator returns values in the range [0, 2^24) without bias,
|
|
64
|
+
* we can ensure that our float distribution in the range [0, 1) is also without bias.
|
|
65
|
+
* Our random number generator returns values in the range [0, 2^32), so we bit shift a random unsigned int 8 places, and then make the assumption that the remaining bit strings
|
|
66
|
+
* are uniformly distributed. After dividing by 2.^24, randf returns values uniformly distributed in the range [0.f, 1.f - 2.^-24].
|
|
67
|
+
*/
|
|
68
|
+
inline CUDA_CALLABLE float randf(uint32& state) { state = rand_pcg(state); return (state >> 8) * (1.0f / 16777216.0f); }
|
|
69
|
+
inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (max - min) * randf(state) + min; }
|
|
70
|
+
|
|
71
|
+
// Box-Muller method
|
|
72
|
+
inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) + RANDN_EPSILON)) * cos(2.f * M_PI_F * randf(state)); }
|
|
73
|
+
|
|
74
|
+
// Adjoint (reverse-mode differentiation) stubs for the random number
// generators. Random sampling is treated as non-differentiable, so all of
// these are intentionally empty and propagate no gradient to the seed,
// state, or range arguments.
inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, float adj_ret) {}
inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, float adj_ret) {}

inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, float adj_ret) {}
inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, float adj_ret) {}

inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, float adj_ret) {}
inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, float adj_ret) {}

inline CUDA_CALLABLE void adj_randf(uint32& state, uint32& adj_state, float adj_ret) {}
inline CUDA_CALLABLE void adj_randf(uint32& state, float min, float max, uint32& adj_state, float& adj_min, float& adj_max, float adj_ret) {}

inline CUDA_CALLABLE void adj_randn(uint32& state, uint32& adj_state, float adj_ret) {}
|
|
87
|
+
|
|
88
|
+
// Draw an index from the discrete distribution described by a cumulative
// distribution function, by inverse-transform sampling: pick a uniform
// threshold and binary-search for the first CDF entry not below it.
inline CUDA_CALLABLE int sample_cdf(uint32& state, const array_t<float>& cdf)
{
    const float threshold = randf(state);
    return lower_bound<float>(cdf, threshold);
}
|
|
93
|
+
|
|
94
|
+
/*
|
|
95
|
+
* uniform sampling methods for various geometries
|
|
96
|
+
*/
|
|
97
|
+
inline CUDA_CALLABLE vec2 sample_triangle(uint32& state)
|
|
98
|
+
{
|
|
99
|
+
float r = sqrt(randf(state));
|
|
100
|
+
float u = 1.f - r;
|
|
101
|
+
float v = randf(state) * r;
|
|
102
|
+
return vec2(u, v);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
inline CUDA_CALLABLE vec2 sample_unit_ring(uint32& state)
|
|
106
|
+
{
|
|
107
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
108
|
+
float x = cos(theta);
|
|
109
|
+
float y = sin(theta);
|
|
110
|
+
return vec2(x, y);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
inline CUDA_CALLABLE vec2 sample_unit_disk(uint32& state)
|
|
114
|
+
{
|
|
115
|
+
float r = sqrt(randf(state));
|
|
116
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
117
|
+
float x = r * cos(theta);
|
|
118
|
+
float y = r * sin(theta);
|
|
119
|
+
return vec2(x, y);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
inline CUDA_CALLABLE vec3 sample_unit_sphere_surface(uint32& state)
|
|
123
|
+
{
|
|
124
|
+
float phi = acos(1.f - 2.f * randf(state));
|
|
125
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
126
|
+
float x = cos(theta) * sin(phi);
|
|
127
|
+
float y = sin(theta) * sin(phi);
|
|
128
|
+
float z = cos(phi);
|
|
129
|
+
return vec3(x, y, z);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
inline CUDA_CALLABLE vec3 sample_unit_sphere(uint32& state)
|
|
133
|
+
{
|
|
134
|
+
float phi = acos(1.f - 2.f * randf(state));
|
|
135
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
136
|
+
float r = pow(randf(state), 1.f/3.f);
|
|
137
|
+
float x = r * cos(theta) * sin(phi);
|
|
138
|
+
float y = r * sin(theta) * sin(phi);
|
|
139
|
+
float z = r * cos(phi);
|
|
140
|
+
return vec3(x, y, z);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
inline CUDA_CALLABLE vec3 sample_unit_hemisphere_surface(uint32& state)
|
|
144
|
+
{
|
|
145
|
+
float phi = acos(1.f - randf(state));
|
|
146
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
147
|
+
float x = cos(theta) * sin(phi);
|
|
148
|
+
float y = sin(theta) * sin(phi);
|
|
149
|
+
float z = cos(phi);
|
|
150
|
+
return vec3(x, y, z);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
inline CUDA_CALLABLE vec3 sample_unit_hemisphere(uint32& state)
|
|
154
|
+
{
|
|
155
|
+
float phi = acos(1.f - randf(state));
|
|
156
|
+
float theta = randf(state, 0.f, 2.f*M_PI_F);
|
|
157
|
+
float r = pow(randf(state), 1.f/3.f);
|
|
158
|
+
float x = r * cos(theta) * sin(phi);
|
|
159
|
+
float y = r * sin(theta) * sin(phi);
|
|
160
|
+
float z = r * cos(phi);
|
|
161
|
+
return vec3(x, y, z);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
inline CUDA_CALLABLE vec2 sample_unit_square(uint32& state)
|
|
165
|
+
{
|
|
166
|
+
float x = randf(state) - 0.5f;
|
|
167
|
+
float y = randf(state) - 0.5f;
|
|
168
|
+
return vec2(x, y);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
inline CUDA_CALLABLE vec3 sample_unit_cube(uint32& state)
|
|
172
|
+
{
|
|
173
|
+
float x = randf(state) - 0.5f;
|
|
174
|
+
float y = randf(state) - 0.5f;
|
|
175
|
+
float z = randf(state) - 0.5f;
|
|
176
|
+
return vec3(x, y, z);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
inline CUDA_CALLABLE vec4 sample_unit_hypercube(uint32& state)
|
|
180
|
+
{
|
|
181
|
+
float a = randf(state) - 0.5f;
|
|
182
|
+
float b = randf(state) - 0.5f;
|
|
183
|
+
float c = randf(state) - 0.5f;
|
|
184
|
+
float d = randf(state) - 0.5f;
|
|
185
|
+
return vec4(a, b, c, d);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// Adjoint stubs for the geometric samplers. Sampling is treated as
// non-differentiable, so all of these are intentionally empty.
inline CUDA_CALLABLE void adj_sample_cdf(uint32& state, const array_t<float>& cdf, uint32& adj_state, array_t<float>& adj_cdf, const int& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_triangle(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_ring(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_disk(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_sphere_surface(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_sphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_hemisphere_surface(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_hemisphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_square(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
inline CUDA_CALLABLE void adj_sample_unit_cube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
// Fixed: sample_unit_hypercube() returns a vec4, so the adjoint of its
// return value must be a vec4 as well (was declared as const vec3&, which
// cannot bind to the vec4 adjoint the generated backward pass supplies).
inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec4& adj_ret) {}
|
|
199
|
+
|
|
200
|
+
/*
 * log-gamma function to support some of these distributions. The
 * algorithm comes from SPECFUN by Shanjie Zhang and Jianming Jin and their
 * book "Computation of Special Functions", 1996, John Wiley & Sons, Inc.
 *
 * If random_loggam(k+1) is being used to compute log(k!) for an integer k, consider
 * using logfactorial(k) instead.
 */
inline CUDA_CALLABLE float random_loggam(float x)
{
    float x0, x2, lg2pi, gl, gl0;
    uint32 n;

    // Coefficients of the asymptotic (Stirling-type) series in 1/x0^2.
    const float a[10] = {8.333333333333333e-02f, -2.777777777777778e-03f,
                         7.936507936507937e-04f, -5.952380952380952e-04f,
                         8.417508417508418e-04f, -1.917526917526918e-03f,
                         6.410256410256410e-03f, -2.955065359477124e-02f,
                         1.796443723688307e-01f, -1.39243221690590e+00f};

    // gamma(1) == gamma(2) == 1, so log-gamma is exactly zero there.
    if ((x == 1.f) || (x == 2.f))
    {
        return 0.f;
    }
    // The series is accurate only for large arguments: for x < 7, evaluate
    // at the shifted point x0 = x + n >= 7 and correct afterwards via the
    // recurrence gamma(x + 1) = x * gamma(x).
    else if (x < 7.f)
    {
        n = uint32((7 - x));
    }
    else
    {
        n = 0;
    }

    x0 = x + float(n);
    x2 = (1.f / x0) * (1.f / x0);
    // log(2 * M_PI_F)
    lg2pi = 1.8378770664093453f;
    // Evaluate the series with Horner's rule, highest-order term first.
    gl0 = a[9];
    for (int i = 8; i >= 0; i--)
    {
        gl0 *= x2;
        gl0 += a[i];
    }
    // Stirling-type approximation of log(gamma(x0)).
    gl = gl0 / x0 + 0.5f * lg2pi + (x0 - 0.5f) * log(x0) - x0;
    // Undo the argument shift: subtract log of each factor introduced by
    // the recurrence, walking x0 back down to x.
    if (x < 7.f)
    {
        for (uint32 k = 1; k <= n; k++)
        {
            gl -= log(x0 - 1.f);
            x0 -= 1.f;
        }
    }
    return gl;
}
|
|
253
|
+
|
|
254
|
+
/*
 * Knuth's multiplicative Poisson sampler: count how many uniform draws can
 * be multiplied together before the running product drops to exp(-lam).
 * Expected cost grows with lam, so the caller (poisson()) only uses this
 * for small rates.
 */
inline CUDA_CALLABLE uint32 random_poisson_mult(uint32& state, float lam)
{
    const float enlam = exp(-lam);
    uint32 count = 0;
    float prod = 1.f;

    for (;;)
    {
        prod *= randf(state);
        if (prod <= enlam)
        {
            return count;
        }
        count += 1;
    }
}
|
|
276
|
+
|
|
277
|
+
/*
 * The transformed rejection method for generating Poisson random variables
 * W. Hoermann
 * Insurance: Mathematics and Economics 12, 39-45 (1993)
 */
inline CUDA_CALLABLE uint32 random_poisson(uint32& state, float lam)
{
    uint32 k;
    float U, V, slam, loglam, a, b, invalpha, vr, us;

    slam = sqrt(lam);
    loglam = log(lam);
    // Fitted constants from Hoermann's paper; intended for large rates
    // (the caller, poisson(), only dispatches here when lam >= 10).
    b = 0.931f + 2.53f * slam;
    a = -0.059f + 0.02483f * b;
    invalpha = 1.1239f + 1.1328f / (b - 3.4f);
    vr = 0.9277f - 3.6224f / (b - 2.f);

    while (1)
    {
        // U in [-0.5, 0.5), V in [0, 1).
        U = randf(state) - 0.5f;
        V = randf(state);
        us = 0.5f - abs(U);
        // Candidate sample produced by the transformation.
        k = uint32(floor((2.f * a / us + b) * U + lam + 0.43f));
        // Quick acceptance region: no logarithms required.
        if ((us >= 0.07f) && (V <= vr))
        {
            return k;
        }
        // Quick rejection region.
        if ((us < 0.013f) && (V > us))
        {
            continue;
        }
        // Full acceptance test against the Poisson log-pmf,
        // log(pmf(k)) = -lam + k*log(lam) - log(k!).
        if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= (-lam + k * loglam - random_loggam(k + 1)))
        {
            return k;
        }
    }
}
|
|
314
|
+
|
|
315
|
+
/*
|
|
316
|
+
* Adapted from NumPy's implementation
|
|
317
|
+
* Warp's state variable is half the precision of NumPy's so
|
|
318
|
+
* poisson implementation uses half the precision used in NumPy's implementation
|
|
319
|
+
* both precisions appear to converge in the statistical limit
|
|
320
|
+
*/
|
|
321
|
+
// Sample a Poisson-distributed integer with rate lam.
// Dispatches between the two samplers: lam == 0 always yields 0, large lam
// uses the transformed-rejection method, small lam uses the multiplication
// method.
inline CUDA_CALLABLE uint32 poisson(uint32& state, float lam)
{
    if (lam == 0.f)
    {
        return 0;
    }

    if (lam >= 10.f)
    {
        return random_poisson(state, lam);
    }

    return random_poisson_mult(state, lam);
}
|
|
336
|
+
|
|
337
|
+
// Adjoints for the Poisson samplers: the samples are discrete integers, so no
// useful gradient flows through them and all adjoints are intentional no-ops.
inline CUDA_CALLABLE void adj_random_loggam(float x, float& adj_x, const float adj_ret) {}
// NOTE(review): renamed from random_poisson_mult() — the adjoint stub was
// missing its adj_ prefix, which left a do-nothing overload shadowing the
// sampler instead of following the adj_* convention of its siblings.
inline CUDA_CALLABLE void adj_random_poisson_mult(uint32& state, float lam, uint32& adj_state, float& adj_lam, const uint32& adj_ret) {}
inline CUDA_CALLABLE void adj_random_poisson(uint32& state, float lam, uint32& adj_state, float& adj_lam, const uint32& adj_ret) {}
inline CUDA_CALLABLE void adj_poisson(uint32& state, float lam, uint32& adj_state, float& adj_lam, const uint32& adj_ret) {}
|
|
341
|
+
|
|
342
|
+
} // namespace wp
|
warp/native/range.h
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*
|
|
5
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
* you may not use this file except in compliance with the License.
|
|
7
|
+
* You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#pragma once
|
|
19
|
+
|
|
20
|
+
namespace wp
|
|
21
|
+
{
|
|
22
|
+
|
|
23
|
+
// All iterable types should implement 3 methods:
|
|
24
|
+
//
|
|
25
|
+
// T iter_next(iter) - returns the current value and moves iterator to next state
|
|
26
|
+
// int iter_cmp(iter) - returns 0 if finished
|
|
27
|
+
// iter iter_reverse(iter) - return an iterator of the same type representing the reverse order
|
|
28
|
+
//
|
|
29
|
+
// iter_next() should also be registered as a built-in hidden function so that code-gen
|
|
30
|
+
// can call it and generate the appropriate variable storage
|
|
31
|
+
|
|
32
|
+
// represents a built-in Python range() loop
|
|
33
|
+
// Iterator state for a Python-style range() loop; see iter_next()/iter_cmp().
struct range_t
{
    // Default constructor: an empty, zero-step range.
    CUDA_CALLABLE range_t()
        : start(0),
          end(0),
          step(0),
          i(0)
    {}

    int start;  // first value of the sequence (inclusive)
    int end;    // stopping bound (exclusive, compared with < or > by iter_cmp)
    int step;   // increment applied per iteration; may be negative

    int i;      // current position, advanced by iter_next()
};
|
|
48
|
+
|
|
49
|
+
// Equivalent to Python's range(end): values 0, 1, ..., end - 1.
CUDA_CALLABLE inline range_t range(int end)
{
    range_t result;
    result.start = 0;
    result.end = end;
    result.step = 1;
    result.i = 0;
    return result;
}
|
|
60
|
+
|
|
61
|
+
// Equivalent to Python's range(start, end) with an implicit step of 1.
CUDA_CALLABLE inline range_t range(int start, int end)
{
    range_t result;
    result.start = start;
    result.end = end;
    result.step = 1;
    result.i = start;
    return result;
}
|
|
72
|
+
|
|
73
|
+
// Equivalent to Python's range(start, end, step); step may be negative.
CUDA_CALLABLE inline range_t range(int start, int end, int step)
{
    range_t result;
    result.start = start;
    result.end = end;
    result.step = step;
    result.i = start;
    return result;
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
// Adjoints for range() construction: integer loop bounds carry no gradient,
// so all three overloads are deliberate no-ops.
CUDA_CALLABLE inline void adj_range(int end, int adj_end, range_t& adj_ret) {}
CUDA_CALLABLE inline void adj_range(int start, int end, int adj_start, int adj_end, range_t& adj_ret) {}
CUDA_CALLABLE inline void adj_range(int start, int end, int step, int adj_start, int adj_end, int adj_step, range_t& adj_ret) {}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
// Return the current loop value, then advance the iterator by one step.
CUDA_CALLABLE inline int iter_next(range_t& r)
{
    const int current = r.i;
    r.i = current + r.step;
    return current;
}
|
|
98
|
+
|
|
99
|
+
// For-loop continuation test that mirrors Python range() semantics,
// including ranges with negative steps.
CUDA_CALLABLE inline bool iter_cmp(const range_t& r)
{
    // Degenerate zero-step range is treated as exhausted.
    if (r.step == 0)
    {
        return false;
    }

    // Forward ranges stop at end from below; reverse ranges from above.
    return (r.step > 0) ? (r.i < r.end) : (r.i > r.end);
}
|
|
112
|
+
|
|
113
|
+
// Build the range equivalent to Python's reversed(range(start, end, step)):
// the new start is the last element the forward range would visit, and the
// new (exclusive) end sits one step before the forward start.
CUDA_CALLABLE inline range_t iter_reverse(const range_t& r)
{
    const int span = r.end - r.start;

    range_t rev;
    // Truncating integer division locates the final forward element for
    // both positive and negative steps.
    rev.start = (r.step > 0)
        ? r.start + ((span - 1) / r.step) * r.step
        : r.start + ((span + 1) / r.step) * r.step;
    rev.end = r.start - r.step;
    rev.step = -r.step;
    rev.i = rev.start;

    return rev;
}
|
|
134
|
+
|
|
135
|
+
// Adjoint of iter_reverse(): reversing a range is pure integer bookkeeping
// with no differentiable state, so the adjoint is a no-op.
CUDA_CALLABLE inline void adj_iter_reverse(const range_t& r, range_t& adj_r, range_t& adj_ret)
{
}
|
|
138
|
+
|
|
139
|
+
} // namespace wp
|
warp/native/reduce.cpp
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
*
|
|
5
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
* you may not use this file except in compliance with the License.
|
|
7
|
+
* You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#include "warp.h"
|
|
19
|
+
|
|
20
|
+
namespace
|
|
21
|
+
{
|
|
22
|
+
|
|
23
|
+
// Specialized accumulation functions for common type sizes
|
|
24
|
+
// Element-wise accumulate a compile-time-length vector: sum[i] += val[i].
// value_size is ignored; it exists so the signature matches dyn_len_sum
// and both can be selected through the same function pointer.
template <int N, typename T> void fixed_len_sum(const T *val, T *sum, int value_size)
{
    for (int idx = 0; idx < N; ++idx)
    {
        sum[idx] += val[idx];
    }
}
|
|
31
|
+
|
|
32
|
+
// Element-wise accumulate a runtime-length vector: sum[i] += val[i]
// for i in [0, value_size).
template <typename T> void dyn_len_sum(const T *val, T *sum, int value_size)
{
    for (int idx = 0; idx < value_size; ++idx)
    {
        sum[idx] += val[idx];
    }
}
|
|
39
|
+
|
|
40
|
+
// Accumulate the dot product of two compile-time-length vectors into *dot.
// value_size is ignored; the signature matches dyn_len_inner so both can be
// selected through the same function pointer.
template <int N, typename T> void fixed_len_inner(const T *a, const T *b, T *dot, int value_size)
{
    for (int idx = 0; idx < N; ++idx)
    {
        *dot += a[idx] * b[idx];
    }
}
|
|
47
|
+
|
|
48
|
+
// Accumulate the dot product of two runtime-length vectors into *dot.
template <typename T> void dyn_len_inner(const T *a, const T *b, T *dot, int value_size)
{
    for (int idx = 0; idx < value_size; ++idx)
    {
        *dot += a[idx] * b[idx];
    }
}
|
|
55
|
+
|
|
56
|
+
} // namespace
|
|
57
|
+
|
|
58
|
+
template <typename T>
|
|
59
|
+
void array_inner_host(const T *ptr_a, const T *ptr_b, T *ptr_out, int count, int byte_stride_a, int byte_stride_b,
|
|
60
|
+
int type_length)
|
|
61
|
+
{
|
|
62
|
+
assert((byte_stride_a % sizeof(T)) == 0);
|
|
63
|
+
assert((byte_stride_b % sizeof(T)) == 0);
|
|
64
|
+
const int stride_a = byte_stride_a / sizeof(T);
|
|
65
|
+
const int stride_b = byte_stride_b / sizeof(T);
|
|
66
|
+
|
|
67
|
+
void (*inner_func)(const T *, const T *, T *, int);
|
|
68
|
+
switch (type_length)
|
|
69
|
+
{
|
|
70
|
+
case 1:
|
|
71
|
+
inner_func = fixed_len_inner<1, T>;
|
|
72
|
+
break;
|
|
73
|
+
case 2:
|
|
74
|
+
inner_func = fixed_len_inner<2, T>;
|
|
75
|
+
break;
|
|
76
|
+
case 3:
|
|
77
|
+
inner_func = fixed_len_inner<3, T>;
|
|
78
|
+
break;
|
|
79
|
+
case 4:
|
|
80
|
+
inner_func = fixed_len_inner<4, T>;
|
|
81
|
+
break;
|
|
82
|
+
default:
|
|
83
|
+
inner_func = dyn_len_inner<T>;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
*ptr_out = 0.0f;
|
|
87
|
+
for (int i = 0; i < count; ++i)
|
|
88
|
+
{
|
|
89
|
+
inner_func(ptr_a + i * stride_a, ptr_b + i * stride_b, ptr_out, type_length);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
template <typename T> void array_sum_host(const T *ptr_a, T *ptr_out, int count, int byte_stride, int type_length)
|
|
94
|
+
{
|
|
95
|
+
assert((byte_stride % sizeof(T)) == 0);
|
|
96
|
+
const int stride = byte_stride / sizeof(T);
|
|
97
|
+
|
|
98
|
+
void (*accumulate_func)(const T *, T *, int);
|
|
99
|
+
switch (type_length)
|
|
100
|
+
{
|
|
101
|
+
case 1:
|
|
102
|
+
accumulate_func = fixed_len_sum<1, T>;
|
|
103
|
+
break;
|
|
104
|
+
case 2:
|
|
105
|
+
accumulate_func = fixed_len_sum<2, T>;
|
|
106
|
+
break;
|
|
107
|
+
case 3:
|
|
108
|
+
accumulate_func = fixed_len_sum<3, T>;
|
|
109
|
+
break;
|
|
110
|
+
case 4:
|
|
111
|
+
accumulate_func = fixed_len_sum<4, T>;
|
|
112
|
+
break;
|
|
113
|
+
default:
|
|
114
|
+
accumulate_func = dyn_len_sum<T>;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
memset(ptr_out, 0, sizeof(T)*type_length);
|
|
118
|
+
for (int i = 0; i < count; ++i)
|
|
119
|
+
accumulate_func(ptr_a + i * stride, ptr_out, type_length);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
void array_inner_float_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
|
|
123
|
+
int type_length)
|
|
124
|
+
{
|
|
125
|
+
const float *ptr_a = (const float *)(a);
|
|
126
|
+
const float *ptr_b = (const float *)(b);
|
|
127
|
+
float *ptr_out = (float *)(out);
|
|
128
|
+
|
|
129
|
+
array_inner_host(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_length);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
void array_inner_double_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
|
|
133
|
+
int type_length)
|
|
134
|
+
{
|
|
135
|
+
const double *ptr_a = (const double *)(a);
|
|
136
|
+
const double *ptr_b = (const double *)(b);
|
|
137
|
+
double *ptr_out = (double *)(out);
|
|
138
|
+
|
|
139
|
+
array_inner_host(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_length);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
void array_sum_float_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
|
|
143
|
+
{
|
|
144
|
+
const float *ptr_a = (const float *)(a);
|
|
145
|
+
float *ptr_out = (float *)(out);
|
|
146
|
+
array_sum_host(ptr_a, ptr_out, count, byte_stride_a, type_length);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
void array_sum_double_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
|
|
150
|
+
{
|
|
151
|
+
const double *ptr_a = (const double *)(a);
|
|
152
|
+
double *ptr_out = (double *)(out);
|
|
153
|
+
array_sum_host(ptr_a, ptr_out, count, byte_stride_a, type_length);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
#if !WP_ENABLE_CUDA
// NOTE(review): these device entry points are compiled only when CUDA is
// disabled, and their bodies are deliberately empty — presumably so the
// exported symbol set stays identical across CPU-only and CUDA builds;
// confirm against the corresponding .cu implementations.
void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                              int type_length)
{
}

void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                               int type_length)
{
}

void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
{
}

void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
{
}
#endif
|