warp-lang 1.0.2__py3-none-manylinux2014_x86_64.whl → 1.1.0__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +108 -97
- warp/__init__.pyi +1 -1
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +115 -113
- warp/build_dll.py +383 -375
- warp/builtins.py +3425 -3354
- warp/codegen.py +2878 -2792
- warp/config.py +40 -36
- warp/constants.py +45 -45
- warp/context.py +5194 -5102
- warp/dlpack.py +442 -442
- warp/examples/__init__.py +16 -16
- warp/examples/assets/bear.usd +0 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/assets/cartpole.urdf +110 -110
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/cube.usd +0 -0
- warp/examples/assets/nv_ant.xml +92 -92
- warp/examples/assets/nv_humanoid.xml +183 -183
- warp/examples/assets/quadruped.urdf +267 -267
- warp/examples/assets/rocks.nvdb +0 -0
- warp/examples/assets/rocks.usd +0 -0
- warp/examples/assets/sphere.usd +0 -0
- warp/examples/benchmarks/benchmark_api.py +383 -383
- warp/examples/benchmarks/benchmark_cloth.py +278 -277
- warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -88
- warp/examples/benchmarks/benchmark_cloth_jax.py +97 -100
- warp/examples/benchmarks/benchmark_cloth_numba.py +146 -142
- warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -77
- warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -86
- warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -112
- warp/examples/benchmarks/benchmark_cloth_warp.py +146 -146
- warp/examples/benchmarks/benchmark_launches.py +295 -295
- warp/examples/browse.py +29 -29
- warp/examples/core/example_dem.py +234 -219
- warp/examples/core/example_fluid.py +293 -267
- warp/examples/core/example_graph_capture.py +144 -126
- warp/examples/core/example_marching_cubes.py +188 -174
- warp/examples/core/example_mesh.py +174 -155
- warp/examples/core/example_mesh_intersect.py +205 -193
- warp/examples/core/example_nvdb.py +176 -170
- warp/examples/core/example_raycast.py +105 -90
- warp/examples/core/example_raymarch.py +199 -178
- warp/examples/core/example_render_opengl.py +185 -141
- warp/examples/core/example_sph.py +405 -387
- warp/examples/core/example_torch.py +222 -181
- warp/examples/core/example_wave.py +263 -248
- warp/examples/fem/bsr_utils.py +378 -380
- warp/examples/fem/example_apic_fluid.py +407 -389
- warp/examples/fem/example_convection_diffusion.py +182 -168
- warp/examples/fem/example_convection_diffusion_dg.py +219 -209
- warp/examples/fem/example_convection_diffusion_dg0.py +204 -194
- warp/examples/fem/example_deformed_geometry.py +177 -159
- warp/examples/fem/example_diffusion.py +201 -173
- warp/examples/fem/example_diffusion_3d.py +177 -152
- warp/examples/fem/example_diffusion_mgpu.py +221 -214
- warp/examples/fem/example_mixed_elasticity.py +244 -222
- warp/examples/fem/example_navier_stokes.py +259 -243
- warp/examples/fem/example_stokes.py +220 -192
- warp/examples/fem/example_stokes_transfer.py +265 -249
- warp/examples/fem/mesh_utils.py +133 -109
- warp/examples/fem/plot_utils.py +292 -287
- warp/examples/optim/example_bounce.py +260 -246
- warp/examples/optim/example_cloth_throw.py +222 -209
- warp/examples/optim/example_diffray.py +566 -536
- warp/examples/optim/example_drone.py +864 -835
- warp/examples/optim/example_inverse_kinematics.py +176 -168
- warp/examples/optim/example_inverse_kinematics_torch.py +185 -169
- warp/examples/optim/example_spring_cage.py +239 -231
- warp/examples/optim/example_trajectory.py +223 -199
- warp/examples/optim/example_walker.py +306 -293
- warp/examples/sim/example_cartpole.py +139 -129
- warp/examples/sim/example_cloth.py +196 -186
- warp/examples/sim/example_granular.py +124 -111
- warp/examples/sim/example_granular_collision_sdf.py +197 -186
- warp/examples/sim/example_jacobian_ik.py +236 -214
- warp/examples/sim/example_particle_chain.py +118 -105
- warp/examples/sim/example_quadruped.py +193 -180
- warp/examples/sim/example_rigid_chain.py +197 -187
- warp/examples/sim/example_rigid_contact.py +189 -177
- warp/examples/sim/example_rigid_force.py +127 -125
- warp/examples/sim/example_rigid_gyroscopic.py +109 -95
- warp/examples/sim/example_rigid_soft_contact.py +134 -122
- warp/examples/sim/example_soft_body.py +190 -177
- warp/fabric.py +337 -335
- warp/fem/__init__.py +60 -27
- warp/fem/cache.py +401 -388
- warp/fem/dirichlet.py +178 -179
- warp/fem/domain.py +262 -263
- warp/fem/field/__init__.py +100 -101
- warp/fem/field/field.py +148 -149
- warp/fem/field/nodal_field.py +298 -299
- warp/fem/field/restriction.py +22 -21
- warp/fem/field/test.py +180 -181
- warp/fem/field/trial.py +183 -183
- warp/fem/geometry/__init__.py +15 -19
- warp/fem/geometry/closest_point.py +69 -70
- warp/fem/geometry/deformed_geometry.py +270 -271
- warp/fem/geometry/element.py +744 -744
- warp/fem/geometry/geometry.py +184 -186
- warp/fem/geometry/grid_2d.py +380 -373
- warp/fem/geometry/grid_3d.py +441 -435
- warp/fem/geometry/hexmesh.py +953 -953
- warp/fem/geometry/partition.py +374 -376
- warp/fem/geometry/quadmesh_2d.py +532 -532
- warp/fem/geometry/tetmesh.py +840 -840
- warp/fem/geometry/trimesh_2d.py +577 -577
- warp/fem/integrate.py +1630 -1615
- warp/fem/operator.py +190 -191
- warp/fem/polynomial.py +214 -213
- warp/fem/quadrature/__init__.py +2 -2
- warp/fem/quadrature/pic_quadrature.py +243 -245
- warp/fem/quadrature/quadrature.py +295 -294
- warp/fem/space/__init__.py +294 -292
- warp/fem/space/basis_space.py +488 -489
- warp/fem/space/collocated_function_space.py +100 -105
- warp/fem/space/dof_mapper.py +236 -236
- warp/fem/space/function_space.py +148 -145
- warp/fem/space/grid_2d_function_space.py +267 -267
- warp/fem/space/grid_3d_function_space.py +305 -306
- warp/fem/space/hexmesh_function_space.py +350 -352
- warp/fem/space/partition.py +350 -350
- warp/fem/space/quadmesh_2d_function_space.py +368 -369
- warp/fem/space/restriction.py +158 -160
- warp/fem/space/shape/__init__.py +13 -15
- warp/fem/space/shape/cube_shape_function.py +738 -738
- warp/fem/space/shape/shape_function.py +102 -103
- warp/fem/space/shape/square_shape_function.py +611 -611
- warp/fem/space/shape/tet_shape_function.py +565 -567
- warp/fem/space/shape/triangle_shape_function.py +429 -429
- warp/fem/space/tetmesh_function_space.py +294 -292
- warp/fem/space/topology.py +297 -295
- warp/fem/space/trimesh_2d_function_space.py +223 -221
- warp/fem/types.py +77 -77
- warp/fem/utils.py +495 -495
- warp/jax.py +166 -141
- warp/jax_experimental.py +341 -339
- warp/native/array.h +1072 -1025
- warp/native/builtin.h +1560 -1560
- warp/native/bvh.cpp +398 -398
- warp/native/bvh.cu +525 -525
- warp/native/bvh.h +429 -429
- warp/native/clang/clang.cpp +495 -464
- warp/native/crt.cpp +31 -31
- warp/native/crt.h +334 -334
- warp/native/cuda_crt.h +1049 -1049
- warp/native/cuda_util.cpp +549 -540
- warp/native/cuda_util.h +288 -203
- warp/native/cutlass_gemm.cpp +34 -34
- warp/native/cutlass_gemm.cu +372 -372
- warp/native/error.cpp +66 -66
- warp/native/error.h +27 -27
- warp/native/fabric.h +228 -228
- warp/native/hashgrid.cpp +301 -278
- warp/native/hashgrid.cu +78 -77
- warp/native/hashgrid.h +227 -227
- warp/native/initializer_array.h +32 -32
- warp/native/intersect.h +1204 -1204
- warp/native/intersect_adj.h +365 -365
- warp/native/intersect_tri.h +322 -322
- warp/native/marching.cpp +2 -2
- warp/native/marching.cu +497 -497
- warp/native/marching.h +2 -2
- warp/native/mat.h +1498 -1498
- warp/native/matnn.h +333 -333
- warp/native/mesh.cpp +203 -203
- warp/native/mesh.cu +293 -293
- warp/native/mesh.h +1887 -1887
- warp/native/nanovdb/NanoVDB.h +4782 -4782
- warp/native/nanovdb/PNanoVDB.h +2553 -2553
- warp/native/nanovdb/PNanoVDBWrite.h +294 -294
- warp/native/noise.h +850 -850
- warp/native/quat.h +1084 -1084
- warp/native/rand.h +299 -299
- warp/native/range.h +108 -108
- warp/native/reduce.cpp +156 -156
- warp/native/reduce.cu +348 -348
- warp/native/runlength_encode.cpp +61 -61
- warp/native/runlength_encode.cu +46 -46
- warp/native/scan.cpp +30 -30
- warp/native/scan.cu +36 -36
- warp/native/scan.h +7 -7
- warp/native/solid_angle.h +442 -442
- warp/native/sort.cpp +94 -94
- warp/native/sort.cu +97 -97
- warp/native/sort.h +14 -14
- warp/native/sparse.cpp +337 -337
- warp/native/sparse.cu +544 -544
- warp/native/spatial.h +630 -630
- warp/native/svd.h +562 -562
- warp/native/temp_buffer.h +30 -30
- warp/native/vec.h +1132 -1132
- warp/native/volume.cpp +297 -297
- warp/native/volume.cu +32 -32
- warp/native/volume.h +538 -538
- warp/native/volume_builder.cu +425 -425
- warp/native/volume_builder.h +19 -19
- warp/native/warp.cpp +1057 -1052
- warp/native/warp.cu +2943 -2828
- warp/native/warp.h +313 -305
- warp/optim/__init__.py +9 -9
- warp/optim/adam.py +120 -120
- warp/optim/linear.py +1104 -939
- warp/optim/sgd.py +104 -92
- warp/render/__init__.py +10 -10
- warp/render/render_opengl.py +3217 -3204
- warp/render/render_usd.py +768 -749
- warp/render/utils.py +152 -150
- warp/sim/__init__.py +52 -59
- warp/sim/articulation.py +685 -685
- warp/sim/collide.py +1594 -1590
- warp/sim/import_mjcf.py +489 -481
- warp/sim/import_snu.py +220 -221
- warp/sim/import_urdf.py +536 -516
- warp/sim/import_usd.py +887 -881
- warp/sim/inertia.py +316 -317
- warp/sim/integrator.py +234 -233
- warp/sim/integrator_euler.py +1956 -1956
- warp/sim/integrator_featherstone.py +1910 -1991
- warp/sim/integrator_xpbd.py +3294 -3312
- warp/sim/model.py +4473 -4314
- warp/sim/particles.py +113 -112
- warp/sim/render.py +417 -403
- warp/sim/utils.py +413 -410
- warp/sparse.py +1227 -1227
- warp/stubs.py +2109 -2469
- warp/tape.py +1162 -225
- warp/tests/__init__.py +1 -1
- warp/tests/__main__.py +4 -4
- warp/tests/assets/torus.usda +105 -105
- warp/tests/aux_test_class_kernel.py +26 -26
- warp/tests/aux_test_compile_consts_dummy.py +10 -10
- warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -21
- warp/tests/aux_test_dependent.py +22 -22
- warp/tests/aux_test_grad_customs.py +23 -23
- warp/tests/aux_test_reference.py +11 -11
- warp/tests/aux_test_reference_reference.py +10 -10
- warp/tests/aux_test_square.py +17 -17
- warp/tests/aux_test_unresolved_func.py +14 -14
- warp/tests/aux_test_unresolved_symbol.py +14 -14
- warp/tests/disabled_kinematics.py +239 -239
- warp/tests/run_coverage_serial.py +31 -31
- warp/tests/test_adam.py +157 -157
- warp/tests/test_arithmetic.py +1124 -1124
- warp/tests/test_array.py +2417 -2326
- warp/tests/test_array_reduce.py +150 -150
- warp/tests/test_async.py +668 -656
- warp/tests/test_atomic.py +141 -141
- warp/tests/test_bool.py +204 -149
- warp/tests/test_builtins_resolution.py +1292 -1292
- warp/tests/test_bvh.py +164 -171
- warp/tests/test_closest_point_edge_edge.py +228 -228
- warp/tests/test_codegen.py +566 -553
- warp/tests/test_compile_consts.py +97 -101
- warp/tests/test_conditional.py +246 -246
- warp/tests/test_copy.py +232 -215
- warp/tests/test_ctypes.py +632 -632
- warp/tests/test_dense.py +67 -67
- warp/tests/test_devices.py +91 -98
- warp/tests/test_dlpack.py +530 -529
- warp/tests/test_examples.py +400 -378
- warp/tests/test_fabricarray.py +955 -955
- warp/tests/test_fast_math.py +62 -54
- warp/tests/test_fem.py +1277 -1278
- warp/tests/test_fp16.py +130 -130
- warp/tests/test_func.py +338 -337
- warp/tests/test_generics.py +571 -571
- warp/tests/test_grad.py +746 -640
- warp/tests/test_grad_customs.py +333 -336
- warp/tests/test_hash_grid.py +210 -164
- warp/tests/test_import.py +39 -39
- warp/tests/test_indexedarray.py +1134 -1134
- warp/tests/test_intersect.py +67 -67
- warp/tests/test_jax.py +307 -307
- warp/tests/test_large.py +167 -164
- warp/tests/test_launch.py +354 -354
- warp/tests/test_lerp.py +261 -261
- warp/tests/test_linear_solvers.py +191 -171
- warp/tests/test_lvalue.py +421 -493
- warp/tests/test_marching_cubes.py +65 -65
- warp/tests/test_mat.py +1801 -1827
- warp/tests/test_mat_lite.py +115 -115
- warp/tests/test_mat_scalar_ops.py +2907 -2889
- warp/tests/test_math.py +126 -193
- warp/tests/test_matmul.py +500 -499
- warp/tests/test_matmul_lite.py +410 -410
- warp/tests/test_mempool.py +188 -190
- warp/tests/test_mesh.py +284 -324
- warp/tests/test_mesh_query_aabb.py +228 -241
- warp/tests/test_mesh_query_point.py +692 -702
- warp/tests/test_mesh_query_ray.py +292 -303
- warp/tests/test_mlp.py +276 -276
- warp/tests/test_model.py +110 -110
- warp/tests/test_modules_lite.py +39 -39
- warp/tests/test_multigpu.py +163 -163
- warp/tests/test_noise.py +248 -248
- warp/tests/test_operators.py +250 -250
- warp/tests/test_options.py +123 -125
- warp/tests/test_peer.py +133 -137
- warp/tests/test_pinned.py +78 -78
- warp/tests/test_print.py +54 -54
- warp/tests/test_quat.py +2086 -2086
- warp/tests/test_rand.py +288 -288
- warp/tests/test_reload.py +217 -217
- warp/tests/test_rounding.py +179 -179
- warp/tests/test_runlength_encode.py +190 -190
- warp/tests/test_sim_grad.py +243 -0
- warp/tests/test_sim_kinematics.py +91 -97
- warp/tests/test_smoothstep.py +168 -168
- warp/tests/test_snippet.py +305 -266
- warp/tests/test_sparse.py +468 -460
- warp/tests/test_spatial.py +2148 -2148
- warp/tests/test_streams.py +486 -473
- warp/tests/test_struct.py +710 -675
- warp/tests/test_tape.py +173 -148
- warp/tests/test_torch.py +743 -743
- warp/tests/test_transient_module.py +87 -87
- warp/tests/test_types.py +556 -659
- warp/tests/test_utils.py +490 -499
- warp/tests/test_vec.py +1264 -1268
- warp/tests/test_vec_lite.py +73 -73
- warp/tests/test_vec_scalar_ops.py +2099 -2099
- warp/tests/test_verify_fp.py +94 -94
- warp/tests/test_volume.py +737 -736
- warp/tests/test_volume_write.py +255 -265
- warp/tests/unittest_serial.py +37 -37
- warp/tests/unittest_suites.py +363 -359
- warp/tests/unittest_utils.py +603 -578
- warp/tests/unused_test_misc.py +71 -71
- warp/tests/walkthrough_debug.py +85 -85
- warp/thirdparty/appdirs.py +598 -598
- warp/thirdparty/dlpack.py +143 -143
- warp/thirdparty/unittest_parallel.py +566 -561
- warp/torch.py +321 -295
- warp/types.py +4504 -4450
- warp/utils.py +1008 -821
- {warp_lang-1.0.2.dist-info → warp_lang-1.1.0.dist-info}/LICENSE.md +126 -126
- {warp_lang-1.0.2.dist-info → warp_lang-1.1.0.dist-info}/METADATA +338 -400
- warp_lang-1.1.0.dist-info/RECORD +352 -0
- warp/examples/assets/cube.usda +0 -42
- warp/examples/assets/sphere.usda +0 -56
- warp/examples/assets/torus.usda +0 -105
- warp_lang-1.0.2.dist-info/RECORD +0 -352
- {warp_lang-1.0.2.dist-info → warp_lang-1.1.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.0.2.dist-info → warp_lang-1.1.0.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu
CHANGED
|
@@ -1,525 +1,525 @@
|
|
|
1
|
-
/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
-
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
-
* and proprietary rights in and to this software, related documentation
|
|
4
|
-
* and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
-
* distribution of this software and related documentation without an express
|
|
6
|
-
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
-
*/
|
|
8
|
-
|
|
9
|
-
#include "warp.h"
|
|
10
|
-
#include "cuda_util.h"
|
|
11
|
-
#include "bvh.h"
|
|
12
|
-
#include "sort.h"
|
|
13
|
-
|
|
14
|
-
#include <vector>
|
|
15
|
-
#include <algorithm>
|
|
16
|
-
|
|
17
|
-
#include <cuda.h>
|
|
18
|
-
#include <cuda_runtime_api.h>
|
|
19
|
-
|
|
20
|
-
#define THRUST_IGNORE_CUB_VERSION_CHECK
|
|
21
|
-
|
|
22
|
-
#include <cub/cub.cuh>
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
namespace wp
|
|
26
|
-
{
|
|
27
|
-
|
|
28
|
-
// Refits the BVH bottom-up. Each thread starts on one leaf, refreshes that leaf's
// bounds from the item arrays, then climbs towards the root merging child bounds.
//
//   n                          number of threads that do work (the launcher passes the item count)
//   parents                    parent index of every node, -1 marks the root
//   child_count                per-node arrival counters (bvh_refit_device zeroes them before launch)
//   node_lowers / node_uppers  packed node bounds; node_lowers[].b is the leaf flag,
//                              node_lowers[].i / node_uppers[].i are the left / right child
//                              (for a leaf, node_lowers[].i is the item index)
//   item_lowers / item_uppers  per-item bounds the leaves are refitted to
__global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* item_lowers, const vec3* item_uppers)
{
    int node = blockDim.x*blockIdx.x + threadIdx.x;

    if (node >= n)
        return;

    // only threads that start on a leaf take part in the refit
    if (!node_lowers[node].b)
        return;

    // refresh the leaf from its item's current bounds
    const int item = node_lowers[node].i;

    vec3 item_lower = item_lowers[item];
    vec3 item_upper = item_uppers[item];

    make_node(node_lowers+node, item_lower, item, true);
    make_node(node_uppers+node, item_upper, 0, false);

    // walk up the hierarchy
    while (true)
    {
        const int parent = parents[node];

        // reached the root, nothing left to merge
        if (parent == -1)
            return;

        // make our node writes visible before announcing arrival at the parent
        __threadfence();

        const int arrived = atomicAdd(&child_count[parent], 1);

        // a previous count of 0 means we are the first child to arrive: the sibling
        // is not finished yet, so this thread stops and the sibling's thread continues
        if (arrived != 1)
            break;

        // we are the last child to arrive, so both children are complete:
        // merge their bounds into the parent
        const int left_child = node_lowers[parent].i;
        const int right_child = node_uppers[parent].i;

        vec3 left_lower = vec3(node_lowers[left_child].x,
                               node_lowers[left_child].y,
                               node_lowers[left_child].z);

        vec3 left_upper = vec3(node_uppers[left_child].x,
                               node_uppers[left_child].y,
                               node_uppers[left_child].z);

        vec3 right_lower = vec3(node_lowers[right_child].x,
                                node_lowers[right_child].y,
                                node_lowers[right_child].z);

        vec3 right_upper = vec3(node_uppers[right_child].x,
                                node_uppers[right_child].y,
                                node_uppers[right_child].z);

        // union of the child bounds
        vec3 merged_lower = min(left_lower, right_lower);
        vec3 merged_upper = max(left_upper, right_upper);

        // write the refreshed internal node
        make_node(node_lowers+parent, merged_lower, left_child, false);
        make_node(node_uppers+parent, merged_upper, right_child, false);

        // continue with the parent
        node = parent;
    }
}
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
// Refits the device BVH to the current contents of bvh.item_lowers / bvh.item_uppers
// by clearing the per-node arrival counters and launching bvh_refit_kernel with one
// thread per item.
void bvh_refit_device(BVH& bvh)
{
    // guard on the BVH's context for the duration of this call
    ContextGuard guard(bvh.context);

    // the kernel counts child arrivals per node, so every counter has to start at zero
    const size_t counter_bytes = sizeof(int)*bvh.max_nodes;
    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, counter_bytes);

    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_items, (bvh.num_items, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
}
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
/////////////////////////////////////////////////////////////////////////////////////////////
|
|
124
|
-
|
|
125
|
-
// Create a linear BVH as described in Fast and Simple Agglomerative LBVH construction
|
|
126
|
-
// this is a bottom-up clustering method that outputs one node per-leaf
|
|
127
|
-
//
|
|
128
|
-
// GPU builder for a linear BVH. The three total_* buffers are owned for the lifetime
// of the builder (constructor / destructor); every other buffer is scratch space that
// build() allocates and frees again.
class LinearBVHBuilderGPU
{
public:

    LinearBVHBuilderGPU();
    ~LinearBVHBuilderGPU();

    // bvh is a host-side reference; item_lowers / item_uppers point to the device
    // bounds of each item. total_bounds may be NULL, in which case the overall
    // bounds are computed on the GPU.
    void build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds);

private:

    // scratch buffers, only valid while build() is running
    int* indices;       // item indices, sorted together with keys
    int* keys;          // Morton key per item
    int* deltas;        // XOR of adjacent sorted keys
    int* range_lefts;   // first sorted-key index covered by each node
    int* range_rights;  // last sorted-key index covered by each node
    int* num_children;  // per-node arrival counter used while linking the hierarchy

    // one vec3 each: overall bounds and reciprocal extent used to normalise
    // item centres for the Morton keys
    vec3* total_lower;
    vec3* total_upper;
    vec3* total_inv_edges;
};
|
|
153
|
-
|
|
154
|
-
////////////////////////////////////////////////////////
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
// Computes one Morton key per item from the centre of its bounding box.
//
//   item_lowers / item_uppers  per-item bounds
//   n                          number of items
//   grid_lower, grid_inv_edges single-element arrays: origin and reciprocal extent
//                              used to map the centre into grid-local coordinates
//   indices                    out: identity permutation (indices[i] = i)
//   keys                       out: Morton key of item i
__global__ void compute_morton_codes(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const vec3* grid_lower, const vec3* grid_inv_edges, int* __restrict__ indices, int* __restrict__ keys)
{
    const int item = blockDim.x*blockIdx.x + threadIdx.x;

    if (item >= n)
        return;

    vec3 box_lower = item_lowers[item];
    vec3 box_upper = item_uppers[item];

    // centre of the item's bounding box
    vec3 center = 0.5f*(box_lower+box_upper);

    // position relative to the grid origin, scaled by the reciprocal grid extent
    vec3 local = cw_mul((center-grid_lower[0]), grid_inv_edges[0]);

    // 10-bit Morton codes stored in the lower 30 bits (1024^3 effective resolution)
    int key = morton3<1024>(local[0], local[1], local[2]);

    indices[item] = item;
    keys[item] = key;
}
|
|
178
|
-
|
|
179
|
-
// Stores the XOR of every pair of adjacent Morton keys: deltas[i] = keys[i] ^ keys[i+1].
// The raw XOR is stored (a __clz of it is left disabled below).
// n is the number of pairs; keys[i+1] is read, so keys must hold at least n+1 entries.
__global__ void compute_key_deltas(const int* __restrict__ keys, int* __restrict__ deltas, int n)
{
    const int pair = blockDim.x*blockIdx.x + threadIdx.x;

    if (pair >= n)
        return;

    const int differing_bits = keys[pair] ^ keys[pair+1];

    deltas[pair] = differing_bits;// __clz(differing_bits);
}
|
|
194
|
-
|
|
195
|
-
// Writes one leaf node per sorted slot and initialises its key range to itself.
//
//   item_lowers / item_uppers  per-item bounds
//   n                          number of items (and leaves)
//   indices                    item index stored at each sorted slot
//   range_lefts / range_rights out: both set to the slot index for a leaf
//   lowers / uppers            out: packed leaf nodes
__global__ void build_leaves(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const int* __restrict__ indices, int* __restrict__ range_lefts, int* __restrict__ range_rights, BVHPackedNodeHalf* __restrict__ lowers, BVHPackedNodeHalf* __restrict__ uppers)
{
    const int slot = blockDim.x*blockIdx.x + threadIdx.x;

    if (slot >= n)
        return;

    // the item that ended up at this slot
    const int item = indices[slot];

    vec3 item_lower = item_lowers[item];
    vec3 item_upper = item_uppers[item];

    // leaf node: the lower half carries the leaf flag, both halves carry the item index
    lowers[slot] = make_node(item_lower, item, true);
    uppers[slot] = make_node(item_upper, item, false);

    // a leaf covers exactly its own slot
    range_lefts[slot] = slot;
    range_rights[slot] = slot;
}
|
|
215
|
-
|
|
216
|
-
// Bottom-up construction of the internal nodes. One thread is launched per leaf.
// Each thread picks the parent of its current node, registers the node as that
// parent's left or right child, and bumps the parent's arrival counter. The first
// child to arrive stops; the last one merges both children's bounds into the parent
// and carries on from there, until the node covering the full range [0, n-1] is
// reached and recorded as the root.
//
//   n             number of leaves
//   root          out: index of the root node
//   deltas        XOR of adjacent sorted keys
//   num_children  per-node arrival counters (build() zeroes them before launch)
//   range_lefts / range_rights  key range covered by each node
//   parents       out: parent index per node, -1 for the root
//   lowers / uppers  packed nodes; .i of an internal node holds the left / right child
__global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas, int* __restrict__ num_children, volatile int* __restrict__ range_lefts, volatile int* __restrict__ range_rights, volatile int* __restrict__ parents, volatile BVHPackedNodeHalf* __restrict__ lowers, volatile BVHPackedNodeHalf* __restrict__ uppers)
{
    int node = blockDim.x*blockIdx.x + threadIdx.x;

    if (node >= n)
        return;

    // internal nodes are stored after the n leaves
    const int internal_offset = n;

    while (true)
    {
        const int left = range_lefts[node];
        const int right = range_rights[node];

        // the node spanning every key is the root: store its index and stop
        if (left == 0 && right == n-1)
        {
            *root = node;
            parents[node] = -1;

            break;
        }

        // attach to the right-hand neighbour's parent when there is nothing to the
        // left, or when the delta on the right is smaller than the one on the left
        const bool is_left_child = (left == 0) || (right != n-1 && deltas[right] < deltas[left-1]);

        int parent;

        if (is_left_child)
        {
            parent = right + internal_offset;

            // register as the parent's left child
            parents[node] = parent;
            lowers[parent].i = node;
            range_lefts[parent] = left;
        }
        else
        {
            parent = left + internal_offset - 1;

            // register as the parent's right child
            parents[node] = parent;
            uppers[parent].i = node;
            range_rights[parent] = right;
        }

        // ensure the writes above are visible to all threads before the arrival is counted
        __threadfence();

        const int arrived = atomicAdd(&num_children[parent], 1);

        // a previous count of 0 means we are the first child: the parent is not
        // complete yet, so this thread terminates
        if (arrived != 1)
            break;

        // we are the last child to arrive: the parent is complete, so compute
        // its bounds from both children
        const int left_child = lowers[parent].i;
        const int right_child = uppers[parent].i;

        vec3 left_lower = vec3(lowers[left_child].x,
                               lowers[left_child].y,
                               lowers[left_child].z);

        vec3 left_upper = vec3(uppers[left_child].x,
                               uppers[left_child].y,
                               uppers[left_child].z);

        vec3 right_lower = vec3(lowers[right_child].x,
                                lowers[right_child].y,
                                lowers[right_child].z);

        vec3 right_upper = vec3(uppers[right_child].x,
                                uppers[right_child].y,
                                uppers[right_child].z);

        // union of the child bounds
        vec3 merged_lower = min(left_lower, right_lower);
        vec3 merged_upper = max(left_upper, right_upper);

        // write the new internal node
        make_node(lowers+parent, merged_lower, left_child, false);
        make_node(uppers+parent, merged_upper, right_child, false);

        // move on to processing the parent
        node = parent;
    }
}
|
|
318
|
-
|
|
319
|
-
// Named wrapper that forwards to wp::max for two vec3 values; compute_total_bounds
// passes it to cub::BlockReduce as the reduction operator for the upper bounds.
CUDA_CALLABLE inline vec3 Vec3Max(const vec3& a, const vec3& b)
{
    return wp::max(a, b);
}
|
|
320
|
-
// Named wrapper that forwards to wp::min for two vec3 values; compute_total_bounds
// passes it to cub::BlockReduce as the reduction operator for the lower bounds.
CUDA_CALLABLE inline vec3 Vec3Min(const vec3& a, const vec3& b)
{
    return wp::min(a, b);
}
|
|
321
|
-
|
|
322
|
-
// Reduces the per-item bounds to one overall lower / upper bound.
//
// Each block reduces its own slice of items with cub::BlockReduce; thread 0 of the
// block then folds the block result into total_lower / total_upper with atomic_min /
// atomic_max. The caller must initialise *total_lower to +FLT_MAX and *total_upper to
// -FLT_MAX before the launch (build() does this).
//
//   item_lowers / item_uppers  per-item bounds, num_items entries each
//   total_lower / total_upper  in/out: single-element running bounds
//   num_items                  number of items
//
// NOTE(review): BlockReduce is specialised for 256 threads, so this assumes the launch
// uses 256-thread blocks - confirm against wp_launch_device.
__global__ void compute_total_bounds(const vec3* item_lowers, const vec3* item_uppers, vec3* total_lower, vec3* total_upper, int num_items)
{
    typedef cub::BlockReduce<vec3, 256> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int blockStart = blockDim.x*blockIdx.x;
    const int numValid = ::min(num_items-blockStart, blockDim.x);

    const int tid = blockStart + threadIdx.x;

    // Threads past the end of the item array still have to take part in the block-wide
    // reduction and the barrier below, so they only skip the loads. Their values are
    // the reduction identities and are additionally excluded through numValid.
    vec3 lower(FLT_MAX);
    vec3 upper(-FLT_MAX);

    if (tid < num_items)
    {
        lower = item_lowers[tid];
        upper = item_uppers[tid];
    }

    // BlockReduce::Reduce and __syncthreads() are block-wide collectives: every thread
    // of the block must reach them, so they are kept outside the bounds check.
    vec3 block_upper = BlockReduce(temp_storage).Reduce(upper, Vec3Max, numValid);

    // sync threads because the second reduce uses the same temp storage as the first
    __syncthreads();

    vec3 block_lower = BlockReduce(temp_storage).Reduce(lower, Vec3Min, numValid);

    // only thread 0 holds the reduction result; skip blocks that cover no items at all
    if (threadIdx.x == 0 && numValid > 0)
    {
        // fold the block result into the global bounds
        atomic_max(total_upper, block_upper);
        atomic_min(total_lower, block_lower);
    }
}
|
|
353
|
-
|
|
354
|
-
// Computes the per-axis reciprocal of the overall extent (total_upper - total_lower).
// This runs on the GPU only to avoid a CPU->GPU sync point.
// All three arguments are single-element arrays.
__global__ void compute_total_inv_edges(const vec3* total_lower, const vec3* total_upper, vec3* total_inv_edges)
{
    // small padding on every axis (presumably to keep a zero-width axis from dividing by zero)
    vec3 padded = (total_upper[0]-total_lower[0]);
    padded += vec3(0.0001f);

    const float inv_x = 1.0f/padded[0];
    const float inv_y = 1.0f/padded[1];
    const float inv_z = 1.0f/padded[2];

    total_inv_edges[0] = vec3(inv_x, inv_y, inv_z);
}
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
// Starts with every scratch pointer cleared and allocates the three single-vec3
// device buffers that hold the overall bounds and their reciprocal extent.
LinearBVHBuilderGPU::LinearBVHBuilderGPU()
    : indices(NULL), keys(NULL), deltas(NULL)
    , range_lefts(NULL), range_rights(NULL), num_children(NULL)
    , total_lower(NULL), total_upper(NULL), total_inv_edges(NULL)
{
    // each buffer holds exactly one vec3
    const size_t vec_bytes = sizeof(vec3);

    total_lower = (vec3*)alloc_device(WP_CURRENT_CONTEXT, vec_bytes);
    total_upper = (vec3*)alloc_device(WP_CURRENT_CONTEXT, vec_bytes);
    total_inv_edges = (vec3*)alloc_device(WP_CURRENT_CONTEXT, vec_bytes);
}
|
|
380
|
-
|
|
381
|
-
// Releases the three device buffers allocated by the constructor. The scratch
// buffers are not touched here; build() frees them itself before returning.
LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
{
    // overall lower bound
    free_device(WP_CURRENT_CONTEXT, total_lower);

    // overall upper bound
    free_device(WP_CURRENT_CONTEXT, total_upper);

    // reciprocal extent
    free_device(WP_CURRENT_CONTEXT, total_inv_edges);
}
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
// Builds the BVH for num_items items on the device.
//
//   bvh                        destination; node_lowers, node_uppers, node_parents and root
//                              are written, max_nodes sizes the per-node scratch buffers
//   item_lowers / item_uppers  device pointers to the per-item bounds
//   num_items                  number of items
//   total_bounds               optional host-side overall bounds; when NULL they are
//                              computed on the GPU instead
//
// Steps: overall bounds -> Morton keys -> sort -> key deltas -> leaves -> hierarchy.
void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
{
    // scratch memory used during building, released again at the end of this function
    const size_t item_bytes = sizeof(int)*num_items;
    const size_t node_bytes = sizeof(int)*bvh.max_nodes;

    indices = (int*)alloc_device(WP_CURRENT_CONTEXT, item_bytes*2); // *2 for radix sort
    keys = (int*)alloc_device(WP_CURRENT_CONTEXT, item_bytes*2); // *2 for radix sort
    deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, item_bytes); // XOR of the keys for item i and i+1
    range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, node_bytes);
    range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, node_bytes);
    num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, node_bytes);

    if (total_bounds)
    {
        // overall bounds supplied by the host: compute the reciprocal extent here
        // and upload everything to the GPU directly
        vec3 edges = total_bounds->edges();
        edges += vec3(0.0001f);

        vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);

        memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
    }
    else
    {
        // seed values for the min/max reduction
        // NOTE(review): presumably static so the host storage outlives an asynchronous
        // copy - confirm against memcpy_h2d
        static vec3 upper(-FLT_MAX);
        static vec3 lower(FLT_MAX);

        memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));

        // compute the total bounds on the GPU
        wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));

        // compute the reciprocal of the total extent
        wp_launch_device(WP_CURRENT_CONTEXT, compute_total_inv_edges, 1, (total_lower, total_upper, total_inv_edges));
    }

    // assign a Morton key to each item based on the centre of its bounds
    wp_launch_device(WP_CURRENT_CONTEXT, compute_morton_codes, num_items, (item_lowers, item_uppers, num_items, total_lower, total_inv_edges, indices, keys));

    // sort the item indices by Morton key; the keys come from morton3<1024> in
    // compute_morton_codes, documented there as 3 x 10 bits in the low 30 bits
    radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);

    // calculate deltas between adjacent keys (num_items-1 pairs)
    wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));

    // initialize leaf nodes
    wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));

    // reset the child counters; they are the atomic counters that tell when an
    // internal node is complete and are only used during building
    memset_device(WP_CURRENT_CONTEXT, num_children, 0, node_bytes);

    // build the tree and the internal node bounds
    wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));

    // free the scratch memory
    free_device(WP_CURRENT_CONTEXT, indices);
    free_device(WP_CURRENT_CONTEXT, keys);
    free_device(WP_CURRENT_CONTEXT, deltas);

    free_device(WP_CURRENT_CONTEXT, range_lefts);
    free_device(WP_CURRENT_CONTEXT, range_rights);
    free_device(WP_CURRENT_CONTEXT, num_children);
}
|
|
457
|
-
|
|
458
|
-
// Releases the device buffers referenced by `bvh` (node bounds, parents, counters and root)
// and clears the corresponding pointers in the descriptor.
// item_lowers / item_uppers are not freed here.
void bvh_destroy_device(wp::BVH& bvh)
{
    // all frees below are issued while the BVH's own context is current
    ContextGuard guard(bvh.context);

    // release the buffers first ...
    free_device(WP_CURRENT_CONTEXT, bvh.node_lowers);
    free_device(WP_CURRENT_CONTEXT, bvh.node_uppers);
    free_device(WP_CURRENT_CONTEXT, bvh.node_parents);
    free_device(WP_CURRENT_CONTEXT, bvh.node_counts);
    free_device(WP_CURRENT_CONTEXT, bvh.root);

    // ... then clear the now stale pointers
    bvh.node_lowers = NULL;
    bvh.node_uppers = NULL;
    bvh.node_parents = NULL;
    bvh.node_counts = NULL;
    bvh.root = NULL;
}
|
|
468
|
-
|
|
469
|
-
} // namespace wp
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
void bvh_refit_device(uint64_t id)
|
|
473
|
-
{
|
|
474
|
-
wp::BVH bvh;
|
|
475
|
-
if (bvh_get_descriptor(id, bvh))
|
|
476
|
-
{
|
|
477
|
-
ContextGuard guard(bvh.context);
|
|
478
|
-
|
|
479
|
-
bvh_refit_device(bvh);
|
|
480
|
-
}
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items)
|
|
484
|
-
{
|
|
485
|
-
ContextGuard guard(context);
|
|
486
|
-
|
|
487
|
-
wp::BVH bvh_host;
|
|
488
|
-
bvh_host.num_items = num_items;
|
|
489
|
-
bvh_host.max_nodes = 2*num_items;
|
|
490
|
-
bvh_host.node_lowers = (wp::BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVHPackedNodeHalf)*bvh_host.max_nodes);
|
|
491
|
-
bvh_host.node_uppers = (wp::BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVHPackedNodeHalf)*bvh_host.max_nodes);
|
|
492
|
-
bvh_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
|
|
493
|
-
bvh_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
|
|
494
|
-
bvh_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
|
|
495
|
-
bvh_host.item_lowers = lowers;
|
|
496
|
-
bvh_host.item_uppers = uppers;
|
|
497
|
-
|
|
498
|
-
bvh_host.context = context ? context : cuda_context_get_current();
|
|
499
|
-
|
|
500
|
-
wp::LinearBVHBuilderGPU builder;
|
|
501
|
-
builder.build(bvh_host, lowers, uppers, num_items, NULL);
|
|
502
|
-
|
|
503
|
-
// create device-side BVH descriptor
|
|
504
|
-
wp::BVH* bvh_device = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
|
|
505
|
-
memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device, &bvh_host, sizeof(wp::BVH));
|
|
506
|
-
|
|
507
|
-
uint64_t bvh_id = (uint64_t)bvh_device;
|
|
508
|
-
wp::bvh_add_descriptor(bvh_id, bvh_host);
|
|
509
|
-
|
|
510
|
-
return bvh_id;
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
void bvh_destroy_device(uint64_t id)
|
|
515
|
-
{
|
|
516
|
-
wp::BVH bvh;
|
|
517
|
-
if (wp::bvh_get_descriptor(id, bvh))
|
|
518
|
-
{
|
|
519
|
-
wp::bvh_destroy_device(bvh);
|
|
520
|
-
wp::bvh_rem_descriptor(id);
|
|
521
|
-
|
|
522
|
-
// free descriptor
|
|
523
|
-
free_device(WP_CURRENT_CONTEXT, (void*)id);
|
|
524
|
-
}
|
|
525
|
-
}
|
|
1
|
+
/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
* NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
* and proprietary rights in and to this software, related documentation
|
|
4
|
+
* and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
* distribution of this software and related documentation without an express
|
|
6
|
+
* license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "warp.h"
|
|
10
|
+
#include "cuda_util.h"
|
|
11
|
+
#include "bvh.h"
|
|
12
|
+
#include "sort.h"
|
|
13
|
+
|
|
14
|
+
#include <vector>
|
|
15
|
+
#include <algorithm>
|
|
16
|
+
|
|
17
|
+
#include <cuda.h>
|
|
18
|
+
#include <cuda_runtime_api.h>
|
|
19
|
+
|
|
20
|
+
#define THRUST_IGNORE_CUB_VERSION_CHECK
|
|
21
|
+
|
|
22
|
+
#include <cub/cub.cuh>
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
namespace wp
|
|
26
|
+
{
|
|
27
|
+
|
|
28
|
+
// Refits the bounds of an existing BVH bottom-up.
//
// A thread with index < n whose node is flagged as a leaf (node_lowers[index].b) reloads the
// bounds of the referenced item and then walks towards the root through `parents`.
// `child_count` acts as a per-node arrival counter: the first thread to arrive at a parent
// stops, the second one merges both children's bounds into the parent and carries on.
// The walk ends at the node whose parent is -1.
__global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* item_lowers, const vec3* item_uppers)
{
    int node = blockDim.x*blockIdx.x + threadIdx.x;

    if (node >= n)
        return;

    // only keep leaf threads
    const bool is_leaf = node_lowers[node].b;
    if (!is_leaf)
        return;

    // update the leaf node from the current item bounds
    const int item = node_lowers[node].i;

    vec3 item_lower = item_lowers[item];
    vec3 item_upper = item_uppers[item];

    make_node(node_lowers+node, item_lower, item, true);
    make_node(node_uppers+node, item_upper, 0, false);

    // update hierarchy
    while (true)
    {
        const int parent = parents[node];

        // reached root
        if (parent == -1)
            return;

        // ensure all writes are visible before signalling arrival at the parent
        __threadfence();

        const int arrivals = atomicAdd(&child_count[parent], 1);

        // parent not ready (we are the first child), terminate thread
        if (arrivals != 1)
            return;

        // we are the last child to arrive, so both children are complete:
        // merge their bounds into the parent
        const int left_child = node_lowers[parent].i;
        const int right_child = node_uppers[parent].i;

        vec3 left_lower = vec3(node_lowers[left_child].x,
                               node_lowers[left_child].y,
                               node_lowers[left_child].z);

        vec3 left_upper = vec3(node_uppers[left_child].x,
                               node_uppers[left_child].y,
                               node_uppers[left_child].z);

        vec3 right_lower = vec3(node_lowers[right_child].x,
                                node_lowers[right_child].y,
                                node_lowers[right_child].z);

        vec3 right_upper = vec3(node_uppers[right_child].x,
                                node_uppers[right_child].y,
                                node_uppers[right_child].z);

        // union of child bounds
        vec3 merged_lower = min(left_lower, right_lower);
        vec3 merged_upper = max(left_upper, right_upper);

        // write new BVH nodes
        make_node(node_lowers+parent, merged_lower, left_child, false);
        make_node(node_uppers+parent, merged_upper, right_child, false);

        // move onto processing the parent
        node = parent;
    }
}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
// Recomputes the node bounds of `bvh` on the device by running bvh_refit_kernel
// with one thread per item.
void bvh_refit_device(BVH& bvh)
{
    ContextGuard guard(bvh.context);

    // the kernel uses node_counts as per-node arrival counters, so they must start at zero
    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int)*bvh.max_nodes);

    const int leaf_count = bvh.num_items;

    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, leaf_count, (leaf_count, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
/////////////////////////////////////////////////////////////////////////////////////////////
|
|
124
|
+
|
|
125
|
+
// Create a linear BVH as described in Fast and Simple Agglomerative LBVH construction
|
|
126
|
+
// this is a bottom-up clustering method that outputs one node per-leaf
|
|
127
|
+
//
|
|
128
|
+
class LinearBVHBuilderGPU
|
|
129
|
+
{
|
|
130
|
+
public:
|
|
131
|
+
|
|
132
|
+
LinearBVHBuilderGPU();
|
|
133
|
+
~LinearBVHBuilderGPU();
|
|
134
|
+
|
|
135
|
+
// takes a bvh (host ref), and pointers to the GPU lower and upper bounds for each triangle
|
|
136
|
+
void build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds);
|
|
137
|
+
|
|
138
|
+
private:
|
|
139
|
+
|
|
140
|
+
// temporary data used during building
|
|
141
|
+
int* indices;
|
|
142
|
+
int* keys;
|
|
143
|
+
int* deltas;
|
|
144
|
+
int* range_lefts;
|
|
145
|
+
int* range_rights;
|
|
146
|
+
int* num_children;
|
|
147
|
+
|
|
148
|
+
// bounds data when total item bounds built on GPU
|
|
149
|
+
vec3* total_lower;
|
|
150
|
+
vec3* total_upper;
|
|
151
|
+
vec3* total_inv_edges;
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
////////////////////////////////////////////////////////
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
// Computes one Morton key per item from the centre of its bounding box.
//
// The centre is made relative to grid_lower[0] and scaled by grid_inv_edges[0] before being
// passed to morton3<1024>. For every index < n the kernel writes keys[index] and initialises
// indices[index] to the identity, ready for a key/value sort.
__global__ void compute_morton_codes(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const vec3* grid_lower, const vec3* grid_inv_edges, int* __restrict__ indices, int* __restrict__ keys)
{
    const int tid = blockDim.x*blockIdx.x + threadIdx.x;

    if (tid >= n)
        return;

    // centre of the item's bounds
    vec3 box_lower = item_lowers[tid];
    vec3 box_upper = item_uppers[tid];
    vec3 center = 0.5f*(box_lower+box_upper);

    // position relative to the grid, scaled by the inverse grid edge lengths
    vec3 offset = center-grid_lower[0];
    vec3 local = cw_mul(offset, grid_inv_edges[0]);

    // 10-bit Morton codes stored in lower 30bits (1024^3 effective resolution)
    const int key = morton3<1024>(local[0], local[1], local[2]);

    keys[tid] = key;
    indices[tid] = tid;
}
|
|
178
|
+
|
|
179
|
+
// calculate the index of the first differing bit between two adjacent Morton keys
|
|
180
|
+
__global__ void compute_key_deltas(const int* __restrict__ keys, int* __restrict__ deltas, int n)
|
|
181
|
+
{
|
|
182
|
+
const int index = blockDim.x*blockIdx.x + threadIdx.x;
|
|
183
|
+
|
|
184
|
+
if (index < n)
|
|
185
|
+
{
|
|
186
|
+
int a = keys[index];
|
|
187
|
+
int b = keys[index+1];
|
|
188
|
+
|
|
189
|
+
int x = a^b;
|
|
190
|
+
|
|
191
|
+
deltas[index] = x;// __clz(x);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Initialises the leaf nodes of the hierarchy.
//
// Slot `index` (< n) receives the item indices[index]: its bounds are packed into
// lowers[index] / uppers[index] (the lower half is flagged as a leaf) and its key range is
// set to the single slot [index, index].
__global__ void build_leaves(const vec3* __restrict__ item_lowers, const vec3* __restrict__ item_uppers, int n, const int* __restrict__ indices, int* __restrict__ range_lefts, int* __restrict__ range_rights, BVHPackedNodeHalf* __restrict__ lowers, BVHPackedNodeHalf* __restrict__ uppers)
{
    const int slot = blockDim.x*blockIdx.x + threadIdx.x;

    if (slot >= n)
        return;

    // item stored in this slot
    const int item = indices[slot];

    vec3 item_lower = item_lowers[item];
    vec3 item_upper = item_uppers[item];

    // write leaf nodes
    lowers[slot] = make_node(item_lower, item, true);
    uppers[slot] = make_node(item_upper, item, false);

    // write leaf key ranges, a leaf covers exactly its own slot
    range_lefts[slot] = slot;
    range_rights[slot] = slot;
}
|
|
215
|
+
|
|
216
|
+
// this bottom-up process assigns left and right children and combines bounds to form internal nodes
|
|
217
|
+
// there is one thread launched per-leaf node, each thread calculates it's parent node and assigns
|
|
218
|
+
// itself to either the left or right parent slot, the last child to complete the parent and moves
|
|
219
|
+
// up the hierarchy
|
|
220
|
+
__global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas, int* __restrict__ num_children, volatile int* __restrict__ range_lefts, volatile int* __restrict__ range_rights, volatile int* __restrict__ parents, volatile BVHPackedNodeHalf* __restrict__ lowers, volatile BVHPackedNodeHalf* __restrict__ uppers)
|
|
221
|
+
{
|
|
222
|
+
int index = blockDim.x*blockIdx.x + threadIdx.x;
|
|
223
|
+
|
|
224
|
+
if (index < n)
|
|
225
|
+
{
|
|
226
|
+
const int internal_offset = n;
|
|
227
|
+
|
|
228
|
+
for (;;)
|
|
229
|
+
{
|
|
230
|
+
int left = range_lefts[index];
|
|
231
|
+
int right = range_rights[index];
|
|
232
|
+
|
|
233
|
+
// check if we are the root node, if so then store out our index and terminate
|
|
234
|
+
if (left == 0 && right == n-1)
|
|
235
|
+
{
|
|
236
|
+
*root = index;
|
|
237
|
+
parents[index] = -1;
|
|
238
|
+
|
|
239
|
+
break;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
int childCount = 0;
|
|
243
|
+
|
|
244
|
+
int parent;
|
|
245
|
+
|
|
246
|
+
if (left == 0 || (right != n-1 && deltas[right] < deltas[left-1]))
|
|
247
|
+
{
|
|
248
|
+
parent = right + internal_offset;
|
|
249
|
+
|
|
250
|
+
// set parent left child
|
|
251
|
+
parents[index] = parent;
|
|
252
|
+
lowers[parent].i = index;
|
|
253
|
+
range_lefts[parent] = left;
|
|
254
|
+
|
|
255
|
+
// ensure above writes are visible to all threads
|
|
256
|
+
__threadfence();
|
|
257
|
+
|
|
258
|
+
childCount = atomicAdd(&num_children[parent], 1);
|
|
259
|
+
}
|
|
260
|
+
else
|
|
261
|
+
{
|
|
262
|
+
parent = left + internal_offset - 1;
|
|
263
|
+
|
|
264
|
+
// set parent right child
|
|
265
|
+
parents[index] = parent;
|
|
266
|
+
uppers[parent].i = index;
|
|
267
|
+
range_rights[parent] = right;
|
|
268
|
+
|
|
269
|
+
// ensure above writes are visible to all threads
|
|
270
|
+
__threadfence();
|
|
271
|
+
|
|
272
|
+
childCount = atomicAdd(&num_children[parent], 1);
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// if we have are the last thread (such that the parent node is now complete)
|
|
276
|
+
// then update its bounds and move onto the the next parent in the hierarchy
|
|
277
|
+
if (childCount == 1)
|
|
278
|
+
{
|
|
279
|
+
const int left_child = lowers[parent].i;
|
|
280
|
+
const int right_child = uppers[parent].i;
|
|
281
|
+
|
|
282
|
+
vec3 left_lower = vec3(lowers[left_child].x,
|
|
283
|
+
lowers[left_child].y,
|
|
284
|
+
lowers[left_child].z);
|
|
285
|
+
|
|
286
|
+
vec3 left_upper = vec3(uppers[left_child].x,
|
|
287
|
+
uppers[left_child].y,
|
|
288
|
+
uppers[left_child].z);
|
|
289
|
+
|
|
290
|
+
vec3 right_lower = vec3(lowers[right_child].x,
|
|
291
|
+
lowers[right_child].y,
|
|
292
|
+
lowers[right_child].z);
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
vec3 right_upper = vec3(uppers[right_child].x,
|
|
296
|
+
uppers[right_child].y,
|
|
297
|
+
uppers[right_child].z);
|
|
298
|
+
|
|
299
|
+
// bounds_union of child bounds
|
|
300
|
+
vec3 lower = min(left_lower, right_lower);
|
|
301
|
+
vec3 upper = max(left_upper, right_upper);
|
|
302
|
+
|
|
303
|
+
// write new BVH nodes
|
|
304
|
+
make_node(lowers+parent, lower, left_child, false);
|
|
305
|
+
make_node(uppers+parent, upper, right_child, false);
|
|
306
|
+
|
|
307
|
+
// move onto processing the parent
|
|
308
|
+
index = parent;
|
|
309
|
+
}
|
|
310
|
+
else
|
|
311
|
+
{
|
|
312
|
+
// parent not ready (we are the first child), terminate thread
|
|
313
|
+
break;
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
// Named wrapper around wp::max(a, b), used as the reduction operator for the upper bounds
// in compute_total_bounds.
CUDA_CALLABLE inline vec3 Vec3Max(const vec3& a, const vec3& b)
{
    return wp::max(a, b);
}

// Named wrapper around wp::min(a, b), used as the reduction operator for the lower bounds
// in compute_total_bounds.
CUDA_CALLABLE inline vec3 Vec3Min(const vec3& a, const vec3& b)
{
    return wp::min(a, b);
}
|
|
321
|
+
|
|
322
|
+
// Reduces the bounds of all items into a single bounding box.
//
// Each block reduces the bounds of its items with cub::BlockReduce and thread 0 merges the
// block result into *total_lower / *total_upper with atomic_min / atomic_max, so both outputs
// must be initialised by the caller before the launch.
//
// NOTE(review): BlockReduce is instantiated for 256 threads, so the kernel presumably has to
// be launched with 256 threads per block - confirm against wp_launch_device.
__global__ void compute_total_bounds(const vec3* item_lowers, const vec3* item_uppers, vec3* total_lower, vec3* total_upper, int num_items)
{
    typedef cub::BlockReduce<vec3, 256> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int blockStart = blockDim.x*blockIdx.x;

    // a block without any valid item has nothing to contribute; this test is uniform across
    // the block, so leaving here cannot split the block around the collectives below
    if (blockStart >= num_items)
        return;

    const int numValid = ::min(num_items-blockStart, blockDim.x);

    const int tid = blockStart + threadIdx.x;

    // Every thread of the block must take part in the block-wide reductions and in the
    // barrier between them, including the out-of-range threads of the last partial block.
    // Those threads contribute neutral values, which are ignored anyway because only the
    // first numValid inputs are reduced.
    vec3 lower(FLT_MAX);
    vec3 upper(-FLT_MAX);

    if (tid < num_items)
    {
        lower = item_lowers[tid];
        upper = item_uppers[tid];
    }

    vec3 block_upper = BlockReduce(temp_storage).Reduce(upper, Vec3Max, numValid);

    // sync threads because second reduce uses same temp storage as first
    __syncthreads();

    vec3 block_lower = BlockReduce(temp_storage).Reduce(lower, Vec3Min, numValid);

    // only thread 0 holds the valid block-wide result
    if (threadIdx.x == 0)
    {
        // merge the block results into the global bounds
        atomic_max(total_upper, block_upper);
        atomic_min(total_lower, block_lower);
    }
}
|
|
353
|
+
|
|
354
|
+
// compute inverse edge length, this is just done on the GPU to avoid a CPU->GPU sync point
|
|
355
|
+
__global__ void compute_total_inv_edges(const vec3* total_lower, const vec3* total_upper, vec3* total_inv_edges)
|
|
356
|
+
{
|
|
357
|
+
vec3 edges = (total_upper[0]-total_lower[0]);
|
|
358
|
+
edges += vec3(0.0001f);
|
|
359
|
+
|
|
360
|
+
total_inv_edges[0] = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
// Creates a builder: the scratch pointers used by build() start out as NULL and the three
// single-vec3 device buffers for the total bounds are allocated in the current context.
LinearBVHBuilderGPU::LinearBVHBuilderGPU()
    : indices(NULL)
    , keys(NULL)
    , deltas(NULL)
    , range_lefts(NULL)
    , range_rights(NULL)
    , num_children(NULL)
    , total_lower((vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3)))
    , total_upper((vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3)))
    , total_inv_edges((vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(vec3)))
{
}
|
|
380
|
+
|
|
381
|
+
// Releases the device buffers for the total bounds that were allocated in the constructor.
LinearBVHBuilderGPU::~LinearBVHBuilderGPU()
{
    vec3* const owned_buffers[] = { total_lower, total_upper, total_inv_edges };

    for (int i = 0; i < 3; ++i)
        free_device(WP_CURRENT_CONTEXT, owned_buffers[i]);
}
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
// Builds the hierarchy of `bvh` on the device from per-item bounds.
//
// bvh          - host-side descriptor; node_lowers, node_uppers, node_parents and root are
//                written by the kernels launched here, max_nodes sizes the per-node scratch buffers
// item_lowers  - lower bounds of the items, read by the kernels
// item_uppers  - upper bounds of the items, read by the kernels
// num_items    - number of items; nothing is done when it is <= 0
// total_bounds - optional host-side bounds of all items; when NULL they are computed on the GPU
void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* item_uppers, int num_items, bounds3* total_bounds)
{
    // nothing to build for an empty item set; this also avoids zero-sized allocations and
    // handing a negative count (num_items-1) to compute_key_deltas
    if (num_items <= 0)
        return;

    // allocate temporary memory used during building
    indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
    keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
    deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // XOR of the sorted keys of item i and i+1
    range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
    range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
    num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);

    // if total bounds supplied by the host then we just
    // compute our edge length and upload it to the GPU directly
    if (total_bounds)
    {
        // padded edge lengths of the supplied bounds and their reciprocals
        vec3 edges = (*total_bounds).edges();
        edges += vec3(0.0001f);

        vec3 inv_edges = vec3(1.0f/edges[0], 1.0f/edges[1], 1.0f/edges[2]);

        memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &total_bounds->lower[0], sizeof(vec3));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &total_bounds->upper[0], sizeof(vec3));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_inv_edges, &inv_edges[0], sizeof(vec3));
    }
    else
    {
        // start from an inverted (empty) box so that the atomic min/max in
        // compute_total_bounds can only grow it
        // NOTE(review): these are static presumably so that the host source outlives the
        // copy if memcpy_h2d is asynchronous - confirm before changing
        static vec3 upper(-FLT_MAX);
        static vec3 lower(FLT_MAX);

        memcpy_h2d(WP_CURRENT_CONTEXT, total_lower, &lower, sizeof(lower));
        memcpy_h2d(WP_CURRENT_CONTEXT, total_upper, &upper, sizeof(upper));

        // compute the total bounds on the GPU
        wp_launch_device(WP_CURRENT_CONTEXT, compute_total_bounds, num_items, (item_lowers, item_uppers, total_lower, total_upper, num_items));

        // compute the total edge length
        wp_launch_device(WP_CURRENT_CONTEXT, compute_total_inv_edges, 1, (total_lower, total_upper, total_inv_edges));
    }

    // assign 30-bit Morton code based on the centroid of each triangle and bounds for each leaf
    wp_launch_device(WP_CURRENT_CONTEXT, compute_morton_codes, num_items, (item_lowers, item_uppers, num_items, total_lower, total_inv_edges, indices, keys));

    // sort items based on Morton key (the keys come from morton3<1024>, i.e. 3x10 bit keys combined)
    radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);

    // calculate deltas between adjacent keys (there are num_items-1 adjacent pairs)
    wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));

    // initialize leaf nodes
    wp_launch_device(WP_CURRENT_CONTEXT, build_leaves, num_items, (item_lowers, item_uppers, num_items, indices, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));

    // reset children count, this is our atomic counter so we know when an internal node is complete, only used during building
    memset_device(WP_CURRENT_CONTEXT, num_children, 0, sizeof(int)*bvh.max_nodes);

    // build the tree and internal node bounds
    wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));

    // free temporary memory and clear the members so they do not dangle between builds
    free_device(WP_CURRENT_CONTEXT, indices);
    free_device(WP_CURRENT_CONTEXT, keys);
    free_device(WP_CURRENT_CONTEXT, deltas);

    free_device(WP_CURRENT_CONTEXT, range_lefts);
    free_device(WP_CURRENT_CONTEXT, range_rights);
    free_device(WP_CURRENT_CONTEXT, num_children);

    indices = NULL;
    keys = NULL;
    deltas = NULL;

    range_lefts = NULL;
    range_rights = NULL;
    num_children = NULL;
}
|
|
457
|
+
|
|
458
|
+
// Releases the device buffers referenced by `bvh` (node bounds, parents, counters and root)
// and clears the corresponding pointers in the descriptor.
// item_lowers / item_uppers are not freed here.
void bvh_destroy_device(wp::BVH& bvh)
{
    // all frees below are issued while the BVH's own context is current
    ContextGuard guard(bvh.context);

    // release the buffers first ...
    free_device(WP_CURRENT_CONTEXT, bvh.node_lowers);
    free_device(WP_CURRENT_CONTEXT, bvh.node_uppers);
    free_device(WP_CURRENT_CONTEXT, bvh.node_parents);
    free_device(WP_CURRENT_CONTEXT, bvh.node_counts);
    free_device(WP_CURRENT_CONTEXT, bvh.root);

    // ... then clear the now stale pointers
    bvh.node_lowers = NULL;
    bvh.node_uppers = NULL;
    bvh.node_parents = NULL;
    bvh.node_counts = NULL;
    bvh.root = NULL;
}
|
|
468
|
+
|
|
469
|
+
} // namespace wp
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
void bvh_refit_device(uint64_t id)
|
|
473
|
+
{
|
|
474
|
+
wp::BVH bvh;
|
|
475
|
+
if (bvh_get_descriptor(id, bvh))
|
|
476
|
+
{
|
|
477
|
+
ContextGuard guard(bvh.context);
|
|
478
|
+
|
|
479
|
+
bvh_refit_device(bvh);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items)
|
|
484
|
+
{
|
|
485
|
+
ContextGuard guard(context);
|
|
486
|
+
|
|
487
|
+
wp::BVH bvh_host;
|
|
488
|
+
bvh_host.num_items = num_items;
|
|
489
|
+
bvh_host.max_nodes = 2*num_items;
|
|
490
|
+
bvh_host.node_lowers = (wp::BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVHPackedNodeHalf)*bvh_host.max_nodes);
|
|
491
|
+
bvh_host.node_uppers = (wp::BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVHPackedNodeHalf)*bvh_host.max_nodes);
|
|
492
|
+
bvh_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
|
|
493
|
+
bvh_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
|
|
494
|
+
bvh_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
|
|
495
|
+
bvh_host.item_lowers = lowers;
|
|
496
|
+
bvh_host.item_uppers = uppers;
|
|
497
|
+
|
|
498
|
+
bvh_host.context = context ? context : cuda_context_get_current();
|
|
499
|
+
|
|
500
|
+
wp::LinearBVHBuilderGPU builder;
|
|
501
|
+
builder.build(bvh_host, lowers, uppers, num_items, NULL);
|
|
502
|
+
|
|
503
|
+
// create device-side BVH descriptor
|
|
504
|
+
wp::BVH* bvh_device = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
|
|
505
|
+
memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device, &bvh_host, sizeof(wp::BVH));
|
|
506
|
+
|
|
507
|
+
uint64_t bvh_id = (uint64_t)bvh_device;
|
|
508
|
+
wp::bvh_add_descriptor(bvh_id, bvh_host);
|
|
509
|
+
|
|
510
|
+
return bvh_id;
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
void bvh_destroy_device(uint64_t id)
|
|
515
|
+
{
|
|
516
|
+
wp::BVH bvh;
|
|
517
|
+
if (wp::bvh_get_descriptor(id, bvh))
|
|
518
|
+
{
|
|
519
|
+
wp::bvh_destroy_device(bvh);
|
|
520
|
+
wp::bvh_rem_descriptor(id);
|
|
521
|
+
|
|
522
|
+
// free descriptor
|
|
523
|
+
free_device(WP_CURRENT_CONTEXT, (void*)id);
|
|
524
|
+
}
|
|
525
|
+
}
|