warp-lang 1.5.0__py3-none-manylinux2014_aarch64.whl → 1.6.0__py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang might be problematic.
- warp/__init__.py +5 -0
- warp/autograd.py +414 -191
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +40 -12
- warp/build_dll.py +13 -6
- warp/builtins.py +1124 -497
- warp/codegen.py +261 -136
- warp/config.py +1 -1
- warp/context.py +357 -119
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_gemm.py +27 -18
- warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
- warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
- warp/examples/core/example_torch.py +18 -34
- warp/examples/fem/example_apic_fluid.py +1 -0
- warp/examples/fem/example_mixed_elasticity.py +1 -1
- warp/examples/optim/example_bounce.py +1 -1
- warp/examples/optim/example_cloth_throw.py +1 -1
- warp/examples/optim/example_diffray.py +4 -15
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/optim/example_softbody_properties.py +392 -0
- warp/examples/optim/example_trajectory.py +1 -3
- warp/examples/optim/example_walker.py +5 -0
- warp/examples/sim/example_cartpole.py +0 -2
- warp/examples/sim/example_cloth.py +3 -1
- warp/examples/sim/example_cloth_self_contact.py +260 -0
- warp/examples/sim/example_granular_collision_sdf.py +4 -5
- warp/examples/sim/example_jacobian_ik.py +0 -2
- warp/examples/sim/example_quadruped.py +5 -2
- warp/examples/tile/example_tile_cholesky.py +79 -0
- warp/examples/tile/example_tile_convolution.py +2 -2
- warp/examples/tile/example_tile_fft.py +2 -2
- warp/examples/tile/example_tile_filtering.py +3 -3
- warp/examples/tile/example_tile_matmul.py +4 -4
- warp/examples/tile/example_tile_mlp.py +12 -12
- warp/examples/tile/example_tile_nbody.py +180 -0
- warp/examples/tile/example_tile_walker.py +319 -0
- warp/fem/geometry/geometry.py +0 -2
- warp/math.py +147 -0
- warp/native/array.h +12 -0
- warp/native/builtin.h +0 -1
- warp/native/bvh.cpp +149 -70
- warp/native/bvh.cu +287 -68
- warp/native/bvh.h +195 -85
- warp/native/clang/clang.cpp +5 -1
- warp/native/coloring.cpp +5 -1
- warp/native/cuda_util.cpp +91 -53
- warp/native/cuda_util.h +5 -0
- warp/native/exports.h +40 -40
- warp/native/intersect.h +17 -0
- warp/native/mat.h +41 -0
- warp/native/mathdx.cpp +19 -0
- warp/native/mesh.cpp +25 -8
- warp/native/mesh.cu +153 -101
- warp/native/mesh.h +482 -403
- warp/native/quat.h +40 -0
- warp/native/solid_angle.h +7 -0
- warp/native/sort.cpp +85 -0
- warp/native/sort.cu +34 -0
- warp/native/sort.h +3 -1
- warp/native/spatial.h +11 -0
- warp/native/tile.h +1187 -669
- warp/native/tile_reduce.h +8 -6
- warp/native/vec.h +41 -0
- warp/native/warp.cpp +8 -1
- warp/native/warp.cu +263 -40
- warp/native/warp.h +19 -5
- warp/optim/linear.py +22 -4
- warp/render/render_opengl.py +130 -64
- warp/sim/__init__.py +6 -1
- warp/sim/collide.py +270 -26
- warp/sim/import_urdf.py +8 -8
- warp/sim/integrator_euler.py +25 -7
- warp/sim/integrator_featherstone.py +154 -35
- warp/sim/integrator_vbd.py +842 -40
- warp/sim/model.py +134 -72
- warp/sparse.py +1 -1
- warp/stubs.py +265 -132
- warp/tape.py +28 -30
- warp/tests/aux_test_module_unload.py +15 -0
- warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
- warp/tests/test_array.py +74 -0
- warp/tests/test_assert.py +242 -0
- warp/tests/test_codegen.py +14 -61
- warp/tests/test_collision.py +2 -2
- warp/tests/test_coloring.py +12 -2
- warp/tests/test_examples.py +12 -1
- warp/tests/test_func.py +21 -4
- warp/tests/test_grad_debug.py +87 -2
- warp/tests/test_hash_grid.py +1 -1
- warp/tests/test_ipc.py +116 -0
- warp/tests/test_lerp.py +13 -87
- warp/tests/test_mat.py +138 -167
- warp/tests/test_math.py +47 -1
- warp/tests/test_matmul.py +17 -16
- warp/tests/test_matmul_lite.py +10 -15
- warp/tests/test_mesh.py +84 -60
- warp/tests/test_mesh_query_aabb.py +165 -0
- warp/tests/test_mesh_query_point.py +328 -286
- warp/tests/test_mesh_query_ray.py +134 -121
- warp/tests/test_mlp.py +2 -2
- warp/tests/test_operators.py +43 -0
- warp/tests/test_overwrite.py +47 -2
- warp/tests/test_quat.py +77 -0
- warp/tests/test_reload.py +29 -0
- warp/tests/test_sim_grad_bounce_linear.py +204 -0
- warp/tests/test_smoothstep.py +17 -83
- warp/tests/test_static.py +19 -3
- warp/tests/test_tape.py +25 -0
- warp/tests/test_tile.py +178 -191
- warp/tests/test_tile_load.py +356 -0
- warp/tests/test_tile_mathdx.py +61 -8
- warp/tests/test_tile_mlp.py +17 -17
- warp/tests/test_tile_reduce.py +24 -18
- warp/tests/test_tile_shared_memory.py +66 -17
- warp/tests/test_tile_view.py +165 -0
- warp/tests/test_torch.py +35 -0
- warp/tests/test_utils.py +36 -24
- warp/tests/test_vec.py +110 -0
- warp/tests/unittest_suites.py +29 -4
- warp/tests/unittest_utils.py +30 -13
- warp/thirdparty/unittest_parallel.py +2 -2
- warp/types.py +411 -101
- warp/utils.py +10 -7
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/METADATA +92 -69
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/RECORD +130 -119
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
- warp/examples/benchmarks/benchmark_tile.py +0 -179
- warp/native/tile_gemm.h +0 -341
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu
CHANGED
@@ -18,31 +18,49 @@
 #include <cuda_runtime_api.h>
 
 #define THRUST_IGNORE_CUB_VERSION_CHECK
+#define REORDER_HOST_TREE
 
 #include <cub/cub.cuh>
 
 
 namespace wp
 {
+void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
+void bvh_destroy_host(BVH& bvh);
 
-
+// for LBVH: this will start with some muted leaf nodes, but that is okay, we can still trace up because their parents' information is still valid
+// the only thing worth mentioning is that when the parent of a leaf node is itself a leaf node, we need to recompute its bounds, since its child information is lost
+// for a compact tree such as those from the SAH or Median constructor, there are no muted leaf nodes
+__global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, int* __restrict__ primitive_indices, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* item_lowers, const vec3* item_uppers)
 {
     int index = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (index < n)
     {
         bool leaf = node_lowers[index].b;
+        int parent = parents[index];
 
         if (leaf)
         {
+            BVHPackedNodeHalf& lower = node_lowers[index];
+            BVHPackedNodeHalf& upper = node_uppers[index];
             // update the leaf node
-            const int leaf_index = node_lowers[index].i;
 
-            [5 removed lines not shown in this diff view]
+            // only need to compute bound when this is a valid leaf node
+            if (!node_lowers[parent].b)
+            {
+                const int start = lower.i;
+                const int end = upper.i;
+
+                bounds3 bound;
+                for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                {
+                    const int primitive = primitive_indices[primitive_counter];
+                    bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
+                }
+                (vec3&)lower = bound.lower;
+                (vec3&)upper = bound.upper;
+            }
         }
         else
        {
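For context on what the rewritten refit kernel computes per packed leaf, the following standalone C++ sketch performs the same bound accumulation over a contiguous range of reordered primitives. The V3 and Bounds types and the sample data are illustrative stand-ins, not warp's vec3/bounds3 or its actual memory layout.

// Standalone illustration of the per-leaf bound accumulation done by the new
// bvh_refit_kernel. Types are simplified stand-ins, not warp's vec3/bounds3.
#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <vector>

struct V3 { float x, y, z; };

struct Bounds
{
    V3 lower{ FLT_MAX,  FLT_MAX,  FLT_MAX};
    V3 upper{-FLT_MAX, -FLT_MAX, -FLT_MAX};

    // grow the bound to enclose one primitive's AABB
    void add_bounds(const V3& lo, const V3& hi)
    {
        lower = {std::min(lower.x, lo.x), std::min(lower.y, lo.y), std::min(lower.z, lo.z)};
        upper = {std::max(upper.x, hi.x), std::max(upper.y, hi.y), std::max(upper.z, hi.z)};
    }
};

int main()
{
    // two primitives referenced by a packed leaf covering indices [start, end)
    std::vector<int> primitive_indices = {1, 0};
    std::vector<V3> item_lowers = {{0, 0, 0}, {2, -1, 0}};
    std::vector<V3> item_uppers = {{1, 1, 1}, {3,  0, 2}};

    const int start = 0, end = 2;   // exclusive upper end, as in the new packed-leaf convention
    Bounds bound;
    for (int i = start; i < end; ++i)
    {
        const int p = primitive_indices[i];
        bound.add_bounds(item_lowers[p], item_uppers[p]);
    }
    std::printf("leaf bound: (%g %g %g) - (%g %g %g)\n",
                bound.lower.x, bound.lower.y, bound.lower.z,
                bound.upper.x, bound.upper.y, bound.upper.z);
    return 0;
}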
@@ -53,8 +71,7 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __
         // update hierarchy
         for (;;)
         {
-
-
+            parent = parents[index];
             // reached root
             if (parent == -1)
                 return;
@@ -68,34 +85,49 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __
             // then update its bounds and move onto the next parent in the hierarchy
             if (finished == 1)
             {
-                [28 removed lines not shown in this diff view]
+                BVHPackedNodeHalf& parent_lower = node_lowers[parent];
+                BVHPackedNodeHalf& parent_upper = node_uppers[parent];
+                if (parent_lower.b)
+                // a packed leaf node can still be a parent in LBVH, we need to recompute its bounds
+                // since we've lost its left and right child node indices in the muting process
+                {
+                    // update the leaf node
+                    int parent_parent = parents[parent];
+
+                    // only need to compute bound when this is a valid leaf node
+                    if (!node_lowers[parent_parent].b)
+                    {
+                        const int start = parent_lower.i;
+                        const int end = parent_upper.i;
+                        bounds3 bound;
+                        for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                        {
+                            const int primitive = primitive_indices[primitive_counter];
+                            bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
+                        }
+
+                        (vec3&)parent_lower = bound.lower;
+                        (vec3&)parent_upper = bound.upper;
+                    }
+                }
+                else
+                {
+                    const int left_child = parent_lower.i;
+                    const int right_child = parent_upper.i;
+
+                    vec3 left_lower = (vec3&)(node_lowers[left_child]);
+                    vec3 left_upper = (vec3&)(node_uppers[left_child]);
+                    vec3 right_lower = (vec3&)(node_lowers[right_child]);
+                    vec3 right_upper = (vec3&)(node_uppers[right_child]);
+
+                    // union of child bounds
+                    vec3 lower = min(left_lower, right_lower);
+                    vec3 upper = max(left_upper, right_upper);
+
+                    // write new BVH nodes
+                    (vec3&)parent_lower = lower;
+                    (vec3&)parent_upper = upper;
+                }
                 // move onto processing the parent
                 index = parent;
            }
@@ -114,9 +146,8 @@ void bvh_refit_device(BVH& bvh)
     ContextGuard guard(bvh.context);
 
     // clear child counters
-    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int)*bvh.max_nodes);
-
-    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_items, (bvh.num_items, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
+    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
+    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
 }
 
 
@@ -316,6 +347,39 @@ __global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas
     }
 }
 
+/*
+ * LBVH uses a bottom-up constructor which makes variable-sized leaf nodes more challenging to achieve.
+ * Simply splitting the ordered primitives into uniform groups of size BVH_LEAF_SIZE will result in poor
+ * quality. Instead, after the hierarchy is built, we convert any intermediate node whose size is
+ * <= BVH_LEAF_SIZE into a new leaf node. This process is done using the new kernel function called
+ * mark_packed_leaf_nodes.
+ */
+__global__ void mark_packed_leaf_nodes(int n, volatile int* __restrict__ range_lefts, volatile int* __restrict__ range_rights,
+    volatile BVHPackedNodeHalf* __restrict__ lowers, volatile BVHPackedNodeHalf* __restrict__ uppers)
+{
+    int node_index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (node_index < n)
+    {
+        // mark the node as a leaf if its range is no larger than BVH_LEAF_SIZE
+        // this will forever mute its child nodes so that they will never be accessed
+
+        int left = range_lefts[node_index];
+        // the LBVH constructor's range is defined as left <= i <= right
+        // we need to convert it to our convention: left <= i < right
+        int right = range_rights[node_index] + 1;
+        // printf("node %d (left %d right %d)", node_index, left, right);
+        if (right - left <= BVH_LEAF_SIZE)
+        {
+            lowers[node_index].b = 1;
+            lowers[node_index].i = left;
+            uppers[node_index].i = right;
+
+            // printf("node %d (left %d right %d) is set to child\n", node_index, left, right);
+        }
+    }
+}
+
+
 CUDA_CALLABLE inline vec3 Vec3Max(const vec3& a, const vec3& b) { return wp::max(a, b); }
 CUDA_CALLABLE inline vec3 Vec3Min(const vec3& a, const vec3& b) { return wp::min(a, b); }
 
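The leaf-packing rule above can be illustrated in isolation: a node whose inclusive primitive range [left, right] spans at most the leaf size becomes a packed leaf that stores the exclusive range [left, right + 1). The standalone C++ sketch below uses a toy Node struct and a made-up LEAF_SIZE value; it is not warp's BVHPackedNodeHalf layout or its actual constant.

// Host-side sketch of the leaf-packing rule used by mark_packed_leaf_nodes.
#include <cstdio>
#include <vector>

struct Node { bool is_leaf = false; int begin = 0, end = 0; };

int main()
{
    const int LEAF_SIZE = 4;                           // illustrative; stands in for BVH_LEAF_SIZE
    std::vector<int> range_lefts  = {0, 0, 4, 4, 6};
    std::vector<int> range_rights = {9, 3, 9, 5, 9};   // inclusive, as produced by the LBVH builder
    std::vector<Node> nodes(range_lefts.size());

    for (size_t i = 0; i < nodes.size(); ++i)
    {
        const int left = range_lefts[i];
        const int right = range_rights[i] + 1;         // convert to the exclusive convention
        if (right - left <= LEAF_SIZE)
        {
            nodes[i].is_leaf = true;                   // children below this node are now "muted"
            nodes[i].begin = left;
            nodes[i].end = right;
        }
    }

    for (size_t i = 0; i < nodes.size(); ++i)
        std::printf("node %zu: %s\n", i, nodes[i].is_leaf ? "packed leaf" : "internal");
    return 0;
}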
@@ -392,7 +456,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
     // allocate temporary memory used during building
     indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);   // *2 for radix sort
     keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2);      // *2 for radix sort
-    deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items);      // highest
+    deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items);      // highest differentiating bit between keys for item i and i+1
     range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
     range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
     num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
@@ -431,6 +495,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
 
     // sort items based on Morton key (note the 32-bit sort key corresponds to the template parameter to morton3, i.e. 3x9 bit keys combined)
     radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
+    memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
 
     // calculate deltas between adjacent keys
     wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
@@ -443,6 +508,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
 
     // build the tree and internal node bounds
     wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
+    wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));
 
     // free temporary memory
     free_device(WP_CURRENT_CONTEXT, indices);
@@ -455,27 +521,171 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
 
 }
 
+// buffer_size is the number of T, not the number of bytes
+template<typename T>
+T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
+{
+    T* device_buffer = (T*)alloc_device(context, sizeof(T) * buffer_size);
+    memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
+
+    return device_buffer;
+}
+
+void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_host)
+{
+#ifdef REORDER_HOST_TREE
+
+
+    // reorder bvh_host such that its leaf nodes are at the front
+    // this is essential for the device refit
+    BVHPackedNodeHalf* node_lowers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
+    BVHPackedNodeHalf* node_uppers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
+
+    int* node_parents_reordered = new int[bvh_host.max_nodes];
+
+    std::vector<int> old_to_new(bvh_host.max_nodes, -1);
+
+    // We will place nodes in this order:
+    // Pass 1: leaf nodes (except if it's the root index)
+    // Pass 2: non-leaf, non-root
+    // Pass 3: root node
+    int next_pos = 0;
+
+    const int root_index = *bvh_host.root;
+    // Pass 1: place leaf nodes at the front
+    for (int i = 0; i < bvh_host.num_nodes; ++i)
+    {
+        if (bvh_host.node_lowers[i].b)
+        {
+            node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
+            node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
+            old_to_new[i] = next_pos;
+            next_pos++;
+        }
+    }
+
+    // Pass 2: place non-leaf, non-root nodes
+    for (int i = 0; i < bvh_host.num_nodes; ++i)
+    {
+        if (i == root_index)
+        {
+            if (bvh_host.node_lowers[i].b)
+            // if the root node is a leaf node, there must be only one node
+            {
+                *bvh_host.root = 0;
+            }
+            else
+            {
+                *bvh_host.root = next_pos;
+            }
+        }
+        if (!bvh_host.node_lowers[i].b)
+        {
+            node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
+            node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
+            old_to_new[i] = next_pos;
+            next_pos++;
+        }
+    }
+
+    // remap parent and child indices by enumerating all old->new pairs:
+    for (int old_index = 0; old_index < bvh_host.num_nodes; ++old_index) {
+        int new_index = old_to_new[old_index]; // new index
+
+        int old_parent = bvh_host.node_parents[old_index];
+        if (old_parent != -1)
+        {
+            node_parents_reordered[new_index] = old_to_new[old_parent];
+        }
+        else
+        {
+            node_parents_reordered[new_index] = -1;
+        }
+
+        // only need to modify the child index of non-leaf nodes
+        if (!bvh_host.node_lowers[old_index].b)
+        {
+            node_lowers_reordered[new_index].i = old_to_new[bvh_host.node_lowers[old_index].i];
+            node_uppers_reordered[new_index].i = old_to_new[bvh_host.node_uppers[old_index].i];
+        }
+    }
+
+    delete[] bvh_host.node_lowers;
+    delete[] bvh_host.node_uppers;
+    delete[] bvh_host.node_parents;
+
+    bvh_host.node_lowers = node_lowers_reordered;
+    bvh_host.node_uppers = node_uppers_reordered;
+    bvh_host.node_parents = node_parents_reordered;
+#endif // REORDER_HOST_TREE
+
+    bvh_device_on_host.num_nodes = bvh_host.num_nodes;
+    bvh_device_on_host.num_leaf_nodes = bvh_host.num_leaf_nodes;
+    bvh_device_on_host.max_nodes = bvh_host.max_nodes;
+    bvh_device_on_host.num_items = bvh_host.num_items;
+    bvh_device_on_host.max_depth = bvh_host.max_depth;
+
+    bvh_device_on_host.root = (int*)alloc_device(context, sizeof(int));
+    memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
+    bvh_device_on_host.context = context;
+
+    bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
+    bvh_device_on_host.node_uppers = make_device_buffer_of(context, bvh_host.node_uppers, bvh_host.max_nodes);
+    bvh_device_on_host.node_parents = make_device_buffer_of(context, bvh_host.node_parents, bvh_host.max_nodes);
+    bvh_device_on_host.primitive_indices = make_device_buffer_of(context, bvh_host.primitive_indices, bvh_host.num_items);
+}
+
 // create in-place given existing descriptor
-void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, BVH&
+void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh_device_on_host)
 {
     ContextGuard guard(context);
-    [17 removed lines not shown in this diff view]
+    if (constructor_type == BVH_CONSTRUCTOR_SAH || constructor_type == BVH_CONSTRUCTOR_MEDIAN)
+    // CPU based constructors
+    {
+        // copy bounds back to CPU
+        std::vector<vec3> lowers_host(num_items);
+        std::vector<vec3> uppers_host(num_items);
+        memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
+        memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
+
+        // run CPU based constructor
+        wp::BVH bvh_host;
+        bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
+
+        // copy host tree to device
+        wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
+        // replace host bounds with device bounds
+        bvh_device_on_host.item_lowers = lowers;
+        bvh_device_on_host.item_uppers = uppers;
+        // node_counts is not allocated for the host tree
+        bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        bvh_destroy_host(bvh_host);
+    }
+    else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
+    {
+        bvh_device_on_host.num_items = num_items;
+        bvh_device_on_host.max_nodes = 2 * num_items;
+        bvh_device_on_host.num_leaf_nodes = num_items;
+        bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
+        bvh_device_on_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
+        bvh_device_on_host.primitive_indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
+        bvh_device_on_host.item_lowers = lowers;
+        bvh_device_on_host.item_uppers = uppers;
+
+        bvh_device_on_host.context = context ? context : cuda_context_get_current();
+
+        LinearBVHBuilderGPU builder;
+        builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
+    }
+    else
+    {
+        printf("Unrecognized constructor type: %d! For the GPU constructor it should be SAH (0), Median (1), or LBVH (2)!\n", constructor_type);
+    }
 }
 
 void bvh_destroy_device(BVH& bvh)
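The reordering performed by copy_host_tree_to_device (leaves first, then internal nodes, with every parent and child index remapped through an old-to-new table) can be shown on a minimal example. The Node struct below is a simplified stand-in for warp's packed node pair, and the three-node tree is made up purely for illustration; the real code also handles the single-node case where the root is itself a leaf.

// Minimal sketch of the node reordering done before uploading a host-built tree:
// pack leaf nodes at the front and remap indices through an old->new table.
#include <cstdio>
#include <vector>

struct Node { bool is_leaf; int left, right; };  // children only meaningful for internal nodes

int main()
{
    // a 3-node tree: node 0 is the root with children 1 and 2 (both leaves)
    std::vector<Node> nodes  = {{false, 1, 2}, {true, -1, -1}, {true, -1, -1}};
    std::vector<int> parents = {-1, 0, 0};
    int root = 0;

    const int n = (int)nodes.size();
    std::vector<int> old_to_new(n, -1);
    int next_pos = 0;

    // pass 1: leaves first; pass 2: internal nodes (the root lands last in this example)
    const int root_index = root;
    for (int i = 0; i < n; ++i) if (nodes[i].is_leaf)  old_to_new[i] = next_pos++;
    for (int i = 0; i < n; ++i) if (!nodes[i].is_leaf) { if (i == root_index) root = next_pos; old_to_new[i] = next_pos++; }

    // build the reordered arrays and remap parent/child indices
    std::vector<Node> nodes_reordered(n);
    std::vector<int> parents_reordered(n);
    for (int i = 0; i < n; ++i)
    {
        Node node = nodes[i];
        if (!node.is_leaf) { node.left = old_to_new[node.left]; node.right = old_to_new[node.right]; }
        nodes_reordered[old_to_new[i]] = node;
        parents_reordered[old_to_new[i]] = parents[i] == -1 ? -1 : old_to_new[parents[i]];
    }

    std::printf("new root index: %d\n", root);  // leaves now occupy slots 0..num_leaves-1
    return 0;
}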
@@ -486,9 +696,11 @@ void bvh_destroy_device(BVH& bvh)
     free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
     free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
     free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
+    free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
     free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
 }
 
+
 } // namespace wp
 
 
@@ -503,20 +715,27 @@ void bvh_refit_device(uint64_t id)
     }
 }
 
-
+/*
+ * Since we don't even know the number of true leaf nodes, never mind where they are, we launch
+ * num_items threads, which is identical to the number of leaf nodes in the original tree. The
+ * refitting threads will start from the nodes corresponding to the original leaf nodes, which might be
+ * muted. However, the muted leaf nodes will still have the pointer to their parents, thus the up-tracing
+ * can still work. We will only compute the bounding box of a leaf node if its parent is not a leaf node.
+ */
+uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
 {
     ContextGuard guard(context);
-
-    wp::BVH
-
+    wp::BVH bvh_device_on_host;
+    wp::BVH* bvh_device_ptr = nullptr;
+
+    bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
 
     // create device-side BVH descriptor
-
-    memcpy_h2d(WP_CURRENT_CONTEXT,
-
-    uint64_t bvh_id = (uint64_t)bvh_device;
-    wp::bvh_add_descriptor(bvh_id, bvh_host);
+    bvh_device_ptr = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
+    memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
 
+    uint64_t bvh_id = (uint64_t)bvh_device_ptr;
+    wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
     return bvh_id;
 }
 