warp-lang 1.5.1__py3-none-manylinux2014_aarch64.whl → 1.6.0__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (123) hide show
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1076 -480
  8. warp/codegen.py +240 -119
  9. warp/config.py +1 -1
  10. warp/context.py +298 -84
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth_self_contact.py +260 -0
  27. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  28. warp/examples/sim/example_jacobian_ik.py +0 -2
  29. warp/examples/sim/example_quadruped.py +5 -2
  30. warp/examples/tile/example_tile_cholesky.py +79 -0
  31. warp/examples/tile/example_tile_convolution.py +2 -2
  32. warp/examples/tile/example_tile_fft.py +2 -2
  33. warp/examples/tile/example_tile_filtering.py +3 -3
  34. warp/examples/tile/example_tile_matmul.py +4 -4
  35. warp/examples/tile/example_tile_mlp.py +12 -12
  36. warp/examples/tile/example_tile_nbody.py +180 -0
  37. warp/examples/tile/example_tile_walker.py +319 -0
  38. warp/math.py +147 -0
  39. warp/native/array.h +12 -0
  40. warp/native/builtin.h +0 -1
  41. warp/native/bvh.cpp +149 -70
  42. warp/native/bvh.cu +287 -68
  43. warp/native/bvh.h +195 -85
  44. warp/native/clang/clang.cpp +5 -1
  45. warp/native/cuda_util.cpp +35 -0
  46. warp/native/cuda_util.h +5 -0
  47. warp/native/exports.h +40 -40
  48. warp/native/intersect.h +17 -0
  49. warp/native/mat.h +41 -0
  50. warp/native/mathdx.cpp +19 -0
  51. warp/native/mesh.cpp +25 -8
  52. warp/native/mesh.cu +153 -101
  53. warp/native/mesh.h +482 -403
  54. warp/native/quat.h +40 -0
  55. warp/native/solid_angle.h +7 -0
  56. warp/native/sort.cpp +85 -0
  57. warp/native/sort.cu +34 -0
  58. warp/native/sort.h +3 -1
  59. warp/native/spatial.h +11 -0
  60. warp/native/tile.h +1185 -664
  61. warp/native/tile_reduce.h +8 -6
  62. warp/native/vec.h +41 -0
  63. warp/native/warp.cpp +8 -1
  64. warp/native/warp.cu +263 -40
  65. warp/native/warp.h +19 -5
  66. warp/optim/linear.py +22 -4
  67. warp/render/render_opengl.py +124 -59
  68. warp/sim/__init__.py +6 -1
  69. warp/sim/collide.py +270 -26
  70. warp/sim/integrator_euler.py +25 -7
  71. warp/sim/integrator_featherstone.py +154 -35
  72. warp/sim/integrator_vbd.py +842 -40
  73. warp/sim/model.py +111 -53
  74. warp/stubs.py +248 -115
  75. warp/tape.py +28 -30
  76. warp/tests/aux_test_module_unload.py +15 -0
  77. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  78. warp/tests/test_array.py +74 -0
  79. warp/tests/test_assert.py +242 -0
  80. warp/tests/test_codegen.py +14 -61
  81. warp/tests/test_collision.py +2 -2
  82. warp/tests/test_examples.py +9 -0
  83. warp/tests/test_grad_debug.py +87 -2
  84. warp/tests/test_hash_grid.py +1 -1
  85. warp/tests/test_ipc.py +116 -0
  86. warp/tests/test_mat.py +138 -167
  87. warp/tests/test_math.py +47 -1
  88. warp/tests/test_matmul.py +11 -7
  89. warp/tests/test_matmul_lite.py +4 -4
  90. warp/tests/test_mesh.py +84 -60
  91. warp/tests/test_mesh_query_aabb.py +165 -0
  92. warp/tests/test_mesh_query_point.py +328 -286
  93. warp/tests/test_mesh_query_ray.py +134 -121
  94. warp/tests/test_mlp.py +2 -2
  95. warp/tests/test_operators.py +43 -0
  96. warp/tests/test_overwrite.py +2 -2
  97. warp/tests/test_quat.py +77 -0
  98. warp/tests/test_reload.py +29 -0
  99. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  100. warp/tests/test_static.py +16 -0
  101. warp/tests/test_tape.py +25 -0
  102. warp/tests/test_tile.py +134 -191
  103. warp/tests/test_tile_load.py +356 -0
  104. warp/tests/test_tile_mathdx.py +61 -8
  105. warp/tests/test_tile_mlp.py +17 -17
  106. warp/tests/test_tile_reduce.py +24 -18
  107. warp/tests/test_tile_shared_memory.py +66 -17
  108. warp/tests/test_tile_view.py +165 -0
  109. warp/tests/test_torch.py +35 -0
  110. warp/tests/test_utils.py +36 -24
  111. warp/tests/test_vec.py +110 -0
  112. warp/tests/unittest_suites.py +29 -4
  113. warp/tests/unittest_utils.py +30 -11
  114. warp/thirdparty/unittest_parallel.py +2 -2
  115. warp/types.py +409 -99
  116. warp/utils.py +9 -5
  117. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/METADATA +68 -44
  118. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/RECORD +121 -110
  119. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  120. warp/examples/benchmarks/benchmark_tile.py +0 -179
  121. warp/native/tile_gemm.h +0 -341
  122. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  123. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/native/bvh.cu CHANGED
@@ -18,31 +18,49 @@
18
18
  #include <cuda_runtime_api.h>
19
19
 
20
20
  #define THRUST_IGNORE_CUB_VERSION_CHECK
21
+ #define REORDER_HOST_TREE
21
22
 
22
23
  #include <cub/cub.cuh>
23
24
 
24
25
 
25
26
  namespace wp
26
27
  {
28
+ void bvh_create_host(vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh);
29
+ void bvh_destroy_host(BVH& bvh);
27
30
 
28
- __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* item_lowers, const vec3* item_uppers)
31
 + // for LBVH: this will start with some muted leaf nodes, but that is okay, we can still trace up because their parents' information is still valid
32
 + // the only thing worth mentioning is that when the parent of a leaf node is also a leaf node, we need to recompute its bounds, since its child information is lost
33
+ // for a compact tree such as those from SAH or Median constructor, there is no muted leaf nodes
34
+ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, int* __restrict__ primitive_indices, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers, const vec3* item_lowers, const vec3* item_uppers)
29
35
  {
30
36
  int index = blockDim.x*blockIdx.x + threadIdx.x;
31
37
 
32
38
  if (index < n)
33
39
  {
34
40
  bool leaf = node_lowers[index].b;
41
+ int parent = parents[index];
35
42
 
36
43
  if (leaf)
37
44
  {
45
+ BVHPackedNodeHalf& lower = node_lowers[index];
46
+ BVHPackedNodeHalf& upper = node_uppers[index];
38
47
  // update the leaf node
39
- const int leaf_index = node_lowers[index].i;
40
48
 
41
- vec3 lower = item_lowers[leaf_index];
42
- vec3 upper = item_uppers[leaf_index];
43
-
44
- make_node(node_lowers+index, lower, leaf_index, true);
45
- make_node(node_uppers+index, upper, 0, false);
49
+ // only need to compute bound when this is a valid leaf node
50
+ if (!node_lowers[parent].b)
51
+ {
52
+ const int start = lower.i;
53
+ const int end = upper.i;
54
+
55
+ bounds3 bound;
56
+ for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
57
+ {
58
+ const int primitive = primitive_indices[primitive_counter];
59
+ bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
60
+ }
61
+ (vec3&)lower = bound.lower;
62
+ (vec3&)upper = bound.upper;
63
+ }
46
64
  }
47
65
  else
48
66
  {
@@ -53,8 +71,7 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __
53
71
  // update hierarchy
54
72
  for (;;)
55
73
  {
56
- int parent = parents[index];
57
-
74
+ parent = parents[index];
58
75
  // reached root
59
76
  if (parent == -1)
60
77
  return;
@@ -68,34 +85,49 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __
68
85
  // then update its bounds and move onto the next parent in the hierarchy
69
86
  if (finished == 1)
70
87
  {
71
- const int left_child = node_lowers[parent].i;
72
- const int right_child = node_uppers[parent].i;
73
-
74
- vec3 left_lower = vec3(node_lowers[left_child].x,
75
- node_lowers[left_child].y,
76
- node_lowers[left_child].z);
77
-
78
- vec3 left_upper = vec3(node_uppers[left_child].x,
79
- node_uppers[left_child].y,
80
- node_uppers[left_child].z);
81
-
82
- vec3 right_lower = vec3(node_lowers[right_child].x,
83
- node_lowers[right_child].y,
84
- node_lowers[right_child].z);
85
-
86
-
87
- vec3 right_upper = vec3(node_uppers[right_child].x,
88
- node_uppers[right_child].y,
89
- node_uppers[right_child].z);
90
-
91
- // union of child bounds
92
- vec3 lower = min(left_lower, right_lower);
93
- vec3 upper = max(left_upper, right_upper);
94
-
95
- // write new BVH nodes
96
- make_node(node_lowers+parent, lower, left_child, false);
97
- make_node(node_uppers+parent, upper, right_child, false);
98
-
88
+ BVHPackedNodeHalf& parent_lower = node_lowers[parent];
89
+ BVHPackedNodeHalf& parent_upper = node_uppers[parent];
90
+ if (parent_lower.b)
91
+ // a packed leaf node can still be a parent in LBVH, we need to recompute its bounds
92
+ // since we've lost its left and right child node index in the muting process
93
+ {
94
+ // update the leaf node
95
+ int parent_parent = parents[parent];;
96
+
97
+ // only need to compute bound when this is a valid leaf node
98
+ if (!node_lowers[parent_parent].b)
99
+ {
100
+ const int start = parent_lower.i;
101
+ const int end = parent_upper.i;
102
+ bounds3 bound;
103
+ for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
104
+ {
105
+ const int primitive = primitive_indices[primitive_counter];
106
+ bound.add_bounds(item_lowers[primitive], item_uppers[primitive]);
107
+ }
108
+
109
+ (vec3&)parent_lower = bound.lower;
110
+ (vec3&)parent_upper = bound.upper;
111
+ }
112
+ }
113
+ else
114
+ {
115
+ const int left_child = parent_lower.i;
116
+ const int right_child = parent_upper.i;
117
+
118
+ vec3 left_lower = (vec3&)(node_lowers[left_child]);
119
+ vec3 left_upper = (vec3&)(node_uppers[left_child]);
120
+ vec3 right_lower = (vec3&)(node_lowers[right_child]);
121
+ vec3 right_upper = (vec3&)(node_uppers[right_child]);
122
+
123
+ // union of child bounds
124
+ vec3 lower = min(left_lower, right_lower);
125
+ vec3 upper = max(left_upper, right_upper);
126
+
127
+ // write new BVH nodes
128
+ (vec3&)parent_lower = lower;
129
+ (vec3&)parent_upper = upper;
130
+ }
99
131
  // move onto processing the parent
100
132
  index = parent;
101
133
  }
@@ -114,9 +146,8 @@ void bvh_refit_device(BVH& bvh)
114
146
  ContextGuard guard(bvh.context);
115
147
 
116
148
  // clear child counters
117
- memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int)*bvh.max_nodes);
118
-
119
- wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_items, (bvh.num_items, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
149
+ memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
150
+ wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_kernel, bvh.num_leaf_nodes, (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.primitive_indices, bvh.node_lowers, bvh.node_uppers, bvh.item_lowers, bvh.item_uppers));
120
151
  }
121
152
 
122
153
 
@@ -316,6 +347,39 @@ __global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas
316
347
  }
317
348
  }
318
349
 
350
+ /*
351
+ * LBVH uses a bottom-up constructor which makes variable-sized leaf nodes more challenging to achieve.
352
+ * Simply splitting the ordered primitives into uniform groups of size BVH_LEAF_SIZE will result in poor
353
+ * quality. Instead, after the hierarchy is built, we convert any intermediate node whose size is
354
+ * <= BVH_LEAF_SIZE into a new leaf node. This process is done using the new kernel function called
355
 + * mark_packed_leaf_nodes.
356
+ */
357
+ __global__ void mark_packed_leaf_nodes(int n, volatile int* __restrict__ range_lefts, volatile int* __restrict__ range_rights,
358
+ volatile BVHPackedNodeHalf* __restrict__ lowers, volatile BVHPackedNodeHalf* __restrict__ uppers)
359
+ {
360
+ int node_index = blockDim.x * blockIdx.x + threadIdx.x;
361
+ if (node_index < n)
362
+ {
363
+ // mark the node as leaf if its range is less than LEAF_SIZE_LBVH
364
+ // this will forever mute its child nodes so that they will never be accessed
365
+
366
+ int left = range_lefts[node_index];
367
+ // the LBVH constructor's range is defined as left <= i <= right
368
+ // we need to convert it to our convention: left <= i < right
369
+ int right = range_rights[node_index] + 1;
370
+ // printf("node %d (left %d right %d)", node_index, left, right);
371
+ if (right - left <= BVH_LEAF_SIZE)
372
+ {
373
+ lowers[node_index].b = 1;
374
+ lowers[node_index].i = left;
375
+ uppers[node_index].i = right;
376
+
377
+ // printf("node %d (left %d right %d) is set to child\n", node_index, left, right);
378
+ }
379
+ }
380
+ }
381
+
382
+
319
383
  CUDA_CALLABLE inline vec3 Vec3Max(const vec3& a, const vec3& b) { return wp::max(a, b); }
320
384
  CUDA_CALLABLE inline vec3 Vec3Min(const vec3& a, const vec3& b) { return wp::min(a, b); }
321
385
 
@@ -392,7 +456,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
392
456
  // allocate temporary memory used during building
393
457
  indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
394
458
  keys = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items*2); // *2 for radix sort
395
- deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differenting bit between keys for item i and i+1
459
+ deltas = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*num_items); // highest differentiating bit between keys for item i and i+1
396
460
  range_lefts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
397
461
  range_rights = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
398
462
  num_children = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh.max_nodes);
@@ -431,6 +495,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
431
495
 
432
496
  // sort items based on Morton key (note the 32-bit sort key corresponds to the template parameter to morton3, i.e. 3x9 bit keys combined)
433
497
  radix_sort_pairs_device(WP_CURRENT_CONTEXT, keys, indices, num_items);
498
+ memcpy_d2d(WP_CURRENT_CONTEXT, bvh.primitive_indices, indices, sizeof(int) * num_items);
434
499
 
435
500
  // calculate deltas between adjacent keys
436
501
  wp_launch_device(WP_CURRENT_CONTEXT, compute_key_deltas, num_items, (keys, deltas, num_items-1));
@@ -443,6 +508,7 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
443
508
 
444
509
  // build the tree and internal node bounds
445
510
  wp_launch_device(WP_CURRENT_CONTEXT, build_hierarchy, num_items, (num_items, bvh.root, deltas, num_children, range_lefts, range_rights, bvh.node_parents, bvh.node_lowers, bvh.node_uppers));
511
+ wp_launch_device(WP_CURRENT_CONTEXT, mark_packed_leaf_nodes, bvh.max_nodes, (bvh.max_nodes, range_lefts, range_rights, bvh.node_lowers, bvh.node_uppers));
446
512
 
447
513
  // free temporary memory
448
514
  free_device(WP_CURRENT_CONTEXT, indices);
@@ -455,27 +521,171 @@ void LinearBVHBuilderGPU::build(BVH& bvh, const vec3* item_lowers, const vec3* i
455
521
 
456
522
  }
457
523
 
524
+ // buffer_size is the number of T, not the number of bytes
525
+ template<typename T>
526
+ T* make_device_buffer_of(void* context, T* host_buffer, size_t buffer_size)
527
+ {
528
+ T* device_buffer = (T*)alloc_device(context, sizeof(T) * buffer_size);;
529
+ memcpy_h2d(context, device_buffer, host_buffer, sizeof(T) * buffer_size);
530
+
531
+ return device_buffer;
532
+ }
533
+
534
+ void copy_host_tree_to_device(void* context, BVH& bvh_host, BVH& bvh_device_on_host)
535
+ {
536
+ #ifdef REORDER_HOST_TREE
537
+
538
+
539
+ // reorder bvh_host such that its nodes are in the front
540
+ // this is essential for the device refit
541
+ BVHPackedNodeHalf* node_lowers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
542
+ BVHPackedNodeHalf* node_uppers_reordered = new BVHPackedNodeHalf[bvh_host.max_nodes];
543
+
544
+ int* node_parents_reordered = new int[bvh_host.max_nodes];
545
+
546
+ std::vector<int> old_to_new(bvh_host.max_nodes, -1);
547
+
548
+ // We will place nodes in this order:
549
+ // Pass 1: leaf nodes (except if it's the root index)
550
+ // Pass 2: non-leaf, non-root
551
+ // Pass 3: root node
552
+ int next_pos = 0;
553
+
554
+ const int root_index = *bvh_host.root;
555
+ // Pass 1: place leaf nodes at the front
556
+ for (int i = 0; i < bvh_host.num_nodes; ++i)
557
+ {
558
+ if (bvh_host.node_lowers[i].b)
559
+ {
560
+ node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
561
+ node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
562
+ old_to_new[i] = next_pos;
563
+ next_pos++;
564
+ }
565
+ }
566
+
567
+ // Pass 2: place non-leaf, non-root nodes
568
+ for (int i = 0; i < bvh_host.num_nodes; ++i)
569
+ {
570
+ if (i == root_index)
571
+ {
572
+ if (bvh_host.node_lowers[i].b)
573
 + // if the root node is a leaf node, there must only be one node
574
+ {
575
+ *bvh_host.root = 0;
576
+ }
577
+ else
578
+ {
579
+ *bvh_host.root = next_pos;
580
+ }
581
+ }
582
+ if (!bvh_host.node_lowers[i].b)
583
+ {
584
+ node_lowers_reordered[next_pos] = bvh_host.node_lowers[i];
585
+ node_uppers_reordered[next_pos] = bvh_host.node_uppers[i];
586
+ old_to_new[i] = next_pos;
587
+ next_pos++;
588
+ }
589
+ }
590
+
591
+ // We can do that by enumerating all old->new pairs:
592
+ for (int old_index = 0; old_index < bvh_host.num_nodes; ++old_index) {
593
+ int new_index = old_to_new[old_index]; // new index
594
+
595
+ int old_parent = bvh_host.node_parents[old_index];
596
+ if (old_parent != -1)
597
+ {
598
+ node_parents_reordered[new_index] = old_to_new[old_parent];
599
+ }
600
+ else
601
+ {
602
+ node_parents_reordered[new_index] = -1;
603
+ }
604
+
605
+ // only need to modify the child index of non-leaf nodes
606
+ if (!bvh_host.node_lowers[old_index].b)
607
+ {
608
+ node_lowers_reordered[new_index].i = old_to_new[bvh_host.node_lowers[old_index].i];
609
+ node_uppers_reordered[new_index].i = old_to_new[bvh_host.node_uppers[old_index].i];
610
+ }
611
+ }
612
+
613
+ delete[] bvh_host.node_lowers;
614
+ delete[] bvh_host.node_uppers;
615
+ delete[] bvh_host.node_parents;
616
+
617
+ bvh_host.node_lowers = node_lowers_reordered;
618
+ bvh_host.node_uppers = node_uppers_reordered;
619
+ bvh_host.node_parents = node_parents_reordered;
620
+ #endif // REORDER_HOST_TREE
621
+
622
+ bvh_device_on_host.num_nodes = bvh_host.num_nodes;
623
+ bvh_device_on_host.num_leaf_nodes = bvh_host.num_leaf_nodes;
624
+ bvh_device_on_host.max_nodes = bvh_host.max_nodes;
625
+ bvh_device_on_host.num_items = bvh_host.num_items;
626
+ bvh_device_on_host.max_depth = bvh_host.max_depth;
627
+
628
+ bvh_device_on_host.root = (int*)alloc_device(context, sizeof(int));
629
+ memcpy_h2d(context, bvh_device_on_host.root, bvh_host.root, sizeof(int));
630
+ bvh_device_on_host.context = context;
631
+
632
+ bvh_device_on_host.node_lowers = make_device_buffer_of(context, bvh_host.node_lowers, bvh_host.max_nodes);
633
+ bvh_device_on_host.node_uppers = make_device_buffer_of(context, bvh_host.node_uppers, bvh_host.max_nodes);
634
+ bvh_device_on_host.node_parents = make_device_buffer_of(context, bvh_host.node_parents, bvh_host.max_nodes);
635
+ bvh_device_on_host.primitive_indices = make_device_buffer_of(context, bvh_host.primitive_indices, bvh_host.num_items);
636
+ }
637
+
458
638
  // create in-place given existing descriptor
459
- void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, BVH& bvh_host)
639
+ void bvh_create_device(void* context, vec3* lowers, vec3* uppers, int num_items, int constructor_type, BVH& bvh_device_on_host)
460
640
  {
461
641
  ContextGuard guard(context);
462
-
463
- bvh_host.num_items = num_items;
464
- bvh_host.max_nodes = 2*num_items;
465
- bvh_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf)*bvh_host.max_nodes);
466
- memset_device(WP_CURRENT_CONTEXT, bvh_host.node_lowers, 0, sizeof(BVHPackedNodeHalf)*bvh_host.max_nodes);
467
- bvh_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf)*bvh_host.max_nodes);
468
- memset_device(WP_CURRENT_CONTEXT, bvh_host.node_uppers, 0, sizeof(BVHPackedNodeHalf)*bvh_host.max_nodes);
469
- bvh_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
470
- bvh_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int)*bvh_host.max_nodes);
471
- bvh_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
472
- bvh_host.item_lowers = lowers;
473
- bvh_host.item_uppers = uppers;
474
-
475
- bvh_host.context = context ? context : cuda_context_get_current();
476
-
477
- LinearBVHBuilderGPU builder;
478
- builder.build(bvh_host, lowers, uppers, num_items, NULL);
642
+ if (constructor_type == BVH_CONSTRUCTOR_SAH || constructor_type == BVH_CONSTRUCTOR_MEDIAN)
643
+ // CPU based constructors
644
+ {
645
+ // copy bounds back to CPU
646
+ std::vector<vec3> lowers_host(num_items);
647
+ std::vector<vec3> uppers_host(num_items);
648
+ memcpy_d2h(WP_CURRENT_CONTEXT, lowers_host.data(), lowers, sizeof(vec3) * num_items);
649
+ memcpy_d2h(WP_CURRENT_CONTEXT, uppers_host.data(), uppers, sizeof(vec3) * num_items);
650
+
651
+ // run CPU based constructor
652
+ wp::BVH bvh_host;
653
+ bvh_create_host(lowers_host.data(), uppers_host.data(), num_items, constructor_type, bvh_host);
654
+
655
+ // copy host tree to device
656
+ wp::copy_host_tree_to_device(WP_CURRENT_CONTEXT, bvh_host, bvh_device_on_host);
657
+ // replace host bounds with device bounds
658
+ bvh_device_on_host.item_lowers = lowers;
659
+ bvh_device_on_host.item_uppers = uppers;
660
+ // node_counts is not allocated for host tree
661
+ bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
662
+ bvh_destroy_host(bvh_host);
663
+ }
664
+ else if (constructor_type == BVH_CONSTRUCTOR_LBVH)
665
+ {
666
+ bvh_device_on_host.num_items = num_items;
667
+ bvh_device_on_host.max_nodes = 2 * num_items;
668
+ bvh_device_on_host.num_leaf_nodes = num_items;
669
+ bvh_device_on_host.node_lowers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
670
+ memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_lowers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
671
+ bvh_device_on_host.node_uppers = (BVHPackedNodeHalf*)alloc_device(WP_CURRENT_CONTEXT, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
672
+ memset_device(WP_CURRENT_CONTEXT, bvh_device_on_host.node_uppers, 0, sizeof(BVHPackedNodeHalf) * bvh_device_on_host.max_nodes);
673
+ bvh_device_on_host.node_parents = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
674
+ bvh_device_on_host.node_counts = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * bvh_device_on_host.max_nodes);
675
+ bvh_device_on_host.root = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int));
676
+ bvh_device_on_host.primitive_indices = (int*)alloc_device(WP_CURRENT_CONTEXT, sizeof(int) * num_items);
677
+ bvh_device_on_host.item_lowers = lowers;
678
+ bvh_device_on_host.item_uppers = uppers;
679
+
680
+ bvh_device_on_host.context = context ? context : cuda_context_get_current();
681
+
682
+ LinearBVHBuilderGPU builder;
683
+ builder.build(bvh_device_on_host, lowers, uppers, num_items, NULL);
684
+ }
685
+ else
686
+ {
687
+ printf("Unrecognized Constructor type: %d! For GPU constructor it should be SAH (0), Median (1), or LBVH (2)!\n", constructor_type);
688
+ }
479
689
  }
480
690
 
481
691
  void bvh_destroy_device(BVH& bvh)
@@ -486,9 +696,11 @@ void bvh_destroy_device(BVH& bvh)
486
696
  free_device(WP_CURRENT_CONTEXT, bvh.node_uppers); bvh.node_uppers = NULL;
487
697
  free_device(WP_CURRENT_CONTEXT, bvh.node_parents); bvh.node_parents = NULL;
488
698
  free_device(WP_CURRENT_CONTEXT, bvh.node_counts); bvh.node_counts = NULL;
699
+ free_device(WP_CURRENT_CONTEXT, bvh.primitive_indices); bvh.primitive_indices = NULL;
489
700
  free_device(WP_CURRENT_CONTEXT, bvh.root); bvh.root = NULL;
490
701
  }
491
702
 
703
+
492
704
  } // namespace wp
493
705
 
494
706
 
@@ -503,20 +715,27 @@ void bvh_refit_device(uint64_t id)
503
715
  }
504
716
  }
505
717
 
506
- uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items)
718
+ /*
719
 + * Since we don't even know the number of true leaf nodes, let alone where they are, we will launch
720
 + * num_items threads, a count identical to the number of leaf nodes in the original tree. The
721
+ * refitting threads will start from the nodes corresponding to the original leaf nodes, which might be
722
+ * muted. However, the muted leaf nodes will still have the pointer to their parents, thus the up-tracing
723
+ * can still work. We will only compute the bounding box of a leaf node if its parent is not a leaf node.
724
+ */
725
+ uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items, int constructor_type)
507
726
  {
508
727
  ContextGuard guard(context);
509
-
510
- wp::BVH bvh_host;
511
- bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, bvh_host);
728
+ wp::BVH bvh_device_on_host;
729
+ wp::BVH* bvh_device_ptr = nullptr;
730
+
731
+ bvh_create_device(WP_CURRENT_CONTEXT, lowers, uppers, num_items, constructor_type, bvh_device_on_host);
512
732
 
513
733
  // create device-side BVH descriptor
514
- wp::BVH* bvh_device = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
515
- memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device, &bvh_host, sizeof(wp::BVH));
516
-
517
- uint64_t bvh_id = (uint64_t)bvh_device;
518
- wp::bvh_add_descriptor(bvh_id, bvh_host);
734
+ bvh_device_ptr = (wp::BVH*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::BVH));
735
+ memcpy_h2d(WP_CURRENT_CONTEXT, bvh_device_ptr, &bvh_device_on_host, sizeof(wp::BVH));
519
736
 
737
+ uint64_t bvh_id = (uint64_t)bvh_device_ptr;
738
+ wp::bvh_add_descriptor(bvh_id, bvh_device_on_host);
520
739
  return bvh_id;
521
740
  }
522
741