warp-lang 1.0.0b2__py3-none-win_amd64.whl → 1.0.0b6__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of warp-lang has been flagged as a potentially problematic release.
Files changed (271)
  1. docs/conf.py +17 -5
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/env/env_usd.py +4 -1
  6. examples/env/environment.py +8 -9
  7. examples/example_dem.py +34 -33
  8. examples/example_diffray.py +364 -337
  9. examples/example_fluid.py +32 -23
  10. examples/example_jacobian_ik.py +97 -93
  11. examples/example_marching_cubes.py +6 -16
  12. examples/example_mesh.py +6 -16
  13. examples/example_mesh_intersect.py +16 -14
  14. examples/example_nvdb.py +14 -16
  15. examples/example_raycast.py +14 -13
  16. examples/example_raymarch.py +16 -23
  17. examples/example_render_opengl.py +19 -10
  18. examples/example_sim_cartpole.py +82 -78
  19. examples/example_sim_cloth.py +45 -48
  20. examples/example_sim_fk_grad.py +51 -44
  21. examples/example_sim_fk_grad_torch.py +47 -40
  22. examples/example_sim_grad_bounce.py +108 -133
  23. examples/example_sim_grad_cloth.py +99 -113
  24. examples/example_sim_granular.py +5 -6
  25. examples/{example_sim_sdf_shape.py → example_sim_granular_collision_sdf.py} +37 -26
  26. examples/example_sim_neo_hookean.py +51 -55
  27. examples/example_sim_particle_chain.py +4 -4
  28. examples/example_sim_quadruped.py +126 -81
  29. examples/example_sim_rigid_chain.py +54 -61
  30. examples/example_sim_rigid_contact.py +66 -70
  31. examples/example_sim_rigid_fem.py +3 -3
  32. examples/example_sim_rigid_force.py +1 -1
  33. examples/example_sim_rigid_gyroscopic.py +3 -4
  34. examples/example_sim_rigid_kinematics.py +28 -39
  35. examples/example_sim_trajopt.py +112 -110
  36. examples/example_sph.py +9 -8
  37. examples/example_wave.py +7 -7
  38. examples/fem/bsr_utils.py +30 -17
  39. examples/fem/example_apic_fluid.py +85 -69
  40. examples/fem/example_convection_diffusion.py +97 -93
  41. examples/fem/example_convection_diffusion_dg.py +142 -149
  42. examples/fem/example_convection_diffusion_dg0.py +141 -136
  43. examples/fem/example_deformed_geometry.py +146 -0
  44. examples/fem/example_diffusion.py +115 -84
  45. examples/fem/example_diffusion_3d.py +116 -86
  46. examples/fem/example_diffusion_mgpu.py +102 -79
  47. examples/fem/example_mixed_elasticity.py +139 -100
  48. examples/fem/example_navier_stokes.py +175 -162
  49. examples/fem/example_stokes.py +143 -111
  50. examples/fem/example_stokes_transfer.py +186 -157
  51. examples/fem/mesh_utils.py +59 -97
  52. examples/fem/plot_utils.py +138 -17
  53. tools/ci/publishing/build_nodes_info.py +54 -0
  54. warp/__init__.py +4 -3
  55. warp/__init__.pyi +1 -0
  56. warp/bin/warp-clang.dll +0 -0
  57. warp/bin/warp.dll +0 -0
  58. warp/build.py +5 -3
  59. warp/build_dll.py +29 -9
  60. warp/builtins.py +836 -492
  61. warp/codegen.py +864 -553
  62. warp/config.py +3 -1
  63. warp/context.py +389 -172
  64. warp/fem/__init__.py +24 -6
  65. warp/fem/cache.py +318 -25
  66. warp/fem/dirichlet.py +7 -3
  67. warp/fem/domain.py +14 -0
  68. warp/fem/field/__init__.py +30 -38
  69. warp/fem/field/field.py +149 -0
  70. warp/fem/field/nodal_field.py +244 -138
  71. warp/fem/field/restriction.py +8 -6
  72. warp/fem/field/test.py +127 -59
  73. warp/fem/field/trial.py +117 -60
  74. warp/fem/geometry/__init__.py +5 -1
  75. warp/fem/geometry/deformed_geometry.py +271 -0
  76. warp/fem/geometry/element.py +24 -1
  77. warp/fem/geometry/geometry.py +86 -14
  78. warp/fem/geometry/grid_2d.py +112 -54
  79. warp/fem/geometry/grid_3d.py +134 -65
  80. warp/fem/geometry/hexmesh.py +953 -0
  81. warp/fem/geometry/partition.py +85 -33
  82. warp/fem/geometry/quadmesh_2d.py +532 -0
  83. warp/fem/geometry/tetmesh.py +451 -115
  84. warp/fem/geometry/trimesh_2d.py +197 -92
  85. warp/fem/integrate.py +534 -268
  86. warp/fem/operator.py +58 -31
  87. warp/fem/polynomial.py +11 -0
  88. warp/fem/quadrature/__init__.py +1 -1
  89. warp/fem/quadrature/pic_quadrature.py +150 -58
  90. warp/fem/quadrature/quadrature.py +209 -57
  91. warp/fem/space/__init__.py +230 -53
  92. warp/fem/space/basis_space.py +489 -0
  93. warp/fem/space/collocated_function_space.py +105 -0
  94. warp/fem/space/dof_mapper.py +49 -2
  95. warp/fem/space/function_space.py +90 -39
  96. warp/fem/space/grid_2d_function_space.py +149 -496
  97. warp/fem/space/grid_3d_function_space.py +173 -538
  98. warp/fem/space/hexmesh_function_space.py +352 -0
  99. warp/fem/space/partition.py +129 -76
  100. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  101. warp/fem/space/restriction.py +46 -34
  102. warp/fem/space/shape/__init__.py +15 -0
  103. warp/fem/space/shape/cube_shape_function.py +738 -0
  104. warp/fem/space/shape/shape_function.py +103 -0
  105. warp/fem/space/shape/square_shape_function.py +611 -0
  106. warp/fem/space/shape/tet_shape_function.py +567 -0
  107. warp/fem/space/shape/triangle_shape_function.py +429 -0
  108. warp/fem/space/tetmesh_function_space.py +132 -1039
  109. warp/fem/space/topology.py +295 -0
  110. warp/fem/space/trimesh_2d_function_space.py +104 -742
  111. warp/fem/types.py +13 -11
  112. warp/fem/utils.py +335 -60
  113. warp/native/array.h +120 -34
  114. warp/native/builtin.h +101 -72
  115. warp/native/bvh.cpp +73 -325
  116. warp/native/bvh.cu +406 -23
  117. warp/native/bvh.h +22 -40
  118. warp/native/clang/clang.cpp +1 -0
  119. warp/native/crt.h +2 -0
  120. warp/native/cuda_util.cpp +8 -3
  121. warp/native/cuda_util.h +1 -0
  122. warp/native/exports.h +1522 -1243
  123. warp/native/intersect.h +19 -4
  124. warp/native/intersect_adj.h +8 -8
  125. warp/native/mat.h +76 -17
  126. warp/native/mesh.cpp +33 -108
  127. warp/native/mesh.cu +114 -18
  128. warp/native/mesh.h +395 -40
  129. warp/native/noise.h +272 -329
  130. warp/native/quat.h +51 -8
  131. warp/native/rand.h +44 -34
  132. warp/native/reduce.cpp +1 -1
  133. warp/native/sparse.cpp +4 -4
  134. warp/native/sparse.cu +163 -155
  135. warp/native/spatial.h +2 -2
  136. warp/native/temp_buffer.h +18 -14
  137. warp/native/vec.h +103 -21
  138. warp/native/warp.cpp +2 -1
  139. warp/native/warp.cu +28 -3
  140. warp/native/warp.h +4 -3
  141. warp/render/render_opengl.py +261 -109
  142. warp/sim/__init__.py +1 -2
  143. warp/sim/articulation.py +385 -185
  144. warp/sim/import_mjcf.py +59 -48
  145. warp/sim/import_urdf.py +15 -15
  146. warp/sim/import_usd.py +174 -102
  147. warp/sim/inertia.py +17 -18
  148. warp/sim/integrator_xpbd.py +4 -3
  149. warp/sim/model.py +330 -250
  150. warp/sim/render.py +1 -1
  151. warp/sparse.py +625 -152
  152. warp/stubs.py +341 -309
  153. warp/tape.py +9 -6
  154. warp/tests/__main__.py +3 -6
  155. warp/tests/assets/curlnoise_golden.npy +0 -0
  156. warp/tests/assets/pnoise_golden.npy +0 -0
  157. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  158. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  159. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  160. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  161. warp/tests/aux_test_unresolved_func.py +14 -0
  162. warp/tests/aux_test_unresolved_symbol.py +14 -0
  163. warp/tests/disabled_kinematics.py +239 -0
  164. warp/tests/run_coverage_serial.py +31 -0
  165. warp/tests/test_adam.py +103 -106
  166. warp/tests/test_arithmetic.py +94 -74
  167. warp/tests/test_array.py +82 -101
  168. warp/tests/test_array_reduce.py +57 -23
  169. warp/tests/test_atomic.py +64 -28
  170. warp/tests/test_bool.py +22 -12
  171. warp/tests/test_builtins_resolution.py +1292 -0
  172. warp/tests/test_bvh.py +18 -18
  173. warp/tests/test_closest_point_edge_edge.py +54 -57
  174. warp/tests/test_codegen.py +165 -134
  175. warp/tests/test_compile_consts.py +28 -20
  176. warp/tests/test_conditional.py +108 -24
  177. warp/tests/test_copy.py +10 -12
  178. warp/tests/test_ctypes.py +112 -88
  179. warp/tests/test_dense.py +21 -14
  180. warp/tests/test_devices.py +98 -0
  181. warp/tests/test_dlpack.py +75 -75
  182. warp/tests/test_examples.py +237 -0
  183. warp/tests/test_fabricarray.py +22 -24
  184. warp/tests/test_fast_math.py +15 -11
  185. warp/tests/test_fem.py +1034 -124
  186. warp/tests/test_fp16.py +23 -16
  187. warp/tests/test_func.py +187 -86
  188. warp/tests/test_generics.py +194 -49
  189. warp/tests/test_grad.py +123 -181
  190. warp/tests/test_grad_customs.py +176 -0
  191. warp/tests/test_hash_grid.py +35 -34
  192. warp/tests/test_import.py +10 -23
  193. warp/tests/test_indexedarray.py +24 -25
  194. warp/tests/test_intersect.py +18 -9
  195. warp/tests/test_large.py +141 -0
  196. warp/tests/test_launch.py +14 -41
  197. warp/tests/test_lerp.py +64 -65
  198. warp/tests/test_lvalue.py +493 -0
  199. warp/tests/test_marching_cubes.py +12 -13
  200. warp/tests/test_mat.py +517 -2898
  201. warp/tests/test_mat_lite.py +115 -0
  202. warp/tests/test_mat_scalar_ops.py +2889 -0
  203. warp/tests/test_math.py +103 -9
  204. warp/tests/test_matmul.py +304 -69
  205. warp/tests/test_matmul_lite.py +410 -0
  206. warp/tests/test_mesh.py +60 -22
  207. warp/tests/test_mesh_query_aabb.py +21 -25
  208. warp/tests/test_mesh_query_point.py +111 -22
  209. warp/tests/test_mesh_query_ray.py +12 -24
  210. warp/tests/test_mlp.py +30 -22
  211. warp/tests/test_model.py +92 -89
  212. warp/tests/test_modules_lite.py +39 -0
  213. warp/tests/test_multigpu.py +88 -114
  214. warp/tests/test_noise.py +12 -11
  215. warp/tests/test_operators.py +16 -20
  216. warp/tests/test_options.py +11 -11
  217. warp/tests/test_pinned.py +17 -18
  218. warp/tests/test_print.py +32 -11
  219. warp/tests/test_quat.py +275 -129
  220. warp/tests/test_rand.py +18 -16
  221. warp/tests/test_reload.py +38 -34
  222. warp/tests/test_rounding.py +50 -43
  223. warp/tests/test_runlength_encode.py +168 -20
  224. warp/tests/test_smoothstep.py +9 -11
  225. warp/tests/test_snippet.py +143 -0
  226. warp/tests/test_sparse.py +261 -63
  227. warp/tests/test_spatial.py +276 -243
  228. warp/tests/test_streams.py +110 -85
  229. warp/tests/test_struct.py +268 -63
  230. warp/tests/test_tape.py +39 -21
  231. warp/tests/test_torch.py +90 -86
  232. warp/tests/test_transient_module.py +10 -12
  233. warp/tests/test_types.py +363 -0
  234. warp/tests/test_utils.py +451 -0
  235. warp/tests/test_vec.py +354 -2050
  236. warp/tests/test_vec_lite.py +73 -0
  237. warp/tests/test_vec_scalar_ops.py +2099 -0
  238. warp/tests/test_volume.py +418 -376
  239. warp/tests/test_volume_write.py +124 -134
  240. warp/tests/unittest_serial.py +35 -0
  241. warp/tests/unittest_suites.py +291 -0
  242. warp/tests/unittest_utils.py +342 -0
  243. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  244. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  245. warp/thirdparty/appdirs.py +36 -45
  246. warp/thirdparty/unittest_parallel.py +589 -0
  247. warp/types.py +622 -211
  248. warp/utils.py +54 -393
  249. warp_lang-1.0.0b6.dist-info/METADATA +238 -0
  250. warp_lang-1.0.0b6.dist-info/RECORD +409 -0
  251. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  252. examples/example_cache_management.py +0 -40
  253. examples/example_multigpu.py +0 -54
  254. examples/example_struct.py +0 -65
  255. examples/fem/example_stokes_transfer_3d.py +0 -210
  256. warp/bin/warp-clang.so +0 -0
  257. warp/bin/warp.so +0 -0
  258. warp/fem/field/discrete_field.py +0 -80
  259. warp/fem/space/nodal_function_space.py +0 -233
  260. warp/tests/test_all.py +0 -223
  261. warp/tests/test_array_scan.py +0 -60
  262. warp/tests/test_base.py +0 -208
  263. warp/tests/test_unresolved_func.py +0 -7
  264. warp/tests/test_unresolved_symbol.py +0 -7
  265. warp_lang-1.0.0b2.dist-info/METADATA +0 -26
  266. warp_lang-1.0.0b2.dist-info/RECORD +0 -380
  267. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  268. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  269. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  270. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  271. {warp_lang-1.0.0b2.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/native/vec.h CHANGED
@@ -33,7 +33,7 @@ struct vec_t
     {
         for( unsigned i=0; i < Length; ++i )
         {
-            c[i] = other[i];
+            c[i] = static_cast<Type>(other[i]);
        }
    }

@@ -284,12 +284,41 @@ inline CUDA_CALLABLE vec_t<2, Type> div(vec_t<2, Type> a, Type s)
     return vec_t<2, Type>(a.c[0]/s,a.c[1]/s);
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> div(Type s, vec_t<Length, Type> a)
+{
+    vec_t<Length, Type> ret;
+    for (unsigned i=0; i < Length; ++i)
+    {
+        ret[i] = s / a[i];
+    }
+    return ret;
+}
+
+template<typename Type>
+inline CUDA_CALLABLE vec_t<3, Type> div(Type s, vec_t<3, Type> a)
+{
+    return vec_t<3, Type>(s/a.c[0],s/a.c[1],s/a.c[2]);
+}
+
+template<typename Type>
+inline CUDA_CALLABLE vec_t<2, Type> div(Type s, vec_t<2, Type> a)
+{
+    return vec_t<2, Type>(s/a.c[0],s/a.c[1]);
+}
+
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE vec_t<Length, Type> operator / (vec_t<Length, Type> a, Type s)
 {
     return div(a,s);
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE vec_t<Length, Type> operator / (Type s, vec_t<Length, Type> a)
+{
+    return div(s, a);
+}
+
 // component wise division
 template<unsigned Length, typename Type>
 inline CUDA_CALLABLE vec_t<Length, Type> cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b)

@@ -383,7 +412,7 @@ inline CUDA_CALLABLE Type tensordot(vec_t<Length, Type> a, vec_t<Length, Type> b
 
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE Type index(const vec_t<Length, Type> & a, int idx)
+inline CUDA_CALLABLE Type extract(const vec_t<Length, Type> & a, int idx)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx >= Length)

@@ -397,7 +426,21 @@ inline CUDA_CALLABLE Type index(const vec_t<Length, Type> & a, int idx)
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void indexset(vec_t<Length, Type>& v, int idx, Type value)
+inline CUDA_CALLABLE Type* index(vec_t<Length, Type>& v, int idx)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx >= Length)
+    {
+        printf("vec index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
+    return &v[idx];
+}
+
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE Type* indexref(vec_t<Length, Type>* v, int idx)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx >= Length)

@@ -407,17 +450,23 @@ inline CUDA_CALLABLE void indexset(vec_t<Length, Type>& v, int idx, Type value)
     }
 #endif
 
-    v[idx] = value;
+    return &((*v)[idx]);
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_indexset(vec_t<Length, Type>& v, int idx, const Type& value,
+inline CUDA_CALLABLE void adj_index(vec_t<Length, Type>& v, int idx,
                                        vec_t<Length, Type>& adj_v, int adj_idx, const Type& adj_value)
 {
     // nop
 }
 
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_indexref(vec_t<Length, Type>* v, int idx,
+                                       vec_t<Length, Type>& adj_v, int adj_idx, const Type& adj_value)
+{
+    // nop
+}
 
 
 template<unsigned Length, typename Type>

@@ -645,7 +694,7 @@ inline CUDA_CALLABLE void adj_vec_t(const vec_t<Length, OtherType>& other, vec_t
 {
     for( unsigned i=0; i < Length; ++i )
     {
-        adj_other[i] += adj_ret[i];
+        adj_other[i] += static_cast<OtherType>(adj_ret[i]);
     }
 }
 
@@ -715,9 +764,30 @@ inline CUDA_CALLABLE void adj_div(vec_t<Length, Type> a, Type s, vec_t<Length, T
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b, vec_t<Length, Type>& adj_a, vec_t<Length, Type>& adj_b, const vec_t<Length, Type>& adj_ret) {
+inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
+{
+
+    adj_s -= dot(a , adj_ret)/ (s * s); // - a / s^2
+
+    for( unsigned i=0; i < Length; ++i )
+    {
+        adj_a[i] += s / adj_ret[i];
+    }
+
+#if FP_CHECK
+    if (!isfinite(a) || !isfinite(s) || !isfinite(adj_a) || !isfinite(adj_s) || !isfinite(adj_ret))
+    {
+        // \TODO: How shall we implement this error message?
+        // printf("adj_div((%f %f %f %f), %f, (%f %f %f %f), %f, (%f %f %f %f)\n", a.x, a.y, a.z, a.w, s, adj_a.x, adj_a.y, adj_a.z, adj_a.w, adj_s, adj_ret.x, adj_ret.y, adj_ret.z, adj_ret.w);
+        assert(0);
+    }
+#endif
+}
+
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b, vec_t<Length, Type>& ret, vec_t<Length, Type>& adj_a, vec_t<Length, Type>& adj_b, const vec_t<Length, Type>& adj_ret) {
     adj_a += cw_div(adj_ret, b);
-    adj_b -= cw_mul(adj_ret, cw_div(cw_div(a, b), b));
+    adj_b -= cw_mul(adj_ret, cw_div(ret, b));
 }
 
 template<unsigned Length, typename Type>

@@ -816,7 +886,7 @@ inline CUDA_CALLABLE void adj_dot(vec_t<3, Type> a, vec_t<3, Type> b, vec_t<3, T
 
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_index(const vec_t<Length, Type> & a, int idx, vec_t<Length, Type> & adj_a, int & adj_idx, Type & adj_ret)
+inline CUDA_CALLABLE void adj_extract(const vec_t<Length, Type> & a, int idx, vec_t<Length, Type> & adj_a, int & adj_idx, Type & adj_ret)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx > Length)

@@ -830,9 +900,12 @@ inline CUDA_CALLABLE void adj_index(const vec_t<Length, Type> & a, int idx, vec_
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_length(vec_t<Length, Type> a, vec_t<Length, Type>& adj_a, const Type adj_ret)
+inline CUDA_CALLABLE void adj_length(vec_t<Length, Type> a, Type ret, vec_t<Length, Type>& adj_a, const Type adj_ret)
 {
-    adj_a += normalize(a)*adj_ret;
+    if (ret > Type(kEps))
+    {
+        adj_a += div(a, ret) * adj_ret;
+    }
 
 #if FP_CHECK
     if (!isfinite(adj_a))

@@ -860,7 +933,7 @@ inline CUDA_CALLABLE void adj_length_sq(vec_t<Length, Type> a, vec_t<Length, Typ
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
+inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Type>& ret, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
 {
     Type d = length(a);
 
@@ -868,9 +941,7 @@ inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Typ
     {
         Type invd = Type(1.0f)/d;
 
-        vec_t<Length, Type> ahat = normalize(a);
-
-        adj_a += (adj_ret*invd - ahat*(dot(ahat, adj_ret))*invd);
+        adj_a += (adj_ret*invd - ret*(dot(ret, adj_ret))*invd);
 
 #if FP_CHECK
         if (!isfinite(adj_a))

@@ -931,8 +1002,8 @@ inline CUDA_CALLABLE void adj_max(const vec_t<Length,Type> &v, vec_t<Length,Type
 
 // Do I need to specialize these for different lengths?
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+{
     vec_t<Length, Type> ret;
     for( unsigned i=0; i < Length; ++i )
     {

@@ -943,8 +1014,8 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr,
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+{
     vec_t<Length, Type> ret;
     for( unsigned i=0; i < Length; ++i )
     {

@@ -955,8 +1026,8 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr,
 }
 
 template<unsigned Length, typename Type>
-inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+{
     vec_t<Length, Type> ret;
     for( unsigned i=0; i < Length; ++i )
     {

@@ -966,6 +1037,17 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr,
     return ret;
 }
 
+template<unsigned Length, typename Type>
+inline CUDA_CALLABLE void adj_atomic_minmax(
+    vec_t<Length,Type> *addr,
+    vec_t<Length,Type> *adj_addr,
+    const vec_t<Length,Type> &value,
+    vec_t<Length,Type> &adj_value)
+{
+    for (unsigned i=0; i < Length; ++i)
+        adj_atomic_minmax(&(addr->c[i]), &(adj_addr->c[i]), value[i], adj_value[i]);
+}
+
 // ok, the original implementation of this didn't take the absolute values.
 // I wouldn't consider this expected behavior. It looks like it's only
 // being used for bounding boxes at the moment, where this doesn't matter,
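
Note (not part of the diff): the new div(Type, vec_t) overloads and the matching operator / allow a scalar numerator to be divided component-wise by a vector. A minimal usage sketch, assuming vec.h is included and vec_t lives in the wp namespace as in Warp's other native headers:

    // Hypothetical usage of the left-scalar division overloads added above.
    wp::vec_t<3, float> v(2.0f, 4.0f, 8.0f);
    wp::vec_t<3, float> r = 1.0f / v;   // component-wise: (0.5f, 0.25f, 0.125f), dispatches to div(s, a)
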
warp/native/warp.cpp CHANGED
@@ -945,6 +945,7 @@ void array_fill_device(void* context, void* arr, int arr_type, const void* value
 
 WP_API int cuda_driver_version() { return 0; }
 WP_API int cuda_toolkit_version() { return 0; }
+WP_API bool cuda_driver_is_initialized() { return false; }
 
 WP_API int nvrtc_supported_arch_count() { return 0; }
 WP_API void nvrtc_supported_archs(int* archs) {}

@@ -994,7 +995,7 @@ WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* i
 WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; }
 WP_API void cuda_unload_module(void* context, void* module) {}
 WP_API void* cuda_get_kernel(void* context, void* module, const char* name) { return NULL; }
-WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, void** args) { return 0;}
+WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args) { return 0;}
 
 WP_API void cuda_set_context_restore_policy(bool always_restore) {}
 WP_API int cuda_get_context_restore_policy() { return false; }
warp/native/warp.cu CHANGED
@@ -302,7 +302,7 @@ void memset_device(void* context, void* dest, int value, size_t n)
 {
     ContextGuard guard(context);
 
-    if ((n%4) > 0)
+    if (true)// ((n%4) > 0)
     {
         // for unaligned lengths fallback to CUDA memset
         check_cuda(cudaMemsetAsync(dest, value, n, get_current_stream()));

@@ -1141,6 +1141,11 @@ int cuda_toolkit_version()
     return CUDA_VERSION;
 }
 
+bool cuda_driver_is_initialized()
+{
+    return is_cuda_driver_initialized();
+}
+
 int nvrtc_supported_arch_count()
 {
     int count;

@@ -1841,14 +1846,34 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
     return kernel;
 }
 
-size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, void** args)
+size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args)
 {
     ContextGuard guard(context);
 
     const int block_dim = 256;
     // CUDA specs up to compute capability 9.0 says the max x-dim grid is 2**31-1, so
     // grid_dim is fine as an int for the near future
-    const int grid_dim = (dim + block_dim - 1)/block_dim;
+    int grid_dim = (dim + block_dim - 1)/block_dim;
+
+    if (max_blocks <= 0) {
+        max_blocks = 2147483647;
+    }
+
+    if (grid_dim < 0)
+    {
+#if defined(_DEBUG)
+        fprintf(stderr, "Warp warning: Overflow in grid dimensions detected for %zu total elements and 256 threads "
+                        "per block.\n  Setting block count to %d.\n", dim, max_blocks);
+#endif
+        grid_dim = max_blocks;
+    }
+    else
+    {
+        if (grid_dim > max_blocks)
+        {
+            grid_dim = max_blocks;
+        }
+    }
 
     CUresult res = cuLaunchKernel_f(
         (CUfunction)kernel,
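
Note (not part of the diff): the reworked cuda_launch_kernel clamps the computed grid size against the new max_blocks argument, treating a non-positive max_blocks as "no limit" and falling back to the cap when the grid-size computation overflows. A self-contained sketch of that clamping logic, using a hypothetical helper name rather than Warp's actual code:

    // Hypothetical helper mirroring the grid-size clamping shown above
    // (block_dim fixed at 256, as in cuda_launch_kernel).
    #include <cstddef>

    static int compute_grid_dim(size_t dim, int max_blocks)
    {
        const int block_dim = 256;
        int grid_dim = (int)((dim + block_dim - 1) / block_dim);

        if (max_blocks <= 0)
            max_blocks = 2147483647;   // non-positive means "no explicit cap"

        if (grid_dim < 0 || grid_dim > max_blocks)
            grid_dim = max_blocks;     // clamp on int overflow or when over the cap

        return grid_dim;
    }
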
warp/native/warp.h CHANGED
@@ -54,11 +54,11 @@ extern "C"
     WP_API void memtile_host(void* dest, const void* src, size_t srcsize, size_t n);
     WP_API void memtile_device(void* context, void* dest, const void* src, size_t srcsize, size_t n);
 
-    WP_API uint64_t bvh_create_host(wp::vec3* lowers, wp::vec3* uppers, int num_bounds);
+    WP_API uint64_t bvh_create_host(wp::vec3* lowers, wp::vec3* uppers, int num_items);
     WP_API void bvh_destroy_host(uint64_t id);
     WP_API void bvh_refit_host(uint64_t id);
 
-    WP_API uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_bounds);
+    WP_API uint64_t bvh_create_device(void* context, wp::vec3* lowers, wp::vec3* uppers, int num_items);
     WP_API void bvh_destroy_device(uint64_t id);
     WP_API void bvh_refit_device(uint64_t id);
 
@@ -214,6 +214,7 @@ extern "C"
 
     WP_API int cuda_driver_version();   // CUDA driver version
     WP_API int cuda_toolkit_version();  // CUDA Toolkit version used to build Warp
+    WP_API bool cuda_driver_is_initialized();
 
     WP_API int nvrtc_supported_arch_count();
     WP_API void nvrtc_supported_archs(int* archs);

@@ -267,7 +268,7 @@ extern "C"
     WP_API void* cuda_load_module(void* context, const char* ptx);
     WP_API void cuda_unload_module(void* context, void* module);
     WP_API void* cuda_get_kernel(void* context, void* module, const char* name);
-    WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, void** args);
+    WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args);
 
     WP_API void cuda_set_context_restore_policy(bool always_restore);
     WP_API int cuda_get_context_restore_policy();