warp-lang 1.0.0b5-py3-none-manylinux2014_x86_64.whl → 1.0.0b6-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. docs/conf.py +3 -4
  2. examples/env/env_ant.py +1 -1
  3. examples/env/env_cartpole.py +1 -1
  4. examples/env/env_humanoid.py +1 -1
  5. examples/example_dem.py +28 -26
  6. examples/example_diffray.py +37 -30
  7. examples/example_fluid.py +7 -3
  8. examples/example_jacobian_ik.py +1 -1
  9. examples/example_mesh_intersect.py +10 -7
  10. examples/example_nvdb.py +3 -3
  11. examples/example_render_opengl.py +19 -10
  12. examples/example_sim_cartpole.py +9 -5
  13. examples/example_sim_cloth.py +29 -25
  14. examples/example_sim_fk_grad.py +2 -2
  15. examples/example_sim_fk_grad_torch.py +3 -3
  16. examples/example_sim_grad_bounce.py +11 -8
  17. examples/example_sim_grad_cloth.py +12 -9
  18. examples/example_sim_granular.py +2 -2
  19. examples/example_sim_granular_collision_sdf.py +13 -13
  20. examples/example_sim_neo_hookean.py +3 -3
  21. examples/example_sim_particle_chain.py +2 -2
  22. examples/example_sim_quadruped.py +8 -5
  23. examples/example_sim_rigid_chain.py +8 -5
  24. examples/example_sim_rigid_contact.py +13 -10
  25. examples/example_sim_rigid_fem.py +2 -2
  26. examples/example_sim_rigid_gyroscopic.py +2 -2
  27. examples/example_sim_rigid_kinematics.py +1 -1
  28. examples/example_sim_trajopt.py +3 -2
  29. examples/fem/example_apic_fluid.py +5 -7
  30. examples/fem/example_diffusion_mgpu.py +18 -16
  31. warp/__init__.py +3 -2
  32. warp/bin/warp.so +0 -0
  33. warp/build_dll.py +29 -9
  34. warp/builtins.py +206 -7
  35. warp/codegen.py +58 -38
  36. warp/config.py +3 -1
  37. warp/context.py +234 -128
  38. warp/fem/__init__.py +2 -2
  39. warp/fem/cache.py +2 -1
  40. warp/fem/field/nodal_field.py +18 -17
  41. warp/fem/geometry/hexmesh.py +11 -6
  42. warp/fem/geometry/quadmesh_2d.py +16 -12
  43. warp/fem/geometry/tetmesh.py +19 -8
  44. warp/fem/geometry/trimesh_2d.py +18 -7
  45. warp/fem/integrate.py +341 -196
  46. warp/fem/quadrature/__init__.py +1 -1
  47. warp/fem/quadrature/pic_quadrature.py +138 -53
  48. warp/fem/quadrature/quadrature.py +81 -9
  49. warp/fem/space/__init__.py +1 -1
  50. warp/fem/space/basis_space.py +169 -51
  51. warp/fem/space/grid_2d_function_space.py +2 -2
  52. warp/fem/space/grid_3d_function_space.py +2 -2
  53. warp/fem/space/hexmesh_function_space.py +2 -2
  54. warp/fem/space/partition.py +9 -6
  55. warp/fem/space/quadmesh_2d_function_space.py +2 -2
  56. warp/fem/space/shape/cube_shape_function.py +27 -15
  57. warp/fem/space/shape/square_shape_function.py +29 -18
  58. warp/fem/space/tetmesh_function_space.py +2 -2
  59. warp/fem/space/topology.py +10 -0
  60. warp/fem/space/trimesh_2d_function_space.py +2 -2
  61. warp/fem/utils.py +10 -5
  62. warp/native/array.h +49 -8
  63. warp/native/builtin.h +31 -14
  64. warp/native/cuda_util.cpp +8 -3
  65. warp/native/cuda_util.h +1 -0
  66. warp/native/exports.h +1177 -1108
  67. warp/native/intersect.h +4 -4
  68. warp/native/intersect_adj.h +8 -8
  69. warp/native/mat.h +65 -6
  70. warp/native/mesh.h +126 -5
  71. warp/native/quat.h +28 -4
  72. warp/native/vec.h +76 -14
  73. warp/native/warp.cu +1 -6
  74. warp/render/render_opengl.py +261 -109
  75. warp/sim/import_mjcf.py +13 -7
  76. warp/sim/import_urdf.py +14 -14
  77. warp/sim/inertia.py +17 -18
  78. warp/sim/model.py +67 -67
  79. warp/sim/render.py +1 -1
  80. warp/sparse.py +6 -6
  81. warp/stubs.py +19 -81
  82. warp/tape.py +1 -1
  83. warp/tests/__main__.py +3 -6
  84. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  85. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  86. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  87. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  88. warp/tests/aux_test_unresolved_func.py +14 -0
  89. warp/tests/aux_test_unresolved_symbol.py +14 -0
  90. warp/tests/{test_kinematics.py → disabled_kinematics.py} +10 -12
  91. warp/tests/run_coverage_serial.py +31 -0
  92. warp/tests/test_adam.py +102 -106
  93. warp/tests/test_arithmetic.py +39 -40
  94. warp/tests/test_array.py +46 -48
  95. warp/tests/test_array_reduce.py +25 -19
  96. warp/tests/test_atomic.py +62 -26
  97. warp/tests/test_bool.py +16 -11
  98. warp/tests/test_builtins_resolution.py +1292 -0
  99. warp/tests/test_bvh.py +9 -12
  100. warp/tests/test_closest_point_edge_edge.py +53 -57
  101. warp/tests/test_codegen.py +164 -134
  102. warp/tests/test_compile_consts.py +13 -19
  103. warp/tests/test_conditional.py +30 -32
  104. warp/tests/test_copy.py +9 -12
  105. warp/tests/test_ctypes.py +90 -98
  106. warp/tests/test_dense.py +20 -14
  107. warp/tests/test_devices.py +34 -35
  108. warp/tests/test_dlpack.py +74 -75
  109. warp/tests/test_examples.py +215 -97
  110. warp/tests/test_fabricarray.py +15 -21
  111. warp/tests/test_fast_math.py +14 -11
  112. warp/tests/test_fem.py +280 -97
  113. warp/tests/test_fp16.py +19 -15
  114. warp/tests/test_func.py +177 -194
  115. warp/tests/test_generics.py +71 -77
  116. warp/tests/test_grad.py +83 -32
  117. warp/tests/test_grad_customs.py +7 -9
  118. warp/tests/test_hash_grid.py +6 -10
  119. warp/tests/test_import.py +9 -23
  120. warp/tests/test_indexedarray.py +19 -21
  121. warp/tests/test_intersect.py +15 -9
  122. warp/tests/test_large.py +17 -19
  123. warp/tests/test_launch.py +14 -17
  124. warp/tests/test_lerp.py +63 -63
  125. warp/tests/test_lvalue.py +84 -35
  126. warp/tests/test_marching_cubes.py +9 -13
  127. warp/tests/test_mat.py +388 -3004
  128. warp/tests/test_mat_lite.py +9 -12
  129. warp/tests/test_mat_scalar_ops.py +2889 -0
  130. warp/tests/test_math.py +10 -11
  131. warp/tests/test_matmul.py +104 -100
  132. warp/tests/test_matmul_lite.py +72 -98
  133. warp/tests/test_mesh.py +35 -32
  134. warp/tests/test_mesh_query_aabb.py +18 -25
  135. warp/tests/test_mesh_query_point.py +39 -23
  136. warp/tests/test_mesh_query_ray.py +9 -21
  137. warp/tests/test_mlp.py +8 -9
  138. warp/tests/test_model.py +89 -93
  139. warp/tests/test_modules_lite.py +15 -25
  140. warp/tests/test_multigpu.py +87 -114
  141. warp/tests/test_noise.py +10 -12
  142. warp/tests/test_operators.py +14 -21
  143. warp/tests/test_options.py +10 -11
  144. warp/tests/test_pinned.py +16 -18
  145. warp/tests/test_print.py +16 -20
  146. warp/tests/test_quat.py +121 -88
  147. warp/tests/test_rand.py +12 -13
  148. warp/tests/test_reload.py +27 -32
  149. warp/tests/test_rounding.py +7 -10
  150. warp/tests/test_runlength_encode.py +105 -106
  151. warp/tests/test_smoothstep.py +8 -9
  152. warp/tests/test_snippet.py +13 -22
  153. warp/tests/test_sparse.py +30 -29
  154. warp/tests/test_spatial.py +179 -174
  155. warp/tests/test_streams.py +100 -107
  156. warp/tests/test_struct.py +98 -67
  157. warp/tests/test_tape.py +11 -17
  158. warp/tests/test_torch.py +89 -86
  159. warp/tests/test_transient_module.py +9 -12
  160. warp/tests/test_types.py +328 -50
  161. warp/tests/test_utils.py +217 -218
  162. warp/tests/test_vec.py +133 -2133
  163. warp/tests/test_vec_lite.py +8 -11
  164. warp/tests/test_vec_scalar_ops.py +2099 -0
  165. warp/tests/test_volume.py +391 -382
  166. warp/tests/test_volume_write.py +122 -135
  167. warp/tests/unittest_serial.py +35 -0
  168. warp/tests/unittest_suites.py +291 -0
  169. warp/tests/{test_base.py → unittest_utils.py} +138 -25
  170. warp/tests/{test_misc.py → unused_test_misc.py} +13 -5
  171. warp/tests/{test_debug.py → walkthough_debug.py} +2 -15
  172. warp/thirdparty/unittest_parallel.py +257 -54
  173. warp/types.py +119 -98
  174. warp/utils.py +14 -0
  175. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/METADATA +2 -1
  176. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/RECORD +182 -178
  177. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/WHEEL +1 -1
  178. warp/tests/test_all.py +0 -239
  179. warp/tests/test_conditional_unequal_types_kernels.py +0 -14
  180. warp/tests/test_coverage.py +0 -38
  181. warp/tests/test_unresolved_func.py +0 -7
  182. warp/tests/test_unresolved_symbol.py +0 -7
  183. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  184. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  185. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  186. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/LICENSE.md +0 -0
  187. {warp_lang-1.0.0b5.dist-info → warp_lang-1.0.0b6.dist-info}/top_level.txt +0 -0
warp/native/intersect.h CHANGED
@@ -869,7 +869,7 @@ CUDA_CALLABLE inline void adj_closest_point_to_triangle(
  wp::adj_sub(var_9, var_71, adj_9, adj_71, adj_73);
  wp::adj_mul(var_21, var_70, adj_21, adj_70, adj_72);
  wp::adj_mul(var_41, var_70, adj_41, adj_70, adj_71);
- wp::adj_div(var_9, var_69, adj_9, adj_69, adj_70);
+ wp::adj_div(var_9, var_69, var_70, adj_9, adj_69, adj_70);
  wp::adj_add(var_68, var_21, adj_68, adj_21, adj_69);
  wp::adj_add(var_53, var_41, adj_53, adj_41, adj_68);
  wp::adj_select(var_64, var_50, var_66, adj_64, adj_50, adj_66, adj_67);
@@ -881,7 +881,7 @@ CUDA_CALLABLE inline void adj_closest_point_to_triangle(
  }
  wp::adj_sub(var_32, var_33, adj_32, adj_33, adj_62);
  wp::adj_sub(var_13, var_12, adj_13, adj_12, adj_60);
- wp::adj_div(var_54, var_57, adj_54, adj_57, adj_58);
+ wp::adj_div(var_54, var_57, var_58, adj_54, adj_57, adj_58);
  wp::adj_add(var_55, var_56, adj_55, adj_56, adj_57);
  wp::adj_sub(var_32, var_33, adj_32, adj_33, adj_56);
  wp::adj_sub(var_13, var_12, adj_13, adj_12, adj_55);
@@ -896,7 +896,7 @@ CUDA_CALLABLE inline void adj_closest_point_to_triangle(
  wp::adj_vec2(var_48, var_5, adj_48, adj_5, adj_49);
  wp::adj_sub(var_9, var_43, adj_9, adj_43, adj_48);
  }
- wp::adj_div(var_4, var_42, adj_4, adj_42, adj_43);
+ wp::adj_div(var_4, var_42, var_43, adj_4, adj_42, adj_43);
  wp::adj_sub(var_4, var_33, adj_4, adj_33, adj_42);
  wp::adj_sub(var_39, var_40, adj_39, adj_40, adj_41);
  wp::adj_mul(var_3, var_33, adj_3, adj_33, adj_40);
@@ -917,7 +917,7 @@ CUDA_CALLABLE inline void adj_closest_point_to_triangle(
  wp::adj_vec2(var_28, var_23, adj_28, adj_23, adj_29);
  wp::adj_sub(var_9, var_23, adj_9, adj_23, adj_28);
  }
- wp::adj_div(var_3, var_22, adj_3, adj_22, adj_23);
+ wp::adj_div(var_3, var_22, var_23, adj_3, adj_22, adj_23);
  wp::adj_sub(var_3, var_12, adj_3, adj_12, adj_22);
  wp::adj_sub(var_19, var_20, adj_19, adj_20, adj_21);
  wp::adj_mul(var_12, var_4, adj_12, adj_4, adj_20);
@@ -276,7 +276,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  label1:;
  adj_71 += adj_ret;
  wp::adj_vec3(var_61, var_62, var_70, adj_61, adj_62, adj_70, adj_71);
- wp::adj_length(var_69, adj_69, adj_70);
+ wp::adj_length(var_69, var_70, adj_69, adj_70);
  wp::adj_sub(var_68, var_65, adj_68, adj_65, adj_69);
  wp::adj_add(var_p2, var_67, adj_p2, adj_67, adj_68);
  wp::adj_mul(var_66, var_62, adj_66, adj_62, adj_67);
@@ -297,7 +297,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  wp::adj_select(var_51, var_49, var_54, adj_51, adj_49, adj_54, adj_55);
  if (var_51) {
  wp::adj_clamp(var_53, var_6, var_25, adj_53, adj_6, adj_25, adj_54);
- wp::adj_div(var_52, var_3, adj_52, adj_3, adj_53);
+ wp::adj_div(var_52, var_3, var_53, adj_52, adj_3, adj_53);
  wp::adj_sub(var_30, var_21, adj_30, adj_21, adj_52);
  }
  }
@@ -305,10 +305,10 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  wp::adj_select(var_45, var_41, var_48, adj_45, adj_41, adj_48, adj_49);
  if (var_45) {
  wp::adj_clamp(var_47, var_6, var_25, adj_47, adj_6, adj_25, adj_48);
- wp::adj_div(var_46, var_3, adj_46, adj_3, adj_47);
+ wp::adj_div(var_46, var_3, var_47, adj_46, adj_3, adj_47);
  wp::adj_neg(var_21, adj_21, adj_46);
  }
- wp::adj_div(var_43, var_4, adj_43, adj_4, adj_44);
+ wp::adj_div(var_43, var_4, var_44, adj_43, adj_4, adj_44);
  wp::adj_add(var_42, var_5, adj_42, adj_5, adj_43);
  wp::adj_mul(var_30, var_41, adj_30, adj_41, adj_42);
  wp::adj_select(var_34, var_6, var_40, adj_34, adj_6, adj_40, adj_41);
@@ -317,7 +317,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  wp::adj_select(var_34, var_28, var_39, adj_34, adj_28, adj_39, adj_40);
  if (var_34) {
  wp::adj_clamp(var_38, var_6, var_25, adj_38, adj_6, adj_25, adj_39);
- wp::adj_div(var_37, var_33, adj_37, adj_33, adj_38);
+ wp::adj_div(var_37, var_33, var_38, adj_37, adj_33, adj_38);
  wp::adj_sub(var_35, var_36, adj_35, adj_36, adj_37);
  wp::adj_mul(var_21, var_4, adj_21, adj_4, adj_36);
  wp::adj_mul(var_30, var_5, adj_30, adj_5, adj_35);
@@ -332,7 +332,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  if (var_22) {
  wp::adj_cast_float(var_6, adj_6, adj_27);
  wp::adj_clamp(var_24, var_6, var_25, adj_24, adj_6, adj_25, adj_26);
- wp::adj_div(var_23, var_3, adj_23, adj_3, adj_24);
+ wp::adj_div(var_23, var_3, var_24, adj_23, adj_3, adj_24);
  wp::adj_neg(var_21, adj_21, adj_23);
  }
  wp::adj_dot(var_0, var_2, adj_0, adj_2, adj_21);
@@ -341,7 +341,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  wp::adj_select(var_15, var_7, var_16, adj_15, adj_7, adj_16, adj_19);
  if (var_15) {
  wp::adj_cast_float(var_17, adj_17, adj_18);
- wp::adj_div(var_5, var_4, adj_5, adj_4, adj_17);
+ wp::adj_div(var_5, var_4, var_17, adj_5, adj_4, adj_17);
  wp::adj_cast_float(var_6, adj_6, adj_16);
  }
  if (var_13) {
@@ -349,7 +349,7 @@ static CUDA_CALLABLE void adj_closest_point_edge_edge(vec3 var_p1,
  adj_14 += adj_ret;
  wp::adj_vec3(var_7, var_8, var_10, adj_7, adj_8, adj_10, adj_14);
  }
- wp::adj_length(var_9, adj_9, adj_10);
+ wp::adj_length(var_9, var_10, adj_9, adj_10);
  wp::adj_sub(var_p2, var_p1, adj_p2, adj_p1, adj_9);
  wp::adj_cast_float(var_6, adj_6, adj_8);
  wp::adj_cast_float(var_6, adj_6, adj_7);
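
Note: the rewritten calls above follow the adjoint convention changed in this release, where the generated backward code passes the forward-pass result (for example var_70, the value produced by the corresponding wp::div or wp::length call) into the adjoint so it does not have to be recomputed. The following is a minimal, standalone C++ sketch of that convention for scalar division; the function and variable names are illustrative only and are not part of Warp's headers.

    // Standalone sketch (not Warp's headers): the adjoint of y = a / b
    // receives the primal result y so the backward pass can reuse it.
    #include <cstdio>

    // forward: y = a / b
    float div_fwd(float a, float b) { return a / b; }

    // backward: given y = a / b and the incoming gradient adj_y,
    // accumulate gradients for a and b without recomputing a / b.
    void adj_div_sketch(float a, float b, float y, float& adj_a, float& adj_b, float adj_y)
    {
        adj_a += adj_y / b;       // d(a/b)/da = 1/b
        adj_b -= y / b * adj_y;   // d(a/b)/db = -a/b^2 = -(y/b)
    }

    int main()
    {
        float a = 3.0f, b = 2.0f;
        float y = div_fwd(a, b);
        float adj_a = 0.0f, adj_b = 0.0f;
        adj_div_sketch(a, b, y, adj_a, adj_b, /*adj_y=*/1.0f);
        printf("y=%f dA=%f dB=%f\n", y, adj_a, adj_b);
        return 0;
    }
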
warp/native/mat.h CHANGED
@@ -297,6 +297,18 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_max(mat_t<Rows,Cols,Type> * ad
  return m;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_atomic_minmax(
+ mat_t<Rows,Cols,Type> *addr,
+ mat_t<Rows,Cols,Type> *adj_addr,
+ const mat_t<Rows,Cols,Type> &value,
+ mat_t<Rows,Cols,Type> &adj_value)
+ {
+ for (unsigned i=0; i < Rows; ++i)
+ for (unsigned j=0; j < Cols; ++j)
+ adj_atomic_minmax(&addr->data[i][j], &adj_addr->data[i][j], value.data[i][j], adj_value.data[i][j]);
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE vec_t<Cols,Type> extract(const mat_t<Rows,Cols,Type>& m, int row)
  {
@@ -425,7 +437,22 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, T
  }
  }

- return t;
+ return t;
+ }
+
+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(Type b, const mat_t<Rows,Cols,Type>& a)
+ {
+ mat_t<Rows,Cols,Type> t;
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ t.data[i][j] = b / a.data[i][j];
+ }
+ }
+
+ return t;
  }

  template<unsigned Rows, unsigned Cols, typename Type>
@@ -440,7 +467,7 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(const mat_t<Rows,Cols,Type>& a, T
  }
  }

- return t;
+ return t;
  }

  template<unsigned Rows, unsigned Cols, typename Type>
@@ -473,6 +500,17 @@ inline CUDA_CALLABLE vec_t<Rows,Type> mul(const mat_t<Rows,Cols,Type>& a, const
  return r;
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE vec_t<Cols,Type> mul(const vec_t<Rows,Type>& b, const mat_t<Rows,Cols,Type>& a)
+ {
+ vec_t<Cols,Type> r = a.get_row(0)*b[0];
+ for( unsigned i=1; i < Rows; ++i )
+ {
+ r += a.get_row(i)*b[i];
+ }
+ return r;
+ }
+
  template<unsigned Rows, unsigned Cols, unsigned ColsOut, typename Type>
  inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a, const mat_t<Cols,ColsOut,Type>& b)
  {
@@ -932,6 +970,20 @@ inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<
  }
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_div(Type s, const mat_t<Rows,Cols,Type>& a, Type& adj_s, mat_t<Rows,Cols,Type>& adj_a, const mat_t<Rows,Cols,Type>& adj_ret)
+ {
+ adj_s -= tensordot(a , adj_ret)/ (s * s); // - a / s^2
+
+ for (unsigned i=0; i < Rows; ++i)
+ {
+ for (unsigned j=0; j < Cols; ++j)
+ {
+ adj_a.data[i][j] += s / adj_ret.data[i][j];
+ }
+ }
+ }
+
  template<unsigned Rows, unsigned Cols, typename Type>
  inline CUDA_CALLABLE void adj_mul(const mat_t<Rows,Cols,Type>& a, Type b, mat_t<Rows,Cols,Type>& adj_a, Type& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
@@ -965,6 +1017,13 @@ inline CUDA_CALLABLE void adj_mul(const mat_t<Rows,Cols,Type>& a, const vec_t<Co
  adj_b += mul(transpose(a), adj_ret);
  }

+ template<unsigned Rows, unsigned Cols, typename Type>
+ inline CUDA_CALLABLE void adj_mul(const vec_t<Rows,Type>& b, const mat_t<Rows,Cols,Type>& a, vec_t<Rows,Type>& adj_b, mat_t<Rows,Cols,Type>& adj_a, const vec_t<Cols,Type>& adj_ret)
+ {
+ adj_a += outer(b, adj_ret);
+ adj_b += mul(adj_ret, transpose(a));
+ }
+
  template<unsigned Rows, unsigned Cols, unsigned ColsOut, typename Type>
  inline CUDA_CALLABLE void adj_mul(const mat_t<Rows,Cols,Type>& a, const mat_t<Cols,ColsOut,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Cols,ColsOut,Type>& adj_b, const mat_t<Rows,ColsOut,Type>& adj_ret)
  {
@@ -1105,10 +1164,10 @@ inline CUDA_CALLABLE void adj_determinant(const mat_t<4,4,Type>& m, mat_t<4,4,Ty
  }

  template<unsigned Rows, typename Type>
- inline CUDA_CALLABLE void adj_inverse(const mat_t<Rows,Rows,Type>& m, mat_t<Rows,Rows,Type>& adj_m, const mat_t<Rows,Rows,Type>& adj_ret)
+ inline CUDA_CALLABLE void adj_inverse(const mat_t<Rows,Rows,Type>& m, mat_t<Rows,Rows,Type>& ret, mat_t<Rows,Rows,Type>& adj_m, const mat_t<Rows,Rows,Type>& adj_ret)
  {
  // todo: how to cache this from the forward pass?
- mat_t<Rows,Rows,Type> invt = transpose(inverse(m));
+ mat_t<Rows,Rows,Type> invt = transpose(ret);

  // see https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf 2.2.3
  adj_m -= mul(mul(invt, adj_ret), invt);
@@ -1150,10 +1209,10 @@ inline CUDA_CALLABLE void adj_cw_mul(const mat_t<Rows,Cols,Type>& a, const mat_t
  }

  template<unsigned Rows, unsigned Cols, typename Type>
- inline CUDA_CALLABLE void adj_cw_div(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
+ inline CUDA_CALLABLE void adj_cw_div(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& ret, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
  {
  adj_a += cw_div(adj_ret, b);
- adj_b -= cw_mul(adj_ret, cw_div(cw_div(a, b), b));
+ adj_b -= cw_mul(adj_ret, cw_div(ret, b));
  }

  // adjoint for the constant constructor:
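
Note: besides the cached-result adjoints, these hunks add a scalar-by-matrix division overload (div(Type b, const mat_t&)) and a row-vector-times-matrix overload (mul(vec_t<Rows>, mat_t<Rows,Cols>)) that accumulates scaled matrix rows. The standalone C++ sketch below illustrates that same row-accumulation computation with plain arrays; it is not Warp code, just the arithmetic the new overload performs.

    // r[j] = sum_i v[i] * m[i][j]  (row vector times matrix)
    #include <cstdio>

    int main()
    {
        const float m[2][3] = {{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}};
        const float v[2]    = {1.f, 10.f};

        float r[3] = {0.f, 0.f, 0.f};
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 3; ++j)
                r[j] += v[i] * m[i][j];   // accumulate v[i] times row i of m

        printf("%g %g %g\n", r[0], r[1], r[2]);  // expected: 41 52 63
        return 0;
    }
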
warp/native/mesh.h CHANGED
@@ -1181,7 +1181,7 @@ CUDA_CALLABLE inline bool mesh_query_point_sign_winding_number(uint64_t id, cons
  }
  }

- CUDA_CALLABLE inline void adj_mesh_query_point_no_sign(uint64_t id, const vec3& point, float max_dist, int& face, float& u, float& v,
+ CUDA_CALLABLE inline void adj_mesh_query_point_no_sign(uint64_t id, const vec3& point, float max_dist, const int& face, const float& u, const float& v,
  uint64_t adj_id, vec3& adj_point, float& adj_max_dist, int& adj_face, float& adj_u, float& adj_v, bool& adj_ret)
  {
  Mesh mesh = mesh_get(id);
@@ -1202,7 +1202,7 @@ CUDA_CALLABLE inline void adj_mesh_query_point_no_sign(uint64_t id, const vec3&
  adj_closest_point_to_triangle(p, q, r, point, adj_p, adj_q, adj_r, adj_point, adj_uv);
  }

- CUDA_CALLABLE inline void adj_mesh_query_furthest_point_no_sign(uint64_t id, const vec3& point, float min_dist, int& face, float& u, float& v,
+ CUDA_CALLABLE inline void adj_mesh_query_furthest_point_no_sign(uint64_t id, const vec3& point, float min_dist, const int& face, const float& u, const float& v,
  uint64_t adj_id, vec3& adj_point, float& adj_min_dist, int& adj_face, float& adj_u, float& adj_v, bool& adj_ret)
  {
  Mesh mesh = mesh_get(id);
@@ -1223,24 +1223,116 @@ CUDA_CALLABLE inline void adj_mesh_query_furthest_point_no_sign(uint64_t id, con
  adj_closest_point_to_triangle(p, q, r, point, adj_p, adj_q, adj_r, adj_point, adj_uv); // Todo for Miles :>
  }

- CUDA_CALLABLE inline void adj_mesh_query_point(uint64_t id, const vec3& point, float max_dist, float& inside, int& face, float& u, float& v,
+ CUDA_CALLABLE inline void adj_mesh_query_point(uint64_t id, const vec3& point, float max_dist, const float& inside, const int& face, const float& u, const float& v,
  uint64_t adj_id, vec3& adj_point, float& adj_max_dist, float& adj_inside, int& adj_face, float& adj_u, float& adj_v, bool& adj_ret)
  {
  adj_mesh_query_point_no_sign(id, point, max_dist, face, u, v, adj_id, adj_point, adj_max_dist, adj_face, adj_u, adj_v, adj_ret);
  }

- CUDA_CALLABLE inline void adj_mesh_query_point_sign_normal(uint64_t id, const vec3& point, float max_dist, float& inside, int& face, float& u, float& v, const float epsilon,
+ CUDA_CALLABLE inline void adj_mesh_query_point_sign_normal(uint64_t id, const vec3& point, float max_dist, const float& inside, const int& face, const float& u, const float& v, const float epsilon,
  uint64_t adj_id, vec3& adj_point, float& adj_max_dist, float& adj_inside, int& adj_face, float& adj_u, float& adj_v, float& adj_epsilon, bool& adj_ret)
  {
  adj_mesh_query_point_no_sign(id, point, max_dist, face, u, v, adj_id, adj_point, adj_max_dist, adj_face, adj_u, adj_v, adj_ret);
  }

- CUDA_CALLABLE inline void adj_mesh_query_point_sign_winding_number(uint64_t id, const vec3& point, float max_dist, float& inside, int& face, float& u, float& v, const float accuracy, const float winding_number_threshold,
+ CUDA_CALLABLE inline void adj_mesh_query_point_sign_winding_number(uint64_t id, const vec3& point, float max_dist, const float& inside, const int& face, const float& u, const float& v, const float accuracy, const float winding_number_threshold,
  uint64_t adj_id, vec3& adj_point, float& adj_max_dist, float& adj_inside, int& adj_face, float& adj_u, float& adj_v, float& adj_accuracy, float& adj_winding_number_threshold, bool& adj_ret)
  {
  adj_mesh_query_point_no_sign(id, point, max_dist, face, u, v, adj_id, adj_point, adj_max_dist, adj_face, adj_u, adj_v, adj_ret);
  }

+
+ // Stores the result of querying the closest point on a mesh.
+ struct mesh_query_point_t
+ {
+ CUDA_CALLABLE mesh_query_point_t()
+ {
+ }
+
+ CUDA_CALLABLE mesh_query_point_t(int)
+ {
+ // For backward pass.
+ }
+
+ bool result;
+ float sign;
+ int face;
+ float u;
+ float v;
+ };
+
+ CUDA_CALLABLE inline mesh_query_point_t mesh_query_point(uint64_t id, const vec3& point, float max_dist)
+ {
+ mesh_query_point_t query;
+ query.result = mesh_query_point(id, point, max_dist, query.sign, query.face, query.u, query.v);
+ return query;
+ }
+
+ CUDA_CALLABLE inline mesh_query_point_t mesh_query_point_no_sign(uint64_t id, const vec3& point, float max_dist)
+ {
+ mesh_query_point_t query;
+ query.sign = 0.0;
+ query.result = mesh_query_point_no_sign(id, point, max_dist, query.face, query.u, query.v);
+ return query;
+ }
+
+ CUDA_CALLABLE inline mesh_query_point_t mesh_query_furthest_point_no_sign(uint64_t id, const vec3& point, float min_dist)
+ {
+ mesh_query_point_t query;
+ query.sign = 0.0;
+ query.result = mesh_query_furthest_point_no_sign(id, point, min_dist, query.face, query.u, query.v);
+ return query;
+ }
+
+ CUDA_CALLABLE inline mesh_query_point_t mesh_query_point_sign_normal(uint64_t id, const vec3& point, float max_dist, const float epsilon = 1e-3f)
+ {
+ mesh_query_point_t query;
+ query.result = mesh_query_point_sign_normal(id, point, max_dist, query.sign, query.face, query.u, query.v, epsilon);
+ return query;
+ }
+
+ CUDA_CALLABLE inline mesh_query_point_t mesh_query_point_sign_winding_number(uint64_t id, const vec3& point, float max_dist, float accuracy, float winding_number_threshold)
+ {
+ mesh_query_point_t query;
+ query.result = mesh_query_point_sign_winding_number(id, point, max_dist, query.sign, query.face, query.u, query.v, accuracy, winding_number_threshold);
+ return query;
+ }
+
+ CUDA_CALLABLE inline void adj_mesh_query_point(uint64_t id, const vec3& point, float max_dist, const mesh_query_point_t& ret,
+ uint64_t adj_id, vec3& adj_point, float& adj_max_dist, mesh_query_point_t& adj_ret)
+ {
+ adj_mesh_query_point(id, point, max_dist, ret.sign, ret.face, ret.u, ret.v,
+ adj_id, adj_point, adj_max_dist, adj_ret.sign, adj_ret.face, adj_ret.u, adj_ret.v, adj_ret.result);
+ }
+
+ CUDA_CALLABLE inline void adj_mesh_query_point_no_sign(uint64_t id, const vec3& point, float max_dist, const mesh_query_point_t& ret,
+ uint64_t adj_id, vec3& adj_point, float& adj_max_dist, mesh_query_point_t& adj_ret)
+ {
+ adj_mesh_query_point_no_sign(id, point, max_dist, ret.face, ret.u, ret.v,
+ adj_id, adj_point, adj_max_dist, adj_ret.face, adj_ret.u, adj_ret.v, adj_ret.result);
+ }
+
+ CUDA_CALLABLE inline void adj_mesh_query_furthest_point_no_sign(uint64_t id, const vec3& point, float min_dist, const mesh_query_point_t& ret,
+ uint64_t adj_id, vec3& adj_point, float& adj_min_dist, mesh_query_point_t& adj_ret)
+ {
+ adj_mesh_query_furthest_point_no_sign(id, point, min_dist, ret.face, ret.u, ret.v,
+ adj_id, adj_point, adj_min_dist, adj_ret.face, adj_ret.u, adj_ret.v, adj_ret.result);
+ }
+
+ CUDA_CALLABLE inline void adj_mesh_query_point_sign_normal(uint64_t id, const vec3& point, float max_dist, float epsilon, const mesh_query_point_t& ret,
+ uint64_t adj_id, vec3& adj_point, float& adj_max_dist, float& adj_epsilon, mesh_query_point_t& adj_ret)
+ {
+ adj_mesh_query_point_sign_normal(id, point, max_dist, ret.sign, ret.face, ret.u, ret.v, epsilon,
+ adj_id, adj_point, adj_max_dist, adj_ret.sign, adj_ret.face, adj_ret.u, adj_ret.v, epsilon, adj_ret.result);
+ }
+
+ CUDA_CALLABLE inline void adj_mesh_query_point_sign_winding_number(uint64_t id, const vec3& point, float max_dist, float accuracy, float winding_number_threshold, const mesh_query_point_t& ret,
+ uint64_t adj_id, vec3& adj_point, float& adj_max_dist, float& adj_accuracy, float& adj_winding_number_threshold, mesh_query_point_t& adj_ret)
+ {
+ adj_mesh_query_point_sign_winding_number(id, point, max_dist, ret.sign, ret.face, ret.u, ret.v, accuracy, winding_number_threshold,
+ adj_id, adj_point, adj_max_dist, adj_ret.sign, adj_ret.face, adj_ret.u, adj_ret.v, adj_accuracy, adj_winding_number_threshold, adj_ret.result);
+ }
+
  CUDA_CALLABLE inline bool mesh_query_ray(uint64_t id, const vec3& start, const vec3& dir, float max_t, float& t, float& u, float& v, float& sign, vec3& normal, int& face)
  {
  Mesh mesh = mesh_get(id);
@@ -1353,6 +1445,35 @@ CUDA_CALLABLE inline void adj_mesh_query_ray(
  }


+ // Stores the result of querying the closest point on a mesh.
+ struct mesh_query_ray_t
+ {
+ CUDA_CALLABLE mesh_query_ray_t()
+ {
+ }
+
+ CUDA_CALLABLE mesh_query_ray_t(int)
+ {
+ // For backward pass.
+ }
+
+ bool result;
+ float sign;
+ int face;
+ float t;
+ float u;
+ float v;
+ vec3 normal;
+ };
+
+ CUDA_CALLABLE inline mesh_query_ray_t mesh_query_ray(uint64_t id, const vec3& start, const vec3& dir, float max_t)
+ {
+ mesh_query_ray_t query;
+ query.result = mesh_query_ray(id, start, dir, max_t, query.t, query.u, query.v, query.sign, query.normal, query.face);
+ return query;
+ }
+
+
  // determine if a point is inside (ret < 0 ) or outside the mesh (ret > 0)
  CUDA_CALLABLE inline float mesh_query_inside(uint64_t id, const vec3& p)
  {
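
Note: the new mesh_query_point_t and mesh_query_ray_t structs above wrap the existing out-parameter query functions so a single value carries the hit flag together with all of the outputs. The standalone C++ sketch below shows that wrapping pattern with a dummy query; names such as query_point_legacy and query_point_result are invented for illustration and are not Warp's API.

    #include <cstdio>

    // stand-in for an out-parameter query such as mesh_query_point
    bool query_point_legacy(float x, float& sign, int& face, float& u, float& v)
    {
        sign = 1.0f; face = 42; u = 0.25f; v = 0.5f;   // dummy values
        return x >= 0.0f;
    }

    struct query_point_result        // mirrors the idea of mesh_query_point_t
    {
        bool result;
        float sign;
        int face;
        float u, v;
    };

    // wrapper: forward the call and collect the outputs into one struct
    query_point_result query_point(float x)
    {
        query_point_result q;
        q.result = query_point_legacy(x, q.sign, q.face, q.u, q.v);
        return q;
    }

    int main()
    {
        query_point_result q = query_point(1.0f);
        printf("hit=%d face=%d u=%g v=%g\n", q.result, q.face, q.u, q.v);
        return 0;
    }
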
warp/native/quat.h CHANGED
@@ -225,12 +225,24 @@ inline CUDA_CALLABLE quat_t<Type> div(quat_t<Type> q, Type s)
  return quat_t<Type>(q.x/s, q.y/s, q.z/s, q.w/s);
  }

+ template<typename Type>
+ inline CUDA_CALLABLE quat_t<Type> div(Type s, quat_t<Type> q)
+ {
+ return quat_t<Type>(s/q.x, s/q.y, s/q.z, s/q.w);
+ }
+
  template<typename Type>
  inline CUDA_CALLABLE quat_t<Type> operator / (quat_t<Type> a, Type s)
  {
  return div(a,s);
  }

+ template<typename Type>
+ inline CUDA_CALLABLE quat_t<Type> operator / (Type s, quat_t<Type> a)
+ {
+ return div(s,a);
+ }
+
  template<typename Type>
  inline CUDA_CALLABLE quat_t<Type> operator*(Type s, const quat_t<Type>& a)
  {
@@ -523,9 +535,14 @@ inline CUDA_CALLABLE void tensordot(const quat_t<Type>& a, const quat_t<Type>& b
  }

  template<typename Type>
- inline CUDA_CALLABLE void adj_length(const quat_t<Type>& a, quat_t<Type>& adj_a, const Type adj_ret)
+ inline CUDA_CALLABLE void adj_length(const quat_t<Type>& a, Type ret, quat_t<Type>& adj_a, const Type adj_ret)
  {
- adj_a += normalize(a)*adj_ret;
+ if (ret > Type(kEps))
+ {
+ Type inv_l = Type(1)/ret;
+
+ adj_a += quat_t<Type>(a.x*inv_l, a.y*inv_l, a.z*inv_l, a.w*inv_l) * adj_ret;
+ }
  }

  template<typename Type>
@@ -608,6 +625,13 @@ inline CUDA_CALLABLE void adj_div(quat_t<Type> a, Type s, quat_t<Type>& adj_a, T
  adj_a += adj_ret / s;
  }

+ template<typename Type>
+ inline CUDA_CALLABLE void adj_div(Type s, quat_t<Type> a, Type& adj_s, quat_t<Type>& adj_a, const quat_t<Type>& adj_ret)
+ {
+ adj_s -= dot(a, adj_ret)/ (s * s); // - a / s^2
+ adj_a += s / adj_ret;
+ }
+
  template<typename Type>
  inline CUDA_CALLABLE void adj_quat_rotate(const quat_t<Type>& q, const vec_t<3,Type>& p, quat_t<Type>& adj_q, vec_t<3,Type>& adj_p, const vec_t<3,Type>& adj_ret)
  {
@@ -677,7 +701,7 @@ inline CUDA_CALLABLE void adj_quat_rotate_inv(const quat_t<Type>& q, const vec_t
  }

  template<typename Type>
- inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Type>& q1, Type t, quat_t<Type>& adj_q0, quat_t<Type>& adj_q1, Type& adj_t, const quat_t<Type>& adj_ret)
+ inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Type>& q1, Type t, quat_t<Type>& ret, quat_t<Type>& adj_q0, quat_t<Type>& adj_q1, Type& adj_t, const quat_t<Type>& adj_ret)
  {
  vec_t<3,Type> axis;
  Type angle;
@@ -688,7 +712,7 @@ inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Ty
  angle = angle * 0.5;

  // adj_t
- adj_t += dot(mul(quat_slerp(q0, q1, t), quat_t<Type>(angle*axis[0], angle*axis[1], angle*axis[2], Type(0))), adj_ret);
+ adj_t += dot(mul(ret, quat_t<Type>(angle*axis[0], angle*axis[1], angle*axis[2], Type(0))), adj_ret);

  // adj_q0
  quat_t<Type> q_inc_x_q0;
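
Note: adj_length now receives the cached length from the forward pass and skips the update when it is at or below kEps, avoiding a division by zero at q = 0. Below is a standalone C++ sketch of that guarded adjoint for a generic 4-component value; kEpsSketch is a stand-in tolerance chosen here, not Warp's kEps constant.

    #include <cmath>
    #include <cstdio>

    const float kEpsSketch = 1e-6f;   // stand-in tolerance, not Warp's kEps

    float length4(const float q[4])
    {
        return std::sqrt(q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3]);
    }

    // d|q|/dq = q / |q|, reusing the cached forward result and guarded near zero
    void adj_length4(const float q[4], float ret, float adj_q[4], float adj_ret)
    {
        if (ret > kEpsSketch)
        {
            float inv_l = 1.0f / ret;
            for (int i = 0; i < 4; ++i)
                adj_q[i] += q[i] * inv_l * adj_ret;
        }
    }

    int main()
    {
        float q[4] = {0.f, 3.f, 0.f, 4.f};
        float ret = length4(q);                    // 5
        float adj_q[4] = {0.f, 0.f, 0.f, 0.f};
        adj_length4(q, ret, adj_q, 1.0f);          // gradient = q / |q|
        printf("|q|=%g grad=(%g, %g, %g, %g)\n", ret, adj_q[0], adj_q[1], adj_q[2], adj_q[3]);
        return 0;
    }
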
warp/native/vec.h CHANGED
@@ -284,12 +284,41 @@ inline CUDA_CALLABLE vec_t<2, Type> div(vec_t<2, Type> a, Type s)
  return vec_t<2, Type>(a.c[0]/s,a.c[1]/s);
  }

+ template<unsigned Length, typename Type>
+ inline CUDA_CALLABLE vec_t<Length, Type> div(Type s, vec_t<Length, Type> a)
+ {
+ vec_t<Length, Type> ret;
+ for (unsigned i=0; i < Length; ++i)
+ {
+ ret[i] = s / a[i];
+ }
+ return ret;
+ }
+
+ template<typename Type>
+ inline CUDA_CALLABLE vec_t<3, Type> div(Type s, vec_t<3, Type> a)
+ {
+ return vec_t<3, Type>(s/a.c[0],s/a.c[1],s/a.c[2]);
+ }
+
+ template<typename Type>
+ inline CUDA_CALLABLE vec_t<2, Type> div(Type s, vec_t<2, Type> a)
+ {
+ return vec_t<2, Type>(s/a.c[0],s/a.c[1]);
+ }
+
  template<unsigned Length, typename Type>
  inline CUDA_CALLABLE vec_t<Length, Type> operator / (vec_t<Length, Type> a, Type s)
  {
  return div(a,s);
  }

+ template<unsigned Length, typename Type>
+ inline CUDA_CALLABLE vec_t<Length, Type> operator / (Type s, vec_t<Length, Type> a)
+ {
+ return div(s, a);
+ }
+
  // component wise division
  template<unsigned Length, typename Type>
  inline CUDA_CALLABLE vec_t<Length, Type> cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b)
@@ -735,9 +764,30 @@ inline CUDA_CALLABLE void adj_div(vec_t<Length, Type> a, Type s, vec_t<Length, T
  }

  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE void adj_cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b, vec_t<Length, Type>& adj_a, vec_t<Length, Type>& adj_b, const vec_t<Length, Type>& adj_ret) {
+ inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
+ {
+
+ adj_s -= dot(a , adj_ret)/ (s * s); // - a / s^2
+
+ for( unsigned i=0; i < Length; ++i )
+ {
+ adj_a[i] += s / adj_ret[i];
+ }
+
+ #if FP_CHECK
+ if (!isfinite(a) || !isfinite(s) || !isfinite(adj_a) || !isfinite(adj_s) || !isfinite(adj_ret))
+ {
+ // \TODO: How shall we implement this error message?
+ // printf("adj_div((%f %f %f %f), %f, (%f %f %f %f), %f, (%f %f %f %f)\n", a.x, a.y, a.z, a.w, s, adj_a.x, adj_a.y, adj_a.z, adj_a.w, adj_s, adj_ret.x, adj_ret.y, adj_ret.z, adj_ret.w);
+ assert(0);
+ }
+ #endif
+ }
+
+ template<unsigned Length, typename Type>
+ inline CUDA_CALLABLE void adj_cw_div(vec_t<Length, Type> a, vec_t<Length, Type> b, vec_t<Length, Type>& ret, vec_t<Length, Type>& adj_a, vec_t<Length, Type>& adj_b, const vec_t<Length, Type>& adj_ret) {
  adj_a += cw_div(adj_ret, b);
- adj_b -= cw_mul(adj_ret, cw_div(cw_div(a, b), b));
+ adj_b -= cw_mul(adj_ret, cw_div(ret, b));
  }

  template<unsigned Length, typename Type>
@@ -850,9 +900,12 @@ inline CUDA_CALLABLE void adj_extract(const vec_t<Length, Type> & a, int idx, ve
  }

  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE void adj_length(vec_t<Length, Type> a, vec_t<Length, Type>& adj_a, const Type adj_ret)
+ inline CUDA_CALLABLE void adj_length(vec_t<Length, Type> a, Type ret, vec_t<Length, Type>& adj_a, const Type adj_ret)
  {
- adj_a += normalize(a)*adj_ret;
+ if (ret > Type(kEps))
+ {
+ adj_a += div(a, ret) * adj_ret;
+ }

  #if FP_CHECK
  if (!isfinite(adj_a))
@@ -880,7 +933,7 @@ inline CUDA_CALLABLE void adj_length_sq(vec_t<Length, Type> a, vec_t<Length, Typ
  }

  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
+ inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Type>& ret, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
  {
  Type d = length(a);

@@ -888,9 +941,7 @@ inline CUDA_CALLABLE void adj_normalize(vec_t<Length, Type> a, vec_t<Length, Typ
  {
  Type invd = Type(1.0f)/d;

- vec_t<Length, Type> ahat = normalize(a);
-
- adj_a += (adj_ret*invd - ahat*(dot(ahat, adj_ret))*invd);
+ adj_a += (adj_ret*invd - ret*(dot(ret, adj_ret))*invd);

  #if FP_CHECK
  if (!isfinite(adj_a))
@@ -951,8 +1002,8 @@ inline CUDA_CALLABLE void adj_max(const vec_t<Length,Type> &v, vec_t<Length,Type

  // Do I need to specialize these for different lengths?
  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+ inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+ {
  vec_t<Length, Type> ret;
  for( unsigned i=0; i < Length; ++i )
  {
@@ -963,8 +1014,8 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_add(vec_t<Length, Type> * addr,
  }

  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+ inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+ {
  vec_t<Length, Type> ret;
  for( unsigned i=0; i < Length; ++i )
  {
@@ -975,8 +1026,8 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_min(vec_t<Length, Type> * addr,
  }

  template<unsigned Length, typename Type>
- inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr, vec_t<Length, Type> value) {
-
+ inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr, vec_t<Length, Type> value)
+ {
  vec_t<Length, Type> ret;
  for( unsigned i=0; i < Length; ++i )
  {
@@ -986,6 +1037,17 @@ inline CUDA_CALLABLE vec_t<Length, Type> atomic_max(vec_t<Length, Type> * addr,
  return ret;
  }

+ template<unsigned Length, typename Type>
+ inline CUDA_CALLABLE void adj_atomic_minmax(
+ vec_t<Length,Type> *addr,
+ vec_t<Length,Type> *adj_addr,
+ const vec_t<Length,Type> &value,
+ vec_t<Length,Type> &adj_value)
+ {
+ for (unsigned i=0; i < Length; ++i)
+ adj_atomic_minmax(&(addr->c[i]), &(adj_addr->c[i]), value[i], adj_value[i]);
+ }
+
  // ok, the original implementation of this didn't take the absolute values.
  // I wouldn't consider this expected behavior. It looks like it's only
  // being used for bounding boxes at the moment, where this doesn't matter,
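
Note: adj_normalize now reuses the cached unit vector ret from the forward pass instead of calling normalize(a) again; for n = a/|a| the gradient is (adj_ret - n * dot(n, adj_ret)) / |a|. The standalone C++ sketch below evaluates that expression for a concrete 3-vector; it is illustrative only and does not use Warp's types.

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float a[3]       = {3.f, 0.f, 4.f};
        const float adj_ret[3] = {1.f, 0.f, 0.f};   // incoming gradient

        float d = std::sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2]);   // |a| = 5
        float n[3] = {a[0]/d, a[1]/d, a[2]/d};                    // cached forward result

        float dot_n_adj = n[0]*adj_ret[0] + n[1]*adj_ret[1] + n[2]*adj_ret[2];

        float adj_a[3];
        for (int i = 0; i < 3; ++i)
            adj_a[i] = (adj_ret[i] - n[i] * dot_n_adj) / d;       // project out the radial part

        printf("adj_a = (%g, %g, %g)\n", adj_a[0], adj_a[1], adj_a[2]);
        return 0;
    }
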
warp/native/warp.cu CHANGED
@@ -1143,12 +1143,7 @@ int cuda_toolkit_version()

  bool cuda_driver_is_initialized()
  {
- CUcontext ctx;
-
- // result can be: CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED
- CUresult result = cuCtxGetCurrent_f(&ctx);
-
- return result == CUDA_SUCCESS;
+ return is_cuda_driver_initialized();
  }

  int nvrtc_supported_arch_count()