warp-lang 1.9.0__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (350):
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +2220 -313
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1075 -0
  5. warp/_src/build.py +618 -0
  6. warp/_src/build_dll.py +640 -0
  7. warp/{builtins.py → _src/builtins.py} +1497 -226
  8. warp/_src/codegen.py +4359 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +57 -0
  11. warp/_src/context.py +8294 -0
  12. warp/_src/dlpack.py +462 -0
  13. warp/_src/fabric.py +355 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +508 -0
  16. warp/_src/fem/cache.py +687 -0
  17. warp/_src/fem/dirichlet.py +188 -0
  18. warp/{fem → _src/fem}/domain.py +40 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +701 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +30 -15
  22. warp/{fem → _src/fem}/field/restriction.py +1 -1
  23. warp/{fem → _src/fem}/field/virtual.py +53 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
  26. warp/_src/fem/geometry/closest_point.py +97 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
  28. warp/{fem → _src/fem}/geometry/element.py +32 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +48 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
  34. warp/{fem → _src/fem}/geometry/partition.py +121 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
  38. warp/{fem → _src/fem}/integrate.py +164 -158
  39. warp/_src/fem/linalg.py +383 -0
  40. warp/_src/fem/operator.py +396 -0
  41. warp/_src/fem/polynomial.py +229 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
  46. warp/_src/fem/space/basis_space.py +679 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
  48. warp/{fem → _src/fem}/space/function_space.py +14 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
  53. warp/{fem → _src/fem}/space/partition.py +117 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
  55. warp/{fem → _src/fem}/space/restriction.py +66 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
  63. warp/_src/fem/space/topology.py +459 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
  65. warp/_src/fem/types.py +112 -0
  66. warp/_src/fem/utils.py +486 -0
  67. warp/_src/jax.py +186 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +387 -0
  70. warp/_src/jax_experimental/ffi.py +1284 -0
  71. warp/_src/jax_experimental/xla_ffi.py +656 -0
  72. warp/_src/marching_cubes.py +708 -0
  73. warp/_src/math.py +414 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +163 -0
  76. warp/_src/optim/linear.py +1606 -0
  77. warp/_src/optim/sgd.py +112 -0
  78. warp/_src/paddle.py +406 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +289 -0
  81. warp/_src/render/render_opengl.py +3636 -0
  82. warp/_src/render/render_usd.py +937 -0
  83. warp/_src/render/utils.py +160 -0
  84. warp/_src/sparse.py +2716 -0
  85. warp/_src/tape.py +1206 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +391 -0
  88. warp/_src/types.py +5870 -0
  89. warp/_src/utils.py +1693 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.dll +0 -0
  92. warp/bin/warp.dll +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -471
  95. warp/codegen.py +6 -4246
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -7851
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +3 -2
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -342
  136. warp/jax_experimental/ffi.py +17 -853
  137. warp/jax_experimental/xla_ffi.py +5 -596
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +316 -39
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sort.cu +22 -13
  159. warp/native/sort.h +2 -0
  160. warp/native/sparse.cu +7 -3
  161. warp/native/spatial.h +12 -0
  162. warp/native/tile.h +837 -70
  163. warp/native/tile_radix_sort.h +1 -1
  164. warp/native/tile_reduce.h +394 -46
  165. warp/native/tile_scan.h +4 -4
  166. warp/native/vec.h +469 -53
  167. warp/native/version.h +23 -0
  168. warp/native/volume.cpp +1 -1
  169. warp/native/volume.cu +1 -0
  170. warp/native/volume.h +1 -1
  171. warp/native/volume_builder.cu +2 -0
  172. warp/native/warp.cpp +60 -32
  173. warp/native/warp.cu +313 -201
  174. warp/native/warp.h +14 -11
  175. warp/optim/__init__.py +6 -3
  176. warp/optim/adam.py +6 -145
  177. warp/optim/linear.py +14 -1585
  178. warp/optim/sgd.py +6 -94
  179. warp/paddle.py +6 -388
  180. warp/render/__init__.py +8 -4
  181. warp/render/imgui_manager.py +7 -267
  182. warp/render/render_opengl.py +6 -3616
  183. warp/render/render_usd.py +6 -918
  184. warp/render/utils.py +6 -142
  185. warp/sparse.py +37 -2563
  186. warp/tape.py +6 -1188
  187. warp/tests/__main__.py +1 -1
  188. warp/tests/cuda/test_async.py +4 -4
  189. warp/tests/cuda/test_conditional_captures.py +1 -1
  190. warp/tests/cuda/test_multigpu.py +1 -1
  191. warp/tests/cuda/test_streams.py +58 -1
  192. warp/tests/geometry/test_bvh.py +157 -22
  193. warp/tests/geometry/test_hash_grid.py +38 -0
  194. warp/tests/geometry/test_marching_cubes.py +0 -1
  195. warp/tests/geometry/test_mesh.py +5 -3
  196. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  197. warp/tests/geometry/test_mesh_query_point.py +5 -2
  198. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  199. warp/tests/geometry/test_volume_write.py +5 -5
  200. warp/tests/interop/test_dlpack.py +14 -14
  201. warp/tests/interop/test_jax.py +1382 -79
  202. warp/tests/interop/test_paddle.py +1 -1
  203. warp/tests/test_adam.py +0 -1
  204. warp/tests/test_arithmetic.py +9 -9
  205. warp/tests/test_array.py +529 -100
  206. warp/tests/test_array_reduce.py +3 -3
  207. warp/tests/test_atomic.py +12 -8
  208. warp/tests/test_atomic_bitwise.py +209 -0
  209. warp/tests/test_atomic_cas.py +4 -4
  210. warp/tests/test_bool.py +2 -2
  211. warp/tests/test_builtins_resolution.py +5 -571
  212. warp/tests/test_codegen.py +34 -15
  213. warp/tests/test_conditional.py +1 -1
  214. warp/tests/test_context.py +6 -6
  215. warp/tests/test_copy.py +242 -161
  216. warp/tests/test_ctypes.py +3 -3
  217. warp/tests/test_devices.py +24 -2
  218. warp/tests/test_examples.py +16 -84
  219. warp/tests/test_fabricarray.py +35 -35
  220. warp/tests/test_fast_math.py +0 -2
  221. warp/tests/test_fem.py +60 -14
  222. warp/tests/test_fixedarray.py +3 -3
  223. warp/tests/test_func.py +8 -5
  224. warp/tests/test_generics.py +1 -1
  225. warp/tests/test_indexedarray.py +24 -24
  226. warp/tests/test_intersect.py +39 -9
  227. warp/tests/test_large.py +1 -1
  228. warp/tests/test_lerp.py +3 -1
  229. warp/tests/test_linear_solvers.py +1 -1
  230. warp/tests/test_map.py +49 -4
  231. warp/tests/test_mat.py +52 -62
  232. warp/tests/test_mat_constructors.py +4 -5
  233. warp/tests/test_mat_lite.py +1 -1
  234. warp/tests/test_mat_scalar_ops.py +121 -121
  235. warp/tests/test_math.py +34 -0
  236. warp/tests/test_module_aot.py +4 -4
  237. warp/tests/test_modules_lite.py +28 -2
  238. warp/tests/test_print.py +11 -11
  239. warp/tests/test_quat.py +93 -58
  240. warp/tests/test_runlength_encode.py +1 -1
  241. warp/tests/test_scalar_ops.py +38 -10
  242. warp/tests/test_smoothstep.py +1 -1
  243. warp/tests/test_sparse.py +126 -15
  244. warp/tests/test_spatial.py +105 -87
  245. warp/tests/test_special_values.py +6 -6
  246. warp/tests/test_static.py +7 -7
  247. warp/tests/test_struct.py +13 -2
  248. warp/tests/test_triangle_closest_point.py +48 -1
  249. warp/tests/test_tuple.py +96 -0
  250. warp/tests/test_types.py +82 -9
  251. warp/tests/test_utils.py +52 -52
  252. warp/tests/test_vec.py +29 -29
  253. warp/tests/test_vec_constructors.py +5 -5
  254. warp/tests/test_vec_scalar_ops.py +97 -97
  255. warp/tests/test_version.py +75 -0
  256. warp/tests/tile/test_tile.py +239 -0
  257. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  258. warp/tests/tile/test_tile_cholesky.py +7 -4
  259. warp/tests/tile/test_tile_load.py +26 -2
  260. warp/tests/tile/test_tile_mathdx.py +3 -3
  261. warp/tests/tile/test_tile_matmul.py +1 -1
  262. warp/tests/tile/test_tile_mlp.py +2 -4
  263. warp/tests/tile/test_tile_reduce.py +214 -13
  264. warp/tests/unittest_suites.py +6 -14
  265. warp/tests/unittest_utils.py +10 -9
  266. warp/tests/walkthrough_debug.py +3 -1
  267. warp/torch.py +6 -373
  268. warp/types.py +29 -5750
  269. warp/utils.py +10 -1659
  270. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +47 -103
  271. warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
  272. warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  273. warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  274. warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  275. warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  276. warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  277. warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  278. warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  279. warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  280. warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  281. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  282. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  283. warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  284. warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  285. warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  286. warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  287. warp/examples/assets/cartpole.urdf +0 -110
  288. warp/examples/assets/crazyflie.usd +0 -0
  289. warp/examples/assets/nv_ant.xml +0 -92
  290. warp/examples/assets/nv_humanoid.xml +0 -183
  291. warp/examples/assets/quadruped.urdf +0 -268
  292. warp/examples/optim/example_bounce.py +0 -266
  293. warp/examples/optim/example_cloth_throw.py +0 -228
  294. warp/examples/optim/example_drone.py +0 -870
  295. warp/examples/optim/example_inverse_kinematics.py +0 -182
  296. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  297. warp/examples/optim/example_softbody_properties.py +0 -400
  298. warp/examples/optim/example_spring_cage.py +0 -245
  299. warp/examples/optim/example_trajectory.py +0 -227
  300. warp/examples/sim/example_cartpole.py +0 -143
  301. warp/examples/sim/example_cloth.py +0 -225
  302. warp/examples/sim/example_cloth_self_contact.py +0 -316
  303. warp/examples/sim/example_granular.py +0 -130
  304. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  305. warp/examples/sim/example_jacobian_ik.py +0 -244
  306. warp/examples/sim/example_particle_chain.py +0 -124
  307. warp/examples/sim/example_quadruped.py +0 -203
  308. warp/examples/sim/example_rigid_chain.py +0 -203
  309. warp/examples/sim/example_rigid_contact.py +0 -195
  310. warp/examples/sim/example_rigid_force.py +0 -133
  311. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  312. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  313. warp/examples/sim/example_soft_body.py +0 -196
  314. warp/examples/tile/example_tile_walker.py +0 -327
  315. warp/sim/__init__.py +0 -74
  316. warp/sim/articulation.py +0 -793
  317. warp/sim/collide.py +0 -2570
  318. warp/sim/graph_coloring.py +0 -307
  319. warp/sim/import_mjcf.py +0 -791
  320. warp/sim/import_snu.py +0 -227
  321. warp/sim/import_urdf.py +0 -579
  322. warp/sim/import_usd.py +0 -898
  323. warp/sim/inertia.py +0 -357
  324. warp/sim/integrator.py +0 -245
  325. warp/sim/integrator_euler.py +0 -2000
  326. warp/sim/integrator_featherstone.py +0 -2101
  327. warp/sim/integrator_vbd.py +0 -2487
  328. warp/sim/integrator_xpbd.py +0 -3295
  329. warp/sim/model.py +0 -4821
  330. warp/sim/particles.py +0 -121
  331. warp/sim/render.py +0 -431
  332. warp/sim/utils.py +0 -431
  333. warp/tests/sim/disabled_kinematics.py +0 -244
  334. warp/tests/sim/test_cloth.py +0 -863
  335. warp/tests/sim/test_collision.py +0 -743
  336. warp/tests/sim/test_coloring.py +0 -347
  337. warp/tests/sim/test_inertia.py +0 -161
  338. warp/tests/sim/test_model.py +0 -226
  339. warp/tests/sim/test_sim_grad.py +0 -287
  340. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  341. warp/tests/sim/test_sim_kinematics.py +0 -98
  342. warp/thirdparty/__init__.py +0 -0
  343. warp_lang-1.9.0.dist-info/RECORD +0 -456
  344. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  345. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  346. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  347. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  348. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
  349. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  350. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/native/mat.h CHANGED
@@ -149,10 +149,6 @@ struct mat_t
149
149
  data[3][3] = m33;
150
150
  }
151
151
 
152
- // implemented in quat.h
153
- inline CUDA_CALLABLE mat_t(const vec_t<3,Type>& pos, const quat_t<Type>& rot, const vec_t<3,Type>& scale);
154
-
155
-
156
152
  inline CUDA_CALLABLE mat_t(const initializer_array<Rows * Cols, Type> &l)
157
153
  {
158
154
  for (unsigned i=0; i < Rows; ++i)
@@ -207,6 +203,17 @@ struct mat_t
207
203
  Type data[Rows < 1 ? 1 : Rows][Cols < 1 ? 1 : Cols];
208
204
  };
209
205
 
206
+ // Type trait to detect if a type is a mat_t
207
+ template<typename T>
208
+ struct is_matrix {
209
+ static constexpr bool value = false;
210
+ };
211
+
212
+ template<unsigned Rows, unsigned Cols, typename Type>
213
+ struct is_matrix<mat_t<Rows, Cols, Type>> {
214
+ static constexpr bool value = true;
215
+ };
216
+
210
217
  template<typename Type>
211
218
  inline CUDA_CALLABLE mat_t<2, 2, Type> matrix_from_cols(vec_t<2, Type> c0, vec_t<2, Type> c1)
212
219
  {
@@ -437,6 +444,42 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_add(mat_t<Rows,Cols,Type> * ad
437
444
  return m;
438
445
  }
439
446
 
447
+ template<unsigned Rows, unsigned Cols, typename Type>
448
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_and(mat_t<Rows,Cols,Type> * addr, mat_t<Rows,Cols,Type> value)
449
+ {
450
+ mat_t<Rows,Cols,Type> m;
451
+
452
+ for (unsigned i=0; i < Rows; ++i)
453
+ for (unsigned j=0; j < Cols; ++j)
454
+ m.data[i][j] = atomic_and(&addr->data[i][j], value.data[i][j]);
455
+
456
+ return m;
457
+ }
458
+
459
+ template<unsigned Rows, unsigned Cols, typename Type>
460
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_or(mat_t<Rows,Cols,Type> * addr, mat_t<Rows,Cols,Type> value)
461
+ {
462
+ mat_t<Rows,Cols,Type> m;
463
+
464
+ for (unsigned i=0; i < Rows; ++i)
465
+ for (unsigned j=0; j < Cols; ++j)
466
+ m.data[i][j] = atomic_or(&addr->data[i][j], value.data[i][j]);
467
+
468
+ return m;
469
+ }
470
+
471
+ template<unsigned Rows, unsigned Cols, typename Type>
472
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_xor(mat_t<Rows,Cols,Type> * addr, mat_t<Rows,Cols,Type> value)
473
+ {
474
+ mat_t<Rows,Cols,Type> m;
475
+
476
+ for (unsigned i=0; i < Rows; ++i)
477
+ for (unsigned j=0; j < Cols; ++j)
478
+ m.data[i][j] = atomic_xor(&addr->data[i][j], value.data[i][j]);
479
+
480
+ return m;
481
+ }
482
+
440
483
  template<unsigned Rows, unsigned Cols, typename Type>
441
484
  inline CUDA_CALLABLE mat_t<Rows,Cols,Type> atomic_min(mat_t<Rows,Cols,Type> * addr, mat_t<Rows,Cols,Type> value)
442
485
  {
@@ -1619,7 +1662,7 @@ inline CUDA_CALLABLE void adj_sub_inplace(
1619
1662
 
1620
1663
 
1621
1664
  template<unsigned Rows, unsigned Cols, typename Type>
1622
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1665
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1623
1666
  {
1624
1667
  #ifndef NDEBUG
1625
1668
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1643,12 +1686,12 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int
1643
1686
  col += Cols;
1644
1687
  }
1645
1688
 
1646
- m.data[row][col] = value;
1689
+ m.data[row][col] &= value;
1647
1690
  }
1648
1691
 
1649
1692
 
1650
1693
  template<unsigned Rows, unsigned Cols, typename Type>
1651
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1694
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1652
1695
  {
1653
1696
  #ifndef NDEBUG
1654
1697
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1665,13 +1708,13 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_
1665
1708
 
1666
1709
  for(unsigned i=0; i < Cols; ++i)
1667
1710
  {
1668
- m.data[row][i] = value[i];
1711
+ m.data[row][i] &= value[i];
1669
1712
  }
1670
1713
  }
1671
1714
 
1672
1715
 
1673
1716
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1674
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1717
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1675
1718
  {
1676
1719
  static_assert(
1677
1720
  RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
@@ -1694,7 +1737,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1694
1737
  {
1695
1738
  for (int j = 0; j < Cols; ++j)
1696
1739
  {
1697
- m.data[i][j] = value.data[ii][j];
1740
+ m.data[i][j] &= value.data[ii][j];
1698
1741
  }
1699
1742
 
1700
1743
  ++ii;
@@ -1705,7 +1748,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1705
1748
 
1706
1749
 
1707
1750
  template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1708
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1751
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1709
1752
  {
1710
1753
  #ifndef NDEBUG
1711
1754
  if (col < -(int)Cols || col >= (int)Cols)
@@ -1734,7 +1777,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1734
1777
  i += row_slice.step
1735
1778
  )
1736
1779
  {
1737
- m.data[i][col] = value.c[ii];
1780
+ m.data[i][col] &= value.c[ii];
1738
1781
  ++ii;
1739
1782
  }
1740
1783
 
@@ -1743,7 +1786,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1743
1786
 
1744
1787
 
1745
1788
  template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1746
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1789
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1747
1790
  {
1748
1791
  #ifndef NDEBUG
1749
1792
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1772,7 +1815,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slic
1772
1815
  i += col_slice.step
1773
1816
  )
1774
1817
  {
1775
- m.data[row][i] = value.c[ii];
1818
+ m.data[row][i] &= value.c[ii];
1776
1819
  ++ii;
1777
1820
  }
1778
1821
 
@@ -1781,7 +1824,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slic
1781
1824
 
1782
1825
 
1783
1826
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1784
- inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1827
+ inline CUDA_CALLABLE void bit_and_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1785
1828
  {
1786
1829
  assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
1787
1830
  assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
@@ -1810,7 +1853,7 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1810
1853
  j += col_slice.step
1811
1854
  )
1812
1855
  {
1813
- m.data[i][j] = value.data[ii][jj];
1856
+ m.data[i][j] &= value.data[ii][jj];
1814
1857
  ++jj;
1815
1858
  }
1816
1859
 
@@ -1823,8 +1866,50 @@ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_s
1823
1866
 
1824
1867
 
1825
1868
  template<unsigned Rows, unsigned Cols, typename Type>
1826
- inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1827
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
1869
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1870
+ mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
1871
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value
1872
+ ) {}
1873
+
1874
+
1875
+ template<unsigned Rows, unsigned Cols, typename Type>
1876
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1877
+ mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1878
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value
1879
+ ) {}
1880
+
1881
+
1882
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1883
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1884
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1885
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1886
+ ) {}
1887
+
1888
+
1889
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1890
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1891
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1892
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1893
+ ) {}
1894
+
1895
+
1896
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1897
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1898
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1899
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1900
+ ) {}
1901
+
1902
+
1903
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1904
+ inline CUDA_CALLABLE void adj_bit_and_inplace(
1905
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1906
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1907
+ ) {}
1908
+
1909
+
1910
+
1911
+ template<unsigned Rows, unsigned Cols, typename Type>
1912
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
1828
1913
  {
1829
1914
  #ifndef NDEBUG
1830
1915
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1848,13 +1933,12 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
1848
1933
  col += Cols;
1849
1934
  }
1850
1935
 
1851
- adj_value += adj_m.data[row][col];
1936
+ m.data[row][col] |= value;
1852
1937
  }
1853
1938
 
1854
1939
 
1855
1940
  template<unsigned Rows, unsigned Cols, typename Type>
1856
- inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
1857
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value)
1941
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
1858
1942
  {
1859
1943
  #ifndef NDEBUG
1860
1944
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1871,16 +1955,13 @@ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row,
1871
1955
 
1872
1956
  for(unsigned i=0; i < Cols; ++i)
1873
1957
  {
1874
- adj_value[i] += adj_m.data[row][i];
1958
+ m.data[row][i] |= value[i];
1875
1959
  }
1876
1960
  }
1877
1961
 
1878
1962
 
1879
1963
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1880
- inline CUDA_CALLABLE void adj_assign_inplace(
1881
- mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
1882
- mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
1883
- )
1964
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
1884
1965
  {
1885
1966
  static_assert(
1886
1967
  RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
@@ -1903,7 +1984,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1903
1984
  {
1904
1985
  for (int j = 0; j < Cols; ++j)
1905
1986
  {
1906
- adj_value.data[ii][j] += adj_m.data[i][j];
1987
+ m.data[i][j] |= value.data[ii][j];
1907
1988
  }
1908
1989
 
1909
1990
  ++ii;
@@ -1914,10 +1995,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1914
1995
 
1915
1996
 
1916
1997
  template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
1917
- inline CUDA_CALLABLE void adj_assign_inplace(
1918
- mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
1919
- mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
1920
- )
1998
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
1921
1999
  {
1922
2000
  #ifndef NDEBUG
1923
2001
  if (col < -(int)Cols || col >= (int)Cols)
@@ -1946,7 +2024,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1946
2024
  i += row_slice.step
1947
2025
  )
1948
2026
  {
1949
- adj_value.c[ii] += adj_m.data[i][col];
2027
+ m.data[i][col] |= value.c[ii];
1950
2028
  ++ii;
1951
2029
  }
1952
2030
 
@@ -1955,10 +2033,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1955
2033
 
1956
2034
 
1957
2035
  template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1958
- inline CUDA_CALLABLE void adj_assign_inplace(
1959
- mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
1960
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
1961
- )
2036
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
1962
2037
  {
1963
2038
  #ifndef NDEBUG
1964
2039
  if (row < -(int)Rows || row >= (int)Rows)
@@ -1987,7 +2062,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1987
2062
  i += col_slice.step
1988
2063
  )
1989
2064
  {
1990
- adj_value.c[ii] += adj_m.data[row][i];
2065
+ m.data[row][i] |= value.c[ii];
1991
2066
  ++ii;
1992
2067
  }
1993
2068
 
@@ -1996,10 +2071,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
1996
2071
 
1997
2072
 
1998
2073
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
1999
- inline CUDA_CALLABLE void adj_assign_inplace(
2000
- mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2001
- mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2002
- )
2074
+ inline CUDA_CALLABLE void bit_or_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2003
2075
  {
2004
2076
  assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2005
2077
  assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
@@ -2028,7 +2100,7 @@ inline CUDA_CALLABLE void adj_assign_inplace(
2028
2100
  j += col_slice.step
2029
2101
  )
2030
2102
  {
2031
- adj_value.data[ii][jj] += adj_m.data[i][j];
2103
+ m.data[i][j] |= value.data[ii][jj];
2032
2104
  ++jj;
2033
2105
  }
2034
2106
 
@@ -2041,100 +2113,50 @@ inline CUDA_CALLABLE void adj_assign_inplace(
2041
2113
 
2042
2114
 
2043
2115
  template<unsigned Rows, unsigned Cols, typename Type>
2044
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
2045
- {
2046
- #ifndef NDEBUG
2047
- if (row < -(int)Rows || row >= (int)Rows)
2048
- {
2049
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2050
- assert(0);
2051
- }
2052
- if (col < -(int)Cols || col >= (int)Cols)
2053
- {
2054
- printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2055
- assert(0);
2056
- }
2057
- #endif
2058
-
2059
- if (row < 0)
2060
- {
2061
- row += Rows;
2062
- }
2063
- if (col < 0)
2064
- {
2065
- col += Cols;
2066
- }
2067
-
2068
- mat_t<Rows,Cols,Type> ret(m);
2069
- ret.data[row][col] = value;
2070
- return ret;
2071
- }
2116
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2117
+ mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
2118
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value
2119
+ ) {}
2072
2120
 
2073
2121
 
2074
2122
  template<unsigned Rows, unsigned Cols, typename Type>
2075
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
2076
- {
2077
- #ifndef NDEBUG
2078
- if (row < -(int)Rows || row >= (int)Rows)
2079
- {
2080
- printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2081
- assert(0);
2082
- }
2083
- #endif
2084
-
2085
- if (row < 0)
2086
- {
2087
- row += Rows;
2088
- }
2089
-
2090
- mat_t<Rows,Cols,Type> ret(m);
2091
- for(unsigned i=0; i < Cols; ++i)
2092
- {
2093
- ret.data[row][i] = value[i];
2094
- }
2095
- return ret;
2096
- }
2123
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2124
+ mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
2125
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value
2126
+ ) {}
2097
2127
 
2098
2128
 
2099
2129
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2100
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2101
- {
2102
- mat_t<Rows, Cols, Type> ret(m);
2103
- assign_inplace(ret, row_slice, value);
2104
- return ret;
2105
- }
2130
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2131
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2132
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2133
+ ) {}
2106
2134
 
2107
2135
 
2108
2136
  template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2109
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
2110
- {
2111
- mat_t<Rows, Cols, Type> ret(m);
2112
- assign_inplace(ret, row_slice, col, value);
2113
- return ret;
2114
- }
2137
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2138
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
2139
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
2140
+ ) {}
2115
2141
 
2116
2142
 
2117
2143
  template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2118
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
2119
- {
2120
- mat_t<Rows, Cols, Type> ret(m);
2121
- assign_inplace(ret, row, col_slice, value);
2122
- return ret;
2123
- }
2144
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2145
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
2146
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
2147
+ ) {}
2124
2148
 
2125
2149
 
2126
2150
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2127
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2128
- {
2129
- mat_t<Rows, Cols, Type> ret(m);
2130
- assign_inplace(ret, row_slice, col_slice, value);
2131
- return ret;
2132
- }
2151
+ inline CUDA_CALLABLE void adj_bit_or_inplace(
2152
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2153
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2154
+ ) {}
2155
+
2133
2156
 
2134
2157
 
2135
2158
  template<unsigned Rows, unsigned Cols, typename Type>
2136
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
2137
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
2159
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
2138
2160
  {
2139
2161
  #ifndef NDEBUG
2140
2162
  if (row < -(int)Rows || row >= (int)Rows)
@@ -2158,21 +2180,12 @@ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int
2158
2180
  col += Cols;
2159
2181
  }
2160
2182
 
2161
- adj_value += adj_ret.data[row][col];
2162
- for(unsigned i=0; i < Rows; ++i)
2163
- {
2164
- for(unsigned j=0; j < Cols; ++j)
2165
- {
2166
- if(i != row || j != col)
2167
- adj_m.data[i][j] += adj_ret.data[i][j];
2168
- }
2169
- }
2183
+ m.data[row][col] ^= value;
2170
2184
  }
2171
2185
 
2172
2186
 
2173
2187
  template<unsigned Rows, unsigned Cols, typename Type>
2174
- inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
2175
- mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
2188
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
2176
2189
  {
2177
2190
  #ifndef NDEBUG
2178
2191
  if (row < -(int)Rows || row >= (int)Rows)
@@ -2187,25 +2200,15 @@ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec
2187
2200
  row += Rows;
2188
2201
  }
2189
2202
 
2190
- for(unsigned i=0; i < Rows; ++i)
2203
+ for(unsigned i=0; i < Cols; ++i)
2191
2204
  {
2192
- for(unsigned j=0; j < Cols; ++j)
2193
- {
2194
- if (i==row)
2195
- adj_value[j] += adj_ret.data[i][j];
2196
- else
2197
- adj_m.data[i][j] += adj_ret.data[i][j];
2198
- }
2205
+ m.data[row][i] ^= value[i];
2199
2206
  }
2200
2207
  }
2201
2208
 
2202
2209
 
2203
2210
  template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2204
- inline CUDA_CALLABLE void adj_assign_copy(
2205
- mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2206
- mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
2207
- mat_t<Rows,Cols,Type>& adj_ret
2208
- )
2211
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2209
2212
  {
2210
2213
  static_assert(
2211
2214
  RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
@@ -2220,28 +2223,18 @@ inline CUDA_CALLABLE void adj_assign_copy(
2220
2223
  bool is_row_reversed = row_slice.step < 0;
2221
2224
 
2222
2225
  int ii = 0;
2223
- for (int i = 0; i < Rows; ++i)
2226
+ for (
2227
+ int i = row_slice.start;
2228
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2229
+ i += row_slice.step
2230
+ )
2224
2231
  {
2225
- bool in_row_slice = is_row_reversed
2226
- ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
2227
- : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
2228
-
2229
- if (!in_row_slice)
2232
+ for (int j = 0; j < Cols; ++j)
2230
2233
  {
2231
- for (int j = 0; j < Cols; ++j)
2232
- {
2233
- adj_m.data[i][j] += adj_ret.data[i][j];
2234
- }
2234
+ m.data[i][j] ^= value.data[ii][j];
2235
2235
  }
2236
- else
2237
- {
2238
- for (int j = 0; j < Cols; ++j)
2239
- {
2240
- adj_value.data[ii][j] += adj_ret.data[i][j];
2241
- }
2242
2236
 
2243
- ++ii;
2244
- }
2237
+ ++ii;
2245
2238
  }
2246
2239
 
2247
2240
  assert(ii == RowSliceLength);
@@ -2249,11 +2242,7 @@ inline CUDA_CALLABLE void adj_assign_copy(
2249
2242
 
2250
2243
 
2251
2244
  template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2252
- inline CUDA_CALLABLE void adj_assign_copy(
2253
- mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
2254
- mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value,
2255
- mat_t<Rows,Cols,Type>& adj_ret
2256
- )
2245
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
2257
2246
  {
2258
2247
  #ifndef NDEBUG
2259
2248
  if (col < -(int)Cols || col >= (int)Cols)
@@ -2276,14 +2265,808 @@ inline CUDA_CALLABLE void adj_assign_copy(
2276
2265
  bool is_row_reversed = row_slice.step < 0;
2277
2266
 
2278
2267
  int ii = 0;
2279
- for (int i = 0; i < Rows; ++i)
2268
+ for (
2269
+ int i = row_slice.start;
2270
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2271
+ i += row_slice.step
2272
+ )
2280
2273
  {
2281
- bool in_row_slice = is_row_reversed
2282
- ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
2283
- : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
2274
+ m.data[i][col] ^= value.c[ii];
2275
+ ++ii;
2276
+ }
2284
2277
 
2285
- if (!in_row_slice)
2286
- {
2278
+ assert(ii == RowSliceLength);
2279
+ }
2280
+
2281
+
2282
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2283
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
2284
+ {
2285
+ #ifndef NDEBUG
2286
+ if (row < -(int)Rows || row >= (int)Rows)
2287
+ {
2288
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2289
+ assert(0);
2290
+ }
2291
+ #endif
2292
+
2293
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2294
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2295
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2296
+ assert(slice_get_length(col_slice) == ColSliceLength);
2297
+
2298
+ if (row < 0)
2299
+ {
2300
+ row += Rows;
2301
+ }
2302
+
2303
+ bool is_col_reversed = col_slice.step < 0;
2304
+
2305
+ int ii = 0;
2306
+ for (
2307
+ int i = col_slice.start;
2308
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
2309
+ i += col_slice.step
2310
+ )
2311
+ {
2312
+ m.data[row][i] ^= value.c[ii];
2313
+ ++ii;
2314
+ }
2315
+
2316
+ assert(ii == ColSliceLength);
2317
+ }
2318
+
2319
+
2320
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2321
+ inline CUDA_CALLABLE void bit_xor_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2322
+ {
2323
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2324
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2325
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2326
+ assert(slice_get_length(row_slice) == RowSliceLength);
2327
+
2328
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2329
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2330
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2331
+ assert(slice_get_length(col_slice) == ColSliceLength);
2332
+
2333
+ bool is_row_reversed = row_slice.step < 0;
2334
+ bool is_col_reversed = col_slice.step < 0;
2335
+
2336
+ int ii = 0;
2337
+ for (
2338
+ int i = row_slice.start;
2339
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2340
+ i += row_slice.step
2341
+ )
2342
+ {
2343
+ int jj = 0;
2344
+ for (
2345
+ int j = col_slice.start;
2346
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
2347
+ j += col_slice.step
2348
+ )
2349
+ {
2350
+ m.data[i][j] ^= value.data[ii][jj];
2351
+ ++jj;
2352
+ }
2353
+
2354
+ assert(jj == ColSliceLength);
2355
+ ++ii;
2356
+ }
2357
+
2358
+ assert(ii == RowSliceLength);
2359
+ }
2360
+
2361
+
2362
+ template<unsigned Rows, unsigned Cols, typename Type>
2363
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2364
+ mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
2365
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value
2366
+ ) {}
2367
+
2368
+
2369
+ template<unsigned Rows, unsigned Cols, typename Type>
2370
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2371
+ mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
2372
+ mat_t<Rows,Cols,Type>& adj_m, int adj_row, vec_t<Cols,Type>& adj_value
2373
+ ) {}
2374
+
2375
+
2376
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2377
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2378
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2379
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2380
+ ) {}
2381
+
2382
+
2383
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2384
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2385
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
2386
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
2387
+ ) {}
2388
+
2389
+
2390
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2391
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2392
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
2393
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
2394
+ ) {}
2395
+
2396
+
2397
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2398
+ inline CUDA_CALLABLE void adj_bit_xor_inplace(
2399
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2400
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2401
+ ) {}
2402
+
2403
+
2404
+ template<unsigned Rows, unsigned Cols, typename Type>
2405
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
2406
+ {
2407
+ #ifndef NDEBUG
2408
+ if (row < -(int)Rows || row >= (int)Rows)
2409
+ {
2410
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2411
+ assert(0);
2412
+ }
2413
+ if (col < -(int)Cols || col >= (int)Cols)
2414
+ {
2415
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2416
+ assert(0);
2417
+ }
2418
+ #endif
2419
+
2420
+ if (row < 0)
2421
+ {
2422
+ row += Rows;
2423
+ }
2424
+ if (col < 0)
2425
+ {
2426
+ col += Cols;
2427
+ }
2428
+
2429
+ m.data[row][col] = value;
2430
+ }
2431
+
2432
+
2433
+ template<unsigned Rows, unsigned Cols, typename Type>
2434
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
2435
+ {
2436
+ #ifndef NDEBUG
2437
+ if (row < -(int)Rows || row >= (int)Rows)
2438
+ {
2439
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2440
+ assert(0);
2441
+ }
2442
+ #endif
2443
+
2444
+ if (row < 0)
2445
+ {
2446
+ row += Rows;
2447
+ }
2448
+
2449
+ for(unsigned i=0; i < Cols; ++i)
2450
+ {
2451
+ m.data[row][i] = value[i];
2452
+ }
2453
+ }
2454
+
2455
+
2456
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2457
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2458
+ {
2459
+ static_assert(
2460
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
2461
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
2462
+ );
2463
+
2464
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2465
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2466
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2467
+ assert(slice_get_length(row_slice) == RowSliceLength);
2468
+
2469
+ bool is_row_reversed = row_slice.step < 0;
2470
+
2471
+ int ii = 0;
2472
+ for (
2473
+ int i = row_slice.start;
2474
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2475
+ i += row_slice.step
2476
+ )
2477
+ {
2478
+ for (int j = 0; j < Cols; ++j)
2479
+ {
2480
+ m.data[i][j] = value.data[ii][j];
2481
+ }
2482
+
2483
+ ++ii;
2484
+ }
2485
+
2486
+ assert(ii == RowSliceLength);
2487
+ }
2488
+
2489
+
2490
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2491
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
2492
+ {
2493
+ #ifndef NDEBUG
2494
+ if (col < -(int)Cols || col >= (int)Cols)
2495
+ {
2496
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2497
+ assert(0);
2498
+ }
2499
+ #endif
2500
+
2501
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2502
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2503
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2504
+ assert(slice_get_length(row_slice) == RowSliceLength);
2505
+
2506
+ if (col < 0)
2507
+ {
2508
+ col += Cols;
2509
+ }
2510
+
2511
+ bool is_row_reversed = row_slice.step < 0;
2512
+
2513
+ int ii = 0;
2514
+ for (
2515
+ int i = row_slice.start;
2516
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2517
+ i += row_slice.step
2518
+ )
2519
+ {
2520
+ m.data[i][col] = value.c[ii];
2521
+ ++ii;
2522
+ }
2523
+
2524
+ assert(ii == RowSliceLength);
2525
+ }
2526
+
2527
+
2528
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2529
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
2530
+ {
2531
+ #ifndef NDEBUG
2532
+ if (row < -(int)Rows || row >= (int)Rows)
2533
+ {
2534
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2535
+ assert(0);
2536
+ }
2537
+ #endif
2538
+
2539
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2540
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2541
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2542
+ assert(slice_get_length(col_slice) == ColSliceLength);
2543
+
2544
+ if (row < 0)
2545
+ {
2546
+ row += Rows;
2547
+ }
2548
+
2549
+ bool is_col_reversed = col_slice.step < 0;
2550
+
2551
+ int ii = 0;
2552
+ for (
2553
+ int i = col_slice.start;
2554
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
2555
+ i += col_slice.step
2556
+ )
2557
+ {
2558
+ m.data[row][i] = value.c[ii];
2559
+ ++ii;
2560
+ }
2561
+
2562
+ assert(ii == ColSliceLength);
2563
+ }
2564
+
2565
+
2566
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2567
+ inline CUDA_CALLABLE void assign_inplace(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2568
+ {
2569
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2570
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2571
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2572
+ assert(slice_get_length(row_slice) == RowSliceLength);
2573
+
2574
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2575
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2576
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2577
+ assert(slice_get_length(col_slice) == ColSliceLength);
2578
+
2579
+ bool is_row_reversed = row_slice.step < 0;
2580
+ bool is_col_reversed = col_slice.step < 0;
2581
+
2582
+ int ii = 0;
2583
+ for (
2584
+ int i = row_slice.start;
2585
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2586
+ i += row_slice.step
2587
+ )
2588
+ {
2589
+ int jj = 0;
2590
+ for (
2591
+ int j = col_slice.start;
2592
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
2593
+ j += col_slice.step
2594
+ )
2595
+ {
2596
+ m.data[i][j] = value.data[ii][jj];
2597
+ ++jj;
2598
+ }
2599
+
2600
+ assert(jj == ColSliceLength);
2601
+ ++ii;
2602
+ }
2603
+
2604
+ assert(ii == RowSliceLength);
2605
+ }
2606
+
2607
+
2608
+ template<unsigned Rows, unsigned Cols, typename Type>
2609
+ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
2610
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value)
2611
+ {
2612
+ #ifndef NDEBUG
2613
+ if (row < -(int)Rows || row >= (int)Rows)
2614
+ {
2615
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2616
+ assert(0);
2617
+ }
2618
+ if (col < -(int)Cols || col >= (int)Cols)
2619
+ {
2620
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2621
+ assert(0);
2622
+ }
2623
+ #endif
2624
+
2625
+ if (row < 0)
2626
+ {
2627
+ row += Rows;
2628
+ }
2629
+ if (col < 0)
2630
+ {
2631
+ col += Cols;
2632
+ }
2633
+
2634
+ adj_value += adj_m.data[row][col];
2635
+ }
2636
+
2637
+
2638
+ template<unsigned Rows, unsigned Cols, typename Type>
2639
+ inline CUDA_CALLABLE void adj_assign_inplace(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
2640
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value)
2641
+ {
2642
+ #ifndef NDEBUG
2643
+ if (row < -(int)Rows || row >= (int)Rows)
2644
+ {
2645
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2646
+ assert(0);
2647
+ }
2648
+ #endif
2649
+
2650
+ if (row < 0)
2651
+ {
2652
+ row += Rows;
2653
+ }
2654
+
2655
+ for(unsigned i=0; i < Cols; ++i)
2656
+ {
2657
+ adj_value[i] += adj_m.data[row][i];
2658
+ }
2659
+ }
2660
+
2661
+
2662
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2663
+ inline CUDA_CALLABLE void adj_assign_inplace(
2664
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2665
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2666
+ )
2667
+ {
2668
+ static_assert(
2669
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
2670
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
2671
+ );
2672
+
2673
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2674
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2675
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2676
+ assert(slice_get_length(row_slice) == RowSliceLength);
2677
+
2678
+ bool is_row_reversed = row_slice.step < 0;
2679
+
2680
+ int ii = 0;
2681
+ for (
2682
+ int i = row_slice.start;
2683
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2684
+ i += row_slice.step
2685
+ )
2686
+ {
2687
+ for (int j = 0; j < Cols; ++j)
2688
+ {
2689
+ adj_value.data[ii][j] += adj_m.data[i][j];
2690
+ }
2691
+
2692
+ ++ii;
2693
+ }
2694
+
2695
+ assert(ii == RowSliceLength);
2696
+ }
2697
+
2698
+
2699
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2700
+ inline CUDA_CALLABLE void adj_assign_inplace(
2701
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
2702
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value
2703
+ )
2704
+ {
2705
+ #ifndef NDEBUG
2706
+ if (col < -(int)Cols || col >= (int)Cols)
2707
+ {
2708
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2709
+ assert(0);
2710
+ }
2711
+ #endif
2712
+
2713
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2714
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2715
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2716
+ assert(slice_get_length(row_slice) == RowSliceLength);
2717
+
2718
+ if (col < 0)
2719
+ {
2720
+ col += Cols;
2721
+ }
2722
+
2723
+ bool is_row_reversed = row_slice.step < 0;
2724
+
2725
+ int ii = 0;
2726
+ for (
2727
+ int i = row_slice.start;
2728
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2729
+ i += row_slice.step
2730
+ )
2731
+ {
2732
+ adj_value.c[ii] += adj_m.data[i][col];
2733
+ ++ii;
2734
+ }
2735
+
2736
+ assert(ii == RowSliceLength);
2737
+ }
2738
+
2739
+
2740
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2741
+ inline CUDA_CALLABLE void adj_assign_inplace(
2742
+ mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value,
2743
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, slice_t& adj_col_slice, vec_t<ColSliceLength, Type>& adj_value
2744
+ )
2745
+ {
2746
+ #ifndef NDEBUG
2747
+ if (row < -(int)Rows || row >= (int)Rows)
2748
+ {
2749
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2750
+ assert(0);
2751
+ }
2752
+ #endif
2753
+
2754
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2755
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2756
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2757
+ assert(slice_get_length(col_slice) == ColSliceLength);
2758
+
2759
+ if (row < 0)
2760
+ {
2761
+ row += Rows;
2762
+ }
2763
+
2764
+ bool is_col_reversed = col_slice.step < 0;
2765
+
2766
+ int ii = 0;
2767
+ for (
2768
+ int i = col_slice.start;
2769
+ is_col_reversed ? (i > col_slice.stop) : (i < col_slice.stop);
2770
+ i += col_slice.step
2771
+ )
2772
+ {
2773
+ adj_value.c[ii] += adj_m.data[row][i];
2774
+ ++ii;
2775
+ }
2776
+
2777
+ assert(ii == ColSliceLength);
2778
+ }
2779
+
2780
+
2781
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2782
+ inline CUDA_CALLABLE void adj_assign_inplace(
2783
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2784
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, slice_t& adj_col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value
2785
+ )
2786
+ {
2787
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2788
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
2789
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
2790
+ assert(slice_get_length(row_slice) == RowSliceLength);
2791
+
2792
+ assert(col_slice.start >= 0 && col_slice.start <= (int)Cols);
2793
+ assert(col_slice.stop >= -1 && col_slice.stop <= (int)Cols);
2794
+ assert(col_slice.step != 0 && col_slice.step < 0 ? col_slice.start >= col_slice.stop : col_slice.start <= col_slice.stop);
2795
+ assert(slice_get_length(col_slice) == ColSliceLength);
2796
+
2797
+ bool is_row_reversed = row_slice.step < 0;
2798
+ bool is_col_reversed = col_slice.step < 0;
2799
+
2800
+ int ii = 0;
2801
+ for (
2802
+ int i = row_slice.start;
2803
+ is_row_reversed ? (i > row_slice.stop) : (i < row_slice.stop);
2804
+ i += row_slice.step
2805
+ )
2806
+ {
2807
+ int jj = 0;
2808
+ for (
2809
+ int j = col_slice.start;
2810
+ is_col_reversed ? (j > col_slice.stop) : (j < col_slice.stop);
2811
+ j += col_slice.step
2812
+ )
2813
+ {
2814
+ adj_value.data[ii][jj] += adj_m.data[i][j];
2815
+ ++jj;
2816
+ }
2817
+
2818
+ assert(jj == ColSliceLength);
2819
+ ++ii;
2820
+ }
2821
+
2822
+ assert(ii == RowSliceLength);
2823
+ }
2824
+
2825
+
2826
+ template<unsigned Rows, unsigned Cols, typename Type>
2827
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
2828
+ {
2829
+ #ifndef NDEBUG
2830
+ if (row < -(int)Rows || row >= (int)Rows)
2831
+ {
2832
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2833
+ assert(0);
2834
+ }
2835
+ if (col < -(int)Cols || col >= (int)Cols)
2836
+ {
2837
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2838
+ assert(0);
2839
+ }
2840
+ #endif
2841
+
2842
+ if (row < 0)
2843
+ {
2844
+ row += Rows;
2845
+ }
2846
+ if (col < 0)
2847
+ {
2848
+ col += Cols;
2849
+ }
2850
+
2851
+ mat_t<Rows,Cols,Type> ret(m);
2852
+ ret.data[row][col] = value;
2853
+ return ret;
2854
+ }
2855
+
2856
+
2857
+ template<unsigned Rows, unsigned Cols, typename Type>
2858
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value)
2859
+ {
2860
+ #ifndef NDEBUG
2861
+ if (row < -(int)Rows || row >= (int)Rows)
2862
+ {
2863
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2864
+ assert(0);
2865
+ }
2866
+ #endif
2867
+
2868
+ if (row < 0)
2869
+ {
2870
+ row += Rows;
2871
+ }
2872
+
2873
+ mat_t<Rows,Cols,Type> ret(m);
2874
+ for(unsigned i=0; i < Cols; ++i)
2875
+ {
2876
+ ret.data[row][i] = value[i];
2877
+ }
2878
+ return ret;
2879
+ }
2880
+
2881
+
2882
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2883
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2884
+ {
2885
+ mat_t<Rows, Cols, Type> ret(m);
2886
+ assign_inplace(ret, row_slice, value);
2887
+ return ret;
2888
+ }
2889
+
2890
+
2891
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
2892
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value)
2893
+ {
2894
+ mat_t<Rows, Cols, Type> ret(m);
2895
+ assign_inplace(ret, row_slice, col, value);
2896
+ return ret;
2897
+ }
2898
+
2899
+
2900
+ template<unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2901
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, int row, slice_t col_slice, vec_t<ColSliceLength, Type>& value)
2902
+ {
2903
+ mat_t<Rows, Cols, Type> ret(m);
2904
+ assign_inplace(ret, row, col_slice, value);
2905
+ return ret;
2906
+ }
2907
+
2908
+
2909
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2910
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign_copy(mat_t<Rows,Cols,Type>& m, slice_t row_slice, slice_t col_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value)
2911
+ {
2912
+ mat_t<Rows, Cols, Type> ret(m);
2913
+ assign_inplace(ret, row_slice, col_slice, value);
2914
+ return ret;
2915
+ }
2916
+
2917
+
2918
+ template<unsigned Rows, unsigned Cols, typename Type>
2919
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
2920
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, int& adj_col, Type& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
2921
+ {
2922
+ #ifndef NDEBUG
2923
+ if (row < -(int)Rows || row >= (int)Rows)
2924
+ {
2925
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2926
+ assert(0);
2927
+ }
2928
+ if (col < -(int)Cols || col >= (int)Cols)
2929
+ {
2930
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
2931
+ assert(0);
2932
+ }
2933
+ #endif
2934
+
2935
+ if (row < 0)
2936
+ {
2937
+ row += Rows;
2938
+ }
2939
+ if (col < 0)
2940
+ {
2941
+ col += Cols;
2942
+ }
2943
+
2944
+ adj_value += adj_ret.data[row][col];
2945
+ for(unsigned i=0; i < Rows; ++i)
2946
+ {
2947
+ for(unsigned j=0; j < Cols; ++j)
2948
+ {
2949
+ if(i != row || j != col)
2950
+ adj_m.data[i][j] += adj_ret.data[i][j];
2951
+ }
2952
+ }
2953
+ }
2954
+
2955
+
2956
+ template<unsigned Rows, unsigned Cols, typename Type>
2957
+ inline CUDA_CALLABLE void adj_assign_copy(mat_t<Rows,Cols,Type>& m, int row, vec_t<Cols,Type>& value,
2958
+ mat_t<Rows,Cols,Type>& adj_m, int& adj_row, vec_t<Cols,Type>& adj_value, const mat_t<Rows,Cols,Type>& adj_ret)
2959
+ {
2960
+ #ifndef NDEBUG
2961
+ if (row < -(int)Rows || row >= (int)Rows)
2962
+ {
2963
+ printf("mat row index %d out of bounds at %s %d\n", row, __FILE__, __LINE__);
2964
+ assert(0);
2965
+ }
2966
+ #endif
2967
+
2968
+ if (row < 0)
2969
+ {
2970
+ row += Rows;
2971
+ }
2972
+
2973
+ for(unsigned i=0; i < Rows; ++i)
2974
+ {
2975
+ for(unsigned j=0; j < Cols; ++j)
2976
+ {
2977
+ if (i==row)
2978
+ adj_value[j] += adj_ret.data[i][j];
2979
+ else
2980
+ adj_m.data[i][j] += adj_ret.data[i][j];
2981
+ }
2982
+ }
2983
+ }
2984
+
2985
+
2986
+ template<unsigned RowSliceLength, unsigned ColSliceLength, unsigned Rows, unsigned Cols, typename Type>
2987
+ inline CUDA_CALLABLE void adj_assign_copy(
2988
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& value,
2989
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, mat_t<RowSliceLength, ColSliceLength, Type>& adj_value,
2990
+ mat_t<Rows,Cols,Type>& adj_ret
2991
+ )
2992
+ {
2993
+ static_assert(
2994
+ RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols,
2995
+ "Expected RowSliceLength == 0 ? ColSliceLength == 0 : ColSliceLength == Cols"
2996
+ );
2997
+
2998
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
2999
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
3000
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
3001
+ assert(slice_get_length(row_slice) == RowSliceLength);
3002
+
3003
+ bool is_row_reversed = row_slice.step < 0;
3004
+
3005
+ int ii = 0;
3006
+ for (int i = 0; i < Rows; ++i)
3007
+ {
3008
+ bool in_row_slice = is_row_reversed
3009
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
3010
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
3011
+
3012
+ if (!in_row_slice)
3013
+ {
3014
+ for (int j = 0; j < Cols; ++j)
3015
+ {
3016
+ adj_m.data[i][j] += adj_ret.data[i][j];
3017
+ }
3018
+ }
3019
+ else
3020
+ {
3021
+ for (int j = 0; j < Cols; ++j)
3022
+ {
3023
+ adj_value.data[ii][j] += adj_ret.data[i][j];
3024
+ }
3025
+
3026
+ ++ii;
3027
+ }
3028
+ }
3029
+
3030
+ assert(ii == RowSliceLength);
3031
+ }
3032
+
3033
+
3034
+ template<unsigned RowSliceLength, unsigned Rows, unsigned Cols, typename Type>
3035
+ inline CUDA_CALLABLE void adj_assign_copy(
3036
+ mat_t<Rows,Cols,Type>& m, slice_t row_slice, int col, vec_t<RowSliceLength, Type>& value,
3037
+ mat_t<Rows,Cols,Type>& adj_m, slice_t& adj_row_slice, int& adj_col, vec_t<RowSliceLength, Type>& adj_value,
3038
+ mat_t<Rows,Cols,Type>& adj_ret
3039
+ )
3040
+ {
3041
+ #ifndef NDEBUG
3042
+ if (col < -(int)Cols || col >= (int)Cols)
3043
+ {
3044
+ printf("mat col index %d out of bounds at %s %d\n", col, __FILE__, __LINE__);
3045
+ assert(0);
3046
+ }
3047
+ #endif
3048
+
3049
+ assert(row_slice.start >= 0 && row_slice.start <= (int)Rows);
3050
+ assert(row_slice.stop >= -1 && row_slice.stop <= (int)Rows);
3051
+ assert(row_slice.step != 0 && row_slice.step < 0 ? row_slice.start >= row_slice.stop : row_slice.start <= row_slice.stop);
3052
+ assert(slice_get_length(row_slice) == RowSliceLength);
3053
+
3054
+ if (col < 0)
3055
+ {
3056
+ col += Cols;
3057
+ }
3058
+
3059
+ bool is_row_reversed = row_slice.step < 0;
3060
+
3061
+ int ii = 0;
3062
+ for (int i = 0; i < Rows; ++i)
3063
+ {
3064
+ bool in_row_slice = is_row_reversed
3065
+ ? (i <= row_slice.start && i > row_slice.stop && (row_slice.start - i) % (-row_slice.step) == 0)
3066
+ : (i >= row_slice.start && i < row_slice.stop && (i - row_slice.start) % row_slice.step == 0);
3067
+
3068
+ if (!in_row_slice)
3069
+ {
2287
3070
  for (int j = 0; j < Cols; ++j)
2288
3071
  {
2289
3072
  adj_m.data[i][j] += adj_ret.data[i][j];
@@ -2427,69 +3210,298 @@ inline CUDA_CALLABLE void adj_assign_copy(
2427
3210
  }
2428
3211
  }
2429
3212
 
2430
- assert(jj == ColSliceLength);
2431
- ++ii;
3213
+ assert(jj == ColSliceLength);
3214
+ ++ii;
3215
+ }
3216
+ }
3217
+
3218
+ assert(ii == RowSliceLength);
3219
+ }
3220
+
3221
+
3222
+ template<unsigned Rows, unsigned Cols, typename Type>
3223
+ inline bool CUDA_CALLABLE isfinite(const mat_t<Rows,Cols,Type>& m)
3224
+ {
3225
+ for (unsigned i=0; i < Rows; ++i)
3226
+ for (unsigned j=0; j < Cols; ++j)
3227
+ if (!isfinite(m.data[i][j]))
3228
+ return false;
3229
+ return true;
3230
+ }
3231
+
3232
+ template<unsigned Rows, unsigned Cols, typename Type>
3233
+ inline void CUDA_CALLABLE adj_isfinite(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
3234
+ {
3235
+ }
3236
+
3237
+ template<unsigned Rows, unsigned Cols, typename Type>
3238
+ inline bool CUDA_CALLABLE isnan(const mat_t<Rows,Cols,Type>& m)
3239
+ {
3240
+ for (unsigned i=0; i < Rows; ++i)
3241
+ for (unsigned j=0; j < Cols; ++j)
3242
+ if (isnan(m.data[i][j]))
3243
+ return true;
3244
+ return false;
3245
+ }
3246
+
3247
+ template<unsigned Rows, unsigned Cols, typename Type>
3248
+ inline void CUDA_CALLABLE adj_isnan(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
3249
+ {
3250
+ }
3251
+
3252
+ template<unsigned Rows, unsigned Cols, typename Type>
3253
+ inline bool CUDA_CALLABLE isinf(const mat_t<Rows,Cols,Type>& m)
3254
+ {
3255
+ for (unsigned i=0; i < Rows; ++i)
3256
+ for (unsigned j=0; j < Cols; ++j)
3257
+ if (isinf(m.data[i][j]))
3258
+ return true;
3259
+ return false;
3260
+ }
3261
+
3262
+ template<unsigned Rows, unsigned Cols, typename Type>
3263
+ inline void CUDA_CALLABLE adj_isinf(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
3264
+ {
3265
+ }
3266
+
3267
+ template<unsigned Rows, unsigned Cols, typename Type>
3268
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
3269
+ {
3270
+ mat_t<Rows,Cols,Type> t;
3271
+ for (unsigned i=0; i < Rows; ++i)
3272
+ {
3273
+ for (unsigned j=0; j < Cols; ++j)
3274
+ {
3275
+ t.data[i][j] = a.data[i][j] + b.data[i][j];
3276
+ }
3277
+ }
3278
+
3279
+ return t;
3280
+ }
3281
+
3282
+ template<unsigned Rows, unsigned Cols, typename Type>
3283
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(Type a, const mat_t<Rows,Cols,Type>& b)
3284
+ {
3285
+ mat_t<Rows,Cols,Type> t;
3286
+ for (unsigned i=0; i < Rows; ++i)
3287
+ {
3288
+ for (unsigned j=0; j < Cols; ++j)
3289
+ {
3290
+ t.data[i][j] = a + b.data[i][j];
3291
+ }
3292
+ }
3293
+
3294
+ return t;
3295
+ }
3296
+
3297
+ template<unsigned Rows, unsigned Cols, typename Type>
3298
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
3299
+ {
3300
+ mat_t<Rows,Cols,Type> t;
3301
+ for (unsigned i=0; i < Rows; ++i)
3302
+ {
3303
+ for (unsigned j=0; j < Cols; ++j)
3304
+ {
3305
+ t.data[i][j] = a.data[i][j] - b.data[i][j];
3306
+ }
3307
+ }
3308
+
3309
+ return t;
3310
+ }
3311
+
3312
+ template<unsigned Rows, unsigned Cols, typename Type>
3313
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(Type a, const mat_t<Rows,Cols,Type>& b)
3314
+ {
3315
+ mat_t<Rows,Cols,Type> t;
3316
+ for (unsigned i=0; i < Rows; ++i)
3317
+ {
3318
+ for (unsigned j=0; j < Cols; ++j)
3319
+ {
3320
+ t.data[i][j] = a - b.data[i][j];
3321
+ }
3322
+ }
3323
+
3324
+ return t;
3325
+ }
3326
+
3327
+ template<unsigned Rows, unsigned Cols, typename Type>
3328
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, Type b)
3329
+ {
3330
+ mat_t<Rows,Cols,Type> t;
3331
+ for (unsigned i=0; i < Rows; ++i)
3332
+ {
3333
+ for (unsigned j=0; j < Cols; ++j)
3334
+ {
3335
+ t.data[i][j] = a.data[i][j]/b;
3336
+ }
3337
+ }
3338
+
3339
+ return t;
3340
+ }
3341
+
3342
+ template<unsigned Rows, unsigned Cols, typename Type>
3343
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(Type b, const mat_t<Rows,Cols,Type>& a)
3344
+ {
3345
+ mat_t<Rows,Cols,Type> t;
3346
+ for (unsigned i=0; i < Rows; ++i)
3347
+ {
3348
+ for (unsigned j=0; j < Cols; ++j)
3349
+ {
3350
+ t.data[i][j] = b / a.data[i][j];
3351
+ }
3352
+ }
3353
+
3354
+ return t;
3355
+ }
3356
+
3357
+ template<unsigned Rows, unsigned Cols, typename Type>
3358
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(const mat_t<Rows,Cols,Type>& a, Type b)
3359
+ {
3360
+ mat_t<Rows,Cols,Type> t;
3361
+ for (unsigned i=0; i < Rows; ++i)
3362
+ {
3363
+ for (unsigned j=0; j < Cols; ++j)
3364
+ {
3365
+ t.data[i][j] = a.data[i][j]*b;
3366
+ }
3367
+ }
3368
+
3369
+ return t;
3370
+ }
3371
+
3372
+ template<unsigned Rows, unsigned Cols, typename Type>
3373
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(Type b, const mat_t<Rows,Cols,Type>& a)
3374
+ {
3375
+ return mul(a,b);
3376
+ }
3377
+
3378
+
3379
+ template<unsigned Rows, unsigned Cols, typename Type>
3380
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> operator*(Type b, const mat_t<Rows,Cols,Type>& a)
3381
+ {
3382
+ return mul(a,b);
3383
+ }
3384
+
3385
+ template<unsigned Rows, unsigned Cols, typename Type>
3386
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> operator*( const mat_t<Rows,Cols,Type>& a, Type b)
3387
+ {
3388
+ return mul(a,b);
3389
+ }
3390
+
3391
+ template<unsigned Rows, unsigned Cols, typename Type>
3392
+ inline CUDA_CALLABLE vec_t<Rows,Type> mul(const mat_t<Rows,Cols,Type>& a, const vec_t<Cols,Type>& b)
3393
+ {
3394
+ vec_t<Rows,Type> r = a.get_col(0)*b[0];
3395
+ for( unsigned i=1; i < Cols; ++i )
3396
+ {
3397
+ r += a.get_col(i)*b[i];
3398
+ }
3399
+ return r;
3400
+ }
3401
+
3402
+ template<unsigned Rows, unsigned Cols, typename Type>
3403
+ inline CUDA_CALLABLE vec_t<Cols,Type> mul(const vec_t<Rows,Type>& b, const mat_t<Rows,Cols,Type>& a)
3404
+ {
3405
+ vec_t<Cols,Type> r = a.get_row(0)*b[0];
3406
+ for( unsigned i=1; i < Rows; ++i )
3407
+ {
3408
+ r += a.get_row(i)*b[i];
3409
+ }
3410
+ return r;
3411
+ }
3412
+
3413
+ template<typename T>
3414
+ inline CUDA_CALLABLE T muladd(T a, T b, T c) {
3415
+ return c + a*b;
3416
+ }
3417
+ template<>
3418
+ inline CUDA_CALLABLE float muladd(float a, float b, float c) {
3419
+ return fmaf(a, b, c);
3420
+ }
3421
+ template<>
3422
+ inline CUDA_CALLABLE double muladd(double a, double b, double c) {
3423
+ return fma(a, b, c);
3424
+ }
3425
+
3426
+
3427
+ template<unsigned Rows, unsigned Cols, unsigned ColsOut, typename Type>
3428
+ inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a, const mat_t<Cols,ColsOut,Type>& b)
3429
+ {
3430
+ mat_t<Rows,ColsOut,Type> t(0);
3431
+ for (unsigned i=0; i < Rows; ++i)
3432
+ {
3433
+ for (unsigned j=0; j < ColsOut; ++j)
3434
+ {
3435
+ Type sum(0.0);
3436
+
3437
+ for (unsigned k=0; k < Cols; ++k)
3438
+ {
3439
+ sum = muladd<Type>(a.data[i][k], b.data[k][j], sum);
3440
+ }
3441
+
3442
+ t.data[i][j] = sum;
2432
3443
  }
2433
3444
  }
2434
-
2435
- assert(ii == RowSliceLength);
3445
+
3446
+ return t;
2436
3447
  }
2437
3448
 
2438
-
3449
+ // bitwise AND
2439
3450
  template<unsigned Rows, unsigned Cols, typename Type>
2440
- inline bool CUDA_CALLABLE isfinite(const mat_t<Rows,Cols,Type>& m)
3451
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_and(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
2441
3452
  {
3453
+ mat_t<Rows,Cols,Type> t;
2442
3454
  for (unsigned i=0; i < Rows; ++i)
3455
+ {
2443
3456
  for (unsigned j=0; j < Cols; ++j)
2444
- if (!isfinite(m.data[i][j]))
2445
- return false;
2446
- return true;
2447
- }
3457
+ {
3458
+ t.data[i][j] = a.data[i][j] & b.data[i][j];
3459
+ }
3460
+ }
2448
3461
 
2449
- template<unsigned Rows, unsigned Cols, typename Type>
2450
- inline void CUDA_CALLABLE adj_isfinite(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
2451
- {
3462
+ return t;
2452
3463
  }
2453
3464
 
2454
3465
  template<unsigned Rows, unsigned Cols, typename Type>
2455
- inline bool CUDA_CALLABLE isnan(const mat_t<Rows,Cols,Type>& m)
3466
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_and(const mat_t<Rows,Cols,Type>& a, Type b)
2456
3467
  {
3468
+ mat_t<Rows,Cols,Type> t;
2457
3469
  for (unsigned i=0; i < Rows; ++i)
3470
+ {
2458
3471
  for (unsigned j=0; j < Cols; ++j)
2459
- if (isnan(m.data[i][j]))
2460
- return true;
2461
- return false;
2462
- }
3472
+ {
3473
+ t.data[i][j] = a.data[i][j] & b;
3474
+ }
3475
+ }
2463
3476
 
2464
- template<unsigned Rows, unsigned Cols, typename Type>
2465
- inline void CUDA_CALLABLE adj_isnan(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
2466
- {
3477
+ return t;
2467
3478
  }
2468
3479
 
2469
3480
  template<unsigned Rows, unsigned Cols, typename Type>
2470
- inline bool CUDA_CALLABLE isinf(const mat_t<Rows,Cols,Type>& m)
3481
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_and(Type a, const mat_t<Rows,Cols,Type>& b)
2471
3482
  {
3483
+ mat_t<Rows,Cols,Type> t;
2472
3484
  for (unsigned i=0; i < Rows; ++i)
3485
+ {
2473
3486
  for (unsigned j=0; j < Cols; ++j)
2474
- if (isinf(m.data[i][j]))
2475
- return true;
2476
- return false;
2477
- }
3487
+ {
3488
+ t.data[i][j] = a & b.data[i][j];
3489
+ }
3490
+ }
2478
3491
 
2479
- template<unsigned Rows, unsigned Cols, typename Type>
2480
- inline void CUDA_CALLABLE adj_isinf(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const bool &adj_ret)
2481
- {
3492
+ return t;
2482
3493
  }
2483
3494
 
3495
+ // bitwise OR
2484
3496
  template<unsigned Rows, unsigned Cols, typename Type>
2485
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
3497
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_or(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
2486
3498
  {
2487
3499
  mat_t<Rows,Cols,Type> t;
2488
3500
  for (unsigned i=0; i < Rows; ++i)
2489
3501
  {
2490
3502
  for (unsigned j=0; j < Cols; ++j)
2491
3503
  {
2492
- t.data[i][j] = a.data[i][j] + b.data[i][j];
3504
+ t.data[i][j] = a.data[i][j] | b.data[i][j];
2493
3505
  }
2494
3506
  }
2495
3507
 
@@ -2497,14 +3509,14 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(const mat_t<Rows,Cols,Type>& a, c
2497
3509
  }
2498
3510
 
2499
3511
  template<unsigned Rows, unsigned Cols, typename Type>
2500
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(Type a, const mat_t<Rows,Cols,Type>& b)
3512
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_or(const mat_t<Rows,Cols,Type>& a, Type b)
2501
3513
  {
2502
3514
  mat_t<Rows,Cols,Type> t;
2503
3515
  for (unsigned i=0; i < Rows; ++i)
2504
3516
  {
2505
3517
  for (unsigned j=0; j < Cols; ++j)
2506
3518
  {
2507
- t.data[i][j] = a + b.data[i][j];
3519
+ t.data[i][j] = a.data[i][j] | b;
2508
3520
  }
2509
3521
  }
2510
3522
 
@@ -2512,29 +3524,30 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> add(Type a, const mat_t<Rows,Cols,Typ
2512
3524
  }
2513
3525
 
2514
3526
  template<unsigned Rows, unsigned Cols, typename Type>
2515
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
3527
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_or(Type a, const mat_t<Rows,Cols,Type>& b)
2516
3528
  {
2517
3529
  mat_t<Rows,Cols,Type> t;
2518
3530
  for (unsigned i=0; i < Rows; ++i)
2519
3531
  {
2520
3532
  for (unsigned j=0; j < Cols; ++j)
2521
3533
  {
2522
- t.data[i][j] = a.data[i][j] - b.data[i][j];
3534
+ t.data[i][j] = a | b.data[i][j];
2523
3535
  }
2524
3536
  }
2525
3537
 
2526
3538
  return t;
2527
3539
  }
2528
3540
 
3541
+ // bitwise XOR
2529
3542
  template<unsigned Rows, unsigned Cols, typename Type>
2530
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(Type a, const mat_t<Rows,Cols,Type>& b)
3543
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_xor(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
2531
3544
  {
2532
3545
  mat_t<Rows,Cols,Type> t;
2533
3546
  for (unsigned i=0; i < Rows; ++i)
2534
3547
  {
2535
3548
  for (unsigned j=0; j < Cols; ++j)
2536
3549
  {
2537
- t.data[i][j] = a - b.data[i][j];
3550
+ t.data[i][j] = a.data[i][j] ^ b.data[i][j];
2538
3551
  }
2539
3552
  }
2540
3553
 
@@ -2542,14 +3555,14 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> sub(Type a, const mat_t<Rows,Cols,Typ
2542
3555
  }
2543
3556
 
2544
3557
  template<unsigned Rows, unsigned Cols, typename Type>
2545
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, Type b)
3558
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_xor(const mat_t<Rows,Cols,Type>& a, Type b)
2546
3559
  {
2547
3560
  mat_t<Rows,Cols,Type> t;
2548
3561
  for (unsigned i=0; i < Rows; ++i)
2549
3562
  {
2550
3563
  for (unsigned j=0; j < Cols; ++j)
2551
3564
  {
2552
- t.data[i][j] = a.data[i][j]/b;
3565
+ t.data[i][j] = a.data[i][j] ^ b;
2553
3566
  }
2554
3567
  }
2555
3568
 
@@ -2557,29 +3570,30 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(const mat_t<Rows,Cols,Type>& a, T
2557
3570
  }
2558
3571
 
2559
3572
  template<unsigned Rows, unsigned Cols, typename Type>
2560
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> div(Type b, const mat_t<Rows,Cols,Type>& a)
3573
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> bit_xor(Type a, const mat_t<Rows,Cols,Type>& b)
2561
3574
  {
2562
3575
  mat_t<Rows,Cols,Type> t;
2563
3576
  for (unsigned i=0; i < Rows; ++i)
2564
3577
  {
2565
3578
  for (unsigned j=0; j < Cols; ++j)
2566
3579
  {
2567
- t.data[i][j] = b / a.data[i][j];
3580
+ t.data[i][j] = a ^ b.data[i][j];
2568
3581
  }
2569
3582
  }
2570
3583
 
2571
3584
  return t;
2572
3585
  }
2573
3586
 
3587
+ // left shift
2574
3588
  template<unsigned Rows, unsigned Cols, typename Type>
2575
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(const mat_t<Rows,Cols,Type>& a, Type b)
3589
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> lshift(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
2576
3590
  {
2577
3591
  mat_t<Rows,Cols,Type> t;
2578
3592
  for (unsigned i=0; i < Rows; ++i)
2579
3593
  {
2580
3594
  for (unsigned j=0; j < Cols; ++j)
2581
3595
  {
2582
- t.data[i][j] = a.data[i][j]*b;
3596
+ t.data[i][j] = a.data[i][j] << b.data[i][j];
2583
3597
  }
2584
3598
  }
2585
3599
 
@@ -2587,79 +3601,94 @@ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(const mat_t<Rows,Cols,Type>& a, T
2587
3601
  }
2588
3602
 
2589
3603
  template<unsigned Rows, unsigned Cols, typename Type>
2590
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> mul(Type b, const mat_t<Rows,Cols,Type>& a)
3604
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> lshift(const mat_t<Rows,Cols,Type>& a, Type b)
2591
3605
  {
2592
- return mul(a,b);
2593
- }
3606
+ mat_t<Rows,Cols,Type> t;
3607
+ for (unsigned i=0; i < Rows; ++i)
3608
+ {
3609
+ for (unsigned j=0; j < Cols; ++j)
3610
+ {
3611
+ t.data[i][j] = a.data[i][j] << b;
3612
+ }
3613
+ }
2594
3614
 
3615
+ return t;
3616
+ }
2595
3617
 
2596
3618
  template<unsigned Rows, unsigned Cols, typename Type>
2597
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> operator*(Type b, const mat_t<Rows,Cols,Type>& a)
3619
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> lshift(Type a, const mat_t<Rows,Cols,Type>& b)
2598
3620
  {
2599
- return mul(a,b);
3621
+ mat_t<Rows,Cols,Type> t;
3622
+ for (unsigned i=0; i < Rows; ++i)
3623
+ {
3624
+ for (unsigned j=0; j < Cols; ++j)
3625
+ {
3626
+ t.data[i][j] = a << b.data[i][j];
3627
+ }
3628
+ }
3629
+
3630
+ return t;
2600
3631
  }
2601
3632
 
3633
+ // right shift
2602
3634
  template<unsigned Rows, unsigned Cols, typename Type>
2603
- inline CUDA_CALLABLE mat_t<Rows,Cols,Type> operator*( const mat_t<Rows,Cols,Type>& a, Type b)
3635
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> rshift(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b)
2604
3636
  {
2605
- return mul(a,b);
3637
+ mat_t<Rows,Cols,Type> t;
3638
+ for (unsigned i=0; i < Rows; ++i)
3639
+ {
3640
+ for (unsigned j=0; j < Cols; ++j)
3641
+ {
3642
+ t.data[i][j] = a.data[i][j] >> b.data[i][j];
3643
+ }
3644
+ }
3645
+
3646
+ return t;
2606
3647
  }
2607
3648
 
2608
3649
  template<unsigned Rows, unsigned Cols, typename Type>
2609
- inline CUDA_CALLABLE vec_t<Rows,Type> mul(const mat_t<Rows,Cols,Type>& a, const vec_t<Cols,Type>& b)
3650
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> rshift(const mat_t<Rows,Cols,Type>& a, Type b)
2610
3651
  {
2611
- vec_t<Rows,Type> r = a.get_col(0)*b[0];
2612
- for( unsigned i=1; i < Cols; ++i )
3652
+ mat_t<Rows,Cols,Type> t;
3653
+ for (unsigned i=0; i < Rows; ++i)
2613
3654
  {
2614
- r += a.get_col(i)*b[i];
3655
+ for (unsigned j=0; j < Cols; ++j)
3656
+ {
3657
+ t.data[i][j] = a.data[i][j] >> b;
3658
+ }
2615
3659
  }
2616
- return r;
3660
+
3661
+ return t;
2617
3662
  }
2618
3663
 
2619
3664
  template<unsigned Rows, unsigned Cols, typename Type>
2620
- inline CUDA_CALLABLE vec_t<Cols,Type> mul(const vec_t<Rows,Type>& b, const mat_t<Rows,Cols,Type>& a)
3665
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> rshift(Type a, const mat_t<Rows,Cols,Type>& b)
2621
3666
  {
2622
- vec_t<Cols,Type> r = a.get_row(0)*b[0];
2623
- for( unsigned i=1; i < Rows; ++i )
3667
+ mat_t<Rows,Cols,Type> t;
3668
+ for (unsigned i=0; i < Rows; ++i)
2624
3669
  {
2625
- r += a.get_row(i)*b[i];
3670
+ for (unsigned j=0; j < Cols; ++j)
3671
+ {
3672
+ t.data[i][j] = a >> b.data[i][j];
3673
+ }
2626
3674
  }
2627
- return r;
2628
- }
2629
3675
 
2630
- template<typename T>
2631
- inline CUDA_CALLABLE T muladd(T a, T b, T c) {
2632
- return c + a*b;
2633
- }
2634
- template<>
2635
- inline CUDA_CALLABLE float muladd(float a, float b, float c) {
2636
- return fmaf(a, b, c);
2637
- }
2638
- template<>
2639
- inline CUDA_CALLABLE double muladd(double a, double b, double c) {
2640
- return fma(a, b, c);
3676
+ return t;
2641
3677
  }
2642
3678
 
2643
-
2644
- template<unsigned Rows, unsigned Cols, unsigned ColsOut, typename Type>
2645
- inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a, const mat_t<Cols,ColsOut,Type>& b)
3679
+ // invert
3680
+ template<unsigned Rows, unsigned Cols, typename Type>
3681
+ inline CUDA_CALLABLE mat_t<Rows,Cols,Type> invert(const mat_t<Rows,Cols,Type>& m)
2646
3682
  {
2647
- mat_t<Rows,ColsOut,Type> t(0);
3683
+ mat_t<Rows,Cols,Type> t;
2648
3684
  for (unsigned i=0; i < Rows; ++i)
2649
- {
2650
- for (unsigned j=0; j < ColsOut; ++j)
3685
+ {
3686
+ for (unsigned j=0; j < Cols; ++j)
2651
3687
  {
2652
- Type sum(0.0);
2653
-
2654
- for (unsigned k=0; k < Cols; ++k)
2655
- {
2656
- sum = muladd<Type>(a.data[i][k], b.data[k][j], sum);
2657
- }
2658
-
2659
- t.data[i][j] = sum;
3688
+ t.data[i][j] = ~m.data[i][j];
2660
3689
  }
2661
3690
  }
2662
-
3691
+
2663
3692
  return t;
2664
3693
  }
2665
3694
 
@@ -2719,10 +3748,11 @@ inline CUDA_CALLABLE Type determinant(const mat_t<3,3,Type>& m)
2719
3748
  );
2720
3749
  }
2721
3750
 
3751
+ // Adapted from USD - see licenses/usd-LICENSE.txt
3752
+ // Copyright 2016 Pixar
2722
3753
  template<typename Type>
2723
3754
  inline CUDA_CALLABLE Type determinant(const mat_t<4,4,Type>& m)
2724
3755
  {
2725
- // adapted from USD GfMatrix4f::Inverse()
2726
3756
  Type x00, x01, x02, x03;
2727
3757
  Type x10, x11, x12, x13;
2728
3758
  Type x20, x21, x22, x23;
@@ -2818,16 +3848,16 @@ inline CUDA_CALLABLE mat_t<2,2,Type> inverse(const mat_t<2,2,Type>& m)
2818
3848
  template<typename Type>
2819
3849
  inline CUDA_CALLABLE mat_t<3,3,Type> inverse(const mat_t<3,3,Type>& m)
2820
3850
  {
2821
- Type det = determinant(m);
3851
+ Type det = determinant(m);
2822
3852
 
2823
- if (det != Type(0.0f))
2824
- {
2825
- mat_t<3,3,Type> b;
2826
-
2827
- b.data[0][0] = m.data[1][1]*m.data[2][2] - m.data[1][2]*m.data[2][1];
2828
- b.data[1][0] = m.data[1][2]*m.data[2][0] - m.data[1][0]*m.data[2][2];
2829
- b.data[2][0] = m.data[1][0]*m.data[2][1] - m.data[1][1]*m.data[2][0];
2830
-
3853
+ if (det != Type(0.0f))
3854
+ {
3855
+ mat_t<3,3,Type> b;
3856
+
3857
+ b.data[0][0] = m.data[1][1]*m.data[2][2] - m.data[1][2]*m.data[2][1];
3858
+ b.data[1][0] = m.data[1][2]*m.data[2][0] - m.data[1][0]*m.data[2][2];
3859
+ b.data[2][0] = m.data[1][0]*m.data[2][1] - m.data[1][1]*m.data[2][0];
3860
+
2831
3861
  b.data[0][1] = m.data[0][2]*m.data[2][1] - m.data[0][1]*m.data[2][2];
2832
3862
  b.data[1][1] = m.data[0][0]*m.data[2][2] - m.data[0][2]*m.data[2][0];
2833
3863
  b.data[2][1] = m.data[0][1]*m.data[2][0] - m.data[0][0]*m.data[2][1];
@@ -2836,18 +3866,19 @@ inline CUDA_CALLABLE mat_t<3,3,Type> inverse(const mat_t<3,3,Type>& m)
2836
3866
  b.data[1][2] = m.data[0][2]*m.data[1][0] - m.data[0][0]*m.data[1][2];
2837
3867
  b.data[2][2] = m.data[0][0]*m.data[1][1] - m.data[0][1]*m.data[1][0];
2838
3868
 
2839
- return b*(Type(1.0f)/det);
2840
- }
2841
- else
2842
- {
2843
- return mat_t<3,3,Type>();
2844
- }
3869
+ return b*(Type(1.0f)/det);
3870
+ }
3871
+ else
3872
+ {
3873
+ return mat_t<3,3,Type>();
3874
+ }
2845
3875
  }
2846
3876
 
3877
+ // Adapted from USD - see licenses/usd-LICENSE.txt
3878
+ // Copyright 2016 Pixar
2847
3879
  template<typename Type>
2848
3880
  inline CUDA_CALLABLE mat_t<4,4,Type> inverse(const mat_t<4,4,Type>& m)
2849
3881
  {
2850
- // adapted from USD GfMatrix4f::Inverse()
2851
3882
  Type x00, x01, x02, x03;
2852
3883
  Type x10, x11, x12, x13;
2853
3884
  Type x20, x21, x22, x23;
@@ -3310,6 +4341,126 @@ inline CUDA_CALLABLE void adj_sub(
3310
4341
  }
3311
4342
  }
3312
4343
 
4344
+ template<unsigned Rows, unsigned Cols, typename Type>
4345
+ inline CUDA_CALLABLE void adj_bit_and(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
4346
+ {
4347
+ }
4348
+
4349
+ template<unsigned Rows, unsigned Cols, typename Type>
4350
+ inline CUDA_CALLABLE void adj_bit_and(
4351
+ const mat_t<Rows,Cols,Type>& a, Type b,
4352
+ mat_t<Rows,Cols,Type>& adj_a, Type& adj_b,
4353
+ const mat_t<Rows,Cols,Type>& adj_ret
4354
+ )
4355
+ {
4356
+ }
4357
+
4358
+ template<unsigned Rows, unsigned Cols, typename Type>
4359
+ inline CUDA_CALLABLE void adj_bit_and(
4360
+ Type a, const mat_t<Rows,Cols,Type>& b,
4361
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
4362
+ const mat_t<Rows,Cols,Type>& adj_ret
4363
+ )
4364
+ {
4365
+ }
4366
+
4367
+ template<unsigned Rows, unsigned Cols, typename Type>
4368
+ inline CUDA_CALLABLE void adj_bit_or(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
4369
+ {
4370
+ }
4371
+
4372
+ template<unsigned Rows, unsigned Cols, typename Type>
4373
+ inline CUDA_CALLABLE void adj_bit_or(
4374
+ const mat_t<Rows,Cols,Type>& a, Type b,
4375
+ mat_t<Rows,Cols,Type>& adj_a, Type& adj_b,
4376
+ const mat_t<Rows,Cols,Type>& adj_ret
4377
+ )
4378
+ {
4379
+ }
4380
+
4381
+ template<unsigned Rows, unsigned Cols, typename Type>
4382
+ inline CUDA_CALLABLE void adj_bit_or(
4383
+ Type a, const mat_t<Rows,Cols,Type>& b,
4384
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
4385
+ const mat_t<Rows,Cols,Type>& adj_ret
4386
+ )
4387
+ {
4388
+ }
4389
+
4390
+ template<unsigned Rows, unsigned Cols, typename Type>
4391
+ inline CUDA_CALLABLE void adj_bit_xor(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
4392
+ {
4393
+ }
4394
+
4395
+ template<unsigned Rows, unsigned Cols, typename Type>
4396
+ inline CUDA_CALLABLE void adj_bit_xor(
4397
+ const mat_t<Rows,Cols,Type>& a, Type b,
4398
+ mat_t<Rows,Cols,Type>& adj_a, Type& adj_b,
4399
+ const mat_t<Rows,Cols,Type>& adj_ret
4400
+ )
4401
+ {
4402
+ }
4403
+
4404
+ template<unsigned Rows, unsigned Cols, typename Type>
4405
+ inline CUDA_CALLABLE void adj_bit_xor(
4406
+ Type a, const mat_t<Rows,Cols,Type>& b,
4407
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
4408
+ const mat_t<Rows,Cols,Type>& adj_ret
4409
+ )
4410
+ {
4411
+ }
4412
+
4413
+ template<unsigned Rows, unsigned Cols, typename Type>
4414
+ inline CUDA_CALLABLE void adj_lshift(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
4415
+ {
4416
+ }
4417
+
4418
+ template<unsigned Rows, unsigned Cols, typename Type>
4419
+ inline CUDA_CALLABLE void adj_lshift(
4420
+ const mat_t<Rows,Cols,Type>& a, Type b,
4421
+ mat_t<Rows,Cols,Type>& adj_a, Type& adj_b,
4422
+ const mat_t<Rows,Cols,Type>& adj_ret
4423
+ )
4424
+ {
4425
+ }
4426
+
4427
+ template<unsigned Rows, unsigned Cols, typename Type>
4428
+ inline CUDA_CALLABLE void adj_lshift(
4429
+ Type a, const mat_t<Rows,Cols,Type>& b,
4430
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
4431
+ const mat_t<Rows,Cols,Type>& adj_ret
4432
+ )
4433
+ {
4434
+ }
4435
+
4436
+ template<unsigned Rows, unsigned Cols, typename Type>
4437
+ inline CUDA_CALLABLE void adj_rshift(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,Cols,Type>& b, mat_t<Rows,Cols,Type>& adj_a, mat_t<Rows,Cols,Type>& adj_b, const mat_t<Rows,Cols,Type>& adj_ret)
4438
+ {
4439
+ }
4440
+
4441
+ template<unsigned Rows, unsigned Cols, typename Type>
4442
+ inline CUDA_CALLABLE void adj_rshift(
4443
+ const mat_t<Rows,Cols,Type>& a, Type b,
4444
+ mat_t<Rows,Cols,Type>& adj_a, Type& adj_b,
4445
+ const mat_t<Rows,Cols,Type>& adj_ret
4446
+ )
4447
+ {
4448
+ }
4449
+
4450
+ template<unsigned Rows, unsigned Cols, typename Type>
4451
+ inline CUDA_CALLABLE void adj_rshift(
4452
+ Type a, const mat_t<Rows,Cols,Type>& b,
4453
+ Type& adj_a, mat_t<Rows,Cols,Type>& adj_b,
4454
+ const mat_t<Rows,Cols,Type>& adj_ret
4455
+ )
4456
+ {
4457
+ }
4458
+
4459
+ template<unsigned Rows, unsigned Cols, typename Type>
4460
+ inline CUDA_CALLABLE void adj_invert(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const mat_t<Rows,Cols,Type>& adj_ret)
4461
+ {
4462
+ }
4463
+
3313
4464
  template<unsigned Rows, unsigned Cols, typename Type>
3314
4465
  inline CUDA_CALLABLE void adj_div(const mat_t<Rows,Cols,Type>& a, Type s, mat_t<Rows,Cols,Type>& adj_a, Type& adj_s, const mat_t<Rows,Cols,Type>& adj_ret)
3315
4466
  {
@@ -3429,10 +4580,11 @@ inline CUDA_CALLABLE void adj_determinant(const mat_t<3,3,Type>& m, mat_t<3,3,Ty
3429
4580
  (vec_t<3,Type>&)adj_m.data[2] += cross(m.get_row(0), m.get_row(1))*adj_ret;
3430
4581
  }
3431
4582
 
4583
+ // Adapted from USD - see licenses/usd-LICENSE.txt
4584
+ // Copyright 2016 Pixar
3432
4585
  template<typename Type>
3433
4586
  inline CUDA_CALLABLE void adj_determinant(const mat_t<4,4,Type>& m, mat_t<4,4,Type>& adj_m, Type adj_ret)
3434
4587
  {
3435
- // adapted from USD GfMatrix4f::Inverse()
3436
4588
  Type x00, x01, x02, x03;
3437
4589
  Type x10, x11, x12, x13;
3438
4590
  Type x20, x21, x22, x23;
@@ -3864,6 +5016,34 @@ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_add(
3864
5016
  template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_add(mat_t<Rows, Cols, int64>* buf, const mat_t<Rows, Cols, int64> &value) { }
3865
5017
  template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_add(mat_t<Rows, Cols, uint64>* buf, const mat_t<Rows, Cols, uint64> &value) { }
3866
5018
 
5019
+ // For bitwise operations we do not accumulate gradients: every adj_atomic_and/adj_atomic_or/adj_atomic_xor overload below (int8 through uint64 element types) has an empty body.
5020
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, int8>* buf, const mat_t<Rows, Cols, int8> &value) { }
5021
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, uint8>* buf, const mat_t<Rows, Cols, uint8> &value) { }
5022
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, int16>* buf, const mat_t<Rows, Cols, int16> &value) { }
5023
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, uint16>* buf, const mat_t<Rows, Cols, uint16> &value) { }
5024
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, int32>* buf, const mat_t<Rows, Cols, int32> &value) { }
5025
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, uint32>* buf, const mat_t<Rows, Cols, uint32> &value) { }
5026
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, int64>* buf, const mat_t<Rows, Cols, int64> &value) { }
5027
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_and(mat_t<Rows, Cols, uint64>* buf, const mat_t<Rows, Cols, uint64> &value) { }
5028
+ // adj_atomic_or overloads, one per integer element type (int8 through uint64). Each has an empty body, so buf and value are never touched.
5029
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, int8>* buf, const mat_t<Rows, Cols, int8> &value) { }
5030
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, uint8>* buf, const mat_t<Rows, Cols, uint8> &value) { }
5031
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, int16>* buf, const mat_t<Rows, Cols, int16> &value) { }
5032
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, uint16>* buf, const mat_t<Rows, Cols, uint16> &value) { }
5033
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, int32>* buf, const mat_t<Rows, Cols, int32> &value) { }
5034
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, uint32>* buf, const mat_t<Rows, Cols, uint32> &value) { }
5035
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, int64>* buf, const mat_t<Rows, Cols, int64> &value) { }
5036
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_or(mat_t<Rows, Cols, uint64>* buf, const mat_t<Rows, Cols, uint64> &value) { }
5037
+ // adj_atomic_xor overloads, one per integer element type (int8 through uint64). Each has an empty body, so buf and value are never touched.
5038
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, int8>* buf, const mat_t<Rows, Cols, int8> &value) { }
5039
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, uint8>* buf, const mat_t<Rows, Cols, uint8> &value) { }
5040
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, int16>* buf, const mat_t<Rows, Cols, int16> &value) { }
5041
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, uint16>* buf, const mat_t<Rows, Cols, uint16> &value) { }
5042
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, int32>* buf, const mat_t<Rows, Cols, int32> &value) { }
5043
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, uint32>* buf, const mat_t<Rows, Cols, uint32> &value) { }
5044
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, int64>* buf, const mat_t<Rows, Cols, int64> &value) { }
5045
+ template<unsigned Rows, unsigned Cols> CUDA_CALLABLE inline void adj_atomic_xor(mat_t<Rows, Cols, uint64>* buf, const mat_t<Rows, Cols, uint64> &value) { }
5046
+
3867
5047
  using mat22h = mat_t<2,2,half>;
3868
5048
  using mat33h = mat_t<3,3,half>;
3869
5049
  using mat44h = mat_t<4,4,half>;