warp-lang 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (300)
  1. warp/__init__.py +10 -4
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +5 -3
  6. warp/build_dll.py +29 -9
  7. warp/builtins.py +868 -507
  8. warp/codegen.py +1074 -638
  9. warp/config.py +3 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +715 -222
  12. warp/fabric.py +326 -0
  13. warp/fem/__init__.py +27 -0
  14. warp/fem/cache.py +389 -0
  15. warp/fem/dirichlet.py +181 -0
  16. warp/fem/domain.py +263 -0
  17. warp/fem/field/__init__.py +101 -0
  18. warp/fem/field/field.py +149 -0
  19. warp/fem/field/nodal_field.py +299 -0
  20. warp/fem/field/restriction.py +21 -0
  21. warp/fem/field/test.py +181 -0
  22. warp/fem/field/trial.py +183 -0
  23. warp/fem/geometry/__init__.py +19 -0
  24. warp/fem/geometry/closest_point.py +70 -0
  25. warp/fem/geometry/deformed_geometry.py +271 -0
  26. warp/fem/geometry/element.py +744 -0
  27. warp/fem/geometry/geometry.py +186 -0
  28. warp/fem/geometry/grid_2d.py +373 -0
  29. warp/fem/geometry/grid_3d.py +435 -0
  30. warp/fem/geometry/hexmesh.py +953 -0
  31. warp/fem/geometry/partition.py +376 -0
  32. warp/fem/geometry/quadmesh_2d.py +532 -0
  33. warp/fem/geometry/tetmesh.py +840 -0
  34. warp/fem/geometry/trimesh_2d.py +577 -0
  35. warp/fem/integrate.py +1616 -0
  36. warp/fem/operator.py +191 -0
  37. warp/fem/polynomial.py +213 -0
  38. warp/fem/quadrature/__init__.py +2 -0
  39. warp/fem/quadrature/pic_quadrature.py +245 -0
  40. warp/fem/quadrature/quadrature.py +294 -0
  41. warp/fem/space/__init__.py +292 -0
  42. warp/fem/space/basis_space.py +489 -0
  43. warp/fem/space/collocated_function_space.py +105 -0
  44. warp/fem/space/dof_mapper.py +236 -0
  45. warp/fem/space/function_space.py +145 -0
  46. warp/fem/space/grid_2d_function_space.py +267 -0
  47. warp/fem/space/grid_3d_function_space.py +306 -0
  48. warp/fem/space/hexmesh_function_space.py +352 -0
  49. warp/fem/space/partition.py +350 -0
  50. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  51. warp/fem/space/restriction.py +160 -0
  52. warp/fem/space/shape/__init__.py +15 -0
  53. warp/fem/space/shape/cube_shape_function.py +738 -0
  54. warp/fem/space/shape/shape_function.py +103 -0
  55. warp/fem/space/shape/square_shape_function.py +611 -0
  56. warp/fem/space/shape/tet_shape_function.py +567 -0
  57. warp/fem/space/shape/triangle_shape_function.py +429 -0
  58. warp/fem/space/tetmesh_function_space.py +292 -0
  59. warp/fem/space/topology.py +295 -0
  60. warp/fem/space/trimesh_2d_function_space.py +221 -0
  61. warp/fem/types.py +77 -0
  62. warp/fem/utils.py +495 -0
  63. warp/native/array.h +147 -44
  64. warp/native/builtin.h +122 -149
  65. warp/native/bvh.cpp +73 -325
  66. warp/native/bvh.cu +406 -23
  67. warp/native/bvh.h +34 -43
  68. warp/native/clang/clang.cpp +13 -8
  69. warp/native/crt.h +2 -0
  70. warp/native/cuda_crt.h +5 -0
  71. warp/native/cuda_util.cpp +15 -3
  72. warp/native/cuda_util.h +3 -1
  73. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  74. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  75. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  76. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  77. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  78. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  79. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  80. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  133. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  134. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  135. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  136. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  137. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  138. warp/native/cutlass_gemm.cu +5 -3
  139. warp/native/exports.h +1240 -952
  140. warp/native/fabric.h +228 -0
  141. warp/native/hashgrid.cpp +4 -4
  142. warp/native/hashgrid.h +22 -2
  143. warp/native/intersect.h +22 -7
  144. warp/native/intersect_adj.h +8 -8
  145. warp/native/intersect_tri.h +1 -1
  146. warp/native/marching.cu +157 -161
  147. warp/native/mat.h +80 -19
  148. warp/native/matnn.h +2 -2
  149. warp/native/mesh.cpp +33 -108
  150. warp/native/mesh.cu +114 -23
  151. warp/native/mesh.h +446 -46
  152. warp/native/noise.h +272 -329
  153. warp/native/quat.h +51 -8
  154. warp/native/rand.h +45 -35
  155. warp/native/range.h +6 -2
  156. warp/native/reduce.cpp +1 -1
  157. warp/native/reduce.cu +10 -12
  158. warp/native/runlength_encode.cu +6 -10
  159. warp/native/scan.cu +8 -11
  160. warp/native/sparse.cpp +4 -4
  161. warp/native/sparse.cu +164 -154
  162. warp/native/spatial.h +2 -2
  163. warp/native/temp_buffer.h +14 -30
  164. warp/native/vec.h +107 -23
  165. warp/native/volume.h +120 -0
  166. warp/native/warp.cpp +560 -30
  167. warp/native/warp.cu +431 -44
  168. warp/native/warp.h +13 -4
  169. warp/optim/__init__.py +1 -0
  170. warp/optim/linear.py +922 -0
  171. warp/optim/sgd.py +92 -0
  172. warp/render/render_opengl.py +335 -119
  173. warp/render/render_usd.py +11 -11
  174. warp/sim/__init__.py +2 -2
  175. warp/sim/articulation.py +385 -185
  176. warp/sim/collide.py +8 -0
  177. warp/sim/import_mjcf.py +297 -106
  178. warp/sim/import_urdf.py +389 -210
  179. warp/sim/import_usd.py +198 -97
  180. warp/sim/inertia.py +17 -18
  181. warp/sim/integrator_euler.py +14 -8
  182. warp/sim/integrator_xpbd.py +158 -16
  183. warp/sim/model.py +795 -291
  184. warp/sim/render.py +3 -3
  185. warp/sim/utils.py +3 -0
  186. warp/sparse.py +640 -150
  187. warp/stubs.py +606 -267
  188. warp/tape.py +61 -10
  189. warp/tests/__main__.py +3 -6
  190. warp/tests/assets/curlnoise_golden.npy +0 -0
  191. warp/tests/assets/pnoise_golden.npy +0 -0
  192. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  193. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  194. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  195. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  196. warp/tests/aux_test_unresolved_func.py +14 -0
  197. warp/tests/aux_test_unresolved_symbol.py +14 -0
  198. warp/tests/disabled_kinematics.py +239 -0
  199. warp/tests/run_coverage_serial.py +31 -0
  200. warp/tests/test_adam.py +103 -106
  201. warp/tests/test_arithmetic.py +128 -74
  202. warp/tests/test_array.py +212 -97
  203. warp/tests/test_array_reduce.py +57 -23
  204. warp/tests/test_atomic.py +64 -28
  205. warp/tests/test_bool.py +99 -0
  206. warp/tests/test_builtins_resolution.py +1292 -0
  207. warp/tests/test_bvh.py +42 -18
  208. warp/tests/test_closest_point_edge_edge.py +54 -57
  209. warp/tests/test_codegen.py +208 -130
  210. warp/tests/test_compile_consts.py +28 -20
  211. warp/tests/test_conditional.py +108 -24
  212. warp/tests/test_copy.py +10 -12
  213. warp/tests/test_ctypes.py +112 -88
  214. warp/tests/test_dense.py +21 -14
  215. warp/tests/test_devices.py +98 -0
  216. warp/tests/test_dlpack.py +75 -75
  217. warp/tests/test_examples.py +277 -0
  218. warp/tests/test_fabricarray.py +955 -0
  219. warp/tests/test_fast_math.py +15 -11
  220. warp/tests/test_fem.py +1271 -0
  221. warp/tests/test_fp16.py +53 -19
  222. warp/tests/test_func.py +187 -86
  223. warp/tests/test_generics.py +194 -49
  224. warp/tests/test_grad.py +178 -109
  225. warp/tests/test_grad_customs.py +176 -0
  226. warp/tests/test_hash_grid.py +52 -37
  227. warp/tests/test_import.py +10 -23
  228. warp/tests/test_indexedarray.py +32 -31
  229. warp/tests/test_intersect.py +18 -9
  230. warp/tests/test_large.py +141 -0
  231. warp/tests/test_launch.py +14 -41
  232. warp/tests/test_lerp.py +64 -65
  233. warp/tests/test_linear_solvers.py +154 -0
  234. warp/tests/test_lvalue.py +493 -0
  235. warp/tests/test_marching_cubes.py +12 -13
  236. warp/tests/test_mat.py +517 -2898
  237. warp/tests/test_mat_lite.py +115 -0
  238. warp/tests/test_mat_scalar_ops.py +2889 -0
  239. warp/tests/test_math.py +103 -9
  240. warp/tests/test_matmul.py +305 -69
  241. warp/tests/test_matmul_lite.py +410 -0
  242. warp/tests/test_mesh.py +71 -14
  243. warp/tests/test_mesh_query_aabb.py +41 -25
  244. warp/tests/test_mesh_query_point.py +140 -22
  245. warp/tests/test_mesh_query_ray.py +39 -22
  246. warp/tests/test_mlp.py +30 -22
  247. warp/tests/test_model.py +92 -89
  248. warp/tests/test_modules_lite.py +39 -0
  249. warp/tests/test_multigpu.py +88 -114
  250. warp/tests/test_noise.py +12 -11
  251. warp/tests/test_operators.py +16 -20
  252. warp/tests/test_options.py +11 -11
  253. warp/tests/test_pinned.py +17 -18
  254. warp/tests/test_print.py +32 -11
  255. warp/tests/test_quat.py +275 -129
  256. warp/tests/test_rand.py +18 -16
  257. warp/tests/test_reload.py +38 -34
  258. warp/tests/test_rounding.py +50 -43
  259. warp/tests/test_runlength_encode.py +168 -20
  260. warp/tests/test_smoothstep.py +9 -11
  261. warp/tests/test_snippet.py +143 -0
  262. warp/tests/test_sparse.py +261 -63
  263. warp/tests/test_spatial.py +276 -243
  264. warp/tests/test_streams.py +110 -85
  265. warp/tests/test_struct.py +268 -63
  266. warp/tests/test_tape.py +39 -21
  267. warp/tests/test_torch.py +118 -89
  268. warp/tests/test_transient_module.py +12 -13
  269. warp/tests/test_types.py +614 -0
  270. warp/tests/test_utils.py +494 -0
  271. warp/tests/test_vec.py +354 -2050
  272. warp/tests/test_vec_lite.py +73 -0
  273. warp/tests/test_vec_scalar_ops.py +2099 -0
  274. warp/tests/test_volume.py +457 -293
  275. warp/tests/test_volume_write.py +124 -134
  276. warp/tests/unittest_serial.py +35 -0
  277. warp/tests/unittest_suites.py +341 -0
  278. warp/tests/unittest_utils.py +568 -0
  279. warp/tests/unused_test_misc.py +71 -0
  280. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  281. warp/thirdparty/appdirs.py +36 -45
  282. warp/thirdparty/unittest_parallel.py +549 -0
  283. warp/torch.py +9 -6
  284. warp/types.py +1089 -366
  285. warp/utils.py +93 -387
  286. warp_lang-0.11.0.dist-info/METADATA +238 -0
  287. warp_lang-0.11.0.dist-info/RECORD +332 -0
  288. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  289. warp/tests/test_all.py +0 -219
  290. warp/tests/test_array_scan.py +0 -60
  291. warp/tests/test_base.py +0 -208
  292. warp/tests/test_unresolved_func.py +0 -7
  293. warp/tests/test_unresolved_symbol.py +0 -7
  294. warp_lang-0.10.1.dist-info/METADATA +0 -21
  295. warp_lang-0.10.1.dist-info/RECORD +0 -188
  296. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  297. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  298. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  299. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
  300. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py
@@ -0,0 +1,398 @@
+ ################################################################################
+ #
+ # Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # 1. Redistributions of source code must retain the above copyright notice, this
+ # list of conditions and the following disclaimer.
+ #
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
+ # this list of conditions and the following disclaimer in the documentation
+ # and/or other materials provided with the distribution.
+ #
+ # 3. Neither the name of the copyright holder nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ ################################################################################
+ from pycutlass import *
+ from pycutlass.c_types import get_reduction_params
+ import cutlass
+ from cuda import cuda
+ try:
+     import torch
+     torch_available = True
+ except ImportError:
+     torch_available = False
+ import numpy as np
+ from typing import Union
+ from cuda import cudart
+
+
+ class ReductionOperation:
+     pass
+
+
+ class ReductionArguments:
+     """
+     Arguments of reduction
+     """
+
+     def __init__(self, operation: ReductionOperation,
+                  problem_size: 'list[int]', partitions: int,
+                  workspace: cuda.CUdeviceptr,
+                  destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
+                  source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:
+
+         # tensor_C can be interpreted as the bias with bias=True in keyword args
+         if "bias" in kwargs.keys():
+             self.bias = kwargs["bias"]
+         else:
+             # by default, tensor_C is not bias
+             self.bias = False
+
+         self.operation = operation
+         #: pointer to the workspace
+         self.ptr_workspace = workspace
+
+         #: number of split-k partitions
+         self.partitions = partitions
+
+         if isinstance(destination, np.ndarray):
+             self.host_D = destination
+             self.destination_buffer = NumpyFrontend.argument(destination, True)
+             self.source_buffer = NumpyFrontend.argument(source, False)
+             self.ptr_destination = cuda.CUdeviceptr(
+                 self.destination_buffer.ptr)
+             self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
+         elif torch_available and isinstance(destination, torch.Tensor):
+             self.ptr_destination = TorchFrontend.argument(destination)
+             self.ptr_source = TorchFrontend.argument(source)
+         elif isinstance(destination, cuda.CUdeviceptr):
+             self.ptr_destination = destination
+             self.ptr_source = source
+         else:
+             raise TypeError("unknown type")
+
+         self.problem_size = MatrixCoord_(
+             problem_size[0], problem_size[1]
+         )
+
+         self.partition_stride = problem_size[0] * \
+             problem_size[1] * DataTypeSize[operation.C.element] // 8
+
+         if "output_op" in kwargs.keys():
+             self.output_op = kwargs['output_op']
+         else:
+             self.output_op = self.operation.epilogue_type(1.0, 0.0)
+
+         # get arguments
+         self.get_arguments()
+
+     @staticmethod
+     def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
+         if layout == cutlass.RowMajor:
+             return TensorRef2D_(int(device_ptr), extent[1])
+         else:
+             raise ValueError("unknown layout type")
+
+     def get_arguments(self):
+         ref_workspace = ReductionArguments.get_tensor_ref(
+             extent=[self.problem_size.row, self.problem_size.column],
+             device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
+         if self.bias:
+             ref_source = ReductionArguments.get_tensor_ref(
+                 extent=[0, 0],
+                 device_ptr=self.ptr_source, layout=cutlass.RowMajor)
+         else:
+             ref_source = ReductionArguments.get_tensor_ref(
+                 extent=[self.problem_size.row, self.problem_size.column],
+                 device_ptr=self.ptr_source, layout=cutlass.RowMajor)
+
+         ref_destination = ReductionArguments.get_tensor_ref(
+             extent=[self.problem_size.row, self.problem_size.column],
+             device_ptr=self.ptr_destination, layout=cutlass.RowMajor)
+
+         self.c_arguments = self.operation.argument_type(
+             self.problem_size, self.partitions,
+             self.partition_stride, ref_workspace,
+             ref_destination, ref_source,
+             self.output_op
+         )
+
+         params_ = self.operation.rt_module.get_args(
+             ctypes.byref(self.c_arguments))
+         self.host_workspace = bytearray(params_.contents)
+
+     def sync(self):
+         err, = cudart.cudaDeviceSynchronize()
+         if err != cuda.CUresult.CUDA_SUCCESS:
+             raise RuntimeError("CUDA Error %s" % str(err))
+
+         if hasattr(self, "host_D"):
+             err, = cuda.cuMemcpyDtoH(
+                 self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
+             if err != cuda.CUresult.CUDA_SUCCESS:
+                 raise RuntimeError("CUDA Error %s" % str(err))
+
+     def free(self):
+         if hasattr(self, "destination_buffer"):
+             del self.destination_buffer
+         if hasattr(self, "source_buffer"):
+             del self.source_buffer
+
+
+ class ReductionRT(ExecutableOperation):
+     """
+     ReductionRT manages the CUTLASS runtime components for reduction
+     """
+     KernelTemplate = r'''
+ extern "C"
+ __global__ void
+ ${operation_name}(${operation_name}${operation_suffix}::Params params) {
+
+   // Dynamic shared memory base pointer
+   extern __shared__ int SharedStorageBase[];
+
+   // Declare pointer to dynamic shared memory.
+   ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
+       reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
+
+   ${operation_name}${operation_suffix} op;
+
+   op(params, *shared_storage);
+ }
+ '''
+     HostTemplate = r'''
+ extern "C" {
+   // Get the size of params in bytes
+   int ${operation_name}_get_param_size(){
+     return sizeof(${operation_name}${operation_suffix}::Params);
+   }
+
+   // Get the size of dynamic shared memory in bytes
+   int ${operation_name}_shared_memory_size() {
+     return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
+   }
+
+   // Get the params as byte array
+   char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
+     char *bytes = ((char*)(params));
+     char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
+     for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
+       output[i] = bytes[i];
+
+     return output;
+   }
+ }
+ '''
+
+     def __init__(self, operation: ReductionOperation):
+         super().__init__(operation)
+
+         self.operation: ReductionOperation = operation
+         self.emitter = EmitReductionInstance('_type')
+
+         self.elements_per_access = self.operation.count
+         self.argument_type, self.epilogue_type = get_reduction_params(operation.epilogue_functor)
+         self.argtype = [ctypes.POINTER(self.argument_type)]
+
+     def emit(self):
+         return self.emitter.emit(self.operation)
+
+     def plan(self, arguments: ReductionArguments):
+         block_shape = [self.operation.shape.column(
+         ) // self.elements_per_access, self.operation.shape.row(), 1]
+         grid_shape = [
+             (arguments.problem_size.row + self.operation.shape.row() -
+              1) // self.operation.shape.row(),
+             (arguments.problem_size.column + self.operation.shape.column() -
+              1) // self.operation.shape.column(),
+             1
+         ]
+         return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)
+
+     def initialize(self):
+         err, = cuda.cuFuncSetAttribute(
+             self.kernel,
+             attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+             value=self.shared_memory_capacity)
+         if err != cuda.CUresult.CUDA_SUCCESS:
+             raise RuntimeError('Cuda Error: {}'.format(err))
+
+
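# A quick worked example of the launch arithmetic in plan() above, with
# illustrative numbers (not taken from the package): a 4x32 CTA shape,
# one element per access, and a 128x96 problem size.
shape_row, shape_col, elements_per_access = 4, 32, 1
m, n = 128, 96
block_shape = [shape_col // elements_per_access, shape_row, 1]  # [32, 4, 1]
grid_shape = [(m + shape_row - 1) // shape_row,   # ceil(128 / 4)  -> 32
              (n + shape_col - 1) // shape_col,   # ceil(96 / 32)  -> 3
              1]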
+ class ReductionOperation:
+     """
+     CUTLASS Reduction Operation
+
+     shape: shape of CTA
+     outputop: output operator
+     """
+
+     def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
+                  element_accumulator, element_workspace=None,
+                  element_compute=None, epilogue_functor=None,
+                  count: int = 1, partitions_per_stage: int = 4) -> None:
+         """ Constructor
+         """
+
+         self.shape = shape
+         #: epilogue functor (default: LinearCombination)
+         self.epilogue_functor = epilogue_functor
+         #: datatype of accumulator
+         self.element_accumulator = element_accumulator
+
+         if element_workspace is None:
+             #: datatype of workspace
+             self.element_workspace = element_accumulator
+         else:
+             #: datatype of workspace
+             self.element_workspace = element_workspace
+
+         if element_compute is None:
+             #: datatype of compute
+             self.element_compute = element_accumulator
+         else:
+             #: datatype of compute
+             self.element_compute = element_compute
+
+         #: datatype of output
+         self.element_output = C.element
+
+         #: operand C
+         self.C: TensorDescription = C
+
+         #: reduce op processing size
+         self.count: int = count
+
+         #: number of partitions to reduce per stage
+         self.partitions_per_stage: int = partitions_per_stage
+
+         self.rt_module: ReductionRT = ReductionRT(self)
+         self.argument_type = self.rt_module.argument_type
+         self.epilogue_type = self.rt_module.epilogue_type
+
+     def extended_name(self):
+         extend_name = "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}"
+
+         return SubstituteTemplate(extend_name,
+                                   {
+                                       'element_workspace': DataTypeNames[self.element_workspace],
+                                       'element_accumulator': DataTypeNames[self.element_accumulator],
+                                       'element_compute': DataTypeNames[self.element_compute],
+                                       'element_output': DataTypeNames[self.element_output]
+                                   })
+
+     def configuration_name(self):
+         '''The full procedural name indicates architecture, extended name, tile size'''
+
+         configuration_name = "cutlass_reduce_split_k_${extended_name}_${threadblock}"
+
+         threadblock = "%dx%d" % (
+             self.shape.row(),
+             self.shape.column()
+         )
+
+         return SubstituteTemplate(
+             configuration_name,
+             {
+                 'extended_name': self.extended_name(),
+                 'threadblock': threadblock
+             }
+         )
+
+     def procedural_name(self):
+         '''The full procedural name indicates architecture, extended name, tile size'''
+         return self.configuration_name()
+
+     def run(self, arguments: ReductionArguments) -> cuda.CUresult:
+         """
+         Configure and launch the CUDA kernel with input arguments
+         """
+         # get launch configuration
+         launch_config = self.rt_module.plan(arguments)
+
+         # get the host and device workspace
+         host_workspace = arguments.host_workspace
+         device_workspace = None
+
+         # launch the kernel
+         err = self.rt_module.run(
+             host_workspace, device_workspace, launch_config)
+
+         if err != cuda.CUresult.CUDA_SUCCESS:
+             raise RuntimeError('CUDA Error %s' % str(err))
+
+         return err
+
+
+ class EmitReductionInstance:
+     def __init__(self, operation_suffix='') -> None:
+         self.operation_suffix = operation_suffix
+         self.includes = [
+             "cutlass/cutlass.h",
+             "cutlass/numeric_types.h",
+             "cutlass/arch/arch.h",
+             "cutlass/arch/mma.h",
+             "cutlass/layout/matrix.h",
+             "cutlass/gemm/device/gemm.h",
+             "cutlass/gemm/device/gemm_universal_adapter.h",
+             "cutlass/gemm/kernel/default_gemm_universal.h",
+             "cutlass/reduction/kernel/reduce_split_k.h",
+             "cutlass/reduction/thread/reduction_operators.h"
+         ]
+         self.template = """
+ // Reduction kernel instance
+ using ${operation_name}_base =
+     typename cutlass::reduction::kernel::ReduceSplitK<
+         cutlass::MatrixShape<${shape_row}, ${shape_column}>,
+         ${epilogue_functor},
+         cutlass::reduction::thread::ReduceAdd<
+             ${element_accumulator},
+             ${element_output},
+             ${count}>,
+         ${partition_per_stage}>;
+
+ struct ${operation_name}${operation_suffix}:
+     public ${operation_name}_base { };
+ """
+
+     def emit(self, operation: ReductionOperation):
+         epilogue_vector_length = int(min(
+             operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+         values = {
+             'operation_name': operation.configuration_name(),
+             'operation_suffix': self.operation_suffix,
+             'shape_row': str(operation.shape.row()),
+             'shape_column': str(operation.shape.column()),
+             'epilogue_functor': operation.epilogue_functor.emit(),
+             'element_output': DataTypeTag[operation.element_output],
+             'epilogue_vector_length': str(epilogue_vector_length),
+             'element_accumulator': DataTypeTag[operation.element_accumulator],
+             'element_compute': DataTypeTag[operation.element_compute],
+             'element_workspace': DataTypeTag[operation.element_workspace],
+             'count': str(operation.count),
+             'partition_per_stage': str(operation.partitions_per_stage)
+         }
+
+         return SubstituteTemplate(self.template, values)
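
Taken together, the file follows the usual pycutlass pattern: describe the split-K reduction with a ReductionOperation, wrap the tensors in ReductionArguments, then run() and sync(). The sketch below is illustrative only: the TensorDescription and LinearCombination signatures, the CTA shape, and the workspace handling are assumptions rather than values from this diff, and a real run also needs a CUDA context and a compiled kernel module.

import numpy as np
import cutlass
from cuda import cuda
from pycutlass import *  # ReductionOperation, ReductionArguments, etc.

m, n, partitions = 128, 96, 4

# Assumed constructor signatures, shown for illustration.
C = TensorDescription(cutlass.float32, cutlass.RowMajor, alignment=4)
epilogue = LinearCombination(C.element, 1, cutlass.float32, cutlass.float32)

reduction = ReductionOperation(
    shape=cutlass.MatrixCoord(4, 32),   # CTA shape used by plan()
    C=C, element_accumulator=cutlass.float32,
    epilogue_functor=epilogue, count=1)

# Placeholder device pointer: real code allocates partitions * m * n
# accumulator elements on the device and compiles the operation first.
workspace_ptr = cuda.CUdeviceptr(0)

source = np.zeros((m, n), dtype=np.float32)       # tensor_C (or bias)
destination = np.zeros((m, n), dtype=np.float32)  # filled in by sync()

args = ReductionArguments(reduction, problem_size=[m, n],
                          partitions=partitions, workspace=workspace_ptr,
                          destination=destination, source=source)
reduction.run(args)   # plans the grid/block and launches reduce_split_k
args.sync()           # device synchronize + copy destination back to host
args.free()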
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py
@@ -0,0 +1,70 @@
+ ################################################################################
+ #
+ # Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # 1. Redistributions of source code must retain the above copyright notice, this
+ # list of conditions and the following disclaimer.
+ #
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
+ # this list of conditions and the following disclaimer in the documentation
+ # and/or other materials provided with the distribution.
+ #
+ # 3. Neither the name of the copyright holder nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ ################################################################################
+
+ from typeguard import typechecked
+ import numpy as np
+ try:
+     import torch
+     torch_available = True
+ except ImportError:
+     torch_available = False
+ from cuda import cuda
+ try:
+     import cupy as cp
+     cupy_available = True
+ except ImportError:
+     cupy_available = False
+ import cutlass
+
+
+ # @typechecked
+ class TensorRef:
+     """
+     Python Wrapper for cutlass.TensorRef
+     """
+     def __init__(self, tensor, dtype, layout) -> None:
+         if isinstance(tensor, np.ndarray):
+             ptr = cuda.CUdeviceptr(tensor.__array_interface__['data'][0])
+         elif torch_available and isinstance(tensor, torch.Tensor):
+             ptr = cuda.CUdeviceptr(tensor.data_ptr())
+         elif cupy_available and isinstance(tensor, cp.ndarray):
+             ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
+         elif isinstance(tensor, cuda.CUdeviceptr):
+             ptr = tensor
+         elif isinstance(tensor, int):
+             ptr = cuda.CUdeviceptr(tensor)
+         else:
+             raise NotImplementedError(tensor)
+
+         # dtype(0) selects the overload among different data types
+         # with the same layout
+         self.tensor_ref = cutlass.get_tensor_ref(int(ptr), dtype(0), layout)
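
The wrapper accepts numpy arrays, torch tensors, cupy arrays, raw cuda.CUdeviceptr handles, or plain integer addresses. A minimal usage sketch, assuming cutlass.float32 and cutlass.RowMajor are the dtype and layout objects used by the rest of these bindings:

import numpy as np
import cutlass
from pycutlass import TensorRef  # assumed re-export; module lives at pycutlass/tensor_ref.py

a = np.zeros((16, 8), dtype=np.float32)
ref = TensorRef(a, dtype=cutlass.float32, layout=cutlass.RowMajor)
# ref.tensor_ref is the underlying cutlass.TensorRef consumed by kernels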
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py
@@ -0,0 +1,4 @@
+ from pycutlass.test.profiler import *
+ from pycutlass.test.conv2d_testbed import *
+ from pycutlass.test.gemm_testbed import *
+ from pycutlass.test.gemm_grouped_testbed import *