warp-lang 1.9.1__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang might be problematic.
- warp/__init__.py +301 -287
- warp/__init__.pyi +794 -305
- warp/_src/__init__.py +14 -0
- warp/_src/autograd.py +1075 -0
- warp/_src/build.py +618 -0
- warp/_src/build_dll.py +640 -0
- warp/{builtins.py → _src/builtins.py} +1382 -377
- warp/_src/codegen.py +4359 -0
- warp/{config.py → _src/config.py} +178 -169
- warp/_src/constants.py +57 -0
- warp/_src/context.py +8294 -0
- warp/_src/dlpack.py +462 -0
- warp/_src/fabric.py +355 -0
- warp/_src/fem/__init__.py +14 -0
- warp/_src/fem/adaptivity.py +508 -0
- warp/_src/fem/cache.py +687 -0
- warp/_src/fem/dirichlet.py +188 -0
- warp/{fem → _src/fem}/domain.py +40 -30
- warp/_src/fem/field/__init__.py +131 -0
- warp/_src/fem/field/field.py +701 -0
- warp/{fem → _src/fem}/field/nodal_field.py +30 -15
- warp/{fem → _src/fem}/field/restriction.py +1 -1
- warp/{fem → _src/fem}/field/virtual.py +53 -27
- warp/_src/fem/geometry/__init__.py +32 -0
- warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
- warp/_src/fem/geometry/closest_point.py +97 -0
- warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
- warp/{fem → _src/fem}/geometry/element.py +32 -10
- warp/{fem → _src/fem}/geometry/geometry.py +48 -20
- warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
- warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
- warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
- warp/{fem → _src/fem}/geometry/partition.py +121 -63
- warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
- warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
- warp/{fem → _src/fem}/integrate.py +164 -158
- warp/_src/fem/linalg.py +383 -0
- warp/_src/fem/operator.py +396 -0
- warp/_src/fem/polynomial.py +229 -0
- warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
- warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
- warp/_src/fem/space/__init__.py +248 -0
- warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
- warp/_src/fem/space/basis_space.py +679 -0
- warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
- warp/{fem → _src/fem}/space/function_space.py +14 -13
- warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
- warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
- warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
- warp/{fem → _src/fem}/space/partition.py +117 -60
- warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/restriction.py +66 -33
- warp/_src/fem/space/shape/__init__.py +152 -0
- warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
- warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
- warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
- warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
- warp/_src/fem/space/topology.py +459 -0
- warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
- warp/_src/fem/types.py +112 -0
- warp/_src/fem/utils.py +486 -0
- warp/_src/jax.py +186 -0
- warp/_src/jax_experimental/__init__.py +14 -0
- warp/_src/jax_experimental/custom_call.py +387 -0
- warp/_src/jax_experimental/ffi.py +1284 -0
- warp/_src/jax_experimental/xla_ffi.py +656 -0
- warp/_src/marching_cubes.py +708 -0
- warp/_src/math.py +414 -0
- warp/_src/optim/__init__.py +14 -0
- warp/_src/optim/adam.py +163 -0
- warp/_src/optim/linear.py +1606 -0
- warp/_src/optim/sgd.py +112 -0
- warp/_src/paddle.py +406 -0
- warp/_src/render/__init__.py +14 -0
- warp/_src/render/imgui_manager.py +289 -0
- warp/_src/render/render_opengl.py +3636 -0
- warp/_src/render/render_usd.py +937 -0
- warp/_src/render/utils.py +160 -0
- warp/_src/sparse.py +2716 -0
- warp/_src/tape.py +1206 -0
- warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
- warp/_src/torch.py +391 -0
- warp/_src/types.py +5870 -0
- warp/_src/utils.py +1693 -0
- warp/autograd.py +12 -1054
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +8 -588
- warp/build_dll.py +6 -721
- warp/codegen.py +6 -4251
- warp/constants.py +6 -39
- warp/context.py +12 -8062
- warp/dlpack.py +6 -444
- warp/examples/distributed/example_jacobi_mpi.py +4 -5
- warp/examples/fem/example_adaptive_grid.py +1 -1
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +8 -8
- warp/examples/fem/example_diffusion.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_mixed_elasticity.py +2 -2
- warp/examples/fem/example_navier_stokes.py +1 -1
- warp/examples/fem/example_nonconforming_contact.py +7 -7
- warp/examples/fem/example_stokes.py +1 -1
- warp/examples/fem/example_stokes_transfer.py +1 -1
- warp/examples/fem/utils.py +2 -2
- warp/examples/interop/example_jax_callable.py +1 -1
- warp/examples/interop/example_jax_ffi_callback.py +1 -1
- warp/examples/interop/example_jax_kernel.py +1 -1
- warp/examples/tile/example_tile_mcgp.py +191 -0
- warp/fabric.py +6 -337
- warp/fem/__init__.py +159 -97
- warp/fem/adaptivity.py +7 -489
- warp/fem/cache.py +9 -648
- warp/fem/dirichlet.py +6 -184
- warp/fem/field/__init__.py +8 -109
- warp/fem/field/field.py +7 -652
- warp/fem/geometry/__init__.py +7 -18
- warp/fem/geometry/closest_point.py +11 -77
- warp/fem/linalg.py +18 -366
- warp/fem/operator.py +11 -369
- warp/fem/polynomial.py +9 -209
- warp/fem/space/__init__.py +5 -211
- warp/fem/space/basis_space.py +6 -662
- warp/fem/space/shape/__init__.py +41 -118
- warp/fem/space/topology.py +6 -437
- warp/fem/types.py +6 -81
- warp/fem/utils.py +11 -444
- warp/jax.py +8 -165
- warp/jax_experimental/__init__.py +14 -1
- warp/jax_experimental/custom_call.py +8 -365
- warp/jax_experimental/ffi.py +17 -873
- warp/jax_experimental/xla_ffi.py +5 -605
- warp/marching_cubes.py +5 -689
- warp/math.py +16 -393
- warp/native/array.h +385 -37
- warp/native/builtin.h +314 -37
- warp/native/bvh.cpp +43 -9
- warp/native/bvh.cu +62 -27
- warp/native/bvh.h +310 -309
- warp/native/clang/clang.cpp +102 -97
- warp/native/coloring.cpp +0 -1
- warp/native/crt.h +208 -0
- warp/native/exports.h +156 -0
- warp/native/hashgrid.cu +2 -0
- warp/native/intersect.h +24 -1
- warp/native/intersect_tri.h +44 -35
- warp/native/mat.h +1456 -276
- warp/native/mesh.cpp +4 -4
- warp/native/mesh.cu +4 -2
- warp/native/mesh.h +176 -61
- warp/native/quat.h +0 -52
- warp/native/scan.cu +2 -0
- warp/native/sparse.cu +7 -3
- warp/native/spatial.h +12 -0
- warp/native/tile.h +681 -89
- warp/native/tile_radix_sort.h +1 -1
- warp/native/tile_reduce.h +394 -46
- warp/native/tile_scan.h +4 -4
- warp/native/vec.h +469 -0
- warp/native/version.h +23 -0
- warp/native/volume.cpp +1 -1
- warp/native/volume.cu +1 -0
- warp/native/volume.h +1 -1
- warp/native/volume_builder.cu +2 -0
- warp/native/warp.cpp +57 -29
- warp/native/warp.cu +253 -171
- warp/native/warp.h +11 -8
- warp/optim/__init__.py +6 -3
- warp/optim/adam.py +6 -145
- warp/optim/linear.py +14 -1585
- warp/optim/sgd.py +6 -94
- warp/paddle.py +6 -388
- warp/render/__init__.py +8 -4
- warp/render/imgui_manager.py +7 -267
- warp/render/render_opengl.py +6 -3618
- warp/render/render_usd.py +6 -919
- warp/render/utils.py +6 -142
- warp/sparse.py +37 -2563
- warp/tape.py +6 -1188
- warp/tests/__main__.py +1 -1
- warp/tests/cuda/test_async.py +4 -4
- warp/tests/cuda/test_conditional_captures.py +1 -1
- warp/tests/cuda/test_multigpu.py +1 -1
- warp/tests/cuda/test_streams.py +58 -1
- warp/tests/geometry/test_bvh.py +157 -22
- warp/tests/geometry/test_marching_cubes.py +0 -1
- warp/tests/geometry/test_mesh.py +5 -3
- warp/tests/geometry/test_mesh_query_aabb.py +5 -12
- warp/tests/geometry/test_mesh_query_point.py +5 -2
- warp/tests/geometry/test_mesh_query_ray.py +15 -3
- warp/tests/geometry/test_volume_write.py +5 -5
- warp/tests/interop/test_dlpack.py +14 -14
- warp/tests/interop/test_jax.py +772 -49
- warp/tests/interop/test_paddle.py +1 -1
- warp/tests/test_adam.py +0 -1
- warp/tests/test_arithmetic.py +9 -9
- warp/tests/test_array.py +527 -100
- warp/tests/test_array_reduce.py +3 -3
- warp/tests/test_atomic.py +12 -8
- warp/tests/test_atomic_bitwise.py +209 -0
- warp/tests/test_atomic_cas.py +4 -4
- warp/tests/test_bool.py +2 -2
- warp/tests/test_builtins_resolution.py +5 -571
- warp/tests/test_codegen.py +33 -14
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_context.py +6 -6
- warp/tests/test_copy.py +242 -161
- warp/tests/test_ctypes.py +3 -3
- warp/tests/test_devices.py +24 -2
- warp/tests/test_examples.py +16 -84
- warp/tests/test_fabricarray.py +35 -35
- warp/tests/test_fast_math.py +0 -2
- warp/tests/test_fem.py +56 -10
- warp/tests/test_fixedarray.py +3 -3
- warp/tests/test_func.py +8 -5
- warp/tests/test_generics.py +1 -1
- warp/tests/test_indexedarray.py +24 -24
- warp/tests/test_intersect.py +39 -9
- warp/tests/test_large.py +1 -1
- warp/tests/test_lerp.py +3 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_map.py +35 -4
- warp/tests/test_mat.py +52 -62
- warp/tests/test_mat_constructors.py +4 -5
- warp/tests/test_mat_lite.py +1 -1
- warp/tests/test_mat_scalar_ops.py +121 -121
- warp/tests/test_math.py +34 -0
- warp/tests/test_module_aot.py +4 -4
- warp/tests/test_modules_lite.py +28 -2
- warp/tests/test_print.py +11 -11
- warp/tests/test_quat.py +93 -58
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +38 -10
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +126 -15
- warp/tests/test_spatial.py +105 -87
- warp/tests/test_special_values.py +6 -6
- warp/tests/test_static.py +7 -7
- warp/tests/test_struct.py +13 -2
- warp/tests/test_triangle_closest_point.py +48 -1
- warp/tests/test_types.py +27 -15
- warp/tests/test_utils.py +52 -52
- warp/tests/test_vec.py +29 -29
- warp/tests/test_vec_constructors.py +5 -5
- warp/tests/test_vec_scalar_ops.py +97 -97
- warp/tests/test_version.py +75 -0
- warp/tests/tile/test_tile.py +178 -0
- warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
- warp/tests/tile/test_tile_cholesky.py +7 -4
- warp/tests/tile/test_tile_load.py +26 -2
- warp/tests/tile/test_tile_mathdx.py +3 -3
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +2 -4
- warp/tests/tile/test_tile_reduce.py +214 -13
- warp/tests/unittest_suites.py +6 -14
- warp/tests/unittest_utils.py +10 -9
- warp/tests/walkthrough_debug.py +3 -1
- warp/torch.py +6 -373
- warp/types.py +29 -5764
- warp/utils.py +10 -1659
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
- warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
- warp/examples/assets/cartpole.urdf +0 -110
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/nv_ant.xml +0 -92
- warp/examples/assets/nv_humanoid.xml +0 -183
- warp/examples/assets/quadruped.urdf +0 -268
- warp/examples/optim/example_bounce.py +0 -266
- warp/examples/optim/example_cloth_throw.py +0 -228
- warp/examples/optim/example_drone.py +0 -870
- warp/examples/optim/example_inverse_kinematics.py +0 -182
- warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
- warp/examples/optim/example_softbody_properties.py +0 -400
- warp/examples/optim/example_spring_cage.py +0 -245
- warp/examples/optim/example_trajectory.py +0 -227
- warp/examples/sim/example_cartpole.py +0 -143
- warp/examples/sim/example_cloth.py +0 -225
- warp/examples/sim/example_cloth_self_contact.py +0 -316
- warp/examples/sim/example_granular.py +0 -130
- warp/examples/sim/example_granular_collision_sdf.py +0 -202
- warp/examples/sim/example_jacobian_ik.py +0 -244
- warp/examples/sim/example_particle_chain.py +0 -124
- warp/examples/sim/example_quadruped.py +0 -203
- warp/examples/sim/example_rigid_chain.py +0 -203
- warp/examples/sim/example_rigid_contact.py +0 -195
- warp/examples/sim/example_rigid_force.py +0 -133
- warp/examples/sim/example_rigid_gyroscopic.py +0 -115
- warp/examples/sim/example_rigid_soft_contact.py +0 -140
- warp/examples/sim/example_soft_body.py +0 -196
- warp/examples/tile/example_tile_walker.py +0 -327
- warp/sim/__init__.py +0 -74
- warp/sim/articulation.py +0 -793
- warp/sim/collide.py +0 -2570
- warp/sim/graph_coloring.py +0 -307
- warp/sim/import_mjcf.py +0 -791
- warp/sim/import_snu.py +0 -227
- warp/sim/import_urdf.py +0 -579
- warp/sim/import_usd.py +0 -898
- warp/sim/inertia.py +0 -357
- warp/sim/integrator.py +0 -245
- warp/sim/integrator_euler.py +0 -2000
- warp/sim/integrator_featherstone.py +0 -2101
- warp/sim/integrator_vbd.py +0 -2487
- warp/sim/integrator_xpbd.py +0 -3295
- warp/sim/model.py +0 -4821
- warp/sim/particles.py +0 -121
- warp/sim/render.py +0 -431
- warp/sim/utils.py +0 -431
- warp/tests/sim/disabled_kinematics.py +0 -244
- warp/tests/sim/test_cloth.py +0 -863
- warp/tests/sim/test_collision.py +0 -743
- warp/tests/sim/test_coloring.py +0 -347
- warp/tests/sim/test_inertia.py +0 -161
- warp/tests/sim/test_model.py +0 -226
- warp/tests/sim/test_sim_grad.py +0 -287
- warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
- warp/tests/sim/test_sim_kinematics.py +0 -98
- warp/thirdparty/__init__.py +0 -0
- warp_lang-1.9.1.dist-info/RECORD +0 -456
- /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
- /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
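Nearly everything in this release flows from one restructuring: Warp's implementation modules move into a private warp/_src package (autograd, build, codegen, context, sparse, types, and the whole fem tree above), while the old top-level modules shrink to small stubs (warp/types.py drops from roughly 5,800 lines to a few dozen, warp/context.py similarly), and the warp.sim package, its tests, and the sim/optim example scripts are removed outright. The test diffs below show what the move means for code that reached into Warp internals; here is a minimal before/after sketch, assuming only the renamed paths that actually appear in those hunks:

import warp as wp

wp.init()

# The public API (wp.array, wp.kernel, wp.launch, ...) is unaffected;
# only internal module paths change.
# 1.9.x: wp.context.runtime.core.wp_is_mathdx_enabled()
# 1.10+: wp._src.context.runtime.core.wp_is_mathdx_enabled()
mathdx_enabled = wp._src.context.runtime.core.wp_is_mathdx_enabled()

# 1.9.x: wp.types.warp_type_to_np_dtype[wp.float16]
# 1.10+: wp._src.types.warp_type_to_np_dtype[wp.float16]
np_dtype = wp._src.types.warp_type_to_np_dtype[wp.float16]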
warp/tests/tile/test_tile_atomic_bitwise.py
ADDED
@@ -0,0 +1,403 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_scalar_kernel(
+    a: wp.array(dtype=wp.uint32), b: wp.array(dtype=wp.uint32), c: wp.array(dtype=wp.uint32), op_type: int
+):
+    word_idx, bit_idx = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    s = wp.tile_zeros(shape=1, dtype=wp.uint32)
+    # write to tile first, then write only once to the array
+    s[0] = a[word_idx]
+    if op_type < 3:
+        bit_mask = wp.uint32(1) << wp.uint32(bit_idx)
+        if op_type == 0:
+            s[0] &= (b[word_idx] & bit_mask) | ~bit_mask
+        elif op_type == 1:
+            s[0] |= b[word_idx] & bit_mask
+        elif op_type == 2:
+            s[0] ^= b[word_idx] & bit_mask
+    else:
+        # inter-tile operations
+        s_bit_mask = wp.tile_zeros(shape=32, dtype=wp.uint32)
+        s_bit_mask[(bit_idx + 1) % 32] = wp.uint32(1) << wp.uint32((bit_idx + 1) % 32)
+        if op_type == 3:
+            s[0] &= (b[word_idx] & s_bit_mask[bit_idx]) | ~s_bit_mask[bit_idx]
+        elif op_type == 4:
+            s[0] |= b[word_idx] & s_bit_mask[bit_idx]
+        elif op_type == 5:
+            s[0] ^= b[word_idx] & s_bit_mask[bit_idx]
+    c[word_idx] = s[0]
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_scalar_tilewise_kernel(
+    a: wp.array(dtype=wp.uint32), b: wp.array(dtype=wp.uint32), c: wp.array(dtype=wp.uint32), op_type: int
+):
+    batch_idx, _ = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    # Each tile is responsible for a batch of 32 elements
+    s1 = wp.tile_load(a, shape=32, offset=batch_idx * 32)
+    s2 = wp.tile_load(b, shape=32, offset=batch_idx * 32)
+    # inter-tile operations (batch-wise)
+    if op_type < 9:
+        if op_type == 6:
+            s1 &= s2
+        elif op_type == 7:
+            s1 |= s2
+        elif op_type == 8:
+            s1 ^= s2
+        wp.tile_store(c, s1, offset=batch_idx * 32)
+    else:
+        if op_type == 9:
+            s3 = s1 & s2
+        elif op_type == 10:
+            s3 = s1 | s2
+        elif op_type == 11:
+            s3 = s1 ^ s2
+        wp.tile_store(c, s3, offset=batch_idx * 32)
+
+
+def test_tile_atomic_bitwise_scalar(test, device):
+    n = 1024
+    rng = np.random.default_rng(42)
+
+    a = rng.integers(0, np.iinfo(np.uint32).max, size=n, dtype=np.uint32)
+    b = rng.integers(0, np.iinfo(np.uint32).max, size=n, dtype=np.uint32)
+
+    expected_and = a & b
+    expected_or = a | b
+    expected_xor = a ^ b
+
+    with wp.ScopedDevice(device):
+        a_wp = wp.array(a, dtype=wp.uint32, device=device)
+        b_wp = wp.array(b, dtype=wp.uint32, device=device)
+        c_wp = wp.zeros(shape=n, dtype=wp.uint32, device=device)
+
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 0], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 1], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 2], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 3], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 4], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_scalar_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 5], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 6], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 7], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 8], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 9], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 10], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_scalar_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 11], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_vector_kernel(
+    a: wp.array(dtype=wp.vec3ui), b: wp.array(dtype=wp.vec3ui), c: wp.array(dtype=wp.vec3ui), op_type: int
+):
+    word_idx, bit_idx = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    s = wp.tile_zeros(shape=1, dtype=wp.vec3ui)
+    # write to tile first, then write only once to the array
+    s[0] = a[word_idx]
+    if op_type < 3:
+        bit_mask = wp.vec3ui(wp.uint32(1)) << wp.vec3ui(wp.uint32(bit_idx))
+        if op_type == 0:
+            s[0] &= (b[word_idx] & bit_mask) | ~bit_mask
+        elif op_type == 1:
+            s[0] |= b[word_idx] & bit_mask
+        elif op_type == 2:
+            s[0] ^= b[word_idx] & bit_mask
+    else:
+        # inter-tile operations
+        s_bit_mask = wp.tile_zeros(shape=32, dtype=wp.vec3ui)
+        s_bit_mask[(bit_idx + 1) % 32] = wp.vec3ui(wp.uint32(1) << wp.uint32((bit_idx + 1) % 32))
+        if op_type == 3:
+            s[0] &= (b[word_idx] & s_bit_mask[bit_idx]) | ~s_bit_mask[bit_idx]
+        elif op_type == 4:
+            s[0] |= b[word_idx] & s_bit_mask[bit_idx]
+        elif op_type == 5:
+            s[0] ^= b[word_idx] & s_bit_mask[bit_idx]
+    c[word_idx] = s[0]
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_vector_tilewise_kernel(
+    a: wp.array(dtype=wp.vec3ui), b: wp.array(dtype=wp.vec3ui), c: wp.array(dtype=wp.vec3ui), op_type: int
+):
+    batch_idx, _ = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    # Each tile is responsible for a batch of 32 elements
+    s1 = wp.tile_load(a, shape=32, offset=batch_idx * 32)
+    s2 = wp.tile_load(b, shape=32, offset=batch_idx * 32)
+    # inter-tile operations (batch-wise)
+    if op_type < 9:
+        if op_type == 6:
+            s1 &= s2
+        elif op_type == 7:
+            s1 |= s2
+        elif op_type == 8:
+            s1 ^= s2
+        wp.tile_store(c, s1, offset=batch_idx * 32)
+    else:
+        if op_type == 9:
+            s3 = s1 & s2
+        elif op_type == 10:
+            s3 = s1 | s2
+        elif op_type == 11:
+            s3 = s1 ^ s2
+        wp.tile_store(c, s3, offset=batch_idx * 32)
+
+
+def test_tile_atomic_bitwise_vector(test, device):
+    n = 1024
+    rng = np.random.default_rng(42)
+
+    a = rng.integers(0, np.iinfo(np.uint32).max, size=(n, 3), dtype=np.uint32)
+    b = rng.integers(0, np.iinfo(np.uint32).max, size=(n, 3), dtype=np.uint32)
+
+    expected_and = a & b
+    expected_or = a | b
+    expected_xor = a ^ b
+
+    with wp.ScopedDevice(device):
+        a_wp = wp.array(a, dtype=wp.vec3ui, device=device)
+        b_wp = wp.array(b, dtype=wp.vec3ui, device=device)
+        c_wp = wp.zeros(shape=n, dtype=wp.vec3ui, device=device)
+
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 0], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 1], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 2], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 3], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 4], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_vector_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 5], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 6], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 7], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 8], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 9], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 10], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_vector_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 11], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+
+mat33ui = wp._src.types.matrix(shape=(3, 3), dtype=wp.uint32)
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_matrix_kernel(
+    a: wp.array(dtype=mat33ui), b: wp.array(dtype=mat33ui), c: wp.array(dtype=mat33ui), op_type: int
+):
+    word_idx, bit_idx = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    s = wp.tile_zeros(shape=1, dtype=mat33ui)
+    # write to tile first, then write only once to the array
+    s[0] = a[word_idx]
+    if op_type < 3:
+        bit_mask = mat33ui(wp.uint32(1)) << mat33ui(wp.uint32(bit_idx))
+        if op_type == 0:
+            s[0] &= (b[word_idx] & bit_mask) | ~bit_mask
+        elif op_type == 1:
+            s[0] |= b[word_idx] & bit_mask
+        elif op_type == 2:
+            s[0] ^= b[word_idx] & bit_mask
+    else:
+        # inter-tile operations
+        s_bit_mask = wp.tile_zeros(shape=32, dtype=mat33ui)
+        s_bit_mask[(bit_idx + 1) % 32] = mat33ui(wp.uint32(1) << wp.uint32((bit_idx + 1) % 32))
+        if op_type == 3:
+            s[0] &= (b[word_idx] & s_bit_mask[bit_idx]) | ~s_bit_mask[bit_idx]
+        elif op_type == 4:
+            s[0] |= b[word_idx] & s_bit_mask[bit_idx]
+        elif op_type == 5:
+            s[0] ^= b[word_idx] & s_bit_mask[bit_idx]
+    c[word_idx] = s[0]
+
+
+@wp.kernel
+def test_tile_atomic_bitwise_matrix_tilewise_kernel(
+    a: wp.array(dtype=mat33ui), b: wp.array(dtype=mat33ui), c: wp.array(dtype=mat33ui), op_type: int
+):
+    batch_idx, _ = wp.tid()
+    block_dim = wp.block_dim()
+    assert block_dim == 32
+    # Each tile is responsible for a batch of 32 elements
+    s1 = wp.tile_load(a, shape=32, offset=batch_idx * 32)
+    s2 = wp.tile_load(b, shape=32, offset=batch_idx * 32)
+    # inter-tile operations (batch-wise)
+    if op_type < 9:
+        if op_type == 6:
+            s1 &= s2
+        elif op_type == 7:
+            s1 |= s2
+        elif op_type == 8:
+            s1 ^= s2
+        wp.tile_store(c, s1, offset=batch_idx * 32)
+    else:
+        if op_type == 9:
+            s3 = s1 & s2
+        elif op_type == 10:
+            s3 = s1 | s2
+        elif op_type == 11:
+            s3 = s1 ^ s2
+        wp.tile_store(c, s3, offset=batch_idx * 32)
+
+
+def test_tile_atomic_bitwise_matrix(test, device):
+    n = 1024
+    rng = np.random.default_rng(42)
+
+    a = rng.integers(0, np.iinfo(np.uint32).max, size=(n, 3, 3), dtype=np.uint32)
+    b = rng.integers(0, np.iinfo(np.uint32).max, size=(n, 3, 3), dtype=np.uint32)
+
+    expected_and = a & b
+    expected_or = a | b
+    expected_xor = a ^ b
+
+    with wp.ScopedDevice(device):
+        a_wp = wp.array(a, dtype=mat33ui, device=device)
+        b_wp = wp.array(b, dtype=mat33ui, device=device)
+        c_wp = wp.zeros(shape=n, dtype=mat33ui, device=device)
+
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 0], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 1], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 2], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 3], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 4], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(test_tile_atomic_bitwise_matrix_kernel, dim=n, inputs=[a_wp, b_wp, c_wp, 5], block_dim=32)
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 6], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 7], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 8], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 9], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_and)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 10], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_or)
+        wp.launch_tiled(
+            test_tile_atomic_bitwise_matrix_tilewise_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp, 11], block_dim=32
+        )
+        assert_np_equal(c_wp.numpy(), expected_xor)
+
+
+devices = get_cuda_test_devices()
+
+
+class TestTileAtomicBitwise(unittest.TestCase):
+    pass
+
+
+add_function_test(
+    TestTileAtomicBitwise,
+    "test_tile_atomic_bitwise_scalar",
+    test_tile_atomic_bitwise_scalar,
+    devices=devices,
+)
+
+add_function_test(
+    TestTileAtomicBitwise,
+    "test_tile_atomic_bitwise_vector",
+    test_tile_atomic_bitwise_vector,
+    devices=devices,
+)
+
+add_function_test(
+    TestTileAtomicBitwise,
+    "test_tile_atomic_bitwise_matrix",
+    test_tile_atomic_bitwise_matrix,
+    devices=devices,
+)
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2)
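The new test file above exercises bitwise operators (&, |, ^ and their in-place forms) on tiles of scalar, vector, and matrix elements, with an op_type switch selecting each variant. Stripped of the test harness, the user-facing pattern is small; the following is a sketch distilled from the tilewise kernels above (the 32-element batches and 32-thread blocks are carried over from the test, not a requirement):

import warp as wp

@wp.kernel
def tile_and_kernel(a: wp.array(dtype=wp.uint32), b: wp.array(dtype=wp.uint32), c: wp.array(dtype=wp.uint32)):
    batch_idx, _ = wp.tid()
    # each 32-thread block cooperatively loads one 32-element tile per input
    s1 = wp.tile_load(a, shape=32, offset=batch_idx * 32)
    s2 = wp.tile_load(b, shape=32, offset=batch_idx * 32)
    # element-wise bitwise AND between tiles; |= and ^= work the same way
    s1 &= s2
    wp.tile_store(c, s1, offset=batch_idx * 32)

# one tile per 32-element batch, e.g. for arrays of length n:
# wp.launch_tiled(tile_and_kernel, dim=n // 32, inputs=[a_wp, b_wp, c_wp], block_dim=32)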
warp/tests/tile/test_tile_cholesky.py
CHANGED
@@ -20,7 +20,7 @@ import numpy as np
 import warp as wp
 from warp.tests.unittest_utils import *
 
-wp.init()  # For wp.context.runtime.core.wp_is_mathdx_enabled()
+wp.init()  # For wp._src.context.runtime.core.wp_is_mathdx_enabled()
 
 TILE_M = wp.constant(8)
 TILE_N = wp.constant(4)
@@ -490,7 +490,7 @@ def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float),
 
 
 def test_tile_cholesky_singular_matrices(test, device):
-    if not wp.context.runtime.core.wp_is_mathdx_enabled():
+    if not wp._src.context.runtime.core.wp_is_mathdx_enabled():
         test.skipTest("MathDx is not enabled")
 
     rng = np.random.default_rng(42)
@@ -527,8 +527,11 @@ cuda_devices = get_cuda_test_devices()
 
 
 @unittest.skipUnless(
-    not wp.context.runtime.core.wp_is_mathdx_enabled()
-    or (
+    not wp._src.context.runtime.core.wp_is_mathdx_enabled()
+    or (
+        wp._src.context.runtime.core.wp_is_mathdx_enabled()
+        and wp._src.context.runtime.core.wp_cuda_toolkit_version() >= 12060
+    ),
     "MathDx is not enabled or is enabled but CUDA toolkit version is less than 12.6",
 )
 class TestTileCholesky(unittest.TestCase):
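The rewritten skipUnless gate above is easier to read flattened out: the Cholesky tile tests now run either when MathDx is disabled (individual tests then skip themselves with "MathDx is not enabled") or when MathDx is enabled and the CUDA toolkit is at least 12.6. A sketch of the equivalent predicate:

mathdx = wp._src.context.runtime.core.wp_is_mathdx_enabled()
ctk = wp._src.context.runtime.core.wp_cuda_toolkit_version()  # e.g. 12060 for 12.6

run_suite = (not mathdx) or (mathdx and ctk >= 12060)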
warp/tests/tile/test_tile_load.py
CHANGED
@@ -40,6 +40,7 @@ def tile_load_1d_kernel(
     input: wp.array1d(dtype=float),
     out_full: wp.array1d(dtype=float),
     out_padded: wp.array1d(dtype=float),
+    out_sliced: wp.array1d(dtype=float),
     out_offset: wp.array1d(dtype=float),
 ):
     full0 = wp.tile_load(input, TILE_M)
@@ -50,8 +51,13 @@ def tile_load_1d_kernel(
     padded1 = wp.tile_load(input, shape=TILE_M, offset=TILE_OFFSET)
     padded2 = wp.tile_load(input, shape=(TILE_M,), offset=(TILE_OFFSET,))
 
+    sliced0 = wp.tile_load(input[::2], TILE_M)
+    sliced1 = wp.tile_load(input[::2], shape=TILE_M)
+    sliced2 = wp.tile_load(input[::2], shape=(TILE_M,))
+
     wp.tile_store(out_full, full0)
     wp.tile_store(out_padded, padded0)
+    wp.tile_store(out_sliced, sliced0)
     wp.tile_store(out_offset, full0, offset=(TILE_OFFSET,))
 
 
@@ -60,13 +66,16 @@ def tile_load_2d_kernel(
     input: wp.array2d(dtype=float),
     out_full: wp.array2d(dtype=float),
     out_padded: wp.array2d(dtype=float),
+    out_sliced: wp.array2d(dtype=float),
     out_offset: wp.array2d(dtype=float),
 ):
     full0 = wp.tile_load(input, shape=(TILE_M, TILE_N))
     padded0 = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(TILE_OFFSET, TILE_OFFSET))
+    sliced0 = wp.tile_load(input[::2, ::2], shape=(TILE_M, TILE_N))
 
     wp.tile_store(out_full, full0)
     wp.tile_store(out_padded, padded0)
+    wp.tile_store(out_sliced, sliced0)
     wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET))
 
 
@@ -75,13 +84,16 @@ def tile_load_3d_kernel(
     input: wp.array3d(dtype=float),
     out_full: wp.array3d(dtype=float),
     out_padded: wp.array3d(dtype=float),
+    out_sliced: wp.array3d(dtype=float),
     out_offset: wp.array3d(dtype=float),
 ):
     full0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O))
     padded0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
+    sliced0 = wp.tile_load(input[::2, ::2, ::2], shape=(TILE_M, TILE_N, TILE_O))
 
     wp.tile_store(out_full, full0)
     wp.tile_store(out_padded, padded0)
+    wp.tile_store(out_sliced, sliced0)
     wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
 
 
@@ -90,15 +102,18 @@ def tile_load_4d_kernel(
     input: wp.array4d(dtype=float),
     out_full: wp.array4d(dtype=float),
     out_padded: wp.array4d(dtype=float),
+    out_sliced: wp.array4d(dtype=float),
     out_offset: wp.array4d(dtype=float),
 ):
     full0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O, TILE_P))
     padded0 = wp.tile_load(
         input, shape=(TILE_M, TILE_N, TILE_O, TILE_P), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET)
     )
+    sliced0 = wp.tile_load(input[::2, ::2, ::2, ::2], shape=(TILE_M, TILE_N, TILE_O, TILE_P))
 
     wp.tile_store(out_full, full0)
     wp.tile_store(out_padded, padded0)
+    wp.tile_store(out_sliced, sliced0)
     wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
 
 
@@ -112,13 +127,14 @@ def test_tile_load(kernel, ndim):
     input = wp.array(rng.random(shape), dtype=float, requires_grad=True, device=device)
     output_full = wp.zeros(shape, dtype=float, device=device)
     output_padded = wp.zeros(shape, dtype=float, device=device)
+    output_sliced = wp.zeros(shape, dtype=float, device=device)
     output_offset = wp.zeros(shape, dtype=float, device=device)
 
     with wp.Tape() as tape:
         wp.launch_tiled(
             kernel,
             dim=[1],
-            inputs=[input, output_full, output_padded, output_offset],
+            inputs=[input, output_full, output_padded, output_sliced, output_offset],
             block_dim=TILE_DIM,
             device=device,
         )
@@ -134,8 +150,16 @@ def test_tile_load(kernel, ndim):
     ref_offset = np.zeros_like(ref_full)
     ref_offset[src_slice] = ref_full[dest_slice]
 
+    # construct a slice for the source/dest sliced arrays
+    src_slice = tuple(slice(0, dim, 2) for dim in shape)
+    dest_slice = tuple(slice(0, (dim + 1) // 2) for dim in shape)
+
+    ref_sliced = np.zeros_like(ref_full)
+    ref_sliced[dest_slice] = ref_full[src_slice]
+
     assert_np_equal(output_full.numpy(), ref_full)
     assert_np_equal(output_padded.numpy(), ref_padded)
+    assert_np_equal(output_sliced.numpy(), ref_sliced)
     assert_np_equal(output_offset.numpy(), ref_offset)
 
     output_full.grad = wp.ones_like(output_full)
@@ -570,7 +594,7 @@ def test_tile_assign(kernel, ndim):
     input = wp.array(rng.random(shape), dtype=float, requires_grad=True, device=device)
     output = wp.zeros_like(input)
 
-    with wp.Tape()
+    with wp.Tape():
         wp.launch(
             kernel,
             dim=shape,
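Beyond the out_sliced plumbing, the substantive addition in test_tile_load.py is coverage for loading tiles from strided array views: wp.tile_load now accepts an in-kernel slice such as input[::2], in one through four dimensions. A minimal sketch of the 1D pattern, reusing the test's TILE_M-style constant:

import warp as wp

TILE_M = wp.constant(8)

@wp.kernel
def strided_load_kernel(src: wp.array1d(dtype=float), dst: wp.array1d(dtype=float)):
    # gather every other element of src into one tile, then store it densely
    t = wp.tile_load(src[::2], shape=TILE_M)
    wp.tile_store(dst, t)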
warp/tests/tile/test_tile_mathdx.py
CHANGED
@@ -21,7 +21,7 @@ import numpy as np
 import warp as wp
 from warp.tests.unittest_utils import *
 
-wp.init()  # For wp.context.runtime.core.wp_is_mathdx_enabled()
+wp.init()  # For wp._src.context.runtime.core.wp_is_mathdx_enabled()
 
 TILE_M = wp.constant(8)
 TILE_N = wp.constant(4)
@@ -92,7 +92,7 @@ def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dt
     wp.tile_store(gy, xy)
 
 
-@unittest.skipUnless(wp.context.runtime.core.wp_is_mathdx_enabled(), "Warp was not built with MathDx support")
+@unittest.skipUnless(wp._src.context.runtime.core.wp_is_mathdx_enabled(), "Warp was not built with MathDx support")
 def test_tile_math_fft(test, device, wp_dtype):
     np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype]
     np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype]
@@ -113,7 +113,7 @@ def test_tile_math_fft(test, device, wp_dtype):
     X_c64 = X.view(np_cplx_dtype).reshape(fft_size, fft_size)
     Y_c64 = np.fft.fft(X_c64, axis=-1)
 
-    with wp.Tape()
+    with wp.Tape():
         wp.launch_tiled(kernel, dim=[1, 1], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device)
 
     Y_wp_c64 = Y_wp.numpy().view(np_cplx_dtype).reshape(fft_size, fft_size)
warp/tests/tile/test_tile_matmul.py
CHANGED
@@ -60,7 +60,7 @@ def test_tile_grouped_gemm(test, device):
     B_wp = wp.array(B, requires_grad=True, device=device)
     C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device)
 
-    with wp.Tape()
+    with wp.Tape():
         wp.launch_tiled(
             tile_grouped_gemm, dim=[batch_count], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device
         )
warp/tests/tile/test_tile_mlp.py
CHANGED
@@ -43,7 +43,7 @@ def create_array(rng, dim_in, dim_hid, dtype=float):
 def test_multi_layer_nn(test, device):
     import torch as tc
 
-    if device.is_cuda and not wp.context.runtime.core.wp_is_mathdx_enabled():
+    if device.is_cuda and not wp._src.context.runtime.core.wp_is_mathdx_enabled():
         test.skipTest("Skipping test on CUDA device without MathDx (tolerance)")
 
     NUM_FREQ = wp.constant(8)
@@ -63,7 +63,7 @@ def test_multi_layer_nn(test, device):
     NUM_THREADS = 32
 
     dtype = wp.float16
-    npdtype = wp.types.warp_type_to_np_dtype[dtype]
+    npdtype = wp._src.types.warp_type_to_np_dtype[dtype]
 
     @wp.func
     def relu(x: dtype):
@@ -188,7 +188,6 @@ def test_multi_layer_nn(test, device):
     optimizer_inputs = [p.flatten() for p in params]
     optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01)
 
-    num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE)
     max_epochs = 30
 
     # create randomized batch indices
@@ -288,7 +287,6 @@ def test_single_layer_nn(test, device):
     import torch as tc
 
     DIM_IN = 8
-    DIM_HID = 32
     DIM_OUT = 16
 
     NUM_BLOCKS = 56