warp-lang 0.10.1-py3-none-win_amd64.whl → 0.11.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic; see the registry's advisory page for more details.

Files changed (300)
  1. warp/__init__.py +10 -4
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +5 -3
  6. warp/build_dll.py +29 -9
  7. warp/builtins.py +868 -507
  8. warp/codegen.py +1074 -638
  9. warp/config.py +3 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +715 -222
  12. warp/fabric.py +326 -0
  13. warp/fem/__init__.py +27 -0
  14. warp/fem/cache.py +389 -0
  15. warp/fem/dirichlet.py +181 -0
  16. warp/fem/domain.py +263 -0
  17. warp/fem/field/__init__.py +101 -0
  18. warp/fem/field/field.py +149 -0
  19. warp/fem/field/nodal_field.py +299 -0
  20. warp/fem/field/restriction.py +21 -0
  21. warp/fem/field/test.py +181 -0
  22. warp/fem/field/trial.py +183 -0
  23. warp/fem/geometry/__init__.py +19 -0
  24. warp/fem/geometry/closest_point.py +70 -0
  25. warp/fem/geometry/deformed_geometry.py +271 -0
  26. warp/fem/geometry/element.py +744 -0
  27. warp/fem/geometry/geometry.py +186 -0
  28. warp/fem/geometry/grid_2d.py +373 -0
  29. warp/fem/geometry/grid_3d.py +435 -0
  30. warp/fem/geometry/hexmesh.py +953 -0
  31. warp/fem/geometry/partition.py +376 -0
  32. warp/fem/geometry/quadmesh_2d.py +532 -0
  33. warp/fem/geometry/tetmesh.py +840 -0
  34. warp/fem/geometry/trimesh_2d.py +577 -0
  35. warp/fem/integrate.py +1616 -0
  36. warp/fem/operator.py +191 -0
  37. warp/fem/polynomial.py +213 -0
  38. warp/fem/quadrature/__init__.py +2 -0
  39. warp/fem/quadrature/pic_quadrature.py +245 -0
  40. warp/fem/quadrature/quadrature.py +294 -0
  41. warp/fem/space/__init__.py +292 -0
  42. warp/fem/space/basis_space.py +489 -0
  43. warp/fem/space/collocated_function_space.py +105 -0
  44. warp/fem/space/dof_mapper.py +236 -0
  45. warp/fem/space/function_space.py +145 -0
  46. warp/fem/space/grid_2d_function_space.py +267 -0
  47. warp/fem/space/grid_3d_function_space.py +306 -0
  48. warp/fem/space/hexmesh_function_space.py +352 -0
  49. warp/fem/space/partition.py +350 -0
  50. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  51. warp/fem/space/restriction.py +160 -0
  52. warp/fem/space/shape/__init__.py +15 -0
  53. warp/fem/space/shape/cube_shape_function.py +738 -0
  54. warp/fem/space/shape/shape_function.py +103 -0
  55. warp/fem/space/shape/square_shape_function.py +611 -0
  56. warp/fem/space/shape/tet_shape_function.py +567 -0
  57. warp/fem/space/shape/triangle_shape_function.py +429 -0
  58. warp/fem/space/tetmesh_function_space.py +292 -0
  59. warp/fem/space/topology.py +295 -0
  60. warp/fem/space/trimesh_2d_function_space.py +221 -0
  61. warp/fem/types.py +77 -0
  62. warp/fem/utils.py +495 -0
  63. warp/native/array.h +147 -44
  64. warp/native/builtin.h +122 -149
  65. warp/native/bvh.cpp +73 -325
  66. warp/native/bvh.cu +406 -23
  67. warp/native/bvh.h +34 -43
  68. warp/native/clang/clang.cpp +13 -8
  69. warp/native/crt.h +2 -0
  70. warp/native/cuda_crt.h +5 -0
  71. warp/native/cuda_util.cpp +15 -3
  72. warp/native/cuda_util.h +3 -1
  73. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  74. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  75. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  76. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  77. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  78. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  79. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  80. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  133. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  134. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  135. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  136. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  137. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  138. warp/native/cutlass_gemm.cu +5 -3
  139. warp/native/exports.h +1240 -952
  140. warp/native/fabric.h +228 -0
  141. warp/native/hashgrid.cpp +4 -4
  142. warp/native/hashgrid.h +22 -2
  143. warp/native/intersect.h +22 -7
  144. warp/native/intersect_adj.h +8 -8
  145. warp/native/intersect_tri.h +1 -1
  146. warp/native/marching.cu +157 -161
  147. warp/native/mat.h +80 -19
  148. warp/native/matnn.h +2 -2
  149. warp/native/mesh.cpp +33 -108
  150. warp/native/mesh.cu +114 -23
  151. warp/native/mesh.h +446 -46
  152. warp/native/noise.h +272 -329
  153. warp/native/quat.h +51 -8
  154. warp/native/rand.h +45 -35
  155. warp/native/range.h +6 -2
  156. warp/native/reduce.cpp +1 -1
  157. warp/native/reduce.cu +10 -12
  158. warp/native/runlength_encode.cu +6 -10
  159. warp/native/scan.cu +8 -11
  160. warp/native/sparse.cpp +4 -4
  161. warp/native/sparse.cu +164 -154
  162. warp/native/spatial.h +2 -2
  163. warp/native/temp_buffer.h +14 -30
  164. warp/native/vec.h +107 -23
  165. warp/native/volume.h +120 -0
  166. warp/native/warp.cpp +560 -30
  167. warp/native/warp.cu +431 -44
  168. warp/native/warp.h +13 -4
  169. warp/optim/__init__.py +1 -0
  170. warp/optim/linear.py +922 -0
  171. warp/optim/sgd.py +92 -0
  172. warp/render/render_opengl.py +335 -119
  173. warp/render/render_usd.py +11 -11
  174. warp/sim/__init__.py +2 -2
  175. warp/sim/articulation.py +385 -185
  176. warp/sim/collide.py +8 -0
  177. warp/sim/import_mjcf.py +297 -106
  178. warp/sim/import_urdf.py +389 -210
  179. warp/sim/import_usd.py +198 -97
  180. warp/sim/inertia.py +17 -18
  181. warp/sim/integrator_euler.py +14 -8
  182. warp/sim/integrator_xpbd.py +158 -16
  183. warp/sim/model.py +795 -291
  184. warp/sim/render.py +3 -3
  185. warp/sim/utils.py +3 -0
  186. warp/sparse.py +640 -150
  187. warp/stubs.py +606 -267
  188. warp/tape.py +61 -10
  189. warp/tests/__main__.py +3 -6
  190. warp/tests/assets/curlnoise_golden.npy +0 -0
  191. warp/tests/assets/pnoise_golden.npy +0 -0
  192. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  193. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  194. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  195. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  196. warp/tests/aux_test_unresolved_func.py +14 -0
  197. warp/tests/aux_test_unresolved_symbol.py +14 -0
  198. warp/tests/disabled_kinematics.py +239 -0
  199. warp/tests/run_coverage_serial.py +31 -0
  200. warp/tests/test_adam.py +103 -106
  201. warp/tests/test_arithmetic.py +128 -74
  202. warp/tests/test_array.py +212 -97
  203. warp/tests/test_array_reduce.py +57 -23
  204. warp/tests/test_atomic.py +64 -28
  205. warp/tests/test_bool.py +99 -0
  206. warp/tests/test_builtins_resolution.py +1292 -0
  207. warp/tests/test_bvh.py +42 -18
  208. warp/tests/test_closest_point_edge_edge.py +54 -57
  209. warp/tests/test_codegen.py +208 -130
  210. warp/tests/test_compile_consts.py +28 -20
  211. warp/tests/test_conditional.py +108 -24
  212. warp/tests/test_copy.py +10 -12
  213. warp/tests/test_ctypes.py +112 -88
  214. warp/tests/test_dense.py +21 -14
  215. warp/tests/test_devices.py +98 -0
  216. warp/tests/test_dlpack.py +75 -75
  217. warp/tests/test_examples.py +277 -0
  218. warp/tests/test_fabricarray.py +955 -0
  219. warp/tests/test_fast_math.py +15 -11
  220. warp/tests/test_fem.py +1271 -0
  221. warp/tests/test_fp16.py +53 -19
  222. warp/tests/test_func.py +187 -86
  223. warp/tests/test_generics.py +194 -49
  224. warp/tests/test_grad.py +178 -109
  225. warp/tests/test_grad_customs.py +176 -0
  226. warp/tests/test_hash_grid.py +52 -37
  227. warp/tests/test_import.py +10 -23
  228. warp/tests/test_indexedarray.py +32 -31
  229. warp/tests/test_intersect.py +18 -9
  230. warp/tests/test_large.py +141 -0
  231. warp/tests/test_launch.py +14 -41
  232. warp/tests/test_lerp.py +64 -65
  233. warp/tests/test_linear_solvers.py +154 -0
  234. warp/tests/test_lvalue.py +493 -0
  235. warp/tests/test_marching_cubes.py +12 -13
  236. warp/tests/test_mat.py +517 -2898
  237. warp/tests/test_mat_lite.py +115 -0
  238. warp/tests/test_mat_scalar_ops.py +2889 -0
  239. warp/tests/test_math.py +103 -9
  240. warp/tests/test_matmul.py +305 -69
  241. warp/tests/test_matmul_lite.py +410 -0
  242. warp/tests/test_mesh.py +71 -14
  243. warp/tests/test_mesh_query_aabb.py +41 -25
  244. warp/tests/test_mesh_query_point.py +140 -22
  245. warp/tests/test_mesh_query_ray.py +39 -22
  246. warp/tests/test_mlp.py +30 -22
  247. warp/tests/test_model.py +92 -89
  248. warp/tests/test_modules_lite.py +39 -0
  249. warp/tests/test_multigpu.py +88 -114
  250. warp/tests/test_noise.py +12 -11
  251. warp/tests/test_operators.py +16 -20
  252. warp/tests/test_options.py +11 -11
  253. warp/tests/test_pinned.py +17 -18
  254. warp/tests/test_print.py +32 -11
  255. warp/tests/test_quat.py +275 -129
  256. warp/tests/test_rand.py +18 -16
  257. warp/tests/test_reload.py +38 -34
  258. warp/tests/test_rounding.py +50 -43
  259. warp/tests/test_runlength_encode.py +168 -20
  260. warp/tests/test_smoothstep.py +9 -11
  261. warp/tests/test_snippet.py +143 -0
  262. warp/tests/test_sparse.py +261 -63
  263. warp/tests/test_spatial.py +276 -243
  264. warp/tests/test_streams.py +110 -85
  265. warp/tests/test_struct.py +268 -63
  266. warp/tests/test_tape.py +39 -21
  267. warp/tests/test_torch.py +118 -89
  268. warp/tests/test_transient_module.py +12 -13
  269. warp/tests/test_types.py +614 -0
  270. warp/tests/test_utils.py +494 -0
  271. warp/tests/test_vec.py +354 -2050
  272. warp/tests/test_vec_lite.py +73 -0
  273. warp/tests/test_vec_scalar_ops.py +2099 -0
  274. warp/tests/test_volume.py +457 -293
  275. warp/tests/test_volume_write.py +124 -134
  276. warp/tests/unittest_serial.py +35 -0
  277. warp/tests/unittest_suites.py +341 -0
  278. warp/tests/unittest_utils.py +568 -0
  279. warp/tests/unused_test_misc.py +71 -0
  280. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  281. warp/thirdparty/appdirs.py +36 -45
  282. warp/thirdparty/unittest_parallel.py +549 -0
  283. warp/torch.py +9 -6
  284. warp/types.py +1089 -366
  285. warp/utils.py +93 -387
  286. warp_lang-0.11.0.dist-info/METADATA +238 -0
  287. warp_lang-0.11.0.dist-info/RECORD +332 -0
  288. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  289. warp/tests/test_all.py +0 -219
  290. warp/tests/test_array_scan.py +0 -60
  291. warp/tests/test_base.py +0 -208
  292. warp/tests/test_unresolved_func.py +0 -7
  293. warp/tests/test_unresolved_symbol.py +0 -7
  294. warp_lang-0.10.1.dist-info/METADATA +0 -21
  295. warp_lang-0.10.1.dist-info/RECORD +0 -188
  296. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  297. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  298. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  299. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
  300. {warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/native/quat.h CHANGED
@@ -19,6 +19,15 @@ struct quat_t
19
19
  // zero constructor for adjoint variable initialization
20
20
  inline CUDA_CALLABLE quat_t(Type x=Type(0), Type y=Type(0), Type z=Type(0), Type w=Type(0)) : x(x), y(y), z(z), w(w) {}
21
21
  explicit inline CUDA_CALLABLE quat_t(const vec_t<3,Type>& v, Type w=Type(0)) : x(v[0]), y(v[1]), z(v[2]), w(w) {}
22
+
23
+ template<typename OtherType>
24
+ explicit inline CUDA_CALLABLE quat_t(const quat_t<OtherType>& other)
25
+ {
26
+ x = static_cast<Type>(other.x);
27
+ y = static_cast<Type>(other.y);
28
+ z = static_cast<Type>(other.z);
29
+ w = static_cast<Type>(other.w);
30
+ }
22
31
 
23
32
  // imaginary part
24
33
  Type x;
@@ -73,7 +82,17 @@ inline CUDA_CALLABLE void adj_quat_t(const vec_t<3,Type>& v, Type w, vec_t<3,Typ
73
82
  adj_v[0] += adj_ret.x;
74
83
  adj_v[1] += adj_ret.y;
75
84
  adj_v[2] += adj_ret.z;
76
- adj_w += adj_ret.w;
85
+ adj_w += adj_ret.w;
86
+ }
87
+
88
+ // casting constructor adjoint
89
+ template<typename Type, typename OtherType>
90
+ inline CUDA_CALLABLE void adj_quat_t(const quat_t<OtherType>& other, quat_t<OtherType>& adj_other, const quat_t<Type>& adj_ret)
91
+ {
92
+ adj_other.x += static_cast<OtherType>(adj_ret.x);
93
+ adj_other.y += static_cast<OtherType>(adj_ret.y);
94
+ adj_other.z += static_cast<OtherType>(adj_ret.z);
95
+ adj_other.w += static_cast<OtherType>(adj_ret.w);
77
96
  }
78
97
 
79
98
  // forward methods
@@ -206,12 +225,24 @@ inline CUDA_CALLABLE quat_t<Type> div(quat_t<Type> q, Type s)
206
225
  return quat_t<Type>(q.x/s, q.y/s, q.z/s, q.w/s);
207
226
  }
208
227
 
228
+ template<typename Type>
229
+ inline CUDA_CALLABLE quat_t<Type> div(Type s, quat_t<Type> q)
230
+ {
231
+ return quat_t<Type>(s/q.x, s/q.y, s/q.z, s/q.w);
232
+ }
233
+
209
234
  template<typename Type>
210
235
  inline CUDA_CALLABLE quat_t<Type> operator / (quat_t<Type> a, Type s)
211
236
  {
212
237
  return div(a,s);
213
238
  }
214
239
 
240
+ template<typename Type>
241
+ inline CUDA_CALLABLE quat_t<Type> operator / (Type s, quat_t<Type> a)
242
+ {
243
+ return div(s,a);
244
+ }
245
+
215
246
  template<typename Type>
216
247
  inline CUDA_CALLABLE quat_t<Type> operator*(Type s, const quat_t<Type>& a)
217
248
  {
@@ -321,7 +352,7 @@ inline CUDA_CALLABLE quat_t<Type> quat_from_matrix(const mat_t<3,3,Type>& m)
321
352
  }
322
353
 
323
354
  template<typename Type>
324
- inline CUDA_CALLABLE Type index(const quat_t<Type>& a, int idx)
355
+ inline CUDA_CALLABLE Type extract(const quat_t<Type>& a, int idx)
325
356
  {
326
357
  #if FP_CHECK
327
358
  if (idx < 0 || idx > 3)
@@ -357,7 +388,7 @@ CUDA_CALLABLE inline void adj_lerp(const quat_t<Type>& a, const quat_t<Type>& b,
357
388
  }
358
389
 
359
390
  template<typename Type>
360
- inline CUDA_CALLABLE void adj_index(const quat_t<Type>& a, int idx, quat_t<Type>& adj_a, int & adj_idx, Type & adj_ret)
391
+ inline CUDA_CALLABLE void adj_extract(const quat_t<Type>& a, int idx, quat_t<Type>& adj_a, int & adj_idx, Type & adj_ret)
361
392
  {
362
393
  #if FP_CHECK
363
394
  if (idx < 0 || idx > 3)
@@ -367,7 +398,7 @@ inline CUDA_CALLABLE void adj_index(const quat_t<Type>& a, int idx, quat_t<Type>
367
398
  }
368
399
  #endif
369
400
 
370
- // See wp::index(const quat_t<Type>& a, int idx) note
401
+ // See wp::extract(const quat_t<Type>& a, int idx) note
371
402
  if (idx == 0) {adj_a.x += adj_ret;}
372
403
  else if (idx == 1) {adj_a.y += adj_ret;}
373
404
  else if (idx == 2) {adj_a.z += adj_ret;}
@@ -504,9 +535,14 @@ inline CUDA_CALLABLE void tensordot(const quat_t<Type>& a, const quat_t<Type>& b
504
535
  }
505
536
 
506
537
  template<typename Type>
507
- inline CUDA_CALLABLE void adj_length(const quat_t<Type>& a, quat_t<Type>& adj_a, const Type adj_ret)
538
+ inline CUDA_CALLABLE void adj_length(const quat_t<Type>& a, Type ret, quat_t<Type>& adj_a, const Type adj_ret)
508
539
  {
509
- adj_a += normalize(a)*adj_ret;
540
+ if (ret > Type(kEps))
541
+ {
542
+ Type inv_l = Type(1)/ret;
543
+
544
+ adj_a += quat_t<Type>(a.x*inv_l, a.y*inv_l, a.z*inv_l, a.w*inv_l) * adj_ret;
545
+ }
510
546
  }
511
547
 
512
548
  template<typename Type>
@@ -589,6 +625,13 @@ inline CUDA_CALLABLE void adj_div(quat_t<Type> a, Type s, quat_t<Type>& adj_a, T
589
625
  adj_a += adj_ret / s;
590
626
  }
591
627
 
628
+ template<typename Type>
629
+ inline CUDA_CALLABLE void adj_div(Type s, quat_t<Type> a, Type& adj_s, quat_t<Type>& adj_a, const quat_t<Type>& adj_ret)
630
+ {
631
+ adj_s -= dot(a, adj_ret)/ (s * s); // - a / s^2
632
+ adj_a += s / adj_ret;
633
+ }
634
+
592
635
  template<typename Type>
593
636
  inline CUDA_CALLABLE void adj_quat_rotate(const quat_t<Type>& q, const vec_t<3,Type>& p, quat_t<Type>& adj_q, vec_t<3,Type>& adj_p, const vec_t<3,Type>& adj_ret)
594
637
  {
@@ -658,7 +701,7 @@ inline CUDA_CALLABLE void adj_quat_rotate_inv(const quat_t<Type>& q, const vec_t
658
701
  }
659
702
 
660
703
  template<typename Type>
661
- inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Type>& q1, Type t, quat_t<Type>& adj_q0, quat_t<Type>& adj_q1, Type& adj_t, const quat_t<Type>& adj_ret)
704
+ inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Type>& q1, Type t, quat_t<Type>& ret, quat_t<Type>& adj_q0, quat_t<Type>& adj_q1, Type& adj_t, const quat_t<Type>& adj_ret)
662
705
  {
663
706
  vec_t<3,Type> axis;
664
707
  Type angle;
@@ -669,7 +712,7 @@ inline CUDA_CALLABLE void adj_quat_slerp(const quat_t<Type>& q0, const quat_t<Ty
669
712
  angle = angle * 0.5;
670
713
 
671
714
  // adj_t
672
- adj_t += dot(mul(quat_slerp(q0, q1, t), quat_t<Type>(angle*axis[0], angle*axis[1], angle*axis[2], Type(0))), adj_ret);
715
+ adj_t += dot(mul(ret, quat_t<Type>(angle*axis[0], angle*axis[1], angle*axis[2], Type(0))), adj_ret);
673
716
 
674
717
  // adj_q0
675
718
  quat_t<Type> q_inc_x_q0;
warp/native/rand.h CHANGED
@@ -9,8 +9,8 @@
9
9
  # pragma once
10
10
  #include "array.h"
11
11
 
12
- #ifndef M_PI
13
- #define M_PI 3.14159265358979323846f
12
+ #ifndef M_PI_F
13
+ #define M_PI_F 3.14159265358979323846f
14
14
  #endif
15
15
 
16
16
  namespace wp
@@ -33,7 +33,7 @@ inline CUDA_CALLABLE float randf(uint32& state) { state = rand_pcg(state); retur
33
33
  inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (max - min) * randf(state) + min; }
34
34
 
35
35
  // Box-Muller method
36
- inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state))) * cos(2.f * M_PI * randf(state)); }
36
+ inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state))) * cos(2.f * M_PI_F * randf(state)); }
37
37
 
38
38
  inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, float adj_ret) {}
39
39
  inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, float adj_ret) {}
@@ -55,14 +55,14 @@ inline CUDA_CALLABLE int sample_cdf(uint32& state, const array_t<float>& cdf)
55
55
  inline CUDA_CALLABLE vec2 sample_triangle(uint32& state)
56
56
  {
57
57
  float r = sqrt(randf(state));
58
- float u = 1.0 - r;
58
+ float u = 1.f - r;
59
59
  float v = randf(state) * r;
60
60
  return vec2(u, v);
61
61
  }
62
62
 
63
63
  inline CUDA_CALLABLE vec2 sample_unit_ring(uint32& state)
64
64
  {
65
- float theta = randf(state, 0.f, 2.f*M_PI);
65
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
66
66
  float x = cos(theta);
67
67
  float y = sin(theta);
68
68
  return vec2(x, y);
@@ -71,7 +71,7 @@ inline CUDA_CALLABLE vec2 sample_unit_ring(uint32& state)
71
71
  inline CUDA_CALLABLE vec2 sample_unit_disk(uint32& state)
72
72
  {
73
73
  float r = sqrt(randf(state));
74
- float theta = randf(state, 0.f, 2.f*M_PI);
74
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
75
75
  float x = r * cos(theta);
76
76
  float y = r * sin(theta);
77
77
  return vec2(x, y);
@@ -80,7 +80,7 @@ inline CUDA_CALLABLE vec2 sample_unit_disk(uint32& state)
80
80
  inline CUDA_CALLABLE vec3 sample_unit_sphere_surface(uint32& state)
81
81
  {
82
82
  float phi = acos(1.f - 2.f * randf(state));
83
- float theta = randf(state, 0.f, 2.f*M_PI);
83
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
84
84
  float x = cos(theta) * sin(phi);
85
85
  float y = sin(theta) * sin(phi);
86
86
  float z = cos(phi);
@@ -90,7 +90,7 @@ inline CUDA_CALLABLE vec3 sample_unit_sphere_surface(uint32& state)
90
90
  inline CUDA_CALLABLE vec3 sample_unit_sphere(uint32& state)
91
91
  {
92
92
  float phi = acos(1.f - 2.f * randf(state));
93
- float theta = randf(state, 0.f, 2.f*M_PI);
93
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
94
94
  float r = pow(randf(state), 1.f/3.f);
95
95
  float x = r * cos(theta) * sin(phi);
96
96
  float y = r * sin(theta) * sin(phi);
@@ -101,7 +101,7 @@ inline CUDA_CALLABLE vec3 sample_unit_sphere(uint32& state)
101
101
  inline CUDA_CALLABLE vec3 sample_unit_hemisphere_surface(uint32& state)
102
102
  {
103
103
  float phi = acos(1.f - randf(state));
104
- float theta = randf(state, 0.f, 2.f*M_PI);
104
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
105
105
  float x = cos(theta) * sin(phi);
106
106
  float y = sin(theta) * sin(phi);
107
107
  float z = cos(phi);
@@ -111,7 +111,7 @@ inline CUDA_CALLABLE vec3 sample_unit_hemisphere_surface(uint32& state)
111
111
  inline CUDA_CALLABLE vec3 sample_unit_hemisphere(uint32& state)
112
112
  {
113
113
  float phi = acos(1.f - randf(state));
114
- float theta = randf(state, 0.f, 2.f*M_PI);
114
+ float theta = randf(state, 0.f, 2.f*M_PI_F);
115
115
  float r = pow(randf(state), 1.f/3.f);
116
116
  float x = r * cos(theta) * sin(phi);
117
117
  float y = r * sin(theta) * sin(phi);
@@ -134,6 +134,15 @@ inline CUDA_CALLABLE vec3 sample_unit_cube(uint32& state)
134
134
  return vec3(x, y, z);
135
135
  }
136
136
 
137
+ inline CUDA_CALLABLE vec4 sample_unit_hypercube(uint32& state)
138
+ {
139
+ float a = randf(state) - 0.5f;
140
+ float b = randf(state) - 0.5f;
141
+ float c = randf(state) - 0.5f;
142
+ float d = randf(state) - 0.5f;
143
+ return vec4(a, b, c, d);
144
+ }
145
+
137
146
  inline CUDA_CALLABLE void adj_sample_cdf(uint32& state, const array_t<float>& cdf, uint32& adj_state, array_t<float>& adj_cdf, const int& adj_ret) {}
138
147
  inline CUDA_CALLABLE void adj_sample_triangle(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
139
148
  inline CUDA_CALLABLE void adj_sample_unit_ring(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
@@ -144,6 +153,7 @@ inline CUDA_CALLABLE void adj_sample_unit_hemisphere_surface(uint32& state, uint
144
153
  inline CUDA_CALLABLE void adj_sample_unit_hemisphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
145
154
  inline CUDA_CALLABLE void adj_sample_unit_square(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
146
155
  inline CUDA_CALLABLE void adj_sample_unit_cube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
156
+ inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
147
157
 
148
158
  /*
149
159
  * log-gamma function to support some of these distributions. The
@@ -158,17 +168,17 @@ inline CUDA_CALLABLE float random_loggam(float x)
158
168
  float x0, x2, lg2pi, gl, gl0;
159
169
  uint32 n;
160
170
 
161
- const float a[10] = {8.333333333333333e-02, -2.777777777777778e-03,
162
- 7.936507936507937e-04, -5.952380952380952e-04,
163
- 8.417508417508418e-04, -1.917526917526918e-03,
164
- 6.410256410256410e-03, -2.955065359477124e-02,
165
- 1.796443723688307e-01, -1.39243221690590e+00};
171
+ const float a[10] = {8.333333333333333e-02f, -2.777777777777778e-03f,
172
+ 7.936507936507937e-04f, -5.952380952380952e-04f,
173
+ 8.417508417508418e-04f, -1.917526917526918e-03f,
174
+ 6.410256410256410e-03f, -2.955065359477124e-02f,
175
+ 1.796443723688307e-01f, -1.39243221690590e+00f};
166
176
 
167
- if ((x == 1.0) || (x == 2.0))
177
+ if ((x == 1.f) || (x == 2.f))
168
178
  {
169
- return 0.0;
179
+ return 0.f;
170
180
  }
171
- else if (x < 7.0)
181
+ else if (x < 7.f)
172
182
  {
173
183
  n = uint32((7 - x));
174
184
  }
@@ -178,8 +188,8 @@ inline CUDA_CALLABLE float random_loggam(float x)
178
188
  }
179
189
 
180
190
  x0 = x + float(n);
181
- x2 = (1.0 / x0) * (1.0 / x0);
182
- // log(2 * M_PI)
191
+ x2 = (1.f / x0) * (1.f / x0);
192
+ // log(2 * M_PI_F)
183
193
  lg2pi = 1.8378770664093453f;
184
194
  gl0 = a[9];
185
195
  for (int i = 8; i >= 0; i--)
@@ -187,13 +197,13 @@ inline CUDA_CALLABLE float random_loggam(float x)
187
197
  gl0 *= x2;
188
198
  gl0 += a[i];
189
199
  }
190
- gl = gl0 / x0 + 0.5 * lg2pi + (x0 - 0.5) * log(x0) - x0;
191
- if (x < 7.0)
200
+ gl = gl0 / x0 + 0.5f * lg2pi + (x0 - 0.5f) * log(x0) - x0;
201
+ if (x < 7.f)
192
202
  {
193
203
  for (uint32 k = 1; k <= n; k++)
194
204
  {
195
- gl -= log(x0 - 1.0);
196
- x0 -= 1.0;
205
+ gl -= log(x0 - 1.f);
206
+ x0 -= 1.f;
197
207
  }
198
208
  }
199
209
  return gl;
@@ -205,7 +215,7 @@ inline CUDA_CALLABLE uint32 random_poisson_mult(uint32& state, float lam) {
205
215
 
206
216
  enlam = exp(-lam);
207
217
  X = 0;
208
- prod = 1.0;
218
+ prod = 1.f;
209
219
 
210
220
  while (1)
211
221
  {
@@ -234,22 +244,22 @@ inline CUDA_CALLABLE uint32 random_poisson(uint32& state, float lam)
234
244
 
235
245
  slam = sqrt(lam);
236
246
  loglam = log(lam);
237
- b = 0.931 + 2.53 * slam;
238
- a = -0.059 + 0.02483 * b;
239
- invalpha = 1.1239 + 1.1328 / (b - 3.4);
240
- vr = 0.9277 - 3.6224 / (b - 2.0);
247
+ b = 0.931f + 2.53f * slam;
248
+ a = -0.059f + 0.02483f * b;
249
+ invalpha = 1.1239f + 1.1328f / (b - 3.4f);
250
+ vr = 0.9277f - 3.6224f / (b - 2.f);
241
251
 
242
252
  while (1)
243
253
  {
244
- U = randf(state) - 0.5;
254
+ U = randf(state) - 0.5f;
245
255
  V = randf(state);
246
- us = 0.5 - abs(U);
247
- k = uint32(floor((2 * a / us + b) * U + lam + 0.43));
248
- if ((us >= 0.07) && (V <= vr))
256
+ us = 0.5f - abs(U);
257
+ k = uint32(floor((2.f * a / us + b) * U + lam + 0.43f));
258
+ if ((us >= 0.07f) && (V <= vr))
249
259
  {
250
260
  return k;
251
261
  }
252
- if ((us < 0.013) && (V > us))
262
+ if ((us < 0.013f) && (V > us))
253
263
  {
254
264
  continue;
255
265
  }
@@ -261,7 +271,7 @@ inline CUDA_CALLABLE uint32 random_poisson(uint32& state, float lam)
261
271
  }
262
272
 
263
273
  /*
264
- * Adpated from NumPy's implementation
274
+ * Adapted from NumPy's implementation
265
275
  * Warp's state variable is half the precision of NumPy's so
266
276
  * poisson implementation uses half the precision used in NumPy's implementation
267
277
  * both precisions appear to converge in the statistical limit
warp/native/range.h CHANGED
@@ -15,8 +15,12 @@ namespace wp
15
15
  // represents a built-in Python range() loop
16
16
  struct range_t
17
17
  {
18
- CUDA_CALLABLE range_t() {}
19
- CUDA_CALLABLE range_t(int) {} // for backward pass
18
+ CUDA_CALLABLE range_t()
19
+ : start(0),
20
+ end(0),
21
+ step(0),
22
+ i(0)
23
+ {}
20
24
 
21
25
  int start;
22
26
  int end;
warp/native/reduce.cpp CHANGED
@@ -97,7 +97,7 @@ template <typename T> void array_sum_host(const T *ptr_a, T *ptr_out, int count,
97
97
  accumulate_func = dyn_len_sum<T>;
98
98
  }
99
99
 
100
- *ptr_out = 0.0f;
100
+ memset(ptr_out, 0, sizeof(T)*type_length);
101
101
  for (int i = 0; i < count; ++i)
102
102
  accumulate_func(ptr_a + i * stride, ptr_out, type_length);
103
103
  }
warp/native/reduce.cu CHANGED
@@ -103,23 +103,22 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
103
103
  assert((byte_stride % sizeof(T)) == 0);
104
104
  const int stride = byte_stride / sizeof(T);
105
105
 
106
- void *context = cuda_context_get_current();
107
- TemporaryBuffer &cub_temp = g_temp_buffer_map[context];
108
-
109
- ContextGuard guard(context);
106
+ ContextGuard guard(cuda_context_get_current());
110
107
  cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
111
108
 
112
109
  cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
113
110
 
114
111
  size_t buff_size = 0;
115
112
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
116
- cub_temp.ensure_fits(buff_size);
113
+ void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
117
114
 
118
115
  for (int k = 0; k < type_length; ++k)
119
116
  {
120
117
  cub_strided_iterator<const T> ptr_strided{ptr_a + k, stride};
121
- check_cuda(cub::DeviceReduce::Sum(cub_temp.buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
118
+ check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
122
119
  }
120
+
121
+ free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
123
122
  }
124
123
 
125
124
  template <typename T>
@@ -265,19 +264,18 @@ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out
265
264
  const int stride_a = byte_stride_a / sizeof(ElemT);
266
265
  const int stride_b = byte_stride_b / sizeof(ElemT);
267
266
 
268
- void *context = cuda_context_get_current();
269
- TemporaryBuffer &cub_temp = g_temp_buffer_map[context];
270
-
271
- ContextGuard guard(context);
267
+ ContextGuard guard(cuda_context_get_current());
272
268
  cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
273
269
 
274
270
  cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
275
271
 
276
272
  size_t buff_size = 0;
277
273
  check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
278
- cub_temp.ensure_fits(buff_size);
274
+ void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
275
+
276
+ check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
279
277
 
280
- check_cuda(cub::DeviceReduce::Sum(cub_temp.buffer, buff_size, inner_iterator, ptr_out, count, stream));
278
+ free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
281
279
  }
282
280
 
283
281
  template <typename T>
@@ -3,8 +3,6 @@
3
3
  #include "warp.h"
4
4
  #include "cuda_util.h"
5
5
 
6
- #include "temp_buffer.h"
7
-
8
6
  #define THRUST_IGNORE_CUB_VERSION_CHECK
9
7
  #include <cub/device/device_run_length_encode.cuh>
10
8
 
@@ -15,11 +13,7 @@ void runlength_encode_device(int n,
15
13
  int *run_lengths,
16
14
  int *run_count)
17
15
  {
18
- void *context = cuda_context_get_current();
19
-
20
- TemporaryBuffer &cub_temp = g_temp_buffer_map[context];
21
-
22
- ContextGuard guard(context);
16
+ ContextGuard guard(cuda_context_get_current());
23
17
  cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
24
18
 
25
19
  size_t buff_size = 0;
@@ -27,11 +21,13 @@ void runlength_encode_device(int n,
27
21
  nullptr, buff_size, values, run_values, run_lengths, run_count,
28
22
  n, stream));
29
23
 
30
- cub_temp.ensure_fits(buff_size);
24
+ void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
31
25
 
32
26
  check_cuda(cub::DeviceRunLengthEncode::Encode(
33
- cub_temp.buffer, buff_size, values, run_values, run_lengths, run_count,
27
+ temp_buffer, buff_size, values, run_values, run_lengths, run_count,
34
28
  n, stream));
29
+
30
+ free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
35
31
  }
36
32
 
37
33
  void runlength_encode_int_device(
@@ -47,4 +43,4 @@ void runlength_encode_int_device(
47
43
  reinterpret_cast<int *>(run_values),
48
44
  reinterpret_cast<int *>(run_lengths),
49
45
  reinterpret_cast<int *>(run_count));
50
- }
46
+ }
warp/native/scan.cu CHANGED
@@ -1,8 +1,6 @@
1
1
  #include "warp.h"
2
2
  #include "scan.h"
3
3
 
4
- #include "temp_buffer.h"
5
-
6
4
  #define THRUST_IGNORE_CUB_VERSION_CHECK
7
5
 
8
6
  #include <cub/device/device_scan.cuh>
@@ -10,29 +8,28 @@
10
8
  template<typename T>
11
9
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
12
10
  {
13
- void *context = cuda_context_get_current();
14
- TemporaryBuffer &cub_temp = g_temp_buffer_map[context];
15
-
16
- ContextGuard guard(context);
11
+ ContextGuard guard(cuda_context_get_current());
17
12
 
18
13
  cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
19
14
 
20
15
  // compute temporary memory required
21
16
  size_t scan_temp_size;
22
17
  if (inclusive) {
23
- cub::DeviceScan::InclusiveSum(NULL, scan_temp_size, values_in, values_out, n);
18
+ check_cuda(cub::DeviceScan::InclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
24
19
  } else {
25
- cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n);
20
+ check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
26
21
  }
27
22
 
28
- cub_temp.ensure_fits(scan_temp_size);
23
+ void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, scan_temp_size);
29
24
 
30
25
  // scan
31
26
  if (inclusive) {
32
- cub::DeviceScan::InclusiveSum(cub_temp.buffer, scan_temp_size, values_in, values_out, n, (cudaStream_t)cuda_stream_get_current());
27
+ check_cuda(cub::DeviceScan::InclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
33
28
  } else {
34
- cub::DeviceScan::ExclusiveSum(cub_temp.buffer, scan_temp_size, values_in, values_out, n, (cudaStream_t)cuda_stream_get_current());
29
+ check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
35
30
  }
31
+
32
+ free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
36
33
  }
37
34
 
38
35
  template void scan_device(const int*, int*, int, bool);
warp/native/sparse.cpp CHANGED
@@ -179,10 +179,10 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, i
179
179
  const int block_size = rows_per_block * cols_per_block;
180
180
 
181
181
  void (*block_transpose_func)(const T *, T *, int, int) = bsr_dyn_block_transpose<T>;
182
- switch (row_count)
182
+ switch (rows_per_block)
183
183
  {
184
184
  case 1:
185
- switch (col_count)
185
+ switch (cols_per_block)
186
186
  {
187
187
  case 1:
188
188
  block_transpose_func = bsr_fixed_block_transpose<1, 1, T>;
@@ -196,7 +196,7 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, i
196
196
  }
197
197
  break;
198
198
  case 2:
199
- switch (col_count)
199
+ switch (cols_per_block)
200
200
  {
201
201
  case 1:
202
202
  block_transpose_func = bsr_fixed_block_transpose<2, 1, T>;
@@ -210,7 +210,7 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, i
210
210
  }
211
211
  break;
212
212
  case 3:
213
- switch (col_count)
213
+ switch (cols_per_block)
214
214
  {
215
215
  case 1:
216
216
  block_transpose_func = bsr_fixed_block_transpose<3, 1, T>;