PyPI - warp-lang - Versions diffs - 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.10.1__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (300)

warp/__init__.py +10 -4
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +5 -3
warp/build_dll.py +29 -9
warp/builtins.py +868 -507
warp/codegen.py +1074 -638
warp/config.py +3 -3
warp/constants.py +6 -0
warp/context.py +715 -222
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +147 -44
warp/native/builtin.h +122 -149
warp/native/bvh.cpp +73 -325
warp/native/bvh.cu +406 -23
warp/native/bvh.h +34 -43
warp/native/clang/clang.cpp +13 -8
warp/native/crt.h +2 -0
warp/native/cuda_crt.h +5 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -952
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +1 -1
warp/native/marching.cu +157 -161
warp/native/mat.h +80 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +33 -108
warp/native/mesh.cu +114 -23
warp/native/mesh.h +446 -46
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +1 -1
warp/native/reduce.cu +10 -12
warp/native/runlength_encode.cu +6 -10
warp/native/scan.cu +8 -11
warp/native/sparse.cpp +4 -4
warp/native/sparse.cu +164 -154
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +14 -30
warp/native/vec.h +107 -23
warp/native/volume.h +120 -0
warp/native/warp.cpp +560 -30
warp/native/warp.cu +431 -44
warp/native/warp.h +13 -4
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +335 -119
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +8 -0
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +158 -16
warp/sim/model.py +795 -291
warp/sim/render.py +3 -3
warp/sim/utils.py +3 -0
warp/sparse.py +640 -150
warp/stubs.py +606 -267
warp/tape.py +61 -10
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +212 -97
warp/tests/test_array_reduce.py +57 -23
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +42 -18
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +208 -130
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +75 -75
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -86
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +178 -109
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +32 -31
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +14 -41
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +517 -2898
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +140 -22
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +168 -20
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +261 -63
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +268 -63
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -2050
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +9 -6
warp/types.py +1089 -366
warp/utils.py +93 -387
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/tests/test_all.py +0 -219
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.10.1.dist-info/METADATA +0 -21
warp_lang-0.10.1.dist-info/RECORD +0 -188
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/LICENSE.md +0 -0
{warp_lang-0.10.1.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/native/crt.h CHANGED

@@ -259,6 +259,8 @@ float expf(float);
 double exp(double);
 float sqrtf(float);
 double sqrt(double);
+float cbrtf(float);
+double cbrt(double);
 float powf(float, float);
 double pow(double, double);
 float floorf(float);

warp/native/cuda_crt.h CHANGED

@@ -1033,6 +1033,11 @@ __device_forceinline__ unsigned int atomicAdd(unsigned int *const address, const
     return __uAtomicAdd(address, val);
 }
+__device_forceinline__ unsigned int atomicAdd(unsigned long long *const address, const unsigned long long val)
+{
+    return __ullAtomicAdd(address, val);
+}
 __device_forceinline__ int atomicMin(int *const address, const int val)
 {
     return __iAtomicMin(address, val);

warp/native/cuda_util.cpp CHANGED

@@ -59,6 +59,7 @@ static PFN_cuDeviceGet_v2000 pfn_cuDeviceGet;
 static PFN_cuDeviceGetCount_v2000 pfn_cuDeviceGetCount;
 static PFN_cuDeviceGetName_v2000 pfn_cuDeviceGetName;
 static PFN_cuDeviceGetAttribute_v2000 pfn_cuDeviceGetAttribute;
+static PFN_cuDeviceGetUuid_v11040 pfn_cuDeviceGetUuid;
 static PFN_cuDevicePrimaryCtxRetain_v7000 pfn_cuDevicePrimaryCtxRetain;
 static PFN_cuDevicePrimaryCtxRelease_v11000 pfn_cuDevicePrimaryCtxRelease;
 static PFN_cuDeviceCanAccessPeer_v4000 pfn_cuDeviceCanAccessPeer;
@@ -89,6 +90,7 @@ static PFN_cuGraphicsResourceGetMappedPointer_v3020 pfn_cuGraphicsResourceGetMap
 static PFN_cuGraphicsGLRegisterBuffer_v3000 pfn_cuGraphicsGLRegisterBuffer;
 static PFN_cuGraphicsUnregisterResource_v3000 pfn_cuGraphicsUnregisterResource;
+static bool cuda_driver_initialized = false;
 bool ContextGuard::always_restore = false;
@@ -165,6 +167,7 @@ bool init_cuda_driver()
     get_driver_entry_point("cuDeviceGetCount", &(void*&)pfn_cuDeviceGetCount);
     get_driver_entry_point("cuDeviceGetName", &(void*&)pfn_cuDeviceGetName);
     get_driver_entry_point("cuDeviceGetAttribute", &(void*&)pfn_cuDeviceGetAttribute);
+    get_driver_entry_point("cuDeviceGetUuid", &(void*&)pfn_cuDeviceGetUuid);
     get_driver_entry_point("cuDevicePrimaryCtxRetain", &(void*&)pfn_cuDevicePrimaryCtxRetain);
     get_driver_entry_point("cuDevicePrimaryCtxRelease", &(void*&)pfn_cuDevicePrimaryCtxRelease);
     get_driver_entry_point("cuDeviceCanAccessPeer", &(void*&)pfn_cuDeviceCanAccessPeer);
@@ -196,11 +199,15 @@ bool init_cuda_driver()
     get_driver_entry_point("cuGraphicsUnregisterResource", &(void*&)pfn_cuGraphicsUnregisterResource);
     if (pfn_cuInit)
-        return check_cu(pfn_cuInit(0));
-    else
-        return false;
+        cuda_driver_initialized = check_cu(pfn_cuInit(0));
+    return cuda_driver_initialized;
 }
+bool is_cuda_driver_initialized()
+{
+    return cuda_driver_initialized;
+}
 bool check_cuda_result(cudaError_t code, const char* file, int line)
 {
@@ -284,6 +291,11 @@ CUresult cuDeviceGetAttribute_f(int* value, CUdevice_attribute attrib, CUdevice
     return pfn_cuDeviceGetAttribute ? pfn_cuDeviceGetAttribute(value, attrib, dev) : DRIVER_ENTRY_POINT_ERROR;
 }
+CUresult cuDeviceGetUuid_f(CUuuid* uuid, CUdevice dev)
+{
+    return pfn_cuDeviceGetUuid ? pfn_cuDeviceGetUuid(uuid, dev) : DRIVER_ENTRY_POINT_ERROR;
+}
 CUresult cuDevicePrimaryCtxRetain_f(CUcontext* ctx, CUdevice dev)
 {
     return pfn_cuDevicePrimaryCtxRetain ? pfn_cuDevicePrimaryCtxRetain(ctx, dev) : DRIVER_ENTRY_POINT_ERROR;

warp/native/cuda_util.h CHANGED

@@ -51,6 +51,7 @@ CUresult cuDeviceGet_f(CUdevice *dev, int ordinal);
 CUresult cuDeviceGetCount_f(int* count);
 CUresult cuDeviceGetName_f(char* name, int len, CUdevice dev);
 CUresult cuDeviceGetAttribute_f(int* value, CUdevice_attribute attrib, CUdevice dev);
+CUresult cuDeviceGetUuid_f(CUuuid* uuid, CUdevice dev);
 CUresult cuDevicePrimaryCtxRetain_f(CUcontext* ctx, CUdevice dev);
 CUresult cuDevicePrimaryCtxRelease_f(CUdevice dev);
 CUresult cuDeviceCanAccessPeer_f(int* can_access, CUdevice dev, CUdevice peer_dev);
@@ -83,6 +84,7 @@ CUresult cuGraphicsUnregisterResource_f(CUgraphicsResource resource);
 bool init_cuda_driver();
+bool is_cuda_driver_initialized();
 bool check_cuda_result(cudaError_t code, const char* file, int line);
 inline bool check_cuda_result(uint64_t code, const char* file, int line)
@@ -166,6 +168,6 @@ public:
 #endif // WP_ENABLE_CUDA
 // Pass this value to device functions as the `context` parameter to bypass unnecessary context management.
-// This works in conjuntion with ContextGuards, which do nothing if the given context is NULL.
+// This works in conjunction with ContextGuards, which do nothing if the given context is NULL.
 // Using this variable instead of passing NULL directly aids readability and makes the intent clear.
 constexpr void* WP_CURRENT_CONTEXT = NULL;

warp/native/cutlass/tools/library/scripts/conv2d_operation.py ADDED

@@ -0,0 +1,463 @@
+#
+# \file generator.py
+#
+# \brief Generates the CUTLASS Library's instances
+#
+#
+import enum
+import os.path
+import shutil
+from library import *
+###################################################################################################
+#
+class Conv2dOperation:
+  #
+  def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \
+    stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity1, \
+    group_mode = GroupMode.NoneGroup):
+    self.operation_kind = OperationKind.Conv2d
+    self.arch = arch
+    self.tile_description = tile_description
+    self.conv_kind = conv_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.iterator_algorithm = iterator_algorithm
+    self.stride_support = stride_support
+    self.swizzling_functor = swizzling_functor
+    self.group_mode = group_mode
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian
+      ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    intermediate_type = ''
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.accumulator_type():
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    else:
+      inst_shape = ''
+    return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
+      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm])
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${element_c}_${core_name}_${element_a}"
+    elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${core_name}_${element_a}"
+    else:
+      extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    threadblock = self.tile_description.procedural_name()
+    # grouped conv
+    if self.group_mode != GroupMode.NoneGroup:
+      group_conv_name = f"{GroupModeNames[self.group_mode]}_"
+    else:
+      group_conv_name = ""
+    if self.stride_support == StrideSupport.Unity:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_${group_conv_name}align${alignment}"
+    else:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${group_conv_name}align${alignment}"
+    return SubstituteTemplate(
+      configuration_name,
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'alignment': "%d" % self.A.alignment,
+        'group_conv_name': group_conv_name
+      }
+    )
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.configuration_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+class EmitConv2dInstance:
+  def __init__(self):
+    self.template = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator},
+    ${iterator_algorithm},
+    ${stride_support},
+    ${align_a},
+    ${align_b}
+  >::Kernel;
+"""
+    self.template_group_conv = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultConv2dGroup${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator},
+    ${group_mode},
+    ${iterator_algorithm},
+    ${stride_support},
+    ${align_a},
+    ${align_b}
+  >::Kernel;
+"""
+    self.template_depthwise_direct_conv = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConv${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::conv::TensorNHWCShape<${threadblock_output_shape_n}, ${threadblock_output_shape_p}, ${threadblock_output_shape_q}, ${groups_per_cta}>,
+    cutlass::MatrixShape<${filter_shape_r}, ${filter_shape_s}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<
+          1,
+          ${threadblock_output_shape_n},
+          ${threadblock_output_shape_p},
+          ${threadblock_output_shape_q}>,
+    ${stages},
+    ${math_operator},
+    ${iterator_algorithm},
+    ${stride_support},
+    cutlass::MatrixShape<${stride_r}, ${stride_s}>,
+    cutlass::MatrixShape<${dilation_r}, ${dilation_s}>
+  >::Kernel;
+"""
+  def emit(self, operation):
+    warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'conv_kind': ConvKindTag[operation.conv_kind],
+      'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
+      'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
+      'stride_support': StrideSupportTag[operation.stride_support],
+      'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else \
+      MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+    }
+    if operation.group_mode == GroupMode.NoneGroup:
+      return SubstituteTemplate(self.template, values)
+    elif operation.group_mode == GroupMode.Depthwise:
+      values['group_mode'] = GroupModeTag[operation.group_mode]
+      # Setup other template params
+      values['threadblock_output_shape_n'] = str(operation.tile_description.threadblock_output_shape[0])
+      values['threadblock_output_shape_p'] = str(operation.tile_description.threadblock_output_shape[1])
+      values['threadblock_output_shape_q'] = str(operation.tile_description.threadblock_output_shape[2])
+      values['groups_per_cta'] = str(operation.tile_description.threadblock_output_shape[3])
+      values['filter_shape_r'] = str(operation.tile_description.filter_shape[0])
+      values['filter_shape_s'] = str(operation.tile_description.filter_shape[1])
+      values['stride_r'] = str(operation.tile_description.stride[0])
+      values['stride_s'] = str(operation.tile_description.stride[1])
+      values['dilation_r'] = str(operation.tile_description.dilation[0])
+      values['dilation_s'] = str(operation.tile_description.dilation[1])
+      return SubstituteTemplate(self.template_depthwise_direct_conv, values)
+    else:
+      values['group_mode'] = GroupModeTag[operation.group_mode]
+      return SubstituteTemplate(self.template_group_conv, values)
+###################################################################################################
+#
+# Generator functions for all layouts
+#
+###################################################################################################
+#
+def GenerateConv2dTensorOp(manifest, tile_descriptions, min_cc, align = 128):
+  for tile in tile_descriptions:
+    for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]:
+      if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]):
+        #
+        output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \
+          if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \
+          else [tile.math_instruction.element_accumulator,]
+        for output_type in output_types:
+          A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_a]))
+          B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_b]))
+          C = TensorDescription(output_type,  LayoutType.TensorNHWC, max(1, int(align / DataTypeSize[output_type])))
+          manifest.append(Conv2dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator))
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitConv2dConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)
+    self.instance_emitter = EmitConv2dInstance()
+    self.instance_template = """
+${operation_instance}
+// Derived class
+struct ${operation_name} :
+  public ${operation_name}_base { };
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.header_template = """
+/*
+  Generated by conv2d_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+#include "conv2d_operation.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.configuration_header = """
+namespace cutlass {
+namespace library {
+// Initialize all instances
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.configuration_instance = """
+  using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution<
+    ${operation_name}>;
+  manifest.append(new cutlass::library::Conv2dOperation<
+    Operation_${operation_name}>(
+      "${operation_name}"));
+"""
+    self.configuration_direct_conv_instance = """
+  using Operation_${operation_name} = cutlass::conv::device::DirectConvolution<
+    ${operation_name}>;
+  manifest.append(new cutlass::library::DirectConv2dOperation<
+    Operation_${operation_name}>(
+      "${operation_name}"));
+"""
+    self.configuration_epilogue = """
+}
+"""
+    self.epilogue_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  #
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(SubstituteTemplate(self.header_template, {
+      'configuration_name': self.configuration_name
+      }))
+    self.operations = []
+    return self
+  #
+  def emit(self, operation):
+    self.operations.append(operation)
+    self.configuration_file.write(SubstituteTemplate(self.instance_template, {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'operation_instance': self.instance_emitter.emit(operation)
+      }))
+  #
+  def __exit__(self, exception_type, exception_value, traceback):
+    self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
+      'configuration_name': self.configuration_name
+      }))
+    for operation in self.operations:
+      if operation.group_mode == GroupMode.Depthwise:
+        self.configuration_file.write(SubstituteTemplate(self.configuration_direct_conv_instance, {
+          'configuration_name': self.configuration_name,
+          'operation_name': operation.procedural_name()
+        }))
+      else:
+        self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
+          'configuration_name': self.configuration_name,
+          'operation_name': operation.procedural_name()
+        }))
+    self.configuration_file.write(self.configuration_epilogue)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################
+###################################################################################################