PyPI - warp-lang - Versions diffs - 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (315)

warp/__init__.py +15 -7
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +22 -443
warp/build_dll.py +384 -0
warp/builtins.py +998 -488
warp/codegen.py +1307 -739
warp/config.py +5 -3
warp/constants.py +6 -0
warp/context.py +1291 -548
warp/dlpack.py +31 -31
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +164 -55
warp/native/builtin.h +150 -174
warp/native/bvh.cpp +75 -328
warp/native/bvh.cu +406 -23
warp/native/bvh.h +37 -45
warp/native/clang/clang.cpp +136 -24
warp/native/crt.cpp +1 -76
warp/native/crt.h +111 -104
warp/native/cuda_crt.h +1049 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -949
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/initializer_array.h +2 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +13 -16
warp/native/marching.cu +157 -161
warp/native/mat.h +119 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +108 -83
warp/native/mesh.cu +243 -6
warp/native/mesh.h +1547 -458
warp/native/nanovdb/NanoVDB.h +1 -1
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +157 -0
warp/native/reduce.cu +348 -0
warp/native/runlength_encode.cpp +62 -0
warp/native/runlength_encode.cu +46 -0
warp/native/scan.cu +11 -13
warp/native/scan.h +1 -0
warp/native/solid_angle.h +442 -0
warp/native/sort.cpp +13 -0
warp/native/sort.cu +9 -1
warp/native/sparse.cpp +338 -0
warp/native/sparse.cu +545 -0
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +30 -0
warp/native/vec.h +126 -24
warp/native/volume.h +120 -0
warp/native/warp.cpp +658 -53
warp/native/warp.cu +660 -68
warp/native/warp.h +112 -12
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +392 -152
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +21 -8
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +161 -19
warp/sim/model.py +795 -291
warp/sim/optimizer.py +2 -6
warp/sim/render.py +65 -3
warp/sim/utils.py +3 -0
warp/sparse.py +1227 -0
warp/stubs.py +665 -223
warp/tape.py +66 -15
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/torus.usda +105 -105
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +1497 -211
warp/tests/test_array_reduce.py +150 -0
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +75 -43
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +233 -128
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +136 -108
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -74
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +180 -116
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +577 -24
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +251 -15
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +508 -2778
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +325 -34
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +190 -0
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +460 -0
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +331 -85
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -1987
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +72 -30
warp/types.py +1744 -713
warp/utils.py +360 -350
warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/bin/warp-clang.exp +0 -0
warp/bin/warp-clang.lib +0 -0
warp/bin/warp.exp +0 -0
warp/bin/warp.lib +0 -0
warp/tests/test_all.py +0 -215
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.9.0.dist-info/METADATA +0 -20
warp_lang-0.9.0.dist-info/RECORD +0 -177
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/native/clang/clang.cpp CHANGED Viewed

@@ -25,6 +25,8 @@
 #include <llvm/PassRegistry.h>
 #include <llvm/InitializePasses.h>
 #include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/Linker/Linker.h>
 #include <llvm/ExecutionEngine/Orc/LLJIT.h>
 #include <llvm/ExecutionEngine/JITEventListener.h>
@@ -45,6 +47,7 @@
 #elif defined(__APPLE__)
     extern "C" void __bzero(void*, size_t);
     extern "C" __double2 __sincos_stret(double);
+    extern "C" __float2 __sincosf_stret(float);
 #endif
 extern "C" {
@@ -54,21 +57,20 @@ extern "C" {
 // On Linux it suffices for these symbols not to be stripped out, while for Windows a .pdb has to contain
 // their information. LLVM defines them, but we don't want a huge .pdb with all LLVM source code's debug
 // info. By forward-declaring them here it suffices to compile this file with /Zi.
-struct jit_descriptor;
-extern jit_descriptor __jit_debug_descriptor;
+extern struct jit_descriptor __jit_debug_descriptor;
 extern void __jit_debug_register_code();
 }
 namespace wp {
 #if defined (_WIN32)
-	// Windows defaults to using the COFF binary format (aka. "msvc" in the target triple).
-	// Override it to use the ELF format to support DWARF debug info, but keep using the
-	// Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html).
-	static const char* target_triple = "x86_64-pc-windows-elf";
+    // Windows defaults to using the COFF binary format (aka. "msvc" in the target triple).
+    // Override it to use the ELF format to support DWARF debug info, but keep using the
+    // Microsoft calling convention (see also https://llvm.org/docs/DebuggingJITedCode.html).
+    static const char* target_triple = "x86_64-pc-windows-elf";
 #else
-	static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
+    static const char* target_triple = LLVM_DEFAULT_TARGET_TRIPLE;
 #endif
 static void initialize_llvm()
@@ -93,6 +95,11 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
     args.push_back("-triple");
     args.push_back(target_triple);
+    #if defined(__x86_64__) || defined(_M_X64)
+        args.push_back("-target-feature");
+        args.push_back("+f16c");  // Enables support for _Float16
+    #endif
     clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
     std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
             std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
@@ -114,8 +121,6 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
     std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
     compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
-    compiler_instance.getPreprocessorOpts().addMacroDef("WP_CPU");
     if(!debug)
     {
         compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
@@ -133,18 +138,71 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
     return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
 }
-extern "C" {
-WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char* output_file, bool debug)
+static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
 {
-    #if defined (_WIN32)
-        const char* obj_ext = ".obj";
-    #else
-        const char* obj_ext = ".o";
-    #endif
+    // Compilation arguments
+    std::vector<const char*> args;
+    args.push_back(input_file.c_str());
+    args.push_back("-I");
+    args.push_back(include_dir);
+    args.push_back(debug ? "-O0" : "-O2");
+    args.push_back("-triple");
+    args.push_back("nvptx64-nvidia-cuda");
+    args.push_back("-target-cpu");
+    args.push_back("sm_70");
+    clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
+    std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
+            std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
+    clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
+    std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
+            std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
+    clang::CompilerInstance compiler_instance;
+    auto& compiler_invocation = compiler_instance.getInvocation();
+    clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
+    if(debug)
+    {
+        compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
+    }
+    // Map code to a MemoryBuffer
+    std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
+    compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
+    // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
+    // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
+    // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
+    compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
+    if(!debug)
+    {
+        compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
+    }
+    compiler_instance.getLangOpts().CUDA = 1;
+    compiler_instance.getLangOpts().CUDAIsDevice = 1;
+    compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
+    compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
-    std::string input_file = std::string(output_file).substr(0, std::strlen(output_file) - std::strlen(obj_ext));
+    clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
+    bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
+    buffer.release();
+    return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
+}
+extern "C" {
+WP_API int compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
+{
     initialize_llvm();
     llvm::LLVMContext context;
@@ -155,13 +213,13 @@ WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char*
         return -1;
     }
-    std::string Error;
-    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, Error);
+    std::string error;
+    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
     const char* CPU = "generic";
     const char* features = "";
     llvm::TargetOptions target_options;
-    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;  // DLLs need Position Independent Code
+    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;  // Position Independent Code
     llvm::CodeModel::Model code_model = llvm::CodeModel::Large;  // Don't make assumptions about displacement sizes
     llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model);
@@ -182,6 +240,59 @@ WP_API int compile_cpp(const char* cpp_src, const char* include_dir, const char*
     return 0;
 }
+WP_API int compile_cuda(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug)
+{
+    initialize_llvm();
+    llvm::LLVMContext context;
+    std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);
+    if(!module)
+    {
+        return -1;
+    }
+    std::string error;
+    const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
+    const char* CPU = "sm_70";
+    const char* features = "+ptx75";  // Warp requires CUDA 11.5, which supports PTX ISA 7.5
+    llvm::TargetOptions target_options;
+    llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;
+    llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model);
+    module->setDataLayout(target_machine->createDataLayout());
+    // Link libdevice
+    llvm::SMDiagnostic diagnostic;
+    std::string libdevice_path = std::string(include_dir) + "/libdevice/libdevice.10.bc";
+    std::unique_ptr<llvm::Module> libdevice(llvm::parseIRFile(libdevice_path, diagnostic, context));
+    if(!libdevice)
+    {
+        return -1;
+    }
+    llvm::Linker linker(*module.get());
+    if(linker.linkInModule(std::move(libdevice), llvm::Linker::Flags::LinkOnlyNeeded) == true)
+    {
+        return -1;
+    }
+    std::error_code error_code;
+    llvm::raw_fd_ostream output(output_file, error_code, llvm::sys::fs::OF_None);
+    llvm::legacy::PassManager pass_manager;
+    llvm::CodeGenFileType file_type = llvm::CGFT_AssemblyFile;
+    target_machine->addPassesToEmitFile(pass_manager, output, nullptr, file_type);
+    pass_manager.run(*module);
+    output.flush();
+    delete target_machine;
+    return 0;
+}
 // Global JIT instance
 static llvm::orc::LLJIT* jit = nullptr;
@@ -248,6 +359,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
             SYMBOL(log10f), SYMBOL_T(log10, double(*)(double)),
             SYMBOL(expf), SYMBOL_T(exp, double(*)(double)),
             SYMBOL(sqrtf), SYMBOL_T(sqrt, double(*)(double)),
+            SYMBOL(cbrtf), SYMBOL_T(cbrt, double(*)(double)),
             SYMBOL(powf), SYMBOL_T(pow, double(*)(double, double)),
             SYMBOL(floorf), SYMBOL_T(floor, double(*)(double)),
             SYMBOL(ceilf), SYMBOL_T(ceil, double(*)(double)),
@@ -276,7 +388,7 @@ WP_API int load_obj(const char* object_file, const char* module_name)
             SYMBOL(__chkstk),
         #elif defined(__APPLE__)
             SYMBOL(__bzero),
-            SYMBOL(__sincos_stret),
+            SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
         #else
             SYMBOL(sincosf), SYMBOL_T(sincos, void(*)(double,double*,double*)),
         #endif
@@ -335,7 +447,7 @@ WP_API uint64_t lookup(const char* dll_name, const char* function_name)
     if(!func)
     {
         std::cerr << "Failed to lookup symbol: " << llvm::toString(func.takeError()) << std::endl;
-        return -1;
+        return 0;
     }
     return func->getValue();

warp/native/crt.cpp CHANGED Viewed

@@ -29,79 +29,4 @@ extern "C" WP_API void _wp_assert(const char* expression, const char* file, unsi
     // Now invoke the standard assert(), which may abort the program or break
     // into the debugger as decided by the runtime environment.
     assert(false && "assert() failed");
-}
-// Export CRT symbols from warp.dll for use by compute kernel DLLs
-// These are declared in crt.h
-#if defined(_MSC_VER)
-#pragma comment(linker,"/export:printf")
-#pragma comment(linker,"/export:abs")
-#pragma comment(linker,"/export:llabs")
-#pragma comment(linker,"/export:fmodf")
-#pragma comment(linker,"/export:fmod")
-#pragma comment(linker,"/export:logf")
-#pragma comment(linker,"/export:log")
-#pragma comment(linker,"/export:log2f")
-#pragma comment(linker,"/export:log2")
-#pragma comment(linker,"/export:log10f")
-#pragma comment(linker,"/export:log10")
-#pragma comment(linker,"/export:expf")
-#pragma comment(linker,"/export:exp")
-#pragma comment(linker,"/export:sqrtf")
-#pragma comment(linker,"/export:sqrt")
-#pragma comment(linker,"/export:powf")
-#pragma comment(linker,"/export:pow")
-#pragma comment(linker,"/export:floorf")
-#pragma comment(linker,"/export:floor")
-#pragma comment(linker,"/export:ceilf")
-#pragma comment(linker,"/export:ceil")
-#pragma comment(linker,"/export:fabsf")
-#pragma comment(linker,"/export:fabs")
-#pragma comment(linker,"/export:roundf")
-#pragma comment(linker,"/export:round")
-#pragma comment(linker,"/export:truncf")
-#pragma comment(linker,"/export:trunc")
-#pragma comment(linker,"/export:rintf")
-#pragma comment(linker,"/export:rint")
-#pragma comment(linker,"/export:acosf")
-#pragma comment(linker,"/export:acos")
-#pragma comment(linker,"/export:asinf")
-#pragma comment(linker,"/export:asin")
-#pragma comment(linker,"/export:atanf")
-#pragma comment(linker,"/export:atan")
-#pragma comment(linker,"/export:atan2f")
-#pragma comment(linker,"/export:atan2")
-#pragma comment(linker,"/export:cosf")
-#pragma comment(linker,"/export:cos")
-#pragma comment(linker,"/export:sinf")
-#pragma comment(linker,"/export:sin")
-#pragma comment(linker,"/export:tanf")
-#pragma comment(linker,"/export:tan")
-#pragma comment(linker,"/export:sinhf")
-#pragma comment(linker,"/export:sinh")
-#pragma comment(linker,"/export:coshf")
-#pragma comment(linker,"/export:cosh")
-#pragma comment(linker,"/export:tanhf")
-#pragma comment(linker,"/export:tanh")
-#pragma comment(linker,"/export:fmaf")
-#pragma comment(linker,"/export:memset")
-#pragma comment(linker,"/export:memcpy")
-#pragma comment(linker,"/export:_wp_isfinite")
-#pragma comment(linker,"/export:_wp_assert")
-// For functions with large stack frames the MSVC compiler will emit a call to
-// __chkstk() to linearly touch each memory page. This grows the stack without
-// triggering the stack overflow guards.
-#pragma comment(linker,"/export:__chkstk")
-// The MSVC linker checks for the _fltused symbol if any floating-point
-// functionality is used. It's defined by the Microsoft CRT to indicate that
-// the x87 FPU control word was properly initialized.
-#pragma comment(linker,"/export:_fltused")
-#endif  // _MSC_VER
+}

warp/native/crt.h CHANGED Viewed

@@ -30,15 +30,15 @@
     #define WP_API
 #endif
-extern "C" {
+#if !defined(__CUDA_ARCH__)
 // Helper for implementing assert() macro
-WP_API void _wp_assert(const char* message, const char* file, unsigned int line);
+extern "C" WP_API void _wp_assert(const char* message, const char* file, unsigned int line);
 // Helper for implementing isfinite()
-WP_API int _wp_isfinite(double);
+extern "C" WP_API int _wp_isfinite(double);
-}  // extern "C"
+#endif  // !__CUDA_ARCH__
 #if !defined(WP_NO_CRT)
@@ -52,106 +52,6 @@ WP_API int _wp_isfinite(double);
 #else
-#if defined(__CUDACC__)
-// stdio.h
-extern "C" __device__ int printf(const char* format, ... );
-#else
-extern "C" {
-// stdio.h
-int printf(const char * format, ... );
-// stdlib.h
-int abs(int);
-long long llabs(long long);
-// math.h
-float fmodf(float, float);
-double fmod(double, double);
-float logf(float);
-double log(double);
-float log2f(float);
-double log2(double);
-float log10f(float);
-double log10(double);
-float expf(float);
-double exp(double);
-float sqrtf(float);
-double sqrt(double);
-float powf(float, float);
-double pow(double, double);
-float floorf(float);
-double floor(double);
-float ceilf(float);
-double ceil(double);
-float fabsf(float);
-double fabs(double);
-float roundf(float);
-double round(double);
-float truncf(float);
-double trunc(double);
-float rintf(float);
-double rint(double);
-float acosf(float);
-double acos(double);
-float asinf(float);
-double asin(double);
-float atanf(float);
-double atan(double);
-float atan2f(float, float);
-double atan2(double, double);
-float cosf(float);
-double cos(double);
-float sinf(float);
-double sin(double);
-float tanf(float);
-double tan(double);
-float sinhf(float);
-double sinh(double);
-float coshf(float);
-double cosh(double);
-float tanhf(float);
-double tanh(double);
-float fmaf(float, float, float);
-// stddef.h
-#if defined(_WIN32)
-using size_t = unsigned __int64;
-#else
-using size_t = unsigned long;
-#endif
-// string.h
-void* memset(void*, int, size_t);
-void* memcpy(void*, const void*, size_t);
-// stdlib.h
-void* malloc(size_t);
-void free(void*);
-}  // extern "C"
-// cmath
-inline bool isfinite(double x)
-{
-    return _wp_isfinite(x);
-}
-// assert.h
-#ifdef NDEBUG
-    #define assert(expression) ((void)0)
-#else
-    #define assert(expression) (void)(                                    \
-            (!!(expression)) ||                                           \
-            (_wp_assert((#expression), (__FILE__), (unsigned)(__LINE__)), 0) \
-        )
-#endif
-#endif  // !__CUDACC__
 // These definitions are taken from Jitify: https://github.com/NVIDIA/jitify
 /// float.h
@@ -221,6 +121,9 @@ enum {
 #define LLONG_MIN  (-LLONG_MAX - 1LL)
 #define ULLONG_MAX 18446744073709551615ULL
+#define INFINITY   ((float)(DBL_MAX * DBL_MAX))
+#define HUGE_VAL   ((double)INFINITY)
+#define HUGE_VALF  ((float)INFINITY)
 /// stdint.h
 typedef signed char      int8_t;
@@ -325,4 +228,108 @@ typedef unsigned long long uint64_t;
 #define M_PI 3.14159265358979323846
+#if defined(__CUDACC__)
+#if defined(__clang__)
+// When compiling CUDA with barebones Clang we need to define its builtins and runtime functions ourselves.
+#include "cuda_crt.h"
+#endif
+#else
+extern "C" {
+// stdio.h
+int printf(const char * format, ... );
+// stdlib.h
+int abs(int);
+long long llabs(long long);
+// math.h
+float fmodf(float, float);
+double fmod(double, double);
+float logf(float);
+double log(double);
+float log2f(float);
+double log2(double);
+float log10f(float);
+double log10(double);
+float expf(float);
+double exp(double);
+float sqrtf(float);
+double sqrt(double);
+float cbrtf(float);
+double cbrt(double);
+float powf(float, float);
+double pow(double, double);
+float floorf(float);
+double floor(double);
+float ceilf(float);
+double ceil(double);
+float fabsf(float);
+double fabs(double);
+float roundf(float);
+double round(double);
+float truncf(float);
+double trunc(double);
+float rintf(float);
+double rint(double);
+float acosf(float);
+double acos(double);
+float asinf(float);
+double asin(double);
+float atanf(float);
+double atan(double);
+float atan2f(float, float);
+double atan2(double, double);
+float cosf(float);
+double cos(double);
+float sinf(float);
+double sin(double);
+float tanf(float);
+double tan(double);
+float sinhf(float);
+double sinh(double);
+float coshf(float);
+double cosh(double);
+float tanhf(float);
+double tanh(double);
+float fmaf(float, float, float);
+// stddef.h
+#if defined(_WIN32)
+using size_t = unsigned __int64;
+#else
+using size_t = unsigned long;
+#endif
+// string.h
+void* memset(void*, int, size_t);
+void* memcpy(void*, const void*, size_t);
+// stdlib.h
+void* malloc(size_t);
+void free(void*);
+}  // extern "C"
+// cmath
+inline bool isfinite(double x)
+{
+    return _wp_isfinite(x);
+}
+// assert.h
+#ifdef NDEBUG
+    #define assert(expression) ((void)0)
+#else
+    #define assert(expression) (void)(                                    \
+            (!!(expression)) ||                                           \
+            (_wp_assert((#expression), (__FILE__), (unsigned)(__LINE__)), 0) \
+        )
+#endif
+#endif  // !__CUDACC__
 #endif // WP_NO_CRT