PyPI - warp-lang - Versions diffs - 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (346) hide show

warp/__init__.py +301 -287
warp/__init__.pyi +794 -305
warp/_src/__init__.py +14 -0
warp/_src/autograd.py +1075 -0
warp/_src/build.py +618 -0
warp/_src/build_dll.py +640 -0
warp/{builtins.py → _src/builtins.py} +1382 -377
warp/_src/codegen.py +4359 -0
warp/{config.py → _src/config.py} +178 -169
warp/_src/constants.py +57 -0
warp/_src/context.py +8294 -0
warp/_src/dlpack.py +462 -0
warp/_src/fabric.py +355 -0
warp/_src/fem/__init__.py +14 -0
warp/_src/fem/adaptivity.py +508 -0
warp/_src/fem/cache.py +687 -0
warp/_src/fem/dirichlet.py +188 -0
warp/{fem → _src/fem}/domain.py +40 -30
warp/_src/fem/field/__init__.py +131 -0
warp/_src/fem/field/field.py +701 -0
warp/{fem → _src/fem}/field/nodal_field.py +30 -15
warp/{fem → _src/fem}/field/restriction.py +1 -1
warp/{fem → _src/fem}/field/virtual.py +53 -27
warp/_src/fem/geometry/__init__.py +32 -0
warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
warp/_src/fem/geometry/closest_point.py +97 -0
warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
warp/{fem → _src/fem}/geometry/element.py +32 -10
warp/{fem → _src/fem}/geometry/geometry.py +48 -20
warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
warp/{fem → _src/fem}/geometry/partition.py +121 -63
warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
warp/{fem → _src/fem}/integrate.py +164 -158
warp/_src/fem/linalg.py +383 -0
warp/_src/fem/operator.py +396 -0
warp/_src/fem/polynomial.py +229 -0
warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
warp/_src/fem/space/__init__.py +248 -0
warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
warp/_src/fem/space/basis_space.py +679 -0
warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
warp/{fem → _src/fem}/space/function_space.py +14 -13
warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
warp/{fem → _src/fem}/space/partition.py +117 -60
warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/restriction.py +66 -33
warp/_src/fem/space/shape/__init__.py +152 -0
warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
warp/_src/fem/space/topology.py +459 -0
warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
warp/_src/fem/types.py +112 -0
warp/_src/fem/utils.py +486 -0
warp/_src/jax.py +186 -0
warp/_src/jax_experimental/__init__.py +14 -0
warp/_src/jax_experimental/custom_call.py +387 -0
warp/_src/jax_experimental/ffi.py +1284 -0
warp/_src/jax_experimental/xla_ffi.py +656 -0
warp/_src/marching_cubes.py +708 -0
warp/_src/math.py +414 -0
warp/_src/optim/__init__.py +14 -0
warp/_src/optim/adam.py +163 -0
warp/_src/optim/linear.py +1606 -0
warp/_src/optim/sgd.py +112 -0
warp/_src/paddle.py +406 -0
warp/_src/render/__init__.py +14 -0
warp/_src/render/imgui_manager.py +289 -0
warp/_src/render/render_opengl.py +3636 -0
warp/_src/render/render_usd.py +937 -0
warp/_src/render/utils.py +160 -0
warp/_src/sparse.py +2716 -0
warp/_src/tape.py +1206 -0
warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
warp/_src/torch.py +391 -0
warp/_src/types.py +5870 -0
warp/_src/utils.py +1693 -0
warp/autograd.py +12 -1054
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +8 -588
warp/build_dll.py +6 -721
warp/codegen.py +6 -4251
warp/constants.py +6 -39
warp/context.py +12 -8062
warp/dlpack.py +6 -444
warp/examples/distributed/example_jacobi_mpi.py +4 -5
warp/examples/fem/example_adaptive_grid.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +8 -8
warp/examples/fem/example_diffusion.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_mixed_elasticity.py +2 -2
warp/examples/fem/example_navier_stokes.py +1 -1
warp/examples/fem/example_nonconforming_contact.py +7 -7
warp/examples/fem/example_stokes.py +1 -1
warp/examples/fem/example_stokes_transfer.py +1 -1
warp/examples/fem/utils.py +2 -2
warp/examples/interop/example_jax_callable.py +1 -1
warp/examples/interop/example_jax_ffi_callback.py +1 -1
warp/examples/interop/example_jax_kernel.py +1 -1
warp/examples/tile/example_tile_mcgp.py +191 -0
warp/fabric.py +6 -337
warp/fem/__init__.py +159 -97
warp/fem/adaptivity.py +7 -489
warp/fem/cache.py +9 -648
warp/fem/dirichlet.py +6 -184
warp/fem/field/__init__.py +8 -109
warp/fem/field/field.py +7 -652
warp/fem/geometry/__init__.py +7 -18
warp/fem/geometry/closest_point.py +11 -77
warp/fem/linalg.py +18 -366
warp/fem/operator.py +11 -369
warp/fem/polynomial.py +9 -209
warp/fem/space/__init__.py +5 -211
warp/fem/space/basis_space.py +6 -662
warp/fem/space/shape/__init__.py +41 -118
warp/fem/space/topology.py +6 -437
warp/fem/types.py +6 -81
warp/fem/utils.py +11 -444
warp/jax.py +8 -165
warp/jax_experimental/__init__.py +14 -1
warp/jax_experimental/custom_call.py +8 -365
warp/jax_experimental/ffi.py +17 -873
warp/jax_experimental/xla_ffi.py +5 -605
warp/marching_cubes.py +5 -689
warp/math.py +16 -393
warp/native/array.h +385 -37
warp/native/builtin.h +314 -37
warp/native/bvh.cpp +43 -9
warp/native/bvh.cu +62 -27
warp/native/bvh.h +310 -309
warp/native/clang/clang.cpp +102 -97
warp/native/coloring.cpp +0 -1
warp/native/crt.h +208 -0
warp/native/exports.h +156 -0
warp/native/hashgrid.cu +2 -0
warp/native/intersect.h +24 -1
warp/native/intersect_tri.h +44 -35
warp/native/mat.h +1456 -276
warp/native/mesh.cpp +4 -4
warp/native/mesh.cu +4 -2
warp/native/mesh.h +176 -61
warp/native/quat.h +0 -52
warp/native/scan.cu +2 -0
warp/native/sparse.cu +7 -3
warp/native/spatial.h +12 -0
warp/native/tile.h +681 -89
warp/native/tile_radix_sort.h +1 -1
warp/native/tile_reduce.h +394 -46
warp/native/tile_scan.h +4 -4
warp/native/vec.h +469 -0
warp/native/version.h +23 -0
warp/native/volume.cpp +1 -1
warp/native/volume.cu +1 -0
warp/native/volume.h +1 -1
warp/native/volume_builder.cu +2 -0
warp/native/warp.cpp +57 -29
warp/native/warp.cu +253 -171
warp/native/warp.h +11 -8
warp/optim/__init__.py +6 -3
warp/optim/adam.py +6 -145
warp/optim/linear.py +14 -1585
warp/optim/sgd.py +6 -94
warp/paddle.py +6 -388
warp/render/__init__.py +8 -4
warp/render/imgui_manager.py +7 -267
warp/render/render_opengl.py +6 -3618
warp/render/render_usd.py +6 -919
warp/render/utils.py +6 -142
warp/sparse.py +37 -2563
warp/tape.py +6 -1188
warp/tests/__main__.py +1 -1
warp/tests/cuda/test_async.py +4 -4
warp/tests/cuda/test_conditional_captures.py +1 -1
warp/tests/cuda/test_multigpu.py +1 -1
warp/tests/cuda/test_streams.py +58 -1
warp/tests/geometry/test_bvh.py +157 -22
warp/tests/geometry/test_marching_cubes.py +0 -1
warp/tests/geometry/test_mesh.py +5 -3
warp/tests/geometry/test_mesh_query_aabb.py +5 -12
warp/tests/geometry/test_mesh_query_point.py +5 -2
warp/tests/geometry/test_mesh_query_ray.py +15 -3
warp/tests/geometry/test_volume_write.py +5 -5
warp/tests/interop/test_dlpack.py +14 -14
warp/tests/interop/test_jax.py +772 -49
warp/tests/interop/test_paddle.py +1 -1
warp/tests/test_adam.py +0 -1
warp/tests/test_arithmetic.py +9 -9
warp/tests/test_array.py +527 -100
warp/tests/test_array_reduce.py +3 -3
warp/tests/test_atomic.py +12 -8
warp/tests/test_atomic_bitwise.py +209 -0
warp/tests/test_atomic_cas.py +4 -4
warp/tests/test_bool.py +2 -2
warp/tests/test_builtins_resolution.py +5 -571
warp/tests/test_codegen.py +33 -14
warp/tests/test_conditional.py +1 -1
warp/tests/test_context.py +6 -6
warp/tests/test_copy.py +242 -161
warp/tests/test_ctypes.py +3 -3
warp/tests/test_devices.py +24 -2
warp/tests/test_examples.py +16 -84
warp/tests/test_fabricarray.py +35 -35
warp/tests/test_fast_math.py +0 -2
warp/tests/test_fem.py +56 -10
warp/tests/test_fixedarray.py +3 -3
warp/tests/test_func.py +8 -5
warp/tests/test_generics.py +1 -1
warp/tests/test_indexedarray.py +24 -24
warp/tests/test_intersect.py +39 -9
warp/tests/test_large.py +1 -1
warp/tests/test_lerp.py +3 -1
warp/tests/test_linear_solvers.py +1 -1
warp/tests/test_map.py +35 -4
warp/tests/test_mat.py +52 -62
warp/tests/test_mat_constructors.py +4 -5
warp/tests/test_mat_lite.py +1 -1
warp/tests/test_mat_scalar_ops.py +121 -121
warp/tests/test_math.py +34 -0
warp/tests/test_module_aot.py +4 -4
warp/tests/test_modules_lite.py +28 -2
warp/tests/test_print.py +11 -11
warp/tests/test_quat.py +93 -58
warp/tests/test_runlength_encode.py +1 -1
warp/tests/test_scalar_ops.py +38 -10
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +126 -15
warp/tests/test_spatial.py +105 -87
warp/tests/test_special_values.py +6 -6
warp/tests/test_static.py +7 -7
warp/tests/test_struct.py +13 -2
warp/tests/test_triangle_closest_point.py +48 -1
warp/tests/test_types.py +27 -15
warp/tests/test_utils.py +52 -52
warp/tests/test_vec.py +29 -29
warp/tests/test_vec_constructors.py +5 -5
warp/tests/test_vec_scalar_ops.py +97 -97
warp/tests/test_version.py +75 -0
warp/tests/tile/test_tile.py +178 -0
warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
warp/tests/tile/test_tile_cholesky.py +7 -4
warp/tests/tile/test_tile_load.py +26 -2
warp/tests/tile/test_tile_mathdx.py +3 -3
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +2 -4
warp/tests/tile/test_tile_reduce.py +214 -13
warp/tests/unittest_suites.py +6 -14
warp/tests/unittest_utils.py +10 -9
warp/tests/walkthrough_debug.py +3 -1
warp/torch.py +6 -373
warp/types.py +29 -5764
warp/utils.py +10 -1659
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
warp/examples/assets/cartpole.urdf +0 -110
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/nv_ant.xml +0 -92
warp/examples/assets/nv_humanoid.xml +0 -183
warp/examples/assets/quadruped.urdf +0 -268
warp/examples/optim/example_bounce.py +0 -266
warp/examples/optim/example_cloth_throw.py +0 -228
warp/examples/optim/example_drone.py +0 -870
warp/examples/optim/example_inverse_kinematics.py +0 -182
warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
warp/examples/optim/example_softbody_properties.py +0 -400
warp/examples/optim/example_spring_cage.py +0 -245
warp/examples/optim/example_trajectory.py +0 -227
warp/examples/sim/example_cartpole.py +0 -143
warp/examples/sim/example_cloth.py +0 -225
warp/examples/sim/example_cloth_self_contact.py +0 -316
warp/examples/sim/example_granular.py +0 -130
warp/examples/sim/example_granular_collision_sdf.py +0 -202
warp/examples/sim/example_jacobian_ik.py +0 -244
warp/examples/sim/example_particle_chain.py +0 -124
warp/examples/sim/example_quadruped.py +0 -203
warp/examples/sim/example_rigid_chain.py +0 -203
warp/examples/sim/example_rigid_contact.py +0 -195
warp/examples/sim/example_rigid_force.py +0 -133
warp/examples/sim/example_rigid_gyroscopic.py +0 -115
warp/examples/sim/example_rigid_soft_contact.py +0 -140
warp/examples/sim/example_soft_body.py +0 -196
warp/examples/tile/example_tile_walker.py +0 -327
warp/sim/__init__.py +0 -74
warp/sim/articulation.py +0 -793
warp/sim/collide.py +0 -2570
warp/sim/graph_coloring.py +0 -307
warp/sim/import_mjcf.py +0 -791
warp/sim/import_snu.py +0 -227
warp/sim/import_urdf.py +0 -579
warp/sim/import_usd.py +0 -898
warp/sim/inertia.py +0 -357
warp/sim/integrator.py +0 -245
warp/sim/integrator_euler.py +0 -2000
warp/sim/integrator_featherstone.py +0 -2101
warp/sim/integrator_vbd.py +0 -2487
warp/sim/integrator_xpbd.py +0 -3295
warp/sim/model.py +0 -4821
warp/sim/particles.py +0 -121
warp/sim/render.py +0 -431
warp/sim/utils.py +0 -431
warp/tests/sim/disabled_kinematics.py +0 -244
warp/tests/sim/test_cloth.py +0 -863
warp/tests/sim/test_collision.py +0 -743
warp/tests/sim/test_coloring.py +0 -347
warp/tests/sim/test_inertia.py +0 -161
warp/tests/sim/test_model.py +0 -226
warp/tests/sim/test_sim_grad.py +0 -287
warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
warp/tests/sim/test_sim_kinematics.py +0 -98
warp/thirdparty/__init__.py +0 -0
warp_lang-1.9.1.dist-info/RECORD +0 -456
/warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
/warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
/warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
/warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0

warp/native/clang/clang.cpp CHANGED Viewed

@@ -16,6 +16,7 @@
  */
 #include "../native/crt.h"
+#include "../version.h"
 #include <clang/Frontend/CompilerInstance.h>
 #include <clang/Basic/DiagnosticOptions.h>
@@ -58,27 +59,14 @@
 #if defined(_WIN64)
     extern "C" void __chkstk();
 #elif defined(__APPLE__)
-#if defined(__MACH__) && defined(__aarch64__)
     extern "C" void _bzero(void *s, size_t n) {
         memset(s, 0, n);
     }
     extern "C" void __bzero(void *s, size_t n) {
         memset(s, 0, n);
     }
-    extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
-    extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
-#else
-    // // Intel Mac's define bzero in libSystem.dylib
-    extern "C" void __bzero(void *s, size_t n);
     extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
     extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
-#endif
     extern "C" __double2 __sincos_stret(double);
     extern "C" __float2 __sincosf_stret(float);
 #endif // defined(__APPLE__)
@@ -114,7 +102,7 @@ static void initialize_llvm()
     llvm::InitializeAllAsmPrinters();
 }
-static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, bool verify_fp, llvm::LLVMContext& context)
+static std::unique_ptr<llvm::Module> source_to_llvm(bool is_cuda, const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, bool verify_fp, llvm::LLVMContext& context, bool tiles_in_stack_memory)
 {
     // Compilation arguments
     std::vector<const char*> args;
@@ -125,84 +113,50 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
     args.push_back(debug ? "-O0" : "-O2");
-    args.push_back("-triple");
-    args.push_back(target_triple);
-    #if defined(__x86_64__) || defined(_M_X64)
-        args.push_back("-target-feature");
-        args.push_back("+f16c");  // Enables support for _Float16
-    #endif
-    clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
-    std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
-            std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
-    clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
-    std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
-            std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
-    clang::CompilerInstance compiler_instance;
-    auto& compiler_invocation = compiler_instance.getInvocation();
-    clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
-    if(debug)
+    if(is_cuda)
     {
-        #if LLVM_VERSION_MAJOR >= 18
-        compiler_invocation.getCodeGenOpts().setDebugInfo(llvm::codegenoptions::FullDebugInfo);
-        #else
-        compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
-        #endif
-    }
+        args.push_back("-triple");
+        args.push_back("nvptx64-nvidia-cuda");
-    // Map code to a MemoryBuffer
-    std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
-    compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
-    if(!debug)
-    {
-        compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
+        args.push_back("-target-cpu");
+        args.push_back("sm_70");
     }
-    if(verify_fp)
+    else
     {
-        compiler_instance.getPreprocessorOpts().addMacroDef("WP_VERIFY_FP");
-    }
-    compiler_instance.getLangOpts().MicrosoftExt = 1;  // __forceinline / __int64
-    compiler_instance.getLangOpts().DeclSpecKeyword = 1;  // __declspec
-    compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
+        args.push_back("-triple");
+        args.push_back(target_triple);
-    clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
-    bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
-    (void)buffer.release();
-    return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
-}
-static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
-{
-    // Compilation arguments
-    std::vector<const char*> args;
-    args.push_back(input_file.c_str());
-    args.push_back("-I");
-    args.push_back(include_dir);
-    args.push_back(debug ? "-O0" : "-O2");
-    args.push_back("-triple");
-    args.push_back("nvptx64-nvidia-cuda");
+        #if defined(__x86_64__) || defined(_M_X64)
+            args.push_back("-target-feature");
+            args.push_back("+f16c");  // Enables support for _Float16
+        #endif
-    args.push_back("-target-cpu");
-    args.push_back("sm_70");
+        #if defined(__aarch64__)
+        if(tiles_in_stack_memory)
+        {
+            // Static memory support is broken on AArch64 CPUs. As a workaround we reserve some stack memory on kernel entry,
+            // and point the callee-saved x28 register to it so we can access it anywhere. See tile_shared_storage_t in tile.h.
+            args.push_back("-target-feature");
+            args.push_back("+reserve-x28");
+        }
+        #endif
+    }
+    #if LLVM_VERSION_MAJOR >= 21
+    clang::DiagnosticOptions diagnostic_options;
+    std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
+            std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), diagnostic_options);
+    clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
+    std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
+            std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, diagnostic_options, text_diagnostic_printer.release());
+    #else
     clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
     std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
             std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
     clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
     std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
             std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
+    #endif
     clang::CompilerInstance compiler_instance;
@@ -222,21 +176,43 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
     std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
     compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
-    // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
-    // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
-    // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
-    compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
     if(!debug)
     {
         compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
     }
+    if(is_cuda)
+    {
+        // According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
+        // But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
+        // The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
+        compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
+        compiler_instance.getLangOpts().CUDA = 1;
+        compiler_instance.getLangOpts().CUDAIsDevice = 1;
+        compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
+    }
+    else
+    {
+        if(verify_fp)
+        {
+            compiler_instance.getPreprocessorOpts().addMacroDef("WP_VERIFY_FP");
+        }
-    compiler_instance.getLangOpts().CUDA = 1;
-    compiler_instance.getLangOpts().CUDAIsDevice = 1;
-    compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
+        if(tiles_in_stack_memory)
+        {
+            compiler_instance.getPreprocessorOpts().addMacroDef("WP_ENABLE_TILES_IN_STACK_MEMORY");
+        }
+        compiler_instance.getLangOpts().MicrosoftExt = 1;  // __forceinline / __int64
+        compiler_instance.getLangOpts().DeclSpecKeyword = 1;  // __declspec
+    }
+    #if LLVM_VERSION_MAJOR >= 21
+    compiler_instance.createDiagnostics(*llvm::vfs::getRealFileSystem(), text_diagnostic_printer.get(), false);
+    #else
     compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
+    #endif
     clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
     bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
@@ -247,12 +223,12 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
 extern "C" {
-WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
+WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp, bool tiles_in_stack_memory)
 {
     initialize_llvm();
     llvm::LLVMContext context;
-    std::unique_ptr<llvm::Module> module = cpp_to_llvm(input_file, cpp_src, include_dir, debug, verify_fp, context);
+    std::unique_ptr<llvm::Module> module = source_to_llvm(false, input_file, cpp_src, include_dir, debug, verify_fp, context, tiles_in_stack_memory);
     if(!module)
     {
@@ -260,7 +236,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
     }
     std::string error;
+     #if LLVM_VERSION_MAJOR >= 22
+    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple(target_triple), error);
+    #else
     const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
+    #endif
     const char* CPU = "generic";
     const char* features = "";
@@ -271,7 +251,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
         target_options.AllowFPOpFusion = llvm::FPOpFusion::Strict;
     llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;  // Position Independent Code
     llvm::CodeModel::Model code_model = llvm::CodeModel::Large;  // Don't make assumptions about displacement sizes
+    #if LLVM_VERSION_MAJOR >= 20
+    llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple(target_triple), CPU, features, target_options, relocation_model, code_model);
+    #else
     llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model);
+    #endif
     module->setDataLayout(target_machine->createDataLayout());
@@ -299,7 +283,7 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
     initialize_llvm();
     llvm::LLVMContext context;
-    std::unique_ptr<llvm::Module> module = cuda_to_llvm(input_file, cpp_src, include_dir, debug, context);
+    std::unique_ptr<llvm::Module> module = source_to_llvm(true, input_file, cpp_src, include_dir, debug, false, context, false);
     if(!module)
     {
@@ -307,13 +291,22 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
     }
     std::string error;
+    #if LLVM_VERSION_MAJOR >= 22
+    const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple("nvptx64-nvidia-cuda"), error);
+    #else
     const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
+    #endif
     const char* CPU = "sm_70";
     const char* features = "+ptx75";  // Warp requires CUDA 11.5, which supports PTX ISA 7.5
     llvm::TargetOptions target_options;
     llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;
+    #if LLVM_VERSION_MAJOR >= 20
+    llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple("nvptx64-nvidia-cuda"), CPU, features, target_options, relocation_model);
+    #else
     llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model);
+    #endif
     module->setDataLayout(target_machine->createDataLayout());
@@ -363,8 +356,16 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
         auto jit_expected = llvm::orc::LLJITBuilder()
             .setObjectLinkingLayerCreator(
+#if LLVM_VERSION_MAJOR >= 21
+                [&](llvm::orc::ExecutionSession &session) {
+#else
                 [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) {
+#endif
+#if LLVM_VERSION_MAJOR >= 21
+                    auto get_memory_manager = [](const llvm::MemoryBuffer &) {
+#else
                     auto get_memory_manager = []() {
+#endif
                         return std::make_unique<llvm::SectionMemoryManager>();
                     };
                     auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager));
@@ -443,6 +444,10 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
             SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)),
             SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)),
             SYMBOL(fmaf), SYMBOL_T(fma, double(*)(double, double, double)),
+            SYMBOL(erff), SYMBOL_T(erf, double(*)(double)),
+            SYMBOL(erfcf), SYMBOL_T(erfc, double(*)(double)),
+            SYMBOL(erfinvf), SYMBOL_T(erfinv, double(*)(double)),
+            SYMBOL(erfcinvf), SYMBOL_T(erfcinv, double(*)(double)),
             SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove),
             SYMBOL(_wp_assert),
             SYMBOL(_wp_isfinite),
@@ -454,13 +459,8 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
             // triggering the stack overflow guards.
             SYMBOL(__chkstk),
         #elif defined(__APPLE__)
-            #if defined(__MACH__) && defined(__aarch64__)
-                SYMBOL(bzero),
-                SYMBOL(_bzero),
-            #else
-                // Intel Mac
-                SYMBOL(__bzero),
-            #endif
+            SYMBOL(bzero),
+            SYMBOL(_bzero),
             SYMBOL(memset_pattern16),
             SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
         #else
@@ -531,6 +531,11 @@ WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
     return func->getValue();
 }
+WP_API const char* wp_warp_clang_version()  // C-linkage query (declared inside the extern "C" block) for this library's version string
+{
+    return WP_VERSION_STRING;  // macro is not defined in this hunk; presumably supplied by the newly included ../version.h (confirm)
+}
 }  // extern "C"
 }  // namespace wp

warp/native/coloring.cpp CHANGED Viewed

@@ -346,7 +346,6 @@ public:
         return node_weights[node_idx];
     }
     void add_node(int weight, int node_idx)
     {
         if (weight >= weight_buckets.size())

warp/native/crt.h CHANGED Viewed

@@ -311,6 +311,14 @@ float tanhf(float);
 double tanh(double);
 float fmaf(float, float, float);
 double fma(double, double, double);
+double erf(double);
+float erff(float);
+double erfc(double);
+float erfcf(float);
+double erfinv(double);
+float erfinvf(float);
+double erfcinv(double);
+float erfcinvf(float);
 // stddef.h
 #if defined(_WIN32)
@@ -358,3 +366,203 @@ inline bool isinf(double x)
 #endif  // !__CUDACC__
 #endif // WP_NO_CRT
+#if !defined(__CUDACC__)
+/*
+ * From Cephes Library polevl.c
+ * Original source: https://www.netlib.org/cephes/
+ * Copyright (c) 1984 by Stephen L. Moshier.
+ * All rights reserved.
+ *
+ * polevl(x, coefs, N) evaluates the degree-N polynomial
+ *     coefs[0]*x^N + coefs[1]*x^(N-1) + ... + coefs[N]
+ * at x. The coefficients are stored highest order first and the routine
+ * reads coefs[0] through coefs[N], i.e. N + 1 entries.
+ * When N <= 0 the loop body never runs and coefs[0] is returned unchanged.
+*/
+// evaluate polynomial using Horner's method
+static inline double polevl(double x, const double* coefs, int N)
+{
+    double ans = coefs[0];  // seed the accumulator with the highest-order coefficient
+    for (int i = 1; i <= N; i++)  // inclusive bound: coefs[N] is the constant term
+    {
+        ans = ans * x + coefs[i];  // one Horner step: multiply by x, add the next coefficient
+    }
+    return ans;
+}
+/*
+ * From Cephes Library polevl.c
+ * Original source: https://www.netlib.org/cephes/
+ * Copyright (c) 1984 by Stephen L. Moshier.
+ * All rights reserved.
+ *
+ * p1evl(x, coefs, N) evaluates the degree-N polynomial
+ *     x^N + coefs[0]*x^(N-1) + ... + coefs[N-1]
+ * at x. The leading coefficient 1 is implied and not stored, so the routine
+ * reads coefs[0] through coefs[N-1], i.e. N entries (one fewer than polevl
+ * reads for the same N).
+*/
+// evaluate polynomial assuming leading coef = 1, using Horner's method
+static inline double p1evl(double x, const double* coefs, int N)
+{
+    double ans = x + coefs[0];  // first Horner step with the implied leading coefficient of 1
+    for (int i = 1; i < N; i++)  // exclusive bound: the last coefficient read is coefs[N - 1]
+    {
+        ans = ans * x + coefs[i];  // one Horner step: multiply by x, add the next coefficient
+    }
+    return ans;
+}
+/*
+ * From Cephes Library ndtri.c
+ * Original source: https://www.netlib.org/cephes/
+ * Copyright (c) 1984 by Stephen L. Moshier.
+ * All rights reserved.
+ *
+ * ndtri(y) is the inverse of the normal distribution function: given a
+ * probability y it returns the corresponding argument x.
+ *
+ * Behaviour as implemented below:
+ *   - y <= 0 returns -HUGE_VAL and y >= 1 returns HUGE_VAL.
+ *   - When y > 1 - exp(-2) the routine works on 1 - y instead and records
+ *     that by setting code = 0.
+ *   - If the working value is above exp(-2) (the central region), the result
+ *     is a rational function P0/Q0 in (y - 0.5)^2, scaled by sqrt(2*pi).
+ *   - Otherwise (the tails) it forms x = sqrt(-2 log y) and subtracts a
+ *     rational correction in 1/x, using P1/Q1 when x < 8 and P2/Q2 when
+ *     x >= 8; the result is negated when code != 0, i.e. when the input
+ *     was not reflected.
+ *
+ * The coefficient tables are stored highest order first, as polevl/p1evl
+ * expect; the Q tables omit their implied leading coefficient of 1.
+*/
+// inverse normal distribution function (ndtri)
+static inline double ndtri(double y)
+{
+    // domain check
+    if (y <= 0.0 || y >= 1.0)
+    {
+        return (y <= 0.0) ? -HUGE_VAL : HUGE_VAL;  // saturate at the ends of the probability range
+    }
+    // constants from Cephes
+    const double s2pi = 2.50662827463100050242E0;  // sqrt(2*pi)
+    const double exp_neg2 = 0.13533528323661269189;  // exp(-2)
+    // approximation for 0 <= abs(z - 0.5) <= 3/8
+    static const double P0[5] = {
+        -5.99633501014107895267e1,
+        9.80010754185999661536e1,
+        -5.66762857469070293439e1,
+        1.39312609387279679503e1,
+        -1.23916583867381258016e0
+    };
+    static const double Q0[8] = {
+        1.95448858338141759834e0,
+        4.67627912898881538453e0,
+        8.63602421390890590575e1,
+        -2.25462687854119370527e2,
+        2.00260212380060660359e2,
+        -8.20372256168333339912e1,
+        1.59056225126211695515e1,
+        -1.18331621121330003142e0
+    };
+    // approximation for interval z = sqrt(-2 log y) between 2 and 8
+    static const double P1[9] = {
+        4.05544892305962419923e0,
+        3.15251094599893866154e1,
+        5.71628192246421288162e1,
+        4.40805073893200834700e1,
+        1.46849561928858024014e1,
+        2.18663306850790267539e0,
+        -1.40256079171354495875e-1,
+        -3.50424626827848203418e-2,
+        -8.57456785154685413611e-4
+    };
+    static const double Q1[8] = {
+        1.57799883256466749731e1,
+        4.53907635128879210584e1,
+        4.13172038254672030440e1,
+        1.50425385692907503408e1,
+        2.50464946208309415979e0,
+        -1.42182922854787788574e-1,
+        -3.80806407691578277194e-2,
+        -9.33259480895457427372e-4
+    };
+    // approximation for interval z = sqrt(-2 log y) between 8 and 64
+    static const double P2[9] = {
+        3.23774891776946035970e0,
+        6.91522889068984211695e0,
+        3.93881025292474443415e0,
+        1.33303460815807542389e0,
+        2.01485389549179081538e-1,
+        1.23716634817820021358e-2,
+        3.01581553508235416007e-4,
+        2.65806974686737550832e-6,
+        6.23974539184983293730e-9
+    };
+    static const double Q2[8] = {
+        6.02427039364742014255e0,
+        3.67983563856160859403e0,
+        1.37702099489081330271e0,
+        2.16236993594496635890e-1,
+        1.34204006088543189037e-2,
+        3.28014464682127739104e-4,
+        2.89247864745380683936e-6,
+        6.79019408009981274425e-9
+    };
+    int code = 1;  // nonzero means the input was NOT reflected, so a tail result gets negated below
+    double y_work = y;
+    if (y_work > (1.0 - exp_neg2))
+    {
+        y_work = 1.0 - y_work;  // reflect the upper tail onto the lower tail
+        code = 0;
+    }
+    // middle region: 0 <= |y - 0.5| <= 3/8
+    if (y_work > exp_neg2)
+    {
+        y_work -= 0.5;  // centre on the median so the approximation is odd in y_work
+        double y2 = y_work * y_work;
+        double x = y_work + y_work * (y2 * polevl(y2, P0, 4) / p1evl(y2, Q0, 8));
+        x = x * s2pi;
+        return x;  // the sign comes from y_work itself; code is not consulted on this path
+    }
+    double x = ::sqrt(-2.0 * ::log(y_work));  // tail variable; y_work <= exp(-2) here, so x >= 2
+    double x0 = x - ::log(x) / x;  // leading-order tail estimate
+    double z = 1.0 / x;  // the rational corrections are expressed in 1/x
+    double x1;
+    if (x < 8.0)
+    {
+        x1 = z * polevl(z, P1, 8) / p1evl(z, Q1, 8);
+    }
+    else
+    {
+        x1 = z * polevl(z, P2, 8) / p1evl(z, Q2, 8);
+    }
+    x = x0 - x1;
+    if (code != 0)
+    {
+        x = -x;  // unreflected input lies in the lower tail, where the result is negative
+    }
+    return x;
+}
+// inverse error function (not in standard C library): maps z in [-1, 1] to the x with erf(x) == z, computed through ndtri()
+// only compiled for non-CUDA builds - CUDA provides these in its math headers
+inline double erfinv(double z)
+{
+    // handle special cases
+    if (z == 0.0)
+        return 0.0;
+    if (z == 1.0)
+        return HUGE_VAL;  // infinity
+    if (z == -1.0)
+        return -HUGE_VAL;  // -infinity
+    if (z < -1.0 || z > 1.0)
+        return NAN;  // outside valid range
+    // erfinv(z) = ndtri((z + 1) / 2) / sqrt(2); NOTE(review): z + 1.0 rounds to exactly 1.0 for very small nonzero |z|, which would give 0 here - confirm whether that loss of relative precision is acceptable
+    return ndtri((z + 1.0) / 2.0) / ::sqrt(2.0);
+}
+inline float erfinvf(float x)  // single-precision wrapper around erfinv()
+{
+    return (float)erfinv((double)x);  // evaluate in double precision, then narrow the result to float
+}
+inline double erfcinv(double x)
+{
+    return erfinv(1.0 - x);
+}
+inline float erfcinvf(float x)  // single-precision wrapper around erfcinv()
+{
+    return (float)erfcinv((double)x);  // evaluate in double precision, then narrow the result to float
+}
+#endif  // !defined(__CUDACC__)