warp-lang 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +301 -287
- warp/__init__.pyi +794 -305
- warp/_src/__init__.py +14 -0
- warp/_src/autograd.py +1075 -0
- warp/_src/build.py +618 -0
- warp/_src/build_dll.py +640 -0
- warp/{builtins.py → _src/builtins.py} +1382 -377
- warp/_src/codegen.py +4359 -0
- warp/{config.py → _src/config.py} +178 -169
- warp/_src/constants.py +57 -0
- warp/_src/context.py +8294 -0
- warp/_src/dlpack.py +462 -0
- warp/_src/fabric.py +355 -0
- warp/_src/fem/__init__.py +14 -0
- warp/_src/fem/adaptivity.py +508 -0
- warp/_src/fem/cache.py +687 -0
- warp/_src/fem/dirichlet.py +188 -0
- warp/{fem → _src/fem}/domain.py +40 -30
- warp/_src/fem/field/__init__.py +131 -0
- warp/_src/fem/field/field.py +701 -0
- warp/{fem → _src/fem}/field/nodal_field.py +30 -15
- warp/{fem → _src/fem}/field/restriction.py +1 -1
- warp/{fem → _src/fem}/field/virtual.py +53 -27
- warp/_src/fem/geometry/__init__.py +32 -0
- warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
- warp/_src/fem/geometry/closest_point.py +97 -0
- warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
- warp/{fem → _src/fem}/geometry/element.py +32 -10
- warp/{fem → _src/fem}/geometry/geometry.py +48 -20
- warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
- warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
- warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
- warp/{fem → _src/fem}/geometry/partition.py +121 -63
- warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
- warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
- warp/{fem → _src/fem}/integrate.py +164 -158
- warp/_src/fem/linalg.py +383 -0
- warp/_src/fem/operator.py +396 -0
- warp/_src/fem/polynomial.py +229 -0
- warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
- warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
- warp/_src/fem/space/__init__.py +248 -0
- warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
- warp/_src/fem/space/basis_space.py +679 -0
- warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
- warp/{fem → _src/fem}/space/function_space.py +14 -13
- warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
- warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
- warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
- warp/{fem → _src/fem}/space/partition.py +117 -60
- warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/restriction.py +66 -33
- warp/_src/fem/space/shape/__init__.py +152 -0
- warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
- warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
- warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
- warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
- warp/_src/fem/space/topology.py +459 -0
- warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
- warp/_src/fem/types.py +112 -0
- warp/_src/fem/utils.py +486 -0
- warp/_src/jax.py +186 -0
- warp/_src/jax_experimental/__init__.py +14 -0
- warp/_src/jax_experimental/custom_call.py +387 -0
- warp/_src/jax_experimental/ffi.py +1284 -0
- warp/_src/jax_experimental/xla_ffi.py +656 -0
- warp/_src/marching_cubes.py +708 -0
- warp/_src/math.py +414 -0
- warp/_src/optim/__init__.py +14 -0
- warp/_src/optim/adam.py +163 -0
- warp/_src/optim/linear.py +1606 -0
- warp/_src/optim/sgd.py +112 -0
- warp/_src/paddle.py +406 -0
- warp/_src/render/__init__.py +14 -0
- warp/_src/render/imgui_manager.py +289 -0
- warp/_src/render/render_opengl.py +3636 -0
- warp/_src/render/render_usd.py +937 -0
- warp/_src/render/utils.py +160 -0
- warp/_src/sparse.py +2716 -0
- warp/_src/tape.py +1206 -0
- warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
- warp/_src/torch.py +391 -0
- warp/_src/types.py +5870 -0
- warp/_src/utils.py +1693 -0
- warp/autograd.py +12 -1054
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +8 -588
- warp/build_dll.py +6 -721
- warp/codegen.py +6 -4251
- warp/constants.py +6 -39
- warp/context.py +12 -8062
- warp/dlpack.py +6 -444
- warp/examples/distributed/example_jacobi_mpi.py +4 -5
- warp/examples/fem/example_adaptive_grid.py +1 -1
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +8 -8
- warp/examples/fem/example_diffusion.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_mixed_elasticity.py +2 -2
- warp/examples/fem/example_navier_stokes.py +1 -1
- warp/examples/fem/example_nonconforming_contact.py +7 -7
- warp/examples/fem/example_stokes.py +1 -1
- warp/examples/fem/example_stokes_transfer.py +1 -1
- warp/examples/fem/utils.py +2 -2
- warp/examples/interop/example_jax_callable.py +1 -1
- warp/examples/interop/example_jax_ffi_callback.py +1 -1
- warp/examples/interop/example_jax_kernel.py +1 -1
- warp/examples/tile/example_tile_mcgp.py +191 -0
- warp/fabric.py +6 -337
- warp/fem/__init__.py +159 -97
- warp/fem/adaptivity.py +7 -489
- warp/fem/cache.py +9 -648
- warp/fem/dirichlet.py +6 -184
- warp/fem/field/__init__.py +8 -109
- warp/fem/field/field.py +7 -652
- warp/fem/geometry/__init__.py +7 -18
- warp/fem/geometry/closest_point.py +11 -77
- warp/fem/linalg.py +18 -366
- warp/fem/operator.py +11 -369
- warp/fem/polynomial.py +9 -209
- warp/fem/space/__init__.py +5 -211
- warp/fem/space/basis_space.py +6 -662
- warp/fem/space/shape/__init__.py +41 -118
- warp/fem/space/topology.py +6 -437
- warp/fem/types.py +6 -81
- warp/fem/utils.py +11 -444
- warp/jax.py +8 -165
- warp/jax_experimental/__init__.py +14 -1
- warp/jax_experimental/custom_call.py +8 -365
- warp/jax_experimental/ffi.py +17 -873
- warp/jax_experimental/xla_ffi.py +5 -605
- warp/marching_cubes.py +5 -689
- warp/math.py +16 -393
- warp/native/array.h +385 -37
- warp/native/builtin.h +314 -37
- warp/native/bvh.cpp +43 -9
- warp/native/bvh.cu +62 -27
- warp/native/bvh.h +310 -309
- warp/native/clang/clang.cpp +102 -97
- warp/native/coloring.cpp +0 -1
- warp/native/crt.h +208 -0
- warp/native/exports.h +156 -0
- warp/native/hashgrid.cu +2 -0
- warp/native/intersect.h +24 -1
- warp/native/intersect_tri.h +44 -35
- warp/native/mat.h +1456 -276
- warp/native/mesh.cpp +4 -4
- warp/native/mesh.cu +4 -2
- warp/native/mesh.h +176 -61
- warp/native/quat.h +0 -52
- warp/native/scan.cu +2 -0
- warp/native/sparse.cu +7 -3
- warp/native/spatial.h +12 -0
- warp/native/tile.h +681 -89
- warp/native/tile_radix_sort.h +1 -1
- warp/native/tile_reduce.h +394 -46
- warp/native/tile_scan.h +4 -4
- warp/native/vec.h +469 -0
- warp/native/version.h +23 -0
- warp/native/volume.cpp +1 -1
- warp/native/volume.cu +1 -0
- warp/native/volume.h +1 -1
- warp/native/volume_builder.cu +2 -0
- warp/native/warp.cpp +57 -29
- warp/native/warp.cu +253 -171
- warp/native/warp.h +11 -8
- warp/optim/__init__.py +6 -3
- warp/optim/adam.py +6 -145
- warp/optim/linear.py +14 -1585
- warp/optim/sgd.py +6 -94
- warp/paddle.py +6 -388
- warp/render/__init__.py +8 -4
- warp/render/imgui_manager.py +7 -267
- warp/render/render_opengl.py +6 -3618
- warp/render/render_usd.py +6 -919
- warp/render/utils.py +6 -142
- warp/sparse.py +37 -2563
- warp/tape.py +6 -1188
- warp/tests/__main__.py +1 -1
- warp/tests/cuda/test_async.py +4 -4
- warp/tests/cuda/test_conditional_captures.py +1 -1
- warp/tests/cuda/test_multigpu.py +1 -1
- warp/tests/cuda/test_streams.py +58 -1
- warp/tests/geometry/test_bvh.py +157 -22
- warp/tests/geometry/test_marching_cubes.py +0 -1
- warp/tests/geometry/test_mesh.py +5 -3
- warp/tests/geometry/test_mesh_query_aabb.py +5 -12
- warp/tests/geometry/test_mesh_query_point.py +5 -2
- warp/tests/geometry/test_mesh_query_ray.py +15 -3
- warp/tests/geometry/test_volume_write.py +5 -5
- warp/tests/interop/test_dlpack.py +14 -14
- warp/tests/interop/test_jax.py +772 -49
- warp/tests/interop/test_paddle.py +1 -1
- warp/tests/test_adam.py +0 -1
- warp/tests/test_arithmetic.py +9 -9
- warp/tests/test_array.py +527 -100
- warp/tests/test_array_reduce.py +3 -3
- warp/tests/test_atomic.py +12 -8
- warp/tests/test_atomic_bitwise.py +209 -0
- warp/tests/test_atomic_cas.py +4 -4
- warp/tests/test_bool.py +2 -2
- warp/tests/test_builtins_resolution.py +5 -571
- warp/tests/test_codegen.py +33 -14
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_context.py +6 -6
- warp/tests/test_copy.py +242 -161
- warp/tests/test_ctypes.py +3 -3
- warp/tests/test_devices.py +24 -2
- warp/tests/test_examples.py +16 -84
- warp/tests/test_fabricarray.py +35 -35
- warp/tests/test_fast_math.py +0 -2
- warp/tests/test_fem.py +56 -10
- warp/tests/test_fixedarray.py +3 -3
- warp/tests/test_func.py +8 -5
- warp/tests/test_generics.py +1 -1
- warp/tests/test_indexedarray.py +24 -24
- warp/tests/test_intersect.py +39 -9
- warp/tests/test_large.py +1 -1
- warp/tests/test_lerp.py +3 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_map.py +35 -4
- warp/tests/test_mat.py +52 -62
- warp/tests/test_mat_constructors.py +4 -5
- warp/tests/test_mat_lite.py +1 -1
- warp/tests/test_mat_scalar_ops.py +121 -121
- warp/tests/test_math.py +34 -0
- warp/tests/test_module_aot.py +4 -4
- warp/tests/test_modules_lite.py +28 -2
- warp/tests/test_print.py +11 -11
- warp/tests/test_quat.py +93 -58
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +38 -10
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +126 -15
- warp/tests/test_spatial.py +105 -87
- warp/tests/test_special_values.py +6 -6
- warp/tests/test_static.py +7 -7
- warp/tests/test_struct.py +13 -2
- warp/tests/test_triangle_closest_point.py +48 -1
- warp/tests/test_types.py +27 -15
- warp/tests/test_utils.py +52 -52
- warp/tests/test_vec.py +29 -29
- warp/tests/test_vec_constructors.py +5 -5
- warp/tests/test_vec_scalar_ops.py +97 -97
- warp/tests/test_version.py +75 -0
- warp/tests/tile/test_tile.py +178 -0
- warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
- warp/tests/tile/test_tile_cholesky.py +7 -4
- warp/tests/tile/test_tile_load.py +26 -2
- warp/tests/tile/test_tile_mathdx.py +3 -3
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +2 -4
- warp/tests/tile/test_tile_reduce.py +214 -13
- warp/tests/unittest_suites.py +6 -14
- warp/tests/unittest_utils.py +10 -9
- warp/tests/walkthrough_debug.py +3 -1
- warp/torch.py +6 -373
- warp/types.py +29 -5764
- warp/utils.py +10 -1659
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
- warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
- warp/examples/assets/cartpole.urdf +0 -110
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/nv_ant.xml +0 -92
- warp/examples/assets/nv_humanoid.xml +0 -183
- warp/examples/assets/quadruped.urdf +0 -268
- warp/examples/optim/example_bounce.py +0 -266
- warp/examples/optim/example_cloth_throw.py +0 -228
- warp/examples/optim/example_drone.py +0 -870
- warp/examples/optim/example_inverse_kinematics.py +0 -182
- warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
- warp/examples/optim/example_softbody_properties.py +0 -400
- warp/examples/optim/example_spring_cage.py +0 -245
- warp/examples/optim/example_trajectory.py +0 -227
- warp/examples/sim/example_cartpole.py +0 -143
- warp/examples/sim/example_cloth.py +0 -225
- warp/examples/sim/example_cloth_self_contact.py +0 -316
- warp/examples/sim/example_granular.py +0 -130
- warp/examples/sim/example_granular_collision_sdf.py +0 -202
- warp/examples/sim/example_jacobian_ik.py +0 -244
- warp/examples/sim/example_particle_chain.py +0 -124
- warp/examples/sim/example_quadruped.py +0 -203
- warp/examples/sim/example_rigid_chain.py +0 -203
- warp/examples/sim/example_rigid_contact.py +0 -195
- warp/examples/sim/example_rigid_force.py +0 -133
- warp/examples/sim/example_rigid_gyroscopic.py +0 -115
- warp/examples/sim/example_rigid_soft_contact.py +0 -140
- warp/examples/sim/example_soft_body.py +0 -196
- warp/examples/tile/example_tile_walker.py +0 -327
- warp/sim/__init__.py +0 -74
- warp/sim/articulation.py +0 -793
- warp/sim/collide.py +0 -2570
- warp/sim/graph_coloring.py +0 -307
- warp/sim/import_mjcf.py +0 -791
- warp/sim/import_snu.py +0 -227
- warp/sim/import_urdf.py +0 -579
- warp/sim/import_usd.py +0 -898
- warp/sim/inertia.py +0 -357
- warp/sim/integrator.py +0 -245
- warp/sim/integrator_euler.py +0 -2000
- warp/sim/integrator_featherstone.py +0 -2101
- warp/sim/integrator_vbd.py +0 -2487
- warp/sim/integrator_xpbd.py +0 -3295
- warp/sim/model.py +0 -4821
- warp/sim/particles.py +0 -121
- warp/sim/render.py +0 -431
- warp/sim/utils.py +0 -431
- warp/tests/sim/disabled_kinematics.py +0 -244
- warp/tests/sim/test_cloth.py +0 -863
- warp/tests/sim/test_collision.py +0 -743
- warp/tests/sim/test_coloring.py +0 -347
- warp/tests/sim/test_inertia.py +0 -161
- warp/tests/sim/test_model.py +0 -226
- warp/tests/sim/test_sim_grad.py +0 -287
- warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
- warp/tests/sim/test_sim_kinematics.py +0 -98
- warp/thirdparty/__init__.py +0 -0
- warp_lang-1.9.1.dist-info/RECORD +0 -456
- /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
- /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/native/clang/clang.cpp
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
*/
|
|
17
17
|
|
|
18
18
|
#include "../native/crt.h"
|
|
19
|
+
#include "../version.h"
|
|
19
20
|
|
|
20
21
|
#include <clang/Frontend/CompilerInstance.h>
|
|
21
22
|
#include <clang/Basic/DiagnosticOptions.h>
|
|
@@ -58,27 +59,14 @@
|
|
|
58
59
|
#if defined(_WIN64)
|
|
59
60
|
extern "C" void __chkstk();
|
|
60
61
|
#elif defined(__APPLE__)
|
|
61
|
-
|
|
62
|
-
#if defined(__MACH__) && defined(__aarch64__)
|
|
63
62
|
extern "C" void _bzero(void *s, size_t n) {
|
|
64
63
|
memset(s, 0, n);
|
|
65
64
|
}
|
|
66
65
|
extern "C" void __bzero(void *s, size_t n) {
|
|
67
66
|
memset(s, 0, n);
|
|
68
67
|
}
|
|
69
|
-
|
|
70
|
-
extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
|
|
71
|
-
extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
|
|
72
|
-
|
|
73
|
-
#else
|
|
74
|
-
// // Intel Mac's define bzero in libSystem.dylib
|
|
75
|
-
extern "C" void __bzero(void *s, size_t n);
|
|
76
|
-
|
|
77
68
|
extern "C" void _memset_pattern16(void *s, const void *pattern, size_t n);
|
|
78
69
|
extern "C" void __memset_pattern16(void *s, const void *pattern, size_t n);
|
|
79
|
-
|
|
80
|
-
#endif
|
|
81
|
-
|
|
82
70
|
extern "C" __double2 __sincos_stret(double);
|
|
83
71
|
extern "C" __float2 __sincosf_stret(float);
|
|
84
72
|
#endif // defined(__APPLE__)
|
|
@@ -114,7 +102,7 @@ static void initialize_llvm()
|
|
|
114
102
|
llvm::InitializeAllAsmPrinters();
|
|
115
103
|
}
|
|
116
104
|
|
|
117
|
-
static std::unique_ptr<llvm::Module>
|
|
105
|
+
static std::unique_ptr<llvm::Module> source_to_llvm(bool is_cuda, const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, bool verify_fp, llvm::LLVMContext& context, bool tiles_in_stack_memory)
|
|
118
106
|
{
|
|
119
107
|
// Compilation arguments
|
|
120
108
|
std::vector<const char*> args;
|
|
@@ -125,84 +113,50 @@ static std::unique_ptr<llvm::Module> cpp_to_llvm(const std::string& input_file,
|
|
|
125
113
|
|
|
126
114
|
args.push_back(debug ? "-O0" : "-O2");
|
|
127
115
|
|
|
128
|
-
|
|
129
|
-
args.push_back(target_triple);
|
|
130
|
-
|
|
131
|
-
#if defined(__x86_64__) || defined(_M_X64)
|
|
132
|
-
args.push_back("-target-feature");
|
|
133
|
-
args.push_back("+f16c"); // Enables support for _Float16
|
|
134
|
-
#endif
|
|
135
|
-
|
|
136
|
-
clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
|
|
137
|
-
std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
|
|
138
|
-
std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
|
|
139
|
-
clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
|
|
140
|
-
std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
|
|
141
|
-
std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
|
|
142
|
-
|
|
143
|
-
clang::CompilerInstance compiler_instance;
|
|
144
|
-
|
|
145
|
-
auto& compiler_invocation = compiler_instance.getInvocation();
|
|
146
|
-
clang::CompilerInvocation::CreateFromArgs(compiler_invocation, args, *diagnostic_engine.release());
|
|
147
|
-
|
|
148
|
-
if(debug)
|
|
116
|
+
if(is_cuda)
|
|
149
117
|
{
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
#else
|
|
153
|
-
compiler_invocation.getCodeGenOpts().setDebugInfo(clang::codegenoptions::FullDebugInfo);
|
|
154
|
-
#endif
|
|
155
|
-
}
|
|
118
|
+
args.push_back("-triple");
|
|
119
|
+
args.push_back("nvptx64-nvidia-cuda");
|
|
156
120
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
|
|
160
|
-
|
|
161
|
-
if(!debug)
|
|
162
|
-
{
|
|
163
|
-
compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
|
|
121
|
+
args.push_back("-target-cpu");
|
|
122
|
+
args.push_back("sm_70");
|
|
164
123
|
}
|
|
165
|
-
|
|
166
|
-
if(verify_fp)
|
|
124
|
+
else
|
|
167
125
|
{
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
|
|
172
|
-
compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec
|
|
173
|
-
|
|
174
|
-
compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
|
|
126
|
+
args.push_back("-triple");
|
|
127
|
+
args.push_back(target_triple);
|
|
175
128
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
return success ? std::move(emit_llvm_only_action.takeModule()) : nullptr;
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file, const char* cpp_src, const char* include_dir, bool debug, llvm::LLVMContext& context)
|
|
184
|
-
{
|
|
185
|
-
// Compilation arguments
|
|
186
|
-
std::vector<const char*> args;
|
|
187
|
-
args.push_back(input_file.c_str());
|
|
188
|
-
|
|
189
|
-
args.push_back("-I");
|
|
190
|
-
args.push_back(include_dir);
|
|
191
|
-
|
|
192
|
-
args.push_back(debug ? "-O0" : "-O2");
|
|
193
|
-
|
|
194
|
-
args.push_back("-triple");
|
|
195
|
-
args.push_back("nvptx64-nvidia-cuda");
|
|
129
|
+
#if defined(__x86_64__) || defined(_M_X64)
|
|
130
|
+
args.push_back("-target-feature");
|
|
131
|
+
args.push_back("+f16c"); // Enables support for _Float16
|
|
132
|
+
#endif
|
|
196
133
|
|
|
197
|
-
|
|
198
|
-
|
|
134
|
+
#if defined(__aarch64__)
|
|
135
|
+
if(tiles_in_stack_memory)
|
|
136
|
+
{
|
|
137
|
+
// Static memory support is broken on AArch64 CPUs. As a workaround we reserve some stack memory on kernel entry,
|
|
138
|
+
// and point the callee-saved x28 register to it so we can access it anywhere. See tile_shared_storage_t in tile.h.
|
|
139
|
+
args.push_back("-target-feature");
|
|
140
|
+
args.push_back("+reserve-x28");
|
|
141
|
+
}
|
|
142
|
+
#endif
|
|
143
|
+
}
|
|
199
144
|
|
|
145
|
+
#if LLVM_VERSION_MAJOR >= 21
|
|
146
|
+
clang::DiagnosticOptions diagnostic_options;
|
|
147
|
+
std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
|
|
148
|
+
std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), diagnostic_options);
|
|
149
|
+
clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
|
|
150
|
+
std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
|
|
151
|
+
std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, diagnostic_options, text_diagnostic_printer.release());
|
|
152
|
+
#else
|
|
200
153
|
clang::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagnostic_options = new clang::DiagnosticOptions();
|
|
201
154
|
std::unique_ptr<clang::TextDiagnosticPrinter> text_diagnostic_printer =
|
|
202
155
|
std::make_unique<clang::TextDiagnosticPrinter>(llvm::errs(), &*diagnostic_options);
|
|
203
156
|
clang::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagnostic_ids;
|
|
204
157
|
std::unique_ptr<clang::DiagnosticsEngine> diagnostic_engine =
|
|
205
158
|
std::make_unique<clang::DiagnosticsEngine>(diagnostic_ids, &*diagnostic_options, text_diagnostic_printer.release());
|
|
159
|
+
#endif
|
|
206
160
|
|
|
207
161
|
clang::CompilerInstance compiler_instance;
|
|
208
162
|
|
|
@@ -222,21 +176,43 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
|
|
|
222
176
|
std::unique_ptr<llvm::MemoryBuffer> buffer = llvm::MemoryBuffer::getMemBufferCopy(cpp_src);
|
|
223
177
|
compiler_invocation.getPreprocessorOpts().addRemappedFile(input_file.c_str(), buffer.get());
|
|
224
178
|
|
|
225
|
-
// According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
|
|
226
|
-
// But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
|
|
227
|
-
// The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
|
|
228
|
-
compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
|
|
229
|
-
|
|
230
179
|
if(!debug)
|
|
231
180
|
{
|
|
232
181
|
compiler_instance.getPreprocessorOpts().addMacroDef("NDEBUG");
|
|
233
182
|
}
|
|
183
|
+
|
|
184
|
+
if(is_cuda)
|
|
185
|
+
{
|
|
186
|
+
// According to https://llvm.org/docs/CompileCudaWithLLVM.html, "Both clang and nvcc define `__CUDACC__` during CUDA compilation."
|
|
187
|
+
// But this normally happens in the __clang_cuda_runtime_wrapper.h header, which we don't include.
|
|
188
|
+
// The __CUDA__ and __CUDA_ARCH__ macros are internally defined by llvm-project/clang/lib/Frontend/InitPreprocessor.cpp
|
|
189
|
+
compiler_instance.getPreprocessorOpts().addMacroDef("__CUDACC__");
|
|
190
|
+
|
|
191
|
+
compiler_instance.getLangOpts().CUDA = 1;
|
|
192
|
+
compiler_instance.getLangOpts().CUDAIsDevice = 1;
|
|
193
|
+
compiler_instance.getLangOpts().CUDAAllowVariadicFunctions = 1;
|
|
194
|
+
}
|
|
195
|
+
else
|
|
196
|
+
{
|
|
197
|
+
if(verify_fp)
|
|
198
|
+
{
|
|
199
|
+
compiler_instance.getPreprocessorOpts().addMacroDef("WP_VERIFY_FP");
|
|
200
|
+
}
|
|
234
201
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
202
|
+
if(tiles_in_stack_memory)
|
|
203
|
+
{
|
|
204
|
+
compiler_instance.getPreprocessorOpts().addMacroDef("WP_ENABLE_TILES_IN_STACK_MEMORY");
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
compiler_instance.getLangOpts().MicrosoftExt = 1; // __forceinline / __int64
|
|
208
|
+
compiler_instance.getLangOpts().DeclSpecKeyword = 1; // __declspec
|
|
209
|
+
}
|
|
238
210
|
|
|
211
|
+
#if LLVM_VERSION_MAJOR >= 21
|
|
212
|
+
compiler_instance.createDiagnostics(*llvm::vfs::getRealFileSystem(), text_diagnostic_printer.get(), false);
|
|
213
|
+
#else
|
|
239
214
|
compiler_instance.createDiagnostics(text_diagnostic_printer.get(), false);
|
|
215
|
+
#endif
|
|
240
216
|
|
|
241
217
|
clang::EmitLLVMOnlyAction emit_llvm_only_action(&context);
|
|
242
218
|
bool success = compiler_instance.ExecuteAction(emit_llvm_only_action);
|
|
@@ -247,12 +223,12 @@ static std::unique_ptr<llvm::Module> cuda_to_llvm(const std::string& input_file,
|
|
|
247
223
|
|
|
248
224
|
extern "C" {
|
|
249
225
|
|
|
250
|
-
WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp)
|
|
226
|
+
WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const char* include_dir, const char* output_file, bool debug, bool verify_fp, bool fuse_fp, bool tiles_in_stack_memory)
|
|
251
227
|
{
|
|
252
228
|
initialize_llvm();
|
|
253
229
|
|
|
254
230
|
llvm::LLVMContext context;
|
|
255
|
-
std::unique_ptr<llvm::Module> module =
|
|
231
|
+
std::unique_ptr<llvm::Module> module = source_to_llvm(false, input_file, cpp_src, include_dir, debug, verify_fp, context, tiles_in_stack_memory);
|
|
256
232
|
|
|
257
233
|
if(!module)
|
|
258
234
|
{
|
|
@@ -260,7 +236,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
|
|
|
260
236
|
}
|
|
261
237
|
|
|
262
238
|
std::string error;
|
|
239
|
+
#if LLVM_VERSION_MAJOR >= 22
|
|
240
|
+
const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple(target_triple), error);
|
|
241
|
+
#else
|
|
263
242
|
const llvm::Target* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
|
|
243
|
+
#endif
|
|
264
244
|
|
|
265
245
|
const char* CPU = "generic";
|
|
266
246
|
const char* features = "";
|
|
@@ -271,7 +251,11 @@ WP_API int wp_compile_cpp(const char* cpp_src, const char *input_file, const cha
|
|
|
271
251
|
target_options.AllowFPOpFusion = llvm::FPOpFusion::Strict;
|
|
272
252
|
llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_; // Position Independent Code
|
|
273
253
|
llvm::CodeModel::Model code_model = llvm::CodeModel::Large; // Don't make assumptions about displacement sizes
|
|
254
|
+
#if LLVM_VERSION_MAJOR >= 20
|
|
255
|
+
llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple(target_triple), CPU, features, target_options, relocation_model, code_model);
|
|
256
|
+
#else
|
|
274
257
|
llvm::TargetMachine* target_machine = target->createTargetMachine(target_triple, CPU, features, target_options, relocation_model, code_model);
|
|
258
|
+
#endif
|
|
275
259
|
|
|
276
260
|
module->setDataLayout(target_machine->createDataLayout());
|
|
277
261
|
|
|
@@ -299,7 +283,7 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
|
|
|
299
283
|
initialize_llvm();
|
|
300
284
|
|
|
301
285
|
llvm::LLVMContext context;
|
|
302
|
-
std::unique_ptr<llvm::Module> module =
|
|
286
|
+
std::unique_ptr<llvm::Module> module = source_to_llvm(true, input_file, cpp_src, include_dir, debug, false, context, false);
|
|
303
287
|
|
|
304
288
|
if(!module)
|
|
305
289
|
{
|
|
@@ -307,13 +291,22 @@ WP_API int wp_compile_cuda(const char* cpp_src, const char *input_file, const ch
|
|
|
307
291
|
}
|
|
308
292
|
|
|
309
293
|
std::string error;
|
|
294
|
+
|
|
295
|
+
#if LLVM_VERSION_MAJOR >= 22
|
|
296
|
+
const llvm::Target* target = llvm::TargetRegistry::lookupTarget(llvm::Triple("nvptx64-nvidia-cuda"), error);
|
|
297
|
+
#else
|
|
310
298
|
const llvm::Target* target = llvm::TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", error);
|
|
299
|
+
#endif
|
|
311
300
|
|
|
312
301
|
const char* CPU = "sm_70";
|
|
313
302
|
const char* features = "+ptx75"; // Warp requires CUDA 11.5, which supports PTX ISA 7.5
|
|
314
303
|
llvm::TargetOptions target_options;
|
|
315
304
|
llvm::Reloc::Model relocation_model = llvm::Reloc::PIC_;
|
|
305
|
+
#if LLVM_VERSION_MAJOR >= 20
|
|
306
|
+
llvm::TargetMachine* target_machine = target->createTargetMachine(llvm::Triple("nvptx64-nvidia-cuda"), CPU, features, target_options, relocation_model);
|
|
307
|
+
#else
|
|
316
308
|
llvm::TargetMachine* target_machine = target->createTargetMachine("nvptx64-nvidia-cuda", CPU, features, target_options, relocation_model);
|
|
309
|
+
#endif
|
|
317
310
|
|
|
318
311
|
module->setDataLayout(target_machine->createDataLayout());
|
|
319
312
|
|
|
@@ -363,8 +356,16 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
|
|
|
363
356
|
|
|
364
357
|
auto jit_expected = llvm::orc::LLJITBuilder()
|
|
365
358
|
.setObjectLinkingLayerCreator(
|
|
359
|
+
#if LLVM_VERSION_MAJOR >= 21
|
|
360
|
+
[&](llvm::orc::ExecutionSession &session) {
|
|
361
|
+
#else
|
|
366
362
|
[&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) {
|
|
363
|
+
#endif
|
|
364
|
+
#if LLVM_VERSION_MAJOR >= 21
|
|
365
|
+
auto get_memory_manager = [](const llvm::MemoryBuffer &) {
|
|
366
|
+
#else
|
|
367
367
|
auto get_memory_manager = []() {
|
|
368
|
+
#endif
|
|
368
369
|
return std::make_unique<llvm::SectionMemoryManager>();
|
|
369
370
|
};
|
|
370
371
|
auto obj_linking_layer = std::make_unique<llvm::orc::RTDyldObjectLinkingLayer>(session, std::move(get_memory_manager));
|
|
@@ -443,6 +444,10 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
|
|
|
443
444
|
SYMBOL(coshf), SYMBOL_T(cosh, double(*)(double)),
|
|
444
445
|
SYMBOL(tanhf), SYMBOL_T(tanh, double(*)(double)),
|
|
445
446
|
SYMBOL(fmaf), SYMBOL_T(fma, double(*)(double, double, double)),
|
|
447
|
+
SYMBOL(erff), SYMBOL_T(erf, double(*)(double)),
|
|
448
|
+
SYMBOL(erfcf), SYMBOL_T(erfc, double(*)(double)),
|
|
449
|
+
SYMBOL(erfinvf), SYMBOL_T(erfinv, double(*)(double)),
|
|
450
|
+
SYMBOL(erfcinvf), SYMBOL_T(erfcinv, double(*)(double)),
|
|
446
451
|
SYMBOL(memcpy), SYMBOL(memset), SYMBOL(memmove),
|
|
447
452
|
SYMBOL(_wp_assert),
|
|
448
453
|
SYMBOL(_wp_isfinite),
|
|
@@ -454,13 +459,8 @@ WP_API int wp_load_obj(const char* object_file, const char* module_name)
|
|
|
454
459
|
// triggering the stack overflow guards.
|
|
455
460
|
SYMBOL(__chkstk),
|
|
456
461
|
#elif defined(__APPLE__)
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
SYMBOL(_bzero),
|
|
460
|
-
#else
|
|
461
|
-
// Intel Mac
|
|
462
|
-
SYMBOL(__bzero),
|
|
463
|
-
#endif
|
|
462
|
+
SYMBOL(bzero),
|
|
463
|
+
SYMBOL(_bzero),
|
|
464
464
|
SYMBOL(memset_pattern16),
|
|
465
465
|
SYMBOL(__sincos_stret), SYMBOL(__sincosf_stret),
|
|
466
466
|
#else
|
|
@@ -531,6 +531,11 @@ WP_API uint64_t wp_lookup(const char* dll_name, const char* function_name)
|
|
|
531
531
|
return func->getValue();
|
|
532
532
|
}
|
|
533
533
|
|
|
534
|
+
WP_API const char* wp_warp_clang_version()
|
|
535
|
+
{
|
|
536
|
+
return WP_VERSION_STRING;
|
|
537
|
+
}
|
|
538
|
+
|
|
534
539
|
} // extern "C"
|
|
535
540
|
|
|
536
541
|
} // namespace wp
|
warp/native/coloring.cpp
CHANGED
warp/native/crt.h
CHANGED
|
@@ -311,6 +311,14 @@ float tanhf(float);
|
|
|
311
311
|
double tanh(double);
|
|
312
312
|
float fmaf(float, float, float);
|
|
313
313
|
double fma(double, double, double);
|
|
314
|
+
double erf(double);
|
|
315
|
+
float erff(float);
|
|
316
|
+
double erfc(double);
|
|
317
|
+
float erfcf(float);
|
|
318
|
+
double erfinv(double);
|
|
319
|
+
float erfinvf(float);
|
|
320
|
+
double erfcinv(double);
|
|
321
|
+
float erfcinvf(float);
|
|
314
322
|
|
|
315
323
|
// stddef.h
|
|
316
324
|
#if defined(_WIN32)
|
|
@@ -358,3 +366,203 @@ inline bool isinf(double x)
|
|
|
358
366
|
#endif // !__CUDACC__
|
|
359
367
|
|
|
360
368
|
#endif // WP_NO_CRT
|
|
369
|
+
|
|
370
|
+
#if !defined(__CUDACC__)
|
|
371
|
+
|
|
372
|
+
/*
|
|
373
|
+
* From Cephes Library polevl.c
|
|
374
|
+
* Original source: https://www.netlib.org/cephes/
|
|
375
|
+
* Copyright (c) 1984 by Stephen L. Moshier.
|
|
376
|
+
* All rights reserved.
|
|
377
|
+
*/
|
|
378
|
+
// evaluate polynomial using Horner's method
|
|
379
|
+
static inline double polevl(double x, const double* coefs, int N)
|
|
380
|
+
{
|
|
381
|
+
double ans = coefs[0];
|
|
382
|
+
for (int i = 1; i <= N; i++)
|
|
383
|
+
{
|
|
384
|
+
ans = ans * x + coefs[i];
|
|
385
|
+
}
|
|
386
|
+
return ans;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
/*
|
|
390
|
+
* From Cephes Library polevl.c
|
|
391
|
+
* Original source: https://www.netlib.org/cephes/
|
|
392
|
+
* Copyright (c) 1984 by Stephen L. Moshier.
|
|
393
|
+
* All rights reserved.
|
|
394
|
+
*/
|
|
395
|
+
// evaluate polynomial assuming leading coef = 1, using Horner's method
|
|
396
|
+
static inline double p1evl(double x, const double* coefs, int N)
|
|
397
|
+
{
|
|
398
|
+
double ans = x + coefs[0];
|
|
399
|
+
for (int i = 1; i < N; i++)
|
|
400
|
+
{
|
|
401
|
+
ans = ans * x + coefs[i];
|
|
402
|
+
}
|
|
403
|
+
return ans;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
/*
|
|
407
|
+
* From Cephes Library ndtri.c
|
|
408
|
+
* Original source: https://www.netlib.org/cephes/
|
|
409
|
+
* Copyright (c) 1984 by Stephen L. Moshier.
|
|
410
|
+
* All rights reserved.
|
|
411
|
+
*/
|
|
412
|
+
// inverse normal distribution function (ndtri)
|
|
413
|
+
static inline double ndtri(double y)
|
|
414
|
+
{
|
|
415
|
+
// domain check
|
|
416
|
+
if (y <= 0.0 || y >= 1.0)
|
|
417
|
+
{
|
|
418
|
+
return (y <= 0.0) ? -HUGE_VAL : HUGE_VAL;
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
// constants from Cephes
|
|
422
|
+
const double s2pi = 2.50662827463100050242E0; // sqrt(2*pi)
|
|
423
|
+
const double exp_neg2 = 0.13533528323661269189; // exp(-2)
|
|
424
|
+
|
|
425
|
+
// approximation for 0 <= abs(z - 0.5) <= 3/8
|
|
426
|
+
static const double P0[5] = {
|
|
427
|
+
-5.99633501014107895267e1,
|
|
428
|
+
9.80010754185999661536e1,
|
|
429
|
+
-5.66762857469070293439e1,
|
|
430
|
+
1.39312609387279679503e1,
|
|
431
|
+
-1.23916583867381258016e0
|
|
432
|
+
};
|
|
433
|
+
|
|
434
|
+
static const double Q0[8] = {
|
|
435
|
+
1.95448858338141759834e0,
|
|
436
|
+
4.67627912898881538453e0,
|
|
437
|
+
8.63602421390890590575e1,
|
|
438
|
+
-2.25462687854119370527e2,
|
|
439
|
+
2.00260212380060660359e2,
|
|
440
|
+
-8.20372256168333339912e1,
|
|
441
|
+
1.59056225126211695515e1,
|
|
442
|
+
-1.18331621121330003142e0
|
|
443
|
+
};
|
|
444
|
+
|
|
445
|
+
// approximation for interval z = sqrt(-2 log y) between 2 and 8
|
|
446
|
+
static const double P1[9] = {
|
|
447
|
+
4.05544892305962419923e0,
|
|
448
|
+
3.15251094599893866154e1,
|
|
449
|
+
5.71628192246421288162e1,
|
|
450
|
+
4.40805073893200834700e1,
|
|
451
|
+
1.46849561928858024014e1,
|
|
452
|
+
2.18663306850790267539e0,
|
|
453
|
+
-1.40256079171354495875e-1,
|
|
454
|
+
-3.50424626827848203418e-2,
|
|
455
|
+
-8.57456785154685413611e-4
|
|
456
|
+
};
|
|
457
|
+
|
|
458
|
+
static const double Q1[8] = {
|
|
459
|
+
1.57799883256466749731e1,
|
|
460
|
+
4.53907635128879210584e1,
|
|
461
|
+
4.13172038254672030440e1,
|
|
462
|
+
1.50425385692907503408e1,
|
|
463
|
+
2.50464946208309415979e0,
|
|
464
|
+
-1.42182922854787788574e-1,
|
|
465
|
+
-3.80806407691578277194e-2,
|
|
466
|
+
-9.33259480895457427372e-4
|
|
467
|
+
};
|
|
468
|
+
|
|
469
|
+
// approximation for interval z = sqrt(-2 log y) between 8 and 64
|
|
470
|
+
static const double P2[9] = {
|
|
471
|
+
3.23774891776946035970e0,
|
|
472
|
+
6.91522889068984211695e0,
|
|
473
|
+
3.93881025292474443415e0,
|
|
474
|
+
1.33303460815807542389e0,
|
|
475
|
+
2.01485389549179081538e-1,
|
|
476
|
+
1.23716634817820021358e-2,
|
|
477
|
+
3.01581553508235416007e-4,
|
|
478
|
+
2.65806974686737550832e-6,
|
|
479
|
+
6.23974539184983293730e-9
|
|
480
|
+
};
|
|
481
|
+
|
|
482
|
+
static const double Q2[8] = {
|
|
483
|
+
6.02427039364742014255e0,
|
|
484
|
+
3.67983563856160859403e0,
|
|
485
|
+
1.37702099489081330271e0,
|
|
486
|
+
2.16236993594496635890e-1,
|
|
487
|
+
1.34204006088543189037e-2,
|
|
488
|
+
3.28014464682127739104e-4,
|
|
489
|
+
2.89247864745380683936e-6,
|
|
490
|
+
6.79019408009981274425e-9
|
|
491
|
+
};
|
|
492
|
+
|
|
493
|
+
int code = 1;
|
|
494
|
+
double y_work = y;
|
|
495
|
+
|
|
496
|
+
if (y_work > (1.0 - exp_neg2))
|
|
497
|
+
{
|
|
498
|
+
y_work = 1.0 - y_work;
|
|
499
|
+
code = 0;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
// middle region: 0 <= |y - 0.5| <= 3/8
|
|
503
|
+
if (y_work > exp_neg2)
|
|
504
|
+
{
|
|
505
|
+
y_work -= 0.5;
|
|
506
|
+
double y2 = y_work * y_work;
|
|
507
|
+
double x = y_work + y_work * (y2 * polevl(y2, P0, 4) / p1evl(y2, Q0, 8));
|
|
508
|
+
x = x * s2pi;
|
|
509
|
+
return x;
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
double x = ::sqrt(-2.0 * ::log(y_work));
|
|
513
|
+
double x0 = x - ::log(x) / x;
|
|
514
|
+
|
|
515
|
+
double z = 1.0 / x;
|
|
516
|
+
double x1;
|
|
517
|
+
if (x < 8.0)
|
|
518
|
+
{
|
|
519
|
+
x1 = z * polevl(z, P1, 8) / p1evl(z, Q1, 8);
|
|
520
|
+
}
|
|
521
|
+
else
|
|
522
|
+
{
|
|
523
|
+
x1 = z * polevl(z, P2, 8) / p1evl(z, Q2, 8);
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
x = x0 - x1;
|
|
527
|
+
if (code != 0)
|
|
528
|
+
{
|
|
529
|
+
x = -x;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
return x;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// inverse error function (not in standard C library)
|
|
536
|
+
// only compiled for non-CUDA builds - CUDA provides these in its math headers
|
|
537
|
+
inline double erfinv(double z)
|
|
538
|
+
{
|
|
539
|
+
// handle special cases
|
|
540
|
+
if (z == 0.0)
|
|
541
|
+
return 0.0;
|
|
542
|
+
if (z == 1.0)
|
|
543
|
+
return HUGE_VAL; // infinity
|
|
544
|
+
if (z == -1.0)
|
|
545
|
+
return -HUGE_VAL; // -infinity
|
|
546
|
+
if (z < -1.0 || z > 1.0)
|
|
547
|
+
return NAN; // outside valid range
|
|
548
|
+
|
|
549
|
+
// erfinv(z) = ndtri((z + 1) / 2) / sqrt(2)
|
|
550
|
+
return ndtri((z + 1.0) / 2.0) / ::sqrt(2.0);
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
inline float erfinvf(float x)
|
|
554
|
+
{
|
|
555
|
+
return (float)erfinv((double)x);
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
inline double erfcinv(double x)
|
|
559
|
+
{
|
|
560
|
+
return erfinv(1.0 - x);
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
inline float erfcinvf(float x)
|
|
564
|
+
{
|
|
565
|
+
return (float)erfcinv((double)x);
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
#endif // !defined(__CUDACC__)
|