warp-lang 1.4.2__py3-none-win_amd64.whl → 1.5.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1819 -7
- warp/codegen.py +197 -61
- warp/config.py +2 -2
- warp/context.py +379 -107
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/sim/example_cloth.py +4 -25
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -7
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +15 -0
- warp/native/builtin.h +66 -26
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +604 -0
- warp/native/cuda_util.cpp +68 -51
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1854 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +140 -67
- warp/sim/graph_coloring.py +292 -0
- warp/sim/import_urdf.py +8 -8
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +109 -32
- warp/sparse.py +1 -1
- warp/stubs.py +569 -4
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +39 -0
- warp/tests/test_codegen.py +81 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +251 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +21 -5
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +34 -4
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_lerp.py +13 -87
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_matmul.py +6 -9
- warp/tests/test_matmul_lite.py +6 -11
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_overwrite.py +45 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +56 -1
- warp/tests/test_smoothstep.py +17 -83
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_static.py +3 -3
- warp/tests/test_tile.py +744 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +19 -2
- warp/tests/unittest_utils.py +4 -2
- warp/types.py +340 -74
- warp/utils.py +23 -3
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +161 -134
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0
warp/__init__.py
CHANGED
|
@@ -26,6 +26,9 @@ from warp.types import transform, transformh, transformf, transformd
|
|
|
26
26
|
from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord
|
|
27
27
|
from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd
|
|
28
28
|
|
|
29
|
+
# annotation types
|
|
30
|
+
from warp.types import Int, Float, Scalar
|
|
31
|
+
|
|
29
32
|
# geometry types
|
|
30
33
|
from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes
|
|
31
34
|
from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay
|
|
@@ -58,6 +61,7 @@ from warp.context import (
|
|
|
58
61
|
copy,
|
|
59
62
|
from_numpy,
|
|
60
63
|
launch,
|
|
64
|
+
launch_tiled,
|
|
61
65
|
synchronize,
|
|
62
66
|
force_load,
|
|
63
67
|
load_module,
|
warp/autograd.py
CHANGED
|
@@ -34,6 +34,7 @@ def gradcheck(
|
|
|
34
34
|
input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
|
|
35
35
|
device: wp.context.Devicelike = None,
|
|
36
36
|
max_blocks=0,
|
|
37
|
+
block_dim=256,
|
|
37
38
|
max_inputs_per_var=-1,
|
|
38
39
|
max_outputs_per_var=-1,
|
|
39
40
|
plot_relative_error=False,
|
|
@@ -44,7 +45,8 @@ def gradcheck(
|
|
|
44
45
|
Checks whether the autodiff gradient of a Warp kernel matches finite differences.
|
|
45
46
|
Fails if the relative or absolute errors between the autodiff and finite difference gradients exceed the specified tolerance, or if the autodiff gradients contain NaN values.
|
|
46
47
|
|
|
47
|
-
The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided
|
|
48
|
+
The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided
|
|
49
|
+
``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
|
|
48
50
|
|
|
49
51
|
Note:
|
|
50
52
|
This function only supports Warp kernels whose input arguments precede the output arguments.
|
|
@@ -65,6 +67,7 @@ def gradcheck(
|
|
|
65
67
|
input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
|
|
66
68
|
device: The device to launch on (optional)
|
|
67
69
|
max_blocks: The maximum number of CUDA thread blocks to use.
|
|
70
|
+
block_dim: The number of threads per block.
|
|
68
71
|
max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
|
|
69
72
|
max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
|
|
70
73
|
plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
|
|
@@ -85,6 +88,7 @@ def gradcheck(
|
|
|
85
88
|
input_output_mask=input_output_mask,
|
|
86
89
|
device=device,
|
|
87
90
|
max_blocks=max_blocks,
|
|
91
|
+
block_dim=block_dim,
|
|
88
92
|
max_inputs_per_var=max_inputs_per_var,
|
|
89
93
|
eps=eps,
|
|
90
94
|
plot_jacobians=False,
|
|
@@ -98,6 +102,7 @@ def gradcheck(
|
|
|
98
102
|
input_output_mask=input_output_mask,
|
|
99
103
|
device=device,
|
|
100
104
|
max_blocks=max_blocks,
|
|
105
|
+
block_dim=block_dim,
|
|
101
106
|
max_outputs_per_var=max_outputs_per_var,
|
|
102
107
|
plot_jacobians=False,
|
|
103
108
|
)
|
|
@@ -237,7 +242,6 @@ def gradcheck_tape(
|
|
|
237
242
|
input_output_masks: Dictionary of input-output masks for each kernel in the tape, mapping from kernel keys to input-output masks. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
|
|
238
243
|
blacklist_kernels: List of kernel keys to exclude from the gradient check.
|
|
239
244
|
whitelist_kernels: List of kernel keys to include in the gradient check. If not empty or None, only kernels in this list are checked.
|
|
240
|
-
max_blocks: The maximum number of CUDA thread blocks to use.
|
|
241
245
|
max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
|
|
242
246
|
max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
|
|
243
247
|
plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
|
|
@@ -262,7 +266,7 @@ def gradcheck_tape(
|
|
|
262
266
|
for launch in tape.launches:
|
|
263
267
|
if not isinstance(launch[0], wp.Kernel):
|
|
264
268
|
continue
|
|
265
|
-
kernel, dim, max_blocks, inputs, outputs, device = launch[:6]
|
|
269
|
+
kernel, dim, max_blocks, inputs, outputs, device, block_dim = launch[:7]
|
|
266
270
|
if len(whitelist_kernels) > 0 and kernel.key not in whitelist_kernels:
|
|
267
271
|
continue
|
|
268
272
|
if kernel.key in blacklist_kernels:
|
|
@@ -280,6 +284,7 @@ def gradcheck_tape(
|
|
|
280
284
|
input_output_mask=input_output_mask,
|
|
281
285
|
device=device,
|
|
282
286
|
max_blocks=max_blocks,
|
|
287
|
+
block_dim=block_dim,
|
|
283
288
|
max_inputs_per_var=max_inputs_per_var,
|
|
284
289
|
max_outputs_per_var=max_outputs_per_var,
|
|
285
290
|
plot_relative_error=plot_relative_error,
|
|
@@ -611,13 +616,15 @@ def jacobian(
|
|
|
611
616
|
input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
|
|
612
617
|
device: wp.context.Devicelike = None,
|
|
613
618
|
max_blocks=0,
|
|
619
|
+
block_dim=256,
|
|
614
620
|
max_outputs_per_var=-1,
|
|
615
621
|
plot_jacobians=False,
|
|
616
622
|
) -> Dict[Tuple[int, int], wp.array]:
|
|
617
623
|
"""
|
|
618
624
|
Computes the Jacobians of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
|
|
619
625
|
|
|
620
|
-
The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
|
|
626
|
+
The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim``,
|
|
627
|
+
``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
|
|
621
628
|
|
|
622
629
|
Note:
|
|
623
630
|
This function only supports Warp kernels whose input arguments precede the output arguments.
|
|
@@ -634,6 +641,7 @@ def jacobian(
|
|
|
634
641
|
input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
|
|
635
642
|
device: The device to launch on (optional)
|
|
636
643
|
max_blocks: The maximum number of CUDA thread blocks to use.
|
|
644
|
+
block_dim: The number of threads per block.
|
|
637
645
|
max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
|
|
638
646
|
plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
|
|
639
647
|
|
|
@@ -661,7 +669,15 @@ def jacobian(
|
|
|
661
669
|
device = infer_device(inputs + outputs)
|
|
662
670
|
|
|
663
671
|
tape = wp.Tape()
|
|
664
|
-
tape.record_launch(
|
|
672
|
+
tape.record_launch(
|
|
673
|
+
kernel=kernel,
|
|
674
|
+
dim=dim,
|
|
675
|
+
inputs=inputs,
|
|
676
|
+
outputs=outputs,
|
|
677
|
+
device=device,
|
|
678
|
+
max_blocks=max_blocks,
|
|
679
|
+
block_dim=block_dim,
|
|
680
|
+
)
|
|
665
681
|
|
|
666
682
|
jacobians = {}
|
|
667
683
|
|
|
@@ -709,6 +725,7 @@ def jacobian_fd(
|
|
|
709
725
|
input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
|
|
710
726
|
device: wp.context.Devicelike = None,
|
|
711
727
|
max_blocks=0,
|
|
728
|
+
block_dim=256,
|
|
712
729
|
max_inputs_per_var=-1,
|
|
713
730
|
eps=1e-4,
|
|
714
731
|
plot_jacobians=False,
|
|
@@ -717,7 +734,8 @@ def jacobian_fd(
|
|
|
717
734
|
Computes the finite-difference Jacobian of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
|
|
718
735
|
The method uses a central difference scheme to approximate the Jacobian.
|
|
719
736
|
|
|
720
|
-
The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the
|
|
737
|
+
The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the
|
|
738
|
+
provided ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
|
|
721
739
|
|
|
722
740
|
Note:
|
|
723
741
|
This function only supports Warp kernels whose input arguments precede the output arguments.
|
|
@@ -734,6 +752,7 @@ def jacobian_fd(
|
|
|
734
752
|
input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
|
|
735
753
|
device: The device to launch on (optional)
|
|
736
754
|
max_blocks: The maximum number of CUDA thread blocks to use.
|
|
755
|
+
block_dim: The number of threads per block.
|
|
737
756
|
max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
|
|
738
757
|
eps: The finite-difference step size.
|
|
739
758
|
plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
|
|
@@ -793,10 +812,26 @@ def jacobian_fd(
|
|
|
793
812
|
input_num = min(input_num, max_inputs_per_var)
|
|
794
813
|
for i in range(input_num):
|
|
795
814
|
set_element(flat_input, i, -eps, relative=True)
|
|
796
|
-
wp.launch(
|
|
815
|
+
wp.launch(
|
|
816
|
+
kernel,
|
|
817
|
+
dim=dim,
|
|
818
|
+
inputs=inputs,
|
|
819
|
+
outputs=left_outputs,
|
|
820
|
+
device=device,
|
|
821
|
+
max_blocks=max_blocks,
|
|
822
|
+
block_dim=block_dim,
|
|
823
|
+
)
|
|
797
824
|
|
|
798
825
|
set_element(flat_input, i, 2 * eps, relative=True)
|
|
799
|
-
wp.launch(
|
|
826
|
+
wp.launch(
|
|
827
|
+
kernel,
|
|
828
|
+
dim=dim,
|
|
829
|
+
inputs=inputs,
|
|
830
|
+
outputs=right_outputs,
|
|
831
|
+
device=device,
|
|
832
|
+
max_blocks=max_blocks,
|
|
833
|
+
block_dim=block_dim,
|
|
834
|
+
)
|
|
800
835
|
|
|
801
836
|
set_element(flat_input, i, -eps, relative=True)
|
|
802
837
|
|
warp/bin/warp-clang.dll
CHANGED
|
Binary file
|
warp/bin/warp.dll
CHANGED
|
Binary file
|
warp/build.py
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
# distribution of this software and related documentation without an express
|
|
6
6
|
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
7
|
|
|
8
|
+
import ctypes
|
|
8
9
|
import os
|
|
9
10
|
|
|
10
11
|
import warp.config
|
|
@@ -12,7 +13,7 @@ from warp.thirdparty import appdirs
|
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
# builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension)
|
|
15
|
-
def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False):
|
|
16
|
+
def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=None):
|
|
16
17
|
with open(cu_path, "rb") as src_file:
|
|
17
18
|
src = src_file.read()
|
|
18
19
|
cu_path = cu_path.encode("utf-8")
|
|
@@ -23,8 +24,26 @@ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fa
|
|
|
23
24
|
warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False)
|
|
24
25
|
|
|
25
26
|
else:
|
|
27
|
+
if ltoirs is None:
|
|
28
|
+
ltoirs = []
|
|
29
|
+
|
|
30
|
+
num_ltoirs = len(ltoirs)
|
|
31
|
+
arr_lroirs = (ctypes.c_char_p * num_ltoirs)(*ltoirs)
|
|
32
|
+
arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs])
|
|
26
33
|
err = warp.context.runtime.core.cuda_compile_program(
|
|
27
|
-
src,
|
|
34
|
+
src,
|
|
35
|
+
arch,
|
|
36
|
+
inc_path,
|
|
37
|
+
0,
|
|
38
|
+
None,
|
|
39
|
+
config == "debug",
|
|
40
|
+
warp.config.verbose,
|
|
41
|
+
verify_fp,
|
|
42
|
+
fast_math,
|
|
43
|
+
output_path,
|
|
44
|
+
num_ltoirs,
|
|
45
|
+
arr_lroirs,
|
|
46
|
+
arr_lroir_sizes,
|
|
28
47
|
)
|
|
29
48
|
if err != 0:
|
|
30
49
|
raise Exception(f"CUDA kernel build failed with error code {err}")
|
warp/build_dll.py
CHANGED
|
@@ -172,6 +172,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
172
172
|
f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
|
|
173
173
|
)
|
|
174
174
|
|
|
175
|
+
if ctk_version[0] < 12 and args.libmathdx_path:
|
|
176
|
+
print("MathDx support requires at least CUDA 12, skipping")
|
|
177
|
+
args.libmathdx_path = None
|
|
178
|
+
|
|
175
179
|
gencode_opts = []
|
|
176
180
|
|
|
177
181
|
if args.quick:
|
|
@@ -223,6 +227,13 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
223
227
|
# is the library being built with CUDA enabled?
|
|
224
228
|
cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
|
|
225
229
|
|
|
230
|
+
if args.libmathdx_path:
|
|
231
|
+
libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
|
|
232
|
+
mathdx_enabled = "WP_ENABLE_MATHDX=1"
|
|
233
|
+
else:
|
|
234
|
+
libmathdx_includes = ""
|
|
235
|
+
mathdx_enabled = "WP_ENABLE_MATHDX=0"
|
|
236
|
+
|
|
226
237
|
if os.name == "nt":
|
|
227
238
|
if args.host_compiler:
|
|
228
239
|
host_linker = os.path.join(os.path.dirname(args.host_compiler), "link.exe")
|
|
@@ -244,7 +255,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
244
255
|
iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
|
|
245
256
|
debug = "_DEBUG"
|
|
246
257
|
|
|
247
|
-
cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
|
|
258
|
+
cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{mathdx_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
|
|
248
259
|
|
|
249
260
|
if args.mode == "debug":
|
|
250
261
|
cpp_flags += "/Zi /Od /D WP_ENABLE_DEBUG=1"
|
|
@@ -273,10 +284,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
273
284
|
cu_out = cu_path + ".o"
|
|
274
285
|
|
|
275
286
|
if mode == "debug":
|
|
276
|
-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
287
|
+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
277
288
|
|
|
278
289
|
elif mode == "release":
|
|
279
|
-
cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
290
|
+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
280
291
|
|
|
281
292
|
with ScopedTimer("build_cuda", active=args.verbose):
|
|
282
293
|
run_cmd(cuda_cmd)
|
|
@@ -285,6 +296,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
285
296
|
f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
|
|
286
297
|
)
|
|
287
298
|
|
|
299
|
+
if args.libmathdx_path:
|
|
300
|
+
linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib" mathdx_static.lib')
|
|
301
|
+
|
|
288
302
|
with ScopedTimer("link", active=args.verbose):
|
|
289
303
|
link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
|
|
290
304
|
run_cmd(link_cmd)
|
|
@@ -300,7 +314,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
300
314
|
else:
|
|
301
315
|
version = "-fabi-version=13" # GCC 8.2+
|
|
302
316
|
|
|
303
|
-
cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
|
|
317
|
+
cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
|
|
304
318
|
|
|
305
319
|
if mode == "debug":
|
|
306
320
|
cpp_flags += "-O0 -g -D_DEBUG -DWP_ENABLE_DEBUG=1 -fkeep-inline-functions"
|
|
@@ -328,10 +342,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
328
342
|
cu_out = cu_path + ".o"
|
|
329
343
|
|
|
330
344
|
if mode == "debug":
|
|
331
|
-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
345
|
+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
332
346
|
|
|
333
347
|
elif mode == "release":
|
|
334
|
-
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
348
|
+
cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
|
|
335
349
|
|
|
336
350
|
with ScopedTimer("build_cuda", active=args.verbose):
|
|
337
351
|
run_cmd(cuda_cmd)
|
|
@@ -341,6 +355,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
|
|
|
341
355
|
f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
|
|
342
356
|
)
|
|
343
357
|
|
|
358
|
+
if args.libmathdx_path:
|
|
359
|
+
ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")
|
|
360
|
+
|
|
344
361
|
if sys.platform == "darwin":
|
|
345
362
|
opt_no_undefined = "-Wl,-undefined,error"
|
|
346
363
|
opt_exclude_libs = ""
|