PyPI - warp-lang - Versions diffs - 1.4.2__py3-none-win_amd64.whl → 1.5.1__py3-none-win_amd64.whl - Mend

warp-lang 1.4.2__py3-none-win_amd64.whl → 1.5.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (166)

warp/__init__.py +4 -0
warp/autograd.py +43 -8
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +21 -2
warp/build_dll.py +23 -6
warp/builtins.py +1819 -7
warp/codegen.py +197 -61
warp/config.py +2 -2
warp/context.py +379 -107
warp/examples/assets/pixel.jpg +0 -0
warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
warp/examples/benchmarks/benchmark_gemm.py +121 -0
warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
warp/examples/benchmarks/benchmark_tile.py +179 -0
warp/examples/fem/example_adaptive_grid.py +37 -10
warp/examples/fem/example_apic_fluid.py +3 -2
warp/examples/fem/example_convection_diffusion_dg.py +4 -5
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion_3d.py +47 -4
warp/examples/fem/example_distortion_energy.py +220 -0
warp/examples/fem/example_magnetostatics.py +127 -85
warp/examples/fem/example_nonconforming_contact.py +5 -5
warp/examples/fem/example_stokes.py +3 -1
warp/examples/fem/example_streamlines.py +12 -19
warp/examples/fem/utils.py +38 -15
warp/examples/sim/example_cloth.py +4 -25
warp/examples/sim/example_quadruped.py +2 -1
warp/examples/tile/example_tile_convolution.py +58 -0
warp/examples/tile/example_tile_fft.py +47 -0
warp/examples/tile/example_tile_filtering.py +105 -0
warp/examples/tile/example_tile_matmul.py +79 -0
warp/examples/tile/example_tile_mlp.py +375 -0
warp/fem/__init__.py +8 -0
warp/fem/cache.py +16 -12
warp/fem/dirichlet.py +1 -1
warp/fem/domain.py +44 -1
warp/fem/field/__init__.py +1 -2
warp/fem/field/field.py +31 -19
warp/fem/field/nodal_field.py +101 -49
warp/fem/field/virtual.py +794 -0
warp/fem/geometry/__init__.py +2 -2
warp/fem/geometry/deformed_geometry.py +3 -105
warp/fem/geometry/element.py +13 -0
warp/fem/geometry/geometry.py +165 -7
warp/fem/geometry/grid_2d.py +3 -6
warp/fem/geometry/grid_3d.py +31 -28
warp/fem/geometry/hexmesh.py +3 -46
warp/fem/geometry/nanogrid.py +3 -2
warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
warp/fem/geometry/tetmesh.py +2 -43
warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
warp/fem/integrate.py +683 -261
warp/fem/linalg.py +404 -0
warp/fem/operator.py +101 -18
warp/fem/polynomial.py +5 -5
warp/fem/quadrature/quadrature.py +45 -21
warp/fem/space/__init__.py +45 -11
warp/fem/space/basis_function_space.py +451 -0
warp/fem/space/basis_space.py +58 -11
warp/fem/space/function_space.py +146 -5
warp/fem/space/grid_2d_function_space.py +80 -66
warp/fem/space/grid_3d_function_space.py +113 -68
warp/fem/space/hexmesh_function_space.py +96 -108
warp/fem/space/nanogrid_function_space.py +62 -110
warp/fem/space/quadmesh_function_space.py +208 -0
warp/fem/space/shape/__init__.py +45 -7
warp/fem/space/shape/cube_shape_function.py +328 -54
warp/fem/space/shape/shape_function.py +10 -1
warp/fem/space/shape/square_shape_function.py +328 -60
warp/fem/space/shape/tet_shape_function.py +269 -19
warp/fem/space/shape/triangle_shape_function.py +238 -19
warp/fem/space/tetmesh_function_space.py +69 -37
warp/fem/space/topology.py +38 -0
warp/fem/space/trimesh_function_space.py +179 -0
warp/fem/utils.py +6 -331
warp/jax_experimental.py +3 -1
warp/native/array.h +15 -0
warp/native/builtin.h +66 -26
warp/native/bvh.h +4 -0
warp/native/coloring.cpp +604 -0
warp/native/cuda_util.cpp +68 -51
warp/native/cuda_util.h +2 -1
warp/native/fabric.h +8 -0
warp/native/hashgrid.h +4 -0
warp/native/marching.cu +8 -0
warp/native/mat.h +14 -3
warp/native/mathdx.cpp +59 -0
warp/native/mesh.h +4 -0
warp/native/range.h +13 -1
warp/native/reduce.cpp +9 -1
warp/native/reduce.cu +7 -0
warp/native/runlength_encode.cpp +9 -1
warp/native/runlength_encode.cu +7 -1
warp/native/scan.cpp +8 -0
warp/native/scan.cu +8 -0
warp/native/scan.h +8 -1
warp/native/sparse.cpp +8 -0
warp/native/sparse.cu +8 -0
warp/native/temp_buffer.h +7 -0
warp/native/tile.h +1854 -0
warp/native/tile_gemm.h +341 -0
warp/native/tile_reduce.h +210 -0
warp/native/volume_builder.cu +8 -0
warp/native/volume_builder.h +8 -0
warp/native/warp.cpp +10 -2
warp/native/warp.cu +369 -15
warp/native/warp.h +12 -2
warp/optim/adam.py +39 -4
warp/paddle.py +29 -12
warp/render/render_opengl.py +140 -67
warp/sim/graph_coloring.py +292 -0
warp/sim/import_urdf.py +8 -8
warp/sim/integrator_euler.py +4 -2
warp/sim/integrator_featherstone.py +115 -44
warp/sim/integrator_vbd.py +6 -0
warp/sim/model.py +109 -32
warp/sparse.py +1 -1
warp/stubs.py +569 -4
warp/tape.py +12 -7
warp/tests/assets/pixel.npy +0 -0
warp/tests/aux_test_instancing_gc.py +18 -0
warp/tests/test_array.py +39 -0
warp/tests/test_codegen.py +81 -1
warp/tests/test_codegen_instancing.py +30 -0
warp/tests/test_collision.py +110 -0
warp/tests/test_coloring.py +251 -0
warp/tests/test_context.py +34 -0
warp/tests/test_examples.py +21 -5
warp/tests/test_fem.py +453 -113
warp/tests/test_func.py +34 -4
warp/tests/test_generics.py +52 -0
warp/tests/test_iter.py +68 -0
warp/tests/test_lerp.py +13 -87
warp/tests/test_mat_scalar_ops.py +1 -1
warp/tests/test_matmul.py +6 -9
warp/tests/test_matmul_lite.py +6 -11
warp/tests/test_mesh_query_point.py +1 -1
warp/tests/test_module_hashing.py +23 -0
warp/tests/test_overwrite.py +45 -0
warp/tests/test_paddle.py +27 -87
warp/tests/test_print.py +56 -1
warp/tests/test_smoothstep.py +17 -83
warp/tests/test_spatial.py +1 -1
warp/tests/test_static.py +3 -3
warp/tests/test_tile.py +744 -0
warp/tests/test_tile_mathdx.py +144 -0
warp/tests/test_tile_mlp.py +383 -0
warp/tests/test_tile_reduce.py +374 -0
warp/tests/test_tile_shared_memory.py +190 -0
warp/tests/test_vbd.py +12 -20
warp/tests/test_volume.py +43 -0
warp/tests/unittest_suites.py +19 -2
warp/tests/unittest_utils.py +4 -2
warp/types.py +340 -74
warp/utils.py +23 -3
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +161 -134
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
warp/fem/field/test.py +0 -180
warp/fem/field/trial.py +0 -183
warp/fem/space/collocated_function_space.py +0 -102
warp/fem/space/quadmesh_2d_function_space.py +0 -261
warp/fem/space/trimesh_2d_function_space.py +0 -153
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0

warp/__init__.py CHANGED Viewed

@@ -26,6 +26,9 @@ from warp.types import transform, transformh, transformf, transformd
 from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord
 from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd
+# annotation types
+from warp.types import Int, Float, Scalar
 # geometry types
 from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes
 from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay
@@ -58,6 +61,7 @@ from warp.context import (
     copy,
     from_numpy,
     launch,
+    launch_tiled,
     synchronize,
     force_load,
     load_module,

warp/autograd.py CHANGED Viewed

@@ -34,6 +34,7 @@ def gradcheck(
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
     max_blocks=0,
+    block_dim=256,
     max_inputs_per_var=-1,
     max_outputs_per_var=-1,
     plot_relative_error=False,
@@ -44,7 +45,8 @@ def gradcheck(
     Checks whether the autodiff gradient of a Warp kernel matches finite differences.
     Fails if the relative or absolute errors between the autodiff and finite difference gradients exceed the specified tolerance, or if the autodiff gradients contain NaN values.
-    The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
+    The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided
+    ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
     Note:
         This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -65,6 +67,7 @@ def gradcheck(
         input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
         device: The device to launch on (optional)
         max_blocks: The maximum number of CUDA thread blocks to use.
+        block_dim: The number of threads per block.
         max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
         max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
         plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
@@ -85,6 +88,7 @@ def gradcheck(
         input_output_mask=input_output_mask,
         device=device,
         max_blocks=max_blocks,
+        block_dim=block_dim,
         max_inputs_per_var=max_inputs_per_var,
         eps=eps,
         plot_jacobians=False,
@@ -98,6 +102,7 @@ def gradcheck(
         input_output_mask=input_output_mask,
         device=device,
         max_blocks=max_blocks,
+        block_dim=block_dim,
         max_outputs_per_var=max_outputs_per_var,
         plot_jacobians=False,
     )
@@ -237,7 +242,6 @@ def gradcheck_tape(
         input_output_masks: Dictionary of input-output masks for each kernel in the tape, mapping from kernel keys to input-output masks. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
         blacklist_kernels: List of kernel keys to exclude from the gradient check.
         whitelist_kernels: List of kernel keys to include in the gradient check. If not empty or None, only kernels in this list are checked.
-        max_blocks: The maximum number of CUDA thread blocks to use.
         max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
         max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
         plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
@@ -262,7 +266,7 @@ def gradcheck_tape(
     for launch in tape.launches:
         if not isinstance(launch[0], wp.Kernel):
             continue
-        kernel, dim, max_blocks, inputs, outputs, device = launch[:6]
+        kernel, dim, max_blocks, inputs, outputs, device, block_dim = launch[:7]
         if len(whitelist_kernels) > 0 and kernel.key not in whitelist_kernels:
             continue
         if kernel.key in blacklist_kernels:
@@ -280,6 +284,7 @@ def gradcheck_tape(
             input_output_mask=input_output_mask,
             device=device,
             max_blocks=max_blocks,
+            block_dim=block_dim,
             max_inputs_per_var=max_inputs_per_var,
             max_outputs_per_var=max_outputs_per_var,
             plot_relative_error=plot_relative_error,
@@ -611,13 +616,15 @@ def jacobian(
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
     max_blocks=0,
+    block_dim=256,
     max_outputs_per_var=-1,
     plot_jacobians=False,
 ) -> Dict[Tuple[int, int], wp.array]:
     """
     Computes the Jacobians of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
-    The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
+    The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim``,
+    ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
     Note:
         This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -634,6 +641,7 @@ def jacobian(
         input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
         device: The device to launch on (optional)
         max_blocks: The maximum number of CUDA thread blocks to use.
+        block_dim: The number of threads per block.
         max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
         plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
@@ -661,7 +669,15 @@ def jacobian(
         device = infer_device(inputs + outputs)
     tape = wp.Tape()
-    tape.record_launch(kernel=kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=outputs, device=device)
+    tape.record_launch(
+        kernel=kernel,
+        dim=dim,
+        inputs=inputs,
+        outputs=outputs,
+        device=device,
+        max_blocks=max_blocks,
+        block_dim=block_dim,
+    )
     jacobians = {}
@@ -709,6 +725,7 @@ def jacobian_fd(
     input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
     device: wp.context.Devicelike = None,
     max_blocks=0,
+    block_dim=256,
     max_inputs_per_var=-1,
     eps=1e-4,
     plot_jacobians=False,
@@ -717,7 +734,8 @@ def jacobian_fd(
     Computes the finite-difference Jacobian of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
     The method uses a central difference scheme to approximate the Jacobian.
-    The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
+    The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the
+    provided ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
     Note:
         This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -734,6 +752,7 @@ def jacobian_fd(
         input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
         device: The device to launch on (optional)
         max_blocks: The maximum number of CUDA thread blocks to use.
+        block_dim: The number of threads per block.
         max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
         eps: The finite-difference step size.
         plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
@@ -793,10 +812,26 @@ def jacobian_fd(
             input_num = min(input_num, max_inputs_per_var)
         for i in range(input_num):
             set_element(flat_input, i, -eps, relative=True)
-            wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=left_outputs, device=device)
+            wp.launch(
+                kernel,
+                dim=dim,
+                inputs=inputs,
+                outputs=left_outputs,
+                device=device,
+                max_blocks=max_blocks,
+                block_dim=block_dim,
+            )
             set_element(flat_input, i, 2 * eps, relative=True)
-            wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=right_outputs, device=device)
+            wp.launch(
+                kernel,
+                dim=dim,
+                inputs=inputs,
+                outputs=right_outputs,
+                device=device,
+                max_blocks=max_blocks,
+                block_dim=block_dim,
+            )
             set_element(flat_input, i, -eps, relative=True)

warp/bin/warp-clang.dll CHANGED Viewed

Binary file

warp/bin/warp.dll CHANGED Viewed

Binary file

warp/build.py CHANGED Viewed

@@ -5,6 +5,7 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
+import ctypes
 import os
 import warp.config
@@ -12,7 +13,7 @@ from warp.thirdparty import appdirs
 # builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension)
-def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False):
+def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=None):
     with open(cu_path, "rb") as src_file:
         src = src_file.read()
         cu_path = cu_path.encode("utf-8")
@@ -23,8 +24,26 @@ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fa
             warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False)
         else:
+            if ltoirs is None:
+                ltoirs = []
+            num_ltoirs = len(ltoirs)
+            arr_lroirs = (ctypes.c_char_p * num_ltoirs)(*ltoirs)
+            arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs])
             err = warp.context.runtime.core.cuda_compile_program(
-                src, arch, inc_path, config == "debug", warp.config.verbose, verify_fp, fast_math, output_path
+                src,
+                arch,
+                inc_path,
+                0,
+                None,
+                config == "debug",
+                warp.config.verbose,
+                verify_fp,
+                fast_math,
+                output_path,
+                num_ltoirs,
+                arr_lroirs,
+                arr_lroir_sizes,
             )
             if err != 0:
                 raise Exception(f"CUDA kernel build failed with error code {err}")

warp/build_dll.py CHANGED Viewed

@@ -172,6 +172,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                 f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
             )
+        if ctk_version[0] < 12 and args.libmathdx_path:
+            print("MathDx support requires at least CUDA 12, skipping")
+            args.libmathdx_path = None
         gencode_opts = []
         if args.quick:
@@ -223,6 +227,13 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
     # is the library being built with CUDA enabled?
     cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
+    if args.libmathdx_path:
+        libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
+        mathdx_enabled = "WP_ENABLE_MATHDX=1"
+    else:
+        libmathdx_includes = ""
+        mathdx_enabled = "WP_ENABLE_MATHDX=0"
     if os.name == "nt":
         if args.host_compiler:
             host_linker = os.path.join(os.path.dirname(args.host_compiler), "link.exe")
@@ -244,7 +255,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
             iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
             debug = "_DEBUG"
-        cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
+        cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{mathdx_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
         if args.mode == "debug":
             cpp_flags += "/Zi /Od /D WP_ENABLE_DEBUG=1"
@@ -273,10 +284,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
             cu_out = cu_path + ".o"
             if mode == "debug":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
+                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
             elif mode == "release":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
+                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
             with ScopedTimer("build_cuda", active=args.verbose):
                 run_cmd(cuda_cmd)
@@ -285,6 +296,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                     f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
                 )
+                if args.libmathdx_path:
+                    linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib" mathdx_static.lib')
         with ScopedTimer("link", active=args.verbose):
             link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
             run_cmd(link_cmd)
@@ -300,7 +314,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
         else:
             version = "-fabi-version=13"  # GCC 8.2+
-        cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
+        cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
         if mode == "debug":
             cpp_flags += "-O0 -g -D_DEBUG -DWP_ENABLE_DEBUG=1 -fkeep-inline-functions"
@@ -328,10 +342,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
             cu_out = cu_path + ".o"
             if mode == "debug":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
+                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
             elif mode == "release":
-                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
+                cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
             with ScopedTimer("build_cuda", active=args.verbose):
                 run_cmd(cuda_cmd)
@@ -341,6 +355,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
                     f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
                 )
+                if args.libmathdx_path:
+                    ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")
         if sys.platform == "darwin":
             opt_no_undefined = "-Wl,-undefined,error"
             opt_exclude_libs = ""