warp-lang 1.4.2__py3-none-win_amd64.whl → 1.5.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (158)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1783 -2
  8. warp/codegen.py +177 -45
  9. warp/config.py +2 -2
  10. warp/context.py +321 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/fem/example_adaptive_grid.py +37 -10
  17. warp/examples/fem/example_apic_fluid.py +3 -2
  18. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  19. warp/examples/fem/example_deformed_geometry.py +1 -1
  20. warp/examples/fem/example_diffusion_3d.py +47 -4
  21. warp/examples/fem/example_distortion_energy.py +220 -0
  22. warp/examples/fem/example_magnetostatics.py +127 -85
  23. warp/examples/fem/example_nonconforming_contact.py +5 -5
  24. warp/examples/fem/example_stokes.py +3 -1
  25. warp/examples/fem/example_streamlines.py +12 -19
  26. warp/examples/fem/utils.py +38 -15
  27. warp/examples/sim/example_cloth.py +2 -25
  28. warp/examples/sim/example_quadruped.py +2 -1
  29. warp/examples/tile/example_tile_convolution.py +58 -0
  30. warp/examples/tile/example_tile_fft.py +47 -0
  31. warp/examples/tile/example_tile_filtering.py +105 -0
  32. warp/examples/tile/example_tile_matmul.py +79 -0
  33. warp/examples/tile/example_tile_mlp.py +375 -0
  34. warp/fem/__init__.py +8 -0
  35. warp/fem/cache.py +16 -12
  36. warp/fem/dirichlet.py +1 -1
  37. warp/fem/domain.py +44 -1
  38. warp/fem/field/__init__.py +1 -2
  39. warp/fem/field/field.py +31 -19
  40. warp/fem/field/nodal_field.py +101 -49
  41. warp/fem/field/virtual.py +794 -0
  42. warp/fem/geometry/__init__.py +2 -2
  43. warp/fem/geometry/deformed_geometry.py +3 -105
  44. warp/fem/geometry/element.py +13 -0
  45. warp/fem/geometry/geometry.py +165 -5
  46. warp/fem/geometry/grid_2d.py +3 -6
  47. warp/fem/geometry/grid_3d.py +31 -28
  48. warp/fem/geometry/hexmesh.py +3 -46
  49. warp/fem/geometry/nanogrid.py +3 -2
  50. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  51. warp/fem/geometry/tetmesh.py +2 -43
  52. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  53. warp/fem/integrate.py +683 -261
  54. warp/fem/linalg.py +404 -0
  55. warp/fem/operator.py +101 -18
  56. warp/fem/polynomial.py +5 -5
  57. warp/fem/quadrature/quadrature.py +45 -21
  58. warp/fem/space/__init__.py +45 -11
  59. warp/fem/space/basis_function_space.py +451 -0
  60. warp/fem/space/basis_space.py +58 -11
  61. warp/fem/space/function_space.py +146 -5
  62. warp/fem/space/grid_2d_function_space.py +80 -66
  63. warp/fem/space/grid_3d_function_space.py +113 -68
  64. warp/fem/space/hexmesh_function_space.py +96 -108
  65. warp/fem/space/nanogrid_function_space.py +62 -110
  66. warp/fem/space/quadmesh_function_space.py +208 -0
  67. warp/fem/space/shape/__init__.py +45 -7
  68. warp/fem/space/shape/cube_shape_function.py +328 -54
  69. warp/fem/space/shape/shape_function.py +10 -1
  70. warp/fem/space/shape/square_shape_function.py +328 -60
  71. warp/fem/space/shape/tet_shape_function.py +269 -19
  72. warp/fem/space/shape/triangle_shape_function.py +238 -19
  73. warp/fem/space/tetmesh_function_space.py +69 -37
  74. warp/fem/space/topology.py +38 -0
  75. warp/fem/space/trimesh_function_space.py +179 -0
  76. warp/fem/utils.py +6 -331
  77. warp/jax_experimental.py +3 -1
  78. warp/native/array.h +15 -0
  79. warp/native/builtin.h +66 -26
  80. warp/native/bvh.h +4 -0
  81. warp/native/coloring.cpp +600 -0
  82. warp/native/cuda_util.cpp +14 -0
  83. warp/native/cuda_util.h +2 -1
  84. warp/native/fabric.h +8 -0
  85. warp/native/hashgrid.h +4 -0
  86. warp/native/marching.cu +8 -0
  87. warp/native/mat.h +14 -3
  88. warp/native/mathdx.cpp +59 -0
  89. warp/native/mesh.h +4 -0
  90. warp/native/range.h +13 -1
  91. warp/native/reduce.cpp +9 -1
  92. warp/native/reduce.cu +7 -0
  93. warp/native/runlength_encode.cpp +9 -1
  94. warp/native/runlength_encode.cu +7 -1
  95. warp/native/scan.cpp +8 -0
  96. warp/native/scan.cu +8 -0
  97. warp/native/scan.h +8 -1
  98. warp/native/sparse.cpp +8 -0
  99. warp/native/sparse.cu +8 -0
  100. warp/native/temp_buffer.h +7 -0
  101. warp/native/tile.h +1857 -0
  102. warp/native/tile_gemm.h +341 -0
  103. warp/native/tile_reduce.h +210 -0
  104. warp/native/volume_builder.cu +8 -0
  105. warp/native/volume_builder.h +8 -0
  106. warp/native/warp.cpp +10 -2
  107. warp/native/warp.cu +369 -15
  108. warp/native/warp.h +12 -2
  109. warp/optim/adam.py +39 -4
  110. warp/paddle.py +29 -12
  111. warp/render/render_opengl.py +137 -65
  112. warp/sim/graph_coloring.py +292 -0
  113. warp/sim/integrator_euler.py +4 -2
  114. warp/sim/integrator_featherstone.py +115 -44
  115. warp/sim/integrator_vbd.py +6 -0
  116. warp/sim/model.py +88 -15
  117. warp/stubs.py +569 -4
  118. warp/tape.py +12 -7
  119. warp/tests/assets/pixel.npy +0 -0
  120. warp/tests/aux_test_instancing_gc.py +18 -0
  121. warp/tests/test_array.py +39 -0
  122. warp/tests/test_codegen.py +81 -1
  123. warp/tests/test_codegen_instancing.py +30 -0
  124. warp/tests/test_collision.py +110 -0
  125. warp/tests/test_coloring.py +241 -0
  126. warp/tests/test_context.py +34 -0
  127. warp/tests/test_examples.py +18 -4
  128. warp/tests/test_fem.py +453 -113
  129. warp/tests/test_func.py +13 -0
  130. warp/tests/test_generics.py +52 -0
  131. warp/tests/test_iter.py +68 -0
  132. warp/tests/test_mat_scalar_ops.py +1 -1
  133. warp/tests/test_mesh_query_point.py +1 -1
  134. warp/tests/test_module_hashing.py +23 -0
  135. warp/tests/test_paddle.py +27 -87
  136. warp/tests/test_print.py +56 -1
  137. warp/tests/test_spatial.py +1 -1
  138. warp/tests/test_tile.py +700 -0
  139. warp/tests/test_tile_mathdx.py +144 -0
  140. warp/tests/test_tile_mlp.py +383 -0
  141. warp/tests/test_tile_reduce.py +374 -0
  142. warp/tests/test_tile_shared_memory.py +190 -0
  143. warp/tests/test_vbd.py +12 -20
  144. warp/tests/test_volume.py +43 -0
  145. warp/tests/unittest_suites.py +19 -2
  146. warp/tests/unittest_utils.py +4 -0
  147. warp/types.py +338 -72
  148. warp/utils.py +22 -1
  149. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  150. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
  151. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  152. warp/fem/field/test.py +0 -180
  153. warp/fem/field/trial.py +0 -183
  154. warp/fem/space/collocated_function_space.py +0 -102
  155. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  156. warp/fem/space/trimesh_2d_function_space.py +0 -153
  157. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  158. {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/__init__.py CHANGED
@@ -26,6 +26,9 @@ from warp.types import transform, transformh, transformf, transformd
26
26
  from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord
27
27
  from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd
28
28
 
29
+ # annotation types
30
+ from warp.types import Int, Float, Scalar
31
+
29
32
  # geometry types
30
33
  from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes
31
34
  from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay
@@ -58,6 +61,7 @@ from warp.context import (
58
61
  copy,
59
62
  from_numpy,
60
63
  launch,
64
+ launch_tiled,
61
65
  synchronize,
62
66
  force_load,
63
67
  load_module,
warp/autograd.py CHANGED
@@ -34,6 +34,7 @@ def gradcheck(
34
34
  input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
35
35
  device: wp.context.Devicelike = None,
36
36
  max_blocks=0,
37
+ block_dim=256,
37
38
  max_inputs_per_var=-1,
38
39
  max_outputs_per_var=-1,
39
40
  plot_relative_error=False,
@@ -44,7 +45,8 @@ def gradcheck(
44
45
  Checks whether the autodiff gradient of a Warp kernel matches finite differences.
45
46
  Fails if the relative or absolute errors between the autodiff and finite difference gradients exceed the specified tolerance, or if the autodiff gradients contain NaN values.
46
47
 
47
- The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
48
+ The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided
49
+ ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
48
50
 
49
51
  Note:
50
52
  This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -65,6 +67,7 @@ def gradcheck(
65
67
  input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
66
68
  device: The device to launch on (optional)
67
69
  max_blocks: The maximum number of CUDA thread blocks to use.
70
+ block_dim: The number of threads per block.
68
71
  max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
69
72
  max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
70
73
  plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
@@ -85,6 +88,7 @@ def gradcheck(
85
88
  input_output_mask=input_output_mask,
86
89
  device=device,
87
90
  max_blocks=max_blocks,
91
+ block_dim=block_dim,
88
92
  max_inputs_per_var=max_inputs_per_var,
89
93
  eps=eps,
90
94
  plot_jacobians=False,
@@ -98,6 +102,7 @@ def gradcheck(
98
102
  input_output_mask=input_output_mask,
99
103
  device=device,
100
104
  max_blocks=max_blocks,
105
+ block_dim=block_dim,
101
106
  max_outputs_per_var=max_outputs_per_var,
102
107
  plot_jacobians=False,
103
108
  )
@@ -237,7 +242,6 @@ def gradcheck_tape(
237
242
  input_output_masks: Dictionary of input-output masks for each kernel in the tape, mapping from kernel keys to input-output masks. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
238
243
  blacklist_kernels: List of kernel keys to exclude from the gradient check.
239
244
  whitelist_kernels: List of kernel keys to include in the gradient check. If not empty or None, only kernels in this list are checked.
240
- max_blocks: The maximum number of CUDA thread blocks to use.
241
245
  max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
242
246
  max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
243
247
  plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``).
@@ -262,7 +266,7 @@ def gradcheck_tape(
262
266
  for launch in tape.launches:
263
267
  if not isinstance(launch[0], wp.Kernel):
264
268
  continue
265
- kernel, dim, max_blocks, inputs, outputs, device = launch[:6]
269
+ kernel, dim, max_blocks, inputs, outputs, device, block_dim = launch[:7]
266
270
  if len(whitelist_kernels) > 0 and kernel.key not in whitelist_kernels:
267
271
  continue
268
272
  if kernel.key in blacklist_kernels:
@@ -280,6 +284,7 @@ def gradcheck_tape(
280
284
  input_output_mask=input_output_mask,
281
285
  device=device,
282
286
  max_blocks=max_blocks,
287
+ block_dim=block_dim,
283
288
  max_inputs_per_var=max_inputs_per_var,
284
289
  max_outputs_per_var=max_outputs_per_var,
285
290
  plot_relative_error=plot_relative_error,
@@ -611,13 +616,15 @@ def jacobian(
611
616
  input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
612
617
  device: wp.context.Devicelike = None,
613
618
  max_blocks=0,
619
+ block_dim=256,
614
620
  max_outputs_per_var=-1,
615
621
  plot_jacobians=False,
616
622
  ) -> Dict[Tuple[int, int], wp.array]:
617
623
  """
618
624
  Computes the Jacobians of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
619
625
 
620
- The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
626
+ The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim``,
627
+ ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
621
628
 
622
629
  Note:
623
630
  This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -634,6 +641,7 @@ def jacobian(
634
641
  input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
635
642
  device: The device to launch on (optional)
636
643
  max_blocks: The maximum number of CUDA thread blocks to use.
644
+ block_dim: The number of threads per block.
637
645
  max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0.
638
646
  plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
639
647
 
@@ -661,7 +669,15 @@ def jacobian(
661
669
  device = infer_device(inputs + outputs)
662
670
 
663
671
  tape = wp.Tape()
664
- tape.record_launch(kernel=kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=outputs, device=device)
672
+ tape.record_launch(
673
+ kernel=kernel,
674
+ dim=dim,
675
+ inputs=inputs,
676
+ outputs=outputs,
677
+ device=device,
678
+ max_blocks=max_blocks,
679
+ block_dim=block_dim,
680
+ )
665
681
 
666
682
  jacobians = {}
667
683
 
@@ -709,6 +725,7 @@ def jacobian_fd(
709
725
  input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None,
710
726
  device: wp.context.Devicelike = None,
711
727
  max_blocks=0,
728
+ block_dim=256,
712
729
  max_inputs_per_var=-1,
713
730
  eps=1e-4,
714
731
  plot_jacobians=False,
@@ -717,7 +734,8 @@ def jacobian_fd(
717
734
  Computes the finite-difference Jacobian of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs.
718
735
  The method uses a central difference scheme to approximate the Jacobian.
719
736
 
720
- The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details).
737
+ The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the
738
+ provided ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details).
721
739
 
722
740
  Note:
723
741
  This function only supports Warp kernels whose input arguments precede the output arguments.
@@ -734,6 +752,7 @@ def jacobian_fd(
734
752
  input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs.
735
753
  device: The device to launch on (optional)
736
754
  max_blocks: The maximum number of CUDA thread blocks to use.
755
+ block_dim: The number of threads per block.
737
756
  max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0.
738
757
  eps: The finite-difference step size.
739
758
  plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``).
@@ -793,10 +812,26 @@ def jacobian_fd(
793
812
  input_num = min(input_num, max_inputs_per_var)
794
813
  for i in range(input_num):
795
814
  set_element(flat_input, i, -eps, relative=True)
796
- wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=left_outputs, device=device)
815
+ wp.launch(
816
+ kernel,
817
+ dim=dim,
818
+ inputs=inputs,
819
+ outputs=left_outputs,
820
+ device=device,
821
+ max_blocks=max_blocks,
822
+ block_dim=block_dim,
823
+ )
797
824
 
798
825
  set_element(flat_input, i, 2 * eps, relative=True)
799
- wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=right_outputs, device=device)
826
+ wp.launch(
827
+ kernel,
828
+ dim=dim,
829
+ inputs=inputs,
830
+ outputs=right_outputs,
831
+ device=device,
832
+ max_blocks=max_blocks,
833
+ block_dim=block_dim,
834
+ )
800
835
 
801
836
  set_element(flat_input, i, -eps, relative=True)
802
837
 
warp/bin/warp-clang.dll CHANGED
Binary file
warp/bin/warp.dll CHANGED
Binary file
warp/build.py CHANGED
@@ -5,6 +5,7 @@
5
5
  # distribution of this software and related documentation without an express
6
6
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
7
 
8
+ import ctypes
8
9
  import os
9
10
 
10
11
  import warp.config
@@ -12,7 +13,7 @@ from warp.thirdparty import appdirs
12
13
 
13
14
 
14
15
  # builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension)
15
- def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False):
16
+ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=None):
16
17
  with open(cu_path, "rb") as src_file:
17
18
  src = src_file.read()
18
19
  cu_path = cu_path.encode("utf-8")
@@ -23,8 +24,26 @@ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fa
23
24
  warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False)
24
25
 
25
26
  else:
27
+ if ltoirs is None:
28
+ ltoirs = []
29
+
30
+ num_ltoirs = len(ltoirs)
31
+ arr_lroirs = (ctypes.c_char_p * num_ltoirs)(*ltoirs)
32
+ arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs])
26
33
  err = warp.context.runtime.core.cuda_compile_program(
27
- src, arch, inc_path, config == "debug", warp.config.verbose, verify_fp, fast_math, output_path
34
+ src,
35
+ arch,
36
+ inc_path,
37
+ 0,
38
+ None,
39
+ config == "debug",
40
+ warp.config.verbose,
41
+ verify_fp,
42
+ fast_math,
43
+ output_path,
44
+ num_ltoirs,
45
+ arr_lroirs,
46
+ arr_lroir_sizes,
28
47
  )
29
48
  if err != 0:
30
49
  raise Exception(f"CUDA kernel build failed with error code {err}")
warp/build_dll.py CHANGED
@@ -172,6 +172,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
172
172
  f"CUDA Toolkit version {min_ctk_version[0]}.{min_ctk_version[1]}+ is required (found {ctk_version[0]}.{ctk_version[1]} in {cuda_home})"
173
173
  )
174
174
 
175
+ if ctk_version[0] < 12 and args.libmathdx_path:
176
+ print("MathDx support requires at least CUDA 12, skipping")
177
+ args.libmathdx_path = None
178
+
175
179
  gencode_opts = []
176
180
 
177
181
  if args.quick:
@@ -223,6 +227,13 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
223
227
  # is the library being built with CUDA enabled?
224
228
  cuda_enabled = "WP_ENABLE_CUDA=1" if (cu_path is not None) else "WP_ENABLE_CUDA=0"
225
229
 
230
+ if args.libmathdx_path:
231
+ libmathdx_includes = f' -I"{args.libmathdx_path}/include"'
232
+ mathdx_enabled = "WP_ENABLE_MATHDX=1"
233
+ else:
234
+ libmathdx_includes = ""
235
+ mathdx_enabled = "WP_ENABLE_MATHDX=0"
236
+
226
237
  if os.name == "nt":
227
238
  if args.host_compiler:
228
239
  host_linker = os.path.join(os.path.dirname(args.host_compiler), "link.exe")
@@ -244,7 +255,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
244
255
  iter_dbg = "_ITERATOR_DEBUG_LEVEL=2"
245
256
  debug = "_DEBUG"
246
257
 
247
- cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
258
+ cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{mathdx_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} '
248
259
 
249
260
  if args.mode == "debug":
250
261
  cpp_flags += "/Zi /Od /D WP_ENABLE_DEBUG=1"
@@ -273,10 +284,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
273
284
  cu_out = cu_path + ".o"
274
285
 
275
286
  if mode == "debug":
276
- cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
287
+ cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
277
288
 
278
289
  elif mode == "release":
279
- cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
290
+ cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
280
291
 
281
292
  with ScopedTimer("build_cuda", active=args.verbose):
282
293
  run_cmd(cuda_cmd)
@@ -285,6 +296,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
285
296
  f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"'
286
297
  )
287
298
 
299
+ if args.libmathdx_path:
300
+ linkopts.append(f'nvJitLink_static.lib /LIBPATH:"{args.libmathdx_path}/lib" mathdx_static.lib')
301
+
288
302
  with ScopedTimer("link", active=args.verbose):
289
303
  link_cmd = f'"{host_linker}" {" ".join(linkopts + libs)} /out:"{dll_path}"'
290
304
  run_cmd(link_cmd)
@@ -300,7 +314,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
300
314
  else:
301
315
  version = "-fabi-version=13" # GCC 8.2+
302
316
 
303
- cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
317
+ cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} '
304
318
 
305
319
  if mode == "debug":
306
320
  cpp_flags += "-O0 -g -D_DEBUG -DWP_ENABLE_DEBUG=1 -fkeep-inline-functions"
@@ -328,10 +342,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
328
342
  cu_out = cu_path + ".o"
329
343
 
330
344
  if mode == "debug":
331
- cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
345
+ cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
332
346
 
333
347
  elif mode == "release":
334
- cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"'
348
+ cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"'
335
349
 
336
350
  with ScopedTimer("build_cuda", active=args.verbose):
337
351
  run_cmd(cuda_cmd)
@@ -341,6 +355,9 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None
341
355
  f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt'
342
356
  )
343
357
 
358
+ if args.libmathdx_path:
359
+ ld_inputs.append(f"-lnvJitLink_static -L{args.libmathdx_path}/lib -lmathdx_static")
360
+
344
361
  if sys.platform == "darwin":
345
362
  opt_no_undefined = "-Wl,-undefined,error"
346
363
  opt_exclude_libs = ""