PyPI - warp-lang - Versions diffs - 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.8.1__py3-none-manylinux_2_34_aarch64.whl → 1.9.1__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (141)

warp/__init__.py +282 -103
warp/__init__.pyi +1904 -114
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +93 -30
warp/build_dll.py +331 -101
warp/builtins.py +1244 -160
warp/codegen.py +317 -206
warp/config.py +1 -1
warp/context.py +1465 -789
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_kernel.py +2 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +264 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +129 -51
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +25 -2
warp/jax_experimental/ffi.py +22 -1
warp/jax_experimental/xla_ffi.py +16 -7
warp/marching_cubes.py +708 -0
warp/native/array.h +99 -4
warp/native/builtin.h +86 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +8 -2
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +41 -10
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +2 -2
warp/native/mat.h +1910 -116
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +4 -2
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +331 -14
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +40 -31
warp/native/sort.h +2 -0
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +13 -13
warp/native/spatial.h +366 -17
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +471 -82
warp/native/vec.h +328 -14
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +377 -216
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +99 -18
warp/render/render_usd.py +1 -0
warp/sim/graph_coloring.py +2 -2
warp/sparse.py +558 -175
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_hash_grid.py +38 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/interop/test_jax.py +608 -28
warp/tests/sim/test_coloring.py +6 -6
warp/tests/test_array.py +58 -5
warp/tests/test_codegen.py +4 -3
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +49 -6
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +15 -1
warp/tests/test_mat.py +1518 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +140 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +71 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tuple.py +96 -0
warp/tests/test_types.py +61 -20
warp/tests/test_vec.py +179 -34
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/tile/test_tile.py +245 -18
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_shared_memory.py +5 -5
warp/tests/unittest_suites.py +6 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +571 -267
warp/utils.py +68 -86
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0

warp/fem/integrate.py CHANGED Viewed

@@ -16,7 +16,7 @@
 import ast
 import inspect
 import textwrap
-from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Union
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union
 import warp as wp
 import warp.fem.operator as operator
@@ -34,7 +34,10 @@ from warp.fem.field import (
     TrialField,
     make_restriction,
 )
-from warp.fem.field.virtual import make_bilinear_dispatch_kernel, make_linear_dispatch_kernel
+from warp.fem.field.virtual import (
+    make_bilinear_dispatch_kernel,
+    make_linear_dispatch_kernel,
+)
 from warp.fem.linalg import array_axpy, basis_coefficient
 from warp.fem.operator import (
     Integrand,
@@ -101,7 +104,8 @@ class IntegrandVisitor(ast.NodeTransformer):
         field: FieldLike
         abstract_type: type
         concrete_type: type
-        root_arg_name: type
+        root_arg_name: str
+        local_arg_name: str
     def __init__(
         self,
@@ -111,6 +115,7 @@ class IntegrandVisitor(ast.NodeTransformer):
         self._integrand = integrand
         self._field_symbols = field_info.copy()
         self._field_nodes = {}
+        self._field_arg_annotation_nodes = {}
     @staticmethod
     def _build_field_info(integrand: Integrand, field_args: Dict[str, FieldLike]):
@@ -127,6 +132,7 @@ class IntegrandVisitor(ast.NodeTransformer):
                 abstract_type=integrand.argspec.annotations[name],
                 concrete_type=get_concrete_type(field),
                 root_arg_name=name,
+                local_arg_name=name,
             )
             for name, field in field_args.items()
         }
@@ -167,6 +173,7 @@ class IntegrandVisitor(ast.NodeTransformer):
                         field=res[0],
                         abstract_type=res[1],
                         concrete_type=res[2],
+                        local_arg_name=field_info.local_arg_name,
                         root_arg_name=f"{field_info.root_arg_name}.{func.name}",
                     )
@@ -191,6 +198,13 @@ class IntegrandVisitor(ast.NodeTransformer):
         return node
+    def visit_FunctionDef(self, node: ast.FunctionDef):
+        # record field arg annotation nodes
+        for arg in node.args.args:
+            self._field_arg_annotation_nodes[arg.arg] = arg.annotation
+        return self.generic_visit(node)
     def _get_callee_field_args(self, callee: Integrand, args: List[ast.AST]):
         # Get field types for call site arguments
         call_site_field_args: List[IntegrandVisitor.FieldInfo] = []
@@ -211,7 +225,13 @@ class IntegrandVisitor(ast.NodeTransformer):
                     raise TypeError(
                         f"Attempting to pass a {passed_field_info.abstract_type.__name__} to argument '{arg}' of '{callee.name}' expecting a {arg_type.__name__}"
                     )
-                callee_field_args[arg] = passed_field_info
+                callee_field_args[arg] = IntegrandVisitor.FieldInfo(
+                    field=passed_field_info.field,
+                    abstract_type=passed_field_info.abstract_type,
+                    concrete_type=passed_field_info.concrete_type,
+                    local_arg_name=arg,
+                    root_arg_name=passed_field_info.root_arg_name,
+                )
         return callee_field_args
@@ -263,18 +283,14 @@ class IntegrandTransformer(IntegrandVisitor):
                 f"Operator {operator.func.__name__} is not defined for {field_info.abstract_type.__name__} {field.name}"
             ) from e
-        # Update the ast Call node to use the new function pointer
-        call.func = ast.Attribute(value=call.func, attr=pointer.key, ctx=ast.Load())
         # Save the pointer as an attribute than can be accessed from the calling scope
-        # For usual operator call syntax, we can use the operator itself, but for the
-        # shortcut default operator syntax, we store it on the callee's concrete type
-        if isinstance(callee, Operator):
-            setattr(callee, pointer.key, pointer)
-        else:
-            setattr(field_info.concrete_type, pointer.key, pointer)
+        # (use the annotation node of the argument this field is constructed from)
+        callee_node = self._field_arg_annotation_nodes[field_info.local_arg_name]
+        setattr(self._field_symbols[field_info.local_arg_name].abstract_type, pointer.key, pointer)
+        call.func = ast.Attribute(value=callee_node, attr=pointer.key, ctx=ast.Load())
-            # also insert callee as first argument
+        # For shortcut default operator syntax, insert callee as first argument
+        if not isinstance(callee, Operator):
             call.args = [ast.Name(id=callee, ctx=ast.Load()), *call.args]
         # replace first argument with selected attribute
@@ -592,6 +608,9 @@ def _combined_kernel_options(integrand_options: Optional[Dict[str, Any]], call_s
     return options
+_INTEGRATE_CONSTANT_TILE_SIZE = 256
 def get_integrate_constant_kernel(
     integrand_func: wp.Function,
     domain: GeometryDomain,
@@ -599,8 +618,12 @@ def get_integrate_constant_kernel(
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
     accumulate_dtype,
+    tile_size: int = _INTEGRATE_CONSTANT_TILE_SIZE,
 ):
+    zero_element = type_zero_element(accumulate_dtype)
     def integrate_kernel_fn(
+        qp_count: int,
         qp_arg: quadrature.Arg,
         qp_element_index_arg: quadrature.ElementIndexArg,
         domain_arg: domain.ElementArg,
@@ -609,26 +632,33 @@ def get_integrate_constant_kernel(
         values: ValueStruct,
         result: wp.array(dtype=accumulate_dtype),
     ):
-        qp_eval_index = wp.tid()
-        domain_element_index, qp = quadrature.evaluation_point_element_index(qp_element_index_arg, qp_eval_index)
-        if domain_element_index == NULL_ELEMENT_INDEX:
-            return
+        block_index, lane = wp.tid()
+        qp_eval_index = block_index * tile_size + lane
-        element_index = domain.element_index(domain_index_arg, domain_element_index)
+        if qp_eval_index >= qp_count:
+            domain_element_index, qp = NULL_ELEMENT_INDEX, 0
+        else:
+            domain_element_index, qp = quadrature.evaluation_point_element_index(qp_element_index_arg, qp_eval_index)
-        qp_coords = quadrature.point_coords(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        qp_weight = quadrature.point_weight(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        qp_index = quadrature.point_index(domain_arg, qp_arg, domain_element_index, element_index, qp)
+        if domain_element_index == NULL_ELEMENT_INDEX:
+            val = zero_element()
+        else:
+            element_index = domain.element_index(domain_index_arg, domain_element_index)
-        test_dof_index = NULL_DOF_INDEX
-        trial_dof_index = NULL_DOF_INDEX
+            qp_coords = quadrature.point_coords(domain_arg, qp_arg, domain_element_index, element_index, qp)
+            qp_weight = quadrature.point_weight(domain_arg, qp_arg, domain_element_index, element_index, qp)
+            qp_index = quadrature.point_index(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
-        vol = domain.element_measure(domain_arg, sample)
+            test_dof_index = NULL_DOF_INDEX
+            trial_dof_index = NULL_DOF_INDEX
-        val = integrand_func(sample, fields, values)
+            sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
+            vol = domain.element_measure(domain_arg, sample)
-        wp.atomic_add(result, 0, accumulate_dtype(qp_weight * vol * val))
+            val = accumulate_dtype(qp_weight * vol * integrand_func(sample, fields, values))
+        tile_integral = wp.tile_sum(wp.tile(val))
+        wp.tile_atomic_add(result, tile_integral, offset=0)
     return integrate_kernel_fn
@@ -1020,7 +1050,7 @@ def get_integrate_bilinear_local_kernel(
             sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
             val = integrand_func(sample, fields, values)
-            result[qp_eval_index, test_dof, trial_dof, taylor_dof] = qp_vol * val
+            result[test_dof, trial_dof, qp_eval_index, taylor_dof] = qp_vol * val
     return integrate_kernel_fn
@@ -1150,9 +1180,46 @@ def _generate_integrate_kernel(
     return kernel, FieldStruct, ValueStruct
+def _generate_auxiliary_kernels(
+    quadrature: Quadrature,
+    test: Optional[TestField],
+    trial: Optional[TrialField],
+    accumulate_dtype: type,
+    device,
+    kernel_options: Optional[Dict[str, Any]] = None,
+) -> List[Tuple[wp.Kernel, int]]:
+    if test is None or not isinstance(test, LocalTestField):
+        return ()
+    # For dispatched assembly, generate additional kernels
+    # heuristic to use tiles for "long" quadratures
+    dispatch_tile_size = 32
+    qp_eval_count = quadrature.evaluation_point_count()
+    if trial is None:
+        if (
+            not device.is_cuda
+            or qp_eval_count * test.space_restriction.total_node_element_count()
+            < 3 * dispatch_tile_size * test.space_restriction.node_count() * test.domain.element_count()
+        ):
+            dispatch_tile_size = 1
+        dispatch_kernel = make_linear_dispatch_kernel(
+            test, quadrature, accumulate_dtype, dispatch_tile_size, kernel_options
+        )
+    else:
+        if not device.is_cuda or qp_eval_count < 3 * dispatch_tile_size * test.domain.element_count():
+            dispatch_tile_size = 1
+        dispatch_kernel = make_bilinear_dispatch_kernel(
+            test, trial, quadrature, accumulate_dtype, dispatch_tile_size, kernel_options
+        )
+    return ((dispatch_kernel, dispatch_tile_size),)
 def _launch_integrate_kernel(
     integrand: Integrand,
     kernel: wp.Kernel,
+    auxiliary_kernels: List[Tuple[wp.Kernel, int]],
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
     domain: GeometryDomain,
@@ -1202,10 +1269,15 @@ def _launch_integrate_kernel(
         if output != accumulate_array or not add_to_output:
             accumulate_array.zero_()
+        qp_count = quadrature.evaluation_point_count()
+        tile_size = _INTEGRATE_CONSTANT_TILE_SIZE
+        block_count = (qp_count + tile_size - 1) // tile_size
         wp.launch(
             kernel=kernel,
-            dim=quadrature.evaluation_point_count(),
+            dim=(block_count, tile_size),
+            block_dim=tile_size,
             inputs=[
+                qp_count,
                 qp_arg,
                 quadrature.element_index_arg_value(device),
                 domain_elt_arg,
@@ -1335,10 +1407,11 @@ def _launch_integrate_kernel(
                     stacklevel=2,
                 )
             else:
-                dispatch_kernel = make_linear_dispatch_kernel(test, quadrature, accumulate_dtype)
+                dispatch_kernel, dispatch_tile_size = auxiliary_kernels[0]
                 wp.launch(
                     kernel=dispatch_kernel,
-                    dim=(test.space_restriction.node_count(), test.node_dof_count),
+                    dim=(test.space_restriction.node_count(), dispatch_tile_size),
+                    block_dim=dispatch_tile_size if dispatch_tile_size > 1 else 256,
                     inputs=[
                         qp_arg,
                         domain_elt_arg,
@@ -1422,14 +1495,15 @@ def _launch_integrate_kernel(
             device=device,
         )
     elif isinstance(test, LocalTestField):
+        qp_eval_count = quadrature.evaluation_point_count()
         local_result = cache.borrow_temporary(
             temporary_store=temporary_store,
             device=device,
             requires_grad=False,
             shape=(
-                quadrature.evaluation_point_count(),
                 test.value_dof_count,
                 trial.value_dof_count,
+                qp_eval_count,
                 test.TAYLOR_DOF_COUNT * trial.TAYLOR_DOF_COUNT,
             ),
             dtype=float,
@@ -1438,7 +1512,7 @@ def _launch_integrate_kernel(
         wp.launch(
             kernel=kernel,
             dim=(
-                quadrature.evaluation_point_count(),
+                qp_eval_count,
                 test.value_dof_count,
                 trial.value_dof_count,
                 trial.TAYLOR_DOF_COUNT,
@@ -1455,17 +1529,6 @@ def _launch_integrate_kernel(
             device=device,
         )
-        vec_array_shape = (*local_result.array.shape[:-1], test.TAYLOR_DOF_COUNT)
-        vec_array_dtype = cache.cached_vec_type(length=trial.TAYLOR_DOF_COUNT, dtype=float)
-        local_result_as_vec = wp.array(
-            data=None,
-            ptr=local_result.array.ptr,
-            capacity=local_result.array.capacity,
-            device=local_result.array.device,
-            shape=vec_array_shape,
-            dtype=vec_array_dtype,
-        )
         if test.TAYLOR_DOF_COUNT * trial.TAYLOR_DOF_COUNT == 0:
             wp.utils.warn(
                 f"Test and/or trial fields are never evaluated in integrand '{integrand.name}', result will be zero",
@@ -1474,18 +1537,17 @@ def _launch_integrate_kernel(
             )
             triplet_rows.fill_(-1)
         else:
-            dispatch_kernel = make_bilinear_dispatch_kernel(test, trial, quadrature, accumulate_dtype)
+            dispatch_kernel, dispatch_tile_size = auxiliary_kernels[0]
             trial_partition_arg = trial.space_partition.partition_arg_value(device)
             trial_topology_arg = trial.space_partition.space_topology.topo_arg_value(device)
             wp.launch(
                 kernel=dispatch_kernel,
                 dim=(
-                    test.space_restriction.node_count(),
-                    test.node_dof_count,
-                    trial.node_dof_count,
+                    test.space_restriction.total_node_element_count(),
                     trial.space.topology.MAX_NODES_PER_ELEMENT,
+                    dispatch_tile_size,
                 ),
+                block_dim=dispatch_tile_size if dispatch_tile_size > 1 else 256,
                 inputs=[
                     qp_arg,
                     domain_elt_arg,
@@ -1495,7 +1557,7 @@ def _launch_integrate_kernel(
                     trial_partition_arg,
                     trial_topology_arg,
                     trial.space.space_arg_value(device),
-                    local_result_as_vec,
+                    local_result.array,
                     triplet_rows,
                     triplet_cols,
                     triplet_values,
@@ -1636,6 +1698,9 @@ def integrate(
     if values is None:
         values = {}
+    if device is None:
+        device = wp.get_device()
     if not isinstance(integrand, Integrand):
         raise ValueError("integrand must be tagged with @warp.fem.integrand decorator")
@@ -1728,9 +1793,19 @@ def integrate(
         kernel_options=kernel_options,
     )
+    auxiliary_kernels = _generate_auxiliary_kernels(
+        quadrature=quadrature,
+        test=test,
+        trial=trial,
+        accumulate_dtype=accumulate_dtype,
+        device=device,
+        kernel_options=kernel_options,
+    )
     return _launch_integrate_kernel(
         integrand=integrand,
         kernel=kernel,
+        auxiliary_kernels=auxiliary_kernels,
         FieldStruct=FieldStruct,
         ValueStruct=ValueStruct,
         domain=domain,
@@ -2355,6 +2430,9 @@ def interpolate(
     if values is None:
         values = {}
+    if device is None:
+        device = wp.get_device()
     if not isinstance(integrand, Integrand):
         raise ValueError("integrand must be tagged with @integrand decorator")

warp/fem/space/restriction.py CHANGED Viewed

@@ -159,6 +159,10 @@ class SpaceRestriction:
     def node_partition_index(args: NodeArg, restriction_node_index: int):
         return args.dof_partition_indices[restriction_node_index]
+    @wp.func
+    def node_partition_index_from_element_offset(args: NodeArg, element_offset: int):
+        return wp.lower_bound(args.dof_element_offsets, element_offset + 1) - 1
     @wp.func
     def node_element_range(args: NodeArg, partition_node_index: int):
         return args.dof_element_offsets[partition_node_index], args.dof_element_offsets[partition_node_index + 1]

warp/fem/space/shape/tet_shape_function.py CHANGED Viewed

@@ -168,19 +168,12 @@ class TetrahedronPolynomialShapeFunctions(TetrahedronShapeFunction):
         self.VERTEX_NODE_COUNT = wp.constant(1)
         self.EDGE_NODE_COUNT = wp.constant(degree - 1)
+        self.FACE_NODE_COUNT = wp.constant(max(0, degree - 2) * max(0, degree - 1) // 2)
+        self.INTERIOR_NODE_COUNT = wp.constant(max(0, degree - 1) * max(0, degree - 2) * max(0, degree - 3) // 6)
         self.NODES_PER_ELEMENT = wp.constant((degree + 1) * (degree + 2) * (degree + 3) // 6)
         self.NODES_PER_SIDE = wp.constant((degree + 1) * (degree + 2) // 2)
-        self.SIDE_NODE_COUNT = wp.constant(self.NODES_PER_ELEMENT - 3 * (self.VERTEX_NODE_COUNT + self.EDGE_NODE_COUNT))
-        self.INTERIOR_NODE_COUNT = wp.constant(
-            self.NODES_PER_ELEMENT - 3 * (self.VERTEX_NODE_COUNT + self.EDGE_NODE_COUNT)
-        )
-        self.VERTEX_NODE_COUNT = wp.constant(1)
-        self.EDGE_NODE_COUNT = wp.constant(degree - 1)
-        self.FACE_NODE_COUNT = wp.constant(max(0, degree - 2) * max(0, degree - 1) // 2)
-        self.INERIOR_NODE_COUNT = wp.constant(max(0, degree - 1) * max(0, degree - 2) * max(0, degree - 3) // 6)
         tet_coords = np.empty((self.NODES_PER_ELEMENT, 3), dtype=int)
         for tx in range(degree + 1):

warp/jax_experimental/custom_call.py CHANGED Viewed

@@ -19,6 +19,7 @@ import warp as wp
 from warp.context import type_str
 from warp.jax import get_jax_device
 from warp.types import array_t, launch_bounds_t, strides_from_shape
+from warp.utils import warn
 _jax_warp_p = None
@@ -28,7 +29,7 @@ _registered_kernels = [None]
 _registered_kernel_to_id = {}
-def jax_kernel(kernel, launch_dims=None):
+def jax_kernel(kernel, launch_dims=None, quiet=False):
     """Create a Jax primitive from a Warp kernel.
     NOTE: This is an experimental feature under development.
@@ -38,6 +39,7 @@ def jax_kernel(kernel, launch_dims=None):
         launch_dims: Optional. Specify the kernel launch dimensions. If None,
                      dimensions are inferred from the shape of the first argument.
                      This option when set will specify the output dimensions.
+        quiet: Optional. If True, suppress deprecation warnings with newer JAX versions.
     Limitations:
         - All kernel arguments must be contiguous arrays.
@@ -46,6 +48,27 @@ def jax_kernel(kernel, launch_dims=None):
         - Only the CUDA backend is supported.
     """
+    import jax
+    # check if JAX version supports this
+    if jax.__version_info__ < (0, 4, 25) or jax.__version_info__ >= (0, 8, 0):
+        msg = (
+            "This version of jax_kernel() requires JAX version 0.4.25 - 0.7.x, "
+            f"but installed JAX version is {jax.__version_info__}."
+        )
+        if jax.__version_info__ >= (0, 8, 0):
+            msg += " Please use warp.jax_experimental.ffi.jax_kernel instead."
+        raise RuntimeError(msg)
+    # deprecation warning
+    if jax.__version_info__ >= (0, 5, 0) and not quiet:
+        warn(
+            "This version of jax_kernel() is deprecated and will not be supported with newer JAX versions. "
+            "Please use the newer FFI version instead (warp.jax_experimental.ffi.jax_kernel). "
+            "In Warp release 1.10, the FFI version will become the default implementation of jax_kernel().",
+            DeprecationWarning,
+        )
     if _jax_warp_p is None:
         # Create and register the primitive
         _create_jax_warp_primitive()
@@ -107,7 +130,7 @@ def _warp_custom_callback(stream, buffers, opaque, opaque_len):
     assert hooks.forward, "Failed to find kernel entry point"
     # Launch the kernel.
-    wp.context.runtime.core.cuda_launch_kernel(
+    wp.context.runtime.core.wp_cuda_launch_kernel(
         device.context, hooks.forward, bounds.size, 0, 256, hooks.forward_smem_bytes, kernel_params, stream
     )

warp/jax_experimental/ffi.py CHANGED Viewed

@@ -29,6 +29,18 @@ from warp.types import array_t, launch_bounds_t, strides_from_shape, type_to_war
 from .xla_ffi import *
+def check_jax_version():
+    # check if JAX version supports this
+    if jax.__version_info__ < (0, 5, 0):
+        msg = (
+            "This version of jax_kernel() requires JAX version 0.5.0 or higher, "
+            f"but installed JAX version is {jax.__version_info__}."
+        )
+        if jax.__version_info__ >= (0, 4, 25):
+            msg += " Please use warp.jax_experimental.custom_call.jax_kernel instead."
+        raise RuntimeError(msg)
 class GraphMode(IntEnum):
     NONE = 0  # don't capture a graph
     JAX = 1  # let JAX capture a graph
@@ -317,7 +329,7 @@ class FfiKernel:
             assert hooks.forward, "Failed to find kernel entry point"
             # launch the kernel
-            wp.context.runtime.core.cuda_launch_kernel(
+            wp.context.runtime.core.wp_cuda_launch_kernel(
                 device.context,
                 hooks.forward,
                 launch_bounds.size,
@@ -381,6 +393,7 @@ class FfiCallable:
             if arg_name == "return":
                 if arg_type is not None:
                     raise TypeError("Function must not return a value")
+                continue
             else:
                 arg = FfiArg(arg_name, arg_type, arg_name in in_out_argnames)
                 if arg_name in in_out_argnames:
@@ -667,8 +680,12 @@ def jax_kernel(
         - There must be at least one output or input-output argument.
         - Only the CUDA backend is supported.
     """
+    check_jax_version()
     key = (
         kernel.func,
+        kernel.sig,
         num_outputs,
         vmap_method,
         tuple(launch_dims) if launch_dims else launch_dims,
@@ -725,6 +742,8 @@ def jax_callable(
         - Only the CUDA backend is supported.
     """
+    check_jax_version()
     if graph_compatible is not None:
         wp.utils.warn(
             "The `graph_compatible` argument is deprecated, use `graph_mode` instead.",
@@ -771,6 +790,8 @@ def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = Tr
         graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
     """
+    check_jax_version()
     # TODO check that the name is not already registered
     def ffi_callback(call_frame):

warp/jax_experimental/xla_ffi.py CHANGED Viewed

@@ -475,17 +475,26 @@ _xla_data_type_to_constructor = {
     XLA_FFI_DataType.C64: jnp.complex64,
     XLA_FFI_DataType.C128: jnp.complex128,
     # XLA_FFI_DataType.TOKEN
-    XLA_FFI_DataType.F8E5M2: jnp.float8_e5m2,
-    XLA_FFI_DataType.F8E3M4: jnp.float8_e3m4,
-    XLA_FFI_DataType.F8E4M3: jnp.float8_e4m3,
-    XLA_FFI_DataType.F8E4M3FN: jnp.float8_e4m3fn,
-    XLA_FFI_DataType.F8E4M3B11FNUZ: jnp.float8_e4m3b11fnuz,
-    XLA_FFI_DataType.F8E5M2FNUZ: jnp.float8_e5m2fnuz,
-    XLA_FFI_DataType.F8E4M3FNUZ: jnp.float8_e4m3fnuz,
     # XLA_FFI_DataType.F4E2M1FN: jnp.float4_e2m1fn.dtype,
     # XLA_FFI_DataType.F8E8M0FNU: jnp.float8_e8m0fnu.dtype,
 }
+# newer types not supported by older versions
+if hasattr(jnp, "float8_e5m2"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E5M2] = jnp.float8_e5m2
+if hasattr(jnp, "float8_e3m4"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E3M4] = jnp.float8_e3m4
+if hasattr(jnp, "float8_e4m3"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3] = jnp.float8_e4m3
+if hasattr(jnp, "float8_e4m3fn"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3FN] = jnp.float8_e4m3fn
+if hasattr(jnp, "float8_e4m3b11fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3B11FNUZ] = jnp.float8_e4m3b11fnuz
+if hasattr(jnp, "float8_e5m2fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E5M2FNUZ] = jnp.float8_e5m2fnuz
+if hasattr(jnp, "float8_e4m3fnuz"):
+    _xla_data_type_to_constructor[XLA_FFI_DataType.F8E4M3FNUZ] = jnp.float8_e4m3fnuz
 ########################################################################
 # Helpers for translating between ctypes and python types