PyPI - warp-lang - Versions diffs - 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl - Mend

warp-lang 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (153)

warp/__init__.py +282 -103
warp/__init__.pyi +482 -110
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +93 -30
warp/build_dll.py +48 -63
warp/builtins.py +955 -137
warp/codegen.py +327 -209
warp/config.py +1 -1
warp/context.py +1363 -800
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_kernel.py +27 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +266 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +200 -91
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +1 -1
warp/jax_experimental/ffi.py +203 -54
warp/marching_cubes.py +708 -0
warp/native/array.h +103 -8
warp/native/builtin.h +90 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +13 -3
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +42 -11
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +4 -4
warp/native/mat.h +1913 -119
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +5 -3
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +337 -16
warp/native/rand.h +7 -7
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +22 -22
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +14 -14
warp/native/spatial.h +366 -17
warp/native/svd.h +23 -8
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +303 -70
warp/native/tile_radix_sort.h +5 -1
warp/native/tile_reduce.h +16 -25
warp/native/tuple.h +2 -2
warp/native/vec.h +385 -18
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +337 -193
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +137 -57
warp/render/render_usd.py +0 -1
warp/sim/collide.py +1 -2
warp/sim/graph_coloring.py +2 -2
warp/sim/integrator_vbd.py +10 -2
warp/sparse.py +559 -176
warp/tape.py +2 -0
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/sim/test_cloth.py +89 -6
warp/tests/sim/test_coloring.py +82 -7
warp/tests/test_array.py +56 -5
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +127 -114
warp/tests/test_codegen.py +3 -2
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +45 -2
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +1 -1
warp/tests/test_mat.py +1540 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +162 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +103 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_static.py +48 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tape.py +38 -0
warp/tests/test_types.py +0 -20
warp/tests/test_vec.py +216 -441
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +206 -152
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_reduce.py +100 -11
warp/tests/tile/test_tile_shared_memory.py +16 -16
warp/tests/tile/test_tile_sort.py +59 -55
warp/tests/unittest_suites.py +16 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +554 -264
warp/utils.py +68 -86
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0

warp/fem/integrate.py CHANGED Viewed

@@ -16,7 +16,7 @@
 import ast
 import inspect
 import textwrap
-from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Union
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union
 import warp as wp
 import warp.fem.operator as operator
@@ -34,7 +34,10 @@ from warp.fem.field import (
     TrialField,
     make_restriction,
 )
-from warp.fem.field.virtual import make_bilinear_dispatch_kernel, make_linear_dispatch_kernel
+from warp.fem.field.virtual import (
+    make_bilinear_dispatch_kernel,
+    make_linear_dispatch_kernel,
+)
 from warp.fem.linalg import array_axpy, basis_coefficient
 from warp.fem.operator import (
     Integrand,
@@ -56,7 +59,7 @@ from warp.fem.types import (
 )
 from warp.fem.utils import type_zero_element
 from warp.sparse import BsrMatrix, bsr_set_from_triplets, bsr_zeros
-from warp.types import type_size
+from warp.types import is_array, type_size
 from warp.utils import array_cast
@@ -101,7 +104,8 @@ class IntegrandVisitor(ast.NodeTransformer):
         field: FieldLike
         abstract_type: type
         concrete_type: type
-        root_arg_name: type
+        root_arg_name: str
+        local_arg_name: str
     def __init__(
         self,
@@ -111,6 +115,7 @@ class IntegrandVisitor(ast.NodeTransformer):
         self._integrand = integrand
         self._field_symbols = field_info.copy()
         self._field_nodes = {}
+        self._field_arg_annotation_nodes = {}
     @staticmethod
     def _build_field_info(integrand: Integrand, field_args: Dict[str, FieldLike]):
@@ -127,6 +132,7 @@ class IntegrandVisitor(ast.NodeTransformer):
                 abstract_type=integrand.argspec.annotations[name],
                 concrete_type=get_concrete_type(field),
                 root_arg_name=name,
+                local_arg_name=name,
             )
             for name, field in field_args.items()
         }
@@ -167,6 +173,7 @@ class IntegrandVisitor(ast.NodeTransformer):
                         field=res[0],
                         abstract_type=res[1],
                         concrete_type=res[2],
+                        local_arg_name=field_info.local_arg_name,
                         root_arg_name=f"{field_info.root_arg_name}.{func.name}",
                     )
@@ -191,6 +198,13 @@ class IntegrandVisitor(ast.NodeTransformer):
         return node
+    def visit_FunctionDef(self, node: ast.FunctionDef):
+        # record field arg annotation nodes
+        for arg in node.args.args:
+            self._field_arg_annotation_nodes[arg.arg] = arg.annotation
+        return self.generic_visit(node)
     def _get_callee_field_args(self, callee: Integrand, args: List[ast.AST]):
         # Get field types for call site arguments
         call_site_field_args: List[IntegrandVisitor.FieldInfo] = []
@@ -211,7 +225,13 @@ class IntegrandVisitor(ast.NodeTransformer):
                     raise TypeError(
                         f"Attempting to pass a {passed_field_info.abstract_type.__name__} to argument '{arg}' of '{callee.name}' expecting a {arg_type.__name__}"
                     )
-                callee_field_args[arg] = passed_field_info
+                callee_field_args[arg] = IntegrandVisitor.FieldInfo(
+                    field=passed_field_info.field,
+                    abstract_type=passed_field_info.abstract_type,
+                    concrete_type=passed_field_info.concrete_type,
+                    local_arg_name=arg,
+                    root_arg_name=passed_field_info.root_arg_name,
+                )
         return callee_field_args
@@ -263,18 +283,14 @@ class IntegrandTransformer(IntegrandVisitor):
                 f"Operator {operator.func.__name__} is not defined for {field_info.abstract_type.__name__} {field.name}"
             ) from e
-        # Update the ast Call node to use the new function pointer
-        call.func = ast.Attribute(value=call.func, attr=pointer.key, ctx=ast.Load())
         # Save the pointer as an attribute than can be accessed from the calling scope
-        # For usual operator call syntax, we can use the operator itself, but for the
-        # shortcut default operator syntax, we store it on the callee's concrete type
-        if isinstance(callee, Operator):
-            setattr(callee, pointer.key, pointer)
-        else:
-            setattr(field_info.concrete_type, pointer.key, pointer)
+        # (use the annotation node of the argument this field is constructed from)
+        callee_node = self._field_arg_annotation_nodes[field_info.local_arg_name]
+        setattr(self._field_symbols[field_info.local_arg_name].abstract_type, pointer.key, pointer)
+        call.func = ast.Attribute(value=callee_node, attr=pointer.key, ctx=ast.Load())
-            # also insert callee as first argument
+        # For shortcut default operator syntax, insert callee as first argument
+        if not isinstance(callee, Operator):
             call.args = [ast.Name(id=callee, ctx=ast.Load()), *call.args]
         # replace first argument with selected attribute
@@ -592,6 +608,9 @@ def _combined_kernel_options(integrand_options: Optional[Dict[str, Any]], call_s
     return options
+_INTEGRATE_CONSTANT_TILE_SIZE = 256
 def get_integrate_constant_kernel(
     integrand_func: wp.Function,
     domain: GeometryDomain,
@@ -599,8 +618,12 @@ def get_integrate_constant_kernel(
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
     accumulate_dtype,
+    tile_size: int = _INTEGRATE_CONSTANT_TILE_SIZE,
 ):
+    zero_element = type_zero_element(accumulate_dtype)
     def integrate_kernel_fn(
+        qp_count: int,
         qp_arg: quadrature.Arg,
         qp_element_index_arg: quadrature.ElementIndexArg,
         domain_arg: domain.ElementArg,
@@ -609,26 +632,33 @@ def get_integrate_constant_kernel(
         values: ValueStruct,
         result: wp.array(dtype=accumulate_dtype),
     ):
-        qp_eval_index = wp.tid()
-        domain_element_index, qp = quadrature.evaluation_point_element_index(qp_element_index_arg, qp_eval_index)
-        if domain_element_index == NULL_ELEMENT_INDEX:
-            return
+        block_index, lane = wp.tid()
+        qp_eval_index = block_index * tile_size + lane
-        element_index = domain.element_index(domain_index_arg, domain_element_index)
+        if qp_eval_index >= qp_count:
+            domain_element_index, qp = NULL_ELEMENT_INDEX, 0
+        else:
+            domain_element_index, qp = quadrature.evaluation_point_element_index(qp_element_index_arg, qp_eval_index)
-        qp_coords = quadrature.point_coords(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        qp_weight = quadrature.point_weight(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        qp_index = quadrature.point_index(domain_arg, qp_arg, domain_element_index, element_index, qp)
+        if domain_element_index == NULL_ELEMENT_INDEX:
+            val = zero_element()
+        else:
+            element_index = domain.element_index(domain_index_arg, domain_element_index)
-        test_dof_index = NULL_DOF_INDEX
-        trial_dof_index = NULL_DOF_INDEX
+            qp_coords = quadrature.point_coords(domain_arg, qp_arg, domain_element_index, element_index, qp)
+            qp_weight = quadrature.point_weight(domain_arg, qp_arg, domain_element_index, element_index, qp)
+            qp_index = quadrature.point_index(domain_arg, qp_arg, domain_element_index, element_index, qp)
-        sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
-        vol = domain.element_measure(domain_arg, sample)
+            test_dof_index = NULL_DOF_INDEX
+            trial_dof_index = NULL_DOF_INDEX
-        val = integrand_func(sample, fields, values)
+            sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
+            vol = domain.element_measure(domain_arg, sample)
+            val = accumulate_dtype(qp_weight * vol * integrand_func(sample, fields, values))
-        wp.atomic_add(result, 0, accumulate_dtype(qp_weight * vol * val))
+        tile_integral = wp.tile_sum(wp.tile(val))
+        wp.tile_atomic_add(result, tile_integral, offset=0)
     return integrate_kernel_fn
@@ -1020,7 +1050,7 @@ def get_integrate_bilinear_local_kernel(
             sample = Sample(element_index, qp_coords, qp_index, qp_weight, test_dof_index, trial_dof_index)
             val = integrand_func(sample, fields, values)
-            result[qp_eval_index, test_dof, trial_dof, taylor_dof] = qp_vol * val
+            result[test_dof, trial_dof, qp_eval_index, taylor_dof] = qp_vol * val
     return integrate_kernel_fn
@@ -1150,9 +1180,46 @@ def _generate_integrate_kernel(
     return kernel, FieldStruct, ValueStruct
+def _generate_auxiliary_kernels(
+    quadrature: Quadrature,
+    test: Optional[TestField],
+    trial: Optional[TrialField],
+    accumulate_dtype: type,
+    device,
+    kernel_options: Optional[Dict[str, Any]] = None,
+) -> List[Tuple[wp.Kernel, int]]:
+    if test is None or not isinstance(test, LocalTestField):
+        return ()
+    # For dispatched assembly, generate additional kernels
+    # heuristic to use tiles for "long" quadratures
+    dispatch_tile_size = 32
+    qp_eval_count = quadrature.evaluation_point_count()
+    if trial is None:
+        if (
+            not device.is_cuda
+            or qp_eval_count * test.space_restriction.total_node_element_count()
+            < 3 * dispatch_tile_size * test.space_restriction.node_count() * test.domain.element_count()
+        ):
+            dispatch_tile_size = 1
+        dispatch_kernel = make_linear_dispatch_kernel(
+            test, quadrature, accumulate_dtype, dispatch_tile_size, kernel_options
+        )
+    else:
+        if not device.is_cuda or qp_eval_count < 3 * dispatch_tile_size * test.domain.element_count():
+            dispatch_tile_size = 1
+        dispatch_kernel = make_bilinear_dispatch_kernel(
+            test, trial, quadrature, accumulate_dtype, dispatch_tile_size, kernel_options
+        )
+    return ((dispatch_kernel, dispatch_tile_size),)
 def _launch_integrate_kernel(
     integrand: Integrand,
     kernel: wp.Kernel,
+    auxiliary_kernels: List[Tuple[wp.Kernel, int]],
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
     domain: GeometryDomain,
@@ -1202,10 +1269,15 @@ def _launch_integrate_kernel(
         if output != accumulate_array or not add_to_output:
             accumulate_array.zero_()
+        qp_count = quadrature.evaluation_point_count()
+        tile_size = _INTEGRATE_CONSTANT_TILE_SIZE
+        block_count = (qp_count + tile_size - 1) // tile_size
         wp.launch(
             kernel=kernel,
-            dim=quadrature.evaluation_point_count(),
+            dim=(block_count, tile_size),
+            block_dim=tile_size,
             inputs=[
+                qp_count,
                 qp_arg,
                 quadrature.element_index_arg_value(device),
                 domain_elt_arg,
@@ -1328,21 +1400,29 @@ def _launch_integrate_kernel(
                 device=device,
             )
-            dispatch_kernel = make_linear_dispatch_kernel(test, quadrature, accumulate_dtype)
-            wp.launch(
-                kernel=dispatch_kernel,
-                dim=(test.space_restriction.node_count(), test.node_dof_count),
-                inputs=[
-                    qp_arg,
-                    domain_elt_arg,
-                    domain_elt_index_arg,
-                    test_arg,
-                    test.space.space_arg_value(device),
-                    local_result.array,
-                    output_view,
-                ],
-                device=device,
-            )
+            if test.TAYLOR_DOF_COUNT == 0:
+                wp.utils.warn(
+                    f"Test field is never evaluated in integrand '{integrand.name}', result will be zero",
+                    category=UserWarning,
+                    stacklevel=2,
+                )
+            else:
+                dispatch_kernel, dispatch_tile_size = auxiliary_kernels[0]
+                wp.launch(
+                    kernel=dispatch_kernel,
+                    dim=(test.space_restriction.node_count(), dispatch_tile_size),
+                    block_dim=dispatch_tile_size if dispatch_tile_size > 1 else 256,
+                    inputs=[
+                        qp_arg,
+                        domain_elt_arg,
+                        domain_elt_index_arg,
+                        test_arg,
+                        test.space.space_arg_value(device),
+                        local_result.array,
+                        output_view,
+                    ],
+                    device=device,
+                )
             local_result.release()
@@ -1415,14 +1495,15 @@ def _launch_integrate_kernel(
             device=device,
         )
     elif isinstance(test, LocalTestField):
+        qp_eval_count = quadrature.evaluation_point_count()
         local_result = cache.borrow_temporary(
             temporary_store=temporary_store,
             device=device,
             requires_grad=False,
             shape=(
-                quadrature.evaluation_point_count(),
                 test.value_dof_count,
                 trial.value_dof_count,
+                qp_eval_count,
                 test.TAYLOR_DOF_COUNT * trial.TAYLOR_DOF_COUNT,
             ),
             dtype=float,
@@ -1431,7 +1512,7 @@ def _launch_integrate_kernel(
         wp.launch(
             kernel=kernel,
             dim=(
-                quadrature.evaluation_point_count(),
+                qp_eval_count,
                 test.value_dof_count,
                 trial.value_dof_count,
                 trial.TAYLOR_DOF_COUNT,
@@ -1448,45 +1529,41 @@ def _launch_integrate_kernel(
             device=device,
         )
-        vec_array_shape = (*local_result.array.shape[:-1], test.TAYLOR_DOF_COUNT)
-        vec_array_dtype = cache.cached_vec_type(length=trial.TAYLOR_DOF_COUNT, dtype=float)
-        local_result_as_vec = wp.array(
-            data=None,
-            ptr=local_result.array.ptr,
-            capacity=local_result.array.capacity,
-            device=local_result.array.device,
-            shape=vec_array_shape,
-            dtype=vec_array_dtype,
-        )
-        dispatch_kernel = make_bilinear_dispatch_kernel(test, trial, quadrature, accumulate_dtype)
-        trial_partition_arg = trial.space_partition.partition_arg_value(device)
-        trial_topology_arg = trial.space_partition.space_topology.topo_arg_value(device)
-        wp.launch(
-            kernel=dispatch_kernel,
-            dim=(
-                test.space_restriction.node_count(),
-                test.node_dof_count,
-                trial.node_dof_count,
-                trial.space.topology.MAX_NODES_PER_ELEMENT,
-            ),
-            inputs=[
-                qp_arg,
-                domain_elt_arg,
-                domain_elt_index_arg,
-                test_arg,
-                test.space.space_arg_value(device),
-                trial_partition_arg,
-                trial_topology_arg,
-                trial.space.space_arg_value(device),
-                local_result_as_vec,
-                triplet_rows,
-                triplet_cols,
-                triplet_values,
-            ],
-            device=device,
-        )
+        if test.TAYLOR_DOF_COUNT * trial.TAYLOR_DOF_COUNT == 0:
+            wp.utils.warn(
+                f"Test and/or trial fields are never evaluated in integrand '{integrand.name}', result will be zero",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            triplet_rows.fill_(-1)
+        else:
+            dispatch_kernel, dispatch_tile_size = auxiliary_kernels[0]
+            trial_partition_arg = trial.space_partition.partition_arg_value(device)
+            trial_topology_arg = trial.space_partition.space_topology.topo_arg_value(device)
+            wp.launch(
+                kernel=dispatch_kernel,
+                dim=(
+                    test.space_restriction.total_node_element_count(),
+                    trial.space.topology.MAX_NODES_PER_ELEMENT,
+                    dispatch_tile_size,
+                ),
+                block_dim=dispatch_tile_size if dispatch_tile_size > 1 else 256,
+                inputs=[
+                    qp_arg,
+                    domain_elt_arg,
+                    domain_elt_index_arg,
+                    test_arg,
+                    test.space.space_arg_value(device),
+                    trial_partition_arg,
+                    trial_topology_arg,
+                    trial.space.space_arg_value(device),
+                    local_result.array,
+                    triplet_rows,
+                    triplet_cols,
+                    triplet_values,
+                ],
+                device=device,
+            )
         local_result.release()
@@ -1621,6 +1698,9 @@ def integrate(
     if values is None:
         values = {}
+    if device is None:
+        device = wp.get_device()
     if not isinstance(integrand, Integrand):
         raise ValueError("integrand must be tagged with @warp.fem.integrand decorator")
@@ -1713,9 +1793,19 @@ def integrate(
         kernel_options=kernel_options,
     )
+    auxiliary_kernels = _generate_auxiliary_kernels(
+        quadrature=quadrature,
+        test=test,
+        trial=trial,
+        accumulate_dtype=accumulate_dtype,
+        device=device,
+        kernel_options=kernel_options,
+    )
     return _launch_integrate_kernel(
         integrand=integrand,
         kernel=kernel,
+        auxiliary_kernels=auxiliary_kernels,
         FieldStruct=FieldStruct,
         ValueStruct=ValueStruct,
         domain=domain,
@@ -2207,6 +2297,9 @@ def _launch_interpolate_kernel(
         return
     if quadrature is None:
+        if dest is not None and (not is_array(dest) or dest.shape[0] != dim):
+            raise ValueError(f"dest must be a warp array with {dim} rows")
         wp.launch(
             kernel=kernel,
             dim=dim,
@@ -2216,21 +2309,34 @@ def _launch_interpolate_kernel(
         return
     qp_arg = quadrature.arg_value(device)
+    qp_eval_count = quadrature.evaluation_point_count()
+    qp_index_count = quadrature.total_point_count()
+    if qp_eval_count != qp_index_count:
+        wp.utils.warn(
+            f"Quadrature used for interpolation of {integrand.name} has different number of evaluation and indexed points, this may lead to incorrect results",
+            category=UserWarning,
+            stacklevel=2,
+        )
     qp_element_index_arg = quadrature.element_index_arg_value(device)
     if trial is None:
+        if dest is not None and (not is_array(dest) or dest.shape[0] != qp_index_count):
+            raise ValueError(f"dest must be a warp array with {qp_index_count} rows")
         wp.launch(
             kernel=kernel,
-            dim=quadrature.evaluation_point_count(),
+            dim=qp_eval_count,
             inputs=[qp_arg, qp_element_index_arg, elt_arg, elt_index_arg, field_arg_values, value_struct_values, dest],
             device=device,
         )
         return
-    nnz = quadrature.total_point_count() * trial.space.topology.MAX_NODES_PER_ELEMENT
+    nnz = qp_eval_count * trial.space.topology.MAX_NODES_PER_ELEMENT
-    if dest.nrow != quadrature.total_point_count() or dest.ncol != trial.space_partition.node_count():
+    if dest.nrow != qp_index_count or dest.ncol != trial.space_partition.node_count():
         raise RuntimeError(
-            f"'dest' matrix must have {quadrature.total_point_count()} rows and {trial.space_partition.node_count()} columns of blocks"
+            f"'dest' matrix must have {qp_index_count} rows and {trial.space_partition.node_count()} columns of blocks"
         )
     if dest.block_shape[1] != trial.node_dof_count:
         raise RuntimeError(f"'dest' matrix blocks must have {trial.node_dof_count} columns")
@@ -2324,6 +2430,9 @@ def interpolate(
     if values is None:
         values = {}
+    if device is None:
+        device = wp.get_device()
     if not isinstance(integrand, Integrand):
         raise ValueError("integrand must be tagged with @integrand decorator")

warp/fem/space/restriction.py CHANGED Viewed

@@ -159,6 +159,10 @@ class SpaceRestriction:
     def node_partition_index(args: NodeArg, restriction_node_index: int):
         return args.dof_partition_indices[restriction_node_index]
+    @wp.func
+    def node_partition_index_from_element_offset(args: NodeArg, element_offset: int):
+        return wp.lower_bound(args.dof_element_offsets, element_offset + 1) - 1
     @wp.func
     def node_element_range(args: NodeArg, partition_node_index: int):
         return args.dof_element_offsets[partition_node_index], args.dof_element_offsets[partition_node_index + 1]

warp/fem/space/shape/tet_shape_function.py CHANGED Viewed

@@ -168,19 +168,12 @@ class TetrahedronPolynomialShapeFunctions(TetrahedronShapeFunction):
         self.VERTEX_NODE_COUNT = wp.constant(1)
         self.EDGE_NODE_COUNT = wp.constant(degree - 1)
+        self.FACE_NODE_COUNT = wp.constant(max(0, degree - 2) * max(0, degree - 1) // 2)
+        self.INTERIOR_NODE_COUNT = wp.constant(max(0, degree - 1) * max(0, degree - 2) * max(0, degree - 3) // 6)
         self.NODES_PER_ELEMENT = wp.constant((degree + 1) * (degree + 2) * (degree + 3) // 6)
         self.NODES_PER_SIDE = wp.constant((degree + 1) * (degree + 2) // 2)
-        self.SIDE_NODE_COUNT = wp.constant(self.NODES_PER_ELEMENT - 3 * (self.VERTEX_NODE_COUNT + self.EDGE_NODE_COUNT))
-        self.INTERIOR_NODE_COUNT = wp.constant(
-            self.NODES_PER_ELEMENT - 3 * (self.VERTEX_NODE_COUNT + self.EDGE_NODE_COUNT)
-        )
-        self.VERTEX_NODE_COUNT = wp.constant(1)
-        self.EDGE_NODE_COUNT = wp.constant(degree - 1)
-        self.FACE_NODE_COUNT = wp.constant(max(0, degree - 2) * max(0, degree - 1) // 2)
-        self.INERIOR_NODE_COUNT = wp.constant(max(0, degree - 1) * max(0, degree - 2) * max(0, degree - 3) // 6)
         tet_coords = np.empty((self.NODES_PER_ELEMENT, 3), dtype=int)
         for tx in range(degree + 1):

warp/jax_experimental/custom_call.py CHANGED Viewed

@@ -107,7 +107,7 @@ def _warp_custom_callback(stream, buffers, opaque, opaque_len):
     assert hooks.forward, "Failed to find kernel entry point"
     # Launch the kernel.
-    wp.context.runtime.core.cuda_launch_kernel(
+    wp.context.runtime.core.wp_cuda_launch_kernel(
         device.context, hooks.forward, bounds.size, 0, 256, hooks.forward_smem_bytes, kernel_params, stream
     )