PyPI - warp-lang - Versions diffs - 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl - Mend

warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (315) hide show

warp/__init__.py +15 -7
warp/__init__.pyi +1 -0
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +22 -443
warp/build_dll.py +384 -0
warp/builtins.py +998 -488
warp/codegen.py +1307 -739
warp/config.py +5 -3
warp/constants.py +6 -0
warp/context.py +1291 -548
warp/dlpack.py +31 -31
warp/fabric.py +326 -0
warp/fem/__init__.py +27 -0
warp/fem/cache.py +389 -0
warp/fem/dirichlet.py +181 -0
warp/fem/domain.py +263 -0
warp/fem/field/__init__.py +101 -0
warp/fem/field/field.py +149 -0
warp/fem/field/nodal_field.py +299 -0
warp/fem/field/restriction.py +21 -0
warp/fem/field/test.py +181 -0
warp/fem/field/trial.py +183 -0
warp/fem/geometry/__init__.py +19 -0
warp/fem/geometry/closest_point.py +70 -0
warp/fem/geometry/deformed_geometry.py +271 -0
warp/fem/geometry/element.py +744 -0
warp/fem/geometry/geometry.py +186 -0
warp/fem/geometry/grid_2d.py +373 -0
warp/fem/geometry/grid_3d.py +435 -0
warp/fem/geometry/hexmesh.py +953 -0
warp/fem/geometry/partition.py +376 -0
warp/fem/geometry/quadmesh_2d.py +532 -0
warp/fem/geometry/tetmesh.py +840 -0
warp/fem/geometry/trimesh_2d.py +577 -0
warp/fem/integrate.py +1616 -0
warp/fem/operator.py +191 -0
warp/fem/polynomial.py +213 -0
warp/fem/quadrature/__init__.py +2 -0
warp/fem/quadrature/pic_quadrature.py +245 -0
warp/fem/quadrature/quadrature.py +294 -0
warp/fem/space/__init__.py +292 -0
warp/fem/space/basis_space.py +489 -0
warp/fem/space/collocated_function_space.py +105 -0
warp/fem/space/dof_mapper.py +236 -0
warp/fem/space/function_space.py +145 -0
warp/fem/space/grid_2d_function_space.py +267 -0
warp/fem/space/grid_3d_function_space.py +306 -0
warp/fem/space/hexmesh_function_space.py +352 -0
warp/fem/space/partition.py +350 -0
warp/fem/space/quadmesh_2d_function_space.py +369 -0
warp/fem/space/restriction.py +160 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +738 -0
warp/fem/space/shape/shape_function.py +103 -0
warp/fem/space/shape/square_shape_function.py +611 -0
warp/fem/space/shape/tet_shape_function.py +567 -0
warp/fem/space/shape/triangle_shape_function.py +429 -0
warp/fem/space/tetmesh_function_space.py +292 -0
warp/fem/space/topology.py +295 -0
warp/fem/space/trimesh_2d_function_space.py +221 -0
warp/fem/types.py +77 -0
warp/fem/utils.py +495 -0
warp/native/array.h +164 -55
warp/native/builtin.h +150 -174
warp/native/bvh.cpp +75 -328
warp/native/bvh.cu +406 -23
warp/native/bvh.h +37 -45
warp/native/clang/clang.cpp +136 -24
warp/native/crt.cpp +1 -76
warp/native/crt.h +111 -104
warp/native/cuda_crt.h +1049 -0
warp/native/cuda_util.cpp +15 -3
warp/native/cuda_util.h +3 -1
warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
warp/native/cutlass/tools/library/scripts/library.py +799 -0
warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
warp/native/cutlass/tools/library/scripts/rt.py +796 -0
warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
warp/native/cutlass_gemm.cu +5 -3
warp/native/exports.h +1240 -949
warp/native/fabric.h +228 -0
warp/native/hashgrid.cpp +4 -4
warp/native/hashgrid.h +22 -2
warp/native/initializer_array.h +2 -2
warp/native/intersect.h +22 -7
warp/native/intersect_adj.h +8 -8
warp/native/intersect_tri.h +13 -16
warp/native/marching.cu +157 -161
warp/native/mat.h +119 -19
warp/native/matnn.h +2 -2
warp/native/mesh.cpp +108 -83
warp/native/mesh.cu +243 -6
warp/native/mesh.h +1547 -458
warp/native/nanovdb/NanoVDB.h +1 -1
warp/native/noise.h +272 -329
warp/native/quat.h +51 -8
warp/native/rand.h +45 -35
warp/native/range.h +6 -2
warp/native/reduce.cpp +157 -0
warp/native/reduce.cu +348 -0
warp/native/runlength_encode.cpp +62 -0
warp/native/runlength_encode.cu +46 -0
warp/native/scan.cu +11 -13
warp/native/scan.h +1 -0
warp/native/solid_angle.h +442 -0
warp/native/sort.cpp +13 -0
warp/native/sort.cu +9 -1
warp/native/sparse.cpp +338 -0
warp/native/sparse.cu +545 -0
warp/native/spatial.h +2 -2
warp/native/temp_buffer.h +30 -0
warp/native/vec.h +126 -24
warp/native/volume.h +120 -0
warp/native/warp.cpp +658 -53
warp/native/warp.cu +660 -68
warp/native/warp.h +112 -12
warp/optim/__init__.py +1 -0
warp/optim/linear.py +922 -0
warp/optim/sgd.py +92 -0
warp/render/render_opengl.py +392 -152
warp/render/render_usd.py +11 -11
warp/sim/__init__.py +2 -2
warp/sim/articulation.py +385 -185
warp/sim/collide.py +21 -8
warp/sim/import_mjcf.py +297 -106
warp/sim/import_urdf.py +389 -210
warp/sim/import_usd.py +198 -97
warp/sim/inertia.py +17 -18
warp/sim/integrator_euler.py +14 -8
warp/sim/integrator_xpbd.py +161 -19
warp/sim/model.py +795 -291
warp/sim/optimizer.py +2 -6
warp/sim/render.py +65 -3
warp/sim/utils.py +3 -0
warp/sparse.py +1227 -0
warp/stubs.py +665 -223
warp/tape.py +66 -15
warp/tests/__main__.py +3 -6
warp/tests/assets/curlnoise_golden.npy +0 -0
warp/tests/assets/pnoise_golden.npy +0 -0
warp/tests/assets/torus.usda +105 -105
warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
warp/tests/aux_test_unresolved_func.py +14 -0
warp/tests/aux_test_unresolved_symbol.py +14 -0
warp/tests/disabled_kinematics.py +239 -0
warp/tests/run_coverage_serial.py +31 -0
warp/tests/test_adam.py +103 -106
warp/tests/test_arithmetic.py +128 -74
warp/tests/test_array.py +1497 -211
warp/tests/test_array_reduce.py +150 -0
warp/tests/test_atomic.py +64 -28
warp/tests/test_bool.py +99 -0
warp/tests/test_builtins_resolution.py +1292 -0
warp/tests/test_bvh.py +75 -43
warp/tests/test_closest_point_edge_edge.py +54 -57
warp/tests/test_codegen.py +233 -128
warp/tests/test_compile_consts.py +28 -20
warp/tests/test_conditional.py +108 -24
warp/tests/test_copy.py +10 -12
warp/tests/test_ctypes.py +112 -88
warp/tests/test_dense.py +21 -14
warp/tests/test_devices.py +98 -0
warp/tests/test_dlpack.py +136 -108
warp/tests/test_examples.py +277 -0
warp/tests/test_fabricarray.py +955 -0
warp/tests/test_fast_math.py +15 -11
warp/tests/test_fem.py +1271 -0
warp/tests/test_fp16.py +53 -19
warp/tests/test_func.py +187 -74
warp/tests/test_generics.py +194 -49
warp/tests/test_grad.py +180 -116
warp/tests/test_grad_customs.py +176 -0
warp/tests/test_hash_grid.py +52 -37
warp/tests/test_import.py +10 -23
warp/tests/test_indexedarray.py +577 -24
warp/tests/test_intersect.py +18 -9
warp/tests/test_large.py +141 -0
warp/tests/test_launch.py +251 -15
warp/tests/test_lerp.py +64 -65
warp/tests/test_linear_solvers.py +154 -0
warp/tests/test_lvalue.py +493 -0
warp/tests/test_marching_cubes.py +12 -13
warp/tests/test_mat.py +508 -2778
warp/tests/test_mat_lite.py +115 -0
warp/tests/test_mat_scalar_ops.py +2889 -0
warp/tests/test_math.py +103 -9
warp/tests/test_matmul.py +305 -69
warp/tests/test_matmul_lite.py +410 -0
warp/tests/test_mesh.py +71 -14
warp/tests/test_mesh_query_aabb.py +41 -25
warp/tests/test_mesh_query_point.py +325 -34
warp/tests/test_mesh_query_ray.py +39 -22
warp/tests/test_mlp.py +30 -22
warp/tests/test_model.py +92 -89
warp/tests/test_modules_lite.py +39 -0
warp/tests/test_multigpu.py +88 -114
warp/tests/test_noise.py +12 -11
warp/tests/test_operators.py +16 -20
warp/tests/test_options.py +11 -11
warp/tests/test_pinned.py +17 -18
warp/tests/test_print.py +32 -11
warp/tests/test_quat.py +275 -129
warp/tests/test_rand.py +18 -16
warp/tests/test_reload.py +38 -34
warp/tests/test_rounding.py +50 -43
warp/tests/test_runlength_encode.py +190 -0
warp/tests/test_smoothstep.py +9 -11
warp/tests/test_snippet.py +143 -0
warp/tests/test_sparse.py +460 -0
warp/tests/test_spatial.py +276 -243
warp/tests/test_streams.py +110 -85
warp/tests/test_struct.py +331 -85
warp/tests/test_tape.py +39 -21
warp/tests/test_torch.py +118 -89
warp/tests/test_transient_module.py +12 -13
warp/tests/test_types.py +614 -0
warp/tests/test_utils.py +494 -0
warp/tests/test_vec.py +354 -1987
warp/tests/test_vec_lite.py +73 -0
warp/tests/test_vec_scalar_ops.py +2099 -0
warp/tests/test_volume.py +457 -293
warp/tests/test_volume_write.py +124 -134
warp/tests/unittest_serial.py +35 -0
warp/tests/unittest_suites.py +341 -0
warp/tests/unittest_utils.py +568 -0
warp/tests/unused_test_misc.py +71 -0
warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
warp/thirdparty/appdirs.py +36 -45
warp/thirdparty/unittest_parallel.py +549 -0
warp/torch.py +72 -30
warp/types.py +1744 -713
warp/utils.py +360 -350
warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
warp_lang-0.11.0.dist-info/METADATA +238 -0
warp_lang-0.11.0.dist-info/RECORD +332 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
warp/bin/warp-clang.exp +0 -0
warp/bin/warp-clang.lib +0 -0
warp/bin/warp.exp +0 -0
warp/bin/warp.lib +0 -0
warp/tests/test_all.py +0 -215
warp/tests/test_array_scan.py +0 -60
warp/tests/test_base.py +0 -208
warp/tests/test_unresolved_func.py +0 -7
warp/tests/test_unresolved_symbol.py +0 -7
warp_lang-0.9.0.dist-info/METADATA +0 -20
warp_lang-0.9.0.dist-info/RECORD +0 -177
/warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
/warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
/warp/tests/{test_square.py → aux_test_square.py} +0 -0
{warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0

warp/fem/geometry/quadmesh_2d.py ADDED Viewed

@@ -0,0 +1,532 @@
+from typing import Optional
+import warp as wp
+from warp.fem.cache import (
+    TemporaryStore,
+    borrow_temporary,
+    borrow_temporary_like,
+    cached_arg_value,
+)
+from warp.fem.types import OUTSIDE, Coords, ElementIndex, Sample, make_free_sample
+from .element import LinearEdge, Square
+from .geometry import Geometry
+# from .closest_point import project_on_tet_at_origin
+@wp.struct
+class Quadmesh2DCellArg:
+    # Device-side cell data of a 2D quadrilateral mesh, passed to the cell-level Warp functions of `Quadmesh2D`.
+    # Vertex indices of each quad: one row per cell, four columns (indexed 0..3 by the kernels)
+    quad_vertex_indices: wp.array2d(dtype=int)
+    # 2D position of each mesh vertex
+    positions: wp.array(dtype=wp.vec2)
+    # for neighbor cell lookup
+    # The quads touching vertex `v` are read as
+    # `vertex_quad_indices[vertex_quad_offsets[v] : vertex_quad_offsets[v + 1]]`
+    vertex_quad_offsets: wp.array(dtype=int)
+    vertex_quad_indices: wp.array(dtype=int)
+@wp.struct
+class Quadmesh2DSideArg:
+    # Device-side side (edge) data of a 2D quadrilateral mesh, passed to the side-level Warp functions of `Quadmesh2D`.
+    # Cell data of the same mesh (positions and quad connectivity are read through it)
+    cell_arg: Quadmesh2DCellArg
+    # (start vertex, end vertex) of each edge
+    edge_vertex_indices: wp.array(dtype=wp.vec2i)
+    # (inner quad, outer quad) of each edge; both components are equal for boundary edges
+    edge_quad_indices: wp.array(dtype=wp.vec2i)
+class Quadmesh2D(Geometry):
+    """Two-dimensional quadrilateral mesh geometry"""
+    dimension = 2
+    def __init__(
+        self, quad_vertex_indices: wp.array, positions: wp.array, temporary_store: Optional[TemporaryStore] = None
+    ):
+        """
+        Constructs a two-dimensional quadrilateral mesh.
+        Args:
+            quad_vertex_indices: warp array of shape (num_quads, 4) containing vertex indices for each quad, in counter-clockwise order
+            positions: warp array of shape (num_vertices, 2) containing 2d position for each vertex
+            temporary_store: shared pool from which to allocate temporary arrays
+        """
+        self.quad_vertex_indices = quad_vertex_indices
+        self.positions = positions
+        # Topology arrays; all of them (plus `_boundary_edge_indices`) are filled by `_build_topology` below
+        self._edge_vertex_indices: wp.array = None
+        self._edge_quad_indices: wp.array = None
+        self._vertex_quad_offsets: wp.array = None
+        self._vertex_quad_indices: wp.array = None
+        self._build_topology(temporary_store)
+    def cell_count(self):
+        """Number of quads, i.e. the number of rows of `quad_vertex_indices`."""
+        return self.quad_vertex_indices.shape[0]
+    def vertex_count(self):
+        """Number of mesh vertices, i.e. the length of `positions`."""
+        return self.positions.shape[0]
+    def side_count(self):
+        """Number of unique edges, i.e. the length of the edge array built by `_build_topology`."""
+        return self._edge_vertex_indices.shape[0]
+    def boundary_side_count(self):
+        """Number of boundary edges, i.e. the length of `_boundary_edge_indices` (set by `_build_topology`)."""
+        return self._boundary_edge_indices.shape[0]
+    def reference_cell(self) -> Square:
+        """Returns a new `Square` instance, the reference element for the cells of this mesh."""
+        return Square()
+    def reference_side(self) -> LinearEdge:
+        """Returns a new `LinearEdge` instance, the reference element for the sides of this mesh."""
+        return LinearEdge()
+    @property
+    def edge_quad_indices(self) -> wp.array:
+        """Per-edge `wp.vec2i` array of (inner quad, outer quad) indices built by `_build_topology`."""
+        return self._edge_quad_indices
+    @property
+    def edge_vertex_indices(self) -> wp.array:
+        """Per-edge `wp.vec2i` array of (start vertex, end vertex) indices built by `_build_topology`."""
+        return self._edge_vertex_indices
+    CellArg = Quadmesh2DCellArg
+    SideArg = Quadmesh2DSideArg
+    @wp.struct
+    class SideIndexArg:
+        # Device-side argument of `boundary_side_index`: maps a boundary side number to its edge index
+        boundary_edge_indices: wp.array(dtype=int)
+    # Geometry device interface
+    @cached_arg_value
+    def cell_arg_value(self, device) -> CellArg:
+        """Builds the `CellArg` struct for `device`, with every array obtained through `.to(device)`.
+
+        NOTE(review): `cached_arg_value` presumably memoizes the result per device -- confirm in `warp.fem.cache`.
+        """
+        args = self.CellArg()
+        args.quad_vertex_indices = self.quad_vertex_indices.to(device)
+        args.positions = self.positions.to(device)
+        args.vertex_quad_offsets = self._vertex_quad_offsets.to(device)
+        args.vertex_quad_indices = self._vertex_quad_indices.to(device)
+        return args
+    @wp.func
+    def cell_position(args: CellArg, s: Sample):
+        """Position of sample `s`: bilinear interpolation of the four vertex positions of quad `s.element_index`"""
+        quad_idx = args.quad_vertex_indices[s.element_index]
+        # w_p / w_m: weights toward the "plus" (coordinate 1) and "minus" (coordinate 0) side along each axis
+        w_p = s.element_coords
+        w_m = Coords(1.0) - s.element_coords
+        # Sign pattern (first axis, second axis) of each local vertex:
+        # 0 : m m
+        # 1 : p m
+        # 2 : p p
+        # 3 : m p
+        return (
+            w_m[0] * w_m[1] * args.positions[quad_idx[0]]
+            + w_p[0] * w_m[1] * args.positions[quad_idx[1]]
+            + w_p[0] * w_p[1] * args.positions[quad_idx[2]]
+            + w_m[0] * w_p[1] * args.positions[quad_idx[3]]
+        )
+    @wp.func
+    def cell_deformation_gradient(cell_arg: CellArg, s: Sample):
+        """Deformation gradient at sample `s`: derivative of `cell_position` with respect to the two element coordinates"""
+        quad_idx = cell_arg.quad_vertex_indices[s.element_index]
+        w_p = s.element_coords
+        w_m = Coords(1.0) - s.element_coords
+        # Each term is (vertex position) x (gradient of that vertex's bilinear weight w.r.t. the two coordinates)
+        return (
+            wp.outer(cell_arg.positions[quad_idx[0]], wp.vec2(-w_m[1], -w_m[0]))
+            + wp.outer(cell_arg.positions[quad_idx[1]], wp.vec2(w_m[1], -w_p[0]))
+            + wp.outer(cell_arg.positions[quad_idx[2]], wp.vec2(w_p[1], w_p[0]))
+            + wp.outer(cell_arg.positions[quad_idx[3]], wp.vec2(-w_p[1], w_m[0]))
+        )
+    @wp.func
+    def cell_inverse_deformation_gradient(cell_arg: CellArg, s: Sample):
+        """Matrix inverse of `cell_deformation_gradient` at sample `s`"""
+        return wp.inverse(Quadmesh2D.cell_deformation_gradient(cell_arg, s))
+    @wp.func
+    def cell_measure(args: CellArg, s: Sample):
+        """Absolute value of the determinant of the deformation gradient at sample `s`"""
+        return wp.abs(wp.determinant(Quadmesh2D.cell_deformation_gradient(args, s)))
+    @wp.func
+    def cell_normal(args: CellArg, s: Sample):
+        """Always returns the zero vector; the arguments are unused.
+
+        NOTE(review): presumably a placeholder because a 2D cell in a 2D space has no normal -- confirm.
+        """
+        return wp.vec2(0.0)
+    @cached_arg_value
+    def side_index_arg_value(self, device) -> SideIndexArg:
+        """Builds the `SideIndexArg` struct for `device` from the boundary edge indices.
+
+        NOTE(review): `cached_arg_value` presumably memoizes the result per device -- confirm in `warp.fem.cache`.
+        """
+        args = self.SideIndexArg()
+        args.boundary_edge_indices = self._boundary_edge_indices.to(device)
+        return args
+    @wp.func
+    def boundary_side_index(args: SideIndexArg, boundary_side_index: int):
+        """Boundary side to side index: looks up the edge index of the `boundary_side_index`-th boundary edge"""
+        return args.boundary_edge_indices[boundary_side_index]
+    @cached_arg_value
+    def side_arg_value(self, device) -> CellArg:
+        args = self.SideArg()
+        args.cell_arg = self.cell_arg_value(device)
+        args.edge_vertex_indices = self._edge_vertex_indices.to(device)
+        args.edge_quad_indices = self._edge_quad_indices.to(device)
+        return args
+    @wp.func
+    def side_position(args: SideArg, s: Sample):
+        """Position of sample `s` on edge `s.element_index`: linear interpolation between the edge's start and end vertices"""
+        edge_idx = args.edge_vertex_indices[s.element_index]
+        return (1.0 - s.element_coords[0]) * args.cell_arg.positions[edge_idx[0]] + s.element_coords[
+            0
+        ] * args.cell_arg.positions[edge_idx[1]]
+    @wp.func
+    def side_deformation_gradient(args: SideArg, s: Sample):
+        """Edge vector (end vertex minus start vertex), i.e. the derivative of `side_position` w.r.t. the side coordinate"""
+        edge_idx = args.edge_vertex_indices[s.element_index]
+        v0 = args.cell_arg.positions[edge_idx[0]]
+        v1 = args.cell_arg.positions[edge_idx[1]]
+        return v1 - v0
+    @wp.func
+    def side_inner_inverse_deformation_gradient(args: SideArg, s: Sample):
+        """Inverse deformation gradient of the inner cell, evaluated at the cell coordinates matching side sample `s`"""
+        cell_index = Quadmesh2D.side_inner_cell_index(args, s.element_index)
+        cell_coords = Quadmesh2D.side_inner_cell_coords(args, s.element_index, s.element_coords)
+        return Quadmesh2D.cell_inverse_deformation_gradient(args.cell_arg, make_free_sample(cell_index, cell_coords))
+    @wp.func
+    def side_outer_inverse_deformation_gradient(args: SideArg, s: Sample):
+        """Inverse deformation gradient of the outer cell, evaluated at the cell coordinates matching side sample `s`"""
+        cell_index = Quadmesh2D.side_outer_cell_index(args, s.element_index)
+        cell_coords = Quadmesh2D.side_outer_cell_coords(args, s.element_index, s.element_coords)
+        return Quadmesh2D.cell_inverse_deformation_gradient(args.cell_arg, make_free_sample(cell_index, cell_coords))
+    @wp.func
+    def side_measure(args: SideArg, s: Sample):
+        """Length of edge `s.element_index` (distance between its two vertices)"""
+        edge_idx = args.edge_vertex_indices[s.element_index]
+        v0 = args.cell_arg.positions[edge_idx[0]]
+        v1 = args.cell_arg.positions[edge_idx[1]]
+        return wp.length(v1 - v0)
+    @wp.func
+    def side_measure_ratio(args: SideArg, s: Sample):
+        """Edge length divided by the smaller of the inner and outer cell measures at the matching cell coordinates"""
+        inner = Quadmesh2D.side_inner_cell_index(args, s.element_index)
+        outer = Quadmesh2D.side_outer_cell_index(args, s.element_index)
+        inner_coords = Quadmesh2D.side_inner_cell_coords(args, s.element_index, s.element_coords)
+        outer_coords = Quadmesh2D.side_outer_cell_coords(args, s.element_index, s.element_coords)
+        return Quadmesh2D.side_measure(args, s) / wp.min(
+            Quadmesh2D.cell_measure(args.cell_arg, make_free_sample(inner, inner_coords)),
+            Quadmesh2D.cell_measure(args.cell_arg, make_free_sample(outer, outer_coords)),
+        )
+    @wp.func
+    def side_normal(args: SideArg, s: Sample):
+        """Unit normal of edge `s.element_index`: the edge vector rotated by 90 degrees counter-clockwise, normalized"""
+        edge_idx = args.edge_vertex_indices[s.element_index]
+        v0 = args.cell_arg.positions[edge_idx[0]]
+        v1 = args.cell_arg.positions[edge_idx[1]]
+        e = v1 - v0
+        return wp.normalize(wp.vec2(-e[1], e[0]))
+    @wp.func
+    def side_inner_cell_index(arg: SideArg, side_index: ElementIndex):
+        """Index of the inner cell of a side: first component of `edge_quad_indices[side_index]`"""
+        return arg.edge_quad_indices[side_index][0]
+    @wp.func
+    def side_outer_cell_index(arg: SideArg, side_index: ElementIndex):
+        """Index of the outer cell of a side: second component of `edge_quad_indices[side_index]`"""
+        return arg.edge_quad_indices[side_index][1]
+    @wp.func
+    def edge_to_quad_coords(args: SideArg, side_index: ElementIndex, quad_index: ElementIndex, side_coords: Coords):
+        """Converts a coordinate along edge `side_index` to coordinates in quad `quad_index`.
+
+        The edge's start vertex is matched against the quad's four vertices, then the end vertex
+        determines in which direction the edge runs along the quad boundary. The code assumes the
+        edge belongs to the quad; if the start vertex matches none of the first three quad
+        vertices it is treated as the fourth.
+        """
+        edge_vidx = args.edge_vertex_indices[side_index]
+        quad_vidx = args.cell_arg.quad_vertex_indices[quad_index]
+        vs = edge_vidx[0]
+        ve = edge_vidx[1]
+        s = side_coords[0]
+        # wp.select(cond, a, b) yields `a` when cond is false and `b` when cond is true,
+        # so the second coordinate triple of each call is the one used when the condition holds
+        if vs == quad_vidx[0]:
+            return wp.select(ve == quad_vidx[1], Coords(0.0, s, 0.0), Coords(s, 0.0, 0.0))
+        elif vs == quad_vidx[1]:
+            return wp.select(ve == quad_vidx[2], Coords(1.0 - s, 0.0, 0.0), Coords(1.0, s, 0.0))
+        elif vs == quad_vidx[2]:
+            return wp.select(ve == quad_vidx[3], Coords(1.0, 1.0 - s, 0.0), Coords(1.0 - s, 1.0, 0.0))
+        return wp.select(ve == quad_vidx[0], Coords(s, 1.0, 0.0), Coords(0.0, 1.0 - s, 0.0))
+    @wp.func
+    def side_inner_cell_coords(args: SideArg, side_index: ElementIndex, side_coords: Coords):
+        """Coordinates of side point `side_coords` expressed in the side's inner cell"""
+        inner_cell_index = Quadmesh2D.side_inner_cell_index(args, side_index)
+        return Quadmesh2D.edge_to_quad_coords(args, side_index, inner_cell_index, side_coords)
+    @wp.func
+    def side_outer_cell_coords(args: SideArg, side_index: ElementIndex, side_coords: Coords):
+        """Coordinates of side point `side_coords` expressed in the side's outer cell"""
+        outer_cell_index = Quadmesh2D.side_outer_cell_index(args, side_index)
+        return Quadmesh2D.edge_to_quad_coords(args, side_index, outer_cell_index, side_coords)
+    @wp.func
+    def side_from_cell_coords(
+        args: SideArg,
+        side_index: ElementIndex,
+        quad_index: ElementIndex,
+        quad_coords: Coords,
+    ):
+        """Converts coordinates in quad `quad_index` to a coordinate along edge `side_index`.
+
+        Returns `Coords(ec, 0.0, 0.0)` when the point lies exactly on the edge (`oc == 0.0`),
+        and `Coords(OUTSIDE)` otherwise. This is the inverse mapping of `edge_to_quad_coords`.
+        """
+        edge_vidx = args.edge_vertex_indices[side_index]
+        quad_vidx = args.cell_arg.quad_vertex_indices[quad_index]
+        vs = edge_vidx[0]
+        ve = edge_vidx[1]
+        cx = quad_coords[0]
+        cy = quad_coords[1]
+        # oc: reference-space offset of the point from the edge (zero when on the edge)
+        # ec: coordinate along the edge, measured from its start vertex
+        # wp.select(cond, a, b) yields `a` when cond is false and `b` when cond is true
+        if vs == quad_vidx[0]:
+            oc = wp.select(ve == quad_vidx[1], cx, cy)
+            ec = wp.select(ve == quad_vidx[1], cy, cx)
+        elif vs == quad_vidx[1]:
+            oc = wp.select(ve == quad_vidx[2], cy, 1.0 - cx)
+            ec = wp.select(ve == quad_vidx[2], 1.0 - cx, cy)
+        elif vs == quad_vidx[2]:
+            oc = wp.select(ve == quad_vidx[3], 1.0 - cx, 1.0 - cy)
+            ec = wp.select(ve == quad_vidx[3], 1.0 - cy, 1.0 - cx)
+        else:
+            oc = wp.select(ve == quad_vidx[0], 1.0 - cy, cx)
+            ec = wp.select(ve == quad_vidx[0], cx, 1.0 - cy)
+        return wp.select(oc == 0.0, Coords(OUTSIDE), Coords(ec, 0.0, 0.0))
+    @wp.func
+    def side_to_cell_arg(side_arg: SideArg):
+        """Returns the cell argument embedded in a side argument"""
+        return side_arg.cell_arg
+    def _build_topology(self, temporary_store: Optional[TemporaryStore]):
+        """Builds the vertex-to-quad adjacency, the unique edge list and the boundary edge list.
+
+        Sets `_vertex_quad_offsets`, `_vertex_quad_indices`, `_edge_vertex_indices`,
+        `_edge_quad_indices` and `_boundary_edge_indices`. Each edge is owned by its
+        lower-indexed vertex; edges are first gathered per owning vertex with duplicates,
+        then deduplicated and compacted into contiguous arrays.
+
+        Args:
+            temporary_store: pool passed to the `borrow_temporary*` helpers for scratch arrays
+        """
+        from warp.fem.utils import compress_node_indices, masked_indices
+        from warp.utils import array_scan
+        device = self.quad_vertex_indices.device
+        # Vertex -> adjacent quads lookup (the last two outputs are not needed here)
+        vertex_quad_offsets, vertex_quad_indices, _, __ = compress_node_indices(
+            self.vertex_count(), self.quad_vertex_indices, temporary_store=temporary_store
+        )
+        self._vertex_quad_offsets = vertex_quad_offsets.detach()
+        self._vertex_quad_indices = vertex_quad_indices.detach()
+        vertex_start_edge_count = borrow_temporary(temporary_store, dtype=int, device=device, shape=self.vertex_count())
+        vertex_start_edge_count.array.zero_()
+        vertex_start_edge_offsets = borrow_temporary_like(vertex_start_edge_count, temporary_store=temporary_store)
+        # Scratch storage sized for the worst case of four distinct edges per quad
+        vertex_edge_ends = borrow_temporary(temporary_store, dtype=int, device=device, shape=(4 * self.cell_count()))
+        vertex_edge_quads = borrow_temporary(
+            temporary_store, dtype=int, device=device, shape=(4 * self.cell_count(), 2)
+        )
+        # Count face edges starting at each vertex
+        wp.launch(
+            kernel=Quadmesh2D._count_starting_edges_kernel,
+            device=device,
+            dim=self.cell_count(),
+            inputs=[self.quad_vertex_indices, vertex_start_edge_count.array],
+        )
+        array_scan(in_array=vertex_start_edge_count.array, out_array=vertex_start_edge_offsets.array, inclusive=False)
+        # Count number of unique edges (deduplicate across faces)
+        # The same buffer is reused: the kernel overwrites each vertex's count with its unique-edge count
+        vertex_unique_edge_count = vertex_start_edge_count
+        wp.launch(
+            kernel=Quadmesh2D._count_unique_starting_edges_kernel,
+            device=device,
+            dim=self.vertex_count(),
+            inputs=[
+                self._vertex_quad_offsets,
+                self._vertex_quad_indices,
+                self.quad_vertex_indices,
+                vertex_start_edge_offsets.array,
+                vertex_unique_edge_count.array,
+                vertex_edge_ends.array,
+                vertex_edge_quads.array,
+            ],
+        )
+        vertex_unique_edge_offsets = borrow_temporary_like(vertex_start_edge_offsets, temporary_store=temporary_store)
+        # `vertex_start_edge_count` now holds the unique counts (aliased above)
+        array_scan(in_array=vertex_start_edge_count.array, out_array=vertex_unique_edge_offsets.array, inclusive=False)
+        # Get back edge count to host
+        if device.is_cuda:
+            edge_count = borrow_temporary(temporary_store, shape=(1,), dtype=int, device="cpu", pinned=True)
+            # Last vertex will not own any edge, so its count will be zero; just fetching last prefix count is ok
+            wp.copy(
+                dest=edge_count.array, src=vertex_unique_edge_offsets.array, src_offset=self.vertex_count() - 1, count=1
+            )
+            # Wait for the copy to finish before reading the value on the host
+            wp.synchronize_stream(wp.get_stream(device))
+            edge_count = int(edge_count.array.numpy()[0])
+        else:
+            edge_count = int(vertex_unique_edge_offsets.array.numpy()[self.vertex_count() - 1])
+        self._edge_vertex_indices = wp.empty(shape=(edge_count,), dtype=wp.vec2i, device=device)
+        self._edge_quad_indices = wp.empty(shape=(edge_count,), dtype=wp.vec2i, device=device)
+        boundary_mask = borrow_temporary(temporary_store=temporary_store, shape=(edge_count,), dtype=int, device=device)
+        # Compress edge data
+        wp.launch(
+            kernel=Quadmesh2D._compress_edges_kernel,
+            device=device,
+            dim=self.vertex_count(),
+            inputs=[
+                vertex_start_edge_offsets.array,
+                vertex_unique_edge_offsets.array,
+                vertex_unique_edge_count.array,
+                vertex_edge_ends.array,
+                vertex_edge_quads.array,
+                self._edge_vertex_indices,
+                self._edge_quad_indices,
+                boundary_mask.array,
+            ],
+        )
+        # Scratch buffers are no longer needed once the compacted edge arrays are written
+        vertex_start_edge_offsets.release()
+        vertex_unique_edge_offsets.release()
+        vertex_unique_edge_count.release()
+        vertex_edge_ends.release()
+        vertex_edge_quads.release()
+        # Flip normals if necessary
+        wp.launch(
+            kernel=Quadmesh2D._flip_edge_normals,
+            device=device,
+            dim=self.side_count(),
+            inputs=[self._edge_vertex_indices, self._edge_quad_indices, self.quad_vertex_indices, self.positions],
+        )
+        # Indices of the edges flagged as boundary (both adjacent quad slots identical)
+        boundary_edge_indices, _ = masked_indices(boundary_mask.array, temporary_store=temporary_store)
+        self._boundary_edge_indices = boundary_edge_indices.detach()
+        boundary_mask.release()
+    @wp.kernel
+    def _count_starting_edges_kernel(
+        quad_vertex_indices: wp.array2d(dtype=int), vertex_start_edge_count: wp.array(dtype=int)
+    ):
+        # One thread per quad: for each of its four edges, atomically increments the counter
+        # of the edge's lower-indexed endpoint. Edges shared by two quads are counted twice.
+        t = wp.tid()
+        for k in range(4):
+            v0 = quad_vertex_indices[t, k]
+            v1 = quad_vertex_indices[t, (k + 1) % 4]
+            if v0 < v1:
+                wp.atomic_add(vertex_start_edge_count, v0, 1)
+            else:
+                wp.atomic_add(vertex_start_edge_count, v1, 1)
+    @wp.func
+    def _find(
+        needle: int,
+        values: wp.array(dtype=int),
+        beg: int,
+        end: int,
+    ):
+        # Linear search: returns the first index i in [beg, end) with values[i] == needle, or -1 if none
+        for i in range(beg, end):
+            if values[i] == needle:
+                return i
+        return -1
+    @wp.kernel
+    def _count_unique_starting_edges_kernel(
+        vertex_quad_offsets: wp.array(dtype=int),
+        vertex_quad_indices: wp.array(dtype=int),
+        quad_vertex_indices: wp.array2d(dtype=int),
+        vertex_start_edge_offsets: wp.array(dtype=int),
+        vertex_start_edge_count: wp.array(dtype=int),
+        edge_ends: wp.array(dtype=int),
+        edge_quads: wp.array2d(dtype=int),
+    ):
+        # One thread per vertex `v`: gathers the distinct edges whose lower-indexed endpoint is `v`.
+        # For each such edge, writes its other endpoint to `edge_ends` and its adjacent quads to
+        # `edge_quads`, starting at slot `vertex_start_edge_offsets[v]`, and finally overwrites
+        # `vertex_start_edge_count[v]` with the number of distinct edges found.
+        v = wp.tid()
+        edge_beg = vertex_start_edge_offsets[v]
+        quad_beg = vertex_quad_offsets[v]
+        quad_end = vertex_quad_offsets[v + 1]
+        edge_cur = edge_beg
+        for quad in range(quad_beg, quad_end):
+            q = vertex_quad_indices[quad]
+            for k in range(4):
+                v0 = quad_vertex_indices[q, k]
+                v1 = quad_vertex_indices[q, (k + 1) % 4]
+                if v == wp.min(v0, v1):
+                    other_v = wp.max(v0, v1)
+                    # Check if other_v has been seen
+                    seen_idx = Quadmesh2D._find(other_v, edge_ends, edge_beg, edge_cur)
+                    if seen_idx == -1:
+                        # New edge: both quad slots start as `q`, which marks a boundary edge
+                        # unless a second quad is recorded later
+                        edge_ends[edge_cur] = other_v
+                        edge_quads[edge_cur, 0] = q
+                        edge_quads[edge_cur, 1] = q
+                        edge_cur += 1
+                    else:
+                        # Edge already recorded from another quad: store this quad in the second slot
+                        edge_quads[seen_idx, 1] = q
+        vertex_start_edge_count[v] = edge_cur - edge_beg
+    @wp.kernel
+    def _compress_edges_kernel(
+        vertex_start_edge_offsets: wp.array(dtype=int),
+        vertex_unique_edge_offsets: wp.array(dtype=int),
+        vertex_unique_edge_count: wp.array(dtype=int),
+        uncompressed_edge_ends: wp.array(dtype=int),
+        uncompressed_edge_quads: wp.array2d(dtype=int),
+        edge_vertex_indices: wp.array(dtype=wp.vec2i),
+        edge_quad_indices: wp.array(dtype=wp.vec2i),
+        boundary_mask: wp.array(dtype=int),
+    ):
+        # One thread per vertex `v`: copies the unique edges owned by `v` from the uncompressed
+        # per-vertex storage into the compact edge arrays, and flags boundary edges in `boundary_mask`.
+        v = wp.tid()
+        start_beg = vertex_start_edge_offsets[v]
+        unique_beg = vertex_unique_edge_offsets[v]
+        unique_count = vertex_unique_edge_count[v]
+        for e in range(unique_count):
+            src_index = start_beg + e
+            edge_index = unique_beg + e
+            edge_vertex_indices[edge_index] = wp.vec2i(v, uncompressed_edge_ends[src_index])
+            q0 = uncompressed_edge_quads[src_index, 0]
+            q1 = uncompressed_edge_quads[src_index, 1]
+            edge_quad_indices[edge_index] = wp.vec2i(q0, q1)
+            # Identical quad slots mean only one quad touches the edge, i.e. a boundary edge
+            if q0 == q1:
+                boundary_mask[edge_index] = 1
+            else:
+                boundary_mask[edge_index] = 0
+    @wp.kernel
+    def _flip_edge_normals(
+        edge_vertex_indices: wp.array(dtype=wp.vec2i),
+        edge_quad_indices: wp.array(dtype=wp.vec2i),
+        quad_vertex_indices: wp.array2d(dtype=int),
+        positions: wp.array(dtype=wp.vec2),
+    ):
+        # One thread per edge: swaps the edge's two vertices when needed so that the edge normal
+        # (edge vector rotated by 90 degrees counter-clockwise) does not point toward the centroid
+        # of the edge's first (inner) quad.
+        e = wp.tid()
+        # `tri` holds a quad index despite its name
+        tri = edge_quad_indices[e][0]
+        quad_vidx = quad_vertex_indices[tri]
+        edge_vidx = edge_vertex_indices[e]
+        quad_centroid = (
+            positions[quad_vidx[0]] + positions[quad_vidx[1]] + positions[quad_vidx[2]] + positions[quad_vidx[3]]
+        ) / 4.0
+        v0 = positions[edge_vidx[0]]
+        v1 = positions[edge_vidx[1]]
+        edge_center = 0.5 * (v1 + v0)
+        edge_vec = v1 - v0
+        edge_normal = wp.vec2(-edge_vec[1], edge_vec[0])
+        # if edge normal points toward first quad centroid, flip indices
+        if wp.dot(quad_centroid - edge_center, edge_normal) > 0.0:
+            edge_vertex_indices[e] = wp.vec2i(edge_vidx[1], edge_vidx[0])