warp-lang 1.7.0-py3-none-manylinux_2_34_aarch64.whl → 1.7.2-py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. warp/autograd.py +12 -2
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +1 -1
  5. warp/builtins.py +103 -66
  6. warp/codegen.py +48 -27
  7. warp/config.py +1 -1
  8. warp/context.py +112 -49
  9. warp/examples/benchmarks/benchmark_cloth.py +1 -1
  10. warp/examples/distributed/example_jacobi_mpi.py +507 -0
  11. warp/fem/cache.py +1 -1
  12. warp/fem/field/field.py +11 -1
  13. warp/fem/field/nodal_field.py +36 -22
  14. warp/fem/geometry/adaptive_nanogrid.py +7 -3
  15. warp/fem/geometry/trimesh.py +4 -12
  16. warp/jax_experimental/custom_call.py +14 -2
  17. warp/jax_experimental/ffi.py +100 -67
  18. warp/native/builtin.h +91 -65
  19. warp/native/svd.h +59 -49
  20. warp/native/tile.h +55 -26
  21. warp/native/volume.cpp +2 -2
  22. warp/native/volume_builder.cu +33 -22
  23. warp/native/warp.cu +1 -1
  24. warp/render/render_opengl.py +41 -34
  25. warp/render/render_usd.py +96 -6
  26. warp/sim/collide.py +11 -9
  27. warp/sim/inertia.py +189 -156
  28. warp/sim/integrator_euler.py +3 -0
  29. warp/sim/integrator_xpbd.py +3 -0
  30. warp/sim/model.py +56 -31
  31. warp/sim/render.py +4 -0
  32. warp/sparse.py +1 -1
  33. warp/stubs.py +73 -25
  34. warp/tests/assets/torus.usda +1 -1
  35. warp/tests/cuda/test_streams.py +1 -1
  36. warp/tests/sim/test_collision.py +237 -206
  37. warp/tests/sim/test_inertia.py +161 -0
  38. warp/tests/sim/test_model.py +5 -3
  39. warp/tests/sim/{flaky_test_sim_grad.py → test_sim_grad.py} +1 -4
  40. warp/tests/sim/test_xpbd.py +399 -0
  41. warp/tests/test_array.py +8 -7
  42. warp/tests/test_atomic.py +181 -2
  43. warp/tests/test_builtins_resolution.py +38 -38
  44. warp/tests/test_codegen.py +24 -3
  45. warp/tests/test_examples.py +16 -6
  46. warp/tests/test_fem.py +93 -14
  47. warp/tests/test_func.py +1 -1
  48. warp/tests/test_mat.py +416 -119
  49. warp/tests/test_quat.py +321 -137
  50. warp/tests/test_struct.py +116 -0
  51. warp/tests/test_vec.py +320 -174
  52. warp/tests/tile/test_tile.py +27 -0
  53. warp/tests/tile/test_tile_load.py +124 -0
  54. warp/tests/unittest_suites.py +2 -5
  55. warp/types.py +107 -9
  56. {warp_lang-1.7.0.dist-info → warp_lang-1.7.2.dist-info}/METADATA +41 -19
  57. {warp_lang-1.7.0.dist-info → warp_lang-1.7.2.dist-info}/RECORD +60 -57
  58. {warp_lang-1.7.0.dist-info → warp_lang-1.7.2.dist-info}/WHEEL +1 -1
  59. {warp_lang-1.7.0.dist-info → warp_lang-1.7.2.dist-info}/licenses/LICENSE.md +0 -26
  60. {warp_lang-1.7.0.dist-info → warp_lang-1.7.2.dist-info}/top_level.txt +0 -0
warp/fem/geometry/trimesh.py CHANGED
@@ -190,7 +190,7 @@ class Trimesh(Geometry):
         return args
 
     def _bvh_id(self, device):
-        if self._tri_bvh is None or self._tri_bvh.device != device:
+        if self._tri_bvh is None or self._tri_bvh.device != wp.get_device(device):
             return _NULL_BVH
         return self._tri_bvh.id
 
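Note: the fix above matters because callers may identify a device by string, ordinal, or wp.context.Device object, and comparing a cached BVH's Device against the raw argument can spuriously fail. A minimal sketch of the normalization, assuming a CPU-only setup (wp.get_device() is the public Warp API used in the fix):

import warp as wp

dev = wp.get_device("cpu")   # string spelling -> canonical Device object
same = wp.get_device(dev)    # passing a Device through is a no-op
assert dev == same

arr = wp.zeros(4, dtype=wp.vec3, device="cpu")
# after normalization, the cache check compares Device to Device
assert arr.device == wp.get_device("cpu")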
@@ -519,7 +519,7 @@ class Trimesh(Geometry):
     @wp.kernel
     def _compute_tri_bounds(
         tri_vertex_indices: wp.array2d(dtype=int),
-        positions: wp.array(dtype=wp.vec2),
+        positions: wp.array(dtype=Any),
         lowers: wp.array(dtype=wp.vec3),
         uppers: wp.array(dtype=wp.vec3),
     ):
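Note: switching positions to dtype=Any makes _compute_tri_bounds a generic kernel, so one body serves both vec2 and vec3 meshes, with the concrete type resolved when the kernel is instantiated at launch. A standalone sketch of the pattern (first_component is a hypothetical kernel, not from the package):

from typing import Any

import warp as wp

@wp.kernel
def first_component(positions: wp.array(dtype=Any), out: wp.array(dtype=float)):
    i = wp.tid()
    out[i] = positions[i][0]

out = wp.zeros(1, dtype=float)
pts2 = wp.array([[1.0, 2.0]], dtype=wp.vec2)
pts3 = wp.array([[3.0, 4.0, 5.0]], dtype=wp.vec3)
wp.launch(first_component, dim=1, inputs=[pts2, out])  # instantiated for vec2
wp.launch(first_component, dim=1, inputs=[pts3, out])  # re-instantiated for vec3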
@@ -528,16 +528,8 @@ class Trimesh(Geometry):
         p1 = _bvh_vec(positions[tri_vertex_indices[t, 1]])
         p2 = _bvh_vec(positions[tri_vertex_indices[t, 2]])
 
-        lowers[t] = wp.vec3(
-            wp.min(wp.min(p0[0], p1[0]), p2[0]),
-            wp.min(wp.min(p0[1], p1[1]), p2[1]),
-            wp.min(wp.min(p0[2], p1[2]), p2[2]),
-        )
-        uppers[t] = wp.vec3(
-            wp.max(wp.max(p0[0], p1[0]), p2[0]),
-            wp.max(wp.max(p0[1], p1[1]), p2[1]),
-            wp.max(wp.max(p0[2], p1[2]), p2[2]),
-        )
+        lowers[t] = wp.min(wp.min(p0, p1), p2)
+        uppers[t] = wp.max(wp.max(p0, p1), p2)
 
 
     @wp.struct
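Note: the replacement relies on wp.min()/wp.max() accepting vectors and operating component-wise, collapsing nine scalar calls into two. A quick check (minmax_demo is a hypothetical kernel):

import warp as wp

@wp.kernel
def minmax_demo(lo: wp.array(dtype=wp.vec3), hi: wp.array(dtype=wp.vec3)):
    p0 = wp.vec3(0.0, 2.0, -1.0)
    p1 = wp.vec3(1.0, -3.0, 5.0)
    p2 = wp.vec3(-2.0, 4.0, 0.0)
    lo[0] = wp.min(wp.min(p0, p1), p2)  # component-wise: (-2, -3, -1)
    hi[0] = wp.max(wp.max(p0, p1), p2)  # component-wise: (1, 4, 5)

lo = wp.zeros(1, dtype=wp.vec3)
hi = wp.zeros(1, dtype=wp.vec3)
wp.launch(minmax_demo, dim=1, inputs=[lo, hi])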
warp/jax_experimental/custom_call.py CHANGED
@@ -126,7 +126,14 @@ def _create_jax_warp_primitive():
 
     # Create and register the primitive.
    # TODO add default implementation that calls the kernel via warp.
-    _jax_warp_p = jax.core.Primitive("jax_warp")
+    try:
+        # newer JAX versions
+        import jax.extend
+
+        _jax_warp_p = jax.extend.core.Primitive("jax_warp")
+    except (ImportError, AttributeError):
+        # older JAX versions
+        _jax_warp_p = jax.core.Primitive("jax_warp")
     _jax_warp_p.multiple_results = True
 
     # TODO Just launch the kernel directly, but make sure the argument
@@ -262,7 +269,12 @@ def _create_jax_warp_primitive():
     capsule = PyCapsule_New(ccall_address.value, b"xla._CUSTOM_CALL_TARGET", PyCapsule_Destructor(0))
 
     # Register the callback in XLA.
-    jax.lib.xla_client.register_custom_call_target("warp_call", capsule, platform="gpu")
+    try:
+        # newer JAX versions
+        jax.ffi.register_ffi_target("warp_call", capsule, platform="gpu", api_version=0)
+    except AttributeError:
+        # older JAX versions
+        jax.lib.xla_client.register_custom_call_target("warp_call", capsule, platform="gpu")
 
     def default_layout(shape):
         return range(len(shape) - 1, -1, -1)
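Note: both hunks follow the same compatibility idiom, probing the newer public JAX surface first and falling back to the legacy path on older releases. A distilled sketch, assuming only that JAX is installed:

import jax

try:
    # newer JAX exposes Primitive via jax.extend
    from jax.extend.core import Primitive
except (ImportError, AttributeError):
    # older releases keep it in jax.core
    from jax.core import Primitive

p = Primitive("demo")
p.multiple_results = True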
warp/jax_experimental/ffi.py CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import ctypes
+import threading
 import traceback
 from typing import Callable
 
@@ -27,68 +28,6 @@ from warp.types import array_t, launch_bounds_t, strides_from_shape, type_to_warp
 from .xla_ffi import *
 
 
-def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=None, output_dims=None):
-    """Create a JAX callback from a Warp kernel.
-
-    NOTE: This is an experimental feature under development.
-
-    Args:
-        kernel: The Warp kernel to launch.
-        num_outputs: Optional. Specify the number of output arguments if greater than 1.
-        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
-            This argument can also be specified for individual calls.
-        launch_dims: Optional. Specify the default kernel launch dimensions. If None, launch
-            dimensions are inferred from the shape of the first array argument.
-            This argument can also be specified for individual calls.
-        output_dims: Optional. Specify the default dimensions of output arrays. If None, output
-            dimensions are inferred from the launch dimensions.
-            This argument can also be specified for individual calls.
-
-    Limitations:
-        - All kernel arguments must be contiguous arrays or scalars.
-        - Scalars must be static arguments in JAX.
-        - Input arguments are followed by output arguments in the Warp kernel definition.
-        - There must be at least one output argument.
-        - Only the CUDA backend is supported.
-    """
-
-    return FfiKernel(kernel, num_outputs, vmap_method, launch_dims, output_dims)
-
-
-def jax_callable(
-    func: Callable,
-    num_outputs: int = 1,
-    graph_compatible: bool = True,
-    vmap_method: str = "broadcast_all",
-    output_dims=None,
-):
-    """Create a JAX callback from an annotated Python function.
-
-    The Python function arguments must have type annotations like Warp kernels.
-
-    NOTE: This is an experimental feature under development.
-
-    Args:
-        func: The Python function to call.
-        num_outputs: Optional. Specify the number of output arguments if greater than 1.
-        graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
-        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
-            This argument can also be specified for individual calls.
-        output_dims: Optional. Specify the default dimensions of output arrays.
-            If ``None``, output dimensions are inferred from the launch dimensions.
-            This argument can also be specified for individual calls.
-
-    Limitations:
-        - All kernel arguments must be contiguous arrays or scalars.
-        - Scalars must be static arguments in JAX.
-        - Input arguments are followed by output arguments in the Warp kernel definition.
-        - There must be at least one output argument.
-        - Only the CUDA backend is supported.
-    """
-
-    return FfiCallable(func, num_outputs, graph_compatible, vmap_method, output_dims)
-
-
 class FfiArg:
     def __init__(self, name, type):
         self.name = name
@@ -560,7 +499,11 @@ class FfiCallable:
 
             # call the Python function with reconstructed arguments
             with wp.ScopedStream(stream, sync_enter=False):
-                self.func(*arg_list)
+                if stream.is_capturing:
+                    with wp.ScopedCapture(stream=stream, external=True):
+                        self.func(*arg_list)
+                else:
+                    self.func(*arg_list)
 
         except Exception as e:
             print(traceback.format_exc())
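Note: the new branch lets the callback participate in a CUDA graph capture started by XLA. wp.ScopedCapture(external=True) tells Warp the capture is owned externally, so Warp records into it rather than beginning and ending its own graph. A guarded sketch, assuming a CUDA device is present (work() is a placeholder):

import warp as wp

def work():
    pass  # launches here are recorded into whichever stream is current

wp.init()
if wp.get_cuda_device_count() > 0:
    stream = wp.Stream("cuda:0")
    with wp.ScopedStream(stream, sync_enter=False):
        if stream.is_capturing:
            # somebody else (e.g. XLA) started the capture; join it
            with wp.ScopedCapture(stream=stream, external=True):
                work()
        else:
            work()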
@@ -571,6 +514,98 @@ class FfiCallable:
         return None
 
 
+# Holders for the custom callbacks to keep them alive.
+_FFI_CALLABLE_REGISTRY: dict[str, FfiCallable] = {}
+_FFI_KERNEL_REGISTRY: dict[str, FfiKernel] = {}
+_FFI_REGISTRY_LOCK = threading.Lock()
+
+
+def jax_kernel(kernel, num_outputs=1, vmap_method="broadcast_all", launch_dims=None, output_dims=None):
+    """Create a JAX callback from a Warp kernel.
+
+    NOTE: This is an experimental feature under development.
+
+    Args:
+        kernel: The Warp kernel to launch.
+        num_outputs: Optional. Specify the number of output arguments if greater than 1.
+        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
+            This argument can also be specified for individual calls.
+        launch_dims: Optional. Specify the default kernel launch dimensions. If None, launch
+            dimensions are inferred from the shape of the first array argument.
+            This argument can also be specified for individual calls.
+        output_dims: Optional. Specify the default dimensions of output arrays. If None, output
+            dimensions are inferred from the launch dimensions.
+            This argument can also be specified for individual calls.
+
+    Limitations:
+        - All kernel arguments must be contiguous arrays or scalars.
+        - Scalars must be static arguments in JAX.
+        - Input arguments are followed by output arguments in the Warp kernel definition.
+        - There must be at least one output argument.
+        - Only the CUDA backend is supported.
+    """
+    key = (
+        kernel.func,
+        num_outputs,
+        vmap_method,
+        tuple(launch_dims) if launch_dims else launch_dims,
+        tuple(sorted(output_dims.items())) if output_dims else output_dims,
+    )
+
+    with _FFI_REGISTRY_LOCK:
+        if key not in _FFI_KERNEL_REGISTRY:
+            new_kernel = FfiKernel(kernel, num_outputs, vmap_method, launch_dims, output_dims)
+            _FFI_KERNEL_REGISTRY[key] = new_kernel
+
+    return _FFI_KERNEL_REGISTRY[key]
+
+
+def jax_callable(
+    func: Callable,
+    num_outputs: int = 1,
+    graph_compatible: bool = True,
+    vmap_method: str = "broadcast_all",
+    output_dims=None,
+):
+    """Create a JAX callback from an annotated Python function.
+
+    The Python function arguments must have type annotations like Warp kernels.
+
+    NOTE: This is an experimental feature under development.
+
+    Args:
+        func: The Python function to call.
+        num_outputs: Optional. Specify the number of output arguments if greater than 1.
+        graph_compatible: Optional. Whether the function can be called during CUDA graph capture.
+        vmap_method: Optional. String specifying how the callback transforms under ``vmap()``.
+            This argument can also be specified for individual calls.
+        output_dims: Optional. Specify the default dimensions of output arrays.
+            If ``None``, output dimensions are inferred from the launch dimensions.
+            This argument can also be specified for individual calls.
+
+    Limitations:
+        - All kernel arguments must be contiguous arrays or scalars.
+        - Scalars must be static arguments in JAX.
+        - Input arguments are followed by output arguments in the Warp kernel definition.
+        - There must be at least one output argument.
+        - Only the CUDA backend is supported.
+    """
+    key = (
+        func,
+        num_outputs,
+        graph_compatible,
+        vmap_method,
+        tuple(sorted(output_dims.items())) if output_dims else output_dims,
+    )
+
+    with _FFI_REGISTRY_LOCK:
+        if key not in _FFI_CALLABLE_REGISTRY:
+            new_callable = FfiCallable(func, num_outputs, graph_compatible, vmap_method, output_dims)
+            _FFI_CALLABLE_REGISTRY[key] = new_callable
+
+    return _FFI_CALLABLE_REGISTRY[key]
+
+
 ###############################################################################
 #
 # Generic FFI callbacks for Python functions of the form
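Note: one observable effect of the new registries is that wrapping the same kernel twice with identical options now returns the same cached object instead of creating and registering a fresh FFI target each time. A hedged usage sketch, assuming a CUDA-enabled JAX + Warp install (scale is a hypothetical kernel):

import warp as wp
from warp.jax_experimental.ffi import jax_kernel

@wp.kernel
def scale(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    i = wp.tid()
    y[i] = 2.0 * x[i]

f1 = jax_kernel(scale)
f2 = jax_kernel(scale)
assert f1 is f2  # identical key -> cached FfiKernel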
@@ -578,9 +613,6 @@ class FfiCallable:
 #
 ###############################################################################
 
-# Holder for the custom callbacks to keep them alive.
-ffi_callbacks = {}
-
 
 def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = True) -> None:
     """Create a JAX callback from a Python function.
@@ -640,7 +672,8 @@ def register_ffi_callback(name: str, func: Callable, graph_compatible: bool = True) -> None:
 
     FFI_CCALLFUNC = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.POINTER(XLA_FFI_CallFrame))
     callback_func = FFI_CCALLFUNC(ffi_callback)
-    ffi_callbacks[name] = callback_func
+    with _FFI_REGISTRY_LOCK:
+        _FFI_CALLABLE_REGISTRY[name] = callback_func
     ffi_ccall_address = ctypes.cast(callback_func, ctypes.c_void_p)
     ffi_capsule = jax.ffi.pycapsule(ffi_ccall_address.value)
     jax.ffi.register_ffi_target(name, ffi_capsule, platform="CUDA")
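Note: the lock-guarded registry used in these hunks is a standard create-once idiom. A generic reduction of it, using only the standard library (names here are illustrative, not the package's):

import threading

_registry = {}
_lock = threading.Lock()

def get_or_create(key, factory):
    # holding the lock across the check-and-insert prevents two threads
    # from both entering the factory for the same key
    with _lock:
        if key not in _registry:
            _registry[key] = factory()
        return _registry[key]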
warp/native/builtin.h CHANGED
@@ -1271,6 +1271,29 @@ inline CUDA_CALLABLE T atomic_add(T* buf, T value)
 #endif
 }
 
+// emulate atomic int64 min with atomicCAS()
+template <>
+inline CUDA_CALLABLE int64 atomic_min(int64* address, int64 val)
+{
+#if defined(__CUDA_ARCH__)
+    unsigned long long int *address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+
+    while (val < (int64)old)
+    {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, (unsigned long long int)val);
+    }
+
+    return (int64)old;
+
+#else
+    int64 old = *address;
+    *address = min(old, val);
+    return old;
+#endif
+}
+
 template<>
 inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value)
 {
@@ -1306,53 +1329,6 @@ inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value)
 #undef __PTR
 
 #endif // CUDA compiled by NVRTC
-
-}
-
-// emulate atomic float max with atomicCAS()
-inline CUDA_CALLABLE float atomic_max(float* address, float val)
-{
-#if defined(__CUDA_ARCH__)
-    int *address_as_int = (int*)address;
-    int old = *address_as_int, assumed;
-
-    while (val > __int_as_float(old))
-    {
-        assumed = old;
-        old = atomicCAS(address_as_int, assumed,
-            __float_as_int(val));
-    }
-
-    return __int_as_float(old);
-
-#else
-    float old = *address;
-    *address = max(old, val);
-    return old;
-#endif
-}
-
-// emulate atomic float min with atomicCAS()
-inline CUDA_CALLABLE float atomic_min(float* address, float val)
-{
-#if defined(__CUDA_ARCH__)
-    int *address_as_int = (int*)address;
-    int old = *address_as_int, assumed;
-
-    while (val < __int_as_float(old))
-    {
-        assumed = old;
-        old = atomicCAS(address_as_int, assumed,
-            __float_as_int(val));
-    }
-
-    return __int_as_float(old);
-
-#else
-    float old = *address;
-    *address = min(old, val);
-    return old;
-#endif
 }
 
 template<>
@@ -1388,33 +1364,47 @@ inline CUDA_CALLABLE float64 atomic_add(float64* buf, float64 value)
 #undef __PTR
 
 #endif // CUDA compiled by NVRTC
+}
+
+template <typename T>
+inline CUDA_CALLABLE T atomic_min(T* address, T val)
+{
+#if defined(__CUDA_ARCH__)
+    return atomicMin(address, val);
 
+#else
+    T old = *address;
+    *address = min(old, val);
+    return old;
+#endif
 }
 
-// emulate atomic double max with atomicCAS()
-inline CUDA_CALLABLE double atomic_max(double* address, double val)
+// emulate atomic float min with atomicCAS()
+template <>
+inline CUDA_CALLABLE float atomic_min(float* address, float val)
 {
 #if defined(__CUDA_ARCH__)
-    unsigned long long int *address_as_ull = (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-
-    while (val > __longlong_as_double(old))
+    int *address_as_int = (int*)address;
+    int old = *address_as_int, assumed;
+
+    while (val < __int_as_float(old))
     {
         assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-            __double_as_longlong(val));
+        old = atomicCAS(address_as_int, assumed,
+            __float_as_int(val));
     }
 
-    return __longlong_as_double(old);
+    return __int_as_float(old);
 
 #else
-    double old = *address;
-    *address = max(old, val);
+    float old = *address;
+    *address = min(old, val);
     return old;
 #endif
 }
 
 // emulate atomic double min with atomicCAS()
+template <>
 inline CUDA_CALLABLE double atomic_min(double* address, double val)
 {
 #if defined(__CUDA_ARCH__)
@@ -1437,27 +1427,63 @@ inline CUDA_CALLABLE double atomic_min(double* address, double val)
 #endif
 }
 
-inline CUDA_CALLABLE int atomic_max(int* address, int val)
+template <typename T>
+inline CUDA_CALLABLE T atomic_max(T* address, T val)
 {
 #if defined(__CUDA_ARCH__)
     return atomicMax(address, val);
 
 #else
-    int old = *address;
+    T old = *address;
     *address = max(old, val);
     return old;
 #endif
 }
 
-// atomic int min
-inline CUDA_CALLABLE int atomic_min(int* address, int val)
+// emulate atomic float max with atomicCAS()
+template<>
+inline CUDA_CALLABLE float atomic_max(float* address, float val)
 {
 #if defined(__CUDA_ARCH__)
-    return atomicMin(address, val);
+    int *address_as_int = (int*)address;
+    int old = *address_as_int, assumed;
+
+    while (val > __int_as_float(old))
+    {
+        assumed = old;
+        old = atomicCAS(address_as_int, assumed,
+            __float_as_int(val));
+    }
+
+    return __int_as_float(old);
 
 #else
-    int old = *address;
-    *address = min(old, val);
+    float old = *address;
+    *address = max(old, val);
+    return old;
+#endif
+}
+
+// emulate atomic double max with atomicCAS()
+template<>
+inline CUDA_CALLABLE double atomic_max(double* address, double val)
+{
+#if defined(__CUDA_ARCH__)
+    unsigned long long int *address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+
+    while (val > __longlong_as_double(old))
+    {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+            __double_as_longlong(val));
+    }
+
+    return __longlong_as_double(old);
+
+#else
+    double old = *address;
+    *address = max(old, val);
     return old;
 #endif
 }
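Note: the consolidation above replaces ad-hoc per-type overloads with generic atomic_min()/atomic_max() templates plus atomicCAS-emulated specializations for float, double, and int64, while integer types hit the native atomicMin/atomicMax on CUDA. From Python these are reached through wp.atomic_min()/wp.atomic_max(); a small reduction sketch (minmax_reduce is a hypothetical kernel):

import warp as wp

@wp.kernel
def minmax_reduce(x: wp.array(dtype=float), lo: wp.array(dtype=float), hi: wp.array(dtype=float)):
    i = wp.tid()
    wp.atomic_min(lo, 0, x[i])  # float path -> atomicCAS emulation on CUDA
    wp.atomic_max(hi, 0, x[i])

x = wp.array([3.0, -1.5, 7.25, 0.5], dtype=float)
lo = wp.full(1, 1.0e30, dtype=float)
hi = wp.full(1, -1.0e30, dtype=float)
wp.launch(minmax_reduce, dim=x.shape[0], inputs=[x, lo, hi])
print(lo.numpy()[0], hi.numpy()[0])  # -1.5 7.25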
warp/native/svd.h CHANGED
@@ -60,17 +60,17 @@ struct _svd_config<double> {
     static constexpr int JACOBI_ITERATIONS = 8;
 };
 
-
-
-// TODO: replace sqrt with rsqrt
-
-template<typename Type>
-inline CUDA_CALLABLE
-Type accurateSqrt(Type x)
+template <typename Type> inline CUDA_CALLABLE Type recipSqrt(Type x)
 {
-    return x / sqrt(x);
+#if defined(__CUDA_ARCH__)
+    return ::rsqrt(x);
+#else
+    return Type(1) / sqrt(x);
+#endif
 }
 
+template <> inline CUDA_CALLABLE wp::half recipSqrt(wp::half x) { return wp::half(1) / sqrt(x); }
+
 template<typename Type>
 inline CUDA_CALLABLE
 void condSwap(bool c, Type &X, Type &Y)
@@ -175,7 +175,7 @@ void approximateGivensQuaternion(Type a11, Type a12, Type a22, Type &ch, Type &sh)
     ch = Type(2)*(a11-a22);
     sh = a12;
     bool b = Type(_gamma)*sh*sh < ch*ch;
-    Type w = Type(1) / sqrt(ch*ch+sh*sh);
+    Type w = recipSqrt(ch*ch+sh*sh);
     ch=b?w*ch:Type(_cstar);
     sh=b?w*sh:Type(_sstar);
 }
@@ -304,13 +304,13 @@ void QRGivensQuaternion(Type a1, Type a2, Type &ch, Type &sh)
     // a1 = pivot point on diagonal
     // a2 = lower triangular entry we want to annihilate
     const Type epsilon = _svd_config<Type>::QR_GIVENS_EPSILON;
-    Type rho = accurateSqrt(a1*a1 + a2*a2);
+    Type rho = sqrt(a1*a1 + a2*a2);
 
     sh = rho > epsilon ? a2 : Type(0);
     ch = abs(a1) + max(rho,epsilon);
     bool b = a1 < Type(0);
     condSwap(b,sh,ch);
-    Type w = Type(1) / sqrt(ch*ch+sh*sh);
+    Type w = recipSqrt(ch*ch+sh*sh);
     ch *= w;
     sh *= w;
 }
@@ -432,21 +432,15 @@ void _svd(// input A
     );
 }
 
-
-template<typename Type>
-inline CUDA_CALLABLE
-void _svd_2(// input A
-            Type a11, Type a12,
-            Type a21, Type a22,
-            // output U
-            Type &u11, Type &u12,
-            Type &u21, Type &u22,
-            // output S
-            Type &s11, Type &s12,
-            Type &s21, Type &s22,
-            // output V
-            Type &v11, Type &v12,
-            Type &v21, Type &v22)
+template <typename Type>
+inline CUDA_CALLABLE void _svd_2(  // input A
+    Type a11, Type a12, Type a21, Type a22,
+    // output U
+    Type& u11, Type& u12, Type& u21, Type& u22,
+    // output S
+    Type& s1, Type& s2,
+    // output V
+    Type& v11, Type& v12, Type& v21, Type& v22)
 {
     // Step 1: Compute ATA
     Type ATA11 = a11 * a11 + a21 * a21;
  // Step 1: Compute ATA
452
446
  Type ATA11 = a11 * a11 + a21 * a21;
@@ -455,39 +449,56 @@ void _svd_2(// input A
455
449
 
456
450
  // Step 2: Eigenanalysis
457
451
  Type trace = ATA11 + ATA22;
458
- Type det = ATA11 * ATA22 - ATA12 * ATA12;
459
- Type sqrt_term = sqrt(trace * trace - Type(4.0) * det);
460
- Type lambda1 = (trace + sqrt_term) * Type(0.5);
461
- Type lambda2 = (trace - sqrt_term) * Type(0.5);
452
+ Type diff = ATA11 - ATA22;
453
+ Type discriminant = diff * diff + Type(4) * ATA12 * ATA12;
462
454
 
463
455
  // Step 3: Singular values
464
- Type sigma1 = sqrt(lambda1);
456
+ if (discriminant == Type(0))
457
+ {
458
+ // Duplicate eigenvalue, A ~ s Id
459
+ s1 = s2 = sqrt(Type(0.5) * trace);
460
+ u11 = v11 = Type(1);
461
+ u12 = v12 = Type(0);
462
+ u21 = v21 = Type(0);
463
+ u22 = v22 = Type(1);
464
+ return;
465
+ }
466
+
467
+ // General case
468
+ Type sqrt_term = sqrt(discriminant);
469
+ Type lambda1 = (trace + sqrt_term) * Type(0.5);
470
+ Type lambda2 = (trace - sqrt_term) * Type(0.5);
471
+ Type inv_sigma1 = recipSqrt(lambda1);
472
+ Type sigma1 = Type(1) / inv_sigma1;
465
473
  Type sigma2 = sqrt(lambda2);
466
474
 
467
475
  // Step 4: Eigenvectors (find V)
468
- Type v1x = ATA12, v1y = lambda1 - ATA11; // For first eigenvector
469
- Type v2x = ATA12, v2y = lambda2 - ATA11; // For second eigenvector
470
- Type norm1 = sqrt(v1x * v1x + v1y * v1y);
471
- Type norm2 = sqrt(v2x * v2x + v2y * v2y);
472
-
473
- v11 = v1x / norm1; v12 = v2x / norm2;
474
- v21 = v1y / norm1; v22 = v2y / norm2;
476
+ Type v1y = diff - sqrt_term + Type(2) * ATA12, v1x = diff + sqrt_term - Type(2) * ATA12;
477
+ Type len1_sq = v1x * v1x + v1y * v1y;
478
+ if (len1_sq == Type(0)) {
479
+ v11 = Type(0.707106781186547524401); // M_SQRT1_2
480
+ v21 = v11;
481
+ } else {
482
+ Type inv_len1 = recipSqrt(len1_sq);
483
+ v11 = v1x * inv_len1;
484
+ v21 = v1y * inv_len1;
485
+ }
486
+ v12 = -v21;
487
+ v22 = v11;
475
488
 
476
489
  // Step 5: Compute U
477
- Type inv_sigma1 = (sigma1 > Type(1e-6)) ? Type(1.0) / sigma1 : Type(0.0);
478
- Type inv_sigma2 = (sigma2 > Type(1e-6)) ? Type(1.0) / sigma2 : Type(0.0);
479
-
480
490
  u11 = (a11 * v11 + a12 * v21) * inv_sigma1;
481
- u12 = (a11 * v12 + a12 * v22) * inv_sigma2;
482
491
  u21 = (a21 * v11 + a22 * v21) * inv_sigma1;
483
- u22 = (a21 * v12 + a22 * v22) * inv_sigma2;
492
+ // sigma2 may be zero, but we can complete U orthogonally up to determinant's sign
493
+ Type det_sign = wp::sign(a11 * a22 - a12 * a21);
494
+ u12 = -u21 * det_sign;
495
+ u22 = u11 * det_sign;
484
496
 
485
497
  // Step 6: Set S
486
- s11 = sigma1; s12 = Type(0.0);
487
- s21 = Type(0.0); s22 = sigma2;
498
+ s1 = sigma1;
499
+ s2 = sigma2;
488
500
  }
489
501
 
490
-
491
502
  template<typename Type>
492
503
  inline CUDA_CALLABLE void svd3(const mat_t<3,3,Type>& A, mat_t<3,3,Type>& U, vec_t<3,Type>& sigma, mat_t<3,3,Type>& V) {
493
504
  Type s12, s13, s21, s23, s31, s32;
@@ -550,15 +561,14 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
 
 template<typename Type>
 inline CUDA_CALLABLE void svd2(const mat_t<2,2,Type>& A, mat_t<2,2,Type>& U, vec_t<2,Type>& sigma, mat_t<2,2,Type>& V) {
-    Type s12, s21;
     _svd_2(A.data[0][0], A.data[0][1],
            A.data[1][0], A.data[1][1],
 
            U.data[0][0], U.data[0][1],
            U.data[1][0], U.data[1][1],
 
-           sigma[0], s12,
-           s21, sigma[1],
+           sigma[0],
+           sigma[1],
 
            V.data[0][0], V.data[0][1],
            V.data[1][0], V.data[1][1]);
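Note: the reworked _svd_2 computes the 2x2 SVD in closed form. For ATA = A^T A with trace t and diff d = ATA11 - ATA22, the eigenvalues are (t ± sqrt(d^2 + 4*ATA12^2)) / 2 and the singular values are their square roots; V is built from one eigenvector plus its perpendicular, and U is completed orthogonally using the sign of det(A), which is why only the two diagonal S entries remain as outputs. A numpy cross-check of the singular-value formula, independent of Warp:

import numpy as np

A = np.array([[2.0, 1.0], [0.5, 3.0]])
ATA = A.T @ A
t = ATA[0, 0] + ATA[1, 1]
d = ATA[0, 0] - ATA[1, 1]
disc = d * d + 4.0 * ATA[0, 1] ** 2
s1 = np.sqrt(0.5 * (t + np.sqrt(disc)))
s2 = np.sqrt(0.5 * (t - np.sqrt(disc)))
print(s1, s2)                              # closed form
print(np.linalg.svd(A, compute_uv=False))  # matches, descending order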