PyPI - dhb-xr - Versions diffs - 0.2.1__py3-none-any.whl - Mend

dhb-xr 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)

dhb_xr/__init__.py +61 -0
dhb_xr/cli.py +206 -0
dhb_xr/core/__init__.py +28 -0
dhb_xr/core/geometry.py +167 -0
dhb_xr/core/geometry_torch.py +77 -0
dhb_xr/core/types.py +113 -0
dhb_xr/database/__init__.py +10 -0
dhb_xr/database/motion_db.py +79 -0
dhb_xr/database/retrieval.py +6 -0
dhb_xr/database/similarity.py +71 -0
dhb_xr/decoder/__init__.py +13 -0
dhb_xr/decoder/decoder_torch.py +52 -0
dhb_xr/decoder/dhb_dr.py +261 -0
dhb_xr/decoder/dhb_qr.py +89 -0
dhb_xr/encoder/__init__.py +27 -0
dhb_xr/encoder/dhb_dr.py +418 -0
dhb_xr/encoder/dhb_qr.py +129 -0
dhb_xr/encoder/dhb_ti.py +204 -0
dhb_xr/encoder/encoder_torch.py +54 -0
dhb_xr/encoder/padding.py +82 -0
dhb_xr/generative/__init__.py +78 -0
dhb_xr/generative/flow_matching.py +705 -0
dhb_xr/generative/latent_encoder.py +536 -0
dhb_xr/generative/sampling.py +203 -0
dhb_xr/generative/training.py +475 -0
dhb_xr/generative/vfm_tokenizer.py +485 -0
dhb_xr/integration/__init__.py +13 -0
dhb_xr/integration/vla/__init__.py +11 -0
dhb_xr/integration/vla/libero.py +132 -0
dhb_xr/integration/vla/pipeline.py +85 -0
dhb_xr/integration/vla/robocasa.py +85 -0
dhb_xr/losses/__init__.py +16 -0
dhb_xr/losses/geodesic_loss.py +91 -0
dhb_xr/losses/hybrid_loss.py +36 -0
dhb_xr/losses/invariant_loss.py +73 -0
dhb_xr/optimization/__init__.py +72 -0
dhb_xr/optimization/casadi_solver.py +342 -0
dhb_xr/optimization/constraints.py +32 -0
dhb_xr/optimization/cusadi_solver.py +311 -0
dhb_xr/optimization/export_casadi_decode.py +111 -0
dhb_xr/optimization/fatrop_solver.py +477 -0
dhb_xr/optimization/torch_solver.py +85 -0
dhb_xr/preprocessing/__init__.py +42 -0
dhb_xr/preprocessing/diagnostics.py +330 -0
dhb_xr/preprocessing/trajectory_cleaner.py +485 -0
dhb_xr/tokenization/__init__.py +56 -0
dhb_xr/tokenization/causal_encoder.py +54 -0
dhb_xr/tokenization/compression.py +749 -0
dhb_xr/tokenization/hierarchical.py +359 -0
dhb_xr/tokenization/rvq.py +178 -0
dhb_xr/tokenization/vqvae.py +155 -0
dhb_xr/utils/__init__.py +24 -0
dhb_xr/utils/io.py +59 -0
dhb_xr/utils/resampling.py +66 -0
dhb_xr/utils/xdof_loader.py +89 -0
dhb_xr/visualization/__init__.py +5 -0
dhb_xr/visualization/plot.py +242 -0
dhb_xr-0.2.1.dist-info/METADATA +784 -0
dhb_xr-0.2.1.dist-info/RECORD +82 -0
dhb_xr-0.2.1.dist-info/WHEEL +5 -0
dhb_xr-0.2.1.dist-info/entry_points.txt +2 -0
dhb_xr-0.2.1.dist-info/top_level.txt +3 -0
examples/__init__.py +54 -0
examples/basic_encoding.py +82 -0
examples/benchmark_backends.py +37 -0
examples/dhb_qr_comparison.py +79 -0
examples/dhb_ti_time_invariant.py +72 -0
examples/gpu_batch_optimization.py +102 -0
examples/imitation_learning.py +53 -0
examples/integration/__init__.py +19 -0
examples/integration/libero_full_demo.py +692 -0
examples/integration/libero_pro_dhb_demo.py +1063 -0
examples/integration/libero_simulation_demo.py +286 -0
examples/integration/libero_swap_demo.py +534 -0
examples/integration/robocasa_libero_dhb_pipeline.py +56 -0
examples/integration/test_libero_adapter.py +47 -0
examples/integration/test_libero_encoding.py +75 -0
examples/integration/test_libero_retrieval.py +105 -0
examples/motion_database.py +88 -0
examples/trajectory_adaptation.py +85 -0
examples/vla_tokenization.py +107 -0
notebooks/__init__.py +24 -0

dhb_xr/optimization/casadi_solver.py ADDED Viewed

@@ -0,0 +1,342 @@
+"""
+Trajectory adaptation: resample demo and solve NLP to find invariants that match boundary poses.
+Full CasADi optimization requires casadi (pip install dhb_xr[optimization]).
+Without CasADi, generate_trajectory falls back to simple interpolation.
+"""
+import numpy as np
+from typing import Dict, Any, Optional
+from dhb_xr.encoder.dhb_dr import encode_dhb_dr
+from dhb_xr.decoder.dhb_dr import decode_dhb_dr
+from dhb_xr.core.types import DHBMethod, EncodingMethod
+from dhb_xr.core import geometry as geom
+from dhb_xr.utils.resampling import resample_and_smooth
+try:
+    import casadi as ca
+    HAS_CASADI = True
+except ImportError:
+    HAS_CASADI = False
+def _euler_to_rot_casadi(angles):
+    """Build a 3x3 rotation matrix from Euler angles as a CasADi expression.
+    The elementary rotations about x, y and z are composed as ``Rz @ Ry @ Rx``
+    (extrinsic XYZ convention).
+    Args:
+        angles: Indexable holding (rx, ry, rz); entries may be numeric or CasADi symbolic.
+    Returns:
+        CasADi 3x3 rotation matrix.
+    """
+    def _mat3(rows):
+        # Stack three rows of three entries into a 3x3 CasADi matrix.
+        return ca.vertcat(*(ca.horzcat(*row) for row in rows))
+    ang_x, ang_y, ang_z = angles[0], angles[1], angles[2]
+    cos_x, sin_x = ca.cos(ang_x), ca.sin(ang_x)
+    cos_y, sin_y = ca.cos(ang_y), ca.sin(ang_y)
+    cos_z, sin_z = ca.cos(ang_z), ca.sin(ang_z)
+    rot_x = _mat3(((1, 0, 0), (0, cos_x, -sin_x), (0, sin_x, cos_x)))
+    rot_y = _mat3(((cos_y, 0, sin_y), (0, 1, 0), (-sin_y, 0, cos_y)))
+    rot_z = _mat3(((cos_z, -sin_z, 0), (sin_z, cos_z, 0), (0, 0, 1)))
+    return rot_z @ rot_y @ rot_x
+def _axis_angle_to_rot_casadi(rvec, use_mx=False):
+    """Convert a rotation vector to a 3x3 rotation matrix with Rodrigues' formula.
+    Args:
+        rvec: Rotation vector (axis scaled by angle) with three entries, numeric or CasADi symbolic.
+        use_mx: If True the identity term is an ``MX`` matrix, otherwise ``SX``.
+    Returns:
+        CasADi expression ``I + sin(th) * K + (1 - cos(th)) * K @ K`` where K is
+        the skew-symmetric matrix of the normalised axis.
+    """
+    # The 1e-12 under the square root keeps the division below finite for a zero vector.
+    angle = ca.sqrt(rvec[0]**2 + rvec[1]**2 + rvec[2]**2 + 1e-12)
+    axis = rvec / angle
+    ax, ay, az = axis[0], axis[1], axis[2]
+    skew = ca.vertcat(
+        ca.horzcat(0, -az, ay),
+        ca.horzcat(az, 0, -ax),
+        ca.horzcat(-ay, ax, 0),
+    )
+    identity = (ca.MX if use_mx else ca.SX).eye(3)
+    skew_sq = skew @ skew
+    return identity + ca.sin(angle) * skew + (1 - ca.cos(angle)) * skew_sq
+def _rot_to_rvec_casadi(R):
+    """Convert a rotation matrix to a rotation vector (simplified log map) in CasADi.
+    The angle is ``acos`` of ``(trace(R) - 1) / 2`` clamped to [-1, 1]; the axis
+    comes from the antisymmetric part of R divided by ``2 * sin(angle) + 1e-12``.
+    NOTE(review): when sin(angle) is close to zero the denominator is dominated
+    by the 1e-12 offset, so accuracy near those angles should be confirmed.
+    Args:
+        R: 3x3 rotation matrix (CasADi expression or numeric matrix).
+    Returns:
+        3x1 CasADi vector (rx, ry, rz).
+    """
+    half_trace_minus_one = (R[0, 0] + R[1, 1] + R[2, 2] - 1) / 2
+    angle = ca.acos(ca.fmax(-1, ca.fmin(1, half_trace_minus_one)))
+    two_sin = 2 * ca.sin(angle) + 1e-12
+    # Off-diagonal differences that carry the (unnormalised) rotation axis.
+    antisym = (
+        R[2, 1] - R[1, 2],
+        R[0, 2] - R[2, 0],
+        R[1, 0] - R[0, 1],
+    )
+    return ca.vertcat(*[part / two_sin * angle for part in antisym])
+def generate_trajectory(
+    pos_data: np.ndarray,
+    quat_data: np.ndarray,
+    pose_target_init: Dict[str, np.ndarray],
+    pose_target_final: Dict[str, np.ndarray],
+    traj_length: int,
+    smoothing: bool = False,
+    dhb_method: DHBMethod = DHBMethod.DOUBLE_REFLECTION,
+    enable_smoothing_objective: bool = False,
+    enable_collision_constraints: bool = False,
+    weights: Optional[np.ndarray] = None,
+    use_casadi: bool = True,
+    verbose: bool = False,
+) -> Dict[str, Any]:
+    """
+    Adapt a demonstrated trajectory so that it starts and ends at the given poses.
+    The demo is resampled and encoded into DHB invariants. When CasADi is
+    importable and use_casadi=True an NLP is solved:
+    - Minimize ||U - U_demo||^2 (invariants close to demo)
+    - Subject to: start pose = pose_target_init, end pose = pose_target_final
+    - Dynamic constraints: poses are reconstructed from invariants
+    If the solver is unavailable, disabled, or raises any exception, the result
+    comes from the interpolation fallback instead (the failure is only printed
+    when verbose=True).
+    Args:
+        pos_data: Demo positions (N, 3)
+        quat_data: Demo quaternions (N, 4) wxyz
+        pose_target_init: {'position': (3,), 'quaternion': (4,)} start pose
+        pose_target_final: {'position': (3,), 'quaternion': (4,)} goal pose
+        traj_length: Output trajectory length
+        smoothing: Apply smoothing to resampled demo
+        dhb_method: DHBMethod.DOUBLE_REFLECTION (4 inv) or ORIGINAL (3 inv)
+        enable_smoothing_objective: Add smoothness penalty to objective
+        enable_collision_constraints: Accepted but not read anywhere in this function
+        weights: Weights for invariant terms (default: equal)
+        use_casadi: Use CasADi NLP solver if available
+        verbose: Print solver output
+    Returns:
+        Dict with keys linear_motion_invariant, angular_motion_invariant,
+        adapted_pos_data, adapted_rvec_data, adapted_quat_data,
+        resampled_pos_data, resampled_quat_data, resampled_rvec_data and
+        solver ("casadi" or "interpolation").
+    """
+    (
+        pos_orig, quat_orig, rvec_orig,
+        pos_resample, quat_resample, rvec_resample,
+    ) = resample_and_smooth(pos_data, quat_data, traj_length, smoothing)
+    # Reference invariants of the demo.
+    # NOTE(review): the *_orig outputs are encoded here, not the *_resample ones - confirm this is intended.
+    encoded = encode_dhb_dr(
+        pos_orig, quat_orig,
+        init_pose=pose_target_init,
+        method=EncodingMethod.POSITION,
+        use_default_initial_frames=False,
+        dhb_method=dhb_method,
+    )
+    lin_inv_demo = encoded["linear_motion_invariants"]
+    ang_inv_demo = encoded["angular_motion_invariants"]
+    invariants_demo = np.hstack([lin_inv_demo, ang_inv_demo])
+    n_steps = invariants_demo.shape[0]
+    dim_inv = invariants_demo.shape[1]
+    # Boundary poses as flat arrays plus their rotation-vector form for the solver.
+    start_pos = np.asarray(pose_target_init["position"]).reshape(3)
+    start_quat = np.asarray(pose_target_init["quaternion"]).reshape(4)
+    end_pos = np.asarray(pose_target_final["position"]).reshape(3)
+    end_quat = np.asarray(pose_target_final["quaternion"]).reshape(4)
+    start_rvec = geom.quat_to_axis_angle(start_quat)
+    end_rvec = geom.quat_to_axis_angle(end_quat)
+    if HAS_CASADI and use_casadi:
+        try:
+            solution = _solve_casadi_nlp(
+                invariants_demo, n_steps, dim_inv, traj_length,
+                start_pos, start_rvec, end_pos, end_rvec,
+                encoded["linear_frame_initial"],
+                encoded["angular_frame_initial"],
+                dhb_method, enable_smoothing_objective, weights, verbose,
+            )
+            if solution is not None:
+                half = dim_inv // 2  # first half linear, second half angular
+                inv_sol = solution["invariants"]
+                lin_sol = inv_sol[:, :half]
+                ang_sol = inv_sol[:, half:]
+                pos_sol = solution["positions"]
+                rvec_sol = solution["rvecs"]
+                quat_sol = np.array([geom.axis_angle_to_quat(rv) for rv in rvec_sol])
+                return {
+                    "linear_motion_invariant": lin_sol,
+                    "angular_motion_invariant": ang_sol,
+                    "adapted_pos_data": pos_sol,
+                    "adapted_rvec_data": rvec_sol,
+                    "adapted_quat_data": quat_sol,
+                    "resampled_pos_data": pos_resample,
+                    "resampled_quat_data": quat_resample,
+                    "resampled_rvec_data": rvec_resample,
+                    "solver": "casadi",
+                }
+        except Exception as e:
+            # Best-effort: any solver problem degrades to the interpolation path below.
+            if verbose:
+                print(f"CasADi solver failed: {e}, falling back to interpolation")
+    return _fallback_interpolation(
+        lin_inv_demo, ang_inv_demo, pose_target_init, pose_target_final,
+        traj_length, dhb_method, pos_resample, quat_resample, rvec_resample,
+    )
+def _solve_casadi_nlp(
+    invariants_demo, N, dim_inv, traj_length,
+    init_pos, init_rvec, goal_pos, goal_rvec,
+    linear_frame_init, angular_frame_init,
+    dhb_method, enable_smoothing_objective, weights, verbose,
+):
+    """Set up and solve the boundary-matching NLP with CasADi Opti and IPOPT.
+    Decision variables are one row of invariants per step (U) plus a position
+    and a rotation vector per step (P, R), tied to the invariants by equality
+    constraints. The cost is the weighted, range-normalised squared deviation of
+    U from the demo invariants, optionally plus a penalty on the second
+    difference of the rotation vectors.
+    Args:
+        invariants_demo: (N, dim_inv) reference invariants; the first dim_inv // 2
+            columns are treated as linear, the remaining ones as angular.
+        N: Number of steps (rows of invariants_demo).
+        dim_inv: Number of invariant columns.
+        traj_length: Number of poses to return; the solution is padded by
+            repeating its last pose, or truncated, to this length.
+        init_pos, init_rvec: Start position (3,) and start rotation vector (3,).
+        goal_pos, goal_rvec: Goal position (3,) and goal rotation vector (3,).
+        linear_frame_init, angular_frame_init: Initial frames for the
+            reconstruction; used here as 4x4 homogeneous matrices.
+        dhb_method: Not read in this function; the 3- vs 4-invariant layout is
+            inferred from dim_inv.
+        enable_smoothing_objective: Add the rotation-vector smoothness penalty.
+        weights: Optional per-invariant weights (dim_inv,); defaults to ones.
+        verbose: Enable IPOPT and timing output.
+    Returns:
+        Dict with "invariants" (solver value of U), "positions" and "rvecs"
+        (each with traj_length rows of 3 values).
+    Raises:
+        Any exception raised by CasADi (for example when the solve fails) is propagated.
+    """
+    opti = ca.Opti()
+    # Decision variables: invariants for each timestep
+    U = opti.variable(N, dim_inv)
+    # Pose variables
+    P = [opti.variable(3) for _ in range(N)]
+    R = [opti.variable(3) for _ in range(N)]  # rotation vectors
+    # Weights
+    if weights is None:
+        weights = np.ones(dim_inv)
+    else:
+        weights = np.asarray(weights).reshape(dim_inv)
+    # Normalize invariants for objective; constant columns get range 1 to avoid dividing by zero
+    inv_min = invariants_demo.min(axis=0)
+    inv_max = invariants_demo.max(axis=0)
+    inv_range = inv_max - inv_min
+    inv_range[inv_range == 0] = 1.0
+    # Row-shaped (1, dim_inv) constants so they line up with the 1 x dim_inv slice U[k, :]
+    range_row = inv_range.reshape(1, -1)
+    weights_row = weights.reshape(1, -1)
+    # Objective: minimize weighted deviation from demo invariants
+    objective = 0
+    for k in range(N):
+        demo_k = invariants_demo[k, :].reshape(1, -1)  # (1, dim_inv)
+        e = (U[k, :] - demo_k) / range_row
+        e_weighted = ca.sqrt(weights_row) * e
+        objective += ca.sumsqr(e_weighted)
+    # Smoothness penalty on the second difference of the rotation vectors
+    if enable_smoothing_objective:
+        smooth_weight = 1e2
+        for k in range(1, N - 1):
+            # R[k] is already a 3x1 symbolic vector. CasADi matrices cannot be
+            # iterated/unpacked, so it is used directly instead of vertcat(*R[k]).
+            diff = R[k + 1] - 2 * R[k] + R[k - 1]
+            objective += smooth_weight * ca.sumsqr(diff)
+    # Boundary constraints
+    # NOTE(review): P[0]/R[0] are pinned to the start pose here and also to the
+    # pose reconstructed after the first step below - confirm against the encoder's frame convention.
+    opti.subject_to(P[0] == init_pos)
+    opti.subject_to(R[0] == init_rvec)
+    # The end pose constraint is added after the dynamics are set up
+    # Dynamic constraints: reconstruct trajectory from invariants
+    k_lin = dim_inv // 2  # 4 for DR, 3 for original
+    # Wrap the constant initial frames as MX so the chained products below stay symbolic
+    linear_frame = ca.MX(linear_frame_init)
+    angular_frame = ca.MX(angular_frame_init)
+    rotm_accum = _axis_angle_to_rot_casadi(ca.MX(init_rvec), use_mx=True)
+    for k in range(N):
+        lin_inv = U[k, :k_lin]
+        ang_inv = U[k, k_lin:]
+        # Linear step: rotate the moving frame, then translate along its x axis by the magnitude
+        mag_lin = lin_inv[0]
+        euler_lin = lin_inv[1:4] if k_lin == 4 else ca.vertcat(0, lin_inv[1], lin_inv[2])
+        R_lin = _euler_to_rot_casadi(euler_lin)
+        trans = ca.vertcat(mag_lin, 0, 0)
+        T_step = ca.vertcat(
+            ca.horzcat(R_lin, trans),
+            ca.horzcat(0, 0, 0, 1),
+        )
+        linear_frame = linear_frame @ T_step
+        new_pos = linear_frame[:3, 3]
+        # Angular step
+        mag_ang = ang_inv[0]
+        euler_ang = ang_inv[1:4] if k_lin == 4 else ca.vertcat(0, ang_inv[1], ang_inv[2])
+        rvec_local = angular_frame[:3, :3] @ ca.vertcat(mag_ang, 0, 0)
+        R_ang = _euler_to_rot_casadi(euler_ang)
+        rotm_accum = rotm_accum @ _axis_angle_to_rot_casadi(rvec_local, use_mx=True).T
+        new_rvec = _rot_to_rvec_casadi(rotm_accum)
+        angular_frame = ca.vertcat(
+            ca.horzcat(angular_frame[:3, :3] @ R_ang, ca.MX.zeros(3, 1)),
+            ca.horzcat(0, 0, 0, 1),
+        )
+        # Tie the pose variables of this step to the reconstructed pose
+        opti.subject_to(P[k] == new_pos)
+        opti.subject_to(R[k] == new_rvec)
+    # End pose constraint
+    opti.subject_to(P[-1] == goal_pos)
+    opti.subject_to(R[-1] == goal_rvec)
+    # Initial values: start the search at the demo invariants
+    for k in range(N):
+        opti.set_initial(U[k, :], invariants_demo[k, :])
+    # Minimize objective
+    opti.minimize(objective)
+    # Solver options
+    opts = {"ipopt.print_level": 5 if verbose else 0, "print_time": verbose}
+    opti.solver("ipopt", opts)
+    # Solve
+    sol = opti.solve()
+    # Extract solution
+    U_sol = sol.value(U)
+    P_sol = np.array([sol.value(P[k]).flatten() for k in range(N)])
+    R_sol = np.array([sol.value(R[k]).flatten() for k in range(N)])
+    # Pad/trim to traj_length
+    if len(P_sol) < traj_length:
+        pad_n = traj_length - len(P_sol)
+        P_sol = np.vstack([P_sol, np.tile(P_sol[-1], (pad_n, 1))])
+        R_sol = np.vstack([R_sol, np.tile(R_sol[-1], (pad_n, 1))])
+    elif len(P_sol) > traj_length:
+        P_sol = P_sol[:traj_length]
+        R_sol = R_sol[:traj_length]
+    return {
+        "invariants": U_sol,
+        "positions": P_sol,
+        "rvecs": R_sol,
+    }
+def _fallback_interpolation(
+    lin_inv, ang_inv, pose_target_init, pose_target_final,
+    traj_length, dhb_method, pos_resample, quat_resample, rvec_resample,
+):
+    """Decode the demo invariants and blend the result onto the target boundary poses.
+    The invariants are decoded starting from pose_target_init, cut or padded
+    (repeating the last pose) to traj_length, and then every sample is shifted
+    towards the goal with a smoothstep weight s = 3t^2 - 2t^3: positions by
+    ``s * (goal - last decoded position)``, orientations by slerp towards the
+    goal quaternion. The first and last samples are finally set exactly to the
+    start and goal poses.
+    Returns:
+        Dict with the same keys as the solver path, the unchanged input
+        invariants, and "solver" set to "interpolation".
+    """
+    decoded = decode_dhb_dr(
+        lin_inv, ang_inv, pose_target_init,
+        method=EncodingMethod.POSITION, dhb_method=dhb_method, drop_padded=False,
+    )
+    positions = decoded["positions"]
+    quats = decoded["quaternions"]
+    if len(positions) < traj_length:
+        # Too short: hold the last decoded pose for the missing samples.
+        pos_fill = np.tile(positions[-1], (traj_length - len(positions), 1))
+        quat_fill = np.tile(quats[-1], (traj_length - len(quats), 1))
+        positions = np.vstack([positions, pos_fill])
+        quats = np.vstack([quats, quat_fill])
+    else:
+        # Copies, so the decoder output is not modified in place below.
+        positions = positions[:traj_length].copy()
+        quats = quats[:traj_length].copy()
+    start_pos = np.asarray(pose_target_init["position"]).reshape(3)
+    start_quat = np.asarray(pose_target_init["quaternion"]).reshape(4)
+    goal_pos = np.asarray(pose_target_final["position"]).reshape(3)
+    goal_quat = np.asarray(pose_target_final["quaternion"]).reshape(4)
+    positions[0] = start_pos
+    quats[0] = start_quat
+    # Position error at the end of the decoded path, spread over the trajectory below.
+    end_error = goal_pos - positions[-1]
+    last_index = max(1, traj_length - 1)
+    for idx in range(traj_length):
+        t = idx / last_index
+        blend = 3 * t**2 - 2 * t**3
+        positions[idx] = positions[idx] + blend * end_error
+        quats[idx] = geom.quat_slerp(quats[idx], goal_quat, blend)
+    # Pin both ends exactly, whatever the blending produced.
+    positions[0] = start_pos
+    quats[0] = start_quat
+    positions[-1] = goal_pos
+    quats[-1] = goal_quat
+    adapted_rvecs = np.array([geom.quat_to_axis_angle(q) for q in quats])
+    return {
+        "linear_motion_invariant": lin_inv,
+        "angular_motion_invariant": ang_inv,
+        "adapted_pos_data": positions,
+        "adapted_rvec_data": adapted_rvecs,
+        "adapted_quat_data": quats,
+        "resampled_pos_data": pos_resample,
+        "resampled_quat_data": quat_resample,
+        "resampled_rvec_data": rvec_resample,
+        "solver": "interpolation",
+    }

dhb_xr/optimization/constraints.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Constraint helpers for trajectory optimization (obstacles, bounds)."""
+import numpy as np
+from typing import Callable, Optional
+def sphere_obstacle_constraint(
+    center: np.ndarray,
+    radius: float,
+) -> Callable[[np.ndarray], np.ndarray]:
+    """Build a keep-out constraint for a spherical obstacle.
+    Args:
+        center: Sphere centre, any array-like that reshapes to (3,).
+        radius: Sphere radius.
+    Returns:
+        Function ``c(positions)`` that reshapes its input to (n, 3) and returns
+        an (n,) array with, for every point, the squared distance to the centre
+        minus ``radius**2``. Entries >= 0 mean the point is on or outside the
+        sphere (feasible).
+    """
+    center = np.asarray(center).reshape(3)
+    radius_sq = radius**2
+    def c(positions: np.ndarray) -> np.ndarray:
+        pts = np.asarray(positions).reshape(-1, 3)
+        return np.sum((pts - center) ** 2, axis=1) - radius_sq
+    return c
+def box_bounds_constraint(
+    lower: np.ndarray,
+    upper: np.ndarray,
+) -> Callable[[np.ndarray], np.ndarray]:
+    """Build an axis-aligned box constraint.
+    Args:
+        lower: Lower corner, reshaped to (3,).
+        upper: Upper corner, reshaped to (3,).
+    Returns:
+        Function ``c(positions)`` returning a flat array with six margins per
+        point, ordered (pos - lower) for x, y, z followed by (upper - pos) for
+        x, y, z. All entries are >= 0 when the point lies inside the box.
+    """
+    lo = np.asarray(lower).reshape(3)
+    hi = np.asarray(upper).reshape(3)
+    def c(positions: np.ndarray) -> np.ndarray:
+        pts = np.asarray(positions).reshape(-1, 3)
+        margins = np.hstack((pts - lo, hi - pts))
+        return margins.ravel()
+    return c

dhb_xr/optimization/cusadi_solver.py ADDED Viewed

@@ -0,0 +1,311 @@
+"""
+Cusadi-based GPU-parallel trajectory optimization (optional).
+This module provides GPU-accelerated batch decoding of DHB invariants using CusADi.
+Setup (one-time):
+    1. Clone and install cusadi: git clone https://github.com/se-hwan/cusadi && pip install -e cusadi
+    2. Export the CasADi decode function:
+       python -m dhb_xr.optimization.export_casadi_decode --out fn_dhb_decode.casadi
+    3. Move to cusadi and compile:
+       mv fn_dhb_decode.casadi cusadi/src/casadi_functions/
+       cd cusadi && python run_codegen.py --fn=fn_dhb_decode
+Benchmark results (RTX A2000, 50-step trajectories):
+    Batch 100:   43x speedup (CPU 34ms vs GPU 0.8ms)
+    Batch 1000: 199x speedup (CPU 342ms vs GPU 1.7ms)
+    Batch 2000: 387x speedup (CPU 685ms vs GPU 1.8ms)
+Without cusadi: CusadiTrajectoryOptimizer.forward() falls back to NumPy batched decode.
+"""
+from __future__ import annotations
+import os
+import numpy as np
+from typing import Dict, List, Any, Optional, Tuple, Union
+from dhb_xr.decoder.dhb_dr import decode_dhb_dr
+from dhb_xr.core.types import DHBMethod, EncodingMethod
+# Check for CasADi
+try:
+    import casadi as ca
+    HAS_CASADI = True
+except ImportError:
+    HAS_CASADI = False
+# Default CusADi paths
+# Root of a local cusadi checkout. Set the CUSADI_ROOT environment variable to
+# override it; the built-in default is a developer-specific location and will
+# normally not exist on other machines.
+CUSADI_ROOT = os.environ.get("CUSADI_ROOT", "/home/andypark/Projects/repos/cusadi")
+DEFAULT_FN_PATH = os.path.join(CUSADI_ROOT, "src/casadi_functions/fn_dhb_decode_linear.casadi")
+# Check for CusADi (GPU acceleration)
+HAS_CUSADI = False
+CusadiFunction = None
+try:
+    # Try standard import first
+    from cusadi import CusadiFunction
+    HAS_CUSADI = True
+except ImportError:
+    # Fall back to importing straight from a cusadi checkout
+    try:
+        import sys
+        # Only an existing checkout is added, so a missing directory is not pushed onto sys.path
+        if os.path.isdir(CUSADI_ROOT) and CUSADI_ROOT not in sys.path:
+            sys.path.insert(0, CUSADI_ROOT)
+        from src.CusadiFunction import CusadiFunction
+        HAS_CUSADI = True
+    except ImportError:
+        pass
+# Check for PyTorch with CUDA
+try:
+    import torch
+    HAS_TORCH_CUDA = torch.cuda.is_available()
+except ImportError:
+    HAS_TORCH_CUDA = False
+# Global cache of CusADi functions, keyed by (casadi file path, batch size)
+_cusadi_fn_cache: Dict[Tuple[str, int], Any] = {}
+def get_cusadi_function(
+    casadi_path: str = DEFAULT_FN_PATH,
+    batch_size: int = 1000,
+) -> Optional[Any]:
+    """Return a CusadiFunction for a serialized CasADi function, reusing cached instances.
+    Args:
+        casadi_path: Path of the ``.casadi`` file to load.
+        batch_size: Batch size passed to ``CusadiFunction``; part of the cache key.
+    Returns:
+        The cached or newly created CusadiFunction, or None when CusADi or
+        CasADi is not importable, the file does not exist, or loading fails
+        (in which case a warning is printed).
+    """
+    if not (HAS_CUSADI and HAS_CASADI):
+        return None
+    key = (casadi_path, batch_size)
+    try:
+        return _cusadi_fn_cache[key]
+    except KeyError:
+        pass
+    if not os.path.exists(casadi_path):
+        return None
+    try:
+        wrapped = CusadiFunction(ca.Function.load(casadi_path), batch_size)
+        _cusadi_fn_cache[key] = wrapped
+    except Exception as e:
+        print(f"Warning: Failed to load CusADi function: {e}")
+        return None
+    return wrapped
+def batched_decode_dhb_dr_gpu(
+    linear_invariants_batch: np.ndarray,
+    angular_invariants_batch: np.ndarray,
+    initial_poses: List[Dict[str, np.ndarray]],
+    casadi_path: str = DEFAULT_FN_PATH,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    GPU-accelerated batch decode using CusADi.
+    Positions are computed on the GPU by evaluating the loaded CusADi function
+    on the whole batch. Quaternions are NOT computed on the GPU: they come from
+    the CPU path ``batched_decode_dhb_dr(..., drop_padded=True)``.
+    Requirements:
+        - CusADi installed (pip install -e cusadi)
+        - CUDA available and PyTorch with CUDA support
+        - Compiled CasADi decode function (fn_dhb_decode_linear.casadi)
+    Args:
+        linear_invariants_batch: (B, T, 4) linear invariants
+        angular_invariants_batch: (B, T, 4) angular invariants
+        initial_poses: List of B dicts with 'position' (3,) and 'quaternion' (4,) wxyz
+        casadi_path: Path to compiled CasADi function
+    Returns:
+        Tuple of (positions (B, N, 3), quaternions (B, N, 4))
+    Raises:
+        RuntimeError: If CusADi or CUDA is not available, or the CusADi
+            function cannot be loaded from casadi_path
+    """
+    if not HAS_TORCH_CUDA:
+        raise RuntimeError("CUDA not available. Install PyTorch with CUDA support.")
+    if not HAS_CUSADI:
+        raise RuntimeError("CusADi not installed. Clone and install from github.com/se-hwan/cusadi")
+    B = linear_invariants_batch.shape[0]
+    # NOTE(review): T is not used below
+    T = linear_invariants_batch.shape[1]
+    # Get or create CusADi function (cached per (path, batch size), so each new B builds a new one)
+    cusadi_fn = get_cusadi_function(casadi_path, B)
+    if cusadi_fn is None:
+        raise RuntimeError(f"Could not load CusADi function from {casadi_path}")
+    # Prepare inputs for CusADi
+    # The fn_dhb_decode_linear function expects:
+    # - i0: flattened linear invariants (T*4,) per sample
+    # - i1: initial position (3,) per sample
+    # - i2: initial rotation matrix (9,) flattened per sample
+    # Stack initial poses
+    init_pos = np.array([p['position'] for p in initial_poses])  # (B, 3)
+    init_quat = np.array([p['quaternion'] for p in initial_poses])  # (B, 4) wxyz
+    # Convert quaternions to rotation matrices
+    from dhb_xr.core.geometry import quat_to_rot
+    init_rot = np.array([quat_to_rot(q).flatten() for q in init_quat])  # (B, 9)
+    # Check function signature - fn_dhb_decode_linear expects:
+    # i0: linear invariants flattened (nnz_in for first input)
+    # i1: initial position (3,)
+    # i2: initial rotation matrix flattened (9,)
+    n_in = cusadi_fn.fn_casadi.n_in()
+    nnz_in = [cusadi_fn.fn_casadi.nnz_in(i) for i in range(n_in)]
+    # Flatten linear invariants to match expected input size
+    # NOTE(review): the slice silently drops any values beyond nnz_in[0]; shorter inputs are not padded here
+    lin_flat = linear_invariants_batch.reshape(B, -1)[:, :nnz_in[0]]  # (B, nnz)
+    # Convert to torch tensors on GPU (must be contiguous and double)
+    lin_t = torch.from_numpy(lin_flat.astype(np.float64)).cuda().contiguous()
+    pos_t = torch.from_numpy(init_pos.astype(np.float64)).cuda().contiguous()
+    rot_t = torch.from_numpy(init_rot.astype(np.float64)).cuda().contiguous()
+    # Run CusADi function - evaluate takes a LIST of input tensors
+    cusadi_fn.evaluate([lin_t, pos_t, rot_t])
+    # Get dense output (positions)
+    positions = cusadi_fn.getDenseOutput(0).cpu().numpy()  # (B, N, 3)
+    # Reshape if needed (output might be (B, N*3) or (B, N, 3))
+    if positions.ndim == 2:
+        N = positions.shape[1] // 3
+        positions = positions.reshape(B, N, 3)
+    # For quaternions, we need to decode separately or use angular invariants
+    # For now, decode quaternions using NumPy (they're cheap relative to positions)
+    # use_gpu keeps its default (False) in this call, so it runs the CPU loop and cannot recurse back here.
+    # NOTE(review): assumes the CPU decode yields the same number of poses N as the GPU positions - confirm.
+    _, quat_batch = batched_decode_dhb_dr(
+        linear_invariants_batch, angular_invariants_batch,
+        initial_poses, drop_padded=True
+    )
+    return positions, quat_batch
+def batched_decode_dhb_dr(
+    linear_invariants_batch: np.ndarray,
+    angular_invariants_batch: np.ndarray,
+    initial_poses: List[Dict[str, np.ndarray]],
+    method: EncodingMethod = EncodingMethod.POSITION,
+    dhb_method: DHBMethod = DHBMethod.DOUBLE_REFLECTION,
+    drop_padded: bool = True,
+    use_gpu: bool = False,
+    casadi_path: str = DEFAULT_FN_PATH,
+) -> tuple:
+    """
+    Decode a batch of invariant sequences into poses.
+    With use_gpu=True the GPU decoder is tried first; if its requirements are
+    missing or it raises, a message is printed and the CPU loop is used.
+    The GPU call only receives the invariants, the initial poses and
+    casadi_path; method, dhb_method and drop_padded apply to the CPU loop.
+    Args:
+        linear_invariants_batch: (B, T, 4) or list of (T, 4)
+        angular_invariants_batch: (B, T, 4) or list of (T, 4)
+        initial_poses: list of B dicts with 'position' (3,) and 'quaternion' (4,) wxyz.
+        method: 'pos' or 'vel' for invariant interpretation
+        dhb_method: DHBMethod enum (DOUBLE_REFLECTION or ORIGINAL)
+        drop_padded: Whether to drop padded frames
+        use_gpu: If True and CUDA available, use CusADi GPU acceleration
+        casadi_path: Path to compiled CasADi function for GPU decode
+    Returns:
+        (positions_batch, quaternions_batch): (B, N, 3), (B, N, 4).
+    """
+    if use_gpu:
+        if not (HAS_TORCH_CUDA and HAS_CUSADI):
+            requirements = (("PyTorch CUDA", HAS_TORCH_CUDA), ("CusADi", HAS_CUSADI))
+            missing = [label for label, present in requirements if not present]
+            print(f"GPU decode unavailable (missing: {', '.join(missing)}), using CPU")
+        else:
+            try:
+                return batched_decode_dhb_dr_gpu(
+                    linear_invariants_batch, angular_invariants_batch,
+                    initial_poses, casadi_path
+                )
+            except Exception as e:
+                print(f"GPU decode failed, falling back to CPU: {e}")
+    # CPU decode: split the batch into per-trajectory pieces
+    stacked = isinstance(linear_invariants_batch, np.ndarray) and linear_invariants_batch.ndim == 3
+    if stacked:
+        B = linear_invariants_batch.shape[0]
+        lin_list = [linear_invariants_batch[b] for b in range(B)]
+        ang_list = [angular_invariants_batch[b] for b in range(B)]
+    else:
+        lin_list = list(linear_invariants_batch)
+        ang_list = list(angular_invariants_batch)
+        B = len(lin_list)
+    assert len(initial_poses) == B and len(ang_list) == B
+    # Decode one trajectory at a time and keep the (positions, quaternions) pairs
+    decoded_iter = (
+        decode_dhb_dr(
+            lin_list[b], ang_list[b],
+            initial_poses[b],
+            method=method,
+            dhb_method=dhb_method,
+            drop_padded=drop_padded,
+        )
+        for b in range(B)
+    )
+    pairs = [(item["positions"], item["quaternions"]) for item in decoded_iter]
+    positions_batch = np.stack([pos for pos, _ in pairs])
+    quaternions_batch = np.stack([quat for _, quat in pairs])
+    return positions_batch, quaternions_batch
+class CusadiTrajectoryOptimizer:
+    """
+    Batched trajectory decoder with an optional CusADi handle.
+    - forward() decodes through batched_decode_dhb_dr with its default CPU
+      (NumPy loop) path.
+    - When cusadi is importable and decode_casadi_path is given, __init__ tries
+      to build a CusadiFunction and keeps it in ``_cusadi_fn`` (None on
+      failure); forward() does not use that handle.
+    """
+    def __init__(
+        self,
+        batch_size: int = 1000,
+        dhb_method: DHBMethod | str = DHBMethod.DOUBLE_REFLECTION,
+        decode_casadi_path: Optional[str] = None,
+    ):
+        """Store the settings and, when possible, load the CusADi decode function.
+        A string dhb_method equal to "double_reflection" selects
+        DHBMethod.DOUBLE_REFLECTION; any other string selects DHBMethod.ORIGINAL.
+        """
+        self.batch_size = batch_size
+        if isinstance(dhb_method, DHBMethod):
+            self.dhb_method = dhb_method
+        elif dhb_method == "double_reflection":
+            self.dhb_method = DHBMethod.DOUBLE_REFLECTION
+        else:
+            self.dhb_method = DHBMethod.ORIGINAL
+        self.decode_casadi_path = decode_casadi_path
+        self._cusadi_fn: Any = None
+        if HAS_CUSADI and decode_casadi_path:
+            # Best-effort: any failure while loading leaves the handle unset.
+            try:
+                loaded = ca.Function.load(decode_casadi_path)
+                self._cusadi_fn = CusadiFunction(loaded, batch_size)
+            except Exception:
+                self._cusadi_fn = None
+    def forward(
+        self,
+        linear_invariants: np.ndarray,
+        angular_invariants: np.ndarray,
+        initial_poses: List[Dict[str, np.ndarray]],
+        method: EncodingMethod = EncodingMethod.POSITION,
+        drop_padded: bool = True,
+    ) -> tuple:
+        """
+        Batched decode: (linear_inv, angular_inv, initial_poses) -> (positions, quaternions).
+        linear_invariants: (B, T, 4), angular_invariants: (B, T, 4),
+        initial_poses: list of B dicts. Returns (B, N, 3), (B, N, 4).
+        Delegates to batched_decode_dhb_dr with this instance's dhb_method and
+        without requesting the GPU path. For GPU, build a .casadi decode and use
+        CusadiFunction.evaluate() directly (see export_casadi_decode and cusadi repo).
+        """
+        return batched_decode_dhb_dr(
+            linear_invariants, angular_invariants, initial_poses,
+            method=method, dhb_method=self.dhb_method, drop_padded=drop_padded,
+        )