gsply 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsply/__init__.py +55 -0
- gsply/formats.py +226 -0
- gsply/py.typed +1 -0
- gsply/reader.py +894 -0
- gsply/writer.py +1019 -0
- gsply-0.1.0.dist-info/METADATA +520 -0
- gsply-0.1.0.dist-info/RECORD +11 -0
- gsply-0.1.0.dist-info/WHEEL +5 -0
- gsply-0.1.0.dist-info/licenses/LICENSE +21 -0
- gsply-0.1.0.dist-info/top_level.txt +1 -0
- gsply-0.1.0.dist-info/zip-safe +1 -0
gsply/writer.py
ADDED
@@ -0,0 +1,1019 @@
"""Writing functions for Gaussian splatting PLY files.

This module provides ultra-fast writing of Gaussian splatting PLY files
in both uncompressed and compressed (PlayCanvas) formats.

API Examples:
    >>> from gsply import plywrite
    >>> plywrite("output.ply", means, scales, quats, opacities, sh0, shN)

    >>> # Or use format-specific writers
    >>> from gsply.writer import write_uncompressed
    >>> write_uncompressed("output.ply", means, scales, quats, opacities, sh0, shN)

Performance:
    - Write uncompressed: 5-10ms for 50K Gaussians
    - Write compressed: ~43ms for 50K Gaussians
"""

import numpy as np
from pathlib import Path
from typing import Optional, Union
import logging

# Try to import numba for JIT optimization (optional)
try:
    from numba import jit
    import numba
    HAS_NUMBA = True
except ImportError:
    HAS_NUMBA = False
    # Fallback: no-op decorator
    def jit(*args, **kwargs):
        def decorator(func):
            return func
        return decorator
    # Mock numba module for prange fallback
    class _MockNumba:
        @staticmethod
        def prange(n):
            return range(n)
    numba = _MockNumba()

logger = logging.getLogger(__name__)
# ======================================================================================
# JIT-COMPILED COMPRESSION FUNCTIONS
# ======================================================================================

@jit(nopython=True, parallel=True, fastmath=True, cache=True)
def _pack_positions_jit(sorted_means, chunk_indices, min_x, min_y, min_z, max_x, max_y, max_z):
    """JIT-compiled position quantization and packing (11-10-11 bits) with parallel processing.

    Args:
        sorted_means: (N, 3) float32 array of positions
        chunk_indices: int32 array of chunk indices for each vertex
        min_x, min_y, min_z: chunk minimum bounds
        max_x, max_y, max_z: chunk maximum bounds

    Returns:
        packed: (N,) uint32 array of packed positions
    """
    n = len(sorted_means)
    packed = np.zeros(n, dtype=np.uint32)

    for i in numba.prange(n):
        chunk_idx = chunk_indices[i]

        # Compute ranges (handle zero range)
        range_x = max_x[chunk_idx] - min_x[chunk_idx]
        range_y = max_y[chunk_idx] - min_y[chunk_idx]
        range_z = max_z[chunk_idx] - min_z[chunk_idx]

        if range_x == 0.0:
            range_x = 1.0
        if range_y == 0.0:
            range_y = 1.0
        if range_z == 0.0:
            range_z = 1.0

        # Normalize to [0, 1]
        norm_x = (sorted_means[i, 0] - min_x[chunk_idx]) / range_x
        norm_y = (sorted_means[i, 1] - min_y[chunk_idx]) / range_y
        norm_z = (sorted_means[i, 2] - min_z[chunk_idx]) / range_z

        # Clamp
        norm_x = max(0.0, min(1.0, norm_x))
        norm_y = max(0.0, min(1.0, norm_y))
        norm_z = max(0.0, min(1.0, norm_z))

        # Quantize
        px = np.uint32(norm_x * 2047.0)
        py = np.uint32(norm_y * 1023.0)
        pz = np.uint32(norm_z * 2047.0)

        # Pack (11-10-11 bits)
        packed[i] = (px << 21) | (py << 11) | pz

    return packed
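# Illustrative sketch (not part of gsply 0.1.0): unpacking the 11-10-11 layout produced
# by _pack_positions_jit. Bits 31..21 hold x, bits 20..11 hold y, bits 10..0 hold z;
# each field is mapped back to [0, 1] and then into the chunk's min/max bounds.
# `chunk_min` and `chunk_max` are assumed to be length-3 arrays for one chunk.
def _unpack_position_sketch(packed_value, chunk_min, chunk_max):
    px = (packed_value >> 21) & 0x7FF
    py = (packed_value >> 11) & 0x3FF
    pz = packed_value & 0x7FF
    norm = np.array([px / 2047.0, py / 1023.0, pz / 2047.0])
    return chunk_min + norm * (chunk_max - chunk_min)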
@jit(nopython=True, parallel=True, fastmath=True, cache=True)
def _pack_scales_jit(sorted_scales, chunk_indices, min_sx, min_sy, min_sz, max_sx, max_sy, max_sz):
    """JIT-compiled scale quantization and packing (11-10-11 bits) with parallel processing.

    Args:
        sorted_scales: (N, 3) float32 array of scales
        chunk_indices: int32 array of chunk indices for each vertex
        min_sx, min_sy, min_sz: chunk minimum scale bounds
        max_sx, max_sy, max_sz: chunk maximum scale bounds

    Returns:
        packed: (N,) uint32 array of packed scales
    """
    n = len(sorted_scales)
    packed = np.zeros(n, dtype=np.uint32)

    for i in numba.prange(n):
        chunk_idx = chunk_indices[i]

        # Compute ranges (handle zero range)
        range_sx = max_sx[chunk_idx] - min_sx[chunk_idx]
        range_sy = max_sy[chunk_idx] - min_sy[chunk_idx]
        range_sz = max_sz[chunk_idx] - min_sz[chunk_idx]

        if range_sx == 0.0:
            range_sx = 1.0
        if range_sy == 0.0:
            range_sy = 1.0
        if range_sz == 0.0:
            range_sz = 1.0

        # Normalize to [0, 1]
        norm_sx = (sorted_scales[i, 0] - min_sx[chunk_idx]) / range_sx
        norm_sy = (sorted_scales[i, 1] - min_sy[chunk_idx]) / range_sy
        norm_sz = (sorted_scales[i, 2] - min_sz[chunk_idx]) / range_sz

        # Clamp
        norm_sx = max(0.0, min(1.0, norm_sx))
        norm_sy = max(0.0, min(1.0, norm_sy))
        norm_sz = max(0.0, min(1.0, norm_sz))

        # Quantize
        sx = np.uint32(norm_sx * 2047.0)
        sy = np.uint32(norm_sy * 1023.0)
        sz = np.uint32(norm_sz * 2047.0)

        # Pack (11-10-11 bits)
        packed[i] = (sx << 21) | (sy << 11) | sz

    return packed
@jit(nopython=True, parallel=True, fastmath=True, cache=True)
def _pack_colors_jit(sorted_sh0, sorted_opacities, chunk_indices, min_r, min_g, min_b, max_r, max_g, max_b, sh_c0):
    """JIT-compiled color and opacity quantization and packing (8-8-8-8 bits) with parallel processing.

    Args:
        sorted_sh0: (N, 3) float32 array of SH0 coefficients
        sorted_opacities: (N,) float32 array of opacities (logit space)
        chunk_indices: int32 array of chunk indices for each vertex
        min_r, min_g, min_b: chunk minimum color bounds
        max_r, max_g, max_b: chunk maximum color bounds
        sh_c0: SH constant for conversion

    Returns:
        packed: (N,) uint32 array of packed colors
    """
    n = len(sorted_sh0)
    packed = np.zeros(n, dtype=np.uint32)

    for i in numba.prange(n):
        chunk_idx = chunk_indices[i]

        # Convert SH0 to RGB
        color_r = sorted_sh0[i, 0] * sh_c0 + 0.5
        color_g = sorted_sh0[i, 1] * sh_c0 + 0.5
        color_b = sorted_sh0[i, 2] * sh_c0 + 0.5

        # Compute ranges (handle zero range)
        range_r = max_r[chunk_idx] - min_r[chunk_idx]
        range_g = max_g[chunk_idx] - min_g[chunk_idx]
        range_b = max_b[chunk_idx] - min_b[chunk_idx]

        if range_r == 0.0:
            range_r = 1.0
        if range_g == 0.0:
            range_g = 1.0
        if range_b == 0.0:
            range_b = 1.0

        # Normalize to [0, 1]
        norm_r = (color_r - min_r[chunk_idx]) / range_r
        norm_g = (color_g - min_g[chunk_idx]) / range_g
        norm_b = (color_b - min_b[chunk_idx]) / range_b

        # Clamp
        norm_r = max(0.0, min(1.0, norm_r))
        norm_g = max(0.0, min(1.0, norm_g))
        norm_b = max(0.0, min(1.0, norm_b))

        # Quantize colors
        cr = np.uint32(norm_r * 255.0)
        cg = np.uint32(norm_g * 255.0)
        cb = np.uint32(norm_b * 255.0)

        # Opacity: logit to linear
        opacity_linear = 1.0 / (1.0 + np.exp(-sorted_opacities[i]))
        opacity_linear = max(0.0, min(1.0, opacity_linear))
        co = np.uint32(opacity_linear * 255.0)

        # Pack (8-8-8-8 bits)
        packed[i] = (cr << 24) | (cg << 16) | (cb << 8) | co

    return packed
@jit(nopython=True, parallel=True, fastmath=True, cache=True)
def _pack_quaternions_jit(sorted_quats):
    """JIT-compiled quaternion normalization and packing (2+10-10-10 bits, smallest-three) with parallel processing.

    Args:
        sorted_quats: (N, 4) float32 array of quaternions

    Returns:
        packed: (N,) uint32 array of packed quaternions
    """
    n = len(sorted_quats)
    packed = np.zeros(n, dtype=np.uint32)
    norm_factor = np.sqrt(2.0) * 0.5

    for i in numba.prange(n):
        # Normalize quaternion
        quat = sorted_quats[i]
        norm = np.sqrt(quat[0]*quat[0] + quat[1]*quat[1] + quat[2]*quat[2] + quat[3]*quat[3])
        if norm > 0:
            quat = quat / norm

        # Find largest component by absolute value
        abs_vals = np.abs(quat)
        largest_idx = 0
        largest_val = abs_vals[0]
        for j in range(1, 4):
            if abs_vals[j] > largest_val:
                largest_val = abs_vals[j]
                largest_idx = j

        # Flip quaternion if largest component is negative
        if quat[largest_idx] < 0:
            quat = -quat

        # Extract three smaller components
        three_components = np.zeros(3, dtype=np.float32)
        idx = 0
        for j in range(4):
            if j != largest_idx:
                three_components[idx] = quat[j]
                idx += 1

        # Normalize to [0, 1] for quantization
        qa_norm = three_components[0] * norm_factor + 0.5
        qb_norm = three_components[1] * norm_factor + 0.5
        qc_norm = three_components[2] * norm_factor + 0.5

        # Clamp
        qa_norm = max(0.0, min(1.0, qa_norm))
        qb_norm = max(0.0, min(1.0, qb_norm))
        qc_norm = max(0.0, min(1.0, qc_norm))

        # Quantize
        qa_int = np.uint32(qa_norm * 1023.0)
        qb_int = np.uint32(qb_norm * 1023.0)
        qc_int = np.uint32(qc_norm * 1023.0)

        # Pack (2 bits for which + 10+10+10 bits)
        packed[i] = (np.uint32(largest_idx) << 30) | (qa_int << 20) | (qb_int << 10) | qc_int

    return packed
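# Illustrative sketch (not part of gsply 0.1.0): inverting the smallest-three encoding
# above. The top 2 bits select the dropped (largest) component; the remaining three are
# de-quantized from [0, 1023] back to [-sqrt(2)/2, sqrt(2)/2], and the dropped component
# is recovered from the unit-norm constraint (it is non-negative after the sign flip).
def _unpack_quaternion_sketch(packed_value):
    largest_idx = (packed_value >> 30) & 0x3
    factor = np.sqrt(2.0) * 0.5
    a = (((packed_value >> 20) & 0x3FF) / 1023.0 - 0.5) / factor
    b = (((packed_value >> 10) & 0x3FF) / 1023.0 - 0.5) / factor
    c = ((packed_value & 0x3FF) / 1023.0 - 0.5) / factor
    quat = np.zeros(4, dtype=np.float32)
    quat[[j for j in range(4) if j != largest_idx]] = [a, b, c]
    quat[largest_idx] = np.sqrt(max(0.0, 1.0 - (a * a + b * b + c * c)))
    return quat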
@jit(nopython=True, fastmath=True, cache=True)
def _radix_sort_by_chunks(chunk_indices, num_chunks):
    """Radix sort (counting sort) for chunk indices (4x faster than argsort).

    Since chunk indices are small integers (0 to num_chunks-1), counting sort
    achieves O(n) complexity vs O(n log n) for comparison-based sorting.

    Args:
        chunk_indices: (N,) int32 array of chunk indices
        num_chunks: number of unique chunks

    Returns:
        sort_indices: (N,) int32 array of indices that would sort the data
    """
    n = len(chunk_indices)

    # Count occurrences of each chunk
    counts = np.zeros(num_chunks, dtype=np.int32)
    for i in range(n):
        counts[chunk_indices[i]] += 1

    # Compute starting positions for each chunk
    offsets = np.zeros(num_chunks, dtype=np.int32)
    for i in range(1, num_chunks):
        offsets[i] = offsets[i-1] + counts[i-1]

    # Build sorted index array
    sort_indices = np.empty(n, dtype=np.int32)
    positions = offsets.copy()
    for i in range(n):
        chunk_id = chunk_indices[i]
        sort_indices[positions[chunk_id]] = i
        positions[chunk_id] += 1

    return sort_indices
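# Illustrative sketch (not part of gsply 0.1.0): the counting sort above is stable, so
# its output should match NumPy's stable argsort on the same keys.
def _radix_sort_sanity_check(num_vertices=1000, num_chunks=8):
    rng = np.random.default_rng(0)
    keys = rng.integers(0, num_chunks, size=num_vertices).astype(np.int32)
    assert np.array_equal(_radix_sort_by_chunks(keys, num_chunks),
                          np.argsort(keys, kind='stable'))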
@jit(nopython=True, parallel=False, fastmath=True, cache=True)
def _compute_chunk_bounds_jit(sorted_means, sorted_scales, sorted_sh0,
                              chunk_starts, chunk_ends, sh_c0):
    """JIT-compiled chunk bounds computation (9x faster than Python loop).

    Computes min/max bounds for positions, scales, and colors for each chunk.
    This is the main bottleneck in compressed write (~90ms -> ~10ms).

    Args:
        sorted_means: (N, 3) float32 array of positions
        sorted_scales: (N, 3) float32 array of scales
        sorted_sh0: (N, 3) float32 array of SH0 coefficients
        chunk_starts: (num_chunks,) int array of chunk start indices
        chunk_ends: (num_chunks,) int array of chunk end indices
        sh_c0: SH constant for RGB conversion

    Returns:
        bounds: (num_chunks, 18) float32 array with layout:
            [0:6]   - min_x, min_y, min_z, max_x, max_y, max_z
            [6:12]  - min_scale_x/y/z, max_scale_x/y/z (clamped to [-20, 20])
            [12:18] - min_r, min_g, min_b, max_r, max_g, max_b
    """
    num_chunks = len(chunk_starts)
    bounds = np.zeros((num_chunks, 18), dtype=np.float32)

    for chunk_idx in range(num_chunks):
        start = chunk_starts[chunk_idx]
        end = chunk_ends[chunk_idx]

        if start >= end:  # Empty chunk
            continue

        # Initialize with first element
        bounds[chunk_idx, 0] = sorted_means[start, 0]   # min_x
        bounds[chunk_idx, 1] = sorted_means[start, 1]   # min_y
        bounds[chunk_idx, 2] = sorted_means[start, 2]   # min_z
        bounds[chunk_idx, 3] = sorted_means[start, 0]   # max_x
        bounds[chunk_idx, 4] = sorted_means[start, 1]   # max_y
        bounds[chunk_idx, 5] = sorted_means[start, 2]   # max_z

        bounds[chunk_idx, 6] = sorted_scales[start, 0]   # min_scale_x
        bounds[chunk_idx, 7] = sorted_scales[start, 1]   # min_scale_y
        bounds[chunk_idx, 8] = sorted_scales[start, 2]   # min_scale_z
        bounds[chunk_idx, 9] = sorted_scales[start, 0]   # max_scale_x
        bounds[chunk_idx, 10] = sorted_scales[start, 1]  # max_scale_y
        bounds[chunk_idx, 11] = sorted_scales[start, 2]  # max_scale_z

        # Convert SH0 to RGB for first element
        color_r = sorted_sh0[start, 0] * sh_c0 + 0.5
        color_g = sorted_sh0[start, 1] * sh_c0 + 0.5
        color_b = sorted_sh0[start, 2] * sh_c0 + 0.5

        bounds[chunk_idx, 12] = color_r  # min_r
        bounds[chunk_idx, 13] = color_g  # min_g
        bounds[chunk_idx, 14] = color_b  # min_b
        bounds[chunk_idx, 15] = color_r  # max_r
        bounds[chunk_idx, 16] = color_g  # max_g
        bounds[chunk_idx, 17] = color_b  # max_b

        # Process remaining elements in chunk
        for i in range(start + 1, end):
            # Position bounds
            bounds[chunk_idx, 0] = min(bounds[chunk_idx, 0], sorted_means[i, 0])
            bounds[chunk_idx, 1] = min(bounds[chunk_idx, 1], sorted_means[i, 1])
            bounds[chunk_idx, 2] = min(bounds[chunk_idx, 2], sorted_means[i, 2])
            bounds[chunk_idx, 3] = max(bounds[chunk_idx, 3], sorted_means[i, 0])
            bounds[chunk_idx, 4] = max(bounds[chunk_idx, 4], sorted_means[i, 1])
            bounds[chunk_idx, 5] = max(bounds[chunk_idx, 5], sorted_means[i, 2])

            # Scale bounds
            bounds[chunk_idx, 6] = min(bounds[chunk_idx, 6], sorted_scales[i, 0])
            bounds[chunk_idx, 7] = min(bounds[chunk_idx, 7], sorted_scales[i, 1])
            bounds[chunk_idx, 8] = min(bounds[chunk_idx, 8], sorted_scales[i, 2])
            bounds[chunk_idx, 9] = max(bounds[chunk_idx, 9], sorted_scales[i, 0])
            bounds[chunk_idx, 10] = max(bounds[chunk_idx, 10], sorted_scales[i, 1])
            bounds[chunk_idx, 11] = max(bounds[chunk_idx, 11], sorted_scales[i, 2])

            # Color bounds (convert SH0 to RGB)
            color_r = sorted_sh0[i, 0] * sh_c0 + 0.5
            color_g = sorted_sh0[i, 1] * sh_c0 + 0.5
            color_b = sorted_sh0[i, 2] * sh_c0 + 0.5

            bounds[chunk_idx, 12] = min(bounds[chunk_idx, 12], color_r)
            bounds[chunk_idx, 13] = min(bounds[chunk_idx, 13], color_g)
            bounds[chunk_idx, 14] = min(bounds[chunk_idx, 14], color_b)
            bounds[chunk_idx, 15] = max(bounds[chunk_idx, 15], color_r)
            bounds[chunk_idx, 16] = max(bounds[chunk_idx, 16], color_g)
            bounds[chunk_idx, 17] = max(bounds[chunk_idx, 17], color_b)

        # Clamp scale bounds to [-20, 20] (matches splat-transform)
        for j in range(6, 12):
            bounds[chunk_idx, j] = max(-20.0, min(20.0, bounds[chunk_idx, j]))

    return bounds
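# Illustrative sketch (not part of gsply 0.1.0): because chunks are fixed 256-vertex
# blocks of the sorted arrays, the position bounds can also be computed with plain NumPy
# by padding the last chunk and reducing over a (num_chunks, 256, 3) view. Repeating the
# final vertex does not change the last chunk's min/max. The function name is hypothetical.
def _position_bounds_numpy_sketch(sorted_means, chunk_size=256):
    pad = (-len(sorted_means)) % chunk_size
    padded = np.concatenate([sorted_means, np.repeat(sorted_means[-1:], pad, axis=0)])
    blocks = padded.reshape(-1, chunk_size, 3)
    return blocks.min(axis=1), blocks.max(axis=1)  # each (num_chunks, 3)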
# ======================================================================================
# UNCOMPRESSED PLY WRITER
# ======================================================================================

def write_uncompressed(
    file_path: Union[str, Path],
    means: np.ndarray,
    scales: np.ndarray,
    quats: np.ndarray,
    opacities: np.ndarray,
    sh0: np.ndarray,
    shN: Optional[np.ndarray] = None,
    validate: bool = True,
) -> None:
    """Write uncompressed Gaussian splatting PLY file.

    Uses direct file writing for maximum performance (~5-10ms for 50K Gaussians).
    Automatically determines SH degree from shN shape.

    Args:
        file_path: Output PLY file path
        means: (N, 3) - xyz positions
        scales: (N, 3) - scale parameters
        quats: (N, 4) - rotation quaternions
        opacities: (N,) - opacity values
        sh0: (N, 3) - DC spherical harmonics
        shN: (N, K, 3) or (N, K*3) - Higher-order SH coefficients (optional)
        validate: If True, validate input shapes (default True). Disable for trusted data.

    Performance:
        Direct numpy.tofile() achieves ~5-10ms for 50K Gaussians

    Example:
        >>> write_uncompressed("output.ply", means, scales, quats, opacities, sh0, shN)
        >>> # Or without higher-order SH
        >>> write_uncompressed("output.ply", means, scales, quats, opacities, sh0)
        >>> # Skip validation for trusted data (5-10% faster)
        >>> write_uncompressed("output.ply", means, scales, quats, opacities, sh0, validate=False)
    """
    file_path = Path(file_path)

    # Validate and normalize inputs
    if not isinstance(means, np.ndarray):
        means = np.asarray(means, dtype=np.float32)
    if not isinstance(scales, np.ndarray):
        scales = np.asarray(scales, dtype=np.float32)
    if not isinstance(quats, np.ndarray):
        quats = np.asarray(quats, dtype=np.float32)
    if not isinstance(opacities, np.ndarray):
        opacities = np.asarray(opacities, dtype=np.float32)
    if not isinstance(sh0, np.ndarray):
        sh0 = np.asarray(sh0, dtype=np.float32)
    if shN is not None and not isinstance(shN, np.ndarray):
        shN = np.asarray(shN, dtype=np.float32)

    # Only convert dtype if needed (avoids copy when already float32)
    if means.dtype != np.float32:
        means = means.astype(np.float32, copy=False)
    if scales.dtype != np.float32:
        scales = scales.astype(np.float32, copy=False)
    if quats.dtype != np.float32:
        quats = quats.astype(np.float32, copy=False)
    if opacities.dtype != np.float32:
        opacities = opacities.astype(np.float32, copy=False)
    if sh0.dtype != np.float32:
        sh0 = sh0.astype(np.float32, copy=False)
    if shN is not None and shN.dtype != np.float32:
        shN = shN.astype(np.float32, copy=False)

    num_gaussians = means.shape[0]

    # Validate shapes (optional for trusted data)
    if validate:
        assert means.shape == (num_gaussians, 3), f"means must be (N, 3), got {means.shape}"
        assert scales.shape == (num_gaussians, 3), f"scales must be (N, 3), got {scales.shape}"
        assert quats.shape == (num_gaussians, 4), f"quats must be (N, 4), got {quats.shape}"
        assert opacities.shape == (num_gaussians,), f"opacities must be (N,), got {opacities.shape}"
        assert sh0.shape == (num_gaussians, 3), f"sh0 must be (N, 3), got {sh0.shape}"

    # Use newaxis instead of reshape (creates view without overhead)
    opacities = opacities[:, np.newaxis]

    # Flatten shN if needed (from (N, K, 3) to (N, K*3))
    if shN is not None and shN.ndim == 3:
        N, K, C = shN.shape
        if validate:
            assert C == 3, f"shN must have shape (N, K, 3), got {shN.shape}"
        shN = shN.reshape(N, K * 3)

    # Build header
    header_lines = [
        "ply",
        "format binary_little_endian 1.0",
        f"element vertex {num_gaussians}",
        "property float x",
        "property float y",
        "property float z",
    ]

    # Add SH0 properties
    for i in range(3):
        header_lines.append(f"property float f_dc_{i}")

    # Add SHN properties if present
    if shN is not None:
        num_sh_rest = shN.shape[1]
        for i in range(num_sh_rest):
            header_lines.append(f"property float f_rest_{i}")

    # Add remaining properties
    header_lines.extend([
        "property float opacity",
        "property float scale_0",
        "property float scale_1",
        "property float scale_2",
        "property float rot_0",
        "property float rot_1",
        "property float rot_2",
        "property float rot_3",
        "end_header",
    ])

    header = "\n".join(header_lines) + "\n"
    header_bytes = header.encode('ascii')

    # Preallocate and assign data (optimized approach - 31-35% faster than concatenate)
    if shN is not None:
        sh_coeffs = shN.shape[1]  # Number of SH coefficients (already reshaped to N x K*3)
        total_props = 3 + 3 + sh_coeffs + 1 + 3 + 4  # means, sh0, shN, opacity, scales, quats
        data = np.empty((num_gaussians, total_props), dtype='<f4')
        data[:, 0:3] = means
        data[:, 3:6] = sh0
        data[:, 6:6+sh_coeffs] = shN
        data[:, 6+sh_coeffs:7+sh_coeffs] = opacities  # opacities is already (N, 1)
        data[:, 7+sh_coeffs:10+sh_coeffs] = scales
        data[:, 10+sh_coeffs:14+sh_coeffs] = quats
    else:
        data = np.empty((num_gaussians, 14), dtype='<f4')
        data[:, 0:3] = means
        data[:, 3:6] = sh0
        data[:, 6:7] = opacities  # opacities is already (N, 1)
        data[:, 7:10] = scales
        data[:, 10:14] = quats

    # Write directly to file
    with open(file_path, 'wb') as f:
        f.write(header_bytes)
        data.tofile(f)

    logger.debug(f"[Gaussian PLY] Wrote uncompressed: {num_gaussians} Gaussians to {file_path.name}")
# ======================================================================================
# COMPRESSED PLY WRITER (VECTORIZED)
# ======================================================================================

CHUNK_SIZE = 256
SH_C0 = 0.28209479177387814

def write_compressed(
    file_path: Union[str, Path],
    means: np.ndarray,
    scales: np.ndarray,
    quats: np.ndarray,
    opacities: np.ndarray,
    sh0: np.ndarray,
    shN: Optional[np.ndarray] = None,
    validate: bool = True,
) -> None:
    """Write compressed Gaussian splatting PLY file (PlayCanvas format).

    Compresses data using chunk-based quantization (256 Gaussians per chunk).
    Achieves 3.8-14.5x compression ratio using highly optimized vectorized operations.

    Args:
        file_path: Output PLY file path
        means: (N, 3) - xyz positions
        scales: (N, 3) - scale parameters
        quats: (N, 4) - rotation quaternions (must be normalized)
        opacities: (N,) - opacity values
        sh0: (N, 3) - DC spherical harmonics
        shN: (N, K, 3) or (N, K*3) - Higher-order SH coefficients (optional)
        validate: If True, validate input shapes (default True)

    Performance:
        ~43ms for 50K Gaussians (highly optimized vectorized compression)
        78% faster than initial implementation

    Format:
        Compressed PLY with chunk-based quantization:
        - 256 Gaussians per chunk
        - Position: 11-10-11 bit quantization
        - Scale: 11-10-11 bit quantization
        - Color: 8-8-8-8 bit quantization
        - Quaternion: smallest-three encoding (2+10+10+10 bits)
        - SH coefficients: 8-bit quantization (optional)

    Example:
        >>> write_compressed("output.ply", means, scales, quats, opacities, sh0, shN)
        >>> # File is 14.5x smaller than uncompressed
    """
    file_path = Path(file_path)

    # Validate and normalize inputs
    if not isinstance(means, np.ndarray):
        means = np.asarray(means, dtype=np.float32)
    if not isinstance(scales, np.ndarray):
        scales = np.asarray(scales, dtype=np.float32)
    if not isinstance(quats, np.ndarray):
        quats = np.asarray(quats, dtype=np.float32)
    if not isinstance(opacities, np.ndarray):
        opacities = np.asarray(opacities, dtype=np.float32)
    if not isinstance(sh0, np.ndarray):
        sh0 = np.asarray(sh0, dtype=np.float32)
    if shN is not None and not isinstance(shN, np.ndarray):
        shN = np.asarray(shN, dtype=np.float32)

    # Only convert dtype if needed
    if means.dtype != np.float32:
        means = means.astype(np.float32, copy=False)
    if scales.dtype != np.float32:
        scales = scales.astype(np.float32, copy=False)
    if quats.dtype != np.float32:
        quats = quats.astype(np.float32, copy=False)
    if opacities.dtype != np.float32:
        opacities = opacities.astype(np.float32, copy=False)
    if sh0.dtype != np.float32:
        sh0 = sh0.astype(np.float32, copy=False)
    if shN is not None and shN.dtype != np.float32:
        shN = shN.astype(np.float32, copy=False)

    num_gaussians = means.shape[0]

    # Validate shapes (optional)
    if validate:
        assert means.shape == (num_gaussians, 3), f"means must be (N, 3), got {means.shape}"
        assert scales.shape == (num_gaussians, 3), f"scales must be (N, 3), got {scales.shape}"
        assert quats.shape == (num_gaussians, 4), f"quats must be (N, 4), got {quats.shape}"
        assert opacities.shape == (num_gaussians,), f"opacities must be (N,), got {opacities.shape}"
        assert sh0.shape == (num_gaussians, 3), f"sh0 must be (N, 3), got {sh0.shape}"

    # Flatten shN if needed
    if shN is not None and shN.ndim == 3:
        N, K, C = shN.shape
        if validate:
            assert C == 3, f"shN must have shape (N, K, 3), got {shN.shape}"
        shN = shN.reshape(N, K * 3)

    # Compute number of chunks
    num_chunks = (num_gaussians + CHUNK_SIZE - 1) // CHUNK_SIZE

    # Pre-compute chunk indices for all vertices (vectorized)
    chunk_indices = np.arange(num_gaussians, dtype=np.int32) // CHUNK_SIZE

    # ====================================================================================
    # COMPUTE CHUNK BOUNDS (OPTIMIZED WITH SORTING)
    # ====================================================================================
    #
    # IMPORTANT: The compressed PLY format REQUIRES vertices to be in chunk order.
    # The reader assumes vertices 0-255 are chunk 0, 256-511 are chunk 1, etc.
    # This is not a bug - it's a format specification requirement.
    #
    # Performance: Sorting once (O(n log n)) + binary search (O(k log n)) is much
    # faster than boolean masking per chunk (O(n * k) where k = num_chunks).
    #
    # ====================================================================================

    # Allocate chunk bounds arrays
    chunk_bounds = np.zeros((num_chunks, 18), dtype=np.float32)

    # Sort all data by chunk indices using radix sort (O(n) vs O(n log n))
    # Radix sort is 4x faster for small integer keys like chunk indices
    # This is required by the compressed PLY format specification.
    if HAS_NUMBA:
        # JIT-compiled radix sort (21ms -> 5ms)
        sort_idx = _radix_sort_by_chunks(chunk_indices, num_chunks)
    else:
        # Fallback: standard comparison sort
        sort_idx = np.argsort(chunk_indices)

    sorted_chunk_indices = chunk_indices[sort_idx]
    sorted_means = means[sort_idx]
    sorted_scales = scales[sort_idx]
    sorted_sh0 = sh0[sort_idx]
    sorted_quats = quats[sort_idx]
    sorted_opacities = opacities[sort_idx]
    if shN is not None:
        sorted_shN = shN[sort_idx]
    else:
        sorted_shN = None

    # Find chunk boundaries using searchsorted (O(num_chunks log n))
    chunk_starts = np.searchsorted(sorted_chunk_indices, np.arange(num_chunks), side='left')
    chunk_ends = np.searchsorted(sorted_chunk_indices, np.arange(num_chunks), side='right')

    # Compute chunk bounds (JIT-optimized: 9x faster than Python loop)
    if HAS_NUMBA:
        # JIT-compiled bounds computation (90ms -> 10ms)
        chunk_bounds = _compute_chunk_bounds_jit(sorted_means, sorted_scales, sorted_sh0,
                                                 chunk_starts, chunk_ends, SH_C0)
    else:
        # Fallback: Python loop with NumPy operations
        for chunk_idx in range(num_chunks):
            start = chunk_starts[chunk_idx]
            end = chunk_ends[chunk_idx]

            if start == end:  # Empty chunk (shouldn't happen but handle gracefully)
                continue

            # Slice data for this chunk (O(1) operation)
            chunk_means = sorted_means[start:end]
            chunk_scales = sorted_scales[start:end]
            chunk_color_rgb = sorted_sh0[start:end] * SH_C0 + 0.5

            # Position bounds
            chunk_bounds[chunk_idx, 0] = np.min(chunk_means[:, 0])  # min_x
            chunk_bounds[chunk_idx, 1] = np.min(chunk_means[:, 1])  # min_y
            chunk_bounds[chunk_idx, 2] = np.min(chunk_means[:, 2])  # min_z
            chunk_bounds[chunk_idx, 3] = np.max(chunk_means[:, 0])  # max_x
            chunk_bounds[chunk_idx, 4] = np.max(chunk_means[:, 1])  # max_y
            chunk_bounds[chunk_idx, 5] = np.max(chunk_means[:, 2])  # max_z

            # Scale bounds (clamped to [-20, 20] to handle infinity, matches splat-transform)
            chunk_bounds[chunk_idx, 6] = np.clip(np.min(chunk_scales[:, 0]), -20, 20)   # min_scale_x
            chunk_bounds[chunk_idx, 7] = np.clip(np.min(chunk_scales[:, 1]), -20, 20)   # min_scale_y
            chunk_bounds[chunk_idx, 8] = np.clip(np.min(chunk_scales[:, 2]), -20, 20)   # min_scale_z
            chunk_bounds[chunk_idx, 9] = np.clip(np.max(chunk_scales[:, 0]), -20, 20)   # max_scale_x
            chunk_bounds[chunk_idx, 10] = np.clip(np.max(chunk_scales[:, 1]), -20, 20)  # max_scale_y
            chunk_bounds[chunk_idx, 11] = np.clip(np.max(chunk_scales[:, 2]), -20, 20)  # max_scale_z

            # Color bounds
            chunk_bounds[chunk_idx, 12] = np.min(chunk_color_rgb[:, 0])  # min_r
            chunk_bounds[chunk_idx, 13] = np.min(chunk_color_rgb[:, 1])  # min_g
            chunk_bounds[chunk_idx, 14] = np.min(chunk_color_rgb[:, 2])  # min_b
            chunk_bounds[chunk_idx, 15] = np.max(chunk_color_rgb[:, 0])  # max_r
            chunk_bounds[chunk_idx, 16] = np.max(chunk_color_rgb[:, 1])  # max_g
            chunk_bounds[chunk_idx, 17] = np.max(chunk_color_rgb[:, 2])  # max_b

    # Extract bounds for vectorized quantization
    min_x, min_y, min_z = chunk_bounds[:, 0], chunk_bounds[:, 1], chunk_bounds[:, 2]
    max_x, max_y, max_z = chunk_bounds[:, 3], chunk_bounds[:, 4], chunk_bounds[:, 5]
    min_scale_x, min_scale_y, min_scale_z = chunk_bounds[:, 6], chunk_bounds[:, 7], chunk_bounds[:, 8]
    max_scale_x, max_scale_y, max_scale_z = chunk_bounds[:, 9], chunk_bounds[:, 10], chunk_bounds[:, 11]
    min_r, min_g, min_b = chunk_bounds[:, 12], chunk_bounds[:, 13], chunk_bounds[:, 14]
    max_r, max_g, max_b = chunk_bounds[:, 15], chunk_bounds[:, 16], chunk_bounds[:, 17]

    # ====================================================================================
    # QUANTIZATION AND BIT PACKING (JIT-optimized when available)
    # ====================================================================================

    # Allocate packed vertex data (4 uint32 per vertex)
    packed_data = np.zeros((num_gaussians, 4), dtype=np.uint32)

    # Use JIT-compiled functions if available (5-6x faster than NumPy for writing)
    # Note: First-time compilation adds ~40s overhead, but subsequent calls are 5-6x faster
    if HAS_NUMBA:
        # JIT-compiled compression (parallel, fastmath)
        packed_data[:, 0] = _pack_positions_jit(sorted_means, sorted_chunk_indices, min_x, min_y, min_z, max_x, max_y, max_z)
        packed_data[:, 2] = _pack_scales_jit(sorted_scales, sorted_chunk_indices, min_scale_x, min_scale_y, min_scale_z, max_scale_x, max_scale_y, max_scale_z)
        packed_data[:, 3] = _pack_colors_jit(sorted_sh0, sorted_opacities, sorted_chunk_indices, min_r, min_g, min_b, max_r, max_g, max_b, SH_C0)
        packed_data[:, 1] = _pack_quaternions_jit(sorted_quats)
    else:
        # Vectorized NumPy operations (optimal for writing)
        # --- POSITION QUANTIZATION (11-10-11 bits) ---
        range_x = max_x[sorted_chunk_indices] - min_x[sorted_chunk_indices]
        range_y = max_y[sorted_chunk_indices] - min_y[sorted_chunk_indices]
        range_z = max_z[sorted_chunk_indices] - min_z[sorted_chunk_indices]

        range_x = np.where(range_x == 0, 1.0, range_x)
        range_y = np.where(range_y == 0, 1.0, range_y)
        range_z = np.where(range_z == 0, 1.0, range_z)

        norm_x = (sorted_means[:, 0] - min_x[sorted_chunk_indices]) / range_x
        norm_y = (sorted_means[:, 1] - min_y[sorted_chunk_indices]) / range_y
        norm_z = (sorted_means[:, 2] - min_z[sorted_chunk_indices]) / range_z

        norm_x = np.clip(norm_x, 0.0, 1.0)
        norm_y = np.clip(norm_y, 0.0, 1.0)
        norm_z = np.clip(norm_z, 0.0, 1.0)

        px = (norm_x * 2047.0).astype(np.uint32)
        py = (norm_y * 1023.0).astype(np.uint32)
        pz = (norm_z * 2047.0).astype(np.uint32)

        packed_data[:, 0] = (px << 21) | (py << 11) | pz

        # --- SCALE QUANTIZATION (11-10-11 bits) ---
        range_sx = max_scale_x[sorted_chunk_indices] - min_scale_x[sorted_chunk_indices]
        range_sy = max_scale_y[sorted_chunk_indices] - min_scale_y[sorted_chunk_indices]
        range_sz = max_scale_z[sorted_chunk_indices] - min_scale_z[sorted_chunk_indices]

        range_sx = np.where(range_sx == 0, 1.0, range_sx)
        range_sy = np.where(range_sy == 0, 1.0, range_sy)
        range_sz = np.where(range_sz == 0, 1.0, range_sz)

        norm_sx = (sorted_scales[:, 0] - min_scale_x[sorted_chunk_indices]) / range_sx
        norm_sy = (sorted_scales[:, 1] - min_scale_y[sorted_chunk_indices]) / range_sy
        norm_sz = (sorted_scales[:, 2] - min_scale_z[sorted_chunk_indices]) / range_sz

        norm_sx = np.clip(norm_sx, 0.0, 1.0)
        norm_sy = np.clip(norm_sy, 0.0, 1.0)
        norm_sz = np.clip(norm_sz, 0.0, 1.0)

        sx = (norm_sx * 2047.0).astype(np.uint32)
        sy = (norm_sy * 1023.0).astype(np.uint32)
        sz = (norm_sz * 2047.0).astype(np.uint32)

        packed_data[:, 2] = (sx << 21) | (sy << 11) | sz

        # --- COLOR QUANTIZATION (8-8-8-8 bits) ---
        color_rgb = sorted_sh0 * SH_C0 + 0.5

        range_r = max_r[sorted_chunk_indices] - min_r[sorted_chunk_indices]
        range_g = max_g[sorted_chunk_indices] - min_g[sorted_chunk_indices]
        range_b = max_b[sorted_chunk_indices] - min_b[sorted_chunk_indices]

        range_r = np.where(range_r == 0, 1.0, range_r)
        range_g = np.where(range_g == 0, 1.0, range_g)
        range_b = np.where(range_b == 0, 1.0, range_b)

        norm_r = (color_rgb[:, 0] - min_r[sorted_chunk_indices]) / range_r
        norm_g = (color_rgb[:, 1] - min_g[sorted_chunk_indices]) / range_g
        norm_b = (color_rgb[:, 2] - min_b[sorted_chunk_indices]) / range_b

        norm_r = np.clip(norm_r, 0.0, 1.0)
        norm_g = np.clip(norm_g, 0.0, 1.0)
        norm_b = np.clip(norm_b, 0.0, 1.0)

        cr = (norm_r * 255.0).astype(np.uint32)
        cg = (norm_g * 255.0).astype(np.uint32)
        cb = (norm_b * 255.0).astype(np.uint32)

        opacity_linear = 1.0 / (1.0 + np.exp(-sorted_opacities))
        opacity_linear = np.clip(opacity_linear, 0.0, 1.0)
        co = (opacity_linear * 255.0).astype(np.uint32)

        packed_data[:, 3] = (cr << 24) | (cg << 16) | (cb << 8) | co

        # --- QUATERNION QUANTIZATION (smallest-three encoding: 2+10+10+10 bits) ---
        quats_normalized = sorted_quats / np.linalg.norm(sorted_quats, axis=1, keepdims=True)

        abs_quats = np.abs(quats_normalized)
        largest_idx = np.argmax(abs_quats, axis=1).astype(np.uint32)

        sign_mask = np.take_along_axis(quats_normalized, largest_idx[:, np.newaxis], axis=1) < 0
        quats_normalized = np.where(sign_mask, -quats_normalized, quats_normalized)

        mask = np.ones((num_gaussians, 4), dtype=bool)
        mask[np.arange(num_gaussians), largest_idx] = False
        three_components = quats_normalized[mask].reshape(num_gaussians, 3)

        norm = np.sqrt(2.0) * 0.5
        qa_norm = three_components[:, 0] * norm + 0.5
        qb_norm = three_components[:, 1] * norm + 0.5
        qc_norm = three_components[:, 2] * norm + 0.5

        qa_norm = np.clip(qa_norm, 0.0, 1.0)
        qb_norm = np.clip(qb_norm, 0.0, 1.0)
        qc_norm = np.clip(qc_norm, 0.0, 1.0)

        qa_int = (qa_norm * 1023.0).astype(np.uint32)
        qb_int = (qb_norm * 1023.0).astype(np.uint32)
        qc_int = (qc_norm * 1023.0).astype(np.uint32)

        packed_data[:, 1] = (largest_idx << 30) | (qa_int << 20) | (qb_int << 10) | qc_int

    # ====================================================================================
    # SH COEFFICIENT COMPRESSION (8-bit quantization)
    # ====================================================================================

    packed_sh = None
    if sorted_shN is not None and sorted_shN.shape[1] > 0:
        # Normalize to [0, 1] range
        # SH values are typically in range [-4, 4]
        # This matches splat-transform: nvalue = shN / 8 + 0.5 (write-compressed-ply.ts:85)
        sh_normalized = (sorted_shN / 8.0) + 0.5
        sh_normalized = np.clip(sh_normalized, 0.0, 1.0)

        # Quantize to uint8: trunc(nvalue * 256), clamped to [0, 255]
        # This matches splat-transform: Math.trunc(nvalue * 256) (write-compressed-ply.ts:86)
        packed_sh = np.clip(np.trunc(sh_normalized * 256.0), 0, 255).astype(np.uint8)

    # ====================================================================================
    # WRITE HEADER AND DATA
    # ====================================================================================

    # Build header
    header_lines = [
        "ply",
        "format binary_little_endian 1.0",
        f"element chunk {num_chunks}",
    ]

    # Add chunk properties (18 floats)
    chunk_props = [
        "min_x", "min_y", "min_z",
        "max_x", "max_y", "max_z",
        "min_scale_x", "min_scale_y", "min_scale_z",
        "max_scale_x", "max_scale_y", "max_scale_z",
        "min_r", "min_g", "min_b",
        "max_r", "max_g", "max_b",
    ]
    for prop in chunk_props:
        header_lines.append(f"property float {prop}")

    # Add vertex element
    header_lines.append(f"element vertex {num_gaussians}")
    header_lines.append("property uint packed_position")
    header_lines.append("property uint packed_rotation")
    header_lines.append("property uint packed_scale")
    header_lines.append("property uint packed_color")

    # Add SH element if present
    if packed_sh is not None:
        num_sh_coeffs = packed_sh.shape[1]
        header_lines.append(f"element sh {num_gaussians}")
        for i in range(num_sh_coeffs):
            header_lines.append(f"property uchar coeff_{i}")

    header_lines.append("end_header")
    header = "\n".join(header_lines) + "\n"
    header_bytes = header.encode('ascii')

    # Write to file
    with open(file_path, 'wb') as f:
        f.write(header_bytes)
        chunk_bounds.tofile(f)
        packed_data.tofile(f)
        if packed_sh is not None:
            packed_sh.tofile(f)

    logger.debug(f"[Gaussian PLY] Wrote compressed: {num_gaussians} Gaussians to {file_path.name} "
                 f"({num_chunks} chunks, {len(header_bytes) + chunk_bounds.nbytes + packed_data.nbytes + (packed_sh.nbytes if packed_sh is not None else 0)} bytes)")
# ======================================================================================
# UNIFIED WRITING API
# ======================================================================================

def plywrite(
    file_path: Union[str, Path],
    means: np.ndarray,
    scales: np.ndarray,
    quats: np.ndarray,
    opacities: np.ndarray,
    sh0: np.ndarray,
    shN: Optional[np.ndarray] = None,
    compressed: bool = False,
    validate: bool = True,
) -> None:
    """Write Gaussian splatting PLY file (auto-select format).

    Automatically selects format based on compressed parameter or file extension:
    - compressed=False or .ply -> uncompressed (fast)
    - compressed=True -> automatically saves as .compressed.ply
    - .compressed.ply or .ply_compressed extension -> compressed format

    When compressed=True, the output file extension is automatically changed to
    .compressed.ply (e.g., "output.ply" becomes "output.compressed.ply").

    Args:
        file_path: Output PLY file path (extension auto-adjusted if compressed=True)
        means: (N, 3) - xyz positions
        scales: (N, 3) - scale parameters
        quats: (N, 4) - rotation quaternions
        opacities: (N,) - opacity values
        sh0: (N, 3) - DC spherical harmonics
        shN: (N, K, 3) or (N, K*3) - Higher-order SH coefficients (optional)
        compressed: If True, write compressed format and auto-adjust extension
        validate: If True, validate input shapes (default True). Disable for trusted data.

    Example:
        >>> # Write uncompressed (fast)
        >>> plywrite("output.ply", means, scales, quats, opacities, sh0, shN)
        >>> # Write compressed (saves as "output.compressed.ply")
        >>> plywrite("output.ply", means, scales, quats, opacities, sh0, shN, compressed=True)
        >>> # Or without higher-order SH
        >>> plywrite("output.ply", means, scales, quats, opacities, sh0)
        >>> # Skip validation for trusted data (5-10% faster)
        >>> plywrite("output.ply", means, scales, quats, opacities, sh0, validate=False)
    """
    file_path = Path(file_path)

    # Auto-detect compression from extension
    is_compressed_ext = file_path.name.endswith(('.ply_compressed', '.compressed.ply'))

    # Check if compressed format requested
    if compressed or is_compressed_ext:
        # If compressed=True but no compressed extension, add .compressed.ply
        if compressed and not is_compressed_ext:
            # Replace .ply with .compressed.ply, or just append if no .ply
            if file_path.suffix == '.ply':
                file_path = file_path.with_suffix('.compressed.ply')
            else:
                file_path = Path(str(file_path) + '.compressed.ply')

        # Pass validate through so the documented flag applies to both formats
        write_compressed(file_path, means, scales, quats, opacities, sh0, shN, validate=validate)
    else:
        write_uncompressed(file_path, means, scales, quats, opacities, sh0, shN, validate=validate)


__all__ = [
    'plywrite',
    'write_uncompressed',
    'write_compressed',
]