torchflat 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
torchflat/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """TorchFlat: GPU-native photometric preprocessing for exoplanet transit searches."""
2
+
3
+ __version__ = "0.8.0"
4
+
5
+ from torchflat.umi import umi_detrend
6
+ from torchflat.pipeline import preprocess_sector, preprocess_track_a, preprocess_track_b
7
+ from torchflat.windows import DEFAULT_WINDOW_SCALES
8
+
9
# Public API: the names re-exported at the package top level.
__all__ = [
    "umi_detrend",
    "preprocess_sector",
    "preprocess_track_a",
    "preprocess_track_b",
    "DEFAULT_WINDOW_SCALES",
]
@@ -0,0 +1,289 @@
1
+ """JIT-compile and load UMI CUDA/HIP kernels.
2
+
3
+ Provides two kernels:
4
+ - ``masked_median``: O(n) median via quickselect (legacy, used by rolling_clip)
5
+ - ``umi_median_mad``: O(n) median + MAD in single call (used by umi_detrend)
6
+
7
+ Kernels are compiled on first use and cached for subsequent imports.
8
+ Falls back gracefully if compilation fails (no GPU, no toolkit, etc.).
9
+ Set TORCHFLAT_NO_KERNEL=1 to disable kernels entirely.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import ctypes
15
+ import importlib.util
16
+ import logging
17
+ import os
18
+ import subprocess
19
+ import sys
20
+ import sysconfig
21
+ from pathlib import Path
22
+
23
+ logger = logging.getLogger("torchflat")
24
+
25
+ _umi_kernel_module = None
26
+ _umi_kernel_load_attempted = False
27
+
28
+
29
+ def _short_path(p: str) -> str:
30
+ """Get Windows 8.3 short path (avoids spaces breaking compiler args)."""
31
+ if sys.platform != "win32":
32
+ return p
33
+ buf = ctypes.create_unicode_buffer(260)
34
+ ctypes.windll.kernel32.GetShortPathNameW(str(p), buf, 260)
35
+ return buf.value or str(p)
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # UMI median+MAD kernel (used by umi_detrend)
40
+ # ---------------------------------------------------------------------------
41
+
42
def _get_umi_kernel():
    """Load the UMI median+MAD kernel module, compiling it on first use.

    Returns the compiled extension module, or ``None`` when the kernel is
    unavailable (disabled via ``TORCHFLAT_NO_KERNEL=1``, no GPU, or
    compilation failed).  The outcome -- including a failed attempt -- is
    cached in module globals so the load is only ever tried once per process.
    """
    global _umi_kernel_module, _umi_kernel_load_attempted

    if _umi_kernel_load_attempted:
        return _umi_kernel_module

    _umi_kernel_load_attempted = True

    # Honor the opt-out *before* touching torch.cuda: querying CUDA
    # availability initializes the driver, which the env var (documented as
    # "disable kernels entirely") is meant to let users skip.  Previously
    # this check ran after the CUDA probe and was silently bypassed on
    # GPU-less machines.
    if os.environ.get("TORCHFLAT_NO_KERNEL", "0") == "1":
        logger.warning(
            "UMI kernel disabled by TORCHFLAT_NO_KERNEL=1. "
            "Using torch.sort fallback (6x slower). "
            "Unset the variable to enable the kernel."
        )
        return None

    import torch

    if not torch.cuda.is_available():
        return None

    csrc_dir = Path(__file__).parent / "csrc"
    build_dir = csrc_dir / "build"
    pyd_name = "torchflat_umi_ext"
    pyd_path = build_dir / f"{pyd_name}.pyd"

    # Ensure ROCm DLLs are findable (amdhip64.dll etc.)
    _add_rocm_dll_dirs()

    # Fast path: reuse a previously compiled extension if one is cached.
    if pyd_path.exists():
        try:
            spec = importlib.util.spec_from_file_location(pyd_name, str(pyd_path))
            mod = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(mod)
            logger.info("Loaded cached UMI kernel from %s", pyd_path)
            _umi_kernel_module = mod
            return _umi_kernel_module
        except Exception as e:
            # Stale or ABI-incompatible binary: delete it and rebuild below.
            logger.warning("Failed to load cached UMI kernel: %s", e)
            pyd_path.unlink(missing_ok=True)

    try:
        if torch.version.hip:
            _umi_kernel_module = _compile_hip_rocm72(
                csrc_dir, pyd_name,
                csrc_dir / "build" / "umi_kernel_hip.cpp",
                csrc_dir / "umi_ext.cpp",
            )
        else:
            _umi_kernel_module = _compile_cuda_umi(csrc_dir)
        if _umi_kernel_module is not None:
            logger.info("UMI kernel compiled and loaded successfully")
    except Exception as e:
        # Compilation failure is non-fatal: callers fall back to torch.sort.
        logger.warning("Failed to compile UMI kernel: %s", e)
        _umi_kernel_module = None

    return _umi_kernel_module
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Compilation backends
103
+ # ---------------------------------------------------------------------------
104
+
105
def _compile_hip_rocm72(csrc_dir: Path, pyd_name: str, hip_src: Path, ext_src: Path):
    """Compile a HIP kernel on Windows using ROCm 7.2 SDK from pip.

    Three-step build: compile *hip_src* as HIP device code, compile
    *ext_src* as the C++ Python binding, then link both into a ``.pyd``
    under ``csrc_dir/build`` and import it.

    Args:
        csrc_dir: Directory holding the kernel sources; build artifacts go
            into ``csrc_dir/build``.
        pyd_name: Extension module name (also used for TORCH_EXTENSION_NAME).
        hip_src: Pre-hipified kernel translation unit.
        ext_src: C++ binding translation unit.

    Returns:
        The freshly imported extension module.

    Raises:
        RuntimeError: SDK/compiler/source missing, or any build step fails
            (via ``_run_cmd``).
    """
    import torch

    build_dir = csrc_dir / "build"
    build_dir.mkdir(exist_ok=True)

    pyd_path = build_dir / f"{pyd_name}.pyd"

    rocm_sdk_path = _find_rocm72_sdk()
    if rocm_sdk_path is None:
        raise RuntimeError(
            "ROCm 7.2 SDK not found. Install it with:\n"
            " pip install https://repo.radeon.com/rocm/windows/rocm-rel-7.2/"
            "rocm_sdk_devel-7.2.0.dev0-py3-none-win_amd64.whl\n"
            " pip install https://repo.radeon.com/rocm/windows/rocm-rel-7.2/"
            "rocm_sdk_core-7.2.0.dev0-py3-none-win_amd64.whl"
        )

    clang = str(rocm_sdk_path / "lib" / "llvm" / "bin" / "amdclang++.exe")
    if not os.path.exists(clang):
        raise RuntimeError(f"amdclang++ not found at {clang}")

    if not hip_src.exists():
        raise RuntimeError(f"Hipified kernel source not found at {hip_src}")

    # 8.3 short paths: compiler argument lists break on embedded spaces.
    rocm_sp = _short_path(str(rocm_sdk_path))
    device_lib = _short_path(str(rocm_sdk_path / "lib" / "llvm" / "amdgcn" / "bitcode"))

    # amdclang++ drives the MSVC-compatible link step, so the VS env must be set.
    _setup_msvc_env()

    torch_dir = Path(torch.__file__).parent
    python_inc = _short_path(sysconfig.get_path("include"))
    python_lib = _short_path(str(Path(sysconfig.get_path("stdlib")).parent / "libs"))
    torch_lib = _short_path(str(torch_dir / "lib"))

    inc = [
        f"-I{_short_path(str(torch_dir / 'include'))}",
        f"-I{_short_path(str(torch_dir / 'include' / 'torch' / 'csrc' / 'api' / 'include'))}",
        f"-I{python_inc}",
        f"-I{rocm_sp}/include",
    ]
    defs = [
        "-D__HIP_PLATFORM_AMD__",
        f"-DTORCH_EXTENSION_NAME={pyd_name}",
        "-DTORCH_API_INCLUDE_EXTENSION_H",
    ]

    # Step 1: Compile kernel (.cpp as HIP)
    # NOTE(review): --offload-arch is hard-coded to gfx1200, so the binary
    # only targets that GPU architecture -- confirm whether more archs are needed.
    kernel_obj = build_dir / f"{pyd_name}_kernel.o"
    logger.info("Compiling %s kernel with amdclang++...", pyd_name)
    _run_cmd([
        clang, "-O3", "-c", "-x", "hip",
        f"--rocm-path={rocm_sp}",
        f"--rocm-device-lib-path={device_lib}",
        "--offload-arch=gfx1200",
        *defs, *inc, "-std=c++17", "-w",
        str(hip_src), "-o", str(kernel_obj),
    ])

    # Step 2: Compile binding (.cpp as C++)
    ext_obj = build_dir / f"{pyd_name}_ext.o"
    logger.info("Compiling %s binding...", pyd_name)
    _run_cmd([
        clang, "-O3", "-c",
        *defs, *inc, "-std=c++17", "-w",
        str(ext_src), "-o", str(ext_obj),
    ])

    # Step 3: Link into .pyd
    logger.info("Linking %s...", pyd_name)
    _run_cmd([
        clang, "-shared",
        str(kernel_obj), str(ext_obj),
        f"-L{torch_lib}", "-ltorch", "-ltorch_cpu", "-ltorch_python", "-lc10", "-lc10_hip",
        f"-L{python_lib}", f"-lpython{sys.version_info.major}{sys.version_info.minor}",
        f"-L{rocm_sp}/lib", "-lamdhip64",
        "-o", str(pyd_path),
    ])

    # Import the freshly linked extension directly from its file path.
    spec = importlib.util.spec_from_file_location(pyd_name, str(pyd_path))
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
189
+
190
+
191
+
192
def _compile_cuda_umi(csrc_dir: Path):
    """JIT-compile the UMI median+MAD CUDA kernel via torch's cpp_extension."""
    from torch.utils import cpp_extension

    source_files = [
        str(csrc_dir / "umi_ext.cpp"),
        str(csrc_dir / "umi_kernel.cu"),
    ]
    return cpp_extension.load(
        name="torchflat_umi_ext",
        sources=source_files,
        extra_cuda_cflags=["-O3", "--use_fast_math"],
        verbose=False,
    )
204
+
205
+
206
+ # ---------------------------------------------------------------------------
207
+ # Shared utilities
208
+ # ---------------------------------------------------------------------------
209
+
210
+ def _add_rocm_dll_dirs():
211
+ """Add ROCm SDK lib directories to DLL search path (Windows).
212
+
213
+ Required so the compiled .pyd can find amdhip64.dll at load time.
214
+ """
215
+ if sys.platform != "win32":
216
+ return
217
+ sdk = _find_rocm72_sdk()
218
+ if sdk is None:
219
+ return
220
+ lib_dir = sdk / "lib"
221
+ bin_dir = sdk / "bin"
222
+ for d in [lib_dir, bin_dir]:
223
+ if d.exists():
224
+ try:
225
+ os.add_dll_directory(str(d))
226
+ except OSError:
227
+ pass
228
+ # Also add to PATH as fallback
229
+ if str(d) not in os.environ.get("PATH", ""):
230
+ os.environ["PATH"] = str(d) + os.pathsep + os.environ.get("PATH", "")
231
+
232
+
233
+ def _find_rocm72_sdk() -> Path | None:
234
+ """Find the ROCm 7.2 SDK installed via pip (rocm-sdk-core package)."""
235
+ candidates = [
236
+ Path(sys.prefix) / "Lib" / "site-packages" / "_rocm_sdk_core",
237
+ Path(os.path.expanduser("~")) / "AppData" / "Roaming" / "Python"
238
+ / f"Python{sys.version_info.major}{sys.version_info.minor}"
239
+ / "site-packages" / "_rocm_sdk_core",
240
+ ]
241
+ try:
242
+ import _rocm_sdk_core
243
+ candidates.insert(0, Path(_rocm_sdk_core.__file__).parent)
244
+ except ImportError:
245
+ pass
246
+
247
+ for p in candidates:
248
+ clang = p / "lib" / "llvm" / "bin" / "amdclang++.exe"
249
+ if clang.exists():
250
+ return p
251
+ return None
252
+
253
+
254
+ def _setup_msvc_env():
255
+ """Setup MSVC compiler environment on Windows."""
256
+ if sys.platform != "win32":
257
+ return
258
+ if os.environ.get("VSINSTALLDIR"):
259
+ return
260
+
261
+ vcvars_candidates = [
262
+ r"C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat",
263
+ r"C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat",
264
+ r"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat",
265
+ ]
266
+ for vcvars in vcvars_candidates:
267
+ if os.path.exists(vcvars):
268
+ result = subprocess.run(
269
+ f'cmd /c ""{vcvars}" x64 && set"',
270
+ capture_output=True, text=True, shell=True,
271
+ )
272
+ for line in result.stdout.splitlines():
273
+ if "=" in line:
274
+ k, _, v = line.partition("=")
275
+ os.environ[k] = v
276
+ return
277
+
278
+ logger.warning("MSVC not found, linking may fail")
279
+
280
+
281
+ def _run_cmd(cmd: list[str]):
282
+ """Run a command, raising RuntimeError on failure."""
283
+ result = subprocess.run(cmd, capture_output=True, text=True)
284
+ if result.returncode != 0:
285
+ raise RuntimeError(
286
+ f"Command failed (return {result.returncode}):\n"
287
+ f" {' '.join(cmd[:5])}...\n"
288
+ f" {result.stderr[-500:]}"
289
+ )
torchflat/_utils.py ADDED
@@ -0,0 +1,99 @@
1
+ """Shared utilities: masked median, padding helpers, constants."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import torch
6
+
7
+ # ---------------------------------------------------------------------------
8
+ # Constants
9
+ # ---------------------------------------------------------------------------
10
+
11
# Mission-specific quality bitmasks.  A cadence is treated as invalid when
# (quality & bitmask) != 0 (see the valid-point filters in batching.py).
TESS_QUALITY_BITMASK: int = 0b0000110101111111  # = 3455
KEPLER_QUALITY_BITMASK: int = 0b0001111111111111  # = 8191 (all Kepler quality flags)

# Default is TESS (most common use case)
QUALITY_BITMASK: int = TESS_QUALITY_BITMASK

MIN_POINTS: int = 100  # Minimum valid points for a star to be processed
GAP_THRESHOLD: float = 5.0  # Gap ratio above this = large gap (segment boundary)
MIN_SEGMENT_LENGTH: int = 50  # Minimum valid points in a biweight window's segment
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Masked median
25
+ # ---------------------------------------------------------------------------
26
+
27
def masked_median(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Median along the last dimension, restricted to positions where *mask* is True.

    Follows numpy.median's even-count convention (mean of the two middle
    values).  Rows with no valid values yield NaN.

    Args:
        x: Tensor of any shape ``[..., N]``.
        mask: Boolean tensor, same shape as *x*; True marks valid entries.

    Returns:
        Tensor of shape ``[...]`` holding the masked median along the last
        dimension (NaN where nothing is valid).
    """
    if x.shape[-1] == 0:
        return torch.full(x.shape[:-1], float("nan"), dtype=x.dtype, device=x.device)

    # Replace invalid entries with +inf so an ascending sort pushes them
    # past every valid value (torch.sort is O(n log n)).
    filled = x.masked_fill(~mask, float("inf"))
    ordered = filled.sort(dim=-1).values

    count = mask.sum(dim=-1)  # valid entries per row, shape [...]

    # Indices of the two "middle" elements of the valid prefix; they
    # coincide when count is odd (numpy even-length convention).
    lo_idx = torch.clamp((count - 1) // 2, min=0)
    hi_idx = torch.clamp(count // 2, min=0)

    lo = ordered.gather(-1, lo_idx.unsqueeze(-1)).squeeze(-1)
    hi = ordered.gather(-1, hi_idx.unsqueeze(-1)).squeeze(-1)
    center = (lo + hi) / 2.0

    # All-invalid rows would otherwise report inf arithmetic -> force NaN.
    nan_val = torch.tensor(float("nan"), dtype=center.dtype, device=center.device)
    return torch.where(count > 0, center, nan_val)
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Padding helper
73
+ # ---------------------------------------------------------------------------
74
+
75
def pad_to_length(
    tensors: list[torch.Tensor],
    target_len: int,
    pad_value: float = 0.0,
) -> torch.Tensor:
    """Right-pad a list of 1-D tensors to *target_len* and stack into 2-D.

    Args:
        tensors: List of 1-D tensors (lengths ``<= target_len``).
        target_len: Length to pad to.
        pad_value: Fill value for padding positions.

    Returns:
        Tensor of shape ``[len(tensors), target_len]``.  An empty input list
        yields an empty ``[0, target_len]`` tensor (default dtype/device)
        rather than raising IndexError on ``tensors[0]``.
    """
    if not tensors:
        # Nothing to stack and no reference tensor to take dtype/device from.
        return torch.empty(0, target_len)

    out = torch.full(
        (len(tensors), target_len),
        fill_value=pad_value,
        dtype=tensors[0].dtype,
        device=tensors[0].device,
    )
    for row, t in enumerate(tensors):
        out[row, : t.shape[0]] = t
    return out
torchflat/batching.py ADDED
@@ -0,0 +1,238 @@
1
+ """CPU pre-scan, length-bucketed batch assembly, VRAM estimation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections import defaultdict
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ from torchflat._utils import GAP_THRESHOLD, MIN_POINTS, QUALITY_BITMASK
12
+
13
+ logger = logging.getLogger("torchflat")
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # CPU pre-scan
18
+ # ---------------------------------------------------------------------------
19
+
20
def cpu_prescan(
    times: list[np.ndarray],
    fluxes: list[np.ndarray],
    qualities: list[np.ndarray],
    bitmask: int = QUALITY_BITMASK,
    gap_threshold: float = GAP_THRESHOLD,
    min_points: int = MIN_POINTS,
    window_samples: int = 360,
) -> list[dict]:
    """CPU pre-scan: compute post-filter length and flag degenerate stars.

    A fast O(N) pass per star that counts quality-filtered points, estimates
    small-gap interpolation insertions, and flags stars that are degenerate
    (too few valid points, or no segment long enough for the biweight
    window).
    """
    scan: list[dict] = []

    for star_idx, (t, f, q) in enumerate(zip(times, fluxes, qualities)):
        good = ((q & bitmask) == 0) & np.isfinite(f) & np.isfinite(t)
        n_good = int(good.sum())

        # With fewer than two valid points the cadence is undefined.
        if n_good < 2:
            scan.append({
                "index": star_idx,
                "n_valid": n_good,
                "n_insertions": 0,
                "post_filter_length": n_good,
                "max_segment_length": n_good,
                "degenerate": True,
                "degenerate_reason": "too_few_valid_points",
            })
            continue

        dt = np.diff(t[good])
        cadence = float(np.median(dt))
        if cadence <= 0:
            cadence = 1e-10  # guard against duplicate timestamps

        ratio = dt / cadence

        # Small gaps (1.5x .. gap_threshold x cadence) get interpolated
        # later; count how many points that will insert.
        n_insertions = sum(
            int(round(r)) - 1
            for r in ratio[(ratio > 1.5) & (ratio < gap_threshold)]
        )

        # Longest contiguous stretch between large gaps.
        large_gaps = np.where(ratio >= gap_threshold)[0]
        edges = np.concatenate([[-1], large_gaps, [len(dt)]])
        seg_lens = np.diff(edges)
        max_segment = int(seg_lens.max()) if len(seg_lens) > 0 else n_good

        if n_good < min_points:
            degenerate, reason = True, "too_few_valid_points"
        elif max_segment < window_samples:
            degenerate, reason = True, "segment_too_short"
        else:
            degenerate, reason = False, None

        scan.append({
            "index": star_idx,
            "n_valid": n_good,
            "n_insertions": n_insertions,
            "post_filter_length": n_good + n_insertions,
            "max_segment_length": max_segment,
            "degenerate": degenerate,
            "degenerate_reason": reason,
        })

    return scan
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Bucketing
99
+ # ---------------------------------------------------------------------------
100
+
101
def bucket_stars(
    prescan_results: list[dict],
    bucket_width: int = 1000,
) -> list[dict]:
    """Group non-degenerate stars into length buckets for batched GPU processing.

    Each bucket's ``pad_length`` is ``post_filter_length`` rounded *up* to a
    multiple of *bucket_width*, so stars of similar length share one padded
    batch.  Buckets are returned sorted by ``pad_length``.

    Args:
        prescan_results: Dicts from ``cpu_prescan`` (reads ``degenerate``,
            ``post_filter_length``, ``index``).
        bucket_width: Granularity of padded lengths (minimum pad length).

    Returns:
        List of ``{"star_indices": [...], "pad_length": int}`` dicts.
    """
    buckets_map: dict[int, list[int]] = defaultdict(list)

    for info in prescan_results:
        if info["degenerate"]:
            continue
        pfl = info["post_filter_length"]
        # Ceiling division (with a one-bucket floor) instead of
        # floor-then-add: the old formula padded a star whose length was an
        # exact multiple of bucket_width to *double* the needed length
        # (e.g. pfl=1000 -> pad 2000).
        bucket_key = max(1, -(-pfl // bucket_width)) * bucket_width
        buckets_map[bucket_key].append(info["index"])

    return [
        {"star_indices": buckets_map[pad_length], "pad_length": pad_length}
        for pad_length in sorted(buckets_map)
    ]
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Batch assembly
126
+ # ---------------------------------------------------------------------------
127
+
128
def assemble_batch(
    star_indices: list[int],
    times: list[np.ndarray],
    fluxes: list[np.ndarray],
    qualities: list[np.ndarray],
    pad_length: int,
    device: torch.device,
) -> dict:
    """Pad, stack, and transfer a batch of stars to GPU.

    Light curves are truncated/right-padded to *pad_length*; short internal
    gaps (<= 4 consecutive invalid points with valid neighbors on both
    sides) are linearly interpolated on CPU before the transfer.

    Returns dict with tensors ``time``, ``flux``, ``quality``, ``lengths``,
    and ``valid_mask`` on *device*.  Note ``valid_mask`` marks in-bounds
    (non-padding) samples, not quality-valid ones -- quality flags are
    returned separately for downstream filtering.
    """
    B = len(star_indices)

    # Zero-initialized batch buffers; positions past each star's length stay 0.
    time_batch = torch.zeros(B, pad_length, dtype=torch.float64)
    flux_batch = torch.zeros(B, pad_length, dtype=torch.float32)
    quality_batch = torch.zeros(B, pad_length, dtype=torch.int32)
    lengths = torch.zeros(B, dtype=torch.long)
    valid_mask = torch.zeros(B, pad_length, dtype=torch.bool)

    for j, idx in enumerate(star_indices):
        t = times[idx]
        # astype() copies, so the in-place gap fill below never mutates the
        # caller's flux array.
        f = fluxes[idx].astype(np.float32)
        q = qualities[idx]
        L = len(t)
        n = min(L, pad_length)  # truncate stars longer than the pad length

        # CPU gap interpolation (faster than GPU cummax/cummin, <1% of points)
        v = ((q[:n] & QUALITY_BITMASK) == 0) & np.isfinite(f[:n]) & np.isfinite(t[:n])
        i = 0
        while i < n:
            if not v[i]:
                # Scan the full run of invalid points [gs, ge).
                gs = i
                while i < n and not v[i]:
                    i += 1
                ge = i
                # Only fill short interior gaps: both anchors f[gs-1] and
                # f[ge] must exist and be valid, and the gap is <= 4 points.
                if gs > 0 and ge < n and (ge - gs) <= 4:
                    for k in range(ge - gs):
                        # Linear interpolation between the two anchors.
                        frac = (k + 1) / (ge - gs + 1)
                        f[gs + k] = f[gs - 1] + frac * (f[ge] - f[gs - 1])
                        v[gs + k] = True
            else:
                i += 1

        time_batch[j, :n] = torch.from_numpy(t[:n].astype(np.float64))
        flux_batch[j, :n] = torch.from_numpy(f[:n])
        quality_batch[j, :n] = torch.from_numpy(q[:n].astype(np.int32))
        lengths[j] = n
        valid_mask[j, :n] = True

    # Single host->device transfer per tensor at the end.
    return {
        "time": time_batch.to(device),
        "flux": flux_batch.to(device),
        "quality": quality_batch.to(device),
        "lengths": lengths.to(device),
        "valid_mask": valid_mask.to(device),
    }
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # VRAM estimation
190
+ # ---------------------------------------------------------------------------
191
+
192
def estimate_peak_vram(L: int, win: int, dtype_bytes: int = 4) -> int:
    """Estimate peak per-star VRAM (bytes) during biweight detrending.

    Sums the persistent window tensors, the per-iteration intermediates,
    and the transient int64 index tensor produced by topk/sort.

    Args:
        L: Padded light-curve length in samples.
        win: Sliding-window size in samples.
        dtype_bytes: Bytes per flux element (4 for float32).

    Returns:
        Estimated peak bytes for one star.
    """
    n_windows = L - win + 1
    cells = n_windows * win  # total elements across all window positions

    flux_windows = cells * dtype_bytes   # value windows
    mask_windows = cells                 # bool validity windows (1 byte each)
    index_windows = cells * 8            # int64 indices from topk/sort
    segment_windows = cells * 4          # int32 segment IDs

    # Per-sample base arrays: flux(4) + time(8) + valid(1) + seg_id(4)
    base_arrays = L * 17
    resident = flux_windows + segment_windows + mask_windows
    iteration_scratch = 4 * flux_windows  # topk_clone + abs_dev + u + weights
    transient_peak = index_windows

    return base_arrays + resident + iteration_scratch + transient_peak
210
+
211
+
212
def compute_max_batch(
    pad_length: int,
    win: int = 360,
    device: torch.device | None = None,
    vram_budget_gb: float | None = None,
    max_batch_override: int | None = None,
    safety_factor: float = 0.8,
) -> int:
    """Pick the largest safe batch size for a given padded length.

    Priority order: explicit override > user-specified VRAM budget >
    auto-detected device memory (total minus 4 GiB headroom).  Falls back
    to 1 without a CUDA device, and caps the result at 50.
    """
    if max_batch_override is not None:
        return max(1, max_batch_override)

    if vram_budget_gb is not None:
        budget_bytes = int(vram_budget_gb * 1024**3)
    elif device is not None and device.type == "cuda":
        # Leave 4 GiB headroom for the allocator and other resident tensors.
        total_bytes = torch.cuda.get_device_properties(device).total_memory
        budget_bytes = total_bytes - 4 * 1024**3
    else:
        # No GPU information available: process one star at a time.
        return 1

    per_star = estimate_peak_vram(pad_length, win)
    if per_star <= 0:
        return 1
    fit = int(budget_bytes * safety_factor / per_star)
    return max(1, min(fit, 50))