giga-spatial 0.6.9__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
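The headline additions in 0.7.1, visible in the hunks below, are: reprojection support in TifProcessor (a target_crs/reprojection_resolution pair plus a public reproject_to()), geometry- and bounds-based clipping, pixel-adjacency graph export via networkx/scipy, chunked DataFrame conversion, and memory guards around the expensive conversions. A sketch of the new surface, inferred only from the signatures added in this diff (the file path, CRS code, and import path are illustrative assumptions, not from package docs):

    # Hedged sketch based on the added method signatures below.
    from gigaspatial.processing.tif_processor import TifProcessor  # assumed import path

    tp = TifProcessor(dataset_path="elevation.tif", mode="single", target_crs="EPSG:3857")
    info = tp.get_raster_info()                           # consolidated metadata dict
    df = tp.to_dataframe_chunked(target_memory_mb=256)    # memory-bounded conversion
    clipped = tp.clip_to_bounds((0.0, 0.0, 10_000.0, 10_000.0))  # returns a TifProcessor
    g = tp.to_graph(connectivity=8, graph_type="sparse")  # pixel-adjacency CSR matrix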
@@ -1,7 +1,9 @@
  import numpy as np
  import pandas as pd
  import geopandas as gpd
- from typing import List, Optional, Tuple, Union, Literal, Callable
+ import networkx as nx
+ import scipy.sparse as sp
+ from typing import List, Optional, Tuple, Union, Literal, Callable, Dict, Any
  from pydantic import ConfigDict
  from pydantic.dataclasses import dataclass
  from contextlib import contextmanager
@@ -15,12 +17,17 @@ from functools import partial
  import multiprocessing
  from tqdm import tqdm
  import tempfile
+ import shutil
  import os

  from gigaspatial.core.io.data_store import DataStore
  from gigaspatial.core.io.local_data_store import LocalDataStore
  from gigaspatial.config import config

+ # Global variables for multiprocessing workers
+ src_handle = None
+ memfile_handle = None
+

  @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
  class TifProcessor:
@@ -35,50 +42,164 @@ class TifProcessor:
      merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
      target_crs: Optional[str] = None  # For reprojection if needed
      resampling_method: Resampling = Resampling.nearest
+     reprojection_resolution: Optional[Tuple[float, float]] = None

      def __post_init__(self):
          """Validate inputs, merge rasters if needed, and set up logging."""
          self.data_store = self.data_store or LocalDataStore()
          self.logger = config.get_logger(self.__class__.__name__)
          self._cache = {}
+         self._temp_dir = tempfile.mkdtemp()
          self._merged_file_path = None
-         self._temp_dir = None
+         self._reprojected_file_path = None

          # Handle multiple dataset paths
          if isinstance(self.dataset_path, list):
-             self.dataset_paths = [Path(p) for p in self.dataset_path]
-             self._validate_multiple_datasets()
-             self._merge_rasters()
-             self.dataset_path = self._merged_file_path
+             if len(self.dataset_path) > 1:
+                 self.dataset_paths = [Path(p) for p in self.dataset_path]
+                 self._validate_multiple_datasets()
+                 self._merge_rasters()
+                 self.dataset_path = self._merged_file_path
          else:
              self.dataset_paths = [Path(self.dataset_path)]
-             if not self.data_store.file_exists(self.dataset_path):
+             if not self.data_store.file_exists(str(self.dataset_path)):
                  raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

+         # Reproject single raster during initialization if target_crs is set
+         if self.target_crs:
+             self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 with rasterio.MemoryFile(f.read()) as memfile:
+                     with memfile.open() as src:
+                         self._reprojected_file_path = self._reproject_to_temp_file(
+                             src, self.target_crs
+                         )
+             self.dataset_path = self._reprojected_file_path
+
          self._load_metadata()
+         self._validate_mode_band_compatibility()

-         # Validate mode and band count
-         if self.mode == "rgba" and self.count != 4:
-             raise ValueError("RGBA mode requires a 4-band TIF file")
-         if self.mode == "rgb" and self.count != 3:
-             raise ValueError("RGB mode requires a 3-band TIF file")
-         if self.mode == "single" and self.count != 1:
-             raise ValueError("Single mode requires a 1-band TIF file")
-         if self.mode == "multi" and self.count < 2:
-             raise ValueError("Multi mode requires a TIF file with 2 or more bands")
+     @contextmanager
+     def open_dataset(self):
+         """Context manager for accessing the dataset, handling temporary reprojected files."""
+         if self._merged_file_path:
+             with rasterio.open(self._merged_file_path) as src:
+                 yield src
+         elif self._reprojected_file_path:
+             with rasterio.open(self._reprojected_file_path) as src:
+                 yield src
+         elif isinstance(self.data_store, LocalDataStore):
+             with rasterio.open(str(self.dataset_path)) as src:
+                 yield src
+         else:
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 with rasterio.MemoryFile(f.read()) as memfile:
+                     with memfile.open() as src:
+                         yield src
+
+     def reproject_to(
+         self,
+         target_crs: str,
+         output_path: Optional[Union[str, Path]] = None,
+         resampling_method: Optional[Resampling] = None,
+         resolution: Optional[Tuple[float, float]] = None,
+     ):
+         """
+         Reprojects the current raster to a new CRS and optionally saves it.
+
+         Args:
+             target_crs: The CRS to reproject to (e.g., "EPSG:4326").
+             output_path: The path to save the reprojected raster. If None,
+                 it is saved to a temporary file.
+             resampling_method: The resampling method to use.
+             resolution: The target resolution (pixel size) in the new CRS.
+         """
+         self.logger.info(f"Reprojecting raster to {target_crs}...")
+
+         # Use provided or default values
+         resampling_method = resampling_method or self.resampling_method
+         resolution = resolution or self.reprojection_resolution
+
+         with self.open_dataset() as src:
+             if src.crs.to_string() == target_crs:
+                 self.logger.info(
+                     "Raster is already in the target CRS. No reprojection needed."
+                 )
+                 # If output_path is specified, copy the file
+                 if output_path:
+                     self.data_store.copy_file(str(self.dataset_path), output_path)
+                 return self.dataset_path
+
+             dst_path = output_path or os.path.join(
+                 self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
+             )
+
+             with rasterio.open(
+                 dst_path,
+                 "w",
+                 **self._get_reprojection_profile(src, target_crs, resolution),
+             ) as dst:
+                 for band_idx in range(1, src.count + 1):
+                     reproject(
+                         source=rasterio.band(src, band_idx),
+                         destination=rasterio.band(dst, band_idx),
+                         src_transform=src.transform,
+                         src_crs=src.crs,
+                         dst_transform=dst.transform,
+                         dst_crs=dst.crs,
+                         resampling=resampling_method,
+                         num_threads=multiprocessing.cpu_count(),
+                     )
+
+         self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
+         return Path(dst_path)
+
+     def get_raster_info(self) -> Dict[str, Any]:
+         """Get comprehensive raster information."""
+         return {
+             "count": self.count,
+             "width": self.width,
+             "height": self.height,
+             "crs": self.crs,
+             "bounds": self.bounds,
+             "transform": self.transform,
+             "dtypes": self.dtype,
+             "nodata": self.nodata,
+             "mode": self.mode,
+             "is_merged": self.is_merged,
+             "source_count": self.source_count,
+         }
+
+     def _reproject_to_temp_file(
+         self, src: rasterio.DatasetReader, target_crs: str
+     ) -> str:
+         """Helper to reproject a raster and save it to a temporary file."""
+         dst_path = os.path.join(
+             self._temp_dir, f"reprojected_temp_{os.urandom(8).hex()}.tif"
+         )
+         profile = self._get_reprojection_profile(
+             src, target_crs, self.reprojection_resolution
+         )
+
+         with rasterio.open(dst_path, "w", **profile) as dst:
+             for band_idx in range(1, src.count + 1):
+                 reproject(
+                     source=rasterio.band(src, band_idx),
+                     destination=rasterio.band(dst, band_idx),
+                     src_transform=src.transform,
+                     src_crs=src.crs,
+                     dst_transform=dst.transform,
+                     dst_crs=dst.crs,
+                     resampling=self.resampling_method,
+                 )
+         return dst_path

      def _validate_multiple_datasets(self):
          """Validate that all datasets exist and have compatible properties."""
          if len(self.dataset_paths) < 2:
              raise ValueError("Multiple dataset paths required for merging")

-         # Check if all files exist
-         for path in self.dataset_paths:
-             if not self.data_store.file_exists(path):
-                 raise FileNotFoundError(f"Dataset not found at {path}")
-
-         # Load first dataset to get reference properties
-         with self.data_store.open(self.dataset_paths[0], "rb") as f:
+         with self.data_store.open(str(self.dataset_paths[0]), "rb") as f:
              with rasterio.MemoryFile(f.read()) as memfile:
                  with memfile.open() as ref_src:
                      ref_count = ref_src.count
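The new reproject_to added above can be exercised directly once a processor is constructed; a minimal sketch (the input/output paths, CRS, and resolution are illustrative, and reproject_to falls back to a temporary file when output_path is omitted):

    from rasterio.enums import Resampling

    tp = TifProcessor(dataset_path="input.tif", mode="single")
    out_path = tp.reproject_to(
        "EPSG:4326",
        output_path="reprojected.tif",
        resampling_method=Resampling.bilinear,
        resolution=(0.001, 0.001),  # target pixel size in the destination CRS
    )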
@@ -87,9 +208,8 @@ class TifProcessor:
                      ref_transform = ref_src.transform
                      ref_nodata = ref_src.nodata

-         # Validate all other datasets against reference
          for i, path in enumerate(self.dataset_paths[1:], 1):
-             with self.data_store.open(path, "rb") as f:
+             with self.data_store.open(str(path), "rb") as f:
                  with rasterio.MemoryFile(f.read()) as memfile:
                      with memfile.open() as src:
                          if src.count != ref_count:
@@ -100,9 +220,10 @@ class TifProcessor:
                              raise ValueError(
                                  f"Dataset {i} has dtype {src.dtypes[0]}, expected {ref_dtype}"
                              )
-                         if self.target_crs is None and src.crs != ref_crs:
-                             raise ValueError(
-                                 f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. Consider setting target_crs parameter."
+                         if not self.target_crs and src.crs != ref_crs:
+                             self.logger.warning(
+                                 f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
+                                 "Consider setting target_crs parameter for reprojection before merging."
                              )
                          if self.target_crs is None and not self._transforms_compatible(
                              src.transform, ref_transform
@@ -115,6 +236,46 @@ class TifProcessor:
                              f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
                          )

+     def _get_reprojection_profile(
+         self,
+         src: rasterio.DatasetReader,
+         target_crs: str,
+         resolution: Optional[Tuple[float, float]],
+         compression: str = "lzw",
+     ):
+         """Calculates and returns the profile for a reprojected raster."""
+         if resolution:
+             src_res = (abs(src.transform.a), abs(src.transform.e))
+             self.logger.info(
+                 f"Using target resolution: {resolution}. Source resolution: {src_res}."
+             )
+             # Calculate transform and dimensions based on the new resolution
+             dst_transform, width, height = calculate_default_transform(
+                 src.crs,
+                 target_crs,
+                 src.width,
+                 src.height,
+                 *src.bounds,
+                 resolution=resolution,
+             )
+         else:
+             # Keep original resolution but reproject
+             dst_transform, width, height = calculate_default_transform(
+                 src.crs, target_crs, src.width, src.height, *src.bounds
+             )
+
+         profile = src.profile.copy()
+         profile.update(
+             {
+                 "crs": target_crs,
+                 "transform": dst_transform,
+                 "width": width,
+                 "height": height,
+                 "compress": compression,  # Add compression to save space
+             }
+         )
+         return profile
+
      def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
          """Check if two transforms have compatible pixel sizes."""
          return (
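The helper above is a thin wrapper over rasterio's calculate_default_transform; the same computation standalone looks like this (a sketch with illustrative CRS codes, dimensions, and bounds):

    from rasterio.crs import CRS
    from rasterio.warp import calculate_default_transform

    dst_transform, width, height = calculate_default_transform(
        CRS.from_epsg(4326), CRS.from_epsg(3857),
        3600, 1800,                    # source width and height in pixels
        -20.0, 30.0, 10.0, 60.0,       # source bounds: left, bottom, right, top
        resolution=(1000.0, 1000.0),   # optional target pixel size in metres
    )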
@@ -126,151 +287,77 @@ class TifProcessor:
          """Merge multiple rasters into a single raster."""
          self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")

-         # Create temporary directory for merged file
-         self._temp_dir = tempfile.mkdtemp()
-         merged_filename = "merged_raster.tif"
-         self._merged_file_path = os.path.join(self._temp_dir, merged_filename)
-
          # Open all datasets and handle reprojection if needed
-         src_files = []
-         reprojected_files = []
-
+         datasets_to_merge = []
+         temp_reprojected_files = []
          try:
              for path in self.dataset_paths:
-                 with self.data_store.open(path, "rb") as f:
-                     # Create temporary file for each dataset
-                     temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
-                     temp_file.write(f.read())
-                     temp_file.close()
-                     src_files.append(rasterio.open(temp_file.name))
-
-             # Handle reprojection if target_crs is specified
-             if self.target_crs:
-                 self.logger.info(f"Reprojecting rasters to {self.target_crs}...")
-                 processed_files = self._reproject_rasters(src_files, self.target_crs)
-                 reprojected_files = processed_files
-             else:
-                 processed_files = src_files
+                 with self.data_store.open(str(path), "rb") as f:
+                     with rasterio.MemoryFile(f.read()) as memfile:
+                         with memfile.open() as src:
+                             if self.target_crs and src.crs != self.target_crs:
+                                 self.logger.info(
+                                     f"Reprojecting {path.name} to {self.target_crs} before merging."
+                                 )
+                                 reprojected_path = self._reproject_to_temp_file(
+                                     src, self.target_crs
+                                 )
+                                 temp_reprojected_files.append(reprojected_path)
+                                 datasets_to_merge.append(
+                                     rasterio.open(reprojected_path)
+                                 )
+                             else:
+                                 temp_path = os.path.join(
+                                     self._temp_dir,
+                                     f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
+                                 )
+                                 temp_reprojected_files.append(temp_path)
+
+                                 profile = src.profile
+                                 with rasterio.open(temp_path, "w", **profile) as dst:
+                                     dst.write(src.read())
+                                 datasets_to_merge.append(rasterio.open(temp_path))
+
+             self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")

              if self.merge_method == "mean":
-                 # For mean, we need to handle it manually
-                 merged_array, merged_transform = self._merge_with_mean(src_files)
-
-                 # Use first source as reference for metadata
-                 ref_src = src_files[0]
-                 profile = ref_src.profile.copy()
-                 profile.update(
-                     {
-                         "height": merged_array.shape[-2],
-                         "width": merged_array.shape[-1],
-                         "transform": merged_transform,
-                     }
+                 merged_array, merged_transform = self._merge_with_mean(
+                     datasets_to_merge
                  )
-
-                 # Write merged raster
-                 with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                     dst.write(merged_array)
-
              else:
-                 # Use rasterio's merge function
                  merged_array, merged_transform = merge(
-                     src_files,
+                     datasets_to_merge,
                      method=self.merge_method,
                      resampling=self.resampling_method,
                  )

-                 # Use first source as reference for metadata
-                 ref_src = src_files[0]
-                 profile = ref_src.profile.copy()
-                 profile.update(
-                     {
-                         "height": merged_array.shape[-2],
-                         "width": merged_array.shape[-1],
-                         "transform": merged_transform,
-                     }
-                 )
-
-                 if self.target_crs:
-                     profile["crs"] = self.target_crs
-
-                 # Write merged raster
-                 with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                     dst.write(merged_array)
-
-         finally:
-             # Clean up source files
-             for src in src_files:
-                 temp_path = src.name
-                 src.close()
-                 try:
-                     os.unlink(temp_path)
-                 except:
-                     pass
-
-             # Clean up reprojected files
-             for src in reprojected_files:
-                 if src not in src_files:  # Don't double-close
-                     temp_path = src.name
-                     src.close()
-                     try:
-                         os.unlink(temp_path)
-                     except:
-                         pass
-
-         self.logger.info("Raster merging completed!")
-
-     def _reproject_rasters(self, src_files, target_crs):
-         """Reproject all rasters to a common CRS before merging."""
-         reprojected_files = []
-
-         for i, src in enumerate(src_files):
-             if src.crs.to_string() == target_crs:
-                 # No reprojection needed
-                 reprojected_files.append(src)
-                 continue
-
-             # Calculate transform and dimensions for reprojection
-             transform, width, height = calculate_default_transform(
-                 src.crs,
-                 target_crs,
-                 src.width,
-                 src.height,
-                 *src.bounds,
-                 resolution=self.resolution if hasattr(self, "resolution") else None,
-             )
-
-             # Create temporary file for reprojected raster
-             temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
-             temp_file.close()
-
-             # Set up profile for reprojected raster
-             profile = src.profile.copy()
+             # Get profile from the first file in the list (all should be compatible now)
+             ref_src = datasets_to_merge[0]
+             profile = ref_src.profile.copy()
              profile.update(
                  {
-                     "crs": target_crs,
-                     "transform": transform,
-                     "width": width,
-                     "height": height,
+                     "height": merged_array.shape[-2],
+                     "width": merged_array.shape[-1],
+                     "transform": merged_transform,
+                     "crs": self.target_crs if self.target_crs else ref_src.crs,
                  }
              )

-             # Reproject and write to temporary file
-             with rasterio.open(temp_file.name, "w", **profile) as dst:
-                 for band_idx in range(1, src.count + 1):
-                     reproject(
-                         source=rasterio.band(src, band_idx),
-                         destination=rasterio.band(dst, band_idx),
-                         src_transform=src.transform,
-                         src_crs=src.crs,
-                         dst_transform=transform,
-                         dst_crs=target_crs,
-                         resampling=self.resampling_method,
-                     )
+             with rasterio.open(self._merged_file_path, "w", **profile) as dst:
+                 dst.write(merged_array)
+         finally:
+             for dataset in datasets_to_merge:
+                 if hasattr(dataset, "close"):
+                     dataset.close()

-             # Open reprojected file
-             reprojected_files.append(rasterio.open(temp_file.name))
+         # Clean up temporary files immediately
+         for temp_file in temp_reprojected_files:
+             try:
+                 os.remove(temp_file)
+             except OSError:
+                 pass

-         return reprojected_files
+         self.logger.info("Raster merging completed!")

      def _merge_with_mean(self, src_files):
          """Merge rasters using mean aggregation."""
@@ -295,6 +382,12 @@ class TifProcessor:
              bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
          )

+         estimated_memory = height * width * src_files[0].count * 8  # float64
+         if estimated_memory > 1e9:  # 1GB threshold
+             self.logger.warning(
+                 f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
+             )
+
          # Initialize arrays for sum and count
          sum_array = np.zeros((src_files[0].count, height, width), dtype=np.float64)
          count_array = np.zeros((height, width), dtype=np.int32)
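The estimate added above is width × height × bands × 8 bytes for the float64 accumulator: a single-band 20,000 × 20,000 mosaic accumulates 20000 × 20000 × 1 × 8 = 3.2 GB for the sum array alone, comfortably past the 1 GB warning threshold.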
@@ -336,33 +429,9 @@ class TifProcessor:

          return mean_array.astype(src_files[0].dtypes[0]), merged_transform

-     def __del__(self):
-         """Cleanup temporary files."""
-         if self._temp_dir and os.path.exists(self._temp_dir):
-             try:
-                 import shutil
-
-                 shutil.rmtree(self._temp_dir)
-             except:
-                 pass
-
-     @contextmanager
-     def open_dataset(self):
-         """Context manager for accessing the dataset"""
-         if self._merged_file_path:
-             # Open merged file directly
-             with rasterio.open(self._merged_file_path) as src:
-                 yield src
-         else:
-             # Original single file logic
-             with self.data_store.open(self.dataset_path, "rb") as f:
-                 with rasterio.MemoryFile(f.read()) as memfile:
-                     with memfile.open() as src:
-                         yield src
-
      def _load_metadata(self):
          """Load metadata from the TIF file if not already cached"""
-         if not self._cache:
+         try:
              with self.open_dataset() as src:
                  self._cache["transform"] = src.transform
                  self._cache["crs"] = src.crs.to_string()
@@ -375,6 +444,10 @@ class TifProcessor:
                  self._cache["nodata"] = src.nodata
                  self._cache["count"] = src.count
                  self._cache["dtype"] = src.dtypes[0]
+         except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
+             raise FileNotFoundError(f"Could not read raster metadata: {e}")
+         except Exception as e:
+             raise RuntimeError(f"Unexpected error loading metadata: {e}")

      @property
      def is_merged(self) -> bool:
@@ -386,7 +459,6 @@ class TifProcessor:
          """Get the number of source rasters."""
          return len(self.dataset_paths)

-     # All other methods remain the same...
      @property
      def transform(self):
          """Get the transform from the TIF file"""
@@ -428,53 +500,48 @@ class TifProcessor:
          return self._cache["nodata"]

      @property
-     def tabular(self) -> pd.DataFrame:
-         """Get the data from the TIF file"""
-         self.logger.warning(
-             "The `tabular` property is deprecated, use `to_dataframe` instead"
-         )
-         if not hasattr(self, "_tabular"):
-             try:
-                 if self.mode == "single":
-                     self._tabular = self._to_band_dataframe(
-                         drop_nodata=True, drop_values=[]
-                     )
-                 elif self.mode == "rgb":
-                     self._tabular = self._to_rgb_dataframe(drop_nodata=True)
-                 elif self.mode == "rgba":
-                     self._tabular = self._to_rgba_dataframe(drop_transparent=True)
-                 elif self.mode == "multi":
-                     self._tabular = self._to_multi_band_dataframe(
-                         drop_nodata=True,
-                         drop_values=[],
-                         band_names=None,  # Use default band naming
-                     )
-                 else:
-                     raise ValueError(
-                         f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
-                     )
-             except Exception as e:
-                 raise ValueError(
-                     f"Failed to process TIF file in mode '{self.mode}'. "
-                     f"Please ensure the file is valid and matches the selected mode. "
-                     f"Original error: {str(e)}"
-                 )
+     def dtype(self):
+         """Get the data types from the TIF file"""
+         return self._cache.get("dtype", [])
+
+     @property
+     def width(self):
+         return self._cache["width"]
+
+     @property
+     def height(self):
+         return self._cache["height"]
+
+     def to_dataframe(
+         self, drop_nodata=True, check_memory=True, **kwargs
+     ) -> pd.DataFrame:
+         """
+         Convert raster to DataFrame.
+
+         Args:
+             drop_nodata: Whether to drop nodata values
+             check_memory: Whether to check memory before operation (default True)
+             **kwargs: Additional arguments

-         return self._tabular
+         Returns:
+             pd.DataFrame with raster data
+         """
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("conversion", threshold_percent=80.0)

-     def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
          try:
              if self.mode == "single":
-                 df = self._to_band_dataframe(drop_nodata=drop_nodata, **kwargs)
-             elif self.mode == "rgb":
-                 df = self._to_rgb_dataframe(drop_nodata=drop_nodata)
-             elif self.mode == "rgba":
-                 df = self._to_rgba_dataframe(drop_transparent=drop_nodata)
-             elif self.mode == "multi":
-                 df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
+                 return self._to_dataframe(
+                     band_number=kwargs.get("band_number", 1),
+                     drop_nodata=drop_nodata,
+                     band_names=kwargs.get("band_names", None),
+                 )
              else:
-                 raise ValueError(
-                     f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
+                 return self._to_dataframe(
+                     band_number=None,  # All bands
+                     drop_nodata=drop_nodata,
+                     band_names=kwargs.get("band_names", None),
                  )
          except Exception as e:
              raise ValueError(
@@ -485,12 +552,23 @@ class TifProcessor:

          return df

-     def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
+     def to_geodataframe(self, check_memory=True, **kwargs) -> gpd.GeoDataFrame:
          """
          Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
          Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+
+         Args:
+             check_memory: Whether to check memory before operation
+             **kwargs: Additional arguments passed to to_dataframe()
+
+         Returns:
+             gpd.GeoDataFrame with raster data
          """
-         df = self.to_dataframe(**kwargs)
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("conversion", threshold_percent=80.0)
+
+         df = self.to_dataframe(check_memory=False, **kwargs)

          x_res, y_res = self.resolution

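The consolidated converters can be called like this (a sketch; the column names follow the auto-detection logic shown later in _to_dataframe):

    df = tp.to_dataframe(drop_nodata=True)   # single mode: lon, lat, pixel_value
    gdf = tp.to_geodataframe()               # adds a per-pixel bounding-box geometry
    rgb_df = TifProcessor(dataset_path="image.tif", mode="rgb").to_dataframe()
    # rgb mode: lon, lat, red, green, blue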
@@ -504,30 +582,300 @@ class TifProcessor:

          return gdf

-     def get_zoned_geodataframe(self) -> gpd.GeoDataFrame:
+     def to_dataframe_chunked(
+         self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
+     ):
          """
-         Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
-         Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+         Convert raster to DataFrame using chunked processing for memory efficiency.
+
+         Automatically routes to the appropriate chunked method based on mode.
+         Chunk size is automatically calculated based on target memory usage.
+
+         Args:
+             drop_nodata: Whether to drop nodata values
+             chunk_size: Number of rows per chunk (auto-calculated if None)
+             target_memory_mb: Target memory per chunk in MB (default 500)
+             **kwargs: Additional arguments (band_number, band_names, etc.)
+         """
+
+         if chunk_size is None:
+             chunk_size = self._calculate_optimal_chunk_size(
+                 "conversion", target_memory_mb
+             )
+
+         windows = self._get_chunk_windows(chunk_size)
+
+         # SIMPLE ROUTING
+         if self.mode == "single":
+             return self._to_dataframe_chunked(
+                 windows,
+                 band_number=kwargs.get("band_number", 1),
+                 drop_nodata=drop_nodata,
+                 band_names=kwargs.get("band_names", None),
+             )
+         else:  # rgb, rgba, multi
+             return self._to_dataframe_chunked(
+                 windows,
+                 band_number=None,
+                 drop_nodata=drop_nodata,
+                 band_names=kwargs.get("band_names", None),
+             )
+
+     def clip_to_geometry(
+         self,
+         geometry: Union[
+             Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+         ],
+         crop: bool = True,
+         all_touched: bool = True,
+         invert: bool = False,
+         nodata: Optional[Union[int, float]] = None,
+         pad: bool = False,
+         pad_width: float = 0.5,
+         return_clipped_processor: bool = True,
+     ) -> Union["TifProcessor", tuple]:
+         """
+         Clip raster to geometry boundaries.
+
+         Parameters:
+         -----------
+         geometry : various
+             Geometry to clip to. Can be:
+             - Shapely Polygon or MultiPolygon
+             - GeoDataFrame or GeoSeries
+             - List of GeoJSON-like dicts
+             - Single GeoJSON-like dict
+         crop : bool, default True
+             Whether to crop the raster to the extent of the geometry
+         all_touched : bool, default True
+             Include pixels that touch the geometry boundary
+         invert : bool, default False
+             If True, mask pixels inside geometry instead of outside
+         nodata : int or float, optional
+             Value to use for masked pixels. If None, uses raster's nodata value
+         pad : bool, default False
+             Pad geometry by half pixel before clipping
+         pad_width : float, default 0.5
+             Width of padding in pixels if pad=True
+         return_clipped_processor : bool, default True
+             If True, returns new TifProcessor with clipped data
+             If False, returns (clipped_array, transform, metadata)
+
+         Returns:
+         --------
+         TifProcessor or tuple
+             Either new TifProcessor instance or (array, transform, metadata) tuple
+         """
+         # Handle different geometry input types
+         shapes = self._prepare_geometry_for_clipping(geometry)
+
+         # Validate CRS compatibility
+         self._validate_geometry_crs(geometry)
+
+         # Perform the clipping
+         with self.open_dataset() as src:
+             try:
+                 clipped_data, clipped_transform = mask(
+                     dataset=src,
+                     shapes=shapes,
+                     crop=crop,
+                     all_touched=all_touched,
+                     invert=invert,
+                     nodata=nodata,
+                     pad=pad,
+                     pad_width=pad_width,
+                     filled=True,
+                 )
+
+                 # Update metadata for the clipped raster
+                 clipped_meta = src.meta.copy()
+                 clipped_meta.update(
+                     {
+                         "height": clipped_data.shape[1],
+                         "width": clipped_data.shape[2],
+                         "transform": clipped_transform,
+                         "nodata": nodata if nodata is not None else src.nodata,
+                     }
+                 )
+
+             except ValueError as e:
+                 if "Input shapes do not overlap raster" in str(e):
+                     raise ValueError(
+                         "The geometry does not overlap with the raster. "
+                         "Check that both are in the same coordinate reference system."
+                     ) from e
+                 else:
+                     raise e
+
+         if return_clipped_processor:
+             # Create a new TifProcessor with the clipped data
+             return self._create_clipped_processor(clipped_data, clipped_meta)
+         else:
+             return clipped_data, clipped_transform, clipped_meta
+
+     def clip_to_bounds(
+         self,
+         bounds: tuple,
+         bounds_crs: Optional[str] = None,
+         return_clipped_processor: bool = True,
+     ) -> Union["TifProcessor", tuple]:
+         """
+         Clip raster to rectangular bounds.
+
+         Parameters:
+         -----------
+         bounds : tuple
+             Bounding box as (minx, miny, maxx, maxy)
+         bounds_crs : str, optional
+             CRS of the bounds. If None, assumes same as raster CRS
+         return_clipped_processor : bool, default True
+             If True, returns new TifProcessor, else returns (array, transform, metadata)
+
+         Returns:
+         --------
+         TifProcessor or tuple
+             Either new TifProcessor instance or (array, transform, metadata) tuple
          """
-         self.logger.warning(
-             "The `get_zoned_geodataframe` method is deprecated, use `to_geodataframe` instead"
+         # Create bounding box geometry
+         bbox_geom = box(*bounds)
+
+         # If bounds_crs is specified and different from raster CRS, create GeoDataFrame for reprojection
+         if bounds_crs is not None:
+             raster_crs = self.crs
+
+             if not self.crs == bounds_crs:
+                 # Create GeoDataFrame with bounds CRS and reproject
+                 bbox_gdf = gpd.GeoDataFrame([1], geometry=[bbox_geom], crs=bounds_crs)
+                 bbox_gdf = bbox_gdf.to_crs(raster_crs)
+                 bbox_geom = bbox_gdf.geometry.iloc[0]
+
+         return self.clip_to_geometry(
+             geometry=bbox_geom,
+             crop=True,
+             return_clipped_processor=return_clipped_processor,
          )
-         self.logger.info("Converting data to GeoDataFrame with zones...")

-         df = self.tabular
+     def to_graph(
+         self,
+         connectivity: Literal[4, 8] = 4,
+         band: Optional[int] = None,
+         include_coordinates: bool = False,
+         graph_type: Literal["networkx", "sparse"] = "networkx",
+         check_memory: bool = True,
+     ) -> Union[nx.Graph, sp.csr_matrix]:
+         """
+         Convert raster to graph based on pixel adjacency.

-         x_res, y_res = self.resolution
+         Args:
+             connectivity: 4 or 8-connectivity
+             band: Band number (1-indexed)
+             include_coordinates: Include x,y coordinates in nodes
+             graph_type: 'networkx' or 'sparse'
+             check_memory: Whether to check memory before operation

-         # create bounding box for each pixel
-         geometries = [
-             box(lon - x_res / 2, lat - y_res / 2, lon + x_res / 2, lat + y_res / 2)
-             for lon, lat in zip(df["lon"], df["lat"])
-         ]
+         Returns:
+             Graph representation of raster
+         """

-         gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=self.crs)
+         # Memory guard check
+         if check_memory:
+             self._memory_guard("graph", threshold_percent=80.0)

-         self.logger.info("Conversion to GeoDataFrame complete!")
-         return gdf
+         with self.open_dataset() as src:
+             band_idx = band - 1 if band is not None else 0
+             if band_idx < 0 or band_idx >= src.count:
+                 raise ValueError(
+                     f"Band {band} not available. Raster has {src.count} bands"
+                 )
+
+             data = src.read(band_idx + 1)
+             nodata = src.nodata if src.nodata is not None else self.nodata
+             valid_mask = (
+                 data != nodata if nodata is not None else np.ones_like(data, dtype=bool)
+             )
+
+             height, width = data.shape
+
+             # Find all valid pixels
+             valid_rows, valid_cols = np.where(valid_mask)
+             num_valid_pixels = len(valid_rows)
+
+             # Create a sequential mapping from (row, col) to a node ID
+             node_map = np.full(data.shape, -1, dtype=int)
+             node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)
+
+             # Define neighborhood offsets
+             if connectivity == 4:
+                 # von Neumann neighborhood (4-connectivity)
+                 offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
+             else:  # connectivity == 8
+                 # Moore neighborhood (8-connectivity)
+                 offsets = [
+                     (-1, -1),
+                     (-1, 0),
+                     (-1, 1),
+                     (0, -1),
+                     (0, 1),
+                     (1, -1),
+                     (1, 0),
+                     (1, 1),
+                 ]
+
+             # Collect nodes and edges
+             nodes_to_add = []
+             edges_to_add = []
+
+             for i in range(num_valid_pixels):
+                 row, col = valid_rows[i], valid_cols[i]
+                 current_node_id = node_map[row, col]
+
+                 # Prepare node attributes
+                 node_attrs = {"value": float(data[row, col])}
+                 if include_coordinates:
+                     x, y = src.xy(row, col)
+                     node_attrs["x"] = x
+                     node_attrs["y"] = y
+                 nodes_to_add.append((current_node_id, node_attrs))
+
+                 # Find neighbors and collect edges
+                 for dy, dx in offsets:
+                     neighbor_row, neighbor_col = row + dy, col + dx
+
+                     # Check if neighbor is within bounds and is a valid pixel
+                     if (
+                         0 <= neighbor_row < height
+                         and 0 <= neighbor_col < width
+                         and valid_mask[neighbor_row, neighbor_col]
+                     ):
+                         neighbor_node_id = node_map[neighbor_row, neighbor_col]
+
+                         # Ensure each edge is added only once
+                         if current_node_id < neighbor_node_id:
+                             neighbor_value = float(data[neighbor_row, neighbor_col])
+                             edges_to_add.append(
+                                 (current_node_id, neighbor_node_id, neighbor_value)
+                             )
+
+             if graph_type == "networkx":
+                 G = nx.Graph()
+                 G.add_nodes_from(nodes_to_add)
+                 G.add_weighted_edges_from(edges_to_add)
+                 return G
+             else:  # sparse matrix
+                 edges_array = np.array(edges_to_add)
+                 row_indices = edges_array[:, 0]
+                 col_indices = edges_array[:, 1]
+                 weights = edges_array[:, 2]
+
+                 # Add reverse edges for symmetric matrix
+                 from_idx = np.append(row_indices, col_indices)
+                 to_idx = np.append(col_indices, row_indices)
+                 weights = np.append(weights, weights)
+
+                 return sp.coo_matrix(
+                     (weights, (from_idx, to_idx)),
+                     shape=(num_valid_pixels, num_valid_pixels),
+                 ).tocsr()

      def sample_by_coordinates(
          self, coordinate_list: List[Tuple[float, float]], **kwargs
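The clipping and graph APIs added in this hunk compose naturally; a sketch (the bounds, CRS, and shapely geometry are illustrative):

    from shapely.geometry import box

    aoi = box(-10.0, 40.0, 0.0, 50.0)
    clipped = tp.clip_to_geometry(aoi, crop=True)       # new TifProcessor by default
    sub = tp.clip_to_bounds((-10.0, 40.0, 0.0, 50.0), bounds_crs="EPSG:4326")

    g = tp.to_graph(connectivity=4, include_coordinates=True)   # networkx.Graph
    adj = tp.to_graph(connectivity=8, graph_type="sparse")      # scipy CSR matrix

One caveat worth knowing: the sparse branch above assumes at least one edge exists — np.array([]) on an empty edge list has shape (0,) and cannot be indexed with [:, 0] — so fully masked rasters are safer routed through graph_type="networkx".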
@@ -661,11 +1009,63 @@ class TifProcessor:
          stat: Union[str, Callable] = "mean",
          batch_size: int = 100,
          n_workers: int = 4,
+         show_progress: bool = True,
+         check_memory: bool = True,
          **kwargs,
      ) -> np.ndarray:
          """
          Sample raster values by polygons in parallel using batching.
+
+         Args:
+             polygon_list: List of Shapely Polygon or MultiPolygon objects
+             stat: Statistic to compute
+             batch_size: Number of polygons per batch
+             n_workers: Number of worker processes
+             show_progress: Whether to display progress bar
+             check_memory: Whether to check memory before operation
+             **kwargs: Additional arguments
+
+         Returns:
+             np.ndarray of statistics for each polygon
          """
+         import sys
+         import warnings  # needed by the memory-guard branch below, not only the platform check
+
+         # Memory guard check with n_workers consideration
+         if check_memory:
+             is_safe = self._memory_guard(
+                 "batched_sampling",
+                 threshold_percent=85.0,
+                 n_workers=n_workers,
+                 raise_error=False,
+             )
+
+             if not is_safe:
+                 # Suggest reducing n_workers
+                 memory_info = self._check_available_memory()
+                 estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)
+
+                 # Calculate optimal workers
+                 suggested_workers = max(
+                     1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
+                 )
+
+                 warnings.warn(
+                     f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
+                     f"to reduce memory pressure.",
+                     ResourceWarning,
+                 )
+
+         # Platform check
+         if sys.platform in ["win32", "darwin"]:
+             import multiprocessing as mp
+
+             if mp.get_start_method(allow_none=True) != "fork":
+                 warnings.warn(
+                     "Batched sampling may not work on Windows/macOS. "
+                     "Use sample_by_polygons() if you encounter errors.",
+                     RuntimeWarning,
+                 )

          def _chunk_list(data_list, chunk_size):
              """Yield successive chunks from data_list."""
@@ -676,20 +1076,22 @@ class TifProcessor:
              return np.array([])

          stat_func = stat if callable(stat) else getattr(np, stat)
-
          polygon_chunks = list(_chunk_list(polygon_list, batch_size))

          with multiprocessing.Pool(
              initializer=self._initializer_worker, processes=n_workers
          ) as pool:
              process_func = partial(self._process_polygon_batch, stat_func=stat_func)
-             batched_results = list(
-                 tqdm(
-                     pool.imap(process_func, polygon_chunks),
-                     total=len(polygon_chunks),
-                     desc=f"Sampling polygons",
+             if show_progress:
+                 batched_results = list(
+                     tqdm(
+                         pool.imap(process_func, polygon_chunks),
+                         total=len(polygon_chunks),
+                         desc=f"Sampling polygons",
+                     )
                  )
-             )
+             else:
+                 batched_results = list(pool.imap(process_func, polygon_chunks))

          results = [item for sublist in batched_results for item in sublist]

@@ -701,24 +1103,46 @@ class TifProcessor:
          Opens the raster dataset and stores it in a process-local variable.
          This function runs once per worker, not for every task.
          """
+         global src_handle, memfile_handle
+
+         # Priority: merged > reprojected > original (same as open_dataset)
+         local_file_path = None
+         if self._merged_file_path:
+             # Merged file is a local temp file
+             local_file_path = self._merged_file_path
+         elif self._reprojected_file_path:
+             # Reprojected file is a local temp file
+             local_file_path = self._reprojected_file_path
+         elif isinstance(self.data_store, LocalDataStore):
+             # Local file - can open directly
+             local_file_path = str(self.dataset_path)
+
+         if local_file_path:
+             # Open local file directly
+             with open(local_file_path, "rb") as f:
+                 memfile_handle = rasterio.MemoryFile(f.read())
+                 src_handle = memfile_handle.open()
+         else:
+             # Custom DataStore
+             with self.data_store.open(str(self.dataset_path), "rb") as f:
+                 memfile_handle = rasterio.MemoryFile(f.read())
+                 src_handle = memfile_handle.open()
+
+     def _get_worker_dataset(self):
+         """Get dataset handle for worker process."""
          global src_handle
-         with self.data_store.open(self.dataset_path, "rb") as f:
-             with rasterio.MemoryFile(f.read()) as memfile:
-                 src_handle = memfile.open()
+         if src_handle is None:
+             raise RuntimeError("Raster dataset not initialized in this process.")
+         return src_handle

      def _process_single_polygon(self, polygon, stat_func):
          """
          Helper function to process a single polygon.
          This will be run in a separate process.
          """
-         global src_handle
-         if src_handle is None:
-             # This should not happen if the initializer is set up correctly,
-             # but it's a good defensive check.
-             raise RuntimeError("Raster dataset not initialized in this process.")
-
          try:
-             out_image, _ = mask(src_handle, [polygon], crop=True, filled=False)
+             src = self._get_worker_dataset()
+             out_image, _ = mask(src, [polygon], crop=True, filled=False)

              if hasattr(out_image, "mask"):
                  valid_data = out_image.compressed()
@@ -729,11 +1153,12 @@ class TifProcessor:
                      else out_image.flatten()
                  )

-             if len(valid_data) == 0:
-                 return np.nan
-             else:
-                 return stat_func(valid_data)
-         except Exception:
+             return stat_func(valid_data) if len(valid_data) > 0 else np.nan
+         except RuntimeError as e:
+             self.logger.error(f"Worker not initialized: {e}")
+             return np.nan
+         except Exception as e:
+             self.logger.debug(f"Error processing polygon: {e}")
              return np.nan

      def _process_polygon_batch(self, polygon_batch, stat_func):
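The worker refactor above also fixes a lifetime problem: the old initializer opened the MemoryFile in a with block, so it was already closed by the time workers touched src_handle; the new version keeps memfile_handle alive in a module-level global. The underlying pool-initializer pattern, in miniature (a generic sketch, not gigaspatial API; data.bin is illustrative):

    import multiprocessing

    _handle = None  # populated once per worker by the pool initializer

    def _init_worker(path):
        global _handle
        _handle = open(path, "rb")  # stand-in for the MemoryFile/open dance above

    def _read_byte(offset):
        _handle.seek(offset)
        return _handle.read(1)

    if __name__ == "__main__":
        with multiprocessing.Pool(
            processes=2, initializer=_init_worker, initargs=("data.bin",)
        ) as pool:
            print(pool.map(_read_byte, [0, 1, 2]))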
@@ -745,226 +1170,226 @@ class TifProcessor:
              for polygon in polygon_batch
          ]

-     def _to_rgba_dataframe(self, drop_transparent: bool = False) -> pd.DataFrame:
-         """
-         Convert RGBA TIF to DataFrame with separate columns for R, G, B, A values.
+     def _to_dataframe(
+         self,
+         band_number: Optional[int] = None,
+         drop_nodata: bool = True,
+         band_names: Optional[Union[str, List[str]]] = None,
+     ) -> pd.DataFrame:
          """
-         self.logger.info("Processing RGBA dataset...")
-
-         with self.open_dataset() as src:
-             if self.count != 4:
-                 raise ValueError("RGBA mode requires a 4-band TIF file")
-
-             # Read all four bands
-             red, green, blue, alpha = src.read()
-
-             x_coords, y_coords = self._get_pixel_coordinates()
-
-             if drop_transparent:
-                 mask = alpha > 0
-                 red = np.extract(mask, red)
-                 green = np.extract(mask, green)
-                 blue = np.extract(mask, blue)
-                 alpha = np.extract(mask, alpha)
-                 lons = np.extract(mask, x_coords)
-                 lats = np.extract(mask, y_coords)
-             else:
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
-                 red = red.flatten()
-                 green = green.flatten()
-                 blue = blue.flatten()
-                 alpha = alpha.flatten()
-
-             # Create DataFrame with RGBA values
-             data = pd.DataFrame(
-                 {
-                     "lon": lons,
-                     "lat": lats,
-                     "red": red,
-                     "green": green,
-                     "blue": blue,
-                     "alpha": alpha,
-                 }
-             )
-
-             # Normalize alpha values if they're not in [0, 1] range
-             if data["alpha"].max() > 1:
-                 data["alpha"] = data["alpha"] / data["alpha"].max()
-
-             self.logger.info("RGBA dataset is processed!")
-             return data
+         Process TIF to DataFrame - handles both single-band and multi-band.

-     def _to_rgb_dataframe(self, drop_nodata: bool = True) -> pd.DataFrame:
-         """Convert RGB TIF to DataFrame with separate columns for R, G, B values."""
-         if self.mode != "rgb":
-             raise ValueError("Use appropriate method for current mode")
-
-         self.logger.info("Processing RGB dataset...")
+         Args:
+             band_number: Specific band to read (1-indexed). If None, reads all bands.
+             drop_nodata: Whether to drop nodata values
+             band_names: Custom names for bands (multi-band only)

+         Returns:
+             pd.DataFrame with lon, lat, and band value(s)
+         """
          with self.open_dataset() as src:
-             if self.count != 3:
-                 raise ValueError("RGB mode requires a 3-band TIF file")
+             if band_number is not None:
+                 # SINGLE BAND MODE
+                 band = src.read(band_number)
+                 mask = self._build_data_mask(band, drop_nodata, src.nodata)
+                 lons, lats = self._extract_coordinates_with_mask(mask)
+                 pixel_values = (
+                     np.extract(mask, band) if mask is not None else band.flatten()
+                 )
+                 band_name = band_names if isinstance(band_names, str) else "pixel_value"

-             # Read all three bands
-             red, green, blue = src.read()
+                 return pd.DataFrame({"lon": lons, "lat": lats, band_name: pixel_values})
+             else:
+                 # MULTI-BAND MODE (all bands)
+                 stack = src.read()
+
+                 # Auto-detect band names by mode
+                 if band_names is None:
+                     if self.mode == "rgb":
+                         band_names = ["red", "green", "blue"]
+                     elif self.mode == "rgba":
+                         band_names = ["red", "green", "blue", "alpha"]
+                     else:
+                         band_names = [
+                             src.descriptions[i] or f"band_{i+1}"
+                             for i in range(self.count)
+                         ]

-             x_coords, y_coords = self._get_pixel_coordinates()
+                 # Build mask (checks ALL bands!)
+                 mask = self._build_multi_band_mask(stack, drop_nodata, src.nodata)

-             if drop_nodata:
-                 nodata_value = src.nodata
-                 if nodata_value is not None:
-                     mask = ~(
-                         (red == nodata_value)
-                         | (green == nodata_value)
-                         | (blue == nodata_value)
-                     )
-                     red = np.extract(mask, red)
-                     green = np.extract(mask, green)
-                     blue = np.extract(mask, blue)
-                     lons = np.extract(mask, x_coords)
-                     lats = np.extract(mask, y_coords)
-                 else:
-                     lons = x_coords.flatten()
-                     lats = y_coords.flatten()
-                     red = red.flatten()
-                     green = green.flatten()
-                     blue = blue.flatten()
-             else:
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
-                 red = red.flatten()
-                 green = green.flatten()
-                 blue = blue.flatten()
+                 # Create DataFrame
+                 data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
+                 df = pd.DataFrame(data_dict)

-             data = pd.DataFrame(
-                 {
-                     "lon": lons,
-                     "lat": lats,
-                     "red": red,
-                     "green": green,
-                     "blue": blue,
-                 }
-             )
+                 # RGBA: normalize alpha if needed
+                 if (
+                     self.mode == "rgba"
+                     and "alpha" in df.columns
+                     and df["alpha"].max() > 1
+                 ):
+                     df["alpha"] = df["alpha"] / 255.0

-             self.logger.info("RGB dataset is processed!")
-             return data
+                 return df

-     def _to_band_dataframe(
-         self, band_number: int = 1, drop_nodata: bool = True, drop_values: list = []
+     def _to_dataframe_chunked(
+         self,
+         windows: List[rasterio.windows.Window],
+         band_number: Optional[int] = None,
+         drop_nodata: bool = True,
+         band_names: Optional[Union[str, List[str]]] = None,
+         show_progress: bool = True,
      ) -> pd.DataFrame:
-         """Process single-band TIF to DataFrame."""
-         if self.mode != "single":
-             raise ValueError("Use appropriate method for current mode")
+         """Universal chunked converter for ALL modes."""

-         self.logger.info("Processing single-band dataset...")
-
-         if band_number <= 0 or band_number > self.count:
-             self.logger.error(
-                 f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
-             )
-             return None
+         chunks = []
+         iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows

          with self.open_dataset() as src:
+             # Auto-detect band names ONCE (before loop)
+             if band_number is None and band_names is None:
+                 if self.mode == "rgb":
+                     band_names = ["red", "green", "blue"]
+                 elif self.mode == "rgba":
+                     band_names = ["red", "green", "blue", "alpha"]
+                 else:  # multi
+                     band_names = [
+                         src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
+                     ]

-             band = src.read(band_number)
-
-             x_coords, y_coords = self._get_pixel_coordinates()
+             for window in iterator:
+                 if band_number is not None:
+                     # SINGLE BAND
+                     band_chunk = src.read(band_number, window=window)
+                     mask = self._build_data_mask(band_chunk, drop_nodata, src.nodata)
+                     lons, lats = self._get_chunk_coordinates(window, src)
+                     band_name = (
+                         band_names if isinstance(band_names, str) else "pixel_value"
+                     )

-             values_to_mask = []
-             if drop_nodata:
-                 nodata_value = src.nodata
-                 if nodata_value is not None:
-                     values_to_mask.append(nodata_value)
+                     # Build chunk DataFrame (could use helper but simple enough)
+                     if mask is not None:
+                         mask_flat = mask.flatten()
+                         chunk_df = pd.DataFrame(
+                             {
+                                 "lon": lons[mask_flat],
+                                 "lat": lats[mask_flat],
+                                 band_name: band_chunk.flatten()[mask_flat],
+                             }
+                         )
+                     else:
+                         chunk_df = pd.DataFrame(
+                             {"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
+                         )
+                 else:
+                     # MULTI-BAND (includes RGB/RGBA)
+                     stack_chunk = src.read(window=window)
+                     mask = self._build_multi_band_mask(
+                         stack_chunk, drop_nodata, src.nodata
+                     )
+                     lons, lats = self._get_chunk_coordinates(window, src)

-             if drop_values:
-                 values_to_mask.extend(drop_values)
+                     # Build DataFrame using helper
+                     band_dict = {
+                         band_names[i]: stack_chunk[i] for i in range(self.count)
+                     }
+                     chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)

-             if values_to_mask:
-                 data_mask = ~np.isin(band, values_to_mask)
-                 pixel_values = np.extract(data_mask, band)
-                 lons = np.extract(data_mask, x_coords)
-                 lats = np.extract(data_mask, y_coords)
-             else:
-                 pixel_values = band.flatten()
-                 lons = x_coords.flatten()
-                 lats = y_coords.flatten()
+                     # RGBA: normalize alpha
+                     if self.mode == "rgba" and "alpha" in chunk_df.columns:
+                         if chunk_df["alpha"].max() > 1:
+                             chunk_df["alpha"] = chunk_df["alpha"] / 255.0

-         data = pd.DataFrame({"lon": lons, "lat": lats, "pixel_value": pixel_values})
+                 chunks.append(chunk_df)

-         self.logger.info("Dataset is processed!")
-         return data
+         result = pd.concat(chunks, ignore_index=True)
+         return result

-     def _to_multi_band_dataframe(
+     def _prepare_geometry_for_clipping(
          self,
-         drop_nodata: bool = True,
-         drop_values: list = [],
-         band_names: Optional[List[str]] = None,
-     ) -> pd.DataFrame:
-         """
-         Process multi-band TIF to DataFrame with all bands included.
-
-         Args:
-             drop_nodata (bool): Whether to drop nodata values. Defaults to True.
-             drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
-             band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be named using
-                                               the band descriptions from the GeoTIFF metadata if available,
-                                               otherwise 'band_1', 'band_2', etc.
-
-         Returns:
-             pd.DataFrame: DataFrame containing coordinates and all band values
-         """
-         self.logger.info("Processing multi-band dataset...")
-
-         with self.open_dataset() as src:
-             # Read all bands
-             stack = src.read()
+         geometry: Union[
+             Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+         ],
+     ) -> List[dict]:
+         """Convert various geometry formats to list of GeoJSON-like dicts for rasterio.mask"""
+
+         if isinstance(geometry, (Polygon, MultiPolygon)):
+             # Shapely geometry
+             return [geometry.__geo_interface__]
+
+         elif isinstance(geometry, gpd.GeoDataFrame):
+             # GeoDataFrame - use all geometries
+             return [
+                 geom.__geo_interface__ for geom in geometry.geometry if geom is not None
+             ]
+
+         elif isinstance(geometry, gpd.GeoSeries):
+             # GeoSeries
+             return [geom.__geo_interface__ for geom in geometry if geom is not None]
+
+         elif isinstance(geometry, dict):
+             # Single GeoJSON-like dict
+             return [geometry]
+
+         elif isinstance(geometry, list):
+             # List of GeoJSON-like dicts
+             return geometry

-             x_coords, y_coords = self._get_pixel_coordinates()
-
-             # Initialize dictionary with coordinates
-             data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
+         else:
+             raise TypeError(
+                 f"Unsupported geometry type: {type(geometry)}. "
+                 "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
+                 "GeoJSON-like dict, or list of GeoJSON-like dicts."
+             )

-             # Get band descriptions from metadata if available
-             if band_names is None and hasattr(src, "descriptions") and src.descriptions:
-                 band_names = [
-                     desc if desc else f"band_{i+1}"
-                     for i, desc in enumerate(src.descriptions)
-                 ]
+     def _validate_geometry_crs(
+         self,
+         original_geometry: Any,
+     ) -> None:
+         """Validate that geometry CRS matches raster CRS"""
+
+         # Get raster CRS
+         raster_crs = self.crs
+
+         # Try to get geometry CRS
+         geometry_crs = None
+
+         if isinstance(original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)):
+             geometry_crs = original_geometry.crs
+         elif hasattr(original_geometry, "crs"):
+             geometry_crs = original_geometry.crs
+
+         # Warn if CRS mismatch detected
+         if geometry_crs is not None and raster_crs is not None:
+             if not raster_crs == geometry_crs:
+                 self.logger.warning(
+                     f"CRS mismatch detected! Raster CRS: {raster_crs}, "
+                     f"Geometry CRS: {geometry_crs}. "
+                     "Consider reprojecting geometry to match raster CRS for accurate clipping."
+                 )

-             # Process each band
-             for band_idx in range(self.count):
-                 band_data = stack[band_idx]
-
-                 # Handle nodata and other values to drop
-                 if drop_nodata or drop_values:
-                     values_to_mask = []
-                     if drop_nodata and src.nodata is not None:
-                         values_to_mask.append(src.nodata)
-                     if drop_values:
-                         values_to_mask.extend(drop_values)
-
-                     if values_to_mask:
-                         data_mask = ~np.isin(band_data, values_to_mask)
-                         band_values = np.extract(data_mask, band_data)
-                         if band_idx == 0:  # Only need to mask coordinates once
-                             data_dict["lon"] = np.extract(data_mask, x_coords)
-                             data_dict["lat"] = np.extract(data_mask, y_coords)
-                     else:
-                         band_values = band_data.flatten()
-                 else:
-                     band_values = band_data.flatten()
+     def _create_clipped_processor(
+         self, clipped_data: np.ndarray, clipped_meta: dict
+     ) -> "TifProcessor":
+         """
+         Helper to create a new TifProcessor instance from clipped data.
+         Saves the clipped data to a temporary file and initializes a new TifProcessor.
+         """
+         clipped_file_path = os.path.join(
+             self._temp_dir, f"clipped_temp_{os.urandom(8).hex()}.tif"
+         )
+         with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
+             dst.write(clipped_data)

-                 # Use custom band names if provided, otherwise use descriptions or default naming
-                 band_name = (
-                     band_names[band_idx]
-                     if band_names and len(band_names) > band_idx
-                     else f"band_{band_idx + 1}"
-                 )
-                 data_dict[band_name] = band_values
+         self.logger.info(f"Clipped raster saved to temporary file: {clipped_file_path}")

-             self.logger.info("Multi-band dataset is processed!")
-             return pd.DataFrame(data_dict)
+         # Create a new TifProcessor instance with the clipped data
+         # Pass relevant parameters from the current instance to maintain consistency
+         return TifProcessor(
+             dataset_path=clipped_file_path,
+             data_store=self.data_store,
+             mode=self.mode,
+         )

      def _get_pixel_coordinates(self):
          """Helper method to generate coordinate arrays for all pixels"""
@@ -991,60 +1416,322 @@ class TifProcessor:
991
1416
 
992
1417
  return self._cache["pixel_coords"]
993
1418
 
1419
+ def _get_chunk_coordinates(self, window, src):
1420
+ """Get coordinates for a specific window chunk."""
1421
+ transform = src.window_transform(window)
1422
+ rows, cols = np.meshgrid(
1423
+ np.arange(window.height), np.arange(window.width), indexing="ij"
1424
+ )
1425
+ xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten())
1426
+ return np.array(xs), np.array(ys)
994
1427
 
995
- def sample_multiple_tifs_by_coordinates(
996
- tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]
997
- ):
998
- """
999
- Sample raster values from multiple TIFF files for given coordinates.
1428
+ def _extract_coordinates_with_mask(self, mask=None):
1429
+ """Extract flattened coordinates, optionally applying a mask."""
1430
+ x_coords, y_coords = self._get_pixel_coordinates()
1000
1431
 
1001
- Parameters:
1002
- - tif_processors: List of TifProcessor instances.
1003
- - coordinate_list: List of (x, y) coordinates.
1432
+ if mask is not None:
1433
+ return np.extract(mask, x_coords), np.extract(mask, y_coords)
1004
1434
 
1005
- Returns:
1006
- - A NumPy array of sampled values, taking the first non-nodata value encountered.
1007
- """
1008
- sampled_values = np.full(len(coordinate_list), np.nan, dtype=np.float32)
1435
+ return x_coords.flatten(), y_coords.flatten()
1436
+
1437
+ def _build_data_mask(self, data, drop_nodata=True, nodata_value=None):
1438
+ """Build a boolean mask for filtering data based on nodata values."""
1439
+ if not drop_nodata or nodata_value is None:
1440
+ return None
1441
+
1442
+ return data != nodata_value
1443
+
1444
+ def _build_multi_band_mask(
1445
+ self,
1446
+ bands: np.ndarray,
1447
+ drop_nodata: bool = True,
1448
+ nodata_value: Optional[float] = None,
1449
+ ) -> Optional[np.ndarray]:
1450
+ """
1451
+ Build mask for multi-band data - drops pixels where ANY band has nodata.
1452
+
1453
+ Args:
1454
+ bands: 3D array of shape (n_bands, height, width)
1455
+ drop_nodata: Whether to drop nodata values
1456
+ nodata_value: The nodata value to check
1457
+
1458
+ Returns:
1459
+ Boolean mask or None if no masking needed
1460
+ """
1461
+ if not drop_nodata or nodata_value is None:
1462
+ return None
1463
+
1464
+ # Check if ANY band has nodata at each pixel location
1465
+ has_nodata = np.any(bands == nodata_value, axis=0)
1466
+
1467
+ # Return True where ALL bands are valid
1468
+ valid_mask = ~has_nodata
1469
+
1470
+ return valid_mask if not valid_mask.all() else None
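A minimal numpy illustration of the ANY-band semantics above: a pixel survives only if every band holds valid data.

import numpy as np

bands = np.array([
    [[1, -9999], [3, 4]],   # band 1
    [[5, 6], [-9999, 8]],   # band 2
])  # shape (2, 2, 2), nodata = -9999

has_nodata = np.any(bands == -9999, axis=0)
valid_mask = ~has_nodata
# valid_mask:
# [[ True, False],
#  [False,  True]]  -> kept only where all bands are valid
values_band1 = np.extract(valid_mask, bands[0])  # array([1, 4])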
1471
+
1472
+ def _bands_to_dict(self, bands, band_count, band_names, mask=None):
1473
+ """Read specified bands and return as a dictionary with optional masking."""
1474
+
1475
+ lons, lats = self._extract_coordinates_with_mask(mask)
1476
+ data_dict = {"lon": lons, "lat": lats}
1477
+
1478
+ for idx, name in enumerate(band_names[:band_count]):
1479
+ band_data = bands[idx]
1480
+ data_dict[name] = (
1481
+ np.extract(mask, band_data) if mask is not None else band_data.flatten()
1482
+ )
1483
+
1484
+ return data_dict
1485
+
1486
+ def _calculate_optimal_chunk_size(
1487
+ self, operation: str = "conversion", target_memory_mb: int = 500
1488
+ ) -> int:
1489
+ """
1490
+ Calculate optimal chunk size (number of rows) based on target memory usage.
1009
1491
 
1010
- for tp in tif_processors:
1011
- values = tp.sample_by_coordinates(coordinate_list=coordinate_list)
1492
+ Args:
1493
+ operation: Type of operation ('conversion', 'graph')
1494
+ target_memory_mb: Target memory per chunk in megabytes
1012
1495
 
1013
- if tp.nodata is not None:
1014
- mask = (np.isnan(sampled_values)) & (
1015
- values != tp.nodata
1016
- ) # Replace only NaNs
1496
+ Returns:
1497
+ Number of rows per chunk
1498
+ """
1499
+ bytes_per_element = np.dtype(self.dtype).itemsize
1500
+ n_bands = self.count
1501
+ width = self.width
1502
+
1503
+ # Adjust for operation type
1504
+ if operation == "conversion":
1505
+ # DataFrame overhead is roughly 2x
1506
+ bytes_per_row = width * n_bands * bytes_per_element * 2
1507
+ elif operation == "graph":
1508
+ # Graph needs additional space for edges
1509
+ bytes_per_row = width * bytes_per_element * 4 # Estimate
1017
1510
  else:
1018
- mask = np.isnan(sampled_values) # No explicit nodata, replace all NaNs
1511
+ bytes_per_row = width * n_bands * bytes_per_element
1019
1512
 
1020
- sampled_values[mask] = values[mask] # Update only missing values
1513
+ target_bytes = target_memory_mb * 1024 * 1024
1514
+ chunk_rows = max(1, int(target_bytes / bytes_per_row))
1021
1515
 
1022
- return sampled_values
1516
+ # Ensure chunk size doesn't exceed total height
1517
+ chunk_rows = min(chunk_rows, self.height)
1023
1518
 
1519
+ self.logger.info(
1520
+ f"Calculated chunk size: {chunk_rows} rows "
1521
+ f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
1522
+ )
1024
1523
 
1025
- def sample_multiple_tifs_by_polygons(
1026
- tif_processors: List[TifProcessor],
1027
- polygon_list: List[Union[Polygon, MultiPolygon]],
1028
- stat: str = "mean",
1029
- ) -> np.ndarray:
1030
- """
1031
- Sample raster values from multiple TIFF files for polygons in a list and join the results.
1524
+ return chunk_rows
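A worked instance of the formula, assuming a 3-band float32 raster 10,000 pixels wide and the default 500 MB target:

bytes_per_element = 4                # float32
width, n_bands = 10_000, 3
bytes_per_row = width * n_bands * bytes_per_element * 2  # "conversion": ~2x DataFrame overhead
target_bytes = 500 * 1024 * 1024
chunk_rows = max(1, int(target_bytes / bytes_per_row))   # -> 2184 rows per chunk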
1525
+
1526
+ def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
1527
+ """
1528
+ Generate window objects for chunked reading.
1032
1529
 
1033
- Parameters:
1034
- - tif_processors: List of TifProcessor instances.
1035
- - polygon_list: List of polygon geometries (can include MultiPolygons).
1036
- - stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
1530
+ Args:
1531
+ chunk_size: Number of rows per chunk
1037
1532
 
1038
- Returns:
1039
- - A NumPy array of sampled values, taking the first non-nodata value encountered.
1040
- """
1041
- sampled_values = np.full(len(polygon_list), np.nan, dtype=np.float32)
1533
+ Returns:
1534
+ List of rasterio.windows.Window objects
1535
+ """
1536
+ windows = []
1537
+ for row_start in range(0, self.height, chunk_size):
1538
+ row_end = min(row_start + chunk_size, self.height)
1539
+ window = rasterio.windows.Window(
1540
+ col_off=0,
1541
+ row_off=row_start,
1542
+ width=self.width,
1543
+ height=row_end - row_start,
1544
+ )
1545
+ windows.append(window)
1546
+
1547
+ return windows
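These windows are meant to be consumed one at a time; a sketch of the reading side, with an illustrative path and a fixed chunk size:

import rasterio
import rasterio.windows

with rasterio.open("input.tif") as src:  # path illustrative
    chunk_size = 1024  # rows per chunk, e.g. from _calculate_optimal_chunk_size
    for row_off in range(0, src.height, chunk_size):
        height = min(chunk_size, src.height - row_off)
        window = rasterio.windows.Window(0, row_off, src.width, height)
        block = src.read(window=window)  # shape: (bands, height, src.width)
        # ...process block...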
1548
+
1549
+ def _format_bytes(self, bytes_value: int) -> str:
1550
+ """Convert bytes to human-readable format."""
1551
+ for unit in ["B", "KB", "MB", "GB", "TB"]:
1552
+ if bytes_value < 1024.0:
1553
+ return f"{bytes_value:.2f} {unit}"
1554
+ bytes_value /= 1024.0
1555
+ return f"{bytes_value:.2f} PB"
1556
+
1557
+ def _check_available_memory(self) -> dict:
1558
+ """
1559
+ Check available system memory.
1560
+
1561
+ Returns:
1562
+ Dict with total, available, and used memory info
1563
+ """
1564
+ import psutil
1565
+
1566
+ memory = psutil.virtual_memory()
1567
+ return {
1568
+ "total": memory.total,
1569
+ "available": memory.available,
1570
+ "used": memory.used,
1571
+ "percent": memory.percent,
1572
+ "available_human": self._format_bytes(memory.available),
1573
+ }
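psutil is imported inside the method, so it only needs to be installed when memory checks are actually used. The underlying call looks like this (values machine-dependent):

import psutil

mem = psutil.virtual_memory()
print(mem.total, mem.available, mem.used, mem.percent)
# e.g. 16 GB total with ~9 GB available at ~45% usage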
1574
+
1575
+ def _estimate_memory_usage(
1576
+ self, operation: str = "conversion", n_workers: int = 1
1577
+ ) -> dict:
1578
+ """
1579
+ Estimate memory usage for various operations.
1580
+
1581
+ Args:
1582
+ operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
1583
+ n_workers: Number of workers (for batched_sampling)
1584
+
1585
+ Returns:
1586
+ Dict with estimated memory usage in bytes and human-readable format
1587
+ """
1588
+ bytes_per_element = np.dtype(self.dtype).itemsize
1589
+ n_pixels = self.width * self.height
1590
+ n_bands = self.count
1591
+
1592
+ estimates = {}
1593
+
1594
+ if operation == "conversion":
1595
+ # to_dataframe/to_geodataframe: full raster + DataFrame overhead
1596
+ raster_memory = n_pixels * n_bands * bytes_per_element
1597
+ # DataFrame overhead (roughly 2x for storage + processing)
1598
+ dataframe_memory = (
1599
+ n_pixels * n_bands * 16
1600
+ ) # 16 bytes per value in DataFrame
1601
+ total = raster_memory + dataframe_memory
1602
+ estimates["raster"] = raster_memory
1603
+ estimates["dataframe"] = dataframe_memory
1604
+ estimates["total"] = total
1605
+
1606
+ elif operation == "batched_sampling":
1607
+ # Each worker loads full raster into MemoryFile
1608
+ # Need to get file size
1609
+ if self._merged_file_path:
1610
+ file_path = self._merged_file_path
1611
+ elif self._reprojected_file_path:
1612
+ file_path = self._reprojected_file_path
1613
+ else:
1614
+ file_path = str(self.dataset_path)
1615
+
1616
+ try:
1617
+ file_size = os.path.getsize(file_path)
1618
+ except OSError:
1621
+ # Estimate if can't get file size
1622
+ file_size = n_pixels * n_bands * bytes_per_element * 1.2 # Add overhead
1623
+
1624
+ estimates["per_worker"] = file_size
1625
+ estimates["total"] = file_size * n_workers
1626
+
1627
+ elif operation == "merge":
1628
+ # _merge_with_mean uses float64 arrays
1629
+ raster_memory = n_pixels * n_bands * 8 # float64
1630
+ estimates["sum_array"] = raster_memory
1631
+ estimates["count_array"] = n_pixels * 4 # int32
1632
+ estimates["total"] = raster_memory + n_pixels * 4
1633
+
1634
+ elif operation == "graph":
1635
+ # to_graph: data + node_map + edges
1636
+ data_memory = n_pixels * bytes_per_element
1637
+ node_map_memory = n_pixels * 4 # int32
1638
+ # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
1639
+ edges_memory = n_pixels * 4 * 3 * 8 # 3 values per edge, float64
1640
+ total = data_memory + node_map_memory + edges_memory
1641
+ estimates["data"] = data_memory
1642
+ estimates["node_map"] = node_map_memory
1643
+ estimates["edges"] = edges_memory
1644
+ estimates["total"] = total
1645
+
1646
+ # Add human-readable format
1647
+ estimates["human_readable"] = self._format_bytes(estimates["total"])
1648
+
1649
+ return estimates
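A worked "conversion" estimate for a hypothetical 10,000 x 10,000 single-band float32 raster:

n_pixels = 10_000 * 10_000
raster_memory = n_pixels * 1 * 4      # 400 MB of raw float32 pixels
dataframe_memory = n_pixels * 1 * 16  # 1.6 GB at 16 bytes per DataFrame value
total = raster_memory + dataframe_memory  # ~2 GB estimated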
1650
+
1651
+ def _memory_guard(
1652
+ self,
1653
+ operation: str,
1654
+ threshold_percent: float = 80.0,
1655
+ n_workers: Optional[int] = None,
1656
+ raise_error: bool = False,
1657
+ ) -> bool:
1658
+ """
1659
+ Check if operation is safe to perform given memory constraints.
1660
+
1661
+ Args:
1662
+ operation: Type of operation to check
1663
+ threshold_percent: Maximum % of available memory to use (default 80%)
1664
+ n_workers: Number of workers (for batched operations)
1665
+ raise_error: If True, raise MemoryError instead of warning
1666
+
1667
+ Returns:
1668
+ True if operation is safe, False otherwise
1042
1669
 
1043
- for tp in tif_processors:
1044
- values = tp.sample_by_polygons(polygon_list=polygon_list, stat=stat)
1670
+ Raises:
1671
+ MemoryError: If raise_error=True and memory insufficient
1672
+ """
1673
+ import warnings
1674
+
1675
+ estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
1676
+ memory_info = self._check_available_memory()
1677
+
1678
+ estimated_usage = estimates["total"]
1679
+ available = memory_info["available"]
1680
+ threshold = available * (threshold_percent / 100.0)
1681
+
1682
+ is_safe = estimated_usage <= threshold
1683
+
1684
+ if not is_safe:
1685
+ usage_str = self._format_bytes(estimated_usage)
1686
+ available_str = memory_info["available_human"]
1687
+
1688
+ message = (
1689
+ f"Memory warning: {operation} operation may require {usage_str} "
1690
+ f"but only {available_str} is available. "
1691
+ f"Current memory usage: {memory_info['percent']:.1f}%"
1692
+ )
1045
1693
 
1046
- mask = np.isnan(sampled_values) # replace all NaNs
1694
+ if raise_error:
1695
+ raise MemoryError(message)
1696
+ else:
1697
+ warnings.warn(message, ResourceWarning)
1698
+ if hasattr(self, "logger"):
1699
+ self.logger.warning(message)
1700
+
1701
+ return is_safe
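A hedged usage sketch: check before a full-raster conversion and fall back to chunked processing when the estimate does not fit. to_dataframe is the conversion the estimates above refer to; the fallback branch is illustrative rather than prescribed by the package.

tp = TifProcessor(dataset_path="input.tif")  # path illustrative

if tp._memory_guard("conversion", threshold_percent=80.0):
    df = tp.to_dataframe()
else:
    chunk_rows = tp._calculate_optimal_chunk_size("conversion")
    for window in tp._get_chunk_windows(chunk_rows):
        ...  # process the raster window-by-window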
1702
+
1703
+ def _validate_mode_band_compatibility(self):
1704
+ """Validate that mode matches band count."""
1705
+ mode_requirements = {
1706
+ "single": (1, "1-band"),
1707
+ "rgb": (3, "3-band"),
1708
+ "rgba": (4, "4-band"),
1709
+ }
1710
+
1711
+ if self.mode in mode_requirements:
1712
+ required_count, description = mode_requirements[self.mode]
1713
+ if self.count != required_count:
1714
+ raise ValueError(
1715
+ f"{self.mode.upper()} mode requires a {description} TIF file"
1716
+ )
1717
+ elif self.mode == "multi" and self.count < 2:
1718
+ raise ValueError("Multi mode requires a TIF file with 2 or more bands")
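Because this runs from __post_init__, a mode/band mismatch now fails at construction time; for example (path illustrative):

try:
    tp = TifProcessor(dataset_path="single_band.tif", mode="rgb")
except ValueError as err:
    print(err)  # "RGB mode requires a 3-band TIF file"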
1047
1719
 
1048
- sampled_values[mask] = values[mask] # Update only values with samapled value
1720
+ def __enter__(self):
1721
+ return self
1049
1722
 
1050
- return sampled_values
1723
+ def __del__(self):
1724
+ """Clean up temporary files and directories."""
1725
+ if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
1726
+ shutil.rmtree(self._temp_dir, ignore_errors=True)
1727
+
1728
+ def cleanup(self):
1729
+ """Explicit cleanup method for better control."""
1730
+ if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
1731
+ shutil.rmtree(self._temp_dir)
1732
+ self.logger.info("Cleaned up temporary files")
1733
+
1734
+ def __exit__(self, exc_type, exc_value, traceback):
1735
+ """Proper context manager exit with cleanup."""
1736
+ self.cleanup()
1737
+ return False
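With the context-manager protocol in place, the temporary directory holding merged or reprojected rasters can be scoped to a with-block (paths illustrative):

with TifProcessor(dataset_path=["a.tif", "b.tif"], merge_method="mean") as tp:
    df = tp.to_dataframe()
# cleanup() has removed the temporary files here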