giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,10 @@ from gigaspatial.core.io.data_store import DataStore
 from gigaspatial.core.io.local_data_store import LocalDataStore
 from gigaspatial.config import config
 
+# Global variables for multiprocessing workers
+src_handle = None
+memfile_handle = None
+
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class TifProcessor:
@@ -73,16 +77,7 @@ class TifProcessor:
             self.dataset_path = self._reprojected_file_path
 
         self._load_metadata()
-
-        # Validate mode and band count
-        if self.mode == "rgba" and self.count != 4:
-            raise ValueError("RGBA mode requires a 4-band TIF file")
-        if self.mode == "rgb" and self.count != 3:
-            raise ValueError("RGB mode requires a 3-band TIF file")
-        if self.mode == "single" and self.count != 1:
-            raise ValueError("Single mode requires a 1-band TIF file")
-        if self.mode == "multi" and self.count < 2:
-            raise ValueError("Multi mode requires a TIF file with 2 or more bands")
+        self._validate_mode_band_compatibility()
 
     @contextmanager
     def open_dataset(self):
@@ -93,6 +88,9 @@ class TifProcessor:
         elif self._reprojected_file_path:
             with rasterio.open(self._reprojected_file_path) as src:
                 yield src
+        elif isinstance(self.data_store, LocalDataStore):
+            with rasterio.open(str(self.dataset_path)) as src:
+                yield src
         else:
             with self.data_store.open(str(self.dataset_path), "rb") as f:
                 with rasterio.MemoryFile(f.read()) as memfile:
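
Note on the new LocalDataStore branch: it lets rasterio open local files directly and read windows lazily from disk, whereas the fallback path still buffers the entire file into a MemoryFile. A minimal sketch of the difference (file path hypothetical, outside the package):

    import rasterio

    # Local fast path: rasterio streams windows from disk on demand
    with rasterio.open("raster.tif") as src:
        row = src.read(1, window=((0, 1), (0, src.width)))

    # Remote-store path: the whole file is read into RAM first
    with open("raster.tif", "rb") as f:
        with rasterio.MemoryFile(f.read()) as memfile:
            with memfile.open() as src:
                row = src.read(1, window=((0, 1), (0, src.width)))
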
@@ -514,19 +512,36 @@ class TifProcessor:
     def height(self):
         return self._cache["height"]
 
-    def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
+    def to_dataframe(
+        self, drop_nodata=True, check_memory=True, **kwargs
+    ) -> pd.DataFrame:
+        """
+        Convert raster to DataFrame.
+
+        Args:
+            drop_nodata: Whether to drop nodata values
+            check_memory: Whether to check memory before operation (default True)
+            **kwargs: Additional arguments
+
+        Returns:
+            pd.DataFrame with raster data
+        """
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("conversion", threshold_percent=80.0)
+
         try:
             if self.mode == "single":
-                df = self._to_band_dataframe(drop_nodata=drop_nodata, **kwargs)
-            elif self.mode == "rgb":
-                df = self._to_rgb_dataframe(drop_nodata=drop_nodata)
-            elif self.mode == "rgba":
-                df = self._to_rgba_dataframe(drop_transparent=drop_nodata)
-            elif self.mode == "multi":
-                df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
+                return self._to_dataframe(
+                    band_number=kwargs.get("band_number", 1),
+                    drop_nodata=drop_nodata,
+                    band_names=kwargs.get("band_names", None),
+                )
             else:
-                raise ValueError(
-                    f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
+                return self._to_dataframe(
+                    band_number=None,  # All bands
+                    drop_nodata=drop_nodata,
+                    band_names=kwargs.get("band_names", None),
                 )
         except Exception as e:
             raise ValueError(
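
Both branches above now delegate to the shared _to_dataframe helper, and to_geodataframe (next hunk) reuses it with check_memory=False so the 80% guard runs only once per call chain. A usage sketch (import path and file name assumed, not shown in this diff):

    from gigaspatial.processing.tif_processor import TifProcessor  # path assumed

    tp = TifProcessor(dataset_path="elevation.tif", mode="single")
    df = tp.to_dataframe(drop_nodata=True)         # memory guard on by default
    gdf = tp.to_geodataframe(check_memory=False)   # skip the guard explicitly
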
@@ -537,12 +552,23 @@ class TifProcessor:
 
         return df
 
-    def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
+    def to_geodataframe(self, check_memory=True, **kwargs) -> gpd.GeoDataFrame:
         """
         Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
         Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+
+        Args:
+            check_memory: Whether to check memory before operation
+            **kwargs: Additional arguments passed to to_dataframe()
+
+        Returns:
+            gpd.GeoDataFrame with raster data
         """
-        df = self.to_dataframe(**kwargs)
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("conversion", threshold_percent=80.0)
+
+        df = self.to_dataframe(check_memory=False, **kwargs)
 
         x_res, y_res = self.resolution
 
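
The pixel zones the to_geodataframe docstring describes are the standard half-resolution boxes around each pixel center. A sketch of that construction (the exact geometry code is outside this hunk; shapely's box is assumed):

    from shapely.geometry import box

    x_res, y_res = 0.001, 0.001   # illustrative resolution
    lon, lat = 32.5, 1.25         # one pixel center from to_dataframe()
    zone = box(lon - x_res / 2, lat - y_res / 2,
               lon + x_res / 2, lat + y_res / 2)
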
@@ -556,19 +582,204 @@ class TifProcessor:
 
         return gdf
 
+    def to_dataframe_chunked(
+        self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
+    ):
+        """
+        Convert raster to DataFrame using chunked processing for memory efficiency.
+
+        Automatically routes to the appropriate chunked method based on mode.
+        Chunk size is automatically calculated based on target memory usage.
+
+        Args:
+            drop_nodata: Whether to drop nodata values
+            chunk_size: Number of rows per chunk (auto-calculated if None)
+            target_memory_mb: Target memory per chunk in MB (default 500)
+            **kwargs: Additional arguments (band_number, band_names, etc.)
+        """
+
+        if chunk_size is None:
+            chunk_size = self._calculate_optimal_chunk_size(
+                "conversion", target_memory_mb
+            )
+
+        windows = self._get_chunk_windows(chunk_size)
+
+        # SIMPLE ROUTING
+        if self.mode == "single":
+            return self._to_dataframe_chunked(
+                windows,
+                band_number=kwargs.get("band_number", 1),
+                drop_nodata=drop_nodata,
+                band_names=kwargs.get("band_names", None),
+            )
+        else:  # rgb, rgba, multi
+            return self._to_dataframe_chunked(
+                windows,
+                band_number=None,
+                drop_nodata=drop_nodata,
+                band_names=kwargs.get("band_names", None),
+            )
+
+    def clip_to_geometry(
+        self,
+        geometry: Union[
+            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+        ],
+        crop: bool = True,
+        all_touched: bool = True,
+        invert: bool = False,
+        nodata: Optional[Union[int, float]] = None,
+        pad: bool = False,
+        pad_width: float = 0.5,
+        return_clipped_processor: bool = True,
+    ) -> Union["TifProcessor", tuple]:
+        """
+        Clip raster to geometry boundaries.
+
+        Parameters:
+        -----------
+        geometry : various
+            Geometry to clip to. Can be:
+            - Shapely Polygon or MultiPolygon
+            - GeoDataFrame or GeoSeries
+            - List of GeoJSON-like dicts
+            - Single GeoJSON-like dict
+        crop : bool, default True
+            Whether to crop the raster to the extent of the geometry
+        all_touched : bool, default True
+            Include pixels that touch the geometry boundary
+        invert : bool, default False
+            If True, mask pixels inside geometry instead of outside
+        nodata : int or float, optional
+            Value to use for masked pixels. If None, uses raster's nodata value
+        pad : bool, default False
+            Pad geometry by half pixel before clipping
+        pad_width : float, default 0.5
+            Width of padding in pixels if pad=True
+        return_clipped_processor : bool, default True
+            If True, returns new TifProcessor with clipped data
+            If False, returns (clipped_array, transform, metadata)
+
+        Returns:
+        --------
+        TifProcessor or tuple
+            Either new TifProcessor instance or (array, transform, metadata) tuple
+        """
+        # Handle different geometry input types
+        shapes = self._prepare_geometry_for_clipping(geometry)
+
+        # Validate CRS compatibility
+        self._validate_geometry_crs(geometry)
+
+        # Perform the clipping
+        with self.open_dataset() as src:
+            try:
+                clipped_data, clipped_transform = mask(
+                    dataset=src,
+                    shapes=shapes,
+                    crop=crop,
+                    all_touched=all_touched,
+                    invert=invert,
+                    nodata=nodata,
+                    pad=pad,
+                    pad_width=pad_width,
+                    filled=True,
+                )
+
+                # Update metadata for the clipped raster
+                clipped_meta = src.meta.copy()
+                clipped_meta.update(
+                    {
+                        "height": clipped_data.shape[1],
+                        "width": clipped_data.shape[2],
+                        "transform": clipped_transform,
+                        "nodata": nodata if nodata is not None else src.nodata,
+                    }
+                )
+
+            except ValueError as e:
+                if "Input shapes do not overlap raster" in str(e):
+                    raise ValueError(
+                        "The geometry does not overlap with the raster. "
+                        "Check that both are in the same coordinate reference system."
+                    ) from e
+                else:
+                    raise e
+
+        if return_clipped_processor:
+            # Create a new TifProcessor with the clipped data
+            return self._create_clipped_processor(clipped_data, clipped_meta)
+        else:
+            return clipped_data, clipped_transform, clipped_meta
+
+    def clip_to_bounds(
+        self,
+        bounds: tuple,
+        bounds_crs: Optional[str] = None,
+        return_clipped_processor: bool = True,
+    ) -> Union["TifProcessor", tuple]:
+        """
+        Clip raster to rectangular bounds.
+
+        Parameters:
+        -----------
+        bounds : tuple
+            Bounding box as (minx, miny, maxx, maxy)
+        bounds_crs : str, optional
+            CRS of the bounds. If None, assumes same as raster CRS
+        return_clipped_processor : bool, default True
+            If True, returns new TifProcessor, else returns (array, transform, metadata)
+
+        Returns:
+        --------
+        TifProcessor or tuple
+            Either new TifProcessor instance or (array, transform, metadata) tuple
+        """
+        # Create bounding box geometry
+        bbox_geom = box(*bounds)
+
+        # If bounds_crs is specified and different from raster CRS, create GeoDataFrame for reprojection
+        if bounds_crs is not None:
+            raster_crs = self.crs
+
+            if not self.crs == bounds_crs:
+                # Create GeoDataFrame with bounds CRS and reproject
+                bbox_gdf = gpd.GeoDataFrame([1], geometry=[bbox_geom], crs=bounds_crs)
+                bbox_gdf = bbox_gdf.to_crs(raster_crs)
+                bbox_geom = bbox_gdf.geometry.iloc[0]
+
+        return self.clip_to_geometry(
+            geometry=bbox_geom,
+            crop=True,
+            return_clipped_processor=return_clipped_processor,
+        )
+
     def to_graph(
         self,
         connectivity: Literal[4, 8] = 4,
         band: Optional[int] = None,
         include_coordinates: bool = False,
         graph_type: Literal["networkx", "sparse"] = "networkx",
-        chunk_size: Optional[int] = None,
+        check_memory: bool = True,
     ) -> Union[nx.Graph, sp.csr_matrix]:
         """
         Convert raster to graph based on pixel adjacency.
+
+        Args:
+            connectivity: 4 or 8-connectivity
+            band: Band number (1-indexed)
+            include_coordinates: Include x,y coordinates in nodes
+            graph_type: 'networkx' or 'sparse'
+            check_memory: Whether to check memory before operation
+
+        Returns:
+            Graph representation of raster
         """
-        if chunk_size is not None:
-            raise NotImplementedError("Chunked processing is not yet implemented.")
+
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("graph", threshold_percent=80.0)
 
         with self.open_dataset() as src:
             band_idx = band - 1 if band is not None else 0
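
The new chunked and clipping entry points compose with the existing API; a hedged usage sketch (tp as above, polygon and bounds illustrative):

    # Bounded memory regardless of raster size; rows per chunk derived from the target
    df = tp.to_dataframe_chunked(target_memory_mb=250)

    # Clip to a WGS84 bounding box; bounds are reprojected to the raster CRS first
    clipped_tp = tp.clip_to_bounds((32.0, 1.0, 33.0, 2.0), bounds_crs="EPSG:4326")

    # Clip to a polygon and get raw arrays instead of a new processor
    arr, transform, meta = tp.clip_to_geometry(
        geometry=polygon, return_clipped_processor=False
    )
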
@@ -657,12 +868,12 @@ class TifProcessor:
             weights = edges_array[:, 2]
 
             # Add reverse edges for symmetric matrix
-            row_indices.extend(col_indices)
-            col_indices.extend(row_indices)
-            weights.extend(weights)
+            from_idx = np.append(row_indices, col_indices)
+            to_idx = np.append(col_indices, row_indices)
+            weights = np.append(weights, weights)
 
             return sp.coo_matrix(
-                (weights, (row_indices, col_indices)),
+                (weights, (from_idx, to_idx)),
                 shape=(num_valid_pixels, num_valid_pixels),
             ).tocsr()
 
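
The replaced lines fixed a real bug: the index arrays are numpy slices of edges_array (see the context line above), which have no list-style extend, and the old sequence reused the already-extended row_indices when building col_indices. A standalone check of the corrected mirroring (plain numpy/scipy, outside the class):

    import numpy as np
    import scipy.sparse as sp

    row_indices = np.array([0, 1])
    col_indices = np.array([1, 2])
    weights = np.array([0.5, 0.25])

    from_idx = np.append(row_indices, col_indices)  # [0, 1, 1, 2]
    to_idx = np.append(col_indices, row_indices)    # [1, 2, 0, 1]
    weights = np.append(weights, weights)           # each edge stored both ways

    m = sp.coo_matrix((weights, (from_idx, to_idx)), shape=(3, 3)).tocsr()
    assert (m != m.T).nnz == 0  # symmetric, as the comment promises
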
@@ -798,11 +1009,63 @@ class TifProcessor:
         stat: Union[str, Callable] = "mean",
         batch_size: int = 100,
         n_workers: int = 4,
+        show_progress: bool = True,
+        check_memory: bool = True,
         **kwargs,
     ) -> np.ndarray:
         """
         Sample raster values by polygons in parallel using batching.
+
+        Args:
+            polygon_list: List of Shapely Polygon or MultiPolygon objects
+            stat: Statistic to compute
+            batch_size: Number of polygons per batch
+            n_workers: Number of worker processes
+            show_progress: Whether to display progress bar
+            check_memory: Whether to check memory before operation
+            **kwargs: Additional arguments
+
+        Returns:
+            np.ndarray of statistics for each polygon
         """
+        import sys
+
+        # Memory guard check with n_workers consideration
+        if check_memory:
+            is_safe = self._memory_guard(
+                "batched_sampling",
+                threshold_percent=85.0,
+                n_workers=n_workers,
+                raise_error=False,
+            )
+
+            if not is_safe:
+                # Suggest reducing n_workers
+                memory_info = self._check_available_memory()
+                estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)
+
+                # Calculate optimal workers
+                suggested_workers = max(
+                    1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
+                )
+
+                warnings.warn(
+                    f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
+                    f"to reduce memory pressure.",
+                    ResourceWarning,
+                )
+
+        # Platform check
+        if sys.platform in ["win32", "darwin"]:
+            import warnings
+            import multiprocessing as mp
+
+            if mp.get_start_method(allow_none=True) != "fork":
+                warnings.warn(
+                    "Batched sampling may not work on Windows/macOS. "
+                    "Use sample_by_polygons() if you encounter errors.",
+                    RuntimeWarning,
+                )
 
         def _chunk_list(data_list, chunk_size):
             """Yield successive chunks from data_list."""
@@ -813,20 +1076,22 @@ class TifProcessor:
             return np.array([])
 
         stat_func = stat if callable(stat) else getattr(np, stat)
-
         polygon_chunks = list(_chunk_list(polygon_list, batch_size))
 
         with multiprocessing.Pool(
             initializer=self._initializer_worker, processes=n_workers
         ) as pool:
             process_func = partial(self._process_polygon_batch, stat_func=stat_func)
-            batched_results = list(
-                tqdm(
-                    pool.imap(process_func, polygon_chunks),
-                    total=len(polygon_chunks),
-                    desc=f"Sampling polygons",
+            if show_progress:
+                batched_results = list(
+                    tqdm(
+                        pool.imap(process_func, polygon_chunks),
+                        total=len(polygon_chunks),
+                        desc=f"Sampling polygons",
+                    )
                 )
-            )
+            else:
+                batched_results = list(pool.imap(process_func, polygon_chunks))
 
         results = [item for sublist in batched_results for item in sublist]
 
@@ -839,23 +1104,45 @@ class TifProcessor:
         This function runs once per worker, not for every task.
         """
         global src_handle, memfile_handle
-        with self.data_store.open(str(self.dataset_path), "rb") as f:
-            memfile_handle = rasterio.MemoryFile(f.read())
-            src_handle = memfile_handle.open()
+
+        # Priority: merged > reprojected > original (same as open_dataset)
+        local_file_path = None
+        if self._merged_file_path:
+            # Merged file is a local temp file
+            local_file_path = self._merged_file_path
+        elif self._reprojected_file_path:
+            # Reprojected file is a local temp file
+            local_file_path = self._reprojected_file_path
+        elif isinstance(self.data_store, LocalDataStore):
+            # Local file - can open directly
+            local_file_path = str(self.dataset_path)
+
+        if local_file_path:
+            # Open local file directly
+            with open(local_file_path, "rb") as f:
+                memfile_handle = rasterio.MemoryFile(f.read())
+                src_handle = memfile_handle.open()
+        else:
+            # Custom DataStore
+            with self.data_store.open(str(self.dataset_path), "rb") as f:
+                memfile_handle = rasterio.MemoryFile(f.read())
+                src_handle = memfile_handle.open()
+
+    def _get_worker_dataset(self):
+        """Get dataset handle for worker process."""
+        global src_handle
+        if src_handle is None:
+            raise RuntimeError("Raster dataset not initialized in this process.")
+        return src_handle
 
     def _process_single_polygon(self, polygon, stat_func):
         """
         Helper function to process a single polygon.
         This will be run in a separate process.
         """
-        global src_handle
-        if src_handle is None:
-            # This should not happen if the initializer is set up correctly,
-            # but it's a good defensive check.
-            raise RuntimeError("Raster dataset not initialized in this process.")
-
         try:
-            out_image, _ = mask(src_handle, [polygon], crop=True, filled=False)
+            src = self._get_worker_dataset()
+            out_image, _ = mask(src, [polygon], crop=True, filled=False)
 
             if hasattr(out_image, "mask"):
                 valid_data = out_image.compressed()
@@ -866,11 +1153,12 @@ class TifProcessor:
                 else out_image.flatten()
             )
 
-            if len(valid_data) == 0:
-                return np.nan
-            else:
-                return stat_func(valid_data)
-        except Exception:
+            return stat_func(valid_data) if len(valid_data) > 0 else np.nan
+        except RuntimeError as e:
+            self.logger.error(f"Worker not initialized: {e}")
+            return np.nan
+        except Exception as e:
+            self.logger.debug(f"Error processing polygon: {e}")
             return np.nan
 
     def _process_polygon_batch(self, polygon_batch, stat_func):
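
The _initializer_worker/_get_worker_dataset pair is the standard pool-initializer pattern: open the raster once per worker process, stash it in a module-level global, and never pickle it per task. A stripped-down illustration of the same pattern (hypothetical file name, outside the package):

    import multiprocessing

    handle = None  # one copy per worker process

    def init_worker(path):
        global handle
        handle = open(path, "rb")  # opened once per worker, reused by every task

    def task(offset):
        handle.seek(offset)
        return handle.read(1)

    if __name__ == "__main__":
        with multiprocessing.Pool(2, initializer=init_worker,
                                  initargs=("raster.tif",)) as pool:
            print(pool.map(task, [0, 1, 2]))
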
@@ -882,226 +1170,226 @@ class TifProcessor:
             for polygon in polygon_batch
         ]
 
-    def _to_rgba_dataframe(self, drop_transparent: bool = False) -> pd.DataFrame:
-        """
-        Convert RGBA TIF to DataFrame with separate columns for R, G, B, A values.
+    def _to_dataframe(
+        self,
+        band_number: Optional[int] = None,
+        drop_nodata: bool = True,
+        band_names: Optional[Union[str, List[str]]] = None,
+    ) -> pd.DataFrame:
         """
-        self.logger.info("Processing RGBA dataset...")
-
-        with self.open_dataset() as src:
-            if self.count != 4:
-                raise ValueError("RGBA mode requires a 4-band TIF file")
-
-            # Read all four bands
-            red, green, blue, alpha = src.read()
-
-            x_coords, y_coords = self._get_pixel_coordinates()
-
-            if drop_transparent:
-                mask = alpha > 0
-                red = np.extract(mask, red)
-                green = np.extract(mask, green)
-                blue = np.extract(mask, blue)
-                alpha = np.extract(mask, alpha)
-                lons = np.extract(mask, x_coords)
-                lats = np.extract(mask, y_coords)
-            else:
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
-                red = red.flatten()
-                green = green.flatten()
-                blue = blue.flatten()
-                alpha = alpha.flatten()
-
-            # Create DataFrame with RGBA values
-            data = pd.DataFrame(
-                {
-                    "lon": lons,
-                    "lat": lats,
-                    "red": red,
-                    "green": green,
-                    "blue": blue,
-                    "alpha": alpha,
-                }
-            )
-
-            # Normalize alpha values if they're not in [0, 1] range
-            if data["alpha"].max() > 1:
-                data["alpha"] = data["alpha"] / data["alpha"].max()
+        Process TIF to DataFrame - handles both single-band and multi-band.
 
-        self.logger.info("RGBA dataset is processed!")
-        return data
-
-    def _to_rgb_dataframe(self, drop_nodata: bool = True) -> pd.DataFrame:
-        """Convert RGB TIF to DataFrame with separate columns for R, G, B values."""
-        if self.mode != "rgb":
-            raise ValueError("Use appropriate method for current mode")
-
-        self.logger.info("Processing RGB dataset...")
+        Args:
+            band_number: Specific band to read (1-indexed). If None, reads all bands.
+            drop_nodata: Whether to drop nodata values
+            band_names: Custom names for bands (multi-band only)
 
+        Returns:
+            pd.DataFrame with lon, lat, and band value(s)
+        """
         with self.open_dataset() as src:
-            if self.count != 3:
-                raise ValueError("RGB mode requires a 3-band TIF file")
+            if band_number is not None:
+                # SINGLE BAND MODE
+                band = src.read(band_number)
+                mask = self._build_data_mask(band, drop_nodata, src.nodata)
+                lons, lats = self._extract_coordinates_with_mask(mask)
+                pixel_values = (
+                    np.extract(mask, band) if mask is not None else band.flatten()
+                )
+                band_name = band_names if isinstance(band_names, str) else "pixel_value"
 
-            # Read all three bands
-            red, green, blue = src.read()
+                return pd.DataFrame({"lon": lons, "lat": lats, band_name: pixel_values})
+            else:
+                # MULTI-BAND MODE (all bands)
+                stack = src.read()
+
+                # Auto-detect band names by mode
+                if band_names is None:
+                    if self.mode == "rgb":
+                        band_names = ["red", "green", "blue"]
+                    elif self.mode == "rgba":
+                        band_names = ["red", "green", "blue", "alpha"]
+                    else:
+                        band_names = [
+                            src.descriptions[i] or f"band_{i+1}"
+                            for i in range(self.count)
+                        ]
 
-            x_coords, y_coords = self._get_pixel_coordinates()
+                # Build mask (checks ALL bands!)
+                mask = self._build_multi_band_mask(stack, drop_nodata, src.nodata)
 
-            if drop_nodata:
-                nodata_value = src.nodata
-                if nodata_value is not None:
-                    mask = ~(
-                        (red == nodata_value)
-                        | (green == nodata_value)
-                        | (blue == nodata_value)
-                    )
-                    red = np.extract(mask, red)
-                    green = np.extract(mask, green)
-                    blue = np.extract(mask, blue)
-                    lons = np.extract(mask, x_coords)
-                    lats = np.extract(mask, y_coords)
-                else:
-                    lons = x_coords.flatten()
-                    lats = y_coords.flatten()
-                    red = red.flatten()
-                    green = green.flatten()
-                    blue = blue.flatten()
-            else:
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
-                red = red.flatten()
-                green = green.flatten()
-                blue = blue.flatten()
+                # Create DataFrame
+                data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
+                df = pd.DataFrame(data_dict)
 
-            data = pd.DataFrame(
-                {
-                    "lon": lons,
-                    "lat": lats,
-                    "red": red,
-                    "green": green,
-                    "blue": blue,
-                }
-            )
+                # RGBA: normalize alpha if needed
+                if (
+                    self.mode == "rgba"
+                    and "alpha" in df.columns
+                    and df["alpha"].max() > 1
+                ):
+                    df["alpha"] = df["alpha"] / 255.0
 
-        self.logger.info("RGB dataset is processed!")
-        return data
+                return df
 
-    def _to_band_dataframe(
-        self, band_number: int = 1, drop_nodata: bool = True, drop_values: list = []
+    def _to_dataframe_chunked(
+        self,
+        windows: List[rasterio.windows.Window],
+        band_number: Optional[int] = None,
+        drop_nodata: bool = True,
+        band_names: Optional[Union[str, List[str]]] = None,
+        show_progress: bool = True,
     ) -> pd.DataFrame:
-        """Process single-band TIF to DataFrame."""
-        if self.mode != "single":
-            raise ValueError("Use appropriate method for current mode")
+        """Universal chunked converter for ALL modes."""
 
-        self.logger.info("Processing single-band dataset...")
-
-        if band_number <= 0 or band_number > self.count:
-            self.logger.error(
-                f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
-            )
-            return None
+        chunks = []
+        iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows
 
         with self.open_dataset() as src:
+            # Auto-detect band names ONCE (before loop)
+            if band_number is None and band_names is None:
+                if self.mode == "rgb":
+                    band_names = ["red", "green", "blue"]
+                elif self.mode == "rgba":
+                    band_names = ["red", "green", "blue", "alpha"]
+                else:  # multi
+                    band_names = [
+                        src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
+                    ]
 
-            band = src.read(band_number)
-
-            x_coords, y_coords = self._get_pixel_coordinates()
+            for window in iterator:
+                if band_number is not None:
+                    # SINGLE BAND
+                    band_chunk = src.read(band_number, window=window)
+                    mask = self._build_data_mask(band_chunk, drop_nodata, src.nodata)
+                    lons, lats = self._get_chunk_coordinates(window, src)
+                    band_name = (
+                        band_names if isinstance(band_names, str) else "pixel_value"
+                    )
 
-            values_to_mask = []
-            if drop_nodata:
-                nodata_value = src.nodata
-                if nodata_value is not None:
-                    values_to_mask.append(nodata_value)
+                    # Build chunk DataFrame (could use helper but simple enough)
+                    if mask is not None:
+                        mask_flat = mask.flatten()
+                        chunk_df = pd.DataFrame(
+                            {
+                                "lon": lons[mask_flat],
+                                "lat": lats[mask_flat],
+                                band_name: band_chunk.flatten()[mask_flat],
+                            }
+                        )
+                    else:
+                        chunk_df = pd.DataFrame(
+                            {"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
+                        )
+                else:
+                    # MULTI-BAND (includes RGB/RGBA)
+                    stack_chunk = src.read(window=window)
+                    mask = self._build_multi_band_mask(
+                        stack_chunk, drop_nodata, src.nodata
+                    )
+                    lons, lats = self._get_chunk_coordinates(window, src)
 
-            if drop_values:
-                values_to_mask.extend(drop_values)
+                    # Build DataFrame using helper
+                    band_dict = {
+                        band_names[i]: stack_chunk[i] for i in range(self.count)
+                    }
+                    chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)
 
-            if values_to_mask:
-                data_mask = ~np.isin(band, values_to_mask)
-                pixel_values = np.extract(data_mask, band)
-                lons = np.extract(data_mask, x_coords)
-                lats = np.extract(data_mask, y_coords)
-            else:
-                pixel_values = band.flatten()
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
+                    # RGBA: normalize alpha
+                    if self.mode == "rgba" and "alpha" in chunk_df.columns:
+                        if chunk_df["alpha"].max() > 1:
+                            chunk_df["alpha"] = chunk_df["alpha"] / 255.0
 
-            data = pd.DataFrame({"lon": lons, "lat": lats, "pixel_value": pixel_values})
+                chunks.append(chunk_df)
 
-        self.logger.info("Dataset is processed!")
-        return data
+        result = pd.concat(chunks, ignore_index=True)
+        return result
 
-    def _to_multi_band_dataframe(
+    def _prepare_geometry_for_clipping(
         self,
-        drop_nodata: bool = True,
-        drop_values: list = [],
-        band_names: Optional[List[str]] = None,
-    ) -> pd.DataFrame:
-        """
-        Process multi-band TIF to DataFrame with all bands included.
-
-        Args:
-            drop_nodata (bool): Whether to drop nodata values. Defaults to True.
-            drop_values (list): Additional values to drop from the dataset. Defaults to empty list.
-            band_names (Optional[List[str]]): Custom names for the bands. If None, bands will be named using
-                the band descriptions from the GeoTIFF metadata if available,
-                otherwise 'band_1', 'band_2', etc.
+        geometry: Union[
+            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+        ],
+    ) -> List[dict]:
+        """Convert various geometry formats to list of GeoJSON-like dicts for rasterio.mask"""
+
+        if isinstance(geometry, (Polygon, MultiPolygon)):
+            # Shapely geometry
+            return [geometry.__geo_interface__]
+
+        elif isinstance(geometry, gpd.GeoDataFrame):
+            # GeoDataFrame - use all geometries
+            return [
+                geom.__geo_interface__ for geom in geometry.geometry if geom is not None
+            ]
+
+        elif isinstance(geometry, gpd.GeoSeries):
+            # GeoSeries
+            return [geom.__geo_interface__ for geom in geometry if geom is not None]
+
+        elif isinstance(geometry, dict):
+            # Single GeoJSON-like dict
+            return [geometry]
+
+        elif isinstance(geometry, list):
+            # List of GeoJSON-like dicts
+            return geometry
 
-        Returns:
-            pd.DataFrame: DataFrame containing coordinates and all band values
-        """
-        self.logger.info("Processing multi-band dataset...")
-
-        with self.open_dataset() as src:
-            # Read all bands
-            stack = src.read()
-
-            x_coords, y_coords = self._get_pixel_coordinates()
-
-            # Initialize dictionary with coordinates
-            data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
+        else:
+            raise TypeError(
+                f"Unsupported geometry type: {type(geometry)}. "
+                "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
+                "GeoJSON-like dict, or list of GeoJSON-like dicts."
+            )
 
-            # Get band descriptions from metadata if available
-            if band_names is None and hasattr(src, "descriptions") and src.descriptions:
-                band_names = [
-                    desc if desc else f"band_{i+1}"
-                    for i, desc in enumerate(src.descriptions)
-                ]
+    def _validate_geometry_crs(
+        self,
+        original_geometry: Any,
+    ) -> None:
+        """Validate that geometry CRS matches raster CRS"""
+
+        # Get raster CRS
+        raster_crs = self.crs
+
+        # Try to get geometry CRS
+        geometry_crs = None
+
+        if isinstance(original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)):
+            geometry_crs = original_geometry.crs
+        elif hasattr(original_geometry, "crs"):
+            geometry_crs = original_geometry.crs
+
+        # Warn if CRS mismatch detected
+        if geometry_crs is not None and raster_crs is not None:
+            if not raster_crs == geometry_crs:
+                self.logger.warning(
+                    f"CRS mismatch detected! Raster CRS: {raster_crs}, "
+                    f"Geometry CRS: {geometry_crs}. "
+                    "Consider reprojecting geometry to match raster CRS for accurate clipping."
+                )
 
-            # Process each band
-            for band_idx in range(self.count):
-                band_data = stack[band_idx]
-
-                # Handle nodata and other values to drop
-                if drop_nodata or drop_values:
-                    values_to_mask = []
-                    if drop_nodata and src.nodata is not None:
-                        values_to_mask.append(src.nodata)
-                    if drop_values:
-                        values_to_mask.extend(drop_values)
-
-                    if values_to_mask:
-                        data_mask = ~np.isin(band_data, values_to_mask)
-                        band_values = np.extract(data_mask, band_data)
-                        if band_idx == 0:  # Only need to mask coordinates once
-                            data_dict["lon"] = np.extract(data_mask, x_coords)
-                            data_dict["lat"] = np.extract(data_mask, y_coords)
-                        else:
-                            band_values = band_data.flatten()
-                    else:
-                        band_values = band_data.flatten()
+    def _create_clipped_processor(
+        self, clipped_data: np.ndarray, clipped_meta: dict
+    ) -> "TifProcessor":
+        """
+        Helper to create a new TifProcessor instance from clipped data.
+        Saves the clipped data to a temporary file and initializes a new TifProcessor.
+        """
+        clipped_file_path = os.path.join(
+            self._temp_dir, f"clipped_temp_{os.urandom(8).hex()}.tif"
+        )
+        with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
+            dst.write(clipped_data)
 
-                # Use custom band names if provided, otherwise use descriptions or default naming
-                band_name = (
-                    band_names[band_idx]
-                    if band_names and len(band_names) > band_idx
-                    else f"band_{band_idx + 1}"
-                )
-                data_dict[band_name] = band_values
+        self.logger.info(f"Clipped raster saved to temporary file: {clipped_file_path}")
 
-        self.logger.info("Multi-band dataset is processed!")
-        return pd.DataFrame(data_dict)
+        # Create a new TifProcessor instance with the clipped data
+        # Pass relevant parameters from the current instance to maintain consistency
+        return TifProcessor(
+            dataset_path=clipped_file_path,
+            data_store=self.data_store,
+            mode=self.mode,
+        )
 
     def _get_pixel_coordinates(self):
         """Helper method to generate coordinate arrays for all pixels"""
@@ -1128,79 +1416,322 @@ class TifProcessor:
 
         return self._cache["pixel_coords"]
 
-    def __enter__(self):
-        return self
+    def _get_chunk_coordinates(self, window, src):
+        """Get coordinates for a specific window chunk."""
+        transform = src.window_transform(window)
+        rows, cols = np.meshgrid(
+            np.arange(window.height), np.arange(window.width), indexing="ij"
+        )
+        xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten())
+        return np.array(xs), np.array(ys)
 
-    def __del__(self):
-        """Clean up temporary files and directories."""
-        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
-            shutil.rmtree(self._temp_dir, ignore_errors=True)
+    def _extract_coordinates_with_mask(self, mask=None):
+        """Extract flattened coordinates, optionally applying a mask."""
+        x_coords, y_coords = self._get_pixel_coordinates()
 
-    def cleanup(self):
-        """Explicit cleanup method for better control."""
-        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
-            shutil.rmtree(self._temp_dir)
-        self.logger.info("Cleaned up temporary files")
+        if mask is not None:
+            return np.extract(mask, x_coords), np.extract(mask, y_coords)
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        """Proper context manager exit with cleanup."""
-        self.cleanup()
-        return False
+        return x_coords.flatten(), y_coords.flatten()
 
+    def _build_data_mask(self, data, drop_nodata=True, nodata_value=None):
+        """Build a boolean mask for filtering data based on nodata values."""
+        if not drop_nodata or nodata_value is None:
+            return None
 
-def sample_multiple_tifs_by_coordinates(
-    tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]
-):
-    """
-    Sample raster values from multiple TIFF files for given coordinates.
+        return data != nodata_value
 
-    Parameters:
-    - tif_processors: List of TifProcessor instances.
-    - coordinate_list: List of (x, y) coordinates.
+    def _build_multi_band_mask(
+        self,
+        bands: np.ndarray,
+        drop_nodata: bool = True,
+        nodata_value: Optional[float] = None,
+    ) -> Optional[np.ndarray]:
+        """
+        Build mask for multi-band data - drops pixels where ANY band has nodata.
 
-    Returns:
-    - A NumPy array of sampled values, taking the first non-nodata value encountered.
-    """
-    sampled_values = np.full(len(coordinate_list), np.nan, dtype=np.float32)
+        Args:
+            bands: 3D array of shape (n_bands, height, width)
+            drop_nodata: Whether to drop nodata values
+            nodata_value: The nodata value to check
+
+        Returns:
+            Boolean mask or None if no masking needed
+        """
+        if not drop_nodata or nodata_value is None:
+            return None
+
+        # Check if ANY band has nodata at each pixel location
+        has_nodata = np.any(bands == nodata_value, axis=0)
+
+        # Return True where ALL bands are valid
+        valid_mask = ~has_nodata
+
+        return valid_mask if not valid_mask.all() else None
 
-    for tp in tif_processors:
-        values = tp.sample_by_coordinates(coordinate_list=coordinate_list)
+    def _bands_to_dict(self, bands, band_count, band_names, mask=None):
+        """Read specified bands and return as a dictionary with optional masking."""
+
+        lons, lats = self._extract_coordinates_with_mask(mask)
+        data_dict = {"lon": lons, "lat": lats}
+
+        for idx, name in enumerate(band_names[:band_count]):
+            band_data = bands[idx]
+            data_dict[name] = (
+                np.extract(mask, band_data) if mask is not None else band_data.flatten()
+            )
+
+        return data_dict
+
+    def _calculate_optimal_chunk_size(
+        self, operation: str = "conversion", target_memory_mb: int = 500
+    ) -> int:
+        """
+        Calculate optimal chunk size (number of rows) based on target memory usage.
+
+        Args:
+            operation: Type of operation ('conversion', 'graph')
+            target_memory_mb: Target memory per chunk in megabytes
 
-        if tp.nodata is not None:
-            mask = (np.isnan(sampled_values)) & (
-                values != tp.nodata
-            )  # Replace only NaNs
+        Returns:
+            Number of rows per chunk
+        """
+        bytes_per_element = np.dtype(self.dtype).itemsize
+        n_bands = self.count
+        width = self.width
+
+        # Adjust for operation type
+        if operation == "conversion":
+            # DataFrame overhead is roughly 2x
+            bytes_per_row = width * n_bands * bytes_per_element * 2
+        elif operation == "graph":
+            # Graph needs additional space for edges
+            bytes_per_row = width * bytes_per_element * 4  # Estimate
         else:
-            mask = np.isnan(sampled_values)  # No explicit nodata, replace all NaNs
+            bytes_per_row = width * n_bands * bytes_per_element
 
-        sampled_values[mask] = values[mask]  # Update only missing values
+        target_bytes = target_memory_mb * 1024 * 1024
+        chunk_rows = max(1, int(target_bytes / bytes_per_row))
 
-    return sampled_values
+        # Ensure chunk size doesn't exceed total height
+        chunk_rows = min(chunk_rows, self.height)
 
+        self.logger.info(
+            f"Calculated chunk size: {chunk_rows} rows "
+            f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
+        )
 
-def sample_multiple_tifs_by_polygons(
-    tif_processors: List[TifProcessor],
-    polygon_list: List[Union[Polygon, MultiPolygon]],
-    stat: str = "mean",
-) -> np.ndarray:
-    """
-    Sample raster values from multiple TIFF files for polygons in a list and join the results.
+        return chunk_rows
 
-    Parameters:
-    - tif_processors: List of TifProcessor instances.
-    - polygon_list: List of polygon geometries (can include MultiPolygons).
-    - stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
+    def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
+        """
+        Generate window objects for chunked reading.
 
-    Returns:
-    - A NumPy array of sampled values, taking the first non-nodata value encountered.
-    """
-    sampled_values = np.full(len(polygon_list), np.nan, dtype=np.float32)
+        Args:
+            chunk_size: Number of rows per chunk
 
-    for tp in tif_processors:
-        values = tp.sample_by_polygons(polygon_list=polygon_list, stat=stat)
+        Returns:
+            List of rasterio.windows.Window objects
+        """
+        windows = []
+        for row_start in range(0, self.height, chunk_size):
+            row_end = min(row_start + chunk_size, self.height)
+            window = rasterio.windows.Window(
+                col_off=0,
+                row_off=row_start,
+                width=self.width,
+                height=row_end - row_start,
+            )
+            windows.append(window)
+
+        return windows
 
-        mask = np.isnan(sampled_values)  # replace all NaNs
+    def _format_bytes(self, bytes_value: int) -> str:
+        """Convert bytes to human-readable format."""
+        for unit in ["B", "KB", "MB", "GB", "TB"]:
+            if bytes_value < 1024.0:
+                return f"{bytes_value:.2f} {unit}"
+            bytes_value /= 1024.0
+        return f"{bytes_value:.2f} PB"
+
+    def _check_available_memory(self) -> dict:
+        """
+        Check available system memory.
 
-        sampled_values[mask] = values[mask]  # Update only values with samapled value
+        Returns:
+            Dict with total, available, and used memory info
+        """
+        import psutil
 
-    return sampled_values
+        memory = psutil.virtual_memory()
+        return {
+            "total": memory.total,
+            "available": memory.available,
+            "used": memory.used,
+            "percent": memory.percent,
+            "available_human": self._format_bytes(memory.available),
+        }
+
+    def _estimate_memory_usage(
+        self, operation: str = "conversion", n_workers: int = 1
+    ) -> dict:
+        """
+        Estimate memory usage for various operations.
+
+        Args:
+            operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
+            n_workers: Number of workers (for batched_sampling)
+
+        Returns:
+            Dict with estimated memory usage in bytes and human-readable format
+        """
+        bytes_per_element = np.dtype(self.dtype).itemsize
+        n_pixels = self.width * self.height
+        n_bands = self.count
+
+        estimates = {}
+
+        if operation == "conversion":
+            # to_dataframe/to_geodataframe: full raster + DataFrame overhead
+            raster_memory = n_pixels * n_bands * bytes_per_element
+            # DataFrame overhead (roughly 2x for storage + processing)
+            dataframe_memory = (
+                n_pixels * n_bands * 16
+            )  # 16 bytes per value in DataFrame
+            total = raster_memory + dataframe_memory
+            estimates["raster"] = raster_memory
+            estimates["dataframe"] = dataframe_memory
+            estimates["total"] = total
+
+        elif operation == "batched_sampling":
+            # Each worker loads full raster into MemoryFile
+            # Need to get file size
+            if self._merged_file_path:
+                file_path = self._merged_file_path
+            elif self._reprojected_file_path:
+                file_path = self._reprojected_file_path
+            else:
+                file_path = str(self.dataset_path)
+
+            try:
+                import os
+
+                file_size = os.path.getsize(file_path)
+            except:
+                # Estimate if can't get file size
+                file_size = n_pixels * n_bands * bytes_per_element * 1.2  # Add overhead
+
+            estimates["per_worker"] = file_size
+            estimates["total"] = file_size * n_workers
+
+        elif operation == "merge":
+            # _merge_with_mean uses float64 arrays
+            raster_memory = n_pixels * n_bands * 8  # float64
+            estimates["sum_array"] = raster_memory
+            estimates["count_array"] = n_pixels * 4  # int32
+            estimates["total"] = raster_memory + n_pixels * 4
+
+        elif operation == "graph":
+            # to_graph: data + node_map + edges
+            data_memory = n_pixels * bytes_per_element
+            node_map_memory = n_pixels * 4  # int32
+            # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
+            edges_memory = n_pixels * 4 * 3 * 8  # 3 values per edge, float64
+            total = data_memory + node_map_memory + edges_memory
+            estimates["data"] = data_memory
+            estimates["node_map"] = node_map_memory
+            estimates["edges"] = edges_memory
+            estimates["total"] = total
+
+        # Add human-readable format
+        estimates["human_readable"] = self._format_bytes(estimates["total"])
+
+        return estimates
+
+    def _memory_guard(
+        self,
+        operation: str,
+        threshold_percent: float = 80.0,
+        n_workers: Optional[int] = None,
+        raise_error: bool = False,
+    ) -> bool:
+        """
+        Check if operation is safe to perform given memory constraints.
+
+        Args:
+            operation: Type of operation to check
+            threshold_percent: Maximum % of available memory to use (default 80%)
+            n_workers: Number of workers (for batched operations)
+            raise_error: If True, raise MemoryError instead of warning
+
+        Returns:
+            True if operation is safe, False otherwise
+
+        Raises:
+            MemoryError: If raise_error=True and memory insufficient
+        """
+        import warnings
+
+        estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
+        memory_info = self._check_available_memory()
+
+        estimated_usage = estimates["total"]
+        available = memory_info["available"]
+        threshold = available * (threshold_percent / 100.0)
+
+        is_safe = estimated_usage <= threshold
+
+        if not is_safe:
+            usage_str = self._format_bytes(estimated_usage)
+            available_str = memory_info["available_human"]
+
+            message = (
+                f"Memory warning: {operation} operation may require {usage_str} "
+                f"but only {available_str} is available. "
+                f"Current memory usage: {memory_info['percent']:.1f}%"
+            )
+
+            if raise_error:
+                raise MemoryError(message)
+            else:
+                warnings.warn(message, ResourceWarning)
+                if hasattr(self, "logger"):
+                    self.logger.warning(message)
+
+        return is_safe
+
+    def _validate_mode_band_compatibility(self):
+        """Validate that mode matches band count."""
+        mode_requirements = {
+            "single": (1, "1-band"),
+            "rgb": (3, "3-band"),
+            "rgba": (4, "4-band"),
+        }
+
+        if self.mode in mode_requirements:
+            required_count, description = mode_requirements[self.mode]
+            if self.count != required_count:
+                raise ValueError(
+                    f"{self.mode.upper()} mode requires a {description} TIF file"
+                )
+        elif self.mode == "multi" and self.count < 2:
+            raise ValueError("Multi mode requires a TIF file with 2 or more bands")
+
+    def __enter__(self):
+        return self
+
+    def __del__(self):
+        """Clean up temporary files and directories."""
+        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir, ignore_errors=True)
+
+    def cleanup(self):
+        """Explicit cleanup method for better control."""
+        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir)
+        self.logger.info("Cleaned up temporary files")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Proper context manager exit with cleanup."""
+        self.cleanup()
+        return False
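
To make the chunk-size arithmetic in _calculate_optimal_chunk_size concrete: for a single-band float32 raster 10,000 pixels wide, a conversion chunk targeting the default 500 MB works out to roughly 6,500 rows (same formula as the method, worked by hand; note the guards also add a psutil dependency):

    import numpy as np

    width, n_bands = 10_000, 1
    bytes_per_element = np.dtype("float32").itemsize         # 4
    bytes_per_row = width * n_bands * bytes_per_element * 2  # 80,000 (2x DataFrame overhead)
    target_bytes = 500 * 1024 * 1024                         # 524,288,000
    chunk_rows = max(1, int(target_bytes / bytes_per_row))   # 6,553 rows per chunk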