giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.7.0.dist-info → giga_spatial-0.7.1.dist-info}/METADATA +1 -1
- {giga_spatial-0.7.0.dist-info → giga_spatial-0.7.1.dist-info}/RECORD +18 -17
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +1 -0
- gigaspatial/generators/poi.py +226 -82
- gigaspatial/generators/zonal/base.py +41 -28
- gigaspatial/generators/zonal/geometry.py +79 -36
- gigaspatial/handlers/base.py +22 -8
- gigaspatial/handlers/ghsl.py +22 -8
- gigaspatial/handlers/giga.py +9 -4
- gigaspatial/handlers/healthsites.py +350 -0
- gigaspatial/handlers/osm.py +325 -105
- gigaspatial/handlers/worldpop.py +228 -9
- gigaspatial/processing/geo.py +1 -1
- gigaspatial/processing/tif_processor.py +831 -300
- {giga_spatial-0.7.0.dist-info → giga_spatial-0.7.1.dist-info}/WHEEL +0 -0
- {giga_spatial-0.7.0.dist-info → giga_spatial-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.7.0.dist-info → giga_spatial-0.7.1.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,10 @@ from gigaspatial.core.io.data_store import DataStore
|
|
24
24
|
from gigaspatial.core.io.local_data_store import LocalDataStore
|
25
25
|
from gigaspatial.config import config
|
26
26
|
|
27
|
+
# Global variables for multiprocessing workers
|
28
|
+
src_handle = None
|
29
|
+
memfile_handle = None
|
30
|
+
|
27
31
|
|
28
32
|
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
29
33
|
class TifProcessor:
|
@@ -73,16 +77,7 @@ class TifProcessor:
|
|
73
77
|
self.dataset_path = self._reprojected_file_path
|
74
78
|
|
75
79
|
self._load_metadata()
|
76
|
-
|
77
|
-
# Validate mode and band count
|
78
|
-
if self.mode == "rgba" and self.count != 4:
|
79
|
-
raise ValueError("RGBA mode requires a 4-band TIF file")
|
80
|
-
if self.mode == "rgb" and self.count != 3:
|
81
|
-
raise ValueError("RGB mode requires a 3-band TIF file")
|
82
|
-
if self.mode == "single" and self.count != 1:
|
83
|
-
raise ValueError("Single mode requires a 1-band TIF file")
|
84
|
-
if self.mode == "multi" and self.count < 2:
|
85
|
-
raise ValueError("Multi mode requires a TIF file with 2 or more bands")
|
80
|
+
self._validate_mode_band_compatibility()
|
86
81
|
|
87
82
|
@contextmanager
|
88
83
|
def open_dataset(self):
|
@@ -93,6 +88,9 @@ class TifProcessor:
|
|
93
88
|
elif self._reprojected_file_path:
|
94
89
|
with rasterio.open(self._reprojected_file_path) as src:
|
95
90
|
yield src
|
91
|
+
elif isinstance(self.data_store, LocalDataStore):
|
92
|
+
with rasterio.open(str(self.dataset_path)) as src:
|
93
|
+
yield src
|
96
94
|
else:
|
97
95
|
with self.data_store.open(str(self.dataset_path), "rb") as f:
|
98
96
|
with rasterio.MemoryFile(f.read()) as memfile:
|
@@ -514,19 +512,36 @@ class TifProcessor:
|
|
514
512
|
def height(self):
|
515
513
|
return self._cache["height"]
|
516
514
|
|
517
|
-
def to_dataframe(
|
515
|
+
def to_dataframe(
|
516
|
+
self, drop_nodata=True, check_memory=True, **kwargs
|
517
|
+
) -> pd.DataFrame:
|
518
|
+
"""
|
519
|
+
Convert raster to DataFrame.
|
520
|
+
|
521
|
+
Args:
|
522
|
+
drop_nodata: Whether to drop nodata values
|
523
|
+
check_memory: Whether to check memory before operation (default True)
|
524
|
+
**kwargs: Additional arguments
|
525
|
+
|
526
|
+
Returns:
|
527
|
+
pd.DataFrame with raster data
|
528
|
+
"""
|
529
|
+
# Memory guard check
|
530
|
+
if check_memory:
|
531
|
+
self._memory_guard("conversion", threshold_percent=80.0)
|
532
|
+
|
518
533
|
try:
|
519
534
|
if self.mode == "single":
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
elif self.mode == "multi":
|
526
|
-
df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
|
535
|
+
return self._to_dataframe(
|
536
|
+
band_number=kwargs.get("band_number", 1),
|
537
|
+
drop_nodata=drop_nodata,
|
538
|
+
band_names=kwargs.get("band_names", None),
|
539
|
+
)
|
527
540
|
else:
|
528
|
-
|
529
|
-
|
541
|
+
return self._to_dataframe(
|
542
|
+
band_number=None, # All bands
|
543
|
+
drop_nodata=drop_nodata,
|
544
|
+
band_names=kwargs.get("band_names", None),
|
530
545
|
)
|
531
546
|
except Exception as e:
|
532
547
|
raise ValueError(
|
@@ -537,12 +552,23 @@ class TifProcessor:
|
|
537
552
|
|
538
553
|
return df
|
539
554
|
|
540
|
-
def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
|
555
|
+
def to_geodataframe(self, check_memory=True, **kwargs) -> gpd.GeoDataFrame:
|
541
556
|
"""
|
542
557
|
Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
|
543
558
|
Each zone is defined by its bounding box, based on pixel resolution and coordinates.
|
559
|
+
|
560
|
+
Args:
|
561
|
+
check_memory: Whether to check memory before operation
|
562
|
+
**kwargs: Additional arguments passed to to_dataframe()
|
563
|
+
|
564
|
+
Returns:
|
565
|
+
gpd.GeoDataFrame with raster data
|
544
566
|
"""
|
545
|
-
|
567
|
+
# Memory guard check
|
568
|
+
if check_memory:
|
569
|
+
self._memory_guard("conversion", threshold_percent=80.0)
|
570
|
+
|
571
|
+
df = self.to_dataframe(check_memory=False, **kwargs)
|
546
572
|
|
547
573
|
x_res, y_res = self.resolution
|
548
574
|
|
@@ -556,19 +582,204 @@ class TifProcessor:
|
|
556
582
|
|
557
583
|
return gdf
|
558
584
|
|
585
|
+
def to_dataframe_chunked(
|
586
|
+
self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
|
587
|
+
):
|
588
|
+
"""
|
589
|
+
Convert raster to DataFrame using chunked processing for memory efficiency.
|
590
|
+
|
591
|
+
Automatically routes to the appropriate chunked method based on mode.
|
592
|
+
Chunk size is automatically calculated based on target memory usage.
|
593
|
+
|
594
|
+
Args:
|
595
|
+
drop_nodata: Whether to drop nodata values
|
596
|
+
chunk_size: Number of rows per chunk (auto-calculated if None)
|
597
|
+
target_memory_mb: Target memory per chunk in MB (default 500)
|
598
|
+
**kwargs: Additional arguments (band_number, band_names, etc.)
|
599
|
+
"""
|
600
|
+
|
601
|
+
if chunk_size is None:
|
602
|
+
chunk_size = self._calculate_optimal_chunk_size(
|
603
|
+
"conversion", target_memory_mb
|
604
|
+
)
|
605
|
+
|
606
|
+
windows = self._get_chunk_windows(chunk_size)
|
607
|
+
|
608
|
+
# SIMPLE ROUTING
|
609
|
+
if self.mode == "single":
|
610
|
+
return self._to_dataframe_chunked(
|
611
|
+
windows,
|
612
|
+
band_number=kwargs.get("band_number", 1),
|
613
|
+
drop_nodata=drop_nodata,
|
614
|
+
band_names=kwargs.get("band_names", None),
|
615
|
+
)
|
616
|
+
else: # rgb, rgba, multi
|
617
|
+
return self._to_dataframe_chunked(
|
618
|
+
windows,
|
619
|
+
band_number=None,
|
620
|
+
drop_nodata=drop_nodata,
|
621
|
+
band_names=kwargs.get("band_names", None),
|
622
|
+
)
|
623
|
+
|
624
|
+
def clip_to_geometry(
|
625
|
+
self,
|
626
|
+
geometry: Union[
|
627
|
+
Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
|
628
|
+
],
|
629
|
+
crop: bool = True,
|
630
|
+
all_touched: bool = True,
|
631
|
+
invert: bool = False,
|
632
|
+
nodata: Optional[Union[int, float]] = None,
|
633
|
+
pad: bool = False,
|
634
|
+
pad_width: float = 0.5,
|
635
|
+
return_clipped_processor: bool = True,
|
636
|
+
) -> Union["TifProcessor", tuple]:
|
637
|
+
"""
|
638
|
+
Clip raster to geometry boundaries.
|
639
|
+
|
640
|
+
Parameters:
|
641
|
+
-----------
|
642
|
+
geometry : various
|
643
|
+
Geometry to clip to. Can be:
|
644
|
+
- Shapely Polygon or MultiPolygon
|
645
|
+
- GeoDataFrame or GeoSeries
|
646
|
+
- List of GeoJSON-like dicts
|
647
|
+
- Single GeoJSON-like dict
|
648
|
+
crop : bool, default True
|
649
|
+
Whether to crop the raster to the extent of the geometry
|
650
|
+
all_touched : bool, default True
|
651
|
+
Include pixels that touch the geometry boundary
|
652
|
+
invert : bool, default False
|
653
|
+
If True, mask pixels inside geometry instead of outside
|
654
|
+
nodata : int or float, optional
|
655
|
+
Value to use for masked pixels. If None, uses raster's nodata value
|
656
|
+
pad : bool, default False
|
657
|
+
Pad geometry by half pixel before clipping
|
658
|
+
pad_width : float, default 0.5
|
659
|
+
Width of padding in pixels if pad=True
|
660
|
+
return_clipped_processor : bool, default True
|
661
|
+
If True, returns new TifProcessor with clipped data
|
662
|
+
If False, returns (clipped_array, transform, metadata)
|
663
|
+
|
664
|
+
Returns:
|
665
|
+
--------
|
666
|
+
TifProcessor or tuple
|
667
|
+
Either new TifProcessor instance or (array, transform, metadata) tuple
|
668
|
+
"""
|
669
|
+
# Handle different geometry input types
|
670
|
+
shapes = self._prepare_geometry_for_clipping(geometry)
|
671
|
+
|
672
|
+
# Validate CRS compatibility
|
673
|
+
self._validate_geometry_crs(geometry)
|
674
|
+
|
675
|
+
# Perform the clipping
|
676
|
+
with self.open_dataset() as src:
|
677
|
+
try:
|
678
|
+
clipped_data, clipped_transform = mask(
|
679
|
+
dataset=src,
|
680
|
+
shapes=shapes,
|
681
|
+
crop=crop,
|
682
|
+
all_touched=all_touched,
|
683
|
+
invert=invert,
|
684
|
+
nodata=nodata,
|
685
|
+
pad=pad,
|
686
|
+
pad_width=pad_width,
|
687
|
+
filled=True,
|
688
|
+
)
|
689
|
+
|
690
|
+
# Update metadata for the clipped raster
|
691
|
+
clipped_meta = src.meta.copy()
|
692
|
+
clipped_meta.update(
|
693
|
+
{
|
694
|
+
"height": clipped_data.shape[1],
|
695
|
+
"width": clipped_data.shape[2],
|
696
|
+
"transform": clipped_transform,
|
697
|
+
"nodata": nodata if nodata is not None else src.nodata,
|
698
|
+
}
|
699
|
+
)
|
700
|
+
|
701
|
+
except ValueError as e:
|
702
|
+
if "Input shapes do not overlap raster" in str(e):
|
703
|
+
raise ValueError(
|
704
|
+
"The geometry does not overlap with the raster. "
|
705
|
+
"Check that both are in the same coordinate reference system."
|
706
|
+
) from e
|
707
|
+
else:
|
708
|
+
raise e
|
709
|
+
|
710
|
+
if return_clipped_processor:
|
711
|
+
# Create a new TifProcessor with the clipped data
|
712
|
+
return self._create_clipped_processor(clipped_data, clipped_meta)
|
713
|
+
else:
|
714
|
+
return clipped_data, clipped_transform, clipped_meta
|
715
|
+
|
716
|
+
def clip_to_bounds(
|
717
|
+
self,
|
718
|
+
bounds: tuple,
|
719
|
+
bounds_crs: Optional[str] = None,
|
720
|
+
return_clipped_processor: bool = True,
|
721
|
+
) -> Union["TifProcessor", tuple]:
|
722
|
+
"""
|
723
|
+
Clip raster to rectangular bounds.
|
724
|
+
|
725
|
+
Parameters:
|
726
|
+
-----------
|
727
|
+
bounds : tuple
|
728
|
+
Bounding box as (minx, miny, maxx, maxy)
|
729
|
+
bounds_crs : str, optional
|
730
|
+
CRS of the bounds. If None, assumes same as raster CRS
|
731
|
+
return_clipped_processor : bool, default True
|
732
|
+
If True, returns new TifProcessor, else returns (array, transform, metadata)
|
733
|
+
|
734
|
+
Returns:
|
735
|
+
--------
|
736
|
+
TifProcessor or tuple
|
737
|
+
Either new TifProcessor instance or (array, transform, metadata) tuple
|
738
|
+
"""
|
739
|
+
# Create bounding box geometry
|
740
|
+
bbox_geom = box(*bounds)
|
741
|
+
|
742
|
+
# If bounds_crs is specified and different from raster CRS, create GeoDataFrame for reprojection
|
743
|
+
if bounds_crs is not None:
|
744
|
+
raster_crs = self.crs
|
745
|
+
|
746
|
+
if not self.crs == bounds_crs:
|
747
|
+
# Create GeoDataFrame with bounds CRS and reproject
|
748
|
+
bbox_gdf = gpd.GeoDataFrame([1], geometry=[bbox_geom], crs=bounds_crs)
|
749
|
+
bbox_gdf = bbox_gdf.to_crs(raster_crs)
|
750
|
+
bbox_geom = bbox_gdf.geometry.iloc[0]
|
751
|
+
|
752
|
+
return self.clip_to_geometry(
|
753
|
+
geometry=bbox_geom,
|
754
|
+
crop=True,
|
755
|
+
return_clipped_processor=return_clipped_processor,
|
756
|
+
)
|
757
|
+
|
559
758
|
def to_graph(
|
560
759
|
self,
|
561
760
|
connectivity: Literal[4, 8] = 4,
|
562
761
|
band: Optional[int] = None,
|
563
762
|
include_coordinates: bool = False,
|
564
763
|
graph_type: Literal["networkx", "sparse"] = "networkx",
|
565
|
-
|
764
|
+
check_memory: bool = True,
|
566
765
|
) -> Union[nx.Graph, sp.csr_matrix]:
|
567
766
|
"""
|
568
767
|
Convert raster to graph based on pixel adjacency.
|
768
|
+
|
769
|
+
Args:
|
770
|
+
connectivity: 4 or 8-connectivity
|
771
|
+
band: Band number (1-indexed)
|
772
|
+
include_coordinates: Include x,y coordinates in nodes
|
773
|
+
graph_type: 'networkx' or 'sparse'
|
774
|
+
check_memory: Whether to check memory before operation
|
775
|
+
|
776
|
+
Returns:
|
777
|
+
Graph representation of raster
|
569
778
|
"""
|
570
|
-
|
571
|
-
|
779
|
+
|
780
|
+
# Memory guard check
|
781
|
+
if check_memory:
|
782
|
+
self._memory_guard("graph", threshold_percent=80.0)
|
572
783
|
|
573
784
|
with self.open_dataset() as src:
|
574
785
|
band_idx = band - 1 if band is not None else 0
|
@@ -657,12 +868,12 @@ class TifProcessor:
|
|
657
868
|
weights = edges_array[:, 2]
|
658
869
|
|
659
870
|
# Add reverse edges for symmetric matrix
|
660
|
-
|
661
|
-
|
662
|
-
weights.
|
871
|
+
from_idx = np.append(row_indices, col_indices)
|
872
|
+
to_idx = np.append(col_indices, row_indices)
|
873
|
+
weights = np.append(weights, weights)
|
663
874
|
|
664
875
|
return sp.coo_matrix(
|
665
|
-
(weights, (
|
876
|
+
(weights, (from_idx, to_idx)),
|
666
877
|
shape=(num_valid_pixels, num_valid_pixels),
|
667
878
|
).tocsr()
|
668
879
|
|
@@ -798,11 +1009,63 @@ class TifProcessor:
|
|
798
1009
|
stat: Union[str, Callable] = "mean",
|
799
1010
|
batch_size: int = 100,
|
800
1011
|
n_workers: int = 4,
|
1012
|
+
show_progress: bool = True,
|
1013
|
+
check_memory: bool = True,
|
801
1014
|
**kwargs,
|
802
1015
|
) -> np.ndarray:
|
803
1016
|
"""
|
804
1017
|
Sample raster values by polygons in parallel using batching.
|
1018
|
+
|
1019
|
+
Args:
|
1020
|
+
polygon_list: List of Shapely Polygon or MultiPolygon objects
|
1021
|
+
stat: Statistic to compute
|
1022
|
+
batch_size: Number of polygons per batch
|
1023
|
+
n_workers: Number of worker processes
|
1024
|
+
show_progress: Whether to display progress bar
|
1025
|
+
check_memory: Whether to check memory before operation
|
1026
|
+
**kwargs: Additional arguments
|
1027
|
+
|
1028
|
+
Returns:
|
1029
|
+
np.ndarray of statistics for each polygon
|
805
1030
|
"""
|
1031
|
+
import sys
|
1032
|
+
|
1033
|
+
# Memory guard check with n_workers consideration
|
1034
|
+
if check_memory:
|
1035
|
+
is_safe = self._memory_guard(
|
1036
|
+
"batched_sampling",
|
1037
|
+
threshold_percent=85.0,
|
1038
|
+
n_workers=n_workers,
|
1039
|
+
raise_error=False,
|
1040
|
+
)
|
1041
|
+
|
1042
|
+
if not is_safe:
|
1043
|
+
# Suggest reducing n_workers
|
1044
|
+
memory_info = self._check_available_memory()
|
1045
|
+
estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)
|
1046
|
+
|
1047
|
+
# Calculate optimal workers
|
1048
|
+
suggested_workers = max(
|
1049
|
+
1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
|
1050
|
+
)
|
1051
|
+
|
1052
|
+
warnings.warn(
|
1053
|
+
f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
|
1054
|
+
f"to reduce memory pressure.",
|
1055
|
+
ResourceWarning,
|
1056
|
+
)
|
1057
|
+
|
1058
|
+
# Platform check
|
1059
|
+
if sys.platform in ["win32", "darwin"]:
|
1060
|
+
import warnings
|
1061
|
+
import multiprocessing as mp
|
1062
|
+
|
1063
|
+
if mp.get_start_method(allow_none=True) != "fork":
|
1064
|
+
warnings.warn(
|
1065
|
+
"Batched sampling may not work on Windows/macOS. "
|
1066
|
+
"Use sample_by_polygons() if you encounter errors.",
|
1067
|
+
RuntimeWarning,
|
1068
|
+
)
|
806
1069
|
|
807
1070
|
def _chunk_list(data_list, chunk_size):
|
808
1071
|
"""Yield successive chunks from data_list."""
|
@@ -813,20 +1076,22 @@ class TifProcessor:
|
|
813
1076
|
return np.array([])
|
814
1077
|
|
815
1078
|
stat_func = stat if callable(stat) else getattr(np, stat)
|
816
|
-
|
817
1079
|
polygon_chunks = list(_chunk_list(polygon_list, batch_size))
|
818
1080
|
|
819
1081
|
with multiprocessing.Pool(
|
820
1082
|
initializer=self._initializer_worker, processes=n_workers
|
821
1083
|
) as pool:
|
822
1084
|
process_func = partial(self._process_polygon_batch, stat_func=stat_func)
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
1085
|
+
if show_progress:
|
1086
|
+
batched_results = list(
|
1087
|
+
tqdm(
|
1088
|
+
pool.imap(process_func, polygon_chunks),
|
1089
|
+
total=len(polygon_chunks),
|
1090
|
+
desc=f"Sampling polygons",
|
1091
|
+
)
|
828
1092
|
)
|
829
|
-
|
1093
|
+
else:
|
1094
|
+
batched_results = list(pool.imap(process_func, polygon_chunks))
|
830
1095
|
|
831
1096
|
results = [item for sublist in batched_results for item in sublist]
|
832
1097
|
|
@@ -839,23 +1104,45 @@ class TifProcessor:
|
|
839
1104
|
This function runs once per worker, not for every task.
|
840
1105
|
"""
|
841
1106
|
global src_handle, memfile_handle
|
842
|
-
|
843
|
-
|
844
|
-
|
1107
|
+
|
1108
|
+
# Priority: merged > reprojected > original (same as open_dataset)
|
1109
|
+
local_file_path = None
|
1110
|
+
if self._merged_file_path:
|
1111
|
+
# Merged file is a local temp file
|
1112
|
+
local_file_path = self._merged_file_path
|
1113
|
+
elif self._reprojected_file_path:
|
1114
|
+
# Reprojected file is a local temp file
|
1115
|
+
local_file_path = self._reprojected_file_path
|
1116
|
+
elif isinstance(self.data_store, LocalDataStore):
|
1117
|
+
# Local file - can open directly
|
1118
|
+
local_file_path = str(self.dataset_path)
|
1119
|
+
|
1120
|
+
if local_file_path:
|
1121
|
+
# Open local file directly
|
1122
|
+
with open(local_file_path, "rb") as f:
|
1123
|
+
memfile_handle = rasterio.MemoryFile(f.read())
|
1124
|
+
src_handle = memfile_handle.open()
|
1125
|
+
else:
|
1126
|
+
# Custom DataStore
|
1127
|
+
with self.data_store.open(str(self.dataset_path), "rb") as f:
|
1128
|
+
memfile_handle = rasterio.MemoryFile(f.read())
|
1129
|
+
src_handle = memfile_handle.open()
|
1130
|
+
|
1131
|
+
def _get_worker_dataset(self):
|
1132
|
+
"""Get dataset handle for worker process."""
|
1133
|
+
global src_handle
|
1134
|
+
if src_handle is None:
|
1135
|
+
raise RuntimeError("Raster dataset not initialized in this process.")
|
1136
|
+
return src_handle
|
845
1137
|
|
846
1138
|
def _process_single_polygon(self, polygon, stat_func):
|
847
1139
|
"""
|
848
1140
|
Helper function to process a single polygon.
|
849
1141
|
This will be run in a separate process.
|
850
1142
|
"""
|
851
|
-
global src_handle
|
852
|
-
if src_handle is None:
|
853
|
-
# This should not happen if the initializer is set up correctly,
|
854
|
-
# but it's a good defensive check.
|
855
|
-
raise RuntimeError("Raster dataset not initialized in this process.")
|
856
|
-
|
857
1143
|
try:
|
858
|
-
|
1144
|
+
src = self._get_worker_dataset()
|
1145
|
+
out_image, _ = mask(src, [polygon], crop=True, filled=False)
|
859
1146
|
|
860
1147
|
if hasattr(out_image, "mask"):
|
861
1148
|
valid_data = out_image.compressed()
|
@@ -866,11 +1153,12 @@ class TifProcessor:
|
|
866
1153
|
else out_image.flatten()
|
867
1154
|
)
|
868
1155
|
|
869
|
-
if len(valid_data)
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
except Exception:
|
1156
|
+
return stat_func(valid_data) if len(valid_data) > 0 else np.nan
|
1157
|
+
except RuntimeError as e:
|
1158
|
+
self.logger.error(f"Worker not initialized: {e}")
|
1159
|
+
return np.nan
|
1160
|
+
except Exception as e:
|
1161
|
+
self.logger.debug(f"Error processing polygon: {e}")
|
874
1162
|
return np.nan
|
875
1163
|
|
876
1164
|
def _process_polygon_batch(self, polygon_batch, stat_func):
|
@@ -882,226 +1170,226 @@ class TifProcessor:
|
|
882
1170
|
for polygon in polygon_batch
|
883
1171
|
]
|
884
1172
|
|
885
|
-
def
|
886
|
-
|
887
|
-
|
1173
|
+
def _to_dataframe(
|
1174
|
+
self,
|
1175
|
+
band_number: Optional[int] = None,
|
1176
|
+
drop_nodata: bool = True,
|
1177
|
+
band_names: Optional[Union[str, List[str]]] = None,
|
1178
|
+
) -> pd.DataFrame:
|
888
1179
|
"""
|
889
|
-
|
890
|
-
|
891
|
-
with self.open_dataset() as src:
|
892
|
-
if self.count != 4:
|
893
|
-
raise ValueError("RGBA mode requires a 4-band TIF file")
|
894
|
-
|
895
|
-
# Read all four bands
|
896
|
-
red, green, blue, alpha = src.read()
|
897
|
-
|
898
|
-
x_coords, y_coords = self._get_pixel_coordinates()
|
899
|
-
|
900
|
-
if drop_transparent:
|
901
|
-
mask = alpha > 0
|
902
|
-
red = np.extract(mask, red)
|
903
|
-
green = np.extract(mask, green)
|
904
|
-
blue = np.extract(mask, blue)
|
905
|
-
alpha = np.extract(mask, alpha)
|
906
|
-
lons = np.extract(mask, x_coords)
|
907
|
-
lats = np.extract(mask, y_coords)
|
908
|
-
else:
|
909
|
-
lons = x_coords.flatten()
|
910
|
-
lats = y_coords.flatten()
|
911
|
-
red = red.flatten()
|
912
|
-
green = green.flatten()
|
913
|
-
blue = blue.flatten()
|
914
|
-
alpha = alpha.flatten()
|
915
|
-
|
916
|
-
# Create DataFrame with RGBA values
|
917
|
-
data = pd.DataFrame(
|
918
|
-
{
|
919
|
-
"lon": lons,
|
920
|
-
"lat": lats,
|
921
|
-
"red": red,
|
922
|
-
"green": green,
|
923
|
-
"blue": blue,
|
924
|
-
"alpha": alpha,
|
925
|
-
}
|
926
|
-
)
|
927
|
-
|
928
|
-
# Normalize alpha values if they're not in [0, 1] range
|
929
|
-
if data["alpha"].max() > 1:
|
930
|
-
data["alpha"] = data["alpha"] / data["alpha"].max()
|
1180
|
+
Process TIF to DataFrame - handles both single-band and multi-band.
|
931
1181
|
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
"""Convert RGB TIF to DataFrame with separate columns for R, G, B values."""
|
937
|
-
if self.mode != "rgb":
|
938
|
-
raise ValueError("Use appropriate method for current mode")
|
939
|
-
|
940
|
-
self.logger.info("Processing RGB dataset...")
|
1182
|
+
Args:
|
1183
|
+
band_number: Specific band to read (1-indexed). If None, reads all bands.
|
1184
|
+
drop_no Whether to drop nodata values
|
1185
|
+
band_names: Custom names for bands (multi-band only)
|
941
1186
|
|
1187
|
+
Returns:
|
1188
|
+
pd.DataFrame with lon, lat, and band value(s)
|
1189
|
+
"""
|
942
1190
|
with self.open_dataset() as src:
|
943
|
-
if
|
944
|
-
|
1191
|
+
if band_number is not None:
|
1192
|
+
# SINGLE BAND MODE
|
1193
|
+
band = src.read(band_number)
|
1194
|
+
mask = self._build_data_mask(band, drop_nodata, src.nodata)
|
1195
|
+
lons, lats = self._extract_coordinates_with_mask(mask)
|
1196
|
+
pixel_values = (
|
1197
|
+
np.extract(mask, band) if mask is not None else band.flatten()
|
1198
|
+
)
|
1199
|
+
band_name = band_names if isinstance(band_names, str) else "pixel_value"
|
945
1200
|
|
946
|
-
|
947
|
-
|
1201
|
+
return pd.DataFrame({"lon": lons, "lat": lats, band_name: pixel_values})
|
1202
|
+
else:
|
1203
|
+
# MULTI-BAND MODE (all bands)
|
1204
|
+
stack = src.read()
|
1205
|
+
|
1206
|
+
# Auto-detect band names by mode
|
1207
|
+
if band_names is None:
|
1208
|
+
if self.mode == "rgb":
|
1209
|
+
band_names = ["red", "green", "blue"]
|
1210
|
+
elif self.mode == "rgba":
|
1211
|
+
band_names = ["red", "green", "blue", "alpha"]
|
1212
|
+
else:
|
1213
|
+
band_names = [
|
1214
|
+
src.descriptions[i] or f"band_{i+1}"
|
1215
|
+
for i in range(self.count)
|
1216
|
+
]
|
948
1217
|
|
949
|
-
|
1218
|
+
# Build mask (checks ALL bands!)
|
1219
|
+
mask = self._build_multi_band_mask(stack, drop_nodata, src.nodata)
|
950
1220
|
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
mask = ~(
|
955
|
-
(red == nodata_value)
|
956
|
-
| (green == nodata_value)
|
957
|
-
| (blue == nodata_value)
|
958
|
-
)
|
959
|
-
red = np.extract(mask, red)
|
960
|
-
green = np.extract(mask, green)
|
961
|
-
blue = np.extract(mask, blue)
|
962
|
-
lons = np.extract(mask, x_coords)
|
963
|
-
lats = np.extract(mask, y_coords)
|
964
|
-
else:
|
965
|
-
lons = x_coords.flatten()
|
966
|
-
lats = y_coords.flatten()
|
967
|
-
red = red.flatten()
|
968
|
-
green = green.flatten()
|
969
|
-
blue = blue.flatten()
|
970
|
-
else:
|
971
|
-
lons = x_coords.flatten()
|
972
|
-
lats = y_coords.flatten()
|
973
|
-
red = red.flatten()
|
974
|
-
green = green.flatten()
|
975
|
-
blue = blue.flatten()
|
1221
|
+
# Create DataFrame
|
1222
|
+
data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
|
1223
|
+
df = pd.DataFrame(data_dict)
|
976
1224
|
|
977
|
-
|
978
|
-
|
979
|
-
"
|
980
|
-
"
|
981
|
-
"
|
982
|
-
|
983
|
-
"
|
984
|
-
}
|
985
|
-
)
|
1225
|
+
# RGBA: normalize alpha if needed
|
1226
|
+
if (
|
1227
|
+
self.mode == "rgba"
|
1228
|
+
and "alpha" in df.columns
|
1229
|
+
and df["alpha"].max() > 1
|
1230
|
+
):
|
1231
|
+
df["alpha"] = df["alpha"] / 255.0
|
986
1232
|
|
987
|
-
|
988
|
-
return data
|
1233
|
+
return df
|
989
1234
|
|
990
|
-
def
|
991
|
-
self,
|
1235
|
+
def _to_dataframe_chunked(
|
1236
|
+
self,
|
1237
|
+
windows: List[rasterio.windows.Window],
|
1238
|
+
band_number: Optional[int] = None,
|
1239
|
+
drop_nodata: bool = True,
|
1240
|
+
band_names: Optional[Union[str, List[str]]] = None,
|
1241
|
+
show_progress: bool = True,
|
992
1242
|
) -> pd.DataFrame:
|
993
|
-
"""
|
994
|
-
if self.mode != "single":
|
995
|
-
raise ValueError("Use appropriate method for current mode")
|
1243
|
+
"""Universal chunked converter for ALL modes."""
|
996
1244
|
|
997
|
-
|
998
|
-
|
999
|
-
if band_number <= 0 or band_number > self.count:
|
1000
|
-
self.logger.error(
|
1001
|
-
f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
|
1002
|
-
)
|
1003
|
-
return None
|
1245
|
+
chunks = []
|
1246
|
+
iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows
|
1004
1247
|
|
1005
1248
|
with self.open_dataset() as src:
|
1249
|
+
# Auto-detect band names ONCE (before loop)
|
1250
|
+
if band_number is None and band_names is None:
|
1251
|
+
if self.mode == "rgb":
|
1252
|
+
band_names = ["red", "green", "blue"]
|
1253
|
+
elif self.mode == "rgba":
|
1254
|
+
band_names = ["red", "green", "blue", "alpha"]
|
1255
|
+
else: # multi
|
1256
|
+
band_names = [
|
1257
|
+
src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
|
1258
|
+
]
|
1006
1259
|
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1260
|
+
for window in iterator:
|
1261
|
+
if band_number is not None:
|
1262
|
+
# SINGLE BAND
|
1263
|
+
band_chunk = src.read(band_number, window=window)
|
1264
|
+
mask = self._build_data_mask(band_chunk, drop_nodata, src.nodata)
|
1265
|
+
lons, lats = self._get_chunk_coordinates(window, src)
|
1266
|
+
band_name = (
|
1267
|
+
band_names if isinstance(band_names, str) else "pixel_value"
|
1268
|
+
)
|
1010
1269
|
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1270
|
+
# Build chunk DataFrame (could use helper but simple enough)
|
1271
|
+
if mask is not None:
|
1272
|
+
mask_flat = mask.flatten()
|
1273
|
+
chunk_df = pd.DataFrame(
|
1274
|
+
{
|
1275
|
+
"lon": lons[mask_flat],
|
1276
|
+
"lat": lats[mask_flat],
|
1277
|
+
band_name: band_chunk.flatten()[mask_flat],
|
1278
|
+
}
|
1279
|
+
)
|
1280
|
+
else:
|
1281
|
+
chunk_df = pd.DataFrame(
|
1282
|
+
{"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
|
1283
|
+
)
|
1284
|
+
else:
|
1285
|
+
# MULTI-BAND (includes RGB/RGBA)
|
1286
|
+
stack_chunk = src.read(window=window)
|
1287
|
+
mask = self._build_multi_band_mask(
|
1288
|
+
stack_chunk, drop_nodata, src.nodata
|
1289
|
+
)
|
1290
|
+
lons, lats = self._get_chunk_coordinates(window, src)
|
1016
1291
|
|
1017
|
-
|
1018
|
-
|
1292
|
+
# Build DataFrame using helper
|
1293
|
+
band_dict = {
|
1294
|
+
band_names[i]: stack_chunk[i] for i in range(self.count)
|
1295
|
+
}
|
1296
|
+
chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)
|
1019
1297
|
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
lats = np.extract(data_mask, y_coords)
|
1025
|
-
else:
|
1026
|
-
pixel_values = band.flatten()
|
1027
|
-
lons = x_coords.flatten()
|
1028
|
-
lats = y_coords.flatten()
|
1298
|
+
# RGBA: normalize alpha
|
1299
|
+
if self.mode == "rgba" and "alpha" in chunk_df.columns:
|
1300
|
+
if chunk_df["alpha"].max() > 1:
|
1301
|
+
chunk_df["alpha"] = chunk_df["alpha"] / 255.0
|
1029
1302
|
|
1030
|
-
|
1303
|
+
chunks.append(chunk_df)
|
1031
1304
|
|
1032
|
-
|
1033
|
-
return
|
1305
|
+
result = pd.concat(chunks, ignore_index=True)
|
1306
|
+
return result
|
1034
1307
|
|
1035
|
-
def
|
1308
|
+
def _prepare_geometry_for_clipping(
|
1036
1309
|
self,
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
) ->
|
1041
|
-
"""
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1310
|
+
geometry: Union[
|
1311
|
+
Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
|
1312
|
+
],
|
1313
|
+
) -> List[dict]:
|
1314
|
+
"""Convert various geometry formats to list of GeoJSON-like dicts for rasterio.mask"""
|
1315
|
+
|
1316
|
+
if isinstance(geometry, (Polygon, MultiPolygon)):
|
1317
|
+
# Shapely geometry
|
1318
|
+
return [geometry.__geo_interface__]
|
1319
|
+
|
1320
|
+
elif isinstance(geometry, gpd.GeoDataFrame):
|
1321
|
+
# GeoDataFrame - use all geometries
|
1322
|
+
return [
|
1323
|
+
geom.__geo_interface__ for geom in geometry.geometry if geom is not None
|
1324
|
+
]
|
1325
|
+
|
1326
|
+
elif isinstance(geometry, gpd.GeoSeries):
|
1327
|
+
# GeoSeries
|
1328
|
+
return [geom.__geo_interface__ for geom in geometry if geom is not None]
|
1329
|
+
|
1330
|
+
elif isinstance(geometry, dict):
|
1331
|
+
# Single GeoJSON-like dict
|
1332
|
+
return [geometry]
|
1333
|
+
|
1334
|
+
elif isinstance(geometry, list):
|
1335
|
+
# List of GeoJSON-like dicts
|
1336
|
+
return geometry
|
1050
1337
|
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
# Read all bands
|
1058
|
-
stack = src.read()
|
1059
|
-
|
1060
|
-
x_coords, y_coords = self._get_pixel_coordinates()
|
1061
|
-
|
1062
|
-
# Initialize dictionary with coordinates
|
1063
|
-
data_dict = {"lon": x_coords.flatten(), "lat": y_coords.flatten()}
|
1338
|
+
else:
|
1339
|
+
raise TypeError(
|
1340
|
+
f"Unsupported geometry type: {type(geometry)}. "
|
1341
|
+
"Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
|
1342
|
+
"GeoJSON-like dict, or list of GeoJSON-like dicts."
|
1343
|
+
)
|
1064
1344
|
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1345
|
+
def _validate_geometry_crs(
|
1346
|
+
self,
|
1347
|
+
original_geometry: Any,
|
1348
|
+
) -> None:
|
1349
|
+
"""Validate that geometry CRS matches raster CRS"""
|
1350
|
+
|
1351
|
+
# Get raster CRS
|
1352
|
+
raster_crs = self.crs
|
1353
|
+
|
1354
|
+
# Try to get geometry CRS
|
1355
|
+
geometry_crs = None
|
1356
|
+
|
1357
|
+
if isinstance(original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)):
|
1358
|
+
geometry_crs = original_geometry.crs
|
1359
|
+
elif hasattr(original_geometry, "crs"):
|
1360
|
+
geometry_crs = original_geometry.crs
|
1361
|
+
|
1362
|
+
# Warn if CRS mismatch detected
|
1363
|
+
if geometry_crs is not None and raster_crs is not None:
|
1364
|
+
if not raster_crs == geometry_crs:
|
1365
|
+
self.logger.warning(
|
1366
|
+
f"CRS mismatch detected! Raster CRS: {raster_crs}, "
|
1367
|
+
f"Geometry CRS: {geometry_crs}. "
|
1368
|
+
"Consider reprojecting geometry to match raster CRS for accurate clipping."
|
1369
|
+
)
|
1071
1370
|
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
if values_to_mask:
|
1085
|
-
data_mask = ~np.isin(band_data, values_to_mask)
|
1086
|
-
band_values = np.extract(data_mask, band_data)
|
1087
|
-
if band_idx == 0: # Only need to mask coordinates once
|
1088
|
-
data_dict["lon"] = np.extract(data_mask, x_coords)
|
1089
|
-
data_dict["lat"] = np.extract(data_mask, y_coords)
|
1090
|
-
else:
|
1091
|
-
band_values = band_data.flatten()
|
1092
|
-
else:
|
1093
|
-
band_values = band_data.flatten()
|
1371
|
+
def _create_clipped_processor(
    self, clipped_data: np.ndarray, clipped_meta: dict
) -> "TifProcessor":
    """
    Build a fresh TifProcessor backed by the given clipped raster.

    Writes the clipped array to a uniquely named temporary GeoTIFF inside this
    instance's temp directory, then opens a new processor on that file.
    """
    temp_name = f"clipped_temp_{os.urandom(8).hex()}.tif"
    clipped_file_path = os.path.join(self._temp_dir, temp_name)

    with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
        dst.write(clipped_data)

    self.logger.info(f"Clipped raster saved to temporary file: {clipped_file_path}")

    # Carry over mode and data store so the new instance behaves consistently
    # with the one it was clipped from.
    return TifProcessor(
        dataset_path=clipped_file_path,
        data_store=self.data_store,
        mode=self.mode,
    )
|
1105
1393
|
|
1106
1394
|
def _get_pixel_coordinates(self):
|
1107
1395
|
"""Helper method to generate coordinate arrays for all pixels"""
|
@@ -1128,79 +1416,322 @@ class TifProcessor:
|
|
1128
1416
|
|
1129
1417
|
return self._cache["pixel_coords"]
|
1130
1418
|
|
1131
|
-
def
|
1132
|
-
|
1419
|
+
def _get_chunk_coordinates(self, window, src):
    """Return x/y coordinate arrays for every pixel inside *window* of *src*."""
    # Transform local window pixel indices into georeferenced coordinates.
    win_transform = src.window_transform(window)
    row_idx, col_idx = np.meshgrid(
        np.arange(window.height), np.arange(window.width), indexing="ij"
    )
    xs, ys = rasterio.transform.xy(win_transform, row_idx.flatten(), col_idx.flatten())
    return np.array(xs), np.array(ys)
|
1133
1427
|
|
1134
|
-
def
|
1135
|
-
"""
|
1136
|
-
|
1137
|
-
shutil.rmtree(self._temp_dir, ignore_errors=True)
|
1428
|
+
def _extract_coordinates_with_mask(self, mask=None):
|
1429
|
+
"""Extract flattened coordinates, optionally applying a mask."""
|
1430
|
+
x_coords, y_coords = self._get_pixel_coordinates()
|
1138
1431
|
|
1139
|
-
|
1140
|
-
|
1141
|
-
if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
|
1142
|
-
shutil.rmtree(self._temp_dir)
|
1143
|
-
self.logger.info("Cleaned up temporary files")
|
1432
|
+
if mask is not None:
|
1433
|
+
return np.extract(mask, x_coords), np.extract(mask, y_coords)
|
1144
1434
|
|
1145
|
-
|
1146
|
-
"""Proper context manager exit with cleanup."""
|
1147
|
-
self.cleanup()
|
1148
|
-
return False
|
1435
|
+
return x_coords.flatten(), y_coords.flatten()
|
1149
1436
|
|
1437
|
+
def _build_data_mask(self, data, drop_nodata=True, nodata_value=None):
|
1438
|
+
"""Build a boolean mask for filtering data based on nodata values."""
|
1439
|
+
if not drop_nodata or nodata_value is None:
|
1440
|
+
return None
|
1150
1441
|
|
1151
|
-
|
1152
|
-
tif_processors: List[TifProcessor], coordinate_list: List[Tuple[float, float]]
|
1153
|
-
):
|
1154
|
-
"""
|
1155
|
-
Sample raster values from multiple TIFF files for given coordinates.
|
1442
|
+
return data != nodata_value
|
1156
1443
|
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1444
|
+
def _build_multi_band_mask(
|
1445
|
+
self,
|
1446
|
+
bands: np.ndarray,
|
1447
|
+
drop_nodata: bool = True,
|
1448
|
+
nodata_value: Optional[float] = None,
|
1449
|
+
) -> Optional[np.ndarray]:
|
1450
|
+
"""
|
1451
|
+
Build mask for multi-band data - drops pixels where ANY band has nodata.
|
1160
1452
|
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1453
|
+
Args:
|
1454
|
+
bands: 3D array of shape (n_bands, height, width)
|
1455
|
+
drop_nodata Whether to drop nodata values
|
1456
|
+
nodata_value: The nodata value to check
|
1457
|
+
|
1458
|
+
Returns:
|
1459
|
+
Boolean mask or None if no masking needed
|
1460
|
+
"""
|
1461
|
+
if not drop_nodata or nodata_value is None:
|
1462
|
+
return None
|
1463
|
+
|
1464
|
+
# Check if ANY band has nodata at each pixel location
|
1465
|
+
has_nodata = np.any(bands == nodata_value, axis=0)
|
1466
|
+
|
1467
|
+
# Return True where ALL bands are valid
|
1468
|
+
valid_mask = ~has_nodata
|
1469
|
+
|
1470
|
+
return valid_mask if not valid_mask.all() else None
|
1165
1471
|
|
1166
|
-
|
1167
|
-
|
1472
|
+
def _bands_to_dict(self, bands, band_count, band_names, mask=None):
|
1473
|
+
"""Read specified bands and return as a dictionary with optional masking."""
|
1474
|
+
|
1475
|
+
lons, lats = self._extract_coordinates_with_mask(mask)
|
1476
|
+
data_dict = {"lon": lons, "lat": lats}
|
1477
|
+
|
1478
|
+
for idx, name in enumerate(band_names[:band_count]):
|
1479
|
+
band_data = bands[idx]
|
1480
|
+
data_dict[name] = (
|
1481
|
+
np.extract(mask, band_data) if mask is not None else band_data.flatten()
|
1482
|
+
)
|
1483
|
+
|
1484
|
+
return data_dict
|
1485
|
+
|
1486
|
+
def _calculate_optimal_chunk_size(
|
1487
|
+
self, operation: str = "conversion", target_memory_mb: int = 500
|
1488
|
+
) -> int:
|
1489
|
+
"""
|
1490
|
+
Calculate optimal chunk size (number of rows) based on target memory usage.
|
1491
|
+
|
1492
|
+
Args:
|
1493
|
+
operation: Type of operation ('conversion', 'graph')
|
1494
|
+
target_memory_mb: Target memory per chunk in megabytes
|
1168
1495
|
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1496
|
+
Returns:
|
1497
|
+
Number of rows per chunk
|
1498
|
+
"""
|
1499
|
+
bytes_per_element = np.dtype(self.dtype).itemsize
|
1500
|
+
n_bands = self.count
|
1501
|
+
width = self.width
|
1502
|
+
|
1503
|
+
# Adjust for operation type
|
1504
|
+
if operation == "conversion":
|
1505
|
+
# DataFrame overhead is roughly 2x
|
1506
|
+
bytes_per_row = width * n_bands * bytes_per_element * 2
|
1507
|
+
elif operation == "graph":
|
1508
|
+
# Graph needs additional space for edges
|
1509
|
+
bytes_per_row = width * bytes_per_element * 4 # Estimate
|
1173
1510
|
else:
|
1174
|
-
|
1511
|
+
bytes_per_row = width * n_bands * bytes_per_element
|
1175
1512
|
|
1176
|
-
|
1513
|
+
target_bytes = target_memory_mb * 1024 * 1024
|
1514
|
+
chunk_rows = max(1, int(target_bytes / bytes_per_row))
|
1177
1515
|
|
1178
|
-
|
1516
|
+
# Ensure chunk size doesn't exceed total height
|
1517
|
+
chunk_rows = min(chunk_rows, self.height)
|
1179
1518
|
|
1519
|
+
self.logger.info(
|
1520
|
+
f"Calculated chunk size: {chunk_rows} rows "
|
1521
|
+
f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
|
1522
|
+
)
|
1180
1523
|
|
1181
|
-
|
1182
|
-
tif_processors: List[TifProcessor],
|
1183
|
-
polygon_list: List[Union[Polygon, MultiPolygon]],
|
1184
|
-
stat: str = "mean",
|
1185
|
-
) -> np.ndarray:
|
1186
|
-
"""
|
1187
|
-
Sample raster values from multiple TIFF files for polygons in a list and join the results.
|
1524
|
+
return chunk_rows
|
1188
1525
|
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
- stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
|
1526
|
+
def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
    """
    Build full-width row-band windows covering the raster top to bottom.

    Args:
        chunk_size: Number of rows per chunk

    Returns:
        List of rasterio.windows.Window objects
    """
    # The last window is shorter when height is not a multiple of chunk_size.
    return [
        rasterio.windows.Window(
            col_off=0,
            row_off=start,
            width=self.width,
            height=min(start + chunk_size, self.height) - start,
        )
        for start in range(0, self.height, chunk_size)
    ]
|
1201
1548
|
|
1202
|
-
|
1549
|
+
def _format_bytes(self, bytes_value: int) -> str:
|
1550
|
+
"""Convert bytes to human-readable format."""
|
1551
|
+
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
1552
|
+
if bytes_value < 1024.0:
|
1553
|
+
return f"{bytes_value:.2f} {unit}"
|
1554
|
+
bytes_value /= 1024.0
|
1555
|
+
return f"{bytes_value:.2f} PB"
|
1556
|
+
|
1557
|
+
def _check_available_memory(self) -> dict:
    """
    Snapshot system memory via psutil.

    Returns:
        Dict with total, available, and used memory info (bytes), the usage
        percentage, and a human-readable 'available' string.
    """
    import psutil

    mem = psutil.virtual_memory()
    return {
        "total": mem.total,
        "available": mem.available,
        "used": mem.used,
        "percent": mem.percent,
        "available_human": self._format_bytes(mem.available),
    }
|
1574
|
+
|
1575
|
+
def _estimate_memory_usage(
|
1576
|
+
self, operation: str = "conversion", n_workers: int = 1
|
1577
|
+
) -> dict:
|
1578
|
+
"""
|
1579
|
+
Estimate memory usage for various operations.
|
1580
|
+
|
1581
|
+
Args:
|
1582
|
+
operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
|
1583
|
+
n_workers: Number of workers (for batched_sampling)
|
1584
|
+
|
1585
|
+
Returns:
|
1586
|
+
Dict with estimated memory usage in bytes and human-readable format
|
1587
|
+
"""
|
1588
|
+
bytes_per_element = np.dtype(self.dtype).itemsize
|
1589
|
+
n_pixels = self.width * self.height
|
1590
|
+
n_bands = self.count
|
1591
|
+
|
1592
|
+
estimates = {}
|
1593
|
+
|
1594
|
+
if operation == "conversion":
|
1595
|
+
# to_dataframe/to_geodataframe: full raster + DataFrame overhead
|
1596
|
+
raster_memory = n_pixels * n_bands * bytes_per_element
|
1597
|
+
# DataFrame overhead (roughly 2x for storage + processing)
|
1598
|
+
dataframe_memory = (
|
1599
|
+
n_pixels * n_bands * 16
|
1600
|
+
) # 16 bytes per value in DataFrame
|
1601
|
+
total = raster_memory + dataframe_memory
|
1602
|
+
estimates["raster"] = raster_memory
|
1603
|
+
estimates["dataframe"] = dataframe_memory
|
1604
|
+
estimates["total"] = total
|
1605
|
+
|
1606
|
+
elif operation == "batched_sampling":
|
1607
|
+
# Each worker loads full raster into MemoryFile
|
1608
|
+
# Need to get file size
|
1609
|
+
if self._merged_file_path:
|
1610
|
+
file_path = self._merged_file_path
|
1611
|
+
elif self._reprojected_file_path:
|
1612
|
+
file_path = self._reprojected_file_path
|
1613
|
+
else:
|
1614
|
+
file_path = str(self.dataset_path)
|
1615
|
+
|
1616
|
+
try:
|
1617
|
+
import os
|
1618
|
+
|
1619
|
+
file_size = os.path.getsize(file_path)
|
1620
|
+
except:
|
1621
|
+
# Estimate if can't get file size
|
1622
|
+
file_size = n_pixels * n_bands * bytes_per_element * 1.2 # Add overhead
|
1623
|
+
|
1624
|
+
estimates["per_worker"] = file_size
|
1625
|
+
estimates["total"] = file_size * n_workers
|
1626
|
+
|
1627
|
+
elif operation == "merge":
|
1628
|
+
# _merge_with_mean uses float64 arrays
|
1629
|
+
raster_memory = n_pixels * n_bands * 8 # float64
|
1630
|
+
estimates["sum_array"] = raster_memory
|
1631
|
+
estimates["count_array"] = n_pixels * 4 # int32
|
1632
|
+
estimates["total"] = raster_memory + n_pixels * 4
|
1633
|
+
|
1634
|
+
elif operation == "graph":
|
1635
|
+
# to_graph: data + node_map + edges
|
1636
|
+
data_memory = n_pixels * bytes_per_element
|
1637
|
+
node_map_memory = n_pixels * 4 # int32
|
1638
|
+
# Estimate edges (rough: 4-connectivity = 4 edges per pixel)
|
1639
|
+
edges_memory = n_pixels * 4 * 3 * 8 # 3 values per edge, float64
|
1640
|
+
total = data_memory + node_map_memory + edges_memory
|
1641
|
+
estimates["data"] = data_memory
|
1642
|
+
estimates["node_map"] = node_map_memory
|
1643
|
+
estimates["edges"] = edges_memory
|
1644
|
+
estimates["total"] = total
|
1645
|
+
|
1646
|
+
# Add human-readable format
|
1647
|
+
estimates["human_readable"] = self._format_bytes(estimates["total"])
|
1648
|
+
|
1649
|
+
return estimates
|
1650
|
+
|
1651
|
+
def _memory_guard(
|
1652
|
+
self,
|
1653
|
+
operation: str,
|
1654
|
+
threshold_percent: float = 80.0,
|
1655
|
+
n_workers: Optional[int] = None,
|
1656
|
+
raise_error: bool = False,
|
1657
|
+
) -> bool:
|
1658
|
+
"""
|
1659
|
+
Check if operation is safe to perform given memory constraints.
|
1660
|
+
|
1661
|
+
Args:
|
1662
|
+
operation: Type of operation to check
|
1663
|
+
threshold_percent: Maximum % of available memory to use (default 80%)
|
1664
|
+
n_workers: Number of workers (for batched operations)
|
1665
|
+
raise_error: If True, raise MemoryError instead of warning
|
1666
|
+
|
1667
|
+
Returns:
|
1668
|
+
True if operation is safe, False otherwise
|
1669
|
+
|
1670
|
+
Raises:
|
1671
|
+
MemoryError: If raise_error=True and memory insufficient
|
1672
|
+
"""
|
1673
|
+
import warnings
|
1674
|
+
|
1675
|
+
estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
|
1676
|
+
memory_info = self._check_available_memory()
|
1677
|
+
|
1678
|
+
estimated_usage = estimates["total"]
|
1679
|
+
available = memory_info["available"]
|
1680
|
+
threshold = available * (threshold_percent / 100.0)
|
1681
|
+
|
1682
|
+
is_safe = estimated_usage <= threshold
|
1683
|
+
|
1684
|
+
if not is_safe:
|
1685
|
+
usage_str = self._format_bytes(estimated_usage)
|
1686
|
+
available_str = memory_info["available_human"]
|
1687
|
+
|
1688
|
+
message = (
|
1689
|
+
f"Memory warning: {operation} operation may require {usage_str} "
|
1690
|
+
f"but only {available_str} is available. "
|
1691
|
+
f"Current memory usage: {memory_info['percent']:.1f}%"
|
1692
|
+
)
|
1693
|
+
|
1694
|
+
if raise_error:
|
1695
|
+
raise MemoryError(message)
|
1696
|
+
else:
|
1697
|
+
warnings.warn(message, ResourceWarning)
|
1698
|
+
if hasattr(self, "logger"):
|
1699
|
+
self.logger.warning(message)
|
1700
|
+
|
1701
|
+
return is_safe
|
1702
|
+
|
1703
|
+
def _validate_mode_band_compatibility(self):
|
1704
|
+
"""Validate that mode matches band count."""
|
1705
|
+
mode_requirements = {
|
1706
|
+
"single": (1, "1-band"),
|
1707
|
+
"rgb": (3, "3-band"),
|
1708
|
+
"rgba": (4, "4-band"),
|
1709
|
+
}
|
1710
|
+
|
1711
|
+
if self.mode in mode_requirements:
|
1712
|
+
required_count, description = mode_requirements[self.mode]
|
1713
|
+
if self.count != required_count:
|
1714
|
+
raise ValueError(
|
1715
|
+
f"{self.mode.upper()} mode requires a {description} TIF file"
|
1716
|
+
)
|
1717
|
+
elif self.mode == "multi" and self.count < 2:
|
1718
|
+
raise ValueError("Multi mode requires a TIF file with 2 or more bands")
|
1719
|
+
|
1720
|
+
def __enter__(self):
|
1721
|
+
return self
|
1722
|
+
|
1723
|
+
def __del__(self):
|
1724
|
+
"""Clean up temporary files and directories."""
|
1725
|
+
if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
|
1726
|
+
shutil.rmtree(self._temp_dir, ignore_errors=True)
|
1727
|
+
|
1728
|
+
def cleanup(self):
    """Explicitly delete the temporary directory and log the cleanup."""
    if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
        shutil.rmtree(self._temp_dir)
        self.logger.info("Cleaned up temporary files")
|
1733
|
+
|
1734
|
+
def __exit__(self, exc_type, exc_value, traceback):
|
1735
|
+
"""Proper context manager exit with cleanup."""
|
1736
|
+
self.cleanup()
|
1737
|
+
return False
|