giga-spatial 0.6.9-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/METADATA +30 -4
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/RECORD +22 -20
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +1 -0
- gigaspatial/core/io/adls_data_store.py +104 -11
- gigaspatial/core/io/local_data_store.py +8 -0
- gigaspatial/generators/poi.py +226 -82
- gigaspatial/generators/zonal/base.py +41 -28
- gigaspatial/generators/zonal/geometry.py +91 -41
- gigaspatial/grid/h3.py +417 -0
- gigaspatial/grid/mercator_tiles.py +1 -1
- gigaspatial/handlers/base.py +22 -8
- gigaspatial/handlers/ghsl.py +22 -8
- gigaspatial/handlers/giga.py +9 -4
- gigaspatial/handlers/healthsites.py +350 -0
- gigaspatial/handlers/osm.py +325 -105
- gigaspatial/handlers/worldpop.py +228 -9
- gigaspatial/processing/geo.py +11 -6
- gigaspatial/processing/tif_processor.py +1183 -496
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/top_level.txt +0 -0
gigaspatial/processing/tif_processor.py
@@ -1,7 +1,9 @@
 import numpy as np
 import pandas as pd
 import geopandas as gpd
-
+import networkx as nx
+import scipy.sparse as sp
+from typing import List, Optional, Tuple, Union, Literal, Callable, Dict, Any
 from pydantic import ConfigDict
 from pydantic.dataclasses import dataclass
 from contextlib import contextmanager
@@ -15,12 +17,17 @@ from functools import partial
 import multiprocessing
 from tqdm import tqdm
 import tempfile
+import shutil
 import os
 
 from gigaspatial.core.io.data_store import DataStore
 from gigaspatial.core.io.local_data_store import LocalDataStore
 from gigaspatial.config import config
 
+# Global variables for multiprocessing workers
+src_handle = None
+memfile_handle = None
+
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class TifProcessor:
@@ -35,50 +42,164 @@ class TifProcessor:
     merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
     target_crs: Optional[str] = None  # For reprojection if needed
     resampling_method: Resampling = Resampling.nearest
+    reprojection_resolution: Optional[Tuple[float, float]] = None
 
     def __post_init__(self):
         """Validate inputs, merge rasters if needed, and set up logging."""
         self.data_store = self.data_store or LocalDataStore()
         self.logger = config.get_logger(self.__class__.__name__)
         self._cache = {}
+        self._temp_dir = tempfile.mkdtemp()
         self._merged_file_path = None
-        self.
+        self._reprojected_file_path = None
 
         # Handle multiple dataset paths
         if isinstance(self.dataset_path, list):
-            self.
-
-
-
+            if len(self.dataset_path) > 1:
+                self.dataset_paths = [Path(p) for p in self.dataset_path]
+                self._validate_multiple_datasets()
+                self._merge_rasters()
+                self.dataset_path = self._merged_file_path
         else:
             self.dataset_paths = [Path(self.dataset_path)]
-            if not self.data_store.file_exists(self.dataset_path):
+            if not self.data_store.file_exists(str(self.dataset_path)):
                 raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")
 
+        # Reproject single raster during initialization if target_crs is set
+        if self.target_crs:
+            self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
+            with self.data_store.open(str(self.dataset_path), "rb") as f:
+                with rasterio.MemoryFile(f.read()) as memfile:
+                    with memfile.open() as src:
+                        self._reprojected_file_path = self._reproject_to_temp_file(
+                            src, self.target_crs
+                        )
+            self.dataset_path = self._reprojected_file_path
+
         self._load_metadata()
+        self._validate_mode_band_compatibility()
 
-
-
-
-        if self.
-
-
-
-
-
+    @contextmanager
+    def open_dataset(self):
+        """Context manager for accessing the dataset, handling temporary reprojected files."""
+        if self._merged_file_path:
+            with rasterio.open(self._merged_file_path) as src:
+                yield src
+        elif self._reprojected_file_path:
+            with rasterio.open(self._reprojected_file_path) as src:
+                yield src
+        elif isinstance(self.data_store, LocalDataStore):
+            with rasterio.open(str(self.dataset_path)) as src:
+                yield src
+        else:
+            with self.data_store.open(str(self.dataset_path), "rb") as f:
+                with rasterio.MemoryFile(f.read()) as memfile:
+                    with memfile.open() as src:
+                        yield src
+
+    def reproject_to(
+        self,
+        target_crs: str,
+        output_path: Optional[Union[str, Path]] = None,
+        resampling_method: Optional[Resampling] = None,
+        resolution: Optional[Tuple[float, float]] = None,
+    ):
+        """
+        Reprojects the current raster to a new CRS and optionally saves it.
+
+        Args:
+            target_crs: The CRS to reproject to (e.g., "EPSG:4326").
+            output_path: The path to save the reprojected raster. If None,
+                it is saved to a temporary file.
+            resampling_method: The resampling method to use.
+            resolution: The target resolution (pixel size) in the new CRS.
+        """
+        self.logger.info(f"Reprojecting raster to {target_crs}...")
+
+        # Use provided or default values
+        resampling_method = resampling_method or self.resampling_method
+        resolution = resolution or self.reprojection_resolution
+
+        with self.open_dataset() as src:
+            if src.crs.to_string() == target_crs:
+                self.logger.info(
+                    "Raster is already in the target CRS. No reprojection needed."
+                )
+                # If output_path is specified, copy the file
+                if output_path:
+                    self.data_store.copy_file(str(self.dataset_path), output_path)
+                return self.dataset_path
+
+            dst_path = output_path or os.path.join(
+                self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
+            )
+
+            with rasterio.open(
+                dst_path,
+                "w",
+                **self._get_reprojection_profile(src, target_crs, resolution),
+            ) as dst:
+                for band_idx in range(1, src.count + 1):
+                    reproject(
+                        source=rasterio.band(src, band_idx),
+                        destination=rasterio.band(dst, band_idx),
+                        src_transform=src.transform,
+                        src_crs=src.crs,
+                        dst_transform=dst.transform,
+                        dst_crs=dst.crs,
+                        resampling=resampling_method,
+                        num_threads=multiprocessing.cpu_count(),
+                    )
+
+        self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
+        return Path(dst_path)
+
+    def get_raster_info(self) -> Dict[str, Any]:
+        """Get comprehensive raster information."""
+        return {
+            "count": self.count,
+            "width": self.width,
+            "height": self.height,
+            "crs": self.crs,
+            "bounds": self.bounds,
+            "transform": self.transform,
+            "dtypes": self.dtype,
+            "nodata": self.nodata,
+            "mode": self.mode,
+            "is_merged": self.is_merged,
+            "source_count": self.source_count,
+        }
+
+    def _reproject_to_temp_file(
+        self, src: rasterio.DatasetReader, target_crs: str
+    ) -> str:
+        """Helper to reproject a raster and save it to a temporary file."""
+        dst_path = os.path.join(
+            self._temp_dir, f"reprojected_temp_{os.urandom(8).hex()}.tif"
+        )
+        profile = self._get_reprojection_profile(
+            src, target_crs, self.reprojection_resolution
+        )
+
+        with rasterio.open(dst_path, "w", **profile) as dst:
+            for band_idx in range(1, src.count + 1):
+                reproject(
+                    source=rasterio.band(src, band_idx),
+                    destination=rasterio.band(dst, band_idx),
+                    src_transform=src.transform,
+                    src_crs=src.crs,
+                    dst_transform=dst.transform,
+                    dst_crs=dst.crs,
+                    resampling=self.resampling_method,
+                )
+        return dst_path
 
     def _validate_multiple_datasets(self):
         """Validate that all datasets exist and have compatible properties."""
         if len(self.dataset_paths) < 2:
             raise ValueError("Multiple dataset paths required for merging")
 
-
-        for path in self.dataset_paths:
-            if not self.data_store.file_exists(path):
-                raise FileNotFoundError(f"Dataset not found at {path}")
-
-        # Load first dataset to get reference properties
-        with self.data_store.open(self.dataset_paths[0], "rb") as f:
+        with self.data_store.open(str(self.dataset_paths[0]), "rb") as f:
             with rasterio.MemoryFile(f.read()) as memfile:
                 with memfile.open() as ref_src:
                     ref_count = ref_src.count
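As a usage sketch of the new initialization paths (not part of the diff; paths and values are hypothetical): passing a list of rasters triggers validation and merging, while setting target_crs reprojects a single raster up front, and open_dataset() then serves whichever temporary file resulted.

    from gigaspatial.processing.tif_processor import TifProcessor

    # Single raster, reprojected to Web Mercator at construction time
    tp = TifProcessor(
        dataset_path="tiles/pop_2020.tif",       # hypothetical path
        target_crs="EPSG:3857",
        reprojection_resolution=(100.0, 100.0),  # optional pixel size in the target CRS
    )

    # Several compatible rasters, merged into one temporary GeoTIFF
    merged = TifProcessor(dataset_path=["tiles/a.tif", "tiles/b.tif"], merge_method="mean")
    with merged.open_dataset() as src:
        print(src.crs, src.width, src.height)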
@@ -87,9 +208,8 @@ class TifProcessor:
                     ref_transform = ref_src.transform
                     ref_nodata = ref_src.nodata
 
-        # Validate all other datasets against reference
         for i, path in enumerate(self.dataset_paths[1:], 1):
-            with self.data_store.open(path, "rb") as f:
+            with self.data_store.open(str(path), "rb") as f:
                 with rasterio.MemoryFile(f.read()) as memfile:
                     with memfile.open() as src:
                         if src.count != ref_count:
@@ -100,9 +220,10 @@ class TifProcessor:
                             raise ValueError(
                                 f"Dataset {i} has dtype {src.dtypes[0]}, expected {ref_dtype}"
                             )
-                        if self.target_crs
-
-                            f"Dataset {i} has CRS {src.crs}, expected {ref_crs}.
+                        if not self.target_crs and src.crs != ref_crs:
+                            self.logger.warning(
+                                f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
+                                "Consider setting target_crs parameter for reprojection before merging."
                             )
                         if self.target_crs is None and not self._transforms_compatible(
                             src.transform, ref_transform
@@ -115,6 +236,46 @@ class TifProcessor:
                                 f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
                             )
 
+    def _get_reprojection_profile(
+        self,
+        src: rasterio.DatasetReader,
+        target_crs: str,
+        resolution: Optional[Tuple[float, float]],
+        compression: str = "lzw",
+    ):
+        """Calculates and returns the profile for a reprojected raster."""
+        if resolution:
+            src_res = (abs(src.transform.a), abs(src.transform.e))
+            self.logger.info(
+                f"Using target resolution: {resolution}. Source resolution: {src_res}."
+            )
+            # Calculate transform and dimensions based on the new resolution
+            dst_transform, width, height = calculate_default_transform(
+                src.crs,
+                target_crs,
+                src.width,
+                src.height,
+                *src.bounds,
+                resolution=resolution,
+            )
+        else:
+            # Keep original resolution but reproject
+            dst_transform, width, height = calculate_default_transform(
+                src.crs, target_crs, src.width, src.height, *src.bounds
+            )
+
+        profile = src.profile.copy()
+        profile.update(
+            {
+                "crs": target_crs,
+                "transform": dst_transform,
+                "width": width,
+                "height": height,
+                "compress": compression,  # Add compression to save space
+            }
+        )
+        return profile
+
     def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
         """Check if two transforms have compatible pixel sizes."""
         return (
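_get_reprojection_profile wraps rasterio's calculate_default_transform; an equivalent standalone sketch (input path hypothetical) looks roughly like this:

    import rasterio
    from rasterio.warp import calculate_default_transform

    with rasterio.open("input.tif") as src:  # hypothetical file
        dst_transform, width, height = calculate_default_transform(
            src.crs, "EPSG:3857", src.width, src.height, *src.bounds,
            resolution=(100.0, 100.0),  # omit to keep the source resolution
        )
        profile = src.profile.copy()
        profile.update(crs="EPSG:3857", transform=dst_transform,
                       width=width, height=height, compress="lzw")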
@@ -126,151 +287,77 @@ class TifProcessor:
         """Merge multiple rasters into a single raster."""
         self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")
 
-        # Create temporary directory for merged file
-        self._temp_dir = tempfile.mkdtemp()
-        merged_filename = "merged_raster.tif"
-        self._merged_file_path = os.path.join(self._temp_dir, merged_filename)
-
         # Open all datasets and handle reprojection if needed
-
-
-
+        datasets_to_merge = []
+        temp_reprojected_files = []
         try:
             for path in self.dataset_paths:
-                with self.data_store.open(path, "rb") as f:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with self.data_store.open(str(path), "rb") as f:
+                    with rasterio.MemoryFile(f.read()) as memfile:
+                        with memfile.open() as src:
+                            if self.target_crs and src.crs != self.target_crs:
+                                self.logger.info(
+                                    f"Reprojecting {path.name} to {self.target_crs} before merging."
+                                )
+                                reprojected_path = self._reproject_to_temp_file(
+                                    src, self.target_crs
+                                )
+                                temp_reprojected_files.append(reprojected_path)
+                                datasets_to_merge.append(
+                                    rasterio.open(reprojected_path)
+                                )
+                            else:
+                                temp_path = os.path.join(
+                                    self._temp_dir,
+                                    f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
+                                )
+                                temp_reprojected_files.append(temp_path)
+
+                                profile = src.profile
+                                with rasterio.open(temp_path, "w", **profile) as dst:
+                                    dst.write(src.read())
+                                datasets_to_merge.append(rasterio.open(temp_path))
+
+            self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")
 
             if self.merge_method == "mean":
-
-
-
-                # Use first source as reference for metadata
-                ref_src = src_files[0]
-                profile = ref_src.profile.copy()
-                profile.update(
-                    {
-                        "height": merged_array.shape[-2],
-                        "width": merged_array.shape[-1],
-                        "transform": merged_transform,
-                    }
+                merged_array, merged_transform = self._merge_with_mean(
+                    datasets_to_merge
                 )
-
-                # Write merged raster
-                with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                    dst.write(merged_array)
-
             else:
-                # Use rasterio's merge function
                 merged_array, merged_transform = merge(
-
+                    datasets_to_merge,
                     method=self.merge_method,
                     resampling=self.resampling_method,
                 )
 
-
-
-
-                profile.update(
-                    {
-                        "height": merged_array.shape[-2],
-                        "width": merged_array.shape[-1],
-                        "transform": merged_transform,
-                    }
-                )
-
-                if self.target_crs:
-                    profile["crs"] = self.target_crs
-
-                # Write merged raster
-                with rasterio.open(self._merged_file_path, "w", **profile) as dst:
-                    dst.write(merged_array)
-
-        finally:
-            # Clean up source files
-            for src in src_files:
-                temp_path = src.name
-                src.close()
-                try:
-                    os.unlink(temp_path)
-                except:
-                    pass
-
-            # Clean up reprojected files
-            for src in reprojected_files:
-                if src not in src_files:  # Don't double-close
-                    temp_path = src.name
-                    src.close()
-                    try:
-                        os.unlink(temp_path)
-                    except:
-                        pass
-
-            self.logger.info("Raster merging completed!")
-
-    def _reproject_rasters(self, src_files, target_crs):
-        """Reproject all rasters to a common CRS before merging."""
-        reprojected_files = []
-
-        for i, src in enumerate(src_files):
-            if src.crs.to_string() == target_crs:
-                # No reprojection needed
-                reprojected_files.append(src)
-                continue
-
-            # Calculate transform and dimensions for reprojection
-            transform, width, height = calculate_default_transform(
-                src.crs,
-                target_crs,
-                src.width,
-                src.height,
-                *src.bounds,
-                resolution=self.resolution if hasattr(self, "resolution") else None,
-            )
-
-            # Create temporary file for reprojected raster
-            temp_file = tempfile.NamedTemporaryFile(suffix=".tif", delete=False)
-            temp_file.close()
-
-            # Set up profile for reprojected raster
-            profile = src.profile.copy()
+            # Get profile from the first file in the list (all should be compatible now)
+            ref_src = datasets_to_merge[0]
+            profile = ref_src.profile.copy()
             profile.update(
                 {
-                    "
-                    "
-                    "
-                    "
+                    "height": merged_array.shape[-2],
+                    "width": merged_array.shape[-1],
+                    "transform": merged_transform,
+                    "crs": self.target_crs if self.target_crs else ref_src.crs,
                 }
             )
 
-
-
-
-
-
-
-                    src_transform=src.transform,
-                    src_crs=src.crs,
-                    dst_transform=transform,
-                    dst_crs=target_crs,
-                    resampling=self.resampling_method,
-                )
+            with rasterio.open(self._merged_file_path, "w", **profile) as dst:
+                dst.write(merged_array)
+        finally:
+            for dataset in datasets_to_merge:
+                if hasattr(dataset, "close"):
+                    dataset.close()
 
-            #
-
+            # Clean up temporary files immediately
+            for temp_file in temp_reprojected_files:
+                try:
+                    os.remove(temp_file)
+                except OSError:
+                    pass
 
-
+            self.logger.info("Raster merging completed!")
 
     def _merge_with_mean(self, src_files):
         """Merge rasters using mean aggregation."""
@@ -295,6 +382,12 @@ class TifProcessor:
             bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
         )
 
+        estimated_memory = height * width * src_files[0].count * 8  # float64
+        if estimated_memory > 1e9:  # 1GB threshold
+            self.logger.warning(
+                f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
+            )
+
         # Initialize arrays for sum and count
         sum_array = np.zeros((src_files[0].count, height, width), dtype=np.float64)
         count_array = np.zeros((height, width), dtype=np.int32)
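The new guard is worst-case arithmetic for the float64 sum array that the mean merge accumulates; for example:

    height = width = 20_000                 # hypothetical mosaic size
    estimated = height * width * 1 * 8      # single band, float64 accumulator
    # 3.2e9 bytes ≈ 3.2 GB > 1e9, so the warning fires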
@@ -336,33 +429,9 @@ class TifProcessor:
 
         return mean_array.astype(src_files[0].dtypes[0]), merged_transform
 
-    def __del__(self):
-        """Cleanup temporary files."""
-        if self._temp_dir and os.path.exists(self._temp_dir):
-            try:
-                import shutil
-
-                shutil.rmtree(self._temp_dir)
-            except:
-                pass
-
-    @contextmanager
-    def open_dataset(self):
-        """Context manager for accessing the dataset"""
-        if self._merged_file_path:
-            # Open merged file directly
-            with rasterio.open(self._merged_file_path) as src:
-                yield src
-        else:
-            # Original single file logic
-            with self.data_store.open(self.dataset_path, "rb") as f:
-                with rasterio.MemoryFile(f.read()) as memfile:
-                    with memfile.open() as src:
-                        yield src
-
     def _load_metadata(self):
         """Load metadata from the TIF file if not already cached"""
-
+        try:
             with self.open_dataset() as src:
                 self._cache["transform"] = src.transform
                 self._cache["crs"] = src.crs.to_string()
@@ -375,6 +444,10 @@ class TifProcessor:
                 self._cache["nodata"] = src.nodata
                 self._cache["count"] = src.count
                 self._cache["dtype"] = src.dtypes[0]
+        except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
+            raise FileNotFoundError(f"Could not read raster metadata: {e}")
+        except Exception as e:
+            raise RuntimeError(f"Unexpected error loading metadata: {e}")
 
     @property
     def is_merged(self) -> bool:
@@ -386,7 +459,6 @@ class TifProcessor:
         """Get the number of source rasters."""
         return len(self.dataset_paths)
 
-    # All other methods remain the same...
     @property
     def transform(self):
         """Get the transform from the TIF file"""
@@ -428,53 +500,48 @@ class TifProcessor:
         return self._cache["nodata"]
 
     @property
-    def
-        """Get the data from the TIF file"""
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    raise ValueError(
-                        f"Invalid mode: {self.mode}. Must be one of: single, rgb, rgba, multi"
-                    )
-        except Exception as e:
-            raise ValueError(
-                f"Failed to process TIF file in mode '{self.mode}'. "
-                f"Please ensure the file is valid and matches the selected mode. "
-                f"Original error: {str(e)}"
-            )
+    def dtype(self):
+        """Get the data types from the TIF file"""
+        return self._cache.get("dtype", [])
+
+    @property
+    def width(self):
+        return self._cache["width"]
+
+    @property
+    def height(self):
+        return self._cache["height"]
+
+    def to_dataframe(
+        self, drop_nodata=True, check_memory=True, **kwargs
+    ) -> pd.DataFrame:
+        """
+        Convert raster to DataFrame.
+
+        Args:
+            drop_nodata: Whether to drop nodata values
+            check_memory: Whether to check memory before operation (default True)
+            **kwargs: Additional arguments
 
-
+        Returns:
+            pd.DataFrame with raster data
+        """
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("conversion", threshold_percent=80.0)
 
-    def to_dataframe(self, drop_nodata=True, **kwargs) -> pd.DataFrame:
         try:
             if self.mode == "single":
-
-
-
-
-
-            elif self.mode == "multi":
-                df = self._to_multi_band_dataframe(drop_nodata=drop_nodata, **kwargs)
+                return self._to_dataframe(
+                    band_number=kwargs.get("band_number", 1),
+                    drop_nodata=drop_nodata,
+                    band_names=kwargs.get("band_names", None),
+                )
             else:
-
-
+                return self._to_dataframe(
+                    band_number=None,  # All bands
+                    drop_nodata=drop_nodata,
+                    band_names=kwargs.get("band_names", None),
                 )
         except Exception as e:
             raise ValueError(
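Sketch of the unified conversion API after this change (the tp instances are hypothetical): every mode now funnels into the shared _to_dataframe helper, with band_number/band_names forwarded through kwargs.

    df = tp.to_dataframe()                               # single mode: lon, lat, pixel_value
    df = tp.to_dataframe(band_names="elevation")         # rename the value column
    df = tp_multi.to_dataframe(band_names=["b1", "b2"])  # multi mode: one column per band
    df = tp.to_dataframe(check_memory=False)             # skip the new memory guard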
@@ -485,12 +552,23 @@ class TifProcessor:
 
         return df
 
-    def to_geodataframe(self, **kwargs) -> gpd.GeoDataFrame:
+    def to_geodataframe(self, check_memory=True, **kwargs) -> gpd.GeoDataFrame:
         """
         Convert the processed TIF data into a GeoDataFrame, where each row represents a pixel zone.
         Each zone is defined by its bounding box, based on pixel resolution and coordinates.
+
+        Args:
+            check_memory: Whether to check memory before operation
+            **kwargs: Additional arguments passed to to_dataframe()
+
+        Returns:
+            gpd.GeoDataFrame with raster data
         """
-
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("conversion", threshold_percent=80.0)
+
+        df = self.to_dataframe(check_memory=False, **kwargs)
 
         x_res, y_res = self.resolution
 
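A one-line usage sketch (hypothetical instance): to_geodataframe() now runs the guard itself and delegates to to_dataframe(check_memory=False, ...), so memory is only checked once per call.

    gdf = tp.to_geodataframe()  # one bounding-box polygon per pixel, sized by resolution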
@@ -504,30 +582,300 @@ class TifProcessor:
 
         return gdf
 
-    def
+    def to_dataframe_chunked(
+        self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
+    ):
         """
-        Convert
-
+        Convert raster to DataFrame using chunked processing for memory efficiency.
+
+        Automatically routes to the appropriate chunked method based on mode.
+        Chunk size is automatically calculated based on target memory usage.
+
+        Args:
+            drop_nodata: Whether to drop nodata values
+            chunk_size: Number of rows per chunk (auto-calculated if None)
+            target_memory_mb: Target memory per chunk in MB (default 500)
+            **kwargs: Additional arguments (band_number, band_names, etc.)
+        """
+
+        if chunk_size is None:
+            chunk_size = self._calculate_optimal_chunk_size(
+                "conversion", target_memory_mb
+            )
+
+        windows = self._get_chunk_windows(chunk_size)
+
+        # SIMPLE ROUTING
+        if self.mode == "single":
+            return self._to_dataframe_chunked(
+                windows,
+                band_number=kwargs.get("band_number", 1),
+                drop_nodata=drop_nodata,
+                band_names=kwargs.get("band_names", None),
+            )
+        else:  # rgb, rgba, multi
+            return self._to_dataframe_chunked(
+                windows,
+                band_number=None,
+                drop_nodata=drop_nodata,
+                band_names=kwargs.get("band_names", None),
+            )
+
+    def clip_to_geometry(
+        self,
+        geometry: Union[
+            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+        ],
+        crop: bool = True,
+        all_touched: bool = True,
+        invert: bool = False,
+        nodata: Optional[Union[int, float]] = None,
+        pad: bool = False,
+        pad_width: float = 0.5,
+        return_clipped_processor: bool = True,
+    ) -> Union["TifProcessor", tuple]:
+        """
+        Clip raster to geometry boundaries.
+
+        Parameters:
+        -----------
+        geometry : various
+            Geometry to clip to. Can be:
+            - Shapely Polygon or MultiPolygon
+            - GeoDataFrame or GeoSeries
+            - List of GeoJSON-like dicts
+            - Single GeoJSON-like dict
+        crop : bool, default True
+            Whether to crop the raster to the extent of the geometry
+        all_touched : bool, default True
+            Include pixels that touch the geometry boundary
+        invert : bool, default False
+            If True, mask pixels inside geometry instead of outside
+        nodata : int or float, optional
+            Value to use for masked pixels. If None, uses raster's nodata value
+        pad : bool, default False
+            Pad geometry by half pixel before clipping
+        pad_width : float, default 0.5
+            Width of padding in pixels if pad=True
+        return_clipped_processor : bool, default True
+            If True, returns new TifProcessor with clipped data
+            If False, returns (clipped_array, transform, metadata)
+
+        Returns:
+        --------
+        TifProcessor or tuple
+            Either new TifProcessor instance or (array, transform, metadata) tuple
+        """
+        # Handle different geometry input types
+        shapes = self._prepare_geometry_for_clipping(geometry)
+
+        # Validate CRS compatibility
+        self._validate_geometry_crs(geometry)
+
+        # Perform the clipping
+        with self.open_dataset() as src:
+            try:
+                clipped_data, clipped_transform = mask(
+                    dataset=src,
+                    shapes=shapes,
+                    crop=crop,
+                    all_touched=all_touched,
+                    invert=invert,
+                    nodata=nodata,
+                    pad=pad,
+                    pad_width=pad_width,
+                    filled=True,
+                )
+
+                # Update metadata for the clipped raster
+                clipped_meta = src.meta.copy()
+                clipped_meta.update(
+                    {
+                        "height": clipped_data.shape[1],
+                        "width": clipped_data.shape[2],
+                        "transform": clipped_transform,
+                        "nodata": nodata if nodata is not None else src.nodata,
+                    }
+                )
+
+            except ValueError as e:
+                if "Input shapes do not overlap raster" in str(e):
+                    raise ValueError(
+                        "The geometry does not overlap with the raster. "
+                        "Check that both are in the same coordinate reference system."
+                    ) from e
+                else:
+                    raise e
+
+        if return_clipped_processor:
+            # Create a new TifProcessor with the clipped data
+            return self._create_clipped_processor(clipped_data, clipped_meta)
+        else:
+            return clipped_data, clipped_transform, clipped_meta
+
+    def clip_to_bounds(
+        self,
+        bounds: tuple,
+        bounds_crs: Optional[str] = None,
+        return_clipped_processor: bool = True,
+    ) -> Union["TifProcessor", tuple]:
+        """
+        Clip raster to rectangular bounds.
+
+        Parameters:
+        -----------
+        bounds : tuple
+            Bounding box as (minx, miny, maxx, maxy)
+        bounds_crs : str, optional
+            CRS of the bounds. If None, assumes same as raster CRS
+        return_clipped_processor : bool, default True
+            If True, returns new TifProcessor, else returns (array, transform, metadata)
+
+        Returns:
+        --------
+        TifProcessor or tuple
+            Either new TifProcessor instance or (array, transform, metadata) tuple
         """
-
-
+        # Create bounding box geometry
+        bbox_geom = box(*bounds)
+
+        # If bounds_crs is specified and different from raster CRS, create GeoDataFrame for reprojection
+        if bounds_crs is not None:
+            raster_crs = self.crs
+
+            if not self.crs == bounds_crs:
+                # Create GeoDataFrame with bounds CRS and reproject
+                bbox_gdf = gpd.GeoDataFrame([1], geometry=[bbox_geom], crs=bounds_crs)
+                bbox_gdf = bbox_gdf.to_crs(raster_crs)
+                bbox_geom = bbox_gdf.geometry.iloc[0]
+
+        return self.clip_to_geometry(
+            geometry=bbox_geom,
+            crop=True,
+            return_clipped_processor=return_clipped_processor,
         )
-        self.logger.info("Converting data to GeoDataFrame with zones...")
 
-
+    def to_graph(
+        self,
+        connectivity: Literal[4, 8] = 4,
+        band: Optional[int] = None,
+        include_coordinates: bool = False,
+        graph_type: Literal["networkx", "sparse"] = "networkx",
+        check_memory: bool = True,
+    ) -> Union[nx.Graph, sp.csr_matrix]:
+        """
+        Convert raster to graph based on pixel adjacency.
 
-
+        Args:
+            connectivity: 4 or 8-connectivity
+            band: Band number (1-indexed)
+            include_coordinates: Include x,y coordinates in nodes
+            graph_type: 'networkx' or 'sparse'
+            check_memory: Whether to check memory before operation
 
-
-
-
-            for lon, lat in zip(df["lon"], df["lat"])
-        ]
+        Returns:
+            Graph representation of raster
+        """
 
-
+        # Memory guard check
+        if check_memory:
+            self._memory_guard("graph", threshold_percent=80.0)
 
-        self.
-
+        with self.open_dataset() as src:
+            band_idx = band - 1 if band is not None else 0
+            if band_idx < 0 or band_idx >= src.count:
+                raise ValueError(
+                    f"Band {band} not available. Raster has {src.count} bands"
+                )
+
+            data = src.read(band_idx + 1)
+            nodata = src.nodata if src.nodata is not None else self.nodata
+            valid_mask = (
+                data != nodata if nodata is not None else np.ones_like(data, dtype=bool)
+            )
+
+            height, width = data.shape
+
+            # Find all valid pixels
+            valid_rows, valid_cols = np.where(valid_mask)
+            num_valid_pixels = len(valid_rows)
+
+            # Create a sequential mapping from (row, col) to a node ID
+            node_map = np.full(data.shape, -1, dtype=int)
+            node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)
+
+            # Define neighborhood offsets
+            if connectivity == 4:
+                # von Neumann neighborhood (4-connectivity)
+                offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
+            else:  # connectivity == 8
+                # Moore neighborhood (8-connectivity)
+                offsets = [
+                    (-1, -1),
+                    (-1, 0),
+                    (-1, 1),
+                    (0, -1),
+                    (0, 1),
+                    (1, -1),
+                    (1, 0),
+                    (1, 1),
+                ]
+
+            # Collect nodes and edges
+            nodes_to_add = []
+            edges_to_add = []
+
+            for i in range(num_valid_pixels):
+                row, col = valid_rows[i], valid_cols[i]
+                current_node_id = node_map[row, col]
+
+                # Prepare node attributes
+                node_attrs = {"value": float(data[row, col])}
+                if include_coordinates:
+                    x, y = src.xy(row, col)
+                    node_attrs["x"] = x
+                    node_attrs["y"] = y
+                nodes_to_add.append((current_node_id, node_attrs))
+
+                # Find neighbors and collect edges
+                for dy, dx in offsets:
+                    neighbor_row, neighbor_col = row + dy, col + dx
+
+                    # Check if neighbor is within bounds and is a valid pixel
+                    if (
+                        0 <= neighbor_row < height
+                        and 0 <= neighbor_col < width
+                        and valid_mask[neighbor_row, neighbor_col]
+                    ):
+                        neighbor_node_id = node_map[neighbor_row, neighbor_col]
+
+                        # Ensure each edge is added only once
+                        if current_node_id < neighbor_node_id:
+                            neighbor_value = float(data[neighbor_row, neighbor_col])
+                            edges_to_add.append(
+                                (current_node_id, neighbor_node_id, neighbor_value)
+                            )
+
+            if graph_type == "networkx":
+                G = nx.Graph()
+                G.add_nodes_from(nodes_to_add)
+                G.add_weighted_edges_from(edges_to_add)
+                return G
+            else:  # sparse matrix
+                edges_array = np.array(edges_to_add)
+                row_indices = edges_array[:, 0]
+                col_indices = edges_array[:, 1]
+                weights = edges_array[:, 2]
+
+                # Add reverse edges for symmetric matrix
+                from_idx = np.append(row_indices, col_indices)
+                to_idx = np.append(col_indices, row_indices)
+                weights = np.append(weights, weights)
+
+                return sp.coo_matrix(
+                    (weights, (from_idx, to_idx)),
+                    shape=(num_valid_pixels, num_valid_pixels),
+                ).tocsr()
 
     def sample_by_coordinates(
         self, coordinate_list: List[Tuple[float, float]], **kwargs
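A hedged sketch tying this hunk's new methods together (the AOI file and tp instance are hypothetical): clipping returns a fresh TifProcessor backed by a temporary GeoTIFF, which can then feed the chunked converter or the graph builder.

    import geopandas as gpd

    aoi = gpd.read_file("aoi.geojson")                   # hypothetical AOI
    clipped = tp.clip_to_geometry(aoi)                   # new TifProcessor instance
    df = clipped.to_dataframe_chunked(target_memory_mb=250)

    G = clipped.to_graph(connectivity=8, include_coordinates=True)  # networkx.Graph
    A = clipped.to_graph(graph_type="sparse")                       # scipy CSR adjacency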
@@ -661,11 +1009,63 @@ class TifProcessor:
         stat: Union[str, Callable] = "mean",
         batch_size: int = 100,
         n_workers: int = 4,
+        show_progress: bool = True,
+        check_memory: bool = True,
         **kwargs,
     ) -> np.ndarray:
         """
         Sample raster values by polygons in parallel using batching.
+
+        Args:
+            polygon_list: List of Shapely Polygon or MultiPolygon objects
+            stat: Statistic to compute
+            batch_size: Number of polygons per batch
+            n_workers: Number of worker processes
+            show_progress: Whether to display progress bar
+            check_memory: Whether to check memory before operation
+            **kwargs: Additional arguments
+
+        Returns:
+            np.ndarray of statistics for each polygon
         """
+        import sys
+
+        # Memory guard check with n_workers consideration
+        if check_memory:
+            is_safe = self._memory_guard(
+                "batched_sampling",
+                threshold_percent=85.0,
+                n_workers=n_workers,
+                raise_error=False,
+            )
+
+            if not is_safe:
+                # Suggest reducing n_workers
+                memory_info = self._check_available_memory()
+                estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)
+
+                # Calculate optimal workers
+                suggested_workers = max(
+                    1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
+                )
+
+                warnings.warn(
+                    f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
+                    f"to reduce memory pressure.",
+                    ResourceWarning,
+                )
+
+        # Platform check
+        if sys.platform in ["win32", "darwin"]:
+            import warnings
+            import multiprocessing as mp
+
+            if mp.get_start_method(allow_none=True) != "fork":
+                warnings.warn(
+                    "Batched sampling may not work on Windows/macOS. "
+                    "Use sample_by_polygons() if you encounter errors.",
+                    RuntimeWarning,
+                )
 
         def _chunk_list(data_list, chunk_size):
             """Yield successive chunks from data_list."""
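Usage sketch for the batched sampler; the method name itself is cut off in this rendering, so sample_by_polygons_batched is an assumption inferred from the sample_by_polygons() fallback named in the warning above.

    stats = tp.sample_by_polygons_batched(  # assumed name, see note above
        polygon_list=list(aoi.geometry),
        stat="mean",
        batch_size=100,
        n_workers=4,
        show_progress=True,
    )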
@@ -676,20 +1076,22 @@ class TifProcessor:
             return np.array([])
 
         stat_func = stat if callable(stat) else getattr(np, stat)
-
         polygon_chunks = list(_chunk_list(polygon_list, batch_size))
 
         with multiprocessing.Pool(
             initializer=self._initializer_worker, processes=n_workers
         ) as pool:
             process_func = partial(self._process_polygon_batch, stat_func=stat_func)
-
-
-
-
-
+            if show_progress:
+                batched_results = list(
+                    tqdm(
+                        pool.imap(process_func, polygon_chunks),
+                        total=len(polygon_chunks),
+                        desc=f"Sampling polygons",
+                    )
                 )
-
+            else:
+                batched_results = list(pool.imap(process_func, polygon_chunks))
 
         results = [item for sublist in batched_results for item in sublist]
 
@@ -701,24 +1103,46 @@ class TifProcessor:
         Opens the raster dataset and stores it in a process-local variable.
         This function runs once per worker, not for every task.
         """
+        global src_handle, memfile_handle
+
+        # Priority: merged > reprojected > original (same as open_dataset)
+        local_file_path = None
+        if self._merged_file_path:
+            # Merged file is a local temp file
+            local_file_path = self._merged_file_path
+        elif self._reprojected_file_path:
+            # Reprojected file is a local temp file
+            local_file_path = self._reprojected_file_path
+        elif isinstance(self.data_store, LocalDataStore):
+            # Local file - can open directly
+            local_file_path = str(self.dataset_path)
+
+        if local_file_path:
+            # Open local file directly
+            with open(local_file_path, "rb") as f:
+                memfile_handle = rasterio.MemoryFile(f.read())
+                src_handle = memfile_handle.open()
+        else:
+            # Custom DataStore
+            with self.data_store.open(str(self.dataset_path), "rb") as f:
+                memfile_handle = rasterio.MemoryFile(f.read())
+                src_handle = memfile_handle.open()
+
+    def _get_worker_dataset(self):
+        """Get dataset handle for worker process."""
         global src_handle
-
-
-
+        if src_handle is None:
+            raise RuntimeError("Raster dataset not initialized in this process.")
+        return src_handle
 
     def _process_single_polygon(self, polygon, stat_func):
         """
         Helper function to process a single polygon.
         This will be run in a separate process.
         """
-        global src_handle
-        if src_handle is None:
-            # This should not happen if the initializer is set up correctly,
-            # but it's a good defensive check.
-            raise RuntimeError("Raster dataset not initialized in this process.")
-
         try:
-
+            src = self._get_worker_dataset()
+            out_image, _ = mask(src, [polygon], crop=True, filled=False)
 
             if hasattr(out_image, "mask"):
                 valid_data = out_image.compressed()
@@ -729,11 +1153,12 @@ class TifProcessor:
                 else out_image.flatten()
             )
 
-            if len(valid_data)
-
-
-
-        except Exception:
+            return stat_func(valid_data) if len(valid_data) > 0 else np.nan
+        except RuntimeError as e:
+            self.logger.error(f"Worker not initialized: {e}")
+            return np.nan
+        except Exception as e:
+            self.logger.debug(f"Error processing polygon: {e}")
             return np.nan
 
     def _process_polygon_batch(self, polygon_batch, stat_func):
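The initializer populates the module-level src_handle/memfile_handle once per worker, which is cheap to inherit under fork and is why the earlier platform check warns on spawn-based platforms. A sketch of opting into fork where the OS supports it (standard multiprocessing calls, not part of this diff):

    import multiprocessing as mp

    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method("fork")  # POSIX only; call once, before any Pool is created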
@@ -745,226 +1170,226 @@ class TifProcessor:
             for polygon in polygon_batch
         ]
 
-    def
-
-
+    def _to_dataframe(
+        self,
+        band_number: Optional[int] = None,
+        drop_nodata: bool = True,
+        band_names: Optional[Union[str, List[str]]] = None,
+    ) -> pd.DataFrame:
         """
-
-
-        with self.open_dataset() as src:
-            if self.count != 4:
-                raise ValueError("RGBA mode requires a 4-band TIF file")
-
-            # Read all four bands
-            red, green, blue, alpha = src.read()
-
-            x_coords, y_coords = self._get_pixel_coordinates()
-
-            if drop_transparent:
-                mask = alpha > 0
-                red = np.extract(mask, red)
-                green = np.extract(mask, green)
-                blue = np.extract(mask, blue)
-                alpha = np.extract(mask, alpha)
-                lons = np.extract(mask, x_coords)
-                lats = np.extract(mask, y_coords)
-            else:
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
-                red = red.flatten()
-                green = green.flatten()
-                blue = blue.flatten()
-                alpha = alpha.flatten()
-
-            # Create DataFrame with RGBA values
-            data = pd.DataFrame(
-                {
-                    "lon": lons,
-                    "lat": lats,
-                    "red": red,
-                    "green": green,
-                    "blue": blue,
-                    "alpha": alpha,
-                }
-            )
-
-            # Normalize alpha values if they're not in [0, 1] range
-            if data["alpha"].max() > 1:
-                data["alpha"] = data["alpha"] / data["alpha"].max()
-
-            self.logger.info("RGBA dataset is processed!")
-            return data
+        Process TIF to DataFrame - handles both single-band and multi-band.
 
-
-
-
-
-
-        self.logger.info("Processing RGB dataset...")
+        Args:
+            band_number: Specific band to read (1-indexed). If None, reads all bands.
+            drop_nodata: Whether to drop nodata values
+            band_names: Custom names for bands (multi-band only)
 
+        Returns:
+            pd.DataFrame with lon, lat, and band value(s)
+        """
         with self.open_dataset() as src:
-            if
-
+            if band_number is not None:
+                # SINGLE BAND MODE
+                band = src.read(band_number)
+                mask = self._build_data_mask(band, drop_nodata, src.nodata)
+                lons, lats = self._extract_coordinates_with_mask(mask)
+                pixel_values = (
+                    np.extract(mask, band) if mask is not None else band.flatten()
+                )
+                band_name = band_names if isinstance(band_names, str) else "pixel_value"
 
-
-
+                return pd.DataFrame({"lon": lons, "lat": lats, band_name: pixel_values})
+            else:
+                # MULTI-BAND MODE (all bands)
+                stack = src.read()
+
+                # Auto-detect band names by mode
+                if band_names is None:
+                    if self.mode == "rgb":
+                        band_names = ["red", "green", "blue"]
+                    elif self.mode == "rgba":
+                        band_names = ["red", "green", "blue", "alpha"]
+                    else:
+                        band_names = [
+                            src.descriptions[i] or f"band_{i+1}"
+                            for i in range(self.count)
+                        ]
 
-
+                # Build mask (checks ALL bands!)
+                mask = self._build_multi_band_mask(stack, drop_nodata, src.nodata)
 
-
-
-
-                mask = ~(
-                    (red == nodata_value)
-                    | (green == nodata_value)
-                    | (blue == nodata_value)
-                )
-                red = np.extract(mask, red)
-                green = np.extract(mask, green)
-                blue = np.extract(mask, blue)
-                lons = np.extract(mask, x_coords)
-                lats = np.extract(mask, y_coords)
-                else:
-                    lons = x_coords.flatten()
-                    lats = y_coords.flatten()
-                    red = red.flatten()
-                    green = green.flatten()
-                    blue = blue.flatten()
-            else:
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
-                red = red.flatten()
-                green = green.flatten()
-                blue = blue.flatten()
+                # Create DataFrame
+                data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
+                df = pd.DataFrame(data_dict)
 
-
-
-                    "
-                    "
-                    "
-
-                    "
-                }
-            )
+                # RGBA: normalize alpha if needed
+                if (
+                    self.mode == "rgba"
+                    and "alpha" in df.columns
+                    and df["alpha"].max() > 1
+                ):
+                    df["alpha"] = df["alpha"] / 255.0
 
-
-            return data
+                return df
 
-    def
-        self,
+    def _to_dataframe_chunked(
+        self,
+        windows: List[rasterio.windows.Window],
+        band_number: Optional[int] = None,
+        drop_nodata: bool = True,
+        band_names: Optional[Union[str, List[str]]] = None,
+        show_progress: bool = True,
     ) -> pd.DataFrame:
-        """
-        if self.mode != "single":
-            raise ValueError("Use appropriate method for current mode")
+        """Universal chunked converter for ALL modes."""
 
-
-
-        if band_number <= 0 or band_number > self.count:
-            self.logger.error(
-                f"Error: Band number {band_number} is out of range. The file has {self.count} bands."
-            )
-            return None
+        chunks = []
+        iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows
 
         with self.open_dataset() as src:
+            # Auto-detect band names ONCE (before loop)
+            if band_number is None and band_names is None:
+                if self.mode == "rgb":
+                    band_names = ["red", "green", "blue"]
+                elif self.mode == "rgba":
+                    band_names = ["red", "green", "blue", "alpha"]
+                else:  # multi
+                    band_names = [
+                        src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
+                    ]
 
-
-
-
+            for window in iterator:
+                if band_number is not None:
+                    # SINGLE BAND
+                    band_chunk = src.read(band_number, window=window)
+                    mask = self._build_data_mask(band_chunk, drop_nodata, src.nodata)
+                    lons, lats = self._get_chunk_coordinates(window, src)
+                    band_name = (
+                        band_names if isinstance(band_names, str) else "pixel_value"
+                    )
 
-
-
-
-
-
+                    # Build chunk DataFrame (could use helper but simple enough)
+                    if mask is not None:
+                        mask_flat = mask.flatten()
+                        chunk_df = pd.DataFrame(
+                            {
+                                "lon": lons[mask_flat],
+                                "lat": lats[mask_flat],
+                                band_name: band_chunk.flatten()[mask_flat],
+                            }
+                        )
+                    else:
+                        chunk_df = pd.DataFrame(
+                            {"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
+                        )
+                else:
+                    # MULTI-BAND (includes RGB/RGBA)
+                    stack_chunk = src.read(window=window)
+                    mask = self._build_multi_band_mask(
+                        stack_chunk, drop_nodata, src.nodata
+                    )
+                    lons, lats = self._get_chunk_coordinates(window, src)
 
-
-
+                    # Build DataFrame using helper
+                    band_dict = {
+                        band_names[i]: stack_chunk[i] for i in range(self.count)
+                    }
+                    chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)
 
-
-
-
-
-                lats = np.extract(data_mask, y_coords)
-            else:
-                pixel_values = band.flatten()
-                lons = x_coords.flatten()
-                lats = y_coords.flatten()
+                # RGBA: normalize alpha
+                if self.mode == "rgba" and "alpha" in chunk_df.columns:
+                    if chunk_df["alpha"].max() > 1:
+                        chunk_df["alpha"] = chunk_df["alpha"] / 255.0
 
-
+                chunks.append(chunk_df)
 
-
-        return
+        result = pd.concat(chunks, ignore_index=True)
+        return result
 
-    def
+    def _prepare_geometry_for_clipping(
         self,
-
-
-
-    ) ->
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        geometry: Union[
+            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
+        ],
+    ) -> List[dict]:
+        """Convert various geometry formats to list of GeoJSON-like dicts for rasterio.mask"""
+
+        if isinstance(geometry, (Polygon, MultiPolygon)):
+            # Shapely geometry
+            return [geometry.__geo_interface__]
+
+        elif isinstance(geometry, gpd.GeoDataFrame):
+            # GeoDataFrame - use all geometries
+            return [
+                geom.__geo_interface__ for geom in geometry.geometry if geom is not None
+            ]
+
+        elif isinstance(geometry, gpd.GeoSeries):
+            # GeoSeries
+            return [geom.__geo_interface__ for geom in geometry if geom is not None]
+
+        elif isinstance(geometry, dict):
+            # Single GeoJSON-like dict
+            return [geometry]
+
+        elif isinstance(geometry, list):
+            # List of GeoJSON-like dicts
+            return geometry
 
-
-
-
-
+        else:
+            raise TypeError(
+                f"Unsupported geometry type: {type(geometry)}. "
+                "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
+                "GeoJSON-like dict, or list of GeoJSON-like dicts."
+            )
 
-
-
-
-
-
-
+    def _validate_geometry_crs(
+        self,
+        original_geometry: Any,
+    ) -> None:
+        """Validate that geometry CRS matches raster CRS"""
+
+        # Get raster CRS
+        raster_crs = self.crs
+
+        # Try to get geometry CRS
+        geometry_crs = None
+
+        if isinstance(original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)):
+            geometry_crs = original_geometry.crs
+        elif hasattr(original_geometry, "crs"):
+            geometry_crs = original_geometry.crs
+
+        # Warn if CRS mismatch detected
+        if geometry_crs is not None and raster_crs is not None:
+            if not raster_crs == geometry_crs:
+                self.logger.warning(
+                    f"CRS mismatch detected! Raster CRS: {raster_crs}, "
+                    f"Geometry CRS: {geometry_crs}. "
+                    "Consider reprojecting geometry to match raster CRS for accurate clipping."
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-                if values_to_mask:
-                    data_mask = ~np.isin(band_data, values_to_mask)
-                    band_values = np.extract(data_mask, band_data)
-                    if band_idx == 0:  # Only need to mask coordinates once
-                        data_dict["lon"] = np.extract(data_mask, x_coords)
-                        data_dict["lat"] = np.extract(data_mask, y_coords)
-                    else:
-                        band_values = band_data.flatten()
-                else:
-                    band_values = band_data.flatten()
+    def _create_clipped_processor(
+        self, clipped_data: np.ndarray, clipped_meta: dict
+    ) -> "TifProcessor":
+        """
+        Helper to create a new TifProcessor instance from clipped data.
+        Saves the clipped data to a temporary file and initializes a new TifProcessor.
+        """
+        clipped_file_path = os.path.join(
+            self._temp_dir, f"clipped_temp_{os.urandom(8).hex()}.tif"
+        )
+        with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
+            dst.write(clipped_data)
 
-
-                band_name = (
-                    band_names[band_idx]
-                    if band_names and len(band_names) > band_idx
-                    else f"band_{band_idx + 1}"
-                )
-                data_dict[band_name] = band_values
+        self.logger.info(f"Clipped raster saved to temporary file: {clipped_file_path}")
 
-
-
+        # Create a new TifProcessor instance with the clipped data
+        # Pass relevant parameters from the current instance to maintain consistency
+        return TifProcessor(
+            dataset_path=clipped_file_path,
+            data_store=self.data_store,
+            mode=self.mode,
+        )
 
     def _get_pixel_coordinates(self):
         """Helper method to generate coordinate arrays for all pixels"""
@@ -991,60 +1416,322 @@ class TifProcessor:
|
|
991
1416
|
|
992
1417
|
return self._cache["pixel_coords"]
|
993
1418
|
|
1419
|
+
def _get_chunk_coordinates(self, window, src):
|
1420
|
+
"""Get coordinates for a specific window chunk."""
|
1421
|
+
transform = src.window_transform(window)
|
1422
|
+
rows, cols = np.meshgrid(
|
1423
|
+
np.arange(window.height), np.arange(window.width), indexing="ij"
|
1424
|
+
)
|
1425
|
+
xs, ys = rasterio.transform.xy(transform, rows.flatten(), cols.flatten())
|
1426
|
+
return np.array(xs), np.array(ys)
|
994
1427
|
|
995
|
-
def
|
996
|
-
|
997
|
-
)
|
998
|
-
"""
|
999
|
-
Sample raster values from multiple TIFF files for given coordinates.
|
1428
|
+
def _extract_coordinates_with_mask(self, mask=None):
|
1429
|
+
"""Extract flattened coordinates, optionally applying a mask."""
|
1430
|
+
x_coords, y_coords = self._get_pixel_coordinates()
|
1000
1431
|
|
1001
|
-
|
1002
|
-
|
1003
|
-
- coordinate_list: List of (x, y) coordinates.
|
1432
|
+
if mask is not None:
|
1433
|
+
return np.extract(mask, x_coords), np.extract(mask, y_coords)
|
1004
1434
|
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1435
|
+
return x_coords.flatten(), y_coords.flatten()
|
1436
|
+
|
1437
|
+
def _build_data_mask(self, data, drop_nodata=True, nodata_value=None):
|
1438
|
+
"""Build a boolean mask for filtering data based on nodata values."""
|
1439
|
+
if not drop_nodata or nodata_value is None:
|
1440
|
+
return None
|
1441
|
+
|
1442
|
+
return data != nodata_value
|
1443
|
+
|
1444
|
+
def _build_multi_band_mask(
|
1445
|
+
self,
|
1446
|
+
bands: np.ndarray,
|
1447
|
+
drop_nodata: bool = True,
|
1448
|
+
nodata_value: Optional[float] = None,
|
1449
|
+
) -> Optional[np.ndarray]:
|
1450
|
+
"""
|
1451
|
+
Build mask for multi-band data - drops pixels where ANY band has nodata.
|
1452
|
+
|
1453
|
+
Args:
|
1454
|
+
bands: 3D array of shape (n_bands, height, width)
|
1455
|
+
drop_nodata Whether to drop nodata values
|
1456
|
+
nodata_value: The nodata value to check
|
1457
|
+
|
1458
|
+
Returns:
|
1459
|
+
Boolean mask or None if no masking needed
|
1460
|
+
"""
|
1461
|
+
if not drop_nodata or nodata_value is None:
|
1462
|
+
return None
|
1463
|
+
|
1464
|
+
# Check if ANY band has nodata at each pixel location
|
1465
|
+
has_nodata = np.any(bands == nodata_value, axis=0)
|
1466
|
+
|
1467
|
+
# Return True where ALL bands are valid
|
1468
|
+
valid_mask = ~has_nodata
|
1469
|
+
|
1470
|
+
return valid_mask if not valid_mask.all() else None
|
1471
|
+
|
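A worked example of the any-band-has-nodata rule above in plain NumPy (toy values, not the package API):

import numpy as np

# Toy 2-band, 2x2 raster; -9999 marks nodata (values are illustrative).
bands = np.array(
    [[[1.0, -9999.0], [3.0, 4.0]],
     [[5.0, 6.0], [-9999.0, 8.0]]]
)
nodata_value = -9999.0

# A pixel survives only if every band is valid at that location.
has_nodata = np.any(bands == nodata_value, axis=0)  # [[False, True], [True, False]]
valid_mask = ~has_nodata                            # [[True, False], [False, True]]

# Applying the mask to one band keeps only fully valid pixels.
print(np.extract(valid_mask, bands[0]))             # [1. 4.]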
+    def _bands_to_dict(self, bands, band_count, band_names, mask=None):
+        """Read specified bands and return as a dictionary with optional masking."""
+
+        lons, lats = self._extract_coordinates_with_mask(mask)
+        data_dict = {"lon": lons, "lat": lats}
+
+        for idx, name in enumerate(band_names[:band_count]):
+            band_data = bands[idx]
+            data_dict[name] = (
+                np.extract(mask, band_data) if mask is not None else band_data.flatten()
+            )
+
+        return data_dict
+
+    def _calculate_optimal_chunk_size(
+        self, operation: str = "conversion", target_memory_mb: int = 500
+    ) -> int:
+        """
+        Calculate optimal chunk size (number of rows) based on target memory usage.
 
-
-
+        Args:
+            operation: Type of operation ('conversion', 'graph')
+            target_memory_mb: Target memory per chunk in megabytes
 
-
-
-
-
+        Returns:
+            Number of rows per chunk
+        """
+        bytes_per_element = np.dtype(self.dtype).itemsize
+        n_bands = self.count
+        width = self.width
+
+        # Adjust for operation type
+        if operation == "conversion":
+            # DataFrame overhead is roughly 2x
+            bytes_per_row = width * n_bands * bytes_per_element * 2
+        elif operation == "graph":
+            # Graph needs additional space for edges
+            bytes_per_row = width * bytes_per_element * 4  # Estimate
         else:
-
+            bytes_per_row = width * n_bands * bytes_per_element
 
-
+        target_bytes = target_memory_mb * 1024 * 1024
+        chunk_rows = max(1, int(target_bytes / bytes_per_row))
 
-
+        # Ensure chunk size doesn't exceed total height
+        chunk_rows = min(chunk_rows, self.height)
 
+        self.logger.info(
+            f"Calculated chunk size: {chunk_rows} rows "
+            f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
+        )
 
-
-
-
-
-
-    """
-    Sample raster values from multiple TIFF files for polygons in a list and join the results.
+        return chunk_rows
+
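Plugging assumed numbers into the formula above makes the sizing concrete: a single-band float32 raster 10,000 pixels wide, with the default 500 MB target and the "conversion" path, yields roughly 6,553 rows per chunk.

# Assumed raster: single band, float32 (4 bytes), 10,000 pixels wide.
width, n_bands, bytes_per_element = 10_000, 1, 4
target_memory_mb = 500

bytes_per_row = width * n_bands * bytes_per_element * 2   # "conversion" path: 80,000 B
target_bytes = target_memory_mb * 1024 * 1024             # 524,288,000 B
chunk_rows = max(1, int(target_bytes / bytes_per_row))    # 6553 rows per chunk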
+    def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
+        """
+        Generate window objects for chunked reading.
 
-
-
-    - polygon_list: List of polygon geometries (can include MultiPolygons).
-    - stat: Aggregation statistic to compute within each polygon (mean, median, sum, min, max).
+        Args:
+            chunk_size: Number of rows per chunk
 
-
-
-
-
+        Returns:
+            List of rasterio.windows.Window objects
+        """
+        windows = []
+        for row_start in range(0, self.height, chunk_size):
+            row_end = min(row_start + chunk_size, self.height)
+            window = rasterio.windows.Window(
+                col_off=0,
+                row_off=row_start,
+                width=self.width,
+                height=row_end - row_start,
+            )
+            windows.append(window)
+
+        return windows
+
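These row-band windows exist to bound peak memory during reads. A standalone sketch of how such windows are typically consumed with rasterio, mirroring the loop in _get_chunk_windows rather than calling the private method (the file name is a placeholder):

import numpy as np
import rasterio
from rasterio.windows import Window

chunk_size = 1024  # rows per chunk, e.g. from _calculate_optimal_chunk_size

with rasterio.open("population.tif") as src:
    total = 0.0
    for row_off in range(0, src.height, chunk_size):
        height = min(chunk_size, src.height - row_off)
        window = Window(col_off=0, row_off=row_off, width=src.width, height=height)
        block = src.read(1, window=window)  # only this chunk is in memory
        total += float(np.nansum(block))    # example per-chunk reduction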
+    def _format_bytes(self, bytes_value: int) -> str:
+        """Convert bytes to human-readable format."""
+        for unit in ["B", "KB", "MB", "GB", "TB"]:
+            if bytes_value < 1024.0:
+                return f"{bytes_value:.2f} {unit}"
+            bytes_value /= 1024.0
+        return f"{bytes_value:.2f} PB"
+
+    def _check_available_memory(self) -> dict:
+        """
+        Check available system memory.
+
+        Returns:
+            Dict with total, available, and used memory info
+        """
+        import psutil
+
+        memory = psutil.virtual_memory()
+        return {
+            "total": memory.total,
+            "available": memory.available,
+            "used": memory.used,
+            "percent": memory.percent,
+            "available_human": self._format_bytes(memory.available),
+        }
+
+    def _estimate_memory_usage(
+        self, operation: str = "conversion", n_workers: int = 1
+    ) -> dict:
+        """
+        Estimate memory usage for various operations.
+
+        Args:
+            operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
+            n_workers: Number of workers (for batched_sampling)
+
+        Returns:
+            Dict with estimated memory usage in bytes and human-readable format
+        """
+        bytes_per_element = np.dtype(self.dtype).itemsize
+        n_pixels = self.width * self.height
+        n_bands = self.count
+
+        estimates = {}
+
+        if operation == "conversion":
+            # to_dataframe/to_geodataframe: full raster + DataFrame overhead
+            raster_memory = n_pixels * n_bands * bytes_per_element
+            # DataFrame overhead (roughly 2x for storage + processing)
+            dataframe_memory = (
+                n_pixels * n_bands * 16
+            )  # 16 bytes per value in DataFrame
+            total = raster_memory + dataframe_memory
+            estimates["raster"] = raster_memory
+            estimates["dataframe"] = dataframe_memory
+            estimates["total"] = total
+
+        elif operation == "batched_sampling":
+            # Each worker loads full raster into MemoryFile
+            # Need to get file size
+            if self._merged_file_path:
+                file_path = self._merged_file_path
+            elif self._reprojected_file_path:
+                file_path = self._reprojected_file_path
+            else:
+                file_path = str(self.dataset_path)
+
+            try:
+                import os
+
+                file_size = os.path.getsize(file_path)
+            except:
+                # Estimate if can't get file size
+                file_size = n_pixels * n_bands * bytes_per_element * 1.2  # Add overhead
+
+            estimates["per_worker"] = file_size
+            estimates["total"] = file_size * n_workers
+
+        elif operation == "merge":
+            # _merge_with_mean uses float64 arrays
+            raster_memory = n_pixels * n_bands * 8  # float64
+            estimates["sum_array"] = raster_memory
+            estimates["count_array"] = n_pixels * 4  # int32
+            estimates["total"] = raster_memory + n_pixels * 4
+
+        elif operation == "graph":
+            # to_graph: data + node_map + edges
+            data_memory = n_pixels * bytes_per_element
+            node_map_memory = n_pixels * 4  # int32
+            # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
+            edges_memory = n_pixels * 4 * 3 * 8  # 3 values per edge, float64
+            total = data_memory + node_map_memory + edges_memory
+            estimates["data"] = data_memory
+            estimates["node_map"] = node_map_memory
+            estimates["edges"] = edges_memory
+            estimates["total"] = total
+
+        # Add human-readable format
+        estimates["human_readable"] = self._format_bytes(estimates["total"])
+
+        return estimates
+
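For scale, the "conversion" branch with an assumed 10,000 x 10,000 single-band float32 raster estimates roughly 2 GB:

# Assumed raster: 10,000 x 10,000 pixels, 1 band, float32 (4 bytes/value).
n_pixels = 10_000 * 10_000

raster_memory = n_pixels * 1 * 4          # 400,000,000 B (~381 MB of raw pixels)
dataframe_memory = n_pixels * 1 * 16      # 1,600,000,000 B at 16 B per DataFrame value
total = raster_memory + dataframe_memory  # ~2 GB estimated for a full conversion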
+    def _memory_guard(
+        self,
+        operation: str,
+        threshold_percent: float = 80.0,
+        n_workers: Optional[int] = None,
+        raise_error: bool = False,
+    ) -> bool:
+        """
+        Check if operation is safe to perform given memory constraints.
+
+        Args:
+            operation: Type of operation to check
+            threshold_percent: Maximum % of available memory to use (default 80%)
+            n_workers: Number of workers (for batched operations)
+            raise_error: If True, raise MemoryError instead of warning
+
+        Returns:
+            True if operation is safe, False otherwise
 
-
-
+        Raises:
+            MemoryError: If raise_error=True and memory insufficient
+        """
+        import warnings
+
+        estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
+        memory_info = self._check_available_memory()
+
+        estimated_usage = estimates["total"]
+        available = memory_info["available"]
+        threshold = available * (threshold_percent / 100.0)
+
+        is_safe = estimated_usage <= threshold
+
+        if not is_safe:
+            usage_str = self._format_bytes(estimated_usage)
+            available_str = memory_info["available_human"]
+
+            message = (
+                f"Memory warning: {operation} operation may require {usage_str} "
+                f"but only {available_str} is available. "
+                f"Current memory usage: {memory_info['percent']:.1f}%"
+            )
 
-
+            if raise_error:
+                raise MemoryError(message)
+            else:
+                warnings.warn(message, ResourceWarning)
+                if hasattr(self, "logger"):
+                    self.logger.warning(message)
+
+        return is_safe
+
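A sketch of the intended guard-then-fallback pattern; the actual call sites inside the conversion methods are not shown in this hunk, so this usage is an assumption built only from the signatures above:

# Hypothetical usage: check memory headroom before a full-raster conversion.
processor = TifProcessor(dataset_path="population.tif")

if processor._memory_guard("conversion", threshold_percent=80.0):
    df = processor.to_dataframe()  # referenced elsewhere in this file
else:
    # Fall back to a chunked path sized by the helpers above.
    chunk_rows = processor._calculate_optimal_chunk_size("conversion")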
+    def _validate_mode_band_compatibility(self):
+        """Validate that mode matches band count."""
+        mode_requirements = {
+            "single": (1, "1-band"),
+            "rgb": (3, "3-band"),
+            "rgba": (4, "4-band"),
+        }
+
+        if self.mode in mode_requirements:
+            required_count, description = mode_requirements[self.mode]
+            if self.count != required_count:
+                raise ValueError(
+                    f"{self.mode.upper()} mode requires a {description} TIF file"
+                )
+        elif self.mode == "multi" and self.count < 2:
+            raise ValueError("Multi mode requires a TIF file with 2 or more bands")
 
-
+    def __enter__(self):
+        return self
 
-
+    def __del__(self):
+        """Clean up temporary files and directories."""
+        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir, ignore_errors=True)
+
+    def cleanup(self):
+        """Explicit cleanup method for better control."""
+        if hasattr(self, "_temp_dir") and os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir)
+        self.logger.info("Cleaned up temporary files")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Proper context manager exit with cleanup."""
+        self.cleanup()
+        return False
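With __enter__/__exit__ and cleanup() in place, the processor can be used as a context manager so the temporary directory holding merged, reprojected, and clipped intermediates is removed deterministically. A minimal sketch (to_dataframe is referenced elsewhere in this file and assumed here):

with TifProcessor(dataset_path="population.tif") as tp:
    df = tp.to_dataframe()
# The temporary directory is gone at this point; __del__ remains a
# best-effort fallback for instances never used as context managers.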