giga-spatial 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/processing/geo.py
@@ -0,0 +1,1054 @@
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import base
from typing import Literal, List, Tuple, Optional, Union, Dict
import re

from gigaspatial.core.io.data_store import DataStore
from gigaspatial.config import config

LOGGER = config.get_logger("GigaSpatialProcessing")


def detect_coordinate_columns(
    data, lat_keywords=None, lon_keywords=None, case_sensitive=False
) -> Tuple[str, str]:
    """
    Detect latitude and longitude columns in a DataFrame using keyword matching.

    Parameters:
    ----------
    data : pandas.DataFrame
        DataFrame to search for coordinate columns.
    lat_keywords : list of str, optional
        Keywords for identifying latitude columns. If None, uses default keywords.
    lon_keywords : list of str, optional
        Keywords for identifying longitude columns. If None, uses default keywords.
    case_sensitive : bool, optional
        Whether to perform case-sensitive matching. Default is False.

    Returns:
    -------
    tuple[str, str]
        Names of detected (latitude, longitude) columns.

    Raises:
    ------
    ValueError
        If no unique pair of latitude/longitude columns can be found.
    TypeError
        If input data is not a pandas DataFrame.
    """

    # Default keywords if none provided
    default_lat = [
        "latitude",
        "lat",
        "y",
        "lat_",
        "lat(s)",
        "_lat",
        "ylat",
        "latitude_y",
    ]
    default_lon = [
        "longitude",
        "lon",
        "long",
        "x",
        "lon_",
        "lon(e)",
        "long(e)",
        "_lon",
        "xlon",
        "longitude_x",
    ]

    lat_keywords = lat_keywords or default_lat
    lon_keywords = lon_keywords or default_lon

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not data.columns.is_unique:
        raise ValueError("DataFrame contains duplicate column names")

    def create_pattern(keywords):
        """Create regex pattern from keywords."""
        return "|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords)

    def find_matching_columns(columns, pattern, case_sensitive) -> List:
        """Find columns matching the pattern."""
        flags = 0 if case_sensitive else re.IGNORECASE
        return [col for col in columns if re.search(pattern, col, flags=flags)]

    try:
        # Create patterns
        lat_pattern = create_pattern(lat_keywords)
        lon_pattern = create_pattern(lon_keywords)

        # Find matching columns
        lat_cols = find_matching_columns(data.columns, lat_pattern, case_sensitive)
        lon_cols = find_matching_columns(data.columns, lon_pattern, case_sensitive)

        # Remove columns that match both patterns from both lists, so a single
        # column cannot be claimed as latitude and longitude at once. The
        # overlap is computed first; filtering sequentially would leave an
        # ambiguous column in whichever list is filtered second.
        ambiguous = set(lat_cols) & set(lon_cols)
        lat_cols = [col for col in lat_cols if col not in ambiguous]
        lon_cols = [col for col in lon_cols if col not in ambiguous]

        # Detailed error messages based on what was found
        if not lat_cols and not lon_cols:
            columns_list = "\n".join(f"- {col}" for col in data.columns)
            raise ValueError(
                f"No latitude or longitude columns found. Available columns are:\n{columns_list}\n"
                f"Consider adding more keywords or checking column names."
            )

        if not lat_cols:
            found_lons = ", ".join(lon_cols)
            raise ValueError(
                f"Found longitude columns ({found_lons}) but no latitude columns. "
                "Check latitude keywords or column names."
            )

        if not lon_cols:
            found_lats = ", ".join(lat_cols)
            raise ValueError(
                f"Found latitude columns ({found_lats}) but no longitude columns. "
                "Check longitude keywords or column names."
            )

        if len(lat_cols) > 1 or len(lon_cols) > 1:
            raise ValueError(
                f"Multiple possible coordinate columns found:\n"
                f"Latitude candidates: {lat_cols}\n"
                f"Longitude candidates: {lon_cols}\n"
                "Please specify more precise keywords."
            )

        return lat_cols[0], lon_cols[0]

    except Exception as e:
        if isinstance(e, ValueError):
            raise
        raise RuntimeError(f"Error detecting coordinate columns: {str(e)}")


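# Example (illustrative only, not part of the released module): the detector
# resolves coordinate columns by keyword on a hypothetical DataFrame.
#
#   df = pd.DataFrame({"Latitude": [5.6], "Longitude": [-0.2], "name": ["Accra"]})
#   lat_col, lon_col = detect_coordinate_columns(df)  # -> ("Latitude", "Longitude")

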
def convert_to_geodataframe(
    data: pd.DataFrame, lat_col: str = None, lon_col: str = None, crs="EPSG:4326"
) -> gpd.GeoDataFrame:
    """
    Convert a pandas DataFrame to a GeoDataFrame, either from latitude/longitude columns
    or from a WKT geometry column.

    Parameters:
    ----------
    data : pandas.DataFrame
        Input DataFrame containing either lat/lon columns or a geometry column.
    lat_col : str, optional
        Name of the latitude column. If None, it is auto-detected.
    lon_col : str, optional
        Name of the longitude column. If None, it is auto-detected.
    crs : str or pyproj.CRS, optional
        Coordinate Reference System of the geometry data. Default is 'EPSG:4326'.

    Returns:
    -------
    geopandas.GeoDataFrame
        A GeoDataFrame containing the input data with a geometry column.

    Raises:
    ------
    TypeError
        If input is not a pandas DataFrame.
    ValueError
        If required columns are missing or contain invalid data.
    """

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input 'data' must be a pandas DataFrame")

    # Create a copy to avoid modifying the input
    df = data.copy()

    try:
        if "geometry" not in df.columns:
            # If column names not provided, try to detect them
            if lat_col is None or lon_col is None:
                try:
                    detected_lat, detected_lon = detect_coordinate_columns(df)
                    lat_col = lat_col or detected_lat
                    lon_col = lon_col or detected_lon
                except ValueError as e:
                    raise ValueError(
                        f"Could not automatically detect coordinate columns and no "
                        f"'geometry' column found. Error: {str(e)}"
                    )

            # Validate latitude/longitude columns exist
            if lat_col not in df.columns or lon_col not in df.columns:
                raise ValueError(
                    f"Could not find columns: {lat_col} and/or {lon_col} in the DataFrame"
                )

            # Check for missing values
            if df[lat_col].isna().any() or df[lon_col].isna().any():
                raise ValueError(
                    f"Missing values found in {lat_col} and/or {lon_col} columns"
                )

            # Create geometry from lat/lon
            geometry = gpd.points_from_xy(x=df[lon_col], y=df[lat_col])

        else:
            # Check if geometry column already contains valid geometries
            if df["geometry"].apply(lambda x: isinstance(x, base.BaseGeometry)).all():
                geometry = df["geometry"]
            elif df["geometry"].apply(lambda x: isinstance(x, str)).all():
                # Convert WKT strings to geometry objects
                geometry = df["geometry"].apply(wkt.loads)
            else:
                raise ValueError(
                    "Invalid geometry format: contains mixed or unsupported types"
                )

            # Drop the WKT column if conversion was done, so the constructor
            # receives geometries only through the `geometry` argument
            if (
                "geometry" in df.columns
                and not df["geometry"]
                .apply(lambda x: isinstance(x, base.BaseGeometry))
                .all()
            ):
                df = df.drop(columns=["geometry"])

        return gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

    except Exception as e:
        raise RuntimeError(f"Error converting to GeoDataFrame: {str(e)}")


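# Example (illustrative only): build points from lat/lon columns, or parse a
# WKT "geometry" column; both paths return an EPSG:4326 GeoDataFrame by default.
#
#   gdf = convert_to_geodataframe(pd.DataFrame({"lat": [5.6], "lon": [-0.2]}))
#   gdf_wkt = convert_to_geodataframe(pd.DataFrame({"geometry": ["POINT (-0.2 5.6)"]}))

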
def buffer_geodataframe(
    gdf: gpd.GeoDataFrame,
    buffer_distance_meters: float,
    cap_style: Literal["round", "square", "flat"] = "round",
    copy=True,
) -> gpd.GeoDataFrame:
    """
    Buffer a GeoDataFrame by a given distance in meters.

    Parameters:
    - gdf : geopandas.GeoDataFrame
        The GeoDataFrame to be buffered.
    - buffer_distance_meters : float
        The buffer distance in meters.
    - cap_style : str, optional
        The style of caps: "round", "flat" or "square". Default is "round".
    - copy : bool, optional
        Whether to operate on a copy of the input. Default is True.

    Returns:
    - geopandas.GeoDataFrame
        The buffered GeoDataFrame.
    """

    # Input validation
    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("Input must be a GeoDataFrame")

    if not isinstance(buffer_distance_meters, (float, int)):
        raise TypeError("Buffer distance must be a number")

    if cap_style not in ["round", "square", "flat"]:
        raise ValueError("cap_style must be round, flat or square.")

    if gdf.crs is None:
        raise ValueError("Input GeoDataFrame must have a defined CRS")

    # Create a copy if requested
    gdf_work = gdf.copy() if copy else gdf

    # Store input CRS
    input_crs = gdf_work.crs

    try:
        # Estimate a UTM CRS appropriate for the data's extent
        utm_crs = gdf_work.estimate_utm_crs()

        # Transform to UTM, create buffer, and transform back
        gdf_work = gdf_work.to_crs(utm_crs)
        gdf_work["geometry"] = gdf_work["geometry"].buffer(
            buffer_distance_meters, cap_style=cap_style
        )
        gdf_work = gdf_work.to_crs(input_crs)

        return gdf_work

    except Exception as e:
        raise RuntimeError(f"Error during buffering operation: {str(e)}")


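# Example (illustrative only; `schools_gdf` is a hypothetical point layer in
# EPSG:4326): a 100 m round buffer, computed in an estimated UTM CRS and
# returned in the input CRS.
#
#   buffered = buffer_geodataframe(schools_gdf, buffer_distance_meters=100)

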
def add_spatial_jitter(
    df: pd.DataFrame,
    columns: List[str] = ["latitude", "longitude"],
    amount: float = 0.0001,
    seed=None,
    copy=True,
) -> pd.DataFrame:
    """
    Add random jitter to duplicated geographic coordinates to create slight separation
    between overlapping points.

    Parameters:
    ----------
    df : pandas.DataFrame
        DataFrame containing geographic coordinates.
    columns : list of str, optional
        Column names containing coordinates to jitter. Default is ['latitude', 'longitude'].
    amount : float or dict, optional
        Amount of jitter to add. If float, same amount used for all columns.
        If dict, specify amount per column, e.g., {'lat': 0.0001, 'lon': 0.0002}.
        Default is 0.0001 (approximately 11 meters at the equator).
    seed : int, optional
        Random seed for reproducibility. Default is None.
    copy : bool, optional
        Whether to create a copy of the input DataFrame. Default is True.

    Returns:
    -------
    pandas.DataFrame
        DataFrame with jittered coordinates for previously duplicated points.

    Raises:
    ------
    ValueError
        If columns don't exist or jitter amount is invalid.
    TypeError
        If input types are incorrect.
    """

    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not all(col in df.columns for col in columns):
        raise ValueError(f"Not all columns {columns} found in DataFrame")

    # Handle jitter amounts
    if isinstance(amount, (int, float)):
        if amount <= 0:
            raise ValueError("Jitter amount must be positive")
        jitter_amounts = {col: amount for col in columns}
    elif isinstance(amount, dict):
        if not all(col in amount for col in columns):
            raise ValueError("Must specify jitter amount for each column")
        if not all(amt > 0 for amt in amount.values()):
            raise ValueError("All jitter amounts must be positive")
        jitter_amounts = amount
    else:
        raise TypeError("amount must be a number or dictionary")

    # Create copy if requested
    df_work = df.copy() if copy else df

    # Set random seed if provided
    if seed is not None:
        np.random.seed(seed)

    try:
        # Find duplicated coordinates
        duplicate_mask = df_work.duplicated(subset=columns, keep=False)
        n_duplicates = duplicate_mask.sum()

        if n_duplicates > 0:
            # Add jitter to each column separately
            for col in columns:
                jitter = np.random.uniform(
                    low=-jitter_amounts[col],
                    high=jitter_amounts[col],
                    size=n_duplicates,
                )
                df_work.loc[duplicate_mask, col] += jitter

            # Validate results (ensure no remaining duplicates)
            if df_work.duplicated(subset=columns, keep=False).any():
                # If duplicates remain, recursively add more jitter
                df_work = add_spatial_jitter(
                    df_work,
                    columns=columns,
                    amount={col: amt * 2 for col, amt in jitter_amounts.items()},
                    seed=seed,
                    copy=False,
                )

        return df_work

    except Exception as e:
        raise RuntimeError(f"Error during jittering operation: {str(e)}")


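# Example (illustrative only): two rows share identical coordinates, so both
# receive a small random offset and no longer overlap exactly.
#
#   df = pd.DataFrame({"latitude": [5.6, 5.6], "longitude": [-0.2, -0.2]})
#   jittered = add_spatial_jitter(df, seed=42)

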
def get_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Calculate the centroids of a (Multi)Polygon GeoDataFrame.

    Parameters:
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing (Multi)Polygon geometries.

    Returns:
    -------
    geopandas.GeoDataFrame
        A new GeoDataFrame with Point geometries representing the centroids.

    Raises:
    ------
    ValueError
        If the input GeoDataFrame does not contain (Multi)Polygon geometries.
    """
    # Validate input geometries
    if not all(gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Calculate centroids
    centroids = gdf.copy()
    centroids["geometry"] = centroids.geometry.centroid

    return centroids


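# Example (illustrative only; `zones_gdf` is a hypothetical polygon layer).
# Note that geopandas warns when centroids are computed in a geographic CRS.
#
#   zone_centroids = get_centroids(zones_gdf)

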
def add_area_in_meters(
    gdf: gpd.GeoDataFrame, area_column_name: str = "area_in_meters"
) -> gpd.GeoDataFrame:
    """
    Calculate the area of (Multi)Polygon geometries in square meters and add it as a new column.

    Parameters:
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing (Multi)Polygon geometries.
    area_column_name : str, optional
        Name of the new column to store the area values. Default is "area_in_meters".

    Returns:
    -------
    geopandas.GeoDataFrame
        The input GeoDataFrame with an additional column for the area in square meters.

    Raises:
    ------
    ValueError
        If the input GeoDataFrame does not contain (Multi)Polygon geometries.
    """
    # Validate input geometries
    if not all(gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Create a copy of the GeoDataFrame to avoid modifying the original
    gdf_with_area = gdf.copy()

    # Estimate the UTM CRS for accurate area calculation
    utm_crs = gdf_with_area.estimate_utm_crs()

    # Transform to UTM CRS and calculate the area in square meters
    gdf_with_area[area_column_name] = gdf_with_area.to_crs(utm_crs).geometry.area

    return gdf_with_area


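# Example (illustrative only; `buildings_gdf` is a hypothetical footprint
# layer): adds an "area_in_meters" column computed in an estimated UTM CRS.
#
#   buildings = add_area_in_meters(buildings_gdf)

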
def simplify_geometries(
    gdf: gpd.GeoDataFrame,
    tolerance: float = 0.01,
    preserve_topology: bool = True,
    geometry_column: str = "geometry",
) -> gpd.GeoDataFrame:
    """
    Simplify geometries in a GeoDataFrame to reduce file size and improve visualization performance.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geometries to simplify.
    tolerance : float, optional
        Tolerance for simplification. Larger values simplify more but reduce detail (default is 0.01).
    preserve_topology : bool, optional
        Whether to preserve topology while simplifying. Preserving topology prevents invalid geometries (default is True).
    geometry_column : str, optional
        Name of the column containing geometries (default is "geometry").

    Returns
    -------
    geopandas.GeoDataFrame
        A new GeoDataFrame with simplified geometries.

    Raises
    ------
    ValueError
        If the specified geometry column does not exist.
    TypeError
        If the geometry column contains invalid geometries.

    Examples
    --------
    Simplify geometries in a GeoDataFrame:
    >>> simplified_gdf = simplify_geometries(gdf, tolerance=0.05)
    """

    # Check if the specified geometry column exists
    if geometry_column not in gdf.columns:
        raise ValueError(
            f"Geometry column '{geometry_column}' not found in the GeoDataFrame."
        )

    # Check if the specified column contains valid geometries
    if not gpd.GeoSeries(gdf[geometry_column]).is_valid.all():
        raise TypeError(
            f"Geometry column '{geometry_column}' contains invalid geometries."
        )

    # Simplify geometries (non-destructive)
    gdf_simplified = gdf.copy()
    gdf_simplified[geometry_column] = gdf_simplified[geometry_column].simplify(
        tolerance=tolerance, preserve_topology=preserve_topology
    )

    return gdf_simplified


def map_points_within_polygons(base_points_gdf, polygon_gdf):
    """
    Maps whether each point in `base_points_gdf` is within any polygon in `polygon_gdf`.

    Parameters:
    ----------
    base_points_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing point geometries to check.
    polygon_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing polygon geometries.

    Returns:
    -------
    geopandas.GeoDataFrame
        The `base_points_gdf` with an additional column `is_within` (True/False).

    Raises:
    ------
    ValueError
        If the geometries in either GeoDataFrame are invalid or not of the expected type.
    """
    # Validate input GeoDataFrames
    if not all(base_points_gdf.geometry.geom_type == "Point"):
        raise ValueError("`base_points_gdf` must contain only Point geometries.")
    if not all(polygon_gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "`polygon_gdf` must contain only Polygon or MultiPolygon geometries."
        )

    if not base_points_gdf.crs == polygon_gdf.crs:
        raise ValueError("CRS of `base_points_gdf` and `polygon_gdf` must match.")

    # Perform spatial join to check if points fall within any polygon
    joined_gdf = gpd.sjoin(
        base_points_gdf, polygon_gdf[["geometry"]], how="left", predicate="within"
    )

    # Add `is_within` column to base_points_gdf (note: modifies the input in place)
    base_points_gdf["is_within"] = base_points_gdf.index.isin(
        set(joined_gdf.index[~joined_gdf.index_right.isna()])
    )

    return base_points_gdf


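# Example (illustrative only; both layers hypothetical and in the same CRS):
#
#   flagged = map_points_within_polygons(points_gdf, admin_polygons_gdf)
#   flagged["is_within"].value_counts()

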
def calculate_distance(lat1, lon1, lat2, lon2, R=6371e3):
    """Great-circle (haversine) distance between two points, in the units of R.

    Accepts scalars or NumPy arrays; R defaults to the Earth's mean radius in
    meters, so the result is in meters by default.
    """
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c
    return distance


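# Example (illustrative only): one degree of latitude along a meridian is
# about 111.2 km with the default Earth radius (6371e3 * pi / 180).
#
#   calculate_distance(0.0, 0.0, 1.0, 0.0)  # ~111195 meters

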
def aggregate_points_to_zones(
    points: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Optional[Union[str, List[str]]] = None,
    aggregation: Union[str, Dict[str, str]] = "count",
    point_zone_predicate: str = "within",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate point data to zones with flexible aggregation methods.

    Args:
        points (Union[pd.DataFrame, gpd.GeoDataFrame]): Point data to aggregate
        zones (gpd.GeoDataFrame): Zones to aggregate points to
        value_columns (Optional[Union[str, List[str]]]): Column(s) containing values to aggregate.
            If None, only counts will be performed.
        aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
            - Single string: Use same method for all columns ("count", "mean", "sum", "min", "max")
            - Dict: Map column names to aggregation methods
        point_zone_predicate (str): Spatial predicate for point-to-zone relationship.
            Options: "within", "intersects", "contains"
        zone_id_column (str): Column in zones containing zone identifiers
        output_suffix (str): Suffix to add to output column names
        drop_geometry (bool): Whether to drop the geometry column from output

    Returns:
        gpd.GeoDataFrame: Zones with aggregated point values

    Example:
        >>> poi_counts = aggregate_points_to_zones(pois, zones, aggregation="count")
        >>> poi_value_mean = aggregate_points_to_zones(
        ...     pois, zones, value_columns="score", aggregation="mean"
        ... )
        >>> poi_multiple = aggregate_points_to_zones(
        ...     pois, zones,
        ...     value_columns=["score", "visits"],
        ...     aggregation={"score": "mean", "visits": "sum"}
        ... )
    """
    # Input validation
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # Convert points to GeoDataFrame if necessary
    if not isinstance(points, gpd.GeoDataFrame):
        points_gdf = convert_to_geodataframe(points)
    else:
        points_gdf = points.copy()

    # Ensure CRS match
    if points_gdf.crs != zones.crs:
        points_gdf = points_gdf.to_crs(zones.crs)

    # Handle value columns
    if value_columns is not None:
        if isinstance(value_columns, str):
            value_columns = [value_columns]

        # Validate that all value columns exist
        missing_cols = [col for col in value_columns if col not in points_gdf.columns]
        if missing_cols:
            raise ValueError(f"Value columns not found in points data: {missing_cols}")

    # Handle aggregation method
    agg_funcs = {}

    if isinstance(aggregation, str):
        if aggregation == "count":
            # Special case for count (doesn't need value columns)
            agg_funcs["__count"] = "count"
        elif value_columns is not None:
            # Apply the same aggregation to all value columns
            agg_funcs = {col: aggregation for col in value_columns}
        else:
            raise ValueError(
                "Value columns must be specified for aggregation methods other than 'count'"
            )
    elif isinstance(aggregation, dict):
        # Validate dictionary keys
        if value_columns is None:
            raise ValueError(
                "Value columns must be specified when using a dictionary of aggregation methods"
            )

        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]

        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for non-existent columns: {extra_aggs}"
            )

        agg_funcs = aggregation
    else:
        raise TypeError("aggregation must be a string or dictionary")

    # Create a copy of the zones
    result = zones.copy()

    # Spatial join
    joined = gpd.sjoin(points_gdf, zones, how="inner", predicate=point_zone_predicate)

    # Perform aggregation
    if "geometry" in joined.columns and not all(
        value == "count" for value in agg_funcs.values()
    ):
        # Drop geometry for non-count aggregations to avoid errors
        joined = joined.drop(columns=["geometry"])

    if "__count" in agg_funcs:
        # Count points per zone
        counts = (
            joined.groupby(zone_id_column)
            .size()
            .reset_index(name=f"point_count{output_suffix}")
        )
        result = result.merge(counts, on=zone_id_column, how="left")
        result[f"point_count{output_suffix}"] = (
            result[f"point_count{output_suffix}"].fillna(0).astype(int)
        )
    else:
        # Aggregate values
        aggregated = joined.groupby(zone_id_column).agg(agg_funcs).reset_index()

        # Rename columns to include aggregation method
        if len(value_columns) > 0:
            # Handle MultiIndex columns from pandas aggregation
            if isinstance(aggregated.columns, pd.MultiIndex):
                aggregated.columns = [
                    (
                        f"{col[0]}_{col[1]}{output_suffix}"
                        if col[0] != zone_id_column
                        else zone_id_column
                    )
                    for col in aggregated.columns
                ]

        # Merge back to zones
        result = result.merge(aggregated, on=zone_id_column, how="left")

        # Fill NaN values with zeros
        for col in result.columns:
            if (
                col != zone_id_column
                and col != "geometry"
                and pd.api.types.is_numeric_dtype(result[col])
            ):
                result[col] = result[col].fillna(0)

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result


def annotate_with_admin_regions(
    gdf: gpd.GeoDataFrame,
    country_code: str,
    data_store: Optional[DataStore] = None,
    admin_id_column_suffix="_giga",
) -> gpd.GeoDataFrame:
    """
    Annotate a GeoDataFrame with administrative region information.

    Performs a spatial join between the input points and administrative boundaries
    at levels 1 and 2, resolving conflicts when points intersect multiple admin regions.

    Args:
        gdf: GeoDataFrame containing points to annotate
        country_code: Country code for administrative boundaries
        data_store: Optional DataStore for loading admin boundary data
        admin_id_column_suffix: Suffix appended to the admin ID column names

    Returns:
        GeoDataFrame with added administrative region columns
    """
    from gigaspatial.handlers.boundaries import AdminBoundaries

    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("gdf must be a GeoDataFrame")

    if gdf.empty:
        LOGGER.warning("Empty GeoDataFrame provided, returning as-is")
        return gdf

    # Read country admin data
    admin1_data = AdminBoundaries.create(
        country_code=country_code, admin_level=1, data_store=data_store
    ).to_geodataframe()

    admin1_data.rename(
        columns={"id": f"admin1_id{admin_id_column_suffix}", "name": "admin1"},
        inplace=True,
    )
    admin1_data.drop(columns=["name_en", "parent_id", "country_code"], inplace=True)

    admin2_data = AdminBoundaries.create(
        country_code=country_code, admin_level=2, data_store=data_store
    ).to_geodataframe()

    admin2_data.rename(
        columns={
            "id": f"admin2_id{admin_id_column_suffix}",
            "parent_id": f"admin1_id{admin_id_column_suffix}",
            "name": "admin2",
        },
        inplace=True,
    )
    admin2_data.drop(columns=["name_en", "country_code"], inplace=True)

    # Join the two levels on the admin1 ID column
    admin_data = admin2_data.merge(
        admin1_data[[f"admin1_id{admin_id_column_suffix}", "admin1", "geometry"]],
        left_on=f"admin1_id{admin_id_column_suffix}",
        right_on=f"admin1_id{admin_id_column_suffix}",
        how="outer",
    )

    # Prefer the admin2 geometry; fall back to the admin1 geometry where the
    # outer merge produced no admin2 match (the merge fills misses with NaN,
    # which is truthy, so a plain truthiness test would never fall back)
    admin_data["geometry"] = admin_data.apply(
        lambda x: (
            x.geometry_x if isinstance(x.geometry_x, base.BaseGeometry) else x.geometry_y
        ),
        axis=1,
    )

    admin_data = gpd.GeoDataFrame(
        admin_data.drop(columns=["geometry_x", "geometry_y"]),
        geometry="geometry",
        crs=4326,
    )

    admin_data["admin2"].fillna("Unknown", inplace=True)
    admin_data[f"admin2_id{admin_id_column_suffix}"] = admin_data[
        f"admin2_id{admin_id_column_suffix}"
    ].replace({np.nan: None})

    if gdf.crs is None:
        LOGGER.warning("Input GeoDataFrame has no CRS, assuming EPSG:4326")
        gdf.set_crs(epsg=4326, inplace=True)
    elif gdf.crs != "EPSG:4326":
        LOGGER.info(f"Reprojecting from {gdf.crs} to EPSG:4326")
        gdf = gdf.to_crs(epsg=4326)

    # Spatial join gdf to admins
    gdf_w_admins = gdf.copy().sjoin(
        admin_data,
        how="left",
        predicate="intersects",
    )

    # Check for duplicates caused by points intersecting multiple polygons
    if len(gdf_w_admins) != len(gdf):
        LOGGER.warning(
            "Some points intersect multiple administrative boundaries. Resolving conflicts..."
        )

        # Measure the distance from each point to each candidate polygon's centroid
        gdf_w_admins["distance"] = gdf_w_admins.apply(
            lambda row: row.geometry.distance(
                admin_data.loc[row.index_right, "geometry"].centroid
            ),
            axis=1,
        )

        # For points with multiple matches, keep the closest polygon. Group on
        # the left index preserved (and duplicated) by the spatial join.
        gdf_w_admins = gdf_w_admins.loc[
            gdf_w_admins.groupby(level=0)["distance"].idxmin()
        ].drop(columns="distance")

    # Drop unnecessary columns and reset the index
    gdf_w_admins = gdf_w_admins.drop(columns="index_right").reset_index(drop=True)

    return gdf_w_admins


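# Example (illustrative only; requires admin boundary data reachable through
# the configured data store, and an ISO3-style country code):
#
#   annotated = annotate_with_admin_regions(points_gdf, country_code="GHA")
#   annotated[["admin1", "admin2"]].head()

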
def aggregate_polygons_to_zones(
    polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Union[str, List[str]],
    aggregation: Union[str, Dict[str, str]] = "sum",
    area_weighted: bool = True,
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate polygon data to zones with area-weighted values.

    This function maps polygon data to zones, weighting values by the
    fractional area of overlap between polygons and zones.

    Args:
        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): Polygon data to aggregate
        zones (gpd.GeoDataFrame): Zones to aggregate polygons to
        value_columns (Union[str, List[str]]): Column(s) containing values to aggregate
        aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
            - Single string: Use same method for all columns ("sum", "mean", "max", etc.)
            - Dict: Map column names to aggregation methods
        area_weighted (bool): Whether to weight values by fractional area overlap.
            If False, values are not weighted before aggregation
        zone_id_column (str): Column in zones containing zone identifiers
        output_suffix (str): Suffix to add to output column names
        drop_geometry (bool): Whether to drop the geometry column from output

    Returns:
        gpd.GeoDataFrame: Zones with aggregated polygon values

    Example:
        >>> landuse_stats = aggregate_polygons_to_zones(
        ...     landuse_polygons,
        ...     grid_zones,
        ...     value_columns=["area", "population"],
        ...     aggregation="sum"
        ... )
    """
    # Input validation
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # Convert polygons to GeoDataFrame if necessary
    if not isinstance(polygons, gpd.GeoDataFrame):
        try:
            polygons_gdf = convert_to_geodataframe(polygons)
        except Exception:
            raise TypeError("polygons must be a GeoDataFrame or convertible to one")
    else:
        polygons_gdf = polygons.copy()

    # Validate geometry types
    non_polygon_geoms = [
        geom_type
        for geom_type in polygons_gdf.geometry.geom_type.unique()
        if geom_type not in ["Polygon", "MultiPolygon"]
    ]
    if non_polygon_geoms:
        raise ValueError(
            f"Input contains non-polygon geometries: {non_polygon_geoms}. "
            "Use aggregate_points_to_zones for point data."
        )

    # Process value columns
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    # Validate that all value columns exist
    missing_cols = [col for col in value_columns if col not in polygons_gdf.columns]
    if missing_cols:
        raise ValueError(f"Value columns not found in polygons data: {missing_cols}")

    # Ensure CRS match
    if polygons_gdf.crs != zones.crs:
        polygons_gdf = polygons_gdf.to_crs(zones.crs)

    # Handle aggregation method
    if isinstance(aggregation, str):
        agg_funcs = {col: aggregation for col in value_columns}
    elif isinstance(aggregation, dict):
        # Validate dictionary keys
        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]

        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for non-existent columns: {extra_aggs}"
            )

        agg_funcs = aggregation
    else:
        raise TypeError("aggregation must be a string or dictionary")

    # Create a copy of the zones
    result = zones.copy()

    if area_weighted:
        # Use area-weighted aggregation with polygon overlay
        try:
            # Compute UTM CRS for accurate area calculations
            overlay_utm_crs = polygons_gdf.estimate_utm_crs()

            # Prepare polygons for overlay
            polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
            polygons_utm["orig_area"] = polygons_utm.area

            # Keep only necessary columns
            overlay_cols = value_columns + ["geometry", "orig_area"]
            overlay_gdf = polygons_utm[overlay_cols].copy()

            # Prepare zones for overlay
            zones_utm = zones.to_crs(overlay_utm_crs)

            # Perform the spatial overlay
            gdf_overlayed = gpd.overlay(
                overlay_gdf, zones_utm[[zone_id_column, "geometry"]], how="intersection"
            )

            # Calculate fractional areas
            gdf_overlayed["intersection_area"] = gdf_overlayed.area
            gdf_overlayed["area_fraction"] = (
                gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
            )

            # Apply area weighting to value columns
            for col in value_columns:
                gdf_overlayed[col] = gdf_overlayed[col] * gdf_overlayed["area_fraction"]

            # Aggregate by zone ID
            aggregated = gdf_overlayed.groupby(zone_id_column)[value_columns].agg(
                agg_funcs
            )

            # Handle column naming for multi-level index
            if isinstance(aggregated.columns, pd.MultiIndex):
                aggregated.columns = [
                    f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
                ]

            # Reset index
            aggregated = aggregated.reset_index()

            # Merge aggregated values back to the zones
            result = result.merge(aggregated, on=zone_id_column, how="left")

            # Fill NaN values with zeros
            for col in result.columns:
                if (
                    col != zone_id_column
                    and col != "geometry"
                    and pd.api.types.is_numeric_dtype(result[col])
                ):
                    result[col] = result[col].fillna(0)

        except Exception as e:
            raise RuntimeError(f"Error during area-weighted aggregation: {e}")

    else:
        # Non-weighted aggregation - simpler approach
        # Perform spatial join
        joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate="intersects")

        # Remove geometry column for aggregation
        if "geometry" in joined.columns:
            joined = joined.drop(columns=["geometry"])

        # Group by zone ID and aggregate
        aggregated = joined.groupby(zone_id_column)[value_columns].agg(agg_funcs)

        # Handle column naming for multi-level index
        if isinstance(aggregated.columns, pd.MultiIndex):
            aggregated.columns = [
                f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
            ]

        # Reset index and merge back to zones
        aggregated = aggregated.reset_index()
        result = result.merge(aggregated, on=zone_id_column, how="left")

        # Fill NaN values with zeros
        for col in result.columns:
            if (
                col != zone_id_column
                and col != "geometry"
                and pd.api.types.is_numeric_dtype(result[col])
            ):
                result[col] = result[col].fillna(0)

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result