giga-spatial 0.6.0 (giga_spatial-0.6.0-py3-none-any.whl)

Files changed (47)
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
gigaspatial/processing/geo.py
@@ -0,0 +1,1054 @@
import re
from typing import Dict, List, Literal, Optional, Tuple, Union

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import base

from gigaspatial.core.io.data_store import DataStore
from gigaspatial.config import config

LOGGER = config.get_logger("GigaSpatialProcessing")

def detect_coordinate_columns(
    data: pd.DataFrame,
    lat_keywords: Optional[List[str]] = None,
    lon_keywords: Optional[List[str]] = None,
    case_sensitive: bool = False,
) -> Tuple[str, str]:
    """
    Detect latitude and longitude columns in a DataFrame using keyword matching.

    Parameters
    ----------
    data : pandas.DataFrame
        DataFrame to search for coordinate columns.
    lat_keywords : list of str, optional
        Keywords for identifying latitude columns. If None, uses default keywords.
    lon_keywords : list of str, optional
        Keywords for identifying longitude columns. If None, uses default keywords.
    case_sensitive : bool, optional
        Whether to perform case-sensitive matching. Default is False.

    Returns
    -------
    tuple[str, str]
        Names of the detected (latitude, longitude) columns.

    Raises
    ------
    ValueError
        If no unique pair of latitude/longitude columns can be found.
    TypeError
        If input data is not a pandas DataFrame.
    """

    # Default keywords if none provided
    default_lat = [
        "latitude",
        "lat",
        "y",
        "lat_",
        "lat(s)",
        "_lat",
        "ylat",
        "latitude_y",
    ]
    default_lon = [
        "longitude",
        "lon",
        "long",
        "x",
        "lon_",
        "lon(e)",
        "long(e)",
        "_lon",
        "xlon",
        "longitude_x",
    ]

    lat_keywords = lat_keywords or default_lat
    lon_keywords = lon_keywords or default_lon

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not data.columns.is_unique:
        raise ValueError("DataFrame contains duplicate column names")

    def create_pattern(keywords):
        """Create a regex pattern that matches any keyword as a whole word."""
        return "|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords)

    def find_matching_columns(columns, pattern, case_sensitive) -> List:
        """Find columns matching the pattern."""
        flags = 0 if case_sensitive else re.IGNORECASE
        return [col for col in columns if re.search(pattern, col, flags=flags)]

    try:
        # Create patterns
        lat_pattern = create_pattern(lat_keywords)
        lon_pattern = create_pattern(lon_keywords)

        # Find matching columns
        lat_cols = find_matching_columns(data.columns, lat_pattern, case_sensitive)
        lon_cols = find_matching_columns(data.columns, lon_pattern, case_sensitive)

        # Drop columns that match both patterns; filtering against the original
        # match lists keeps the exclusion symmetric for both sides
        ambiguous = set(lat_cols) & set(lon_cols)
        lat_cols = [col for col in lat_cols if col not in ambiguous]
        lon_cols = [col for col in lon_cols if col not in ambiguous]

        # Detailed error messages based on what was found
        if not lat_cols and not lon_cols:
            columns_list = "\n".join(f"- {col}" for col in data.columns)
            raise ValueError(
                f"No latitude or longitude columns found. Available columns are:\n{columns_list}\n"
                f"Consider adding more keywords or checking column names."
            )

        if not lat_cols:
            found_lons = ", ".join(lon_cols)
            raise ValueError(
                f"Found longitude columns ({found_lons}) but no latitude columns. "
                "Check latitude keywords or column names."
            )

        if not lon_cols:
            found_lats = ", ".join(lat_cols)
            raise ValueError(
                f"Found latitude columns ({found_lats}) but no longitude columns. "
                "Check longitude keywords or column names."
            )

        if len(lat_cols) > 1 or len(lon_cols) > 1:
            raise ValueError(
                f"Multiple possible coordinate columns found:\n"
                f"Latitude candidates: {lat_cols}\n"
                f"Longitude candidates: {lon_cols}\n"
                "Please specify more precise keywords."
            )

        return lat_cols[0], lon_cols[0]

    except Exception as e:
        if isinstance(e, ValueError):
            raise
        raise RuntimeError(f"Error detecting coordinate columns: {e}") from e
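
# Editor's note: a minimal doctest-style sketch of the keyword detector
# (illustrative only; not part of the released file):
#
#   >>> df = pd.DataFrame({"lat": [10.0], "lon": [20.0], "value": [1]})
#   >>> detect_coordinate_columns(df)
#   ('lat', 'lon')
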
def convert_to_geodataframe(
    data: pd.DataFrame,
    lat_col: Optional[str] = None,
    lon_col: Optional[str] = None,
    crs="EPSG:4326",
) -> gpd.GeoDataFrame:
    """
    Convert a pandas DataFrame to a GeoDataFrame, either from latitude/longitude columns
    or from a WKT geometry column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input DataFrame containing either lat/lon columns or a geometry column.
    lat_col : str, optional
        Name of the latitude column. If None, it is detected automatically.
    lon_col : str, optional
        Name of the longitude column. If None, it is detected automatically.
    crs : str or pyproj.CRS, optional
        Coordinate Reference System of the geometry data. Default is 'EPSG:4326'.

    Returns
    -------
    geopandas.GeoDataFrame
        A GeoDataFrame containing the input data with a geometry column.

    Raises
    ------
    TypeError
        If input is not a pandas DataFrame.
    ValueError
        If required columns are missing or contain invalid data.
    """

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input 'data' must be a pandas DataFrame")

    # Create a copy to avoid modifying the input
    df = data.copy()

    try:
        if "geometry" not in df.columns:
            # If column names not provided, try to detect them
            if lat_col is None or lon_col is None:
                try:
                    detected_lat, detected_lon = detect_coordinate_columns(df)
                    lat_col = lat_col or detected_lat
                    lon_col = lon_col or detected_lon
                except ValueError as e:
                    raise ValueError(
                        f"Could not automatically detect coordinate columns and no "
                        f"'geometry' column found. Error: {e}"
                    )

            # Validate latitude/longitude columns exist
            if lat_col not in df.columns or lon_col not in df.columns:
                raise ValueError(
                    f"Could not find columns: {lat_col} and/or {lon_col} in the DataFrame"
                )

            # Check for missing values
            if df[lat_col].isna().any() or df[lon_col].isna().any():
                raise ValueError(
                    f"Missing values found in {lat_col} and/or {lon_col} columns"
                )

            # Create geometry from lat/lon
            geometry = gpd.points_from_xy(x=df[lon_col], y=df[lat_col])

        else:
            # Check if geometry column already contains valid geometries
            if df["geometry"].apply(lambda x: isinstance(x, base.BaseGeometry)).all():
                geometry = df["geometry"]
            elif df["geometry"].apply(lambda x: isinstance(x, str)).all():
                # Convert WKT strings to geometry objects and drop the WKT column
                geometry = df["geometry"].apply(wkt.loads)
                df = df.drop(columns=["geometry"])
            else:
                raise ValueError(
                    "Invalid geometry format: contains mixed or unsupported types"
                )

        return gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

    except ValueError:
        # Preserve the specific errors raised above instead of wrapping them
        raise
    except Exception as e:
        raise RuntimeError(f"Error converting to GeoDataFrame: {e}") from e
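
# Editor's note: convert_to_geodataframe accepts either coordinate columns or
# a WKT "geometry" column; a hedged sketch of both paths (not part of the file):
#
#   >>> convert_to_geodataframe(
#   ...     pd.DataFrame({"latitude": [0.5], "longitude": [30.2]})
#   ... ).geometry.iloc[0].wkt
#   'POINT (30.2 0.5)'
#   >>> gdf = convert_to_geodataframe(pd.DataFrame({"geometry": ["POINT (30.2 0.5)"]}))
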
def buffer_geodataframe(
    gdf: gpd.GeoDataFrame,
    buffer_distance_meters: float,
    cap_style: Literal["round", "square", "flat"] = "round",
    copy=True,
) -> gpd.GeoDataFrame:
    """
    Buffer a GeoDataFrame by a given distance in meters.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        The GeoDataFrame to be buffered.
    buffer_distance_meters : float
        The buffer distance in meters.
    cap_style : str, optional
        The style of the buffer caps: "round", "flat" or "square". Default is "round".
    copy : bool, optional
        Whether to operate on a copy of the input. Default is True.

    Returns
    -------
    geopandas.GeoDataFrame
        The buffered GeoDataFrame, in the input CRS.
    """

    # Input validation
    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("Input must be a GeoDataFrame")

    if not isinstance(buffer_distance_meters, (float, int)):
        raise TypeError("Buffer distance must be a number")

    if cap_style not in ["round", "square", "flat"]:
        raise ValueError("cap_style must be 'round', 'flat' or 'square'")

    if gdf.crs is None:
        raise ValueError("Input GeoDataFrame must have a defined CRS")

    # Create a copy if requested
    gdf_work = gdf.copy() if copy else gdf

    # Store input CRS
    input_crs = gdf_work.crs

    try:
        # Estimate a suitable UTM CRS so the buffer distance is in meters
        utm_crs = gdf_work.estimate_utm_crs()

        # Transform to UTM, create buffer, and transform back
        gdf_work = gdf_work.to_crs(utm_crs)
        gdf_work["geometry"] = gdf_work["geometry"].buffer(
            buffer_distance_meters, cap_style=cap_style
        )
        gdf_work = gdf_work.to_crs(input_crs)

        return gdf_work

    except Exception as e:
        raise RuntimeError(f"Error during buffering operation: {e}") from e
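
# Editor's note: a minimal buffering sketch (illustrative only):
#
#   >>> pts = gpd.GeoDataFrame(
#   ...     {"id": [1]}, geometry=gpd.points_from_xy([30.2], [0.5]), crs="EPSG:4326"
#   ... )
#   >>> buffer_geodataframe(pts, 500).geometry.iloc[0].geom_type
#   'Polygon'
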
def add_spatial_jitter(
    df: pd.DataFrame,
    columns: List[str] = ["latitude", "longitude"],
    amount: Union[float, Dict[str, float]] = 0.0001,
    seed=None,
    copy=True,
) -> pd.DataFrame:
    """
    Add random jitter to duplicated geographic coordinates to create slight separation
    between overlapping points.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing geographic coordinates.
    columns : list of str, optional
        Column names containing coordinates to jitter. Default is ['latitude', 'longitude'].
    amount : float or dict, optional
        Amount of jitter to add. If float, the same amount is used for all columns.
        If dict, specify the amount per column, e.g., {'lat': 0.0001, 'lon': 0.0002}.
        Default is 0.0001 (approximately 11 meters at the equator).
    seed : int, optional
        Random seed for reproducibility. Default is None.
    copy : bool, optional
        Whether to create a copy of the input DataFrame. Default is True.

    Returns
    -------
    pandas.DataFrame
        DataFrame with jittered coordinates for previously duplicated points.

    Raises
    ------
    ValueError
        If columns don't exist or the jitter amount is invalid.
    TypeError
        If input types are incorrect.
    """

    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not all(col in df.columns for col in columns):
        raise ValueError(f"Not all columns {columns} found in DataFrame")

    # Handle jitter amounts
    if isinstance(amount, (int, float)):
        if amount <= 0:
            raise ValueError("Jitter amount must be positive")
        jitter_amounts = {col: amount for col in columns}
    elif isinstance(amount, dict):
        if not all(col in amount for col in columns):
            raise ValueError("Must specify jitter amount for each column")
        if not all(amt > 0 for amt in amount.values()):
            raise ValueError("All jitter amounts must be positive")
        jitter_amounts = amount
    else:
        raise TypeError("amount must be a number or dictionary")

    # Create copy if requested
    df_work = df.copy() if copy else df

    # Set random seed if provided
    if seed is not None:
        np.random.seed(seed)

    try:
        # Find duplicated coordinates
        duplicate_mask = df_work.duplicated(subset=columns, keep=False)
        n_duplicates = duplicate_mask.sum()

        if n_duplicates > 0:
            # Add jitter to each column separately
            for col in columns:
                jitter = np.random.uniform(
                    low=-jitter_amounts[col],
                    high=jitter_amounts[col],
                    size=n_duplicates,
                )
                df_work.loc[duplicate_mask, col] += jitter

            # Validate results (ensure no remaining duplicates)
            if df_work.duplicated(subset=columns, keep=False).any():
                # If duplicates remain, recursively add more jitter with a
                # doubled amount until all points are separated
                df_work = add_spatial_jitter(
                    df_work,
                    columns=columns,
                    amount={col: amt * 2 for col, amt in jitter_amounts.items()},
                    seed=seed,
                    copy=False,
                )

        return df_work

    except Exception as e:
        raise RuntimeError(f"Error during jittering operation: {e}") from e
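
# Editor's note: a hedged jittering sketch; two identical points become
# distinct after the call (illustrative only):
#
#   >>> df = pd.DataFrame({"latitude": [0.5, 0.5], "longitude": [30.2, 30.2]})
#   >>> jittered = add_spatial_jitter(df, amount=0.0001, seed=42)
#   >>> bool(jittered.duplicated(subset=["latitude", "longitude"]).any())
#   False
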
def get_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Calculate the centroids of a (Multi)Polygon GeoDataFrame.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing (Multi)Polygon geometries.

    Returns
    -------
    geopandas.GeoDataFrame
        A new GeoDataFrame with Point geometries representing the centroids.

    Raises
    ------
    ValueError
        If the input GeoDataFrame does not contain (Multi)Polygon geometries.
    """
    # Validate input geometries
    if not all(gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Calculate centroids. Note: centroids are computed in the current CRS;
    # for a geographic CRS this is a planar approximation.
    centroids = gdf.copy()
    centroids["geometry"] = centroids.geometry.centroid

    return centroids
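
# Editor's note: a minimal centroid sketch (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> polys = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1)], crs="EPSG:4326")
#   >>> get_centroids(polys).geometry.iloc[0].wkt
#   'POINT (0.5 0.5)'
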
def add_area_in_meters(
    gdf: gpd.GeoDataFrame, area_column_name: str = "area_in_meters"
) -> gpd.GeoDataFrame:
    """
    Calculate the area of (Multi)Polygon geometries in square meters and add it as a new column.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing (Multi)Polygon geometries.
    area_column_name : str, optional
        Name of the new column to store the area values. Default is "area_in_meters".

    Returns
    -------
    geopandas.GeoDataFrame
        A copy of the input GeoDataFrame with an additional column for the area in square meters.

    Raises
    ------
    ValueError
        If the input GeoDataFrame does not contain (Multi)Polygon geometries.
    """
    # Validate input geometries
    if not all(gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Create a copy of the GeoDataFrame to avoid modifying the original
    gdf_with_area = gdf.copy()

    # Estimate the UTM CRS for accurate area calculation
    utm_crs = gdf_with_area.estimate_utm_crs()

    # Transform to UTM CRS and calculate the area in square meters
    gdf_with_area[area_column_name] = gdf_with_area.to_crs(utm_crs).geometry.area

    return gdf_with_area
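
# Editor's note: an area-calculation sketch; a 0.01-degree square near the
# equator covers roughly 1.2 km^2 (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> polys = gpd.GeoDataFrame(geometry=[box(0, 0, 0.01, 0.01)], crs="EPSG:4326")
#   >>> area_m2 = add_area_in_meters(polys)["area_in_meters"].iloc[0]  # ~1.2e6
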
def simplify_geometries(
    gdf: gpd.GeoDataFrame,
    tolerance: float = 0.01,
    preserve_topology: bool = True,
    geometry_column: str = "geometry",
) -> gpd.GeoDataFrame:
    """
    Simplify geometries in a GeoDataFrame to reduce file size and improve visualization performance.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geometries to simplify.
    tolerance : float, optional
        Tolerance for simplification, in the units of the CRS. Larger values simplify
        more but reduce detail (default is 0.01).
    preserve_topology : bool, optional
        Whether to preserve topology while simplifying. Preserving topology prevents
        invalid geometries (default is True).
    geometry_column : str, optional
        Name of the column containing geometries (default is "geometry").

    Returns
    -------
    geopandas.GeoDataFrame
        A new GeoDataFrame with simplified geometries.

    Raises
    ------
    ValueError
        If the specified geometry column does not exist or contains invalid geometries.

    Examples
    --------
    Simplify geometries in a GeoDataFrame:
    >>> simplified_gdf = simplify_geometries(gdf, tolerance=0.05)
    """

    # Check if the specified geometry column exists
    if geometry_column not in gdf.columns:
        raise ValueError(
            f"Geometry column '{geometry_column}' not found in the GeoDataFrame."
        )

    # Check that the specified column contains only valid geometries
    if not gpd.GeoSeries(gdf[geometry_column]).is_valid.all():
        raise ValueError(
            f"Geometry column '{geometry_column}' contains invalid geometries."
        )

    # Simplify geometries (non-destructive)
    gdf_simplified = gdf.copy()
    gdf_simplified[geometry_column] = gdf_simplified[geometry_column].simplify(
        tolerance=tolerance, preserve_topology=preserve_topology
    )

    return gdf_simplified
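
# Editor's note: a hedged simplification sketch; a buffered box has many
# vertices, and simplification reduces them (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> rounded = gpd.GeoDataFrame(
#   ...     geometry=[box(0, 0, 1, 1).buffer(0.5)], crs="EPSG:4326"
#   ... )
#   >>> simplified = simplify_geometries(rounded, tolerance=0.1)
#   >>> len(simplified.geometry.iloc[0].exterior.coords) < len(
#   ...     rounded.geometry.iloc[0].exterior.coords
#   ... )
#   True
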
def map_points_within_polygons(
    base_points_gdf: gpd.GeoDataFrame, polygon_gdf: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Map whether each point in `base_points_gdf` falls within any polygon in `polygon_gdf`.

    Parameters
    ----------
    base_points_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing point geometries to check.
    polygon_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing polygon geometries.

    Returns
    -------
    geopandas.GeoDataFrame
        A copy of `base_points_gdf` with an additional boolean column `is_within`.

    Raises
    ------
    ValueError
        If the geometries in either GeoDataFrame are invalid or not of the expected type.
    """
    # Validate input GeoDataFrames
    if not all(base_points_gdf.geometry.geom_type == "Point"):
        raise ValueError("`base_points_gdf` must contain only Point geometries.")
    if not all(polygon_gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "`polygon_gdf` must contain only Polygon or MultiPolygon geometries."
        )

    if not base_points_gdf.crs == polygon_gdf.crs:
        raise ValueError("CRS of `base_points_gdf` and `polygon_gdf` must match.")

    # Work on a copy so the caller's GeoDataFrame is not modified in place
    base_points_gdf = base_points_gdf.copy()

    # Perform spatial join to check if points fall within any polygon
    joined_gdf = gpd.sjoin(
        base_points_gdf, polygon_gdf[["geometry"]], how="left", predicate="within"
    )

    # Add `is_within` column: True for points that matched at least one polygon
    base_points_gdf["is_within"] = base_points_gdf.index.isin(
        set(joined_gdf.index[~joined_gdf.index_right.isna()])
    )

    return base_points_gdf
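
# Editor's note: a minimal point-in-polygon sketch (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> pts = gpd.GeoDataFrame(
#   ...     geometry=gpd.points_from_xy([0.5, 2.0], [0.5, 2.0]), crs="EPSG:4326"
#   ... )
#   >>> polys = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1)], crs="EPSG:4326")
#   >>> map_points_within_polygons(pts, polys)["is_within"].tolist()
#   [True, False]
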
def calculate_distance(lat1, lon1, lat2, lon2, R=6371e3):
    """
    Compute the great-circle (haversine) distance between two points, in meters.

    Accepts scalars or equally-shaped NumPy arrays of coordinates in decimal
    degrees. `R` is the sphere radius; the default is the mean Earth radius in meters.
    """
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c
    return distance
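
# Editor's note: one degree of longitude at the equator is about 111 km,
# which the haversine formula above reproduces (illustrative only):
#
#   >>> round(calculate_distance(0.0, 0.0, 0.0, 1.0) / 1000)
#   111
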
def aggregate_points_to_zones(
    points: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Optional[Union[str, List[str]]] = None,
    aggregation: Union[str, Dict[str, str]] = "count",
    point_zone_predicate: str = "within",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate point data to zones with flexible aggregation methods.

    Args:
        points (Union[pd.DataFrame, gpd.GeoDataFrame]): Point data to aggregate
        zones (gpd.GeoDataFrame): Zones to aggregate points to
        value_columns (Optional[Union[str, List[str]]]): Column(s) containing values to aggregate.
            If None, only counts will be performed.
        aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
            - Single string: Use the same method for all columns ("count", "mean", "sum", "min", "max")
            - Dict: Map column names to aggregation methods
        point_zone_predicate (str): Spatial predicate for the point-to-zone relationship.
            Options: "within", "intersects", "contains"
        zone_id_column (str): Column in zones containing zone identifiers
        output_suffix (str): Suffix to add to output column names
        drop_geometry (bool): Whether to drop the geometry column from the output

    Returns:
        gpd.GeoDataFrame: Zones with aggregated point values

    Example:
        >>> poi_counts = aggregate_points_to_zones(pois, zones, aggregation="count")
        >>> poi_value_mean = aggregate_points_to_zones(
        ...     pois, zones, value_columns="score", aggregation="mean"
        ... )
        >>> poi_multiple = aggregate_points_to_zones(
        ...     pois, zones,
        ...     value_columns=["score", "visits"],
        ...     aggregation={"score": "mean", "visits": "sum"}
        ... )
    """
    # Input validation
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # Convert points to GeoDataFrame if necessary
    if not isinstance(points, gpd.GeoDataFrame):
        points_gdf = convert_to_geodataframe(points)
    else:
        points_gdf = points.copy()

    # Ensure CRS match
    if points_gdf.crs != zones.crs:
        points_gdf = points_gdf.to_crs(zones.crs)

    # Handle value columns
    if value_columns is not None:
        if isinstance(value_columns, str):
            value_columns = [value_columns]

        # Validate that all value columns exist
        missing_cols = [col for col in value_columns if col not in points_gdf.columns]
        if missing_cols:
            raise ValueError(f"Value columns not found in points data: {missing_cols}")

    # Handle aggregation method
    agg_funcs = {}

    if isinstance(aggregation, str):
        if aggregation == "count":
            # Special case for count (doesn't need value columns)
            agg_funcs["__count"] = "count"
        elif value_columns is not None:
            # Apply the same aggregation to all value columns
            agg_funcs = {col: aggregation for col in value_columns}
        else:
            raise ValueError(
                "Value columns must be specified for aggregation methods other than 'count'"
            )
    elif isinstance(aggregation, dict):
        # Validate dictionary keys
        if value_columns is None:
            raise ValueError(
                "Value columns must be specified when using a dictionary of aggregation methods"
            )

        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]

        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for non-existent columns: {extra_aggs}"
            )

        agg_funcs = aggregation
    else:
        raise TypeError("aggregation must be a string or dictionary")

    # Create a copy of the zones
    result = zones.copy()

    # Spatial join
    joined = gpd.sjoin(points_gdf, zones, how="inner", predicate=point_zone_predicate)

    # Drop geometry for non-count aggregations to avoid errors
    if "geometry" in joined.columns and not all(
        value == "count" for value in agg_funcs.values()
    ):
        joined = joined.drop(columns=["geometry"])

    if "__count" in agg_funcs:
        # Count points per zone
        counts = (
            joined.groupby(zone_id_column)
            .size()
            .reset_index(name=f"point_count{output_suffix}")
        )
        result = result.merge(counts, on=zone_id_column, how="left")
        result[f"point_count{output_suffix}"] = (
            result[f"point_count{output_suffix}"].fillna(0).astype(int)
        )
    else:
        # Aggregate values
        aggregated = joined.groupby(zone_id_column).agg(agg_funcs).reset_index()

        # Rename value columns to include the aggregation method and suffix
        aggregated = aggregated.rename(
            columns={
                col: f"{col}_{func}{output_suffix}" for col, func in agg_funcs.items()
            }
        )

        # Merge back to zones
        result = result.merge(aggregated, on=zone_id_column, how="left")

        # Fill NaN values with zeros
        for col in result.columns:
            if (
                col != zone_id_column
                and col != "geometry"
                and pd.api.types.is_numeric_dtype(result[col])
            ):
                result[col] = result[col].fillna(0)

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result
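
# Editor's note: a hedged point-aggregation sketch; output column names follow
# the "<column>_<method><suffix>" convention used above (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> zones = gpd.GeoDataFrame(
#   ...     {"zone_id": ["A"]}, geometry=[box(0, 0, 1, 1)], crs="EPSG:4326"
#   ... )
#   >>> pois = gpd.GeoDataFrame(
#   ...     {"score": [3.0, 5.0]},
#   ...     geometry=gpd.points_from_xy([0.2, 0.8], [0.2, 0.8]),
#   ...     crs="EPSG:4326",
#   ... )
#   >>> aggregate_points_to_zones(pois, zones)["point_count"].tolist()
#   [2]
#   >>> aggregate_points_to_zones(
#   ...     pois, zones, value_columns="score", aggregation="mean"
#   ... )["score_mean"].tolist()
#   [4.0]
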
def annotate_with_admin_regions(
    gdf: gpd.GeoDataFrame,
    country_code: str,
    data_store: Optional[DataStore] = None,
    admin_id_column_suffix="_giga",
) -> gpd.GeoDataFrame:
    """
    Annotate a GeoDataFrame with administrative region information.

    Performs a spatial join between the input points and administrative boundaries
    at levels 1 and 2, resolving conflicts when points intersect multiple admin regions.

    Args:
        gdf: GeoDataFrame containing points to annotate
        country_code: Country code for administrative boundaries
        data_store: Optional DataStore for loading admin boundary data
        admin_id_column_suffix: Suffix appended to the admin ID column names

    Returns:
        GeoDataFrame with added administrative region columns
    """
    from gigaspatial.handlers.boundaries import AdminBoundaries

    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("gdf must be a GeoDataFrame")

    if gdf.empty:
        LOGGER.warning("Empty GeoDataFrame provided, returning as-is")
        return gdf

    # Read country admin data
    admin1_data = AdminBoundaries.create(
        country_code=country_code, admin_level=1, data_store=data_store
    ).to_geodataframe()

    admin1_data.rename(
        columns={"id": f"admin1_id{admin_id_column_suffix}", "name": "admin1"},
        inplace=True,
    )
    admin1_data.drop(columns=["name_en", "parent_id", "country_code"], inplace=True)

    admin2_data = AdminBoundaries.create(
        country_code=country_code, admin_level=2, data_store=data_store
    ).to_geodataframe()

    admin2_data.rename(
        columns={
            "id": f"admin2_id{admin_id_column_suffix}",
            "parent_id": f"admin1_id{admin_id_column_suffix}",
            "name": "admin2",
        },
        inplace=True,
    )
    admin2_data.drop(columns=["name_en", "country_code"], inplace=True)

    # Join the two levels on the admin1 ID
    admin_data = admin2_data.merge(
        admin1_data[[f"admin1_id{admin_id_column_suffix}", "admin1", "geometry"]],
        on=f"admin1_id{admin_id_column_suffix}",
        how="outer",
    )

    # Prefer the admin2 geometry; fall back to the admin1 geometry where missing
    admin_data["geometry"] = admin_data.apply(
        lambda x: x.geometry_x if pd.notna(x.geometry_x) else x.geometry_y, axis=1
    )

    admin_data = gpd.GeoDataFrame(
        admin_data.drop(columns=["geometry_x", "geometry_y"]),
        geometry="geometry",
        crs=4326,
    )

    admin_data["admin2"] = admin_data["admin2"].fillna("Unknown")
    admin_data[f"admin2_id{admin_id_column_suffix}"] = admin_data[
        f"admin2_id{admin_id_column_suffix}"
    ].replace({np.nan: None})

    if gdf.crs is None:
        LOGGER.warning("Input GeoDataFrame has no CRS, assuming EPSG:4326")
        gdf = gdf.set_crs(epsg=4326)
    elif gdf.crs != "EPSG:4326":
        LOGGER.info(f"Reprojecting from {gdf.crs} to EPSG:4326")
        gdf = gdf.to_crs(epsg=4326)

    # Spatial join gdf to admins
    gdf_w_admins = gdf.copy().sjoin(
        admin_data,
        how="left",
        predicate="intersects",
    )

    # Check for duplicates caused by points intersecting multiple polygons
    if len(gdf_w_admins) != len(gdf):
        LOGGER.warning(
            "Some points intersect multiple administrative boundaries. Resolving conflicts..."
        )

        # Compute the distance to each matched admin centroid; unmatched points
        # keep an infinite distance so they are never preferred
        gdf_w_admins["distance"] = gdf_w_admins.apply(
            lambda row: (
                row.geometry.distance(
                    admin_data.loc[row.index_right, "geometry"].centroid
                )
                if pd.notna(row.index_right)
                else np.inf
            ),
            axis=1,
        )

        # For points with multiple matches, keep the closest polygon
        gdf_w_admins = gdf_w_admins.sort_values("distance")
        gdf_w_admins = gdf_w_admins[~gdf_w_admins.index.duplicated(keep="first")]
        gdf_w_admins = gdf_w_admins.sort_index().drop(columns="distance")

    # Drop unnecessary columns and reset the index
    gdf_w_admins = gdf_w_admins.drop(columns="index_right").reset_index(drop=True)

    return gdf_w_admins
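
# Editor's note: annotate_with_admin_regions needs access to admin boundary
# data (via the optional DataStore or the default remote source), so it cannot
# be reproduced offline; a hedged call sketch, with "KEN" as a hypothetical
# country code (illustrative only):
#
#   >>> pts = gpd.GeoDataFrame(
#   ...     geometry=gpd.points_from_xy([36.8], [-1.3]), crs="EPSG:4326"
#   ... )
#   >>> annotated = annotate_with_admin_regions(pts, country_code="KEN")
#   >>> # adds admin1/admin2 names plus admin1_id_giga/admin2_id_giga columns
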
def aggregate_polygons_to_zones(
    polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Union[str, List[str]],
    aggregation: Union[str, Dict[str, str]] = "sum",
    area_weighted: bool = True,
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate polygon data to zones with area-weighted values.

    This function maps polygon data to zones, weighting values by the
    fractional area of overlap between polygons and zones.

    Args:
        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): Polygon data to aggregate
        zones (gpd.GeoDataFrame): Zones to aggregate polygons to
        value_columns (Union[str, List[str]]): Column(s) containing values to aggregate
        aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
            - Single string: Use the same method for all columns ("sum", "mean", "max", etc.)
            - Dict: Map column names to aggregation methods
        area_weighted (bool): Whether to weight values by fractional area overlap.
            If False, values are not weighted before aggregation.
        zone_id_column (str): Column in zones containing zone identifiers
        output_suffix (str): Suffix to add to output column names
        drop_geometry (bool): Whether to drop the geometry column from the output

    Returns:
        gpd.GeoDataFrame: Zones with aggregated polygon values

    Example:
        >>> landuse_stats = aggregate_polygons_to_zones(
        ...     landuse_polygons,
        ...     grid_zones,
        ...     value_columns=["area", "population"],
        ...     aggregation="sum"
        ... )
    """
    # Input validation
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # Convert polygons to GeoDataFrame if necessary
    if not isinstance(polygons, gpd.GeoDataFrame):
        try:
            polygons_gdf = convert_to_geodataframe(polygons)
        except Exception as e:
            raise TypeError(
                "polygons must be a GeoDataFrame or convertible to one"
            ) from e
    else:
        polygons_gdf = polygons.copy()

    # Validate geometry types
    non_polygon_geoms = [
        geom_type
        for geom_type in polygons_gdf.geometry.geom_type.unique()
        if geom_type not in ["Polygon", "MultiPolygon"]
    ]
    if non_polygon_geoms:
        raise ValueError(
            f"Input contains non-polygon geometries: {non_polygon_geoms}. "
            "Use aggregate_points_to_zones for point data."
        )

    # Process value columns
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    # Validate that all value columns exist
    missing_cols = [col for col in value_columns if col not in polygons_gdf.columns]
    if missing_cols:
        raise ValueError(f"Value columns not found in polygons data: {missing_cols}")

    # Ensure CRS match
    if polygons_gdf.crs != zones.crs:
        polygons_gdf = polygons_gdf.to_crs(zones.crs)

    # Handle aggregation method
    if isinstance(aggregation, str):
        agg_funcs = {col: aggregation for col in value_columns}
    elif isinstance(aggregation, dict):
        # Validate dictionary keys
        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]

        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for non-existent columns: {extra_aggs}"
            )

        agg_funcs = aggregation
    else:
        raise TypeError("aggregation must be a string or dictionary")

    # Create a copy of the zones
    result = zones.copy()

    if area_weighted:
        # Use area-weighted aggregation with polygon overlay
        try:
            # Compute a UTM CRS for accurate area calculations
            overlay_utm_crs = polygons_gdf.estimate_utm_crs()

            # Prepare polygons for overlay
            polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
            polygons_utm["orig_area"] = polygons_utm.area

            # Keep only necessary columns
            overlay_cols = value_columns + ["geometry", "orig_area"]
            overlay_gdf = polygons_utm[overlay_cols].copy()

            # Prepare zones for overlay
            zones_utm = zones.to_crs(overlay_utm_crs)

            # Perform the spatial overlay
            gdf_overlayed = gpd.overlay(
                overlay_gdf, zones_utm[[zone_id_column, "geometry"]], how="intersection"
            )

            # Calculate fractional areas
            gdf_overlayed["intersection_area"] = gdf_overlayed.area
            gdf_overlayed["area_fraction"] = (
                gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
            )

            # Apply area weighting to value columns
            for col in value_columns:
                gdf_overlayed[col] = gdf_overlayed[col] * gdf_overlayed["area_fraction"]

            # Aggregate by zone ID
            aggregated = gdf_overlayed.groupby(zone_id_column)[value_columns].agg(
                agg_funcs
            )

            # Rename value columns to include the aggregation method and suffix
            aggregated.columns = [
                f"{col}_{agg_funcs[col]}{output_suffix}" for col in aggregated.columns
            ]

            # Reset index and merge aggregated values back to the zones
            aggregated = aggregated.reset_index()
            result = result.merge(aggregated, on=zone_id_column, how="left")

        except Exception as e:
            raise RuntimeError(f"Error during area-weighted aggregation: {e}") from e

    else:
        # Non-weighted aggregation: a simple spatial join on intersection
        joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate="intersects")

        # Remove geometry column for aggregation
        if "geometry" in joined.columns:
            joined = joined.drop(columns=["geometry"])

        # Group by zone ID and aggregate
        aggregated = joined.groupby(zone_id_column)[value_columns].agg(agg_funcs)

        # Rename value columns to include the aggregation method and suffix
        aggregated.columns = [
            f"{col}_{agg_funcs[col]}{output_suffix}" for col in aggregated.columns
        ]

        # Reset index and merge back to zones
        aggregated = aggregated.reset_index()
        result = result.merge(aggregated, on=zone_id_column, how="left")

    # Fill NaN values with zeros for zones with no overlapping polygons
    for col in result.columns:
        if (
            col != zone_id_column
            and col != "geometry"
            and pd.api.types.is_numeric_dtype(result[col])
        ):
            result[col] = result[col].fillna(0)

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result
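
# Editor's note: a hedged area-weighting sketch; the 100-person polygon below
# straddles two equal zones, so each zone receives about half (illustrative only):
#
#   >>> from shapely.geometry import box
#   >>> landuse = gpd.GeoDataFrame(
#   ...     {"population": [100.0]}, geometry=[box(0, 0, 2, 1)], crs="EPSG:4326"
#   ... )
#   >>> zones = gpd.GeoDataFrame(
#   ...     {"zone_id": ["A", "B"]},
#   ...     geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)],
#   ...     crs="EPSG:4326",
#   ... )
#   >>> aggregate_polygons_to_zones(landuse, zones, value_columns="population")[
#   ...     "population_sum"
#   ... ].round(1).tolist()
#   [50.0, 50.0]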