giga-spatial 0.6.4-py3-none-any.whl → 0.6.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/METADATA +3 -1
- giga_spatial-0.6.6.dist-info/RECORD +50 -0
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +29 -4
- gigaspatial/core/io/__init__.py +1 -0
- gigaspatial/core/io/data_api.py +3 -1
- gigaspatial/core/io/database.py +319 -0
- gigaspatial/generators/__init__.py +5 -1
- gigaspatial/generators/poi.py +300 -52
- gigaspatial/generators/zonal/__init__.py +2 -1
- gigaspatial/generators/zonal/admin.py +84 -0
- gigaspatial/generators/zonal/base.py +237 -81
- gigaspatial/generators/zonal/geometry.py +151 -53
- gigaspatial/generators/zonal/mercator.py +50 -19
- gigaspatial/grid/__init__.py +1 -1
- gigaspatial/grid/mercator_tiles.py +33 -10
- gigaspatial/handlers/__init__.py +8 -1
- gigaspatial/handlers/base.py +26 -6
- gigaspatial/handlers/boundaries.py +93 -18
- gigaspatial/handlers/ghsl.py +92 -15
- gigaspatial/handlers/rwi.py +5 -2
- gigaspatial/handlers/worldpop.py +771 -186
- gigaspatial/processing/algorithms.py +188 -0
- gigaspatial/processing/geo.py +204 -102
- gigaspatial/processing/tif_processor.py +220 -45
- giga_spatial-0.6.4.dist-info/RECORD +0 -47
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/top_level.txt +0 -0
gigaspatial/processing/algorithms.py
ADDED
@@ -0,0 +1,188 @@
+import sys, os
+
+import numpy as np
+from typing import Literal, List, Tuple, Union, Optional
+import geopandas as gpd
+import pandas as pd
+from scipy.spatial import cKDTree
+import networkx as nx
+
+from gigaspatial.processing.geo import (
+    convert_to_geodataframe,
+)
+from gigaspatial.config import config
+
+LOGGER = config.get_logger("GigaSpatialProcessing")
+
+
+def build_distance_graph(
+    left_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+    right_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+    distance_threshold: float,
+    max_k: int = 100,
+    return_dataframe: bool = False,
+    verbose: bool = True,
+    exclude_same_index: Optional[bool] = None,
+) -> Union[nx.Graph, Tuple[nx.Graph, pd.DataFrame]]:
+    """
+    Build a graph of spatial matches between two dataframes using KD-tree.
+
+    Args:
+        left_df: Left dataframe to match from
+        right_df: Right dataframe to match to
+        distance_threshold: Maximum distance for matching (in meters)
+        max_k: Maximum number of neighbors to consider per point (default: 100)
+        return_dataframe: If True, also return the matches DataFrame
+        verbose: If True, print statistics about the graph
+        exclude_same_index: If True, exclude self-matches. If None, auto-detect based on df equality
+
+    Returns:
+        NetworkX Graph, or tuple of (Graph, DataFrame) if return_dataframe=True
+
+    Raises:
+        ValueError: If distance_threshold is negative or max_k is not positive
+    """
+
+    # Input validation
+    if distance_threshold < 0:
+        raise ValueError("distance_threshold must be non-negative")
+
+    if max_k <= 0:
+        raise ValueError("max_k must be positive")
+
+    if left_df.empty or right_df.empty:
+        if verbose:
+            LOGGER.warning("Warning: One or both dataframes are empty")
+        G = nx.Graph()
+        return (G, pd.DataFrame()) if return_dataframe else G
+
+    def get_utm_coordinates(df: Union[pd.DataFrame, gpd.GeoDataFrame]) -> np.ndarray:
+        """Extract coordinates as numpy array in UTM projection."""
+        if isinstance(df, pd.DataFrame):
+            gdf = convert_to_geodataframe(df)
+        else:
+            gdf = df.copy()
+
+        # More robust UTM CRS estimation
+        try:
+            gdf_utm = gdf.to_crs(gdf.estimate_utm_crs())
+        except Exception as e:
+            if verbose:
+                LOGGER.warning(
+                    f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+                )
+            gdf_utm = gdf.to_crs("EPSG:3857")  # Fallback to Web Mercator
+
+        return gdf_utm.get_coordinates().to_numpy()
+
+    # Auto-detect same dataframe case
+    if exclude_same_index is None:
+        exclude_same_index = left_df.equals(right_df)
+        if verbose and exclude_same_index:
+            LOGGER.info("Auto-detected same dataframe - excluding self-matches")
+
+    # Get coordinates
+    left_coords = get_utm_coordinates(left_df)
+    right_coords = (
+        get_utm_coordinates(right_df) if not exclude_same_index else left_coords
+    )
+
+    # Build KD-tree and query
+    kdtree = cKDTree(right_coords)
+
+    # Use the provided max_k parameter, but don't exceed available points
+    k_to_use = min(max_k, len(right_coords))
+
+    if verbose and k_to_use < max_k:
+        LOGGER.info(
+            f"Note: max_k ({max_k}) reduced to {k_to_use} (number of available points)"
+        )
+
+    # Note: Distance calculations here are based on Euclidean distance in UTM projection.
+    # This can introduce errors up to ~50 cm for a 50 meter threshold, especially near the poles where distortion increases.
+    distances, indices = kdtree.query(
+        left_coords, k=k_to_use, distance_upper_bound=distance_threshold
+    )
+
+    # Handle single k case (when k_to_use = 1, results are 1D)
+    if distances.ndim == 1:
+        distances = distances.reshape(-1, 1)
+        indices = indices.reshape(-1, 1)
+
+    # Extract valid pairs using vectorized operations
+    left_indices = np.arange(len(distances))[:, np.newaxis]
+    left_indices = np.broadcast_to(left_indices, distances.shape)
+    valid_mask = np.isfinite(distances)
+
+    if exclude_same_index:
+        same_index_mask = left_indices == indices
+        valid_mask = valid_mask & ~same_index_mask
+
+    valid_left = left_indices[valid_mask]
+    valid_right = indices[valid_mask]
+    valid_distances = distances[valid_mask]
+
+    # Map back to original indices
+    valid_left_indices = left_df.index.values[valid_left]
+    valid_right_indices = right_df.index.values[valid_right]
+
+    # Create matches DataFrame
+    matches_df = pd.DataFrame(
+        {
+            "left_idx": valid_left_indices,
+            "right_idx": valid_right_indices,
+            "distance": valid_distances,
+        }
+    )
+
+    # Build graph more efficiently
+    G = nx.from_pandas_edgelist(
+        matches_df,
+        source="left_idx",
+        target="right_idx",
+        edge_attr="distance",
+        create_using=nx.Graph(),
+    )
+
+    # Add isolated nodes (nodes without any matches within threshold)
+    # This ensures all original indices are represented in the graph
+    all_left_nodes = set(left_df.index.values)
+    all_right_nodes = set(right_df.index.values)
+
+    if not exclude_same_index:
+        all_nodes = all_left_nodes | all_right_nodes
+    else:
+        all_nodes = all_left_nodes  # Same dataframe, so same node set
+
+    # Add nodes that don't have edges
+    existing_nodes = set(G.nodes())
+    isolated_nodes = all_nodes - existing_nodes
+    G.add_nodes_from(isolated_nodes)
+
+    # Print statistics
+    if verbose:
+        print(
+            f"Total potential matches: {len(left_df)} × {len(right_df)} = {len(left_df) * len(right_df):,}"
+        )
+        print(f"Matches found within {distance_threshold}m: {len(matches_df):,}")
+        print(f"Graph nodes: {G.number_of_nodes():,}")
+        print(f"Graph edges: {G.number_of_edges():,}")
+
+        components = list(nx.connected_components(G))
+        print(f"Connected components: {len(components):,}")
+
+        if len(components) > 1:
+            component_sizes = [len(c) for c in components]
+            print(f"Largest component size: {max(component_sizes):,}")
+            print(
+                f"Isolated nodes: {sum(1 for size in component_sizes if size == 1):,}"
+            )
+
+        if len(matches_df) > 0:
+            print(
+                f"Distance stats - min: {matches_df['distance'].min():.1f}m, "
+                f"max: {matches_df['distance'].max():.1f}m, "
+                f"mean: {matches_df['distance'].mean():.1f}m"
+            )
+
+    return (G, matches_df) if return_dataframe else G
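The new gigaspatial.processing.algorithms module exposes build_distance_graph, which matches two point datasets with a cKDTree and returns the matches as a NetworkX graph. A minimal usage sketch, not part of the release itself: the datasets, indices, and coordinates below are invented, and it assumes convert_to_geodataframe picks up the latitude/longitude columns.

import networkx as nx
import pandas as pd

from gigaspatial.processing.algorithms import build_distance_graph

# Invented point data; distinct index labels keep left/right nodes apart,
# since graph nodes are the original dataframe indices.
schools = pd.DataFrame(
    {"latitude": [0.312, 0.315, 0.980], "longitude": [32.581, 32.583, 32.100]},
    index=["s0", "s1", "s2"],
)
clinics = pd.DataFrame(
    {"latitude": [0.313, 0.979], "longitude": [32.582, 32.101]},
    index=["c0", "c1"],
)

# Edges connect records within 500 m of each other (Euclidean, in a local UTM CRS).
G, matches = build_distance_graph(
    schools, clinics, distance_threshold=500, return_dataframe=True, verbose=False
)

# Connected components group mutually nearby records, e.g. for deduplication.
for component in nx.connected_components(G):
    print(sorted(component))

Passing the same dataframe as both arguments flips on the self-match exclusion automatically, so the identical call also works for deduplicating a single dataset.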
gigaspatial/processing/geo.py
CHANGED
@@ -272,8 +272,13 @@ def buffer_geodataframe(
     input_crs = gdf_work.crs

     try:
-
-
+        try:
+            utm_crs = gdf_work.estimate_utm_crs()
+        except Exception as e:
+            LOGGER.warning(
+                f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+            )
+            utm_crs = "EPSG:3857"  # Fallback to Web Mercator

     # Transform to UTM, create buffer, and transform back
     gdf_work = gdf_work.to_crs(utm_crs)
@@ -452,7 +457,13 @@ def add_area_in_meters(
     gdf_with_area = gdf.copy()

     # Calculate the UTM CRS for accurate area calculation
-
+    try:
+        utm_crs = gdf_with_area.estimate_utm_crs()
+    except Exception as e:
+        LOGGER.warning(
+            f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+        )
+        utm_crs = "EPSG:3857"  # Fallback to Web Mercator

     # Transform to UTM CRS and calculate the area in square meters
     gdf_with_area[area_column_name] = gdf_with_area.to_crs(utm_crs).geometry.area
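Both hunks above apply the same hardening pattern the new algorithms module uses: try estimate_utm_crs() first, and fall back to Web Mercator only when estimation fails. A standalone sketch of the pattern in plain geopandas, not package code; the sample point is arbitrary.

import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame(geometry=[Point(32.58, 0.31)], crs="EPSG:4326")

try:
    # Prefer the local UTM zone: metric units, low distortion for buffers/areas.
    metric_crs = gdf.estimate_utm_crs()
except Exception as exc:
    # Estimation can fail, e.g. for geometries with no usable CRS or an extent
    # spanning too many UTM zones; Web Mercator works globally but distorts
    # distances and areas away from the equator.
    print(f"UTM estimation failed ({exc}); falling back to EPSG:3857")
    metric_crs = "EPSG:3857"

buffered_back = gdf.to_crs(metric_crs).buffer(100).to_crs(gdf.crs)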
@@ -858,57 +869,111 @@ def aggregate_polygons_to_zones(
     zones: gpd.GeoDataFrame,
     value_columns: Union[str, List[str]],
     aggregation: Union[str, Dict[str, str]] = "sum",
-
+    predicate: Literal["intersects", "within", "fractional"] = "intersects",
     zone_id_column: str = "zone_id",
     output_suffix: str = "",
     drop_geometry: bool = False,
 ) -> gpd.GeoDataFrame:
     """
-
+    Aggregates polygon data to zones based on a specified spatial relationship.

-    This function
-
+    This function performs a spatial join between polygons and zones and then
+    aggregates values from the polygons to their corresponding zones. The aggregation
+    method depends on the `predicate` parameter, which determines the nature of the
+    spatial relationship.

     Args:
-        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
-
-
-
-
-
-
-
-
-
-
+        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
+            Polygon data to aggregate. Must be a GeoDataFrame or convertible to one.
+        zones (gpd.GeoDataFrame):
+            The target zones to which the polygon data will be aggregated.
+        value_columns (Union[str, List[str]]):
+            The column(s) in `polygons` containing the numeric values to aggregate.
+        aggregation (Union[str, Dict[str, str]], optional):
+            The aggregation method(s) to use. Can be a single string (e.g., "sum",
+            "mean", "max") to apply the same method to all columns, or a dictionary
+            mapping column names to aggregation methods (e.g., `{'population': 'sum'}`).
+            Defaults to "sum".
+        predicate (Literal["intersects", "within", "fractional"], optional):
+            The spatial relationship to use for aggregation:
+            - "intersects": Aggregates values for any polygon that intersects a zone.
+            - "within": Aggregates values for polygons entirely contained within a zone.
+            - "fractional": Performs area-weighted aggregation. The value of a polygon
+              is distributed proportionally to the area of its overlap with each zone.
+              This requires calculating a UTM CRS for accurate area measurements.
+            Defaults to "intersects".
+        zone_id_column (str, optional):
+            The name of the column in `zones` that contains the unique zone identifiers.
+            Defaults to "zone_id".
+        output_suffix (str, optional):
+            A suffix to add to the names of the new aggregated columns in the output
+            GeoDataFrame. Defaults to "".
+        drop_geometry (bool, optional):
+            If True, the geometry column will be dropped from the output GeoDataFrame.
+            Defaults to False.

     Returns:
-        gpd.GeoDataFrame:
+        gpd.GeoDataFrame:
+            The `zones` GeoDataFrame with new columns containing the aggregated values.
+            Zones with no intersecting or contained polygons will have `0` values.
+
+    Raises:
+        TypeError: If `zones` is not a GeoDataFrame or `polygons` cannot be converted.
+        ValueError: If `zone_id_column` or any `value_columns` are not found, or
+            if the geometry types in `polygons` are not polygons.
+        RuntimeError: If an error occurs during the area-weighted aggregation process.

     Example:
-        >>>
+        >>> import geopandas as gpd
+        >>> # Assuming 'landuse_polygons' and 'grid_zones' are GeoDataFrames
+        >>> # Aggregate total population within each grid zone using area-weighting
+        >>> pop_by_zone = aggregate_polygons_to_zones(
         ...     landuse_polygons,
         ...     grid_zones,
-        ...     value_columns=
-        ...
+        ...     value_columns="population",
+        ...     predicate="fractional",
+        ...     aggregation="sum",
+        ...     output_suffix="_pop"
+        ... )
+        >>> # Aggregate the count of landuse parcels intersecting each zone
+        >>> count_by_zone = aggregate_polygons_to_zones(
+        ...     landuse_polygons,
+        ...     grid_zones,
+        ...     value_columns="parcel_id",
+        ...     predicate="intersects",
+        ...     aggregation="count"
         ... )
     """
     # Input validation
     if not isinstance(zones, gpd.GeoDataFrame):
         raise TypeError("zones must be a GeoDataFrame")

+    if zones.empty:
+        raise ValueError("zones GeoDataFrame is empty")
+
     if zone_id_column not in zones.columns:
         raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

+    if predicate not in ["intersects", "within", "fractional"]:
+        raise ValueError(
+            f"Unsupported predicate: {predicate}. Predicate can be one of `intersects`, `within`, `fractional`"
+        )
+
     # Convert polygons to GeoDataFrame if necessary
     if not isinstance(polygons, gpd.GeoDataFrame):
         try:
             polygons_gdf = convert_to_geodataframe(polygons)
-        except:
-            raise TypeError(
+        except Exception as e:
+            raise TypeError(
+                f"polygons must be a GeoDataFrame or convertible to one: {e}"
+            )
     else:
         polygons_gdf = polygons.copy()

+    if polygons_gdf.empty:
+        LOGGER.warning("Empty polygons GeoDataFrame provided")
+        return zones
+
     # Validate geometry types
     non_polygon_geoms = [
         geom_type
@@ -935,8 +1000,53 @@ def aggregate_polygons_to_zones(
     polygons_gdf = polygons_gdf.to_crs(zones.crs)

     # Handle aggregation method
+    agg_funcs = _process_aggregation_methods(aggregation, value_columns)
+
+    # Prepare minimal zones for spatial operations (only zone_id_column and geometry)
+    minimal_zones = zones[[zone_id_column, "geometry"]].copy()
+
+    if predicate == "fractional":
+        aggregated_data = _fractional_aggregation(
+            polygons_gdf, minimal_zones, value_columns, agg_funcs, zone_id_column
+        )
+    else:
+        aggregated_data = _simple_aggregation(
+            polygons_gdf,
+            minimal_zones,
+            value_columns,
+            agg_funcs,
+            zone_id_column,
+            predicate,
+        )
+
+    # Merge aggregated results back to complete zones data
+    result = zones.merge(
+        aggregated_data[[col for col in aggregated_data.columns if col != "geometry"]],
+        on=zone_id_column,
+        how="left",
+    )
+
+    # Fill NaN values with zeros for the newly aggregated columns only
+    aggregated_cols = [col for col in result.columns if col not in zones.columns]
+    for col in aggregated_cols:
+        if pd.api.types.is_numeric_dtype(result[col]):
+            result[col] = result[col].fillna(0)
+
+    # Apply output suffix consistently to result columns only
+    if output_suffix:
+        rename_dict = {col: f"{col}{output_suffix}" for col in aggregated_cols}
+        result = result.rename(columns=rename_dict)
+
+    if drop_geometry:
+        result = result.drop(columns=["geometry"])
+
+    return result
+
+
+def _process_aggregation_methods(aggregation, value_columns):
+    """Process and validate aggregation methods"""
     if isinstance(aggregation, str):
-
+        return {col: aggregation for col in value_columns}
     elif isinstance(aggregation, dict):
         # Validate dictionary keys
         missing_aggs = [col for col in value_columns if col not in aggregation]
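For reference, the string branch of _process_aggregation_methods fans a single method out to every value column, while the dict branch is validated and returned as-is. The _handle_multiindex_columns helper added in the next hunk exists because pandas typically only emits a MultiIndex when a column receives a list of methods. A pandas-only sketch of both behaviours, with invented sample data:

import pandas as pd

df = pd.DataFrame({"zone_id": ["A", "A", "B"], "population": [10, 20, 30]})

# One method per column (what a plain string fans out to) keeps flat names.
flat = df.groupby("zone_id")[["population"]].agg({"population": "sum"})
print(list(flat.columns))  # ['population']

# A list of methods yields a MultiIndex, flattened to '<column>_<method>'
# exactly like the helper's f"{col[0]}_{col[1]}" rule.
multi = df.groupby("zone_id")[["population"]].agg({"population": ["sum", "mean"]})
multi.columns = [f"{c[0]}_{c[1]}" for c in multi.columns]
print(multi.reset_index())  # columns: zone_id, population_sum, population_mean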
@@ -949,106 +1059,98 @@ def aggregate_polygons_to_zones(
             f"Aggregation methods specified for non-existent columns: {extra_aggs}"
         )

-
+        return aggregation
     else:
         raise TypeError("aggregation must be a string or dictionary")

-    # Create a copy of the zones
-    result = zones.copy()

-
-
+def _fractional_aggregation(
+    polygons_gdf, zones, value_columns, agg_funcs, zone_id_column
+):
+    """Perform area-weighted (fractional) aggregation"""
+    try:
+        # Compute UTM CRS for accurate area calculations
         try:
-        # Compute UTM CRS for accurate area calculations
             overlay_utm_crs = polygons_gdf.estimate_utm_crs()
+        except Exception as e:
+            LOGGER.warning(f"UTM CRS estimation failed, using Web Mercator. Error: {e}")
+            overlay_utm_crs = "EPSG:3857"  # Fallback to Web Mercator

-
-
-
+        # Prepare polygons for overlay - only necessary columns
+        polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
+        polygons_utm["orig_area"] = polygons_utm.area

-
-
-
+        # Keep only necessary columns
+        overlay_cols = value_columns + ["geometry", "orig_area"]
+        overlay_gdf = polygons_utm[overlay_cols].copy()

-
-
+        # Prepare zones for overlay
+        zones_utm = zones.to_crs(overlay_utm_crs)

-
-
-            overlay_gdf, zones_utm[[zone_id_column, "geometry"]], how="intersection"
-        )
+        # Perform the spatial overlay
+        gdf_overlayed = gpd.overlay(overlay_gdf, zones_utm, how="intersection")

-
-
-
-            gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
-        )
+        if gdf_overlayed.empty:
+            LOGGER.warning("No intersections found during fractional aggregation")
+            return zones

-
-
-
+        # Calculate fractional areas
+        gdf_overlayed["intersection_area"] = gdf_overlayed.area
+        gdf_overlayed["area_fraction"] = (
+            gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
+        )

-
-
-
-        )
+        # Apply area weighting to value columns
+        for col in value_columns:
+            gdf_overlayed[col] = gdf_overlayed[col] * gdf_overlayed["area_fraction"]

-
-
-        aggregated.columns = [
-            f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
-        ]
+        # Aggregate by zone ID
+        aggregated = gdf_overlayed.groupby(zone_id_column)[value_columns].agg(agg_funcs)

-
-
+        # Handle column naming for multi-level index
+        aggregated = _handle_multiindex_columns(aggregated)

-
-
+        # Reset index and merge back to zones
+        aggregated = aggregated.reset_index()

-
-
-        if (
-            col != zone_id_column
-            and col != "geometry"
-            and pd.api.types.is_numeric_dtype(result[col])
-        ):
-            result[col] = result[col].fillna(0)
+        # Return only the aggregated data (will be merged with full zones later)
+        return aggregated

-
-
+    except Exception as e:
+        raise RuntimeError(f"Error during area-weighted aggregation: {e}")

-    else:
-        # Non-weighted aggregation - simpler approach
-        # Perform spatial join
-        joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate="intersects")

-
-
-
+def _simple_aggregation(
+    polygons_gdf, zones, value_columns, agg_funcs, zone_id_column, predicate
+):
+    """Perform simple (non-weighted) aggregation"""
+    # Perform spatial join
+    joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate=predicate)

-
-
+    if joined.empty:
+        LOGGER.warning(f"No {predicate} relationships found during spatial join")
+        return zones

-
-
-
-            f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
-        ]
+    # Remove geometry column for aggregation (keep only necessary columns)
+    agg_cols = value_columns + [zone_id_column]
+    joined_subset = joined[agg_cols].copy()

-
-
-        result = result.merge(aggregated, on=zone_id_column, how="left")
+    # Group by zone ID and aggregate
+    aggregated = joined_subset.groupby(zone_id_column)[value_columns].agg(agg_funcs)

-
-
-        if (
-            col != zone_id_column
-            and col != "geometry"
-            and pd.api.types.is_numeric_dtype(result[col])
-        ):
-            result[col] = result[col].fillna(0)
+    # Handle column naming for multi-level index
+    aggregated = _handle_multiindex_columns(aggregated)

-
-
+    # Reset index and merge back to zones
+    aggregated = aggregated.reset_index()

-
+    # Return only the aggregated data (will be merged with full zones later)
+    return aggregated
+
+
+def _handle_multiindex_columns(aggregated):
+    """Handle multi-level column index from groupby aggregation"""
+    if isinstance(aggregated.columns, pd.MultiIndex):
+        # Flatten multi-level columns: combine column name with aggregation method
+        aggregated.columns = [f"{col[0]}_{col[1]}" for col in aggregated.columns]
+    return aggregated
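The fractional path boils down to one weighting rule: each polygon contributes value × intersection_area / orig_area to every zone it overlaps. A self-contained sketch of that arithmetic in plain geopandas, mirroring what _fractional_aggregation does; the toy geometries and the UTM CRS are invented, not package code.

import geopandas as gpd
from shapely.geometry import box

# One polygon with population 100, split 25% / 75% across two zones.
polygons = gpd.GeoDataFrame(
    {"population": [100.0]}, geometry=[box(0, 0, 1, 1)], crs="EPSG:32636"
)
zones = gpd.GeoDataFrame(
    {"zone_id": ["A", "B"]},
    geometry=[box(0, 0, 1, 0.25), box(0, 0.25, 1, 1)],
    crs="EPSG:32636",
)

polygons["orig_area"] = polygons.area
overlay = gpd.overlay(polygons, zones, how="intersection")
overlay["weighted"] = overlay["population"] * overlay.area / overlay["orig_area"]

print(overlay.groupby("zone_id")["weighted"].sum())  # A -> 25.0, B -> 75.0

The areas must come from a projected CRS for this split to be meaningful, which is why _fractional_aggregation reprojects both layers to the estimated UTM CRS (or the Web Mercator fallback) before the overlay.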