giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
+ import sys, os
+
+ import numpy as np
+ from typing import Literal, List, Tuple, Union, Optional
+ import geopandas as gpd
+ import pandas as pd
+ from scipy.spatial import cKDTree
+ import networkx as nx
+
+ from gigaspatial.processing.geo import (
+     convert_to_geodataframe,
+ )
+ from gigaspatial.config import config
+
+ LOGGER = config.get_logger("GigaSpatialProcessing")
+
+
+ def build_distance_graph(
+     left_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+     right_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+     distance_threshold: float,
+     max_k: int = 100,
+     return_dataframe: bool = False,
+     verbose: bool = True,
+     exclude_same_index: Optional[bool] = None,
+ ) -> Union[nx.Graph, Tuple[nx.Graph, pd.DataFrame]]:
+     """
+     Build a graph of spatial matches between two dataframes using KD-tree.
+
+     Args:
+         left_df: Left dataframe to match from
+         right_df: Right dataframe to match to
+         distance_threshold: Maximum distance for matching (in meters)
+         max_k: Maximum number of neighbors to consider per point (default: 100)
+         return_dataframe: If True, also return the matches DataFrame
+         verbose: If True, print statistics about the graph
+         exclude_same_index: If True, exclude self-matches. If None, auto-detect based on df equality
+
+     Returns:
+         NetworkX Graph, or tuple of (Graph, DataFrame) if return_dataframe=True
+
+     Raises:
+         ValueError: If distance_threshold is negative or max_k is not positive
+     """
+
+     # Input validation
+     if distance_threshold < 0:
+         raise ValueError("distance_threshold must be non-negative")
+
+     if max_k <= 0:
+         raise ValueError("max_k must be positive")
+
+     if left_df.empty or right_df.empty:
+         if verbose:
+             LOGGER.warning("Warning: One or both dataframes are empty")
+         G = nx.Graph()
+         return (G, pd.DataFrame()) if return_dataframe else G
+
+     def get_utm_coordinates(df: Union[pd.DataFrame, gpd.GeoDataFrame]) -> np.ndarray:
+         """Extract coordinates as numpy array in UTM projection."""
+         if isinstance(df, pd.DataFrame):
+             gdf = convert_to_geodataframe(df)
+         else:
+             gdf = df.copy()
+
+         # More robust UTM CRS estimation
+         try:
+             gdf_utm = gdf.to_crs(gdf.estimate_utm_crs())
+         except Exception as e:
+             if verbose:
+                 LOGGER.warning(
+                     f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+                 )
+             gdf_utm = gdf.to_crs("EPSG:3857")  # Fallback to Web Mercator
+
+         return gdf_utm.get_coordinates().to_numpy()
+
+     # Auto-detect same dataframe case
+     if exclude_same_index is None:
+         exclude_same_index = left_df.equals(right_df)
+         if verbose and exclude_same_index:
+             LOGGER.info("Auto-detected same dataframe - excluding self-matches")
+
+     # Get coordinates
+     left_coords = get_utm_coordinates(left_df)
+     right_coords = (
+         get_utm_coordinates(right_df) if not exclude_same_index else left_coords
+     )
+
+     # Build KD-tree and query
+     kdtree = cKDTree(right_coords)
+
+     # Use the provided max_k parameter, but don't exceed available points
+     k_to_use = min(max_k, len(right_coords))
+
+     if verbose and k_to_use < max_k:
+         LOGGER.info(
+             f"Note: max_k ({max_k}) reduced to {k_to_use} (number of available points)"
+         )
+
+     # Note: Distance calculations here are based on Euclidean distance in UTM projection.
+     # This can introduce errors up to ~50 cm for a 50 meter threshold, especially near the poles where distortion increases.
+     distances, indices = kdtree.query(
+         left_coords, k=k_to_use, distance_upper_bound=distance_threshold
+     )
+
+     # Handle single k case (when k_to_use = 1, results are 1D)
+     if distances.ndim == 1:
+         distances = distances.reshape(-1, 1)
+         indices = indices.reshape(-1, 1)
+
+     # Extract valid pairs using vectorized operations
+     left_indices = np.arange(len(distances))[:, np.newaxis]
+     left_indices = np.broadcast_to(left_indices, distances.shape)
+     valid_mask = np.isfinite(distances)
+
+     if exclude_same_index:
+         same_index_mask = left_indices == indices
+         valid_mask = valid_mask & ~same_index_mask
+
+     valid_left = left_indices[valid_mask]
+     valid_right = indices[valid_mask]
+     valid_distances = distances[valid_mask]
+
+     # Map back to original indices
+     valid_left_indices = left_df.index.values[valid_left]
+     valid_right_indices = right_df.index.values[valid_right]
+
+     # Create matches DataFrame
+     matches_df = pd.DataFrame(
+         {
+             "left_idx": valid_left_indices,
+             "right_idx": valid_right_indices,
+             "distance": valid_distances,
+         }
+     )
+
+     # Build graph more efficiently
+     G = nx.from_pandas_edgelist(
+         matches_df,
+         source="left_idx",
+         target="right_idx",
+         edge_attr="distance",
+         create_using=nx.Graph(),
+     )
+
+     # Add isolated nodes (nodes without any matches within threshold)
+     # This ensures all original indices are represented in the graph
+     all_left_nodes = set(left_df.index.values)
+     all_right_nodes = set(right_df.index.values)
+
+     if not exclude_same_index:
+         all_nodes = all_left_nodes | all_right_nodes
+     else:
+         all_nodes = all_left_nodes  # Same dataframe, so same node set
+
+     # Add nodes that don't have edges
+     existing_nodes = set(G.nodes())
+     isolated_nodes = all_nodes - existing_nodes
+     G.add_nodes_from(isolated_nodes)
+
+     # Print statistics
+     if verbose:
+         print(
+             f"Total potential matches: {len(left_df)} × {len(right_df)} = {len(left_df) * len(right_df):,}"
+         )
+         print(f"Matches found within {distance_threshold}m: {len(matches_df):,}")
+         print(f"Graph nodes: {G.number_of_nodes():,}")
+         print(f"Graph edges: {G.number_of_edges():,}")
+
+         components = list(nx.connected_components(G))
+         print(f"Connected components: {len(components):,}")
+
+         if len(components) > 1:
+             component_sizes = [len(c) for c in components]
+             print(f"Largest component size: {max(component_sizes):,}")
+             print(
+                 f"Isolated nodes: {sum(1 for size in component_sizes if size == 1):,}"
+             )
+
+         if len(matches_df) > 0:
+             print(
+                 f"Distance stats - min: {matches_df['distance'].min():.1f}m, "
+                 f"max: {matches_df['distance'].max():.1f}m, "
+                 f"mean: {matches_df['distance'].mean():.1f}m"
+             )
+
+     return (G, matches_df) if return_dataframe else G
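For illustration, a minimal usage sketch of the new build_distance_graph helper (not part of the released package content; the import path, point data, and index labels below are assumptions made for the example):

import geopandas as gpd
# Import path assumed; adjust to wherever the new module lives in gigaspatial.processing
from gigaspatial.processing import build_distance_graph

# Two small hypothetical point datasets in WGS84; index labels become graph nodes
schools = gpd.GeoDataFrame(
    index=["school_a", "school_b"],
    geometry=gpd.points_from_xy([32.580, 32.590], [0.010, 0.020]),
    crs="EPSG:4326",
)
clinics = gpd.GeoDataFrame(
    index=["clinic_1", "clinic_2"],
    geometry=gpd.points_from_xy([32.5801, 32.900], [0.0101, 0.500]),
    crs="EPSG:4326",
)

# Link every school to clinics within 150 m and also return the pairwise matches
G, matches = build_distance_graph(
    schools,
    clinics,
    distance_threshold=150,  # meters, measured in the estimated UTM CRS
    return_dataframe=True,
    verbose=False,
)

# Edges carry the matched distance in meters; unmatched points remain isolated nodes
for left, right, attrs in G.edges(data=True):
    print(left, right, round(attrs["distance"], 1))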
@@ -272,8 +272,13 @@ def buffer_geodataframe(
      input_crs = gdf_work.crs

      try:
-         # Create a custom UTM CRS based on the calculated UTM zone
-         utm_crs = gdf_work.estimate_utm_crs()
+         try:
+             utm_crs = gdf_work.estimate_utm_crs()
+         except Exception as e:
+             LOGGER.warning(
+                 f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+             )
+             utm_crs = "EPSG:3857"  # Fallback to Web Mercator

          # Transform to UTM, create buffer, and transform back
          gdf_work = gdf_work.to_crs(utm_crs)
@@ -452,7 +457,13 @@ def add_area_in_meters(
      gdf_with_area = gdf.copy()

      # Calculate the UTM CRS for accurate area calculation
-     utm_crs = gdf_with_area.estimate_utm_crs()
+     try:
+         utm_crs = gdf_with_area.estimate_utm_crs()
+     except Exception as e:
+         LOGGER.warning(
+             f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+         )
+         utm_crs = "EPSG:3857"  # Fallback to Web Mercator

      # Transform to UTM CRS and calculate the area in square meters
      gdf_with_area[area_column_name] = gdf_with_area.to_crs(utm_crs).geometry.area
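The same estimate_utm_crs try/except guard now appears in build_distance_graph, buffer_geodataframe, and add_area_in_meters. A standalone sketch of when the fallback triggers (illustrative only; in current GeoPandas, estimate_utm_crs raises when the frame has no CRS set or when no suitable UTM zone can be determined):

import geopandas as gpd
from shapely.geometry import Point

# A frame with point data but no CRS assigned - estimate_utm_crs cannot work with it
gdf = gpd.GeoDataFrame(geometry=[Point(32.58, 0.01)])

try:
    utm_crs = gdf.estimate_utm_crs()
except Exception as e:
    # Mirrors the package's fallback: log and continue with Web Mercator
    print(f"UTM CRS estimation failed, using Web Mercator. Error: {e}")
    utm_crs = "EPSG:3857"

print(utm_crs)  # EPSG:3857 here; a proper UTM CRS once the frame has a CRS assigned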
@@ -858,57 +869,111 @@ def aggregate_polygons_to_zones(
      zones: gpd.GeoDataFrame,
      value_columns: Union[str, List[str]],
      aggregation: Union[str, Dict[str, str]] = "sum",
-     area_weighted: bool = True,
+     predicate: Literal["intersects", "within", "fractional"] = "intersects",
      zone_id_column: str = "zone_id",
      output_suffix: str = "",
      drop_geometry: bool = False,
  ) -> gpd.GeoDataFrame:
      """
-     Aggregate polygon data to zones with area-weighted values.
+     Aggregates polygon data to zones based on a specified spatial relationship.

-     This function maps polygon data to zones, weighting values by the
-     fractional area of overlap between polygons and zones.
+     This function performs a spatial join between polygons and zones and then
+     aggregates values from the polygons to their corresponding zones. The aggregation
+     method depends on the `predicate` parameter, which determines the nature of the
+     spatial relationship.

      Args:
-         polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): Polygon data to aggregate
-         zones (gpd.GeoDataFrame): Zones to aggregate polygons to
-         value_columns (Union[str, List[str]]): Column(s) containing values to aggregate
-         aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
-             - Single string: Use same method for all columns ("sum", "mean", "max", etc.)
-             - Dict: Map column names to aggregation methods
-         area_weighted (bool): Whether to weight values by fractional area overlap
-             If False, values are not weighted before aggregation
-         zone_id_column (str): Column in zones containing zone identifiers
-         output_suffix (str): Suffix to add to output column names
-         drop_geometry (bool): Whether to drop the geometry column from output
+         polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
+             Polygon data to aggregate. Must be a GeoDataFrame or convertible to one.
+         zones (gpd.GeoDataFrame):
+             The target zones to which the polygon data will be aggregated.
+         value_columns (Union[str, List[str]]):
+             The column(s) in `polygons` containing the numeric values to aggregate.
+         aggregation (Union[str, Dict[str, str]], optional):
+             The aggregation method(s) to use. Can be a single string (e.g., "sum",
+             "mean", "max") to apply the same method to all columns, or a dictionary
+             mapping column names to aggregation methods (e.g., `{'population': 'sum'}`).
+             Defaults to "sum".
+         predicate (Literal["intersects", "within", "fractional"], optional):
+             The spatial relationship to use for aggregation:
+             - "intersects": Aggregates values for any polygon that intersects a zone.
+             - "within": Aggregates values for polygons entirely contained within a zone.
+             - "fractional": Performs area-weighted aggregation. The value of a polygon
+               is distributed proportionally to the area of its overlap with each zone.
+               This requires calculating a UTM CRS for accurate area measurements.
+             Defaults to "intersects".
+         zone_id_column (str, optional):
+             The name of the column in `zones` that contains the unique zone identifiers.
+             Defaults to "zone_id".
+         output_suffix (str, optional):
+             A suffix to add to the names of the new aggregated columns in the output
+             GeoDataFrame. Defaults to "".
+         drop_geometry (bool, optional):
+             If True, the geometry column will be dropped from the output GeoDataFrame.
+             Defaults to False.

      Returns:
-         gpd.GeoDataFrame: Zones with aggregated polygon values
+         gpd.GeoDataFrame:
+             The `zones` GeoDataFrame with new columns containing the aggregated values.
+             Zones with no intersecting or contained polygons will have `0` values.
+
+     Raises:
+         TypeError: If `zones` is not a GeoDataFrame or `polygons` cannot be converted.
+         ValueError: If `zone_id_column` or any `value_columns` are not found, or
+             if the geometry types in `polygons` are not polygons.
+         RuntimeError: If an error occurs during the area-weighted aggregation process.

      Example:
-         >>> landuse_stats = aggregate_polygons_to_zones(
+         >>> import geopandas as gpd
+         >>> # Assuming 'landuse_polygons' and 'grid_zones' are GeoDataFrames
+         >>> # Aggregate total population within each grid zone using area-weighting
+         >>> pop_by_zone = aggregate_polygons_to_zones(
          ...     landuse_polygons,
          ...     grid_zones,
-         ...     value_columns=["area", "population"],
-         ...     aggregation="sum"
+         ...     value_columns="population",
+         ...     predicate="fractional",
+         ...     aggregation="sum",
+         ...     output_suffix="_pop"
+         ... )
+         >>> # Aggregate the count of landuse parcels intersecting each zone
+         >>> count_by_zone = aggregate_polygons_to_zones(
+         ...     landuse_polygons,
+         ...     grid_zones,
+         ...     value_columns="parcel_id",
+         ...     predicate="intersects",
+         ...     aggregation="count"
          ... )
      """
      # Input validation
      if not isinstance(zones, gpd.GeoDataFrame):
          raise TypeError("zones must be a GeoDataFrame")

+     if zones.empty:
+         raise ValueError("zones GeoDataFrame is empty")
+
      if zone_id_column not in zones.columns:
          raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

+     if predicate not in ["intersects", "within", "fractional"]:
+         raise ValueError(
+             f"Unsupported predicate: {predicate}. Predicate can be one of `intersects`, `within`, `fractional`"
+         )
+
      # Convert polygons to GeoDataFrame if necessary
      if not isinstance(polygons, gpd.GeoDataFrame):
          try:
              polygons_gdf = convert_to_geodataframe(polygons)
-         except:
-             raise TypeError("polygons must be a GeoDataFrame or convertible to one")
+         except Exception as e:
+             raise TypeError(
+                 f"polygons must be a GeoDataFrame or convertible to one: {e}"
+             )
      else:
          polygons_gdf = polygons.copy()

+     if polygons_gdf.empty:
+         LOGGER.warning("Empty polygons GeoDataFrame provided")
+         return zones
+
      # Validate geometry types
      non_polygon_geoms = [
          geom_type
@@ -935,8 +1000,53 @@ def aggregate_polygons_to_zones(
      polygons_gdf = polygons_gdf.to_crs(zones.crs)

      # Handle aggregation method
+     agg_funcs = _process_aggregation_methods(aggregation, value_columns)
+
+     # Prepare minimal zones for spatial operations (only zone_id_column and geometry)
+     minimal_zones = zones[[zone_id_column, "geometry"]].copy()
+
+     if predicate == "fractional":
+         aggregated_data = _fractional_aggregation(
+             polygons_gdf, minimal_zones, value_columns, agg_funcs, zone_id_column
+         )
+     else:
+         aggregated_data = _simple_aggregation(
+             polygons_gdf,
+             minimal_zones,
+             value_columns,
+             agg_funcs,
+             zone_id_column,
+             predicate,
+         )
+
+     # Merge aggregated results back to complete zones data
+     result = zones.merge(
+         aggregated_data[[col for col in aggregated_data.columns if col != "geometry"]],
+         on=zone_id_column,
+         how="left",
+     )
+
+     # Fill NaN values with zeros for the newly aggregated columns only
+     aggregated_cols = [col for col in result.columns if col not in zones.columns]
+     for col in aggregated_cols:
+         if pd.api.types.is_numeric_dtype(result[col]):
+             result[col] = result[col].fillna(0)
+
+     # Apply output suffix consistently to result columns only
+     if output_suffix:
+         rename_dict = {col: f"{col}{output_suffix}" for col in aggregated_cols}
+         result = result.rename(columns=rename_dict)
+
+     if drop_geometry:
+         result = result.drop(columns=["geometry"])
+
+     return result
+
+
+ def _process_aggregation_methods(aggregation, value_columns):
+     """Process and validate aggregation methods"""
      if isinstance(aggregation, str):
-         agg_funcs = {col: aggregation for col in value_columns}
+         return {col: aggregation for col in value_columns}
      elif isinstance(aggregation, dict):
          # Validate dictionary keys
          missing_aggs = [col for col in value_columns if col not in aggregation]
@@ -949,106 +1059,98 @@ def aggregate_polygons_to_zones(
              f"Aggregation methods specified for non-existent columns: {extra_aggs}"
          )

-         agg_funcs = aggregation
+         return aggregation
      else:
          raise TypeError("aggregation must be a string or dictionary")

-     # Create a copy of the zones
-     result = zones.copy()

-     if area_weighted:
-         # Use area-weighted aggregation with polygon overlay
+ def _fractional_aggregation(
+     polygons_gdf, zones, value_columns, agg_funcs, zone_id_column
+ ):
+     """Perform area-weighted (fractional) aggregation"""
+     try:
+         # Compute UTM CRS for accurate area calculations
          try:
-             # Compute UTM CRS for accurate area calculations
              overlay_utm_crs = polygons_gdf.estimate_utm_crs()
+         except Exception as e:
+             LOGGER.warning(f"UTM CRS estimation failed, using Web Mercator. Error: {e}")
+             overlay_utm_crs = "EPSG:3857"  # Fallback to Web Mercator

-             # Prepare polygons for overlay
-             polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
-             polygons_utm["orig_area"] = polygons_utm.area
+         # Prepare polygons for overlay - only necessary columns
+         polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
+         polygons_utm["orig_area"] = polygons_utm.area

-             # Keep only necessary columns
-             overlay_cols = value_columns + ["geometry", "orig_area"]
-             overlay_gdf = polygons_utm[overlay_cols].copy()
+         # Keep only necessary columns
+         overlay_cols = value_columns + ["geometry", "orig_area"]
+         overlay_gdf = polygons_utm[overlay_cols].copy()

-             # Prepare zones for overlay
-             zones_utm = zones.to_crs(overlay_utm_crs)
+         # Prepare zones for overlay
+         zones_utm = zones.to_crs(overlay_utm_crs)

-             # Perform the spatial overlay
-             gdf_overlayed = gpd.overlay(
-                 overlay_gdf, zones_utm[[zone_id_column, "geometry"]], how="intersection"
-             )
+         # Perform the spatial overlay
+         gdf_overlayed = gpd.overlay(overlay_gdf, zones_utm, how="intersection")

-             # Calculate fractional areas
-             gdf_overlayed["intersection_area"] = gdf_overlayed.area
-             gdf_overlayed["area_fraction"] = (
-                 gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
-             )
+         if gdf_overlayed.empty:
+             LOGGER.warning("No intersections found during fractional aggregation")
+             return zones

-             # Apply area weighting to value columns
-             for col in value_columns:
-                 gdf_overlayed[col] = gdf_overlayed[col] * gdf_overlayed["area_fraction"]
+         # Calculate fractional areas
+         gdf_overlayed["intersection_area"] = gdf_overlayed.area
+         gdf_overlayed["area_fraction"] = (
+             gdf_overlayed["intersection_area"] / gdf_overlayed["orig_area"]
+         )

-             # Aggregate by zone ID
-             aggregated = gdf_overlayed.groupby(zone_id_column)[value_columns].agg(
-                 agg_funcs
-             )
+         # Apply area weighting to value columns
+         for col in value_columns:
+             gdf_overlayed[col] = gdf_overlayed[col] * gdf_overlayed["area_fraction"]

-             # Handle column naming for multi-level index
-             if isinstance(aggregated.columns, pd.MultiIndex):
-                 aggregated.columns = [
-                     f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
-                 ]
+         # Aggregate by zone ID
+         aggregated = gdf_overlayed.groupby(zone_id_column)[value_columns].agg(agg_funcs)

-             # Reset index
-             aggregated = aggregated.reset_index()
+         # Handle column naming for multi-level index
+         aggregated = _handle_multiindex_columns(aggregated)

-             # Merge aggregated values back to the zones
-             result = result.merge(aggregated, on=zone_id_column, how="left")
+         # Reset index and merge back to zones
+         aggregated = aggregated.reset_index()

-             # Fill NaN values with zeros
-             for col in result.columns:
-                 if (
-                     col != zone_id_column
-                     and col != "geometry"
-                     and pd.api.types.is_numeric_dtype(result[col])
-                 ):
-                     result[col] = result[col].fillna(0)
+         # Return only the aggregated data (will be merged with full zones later)
+         return aggregated

-         except Exception as e:
-             raise RuntimeError(f"Error during area-weighted aggregation: {e}")
+     except Exception as e:
+         raise RuntimeError(f"Error during area-weighted aggregation: {e}")

-     else:
-         # Non-weighted aggregation - simpler approach
-         # Perform spatial join
-         joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate="intersects")

-         # Remove geometry column for aggregation
-         if "geometry" in joined.columns:
-             joined = joined.drop(columns=["geometry"])
+ def _simple_aggregation(
+     polygons_gdf, zones, value_columns, agg_funcs, zone_id_column, predicate
+ ):
+     """Perform simple (non-weighted) aggregation"""
+     # Perform spatial join
+     joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate=predicate)

-         # Group by zone ID and aggregate
-         aggregated = joined.groupby(zone_id_column)[value_columns].agg(agg_funcs)
+     if joined.empty:
+         LOGGER.warning(f"No {predicate} relationships found during spatial join")
+         return zones

-         # Handle column naming for multi-level index
-         if isinstance(aggregated.columns, pd.MultiIndex):
-             aggregated.columns = [
-                 f"{col[0]}_{col[1]}{output_suffix}" for col in aggregated.columns
-             ]
+     # Remove geometry column for aggregation (keep only necessary columns)
+     agg_cols = value_columns + [zone_id_column]
+     joined_subset = joined[agg_cols].copy()

-         # Reset index and merge back to zones
-         aggregated = aggregated.reset_index()
-         result = result.merge(aggregated, on=zone_id_column, how="left")
+     # Group by zone ID and aggregate
+     aggregated = joined_subset.groupby(zone_id_column)[value_columns].agg(agg_funcs)

-         # Fill NaN values with zeros
-         for col in result.columns:
-             if (
-                 col != zone_id_column
-                 and col != "geometry"
-                 and pd.api.types.is_numeric_dtype(result[col])
-             ):
-                 result[col] = result[col].fillna(0)
+     # Handle column naming for multi-level index
+     aggregated = _handle_multiindex_columns(aggregated)

-     if drop_geometry:
-         result = result.drop(columns=["geometry"])
+     # Reset index and merge back to zones
+     aggregated = aggregated.reset_index()

-     return result
+     # Return only the aggregated data (will be merged with full zones later)
+     return aggregated
+
+
+ def _handle_multiindex_columns(aggregated):
+     """Handle multi-level column index from groupby aggregation"""
+     if isinstance(aggregated.columns, pd.MultiIndex):
+         # Flatten multi-level columns: combine column name with aggregation method
+         aggregated.columns = [f"{col[0]}_{col[1]}" for col in aggregated.columns]
+     return aggregated
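To illustrate the new predicate="fractional" path above: a polygon's value is split across zones in proportion to overlapping area, so a polygon worth 100 that lies 30% in one zone and 70% in another contributes roughly 30 and 70 respectively. A minimal sketch (not part of the diff; the import path and toy geometries are assumptions, and small deviations from an exact 30/70 split come from reprojecting to the estimated UTM CRS before measuring areas):

import geopandas as gpd
from shapely.geometry import box

# Import path assumed; the function is defined in gigaspatial's geo processing module
from gigaspatial.processing.geo import aggregate_polygons_to_zones

# One 1x1-degree polygon carrying population=100, split 30% / 70% across two zones
polygons = gpd.GeoDataFrame(
    {"population": [100.0]},
    geometry=[box(0.0, 0.0, 1.0, 1.0)],
    crs="EPSG:4326",
)
zones = gpd.GeoDataFrame(
    {"zone_id": ["west", "east"]},
    geometry=[box(0.0, 0.0, 0.3, 1.0), box(0.3, 0.0, 1.0, 1.0)],
    crs="EPSG:4326",
)

result = aggregate_polygons_to_zones(
    polygons,
    zones,
    value_columns="population",
    aggregation="sum",
    predicate="fractional",
)
print(result[["zone_id", "population"]])  # expect approximately 30 (west) and 70 (east)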