giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,6 @@ from gigaspatial.core.io.local_data_store import LocalDataStore
13
13
  from gigaspatial.core.io.writers import write_dataset
14
14
  from gigaspatial.config import config as global_config
15
15
  from gigaspatial.processing.geo import (
16
- convert_to_geodataframe,
17
16
  aggregate_polygons_to_zones,
18
17
  aggregate_points_to_zones,
19
18
  )
@@ -77,6 +76,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
77
76
  self.config = config or ZonalViewGeneratorConfig()
78
77
  self.data_store = data_store or LocalDataStore()
79
78
  self.logger = logger or global_config.get_logger(self.__class__.__name__)
79
+ self._view: Optional[pd.DataFrame] = None
80
80
 
81
81
  @abstractmethod
82
82
  def get_zonal_geometries(self) -> List[Polygon]:
@@ -103,7 +103,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
103
103
  """
104
104
  pass
105
105
 
106
- def to_geodataframe(self) -> gpd.GeoDataFrame:
106
+ def get_zone_geodataframe(self) -> gpd.GeoDataFrame:
107
107
  """Convert zones to a GeoDataFrame.
108
108
 
109
109
  Creates a GeoDataFrame containing zone identifiers and their corresponding
@@ -131,9 +131,77 @@ class ZonalViewGenerator(ABC, Generic[T]):
131
131
  and identifiers.
132
132
  """
133
133
  if not hasattr(self, "_zone_gdf"):
134
- self._zone_gdf = self.to_geodataframe()
134
+ self._zone_gdf = self.get_zone_geodataframe()
135
135
  return self._zone_gdf
136
136
 
137
+ @property
138
+ def view(self) -> pd.DataFrame:
139
+ """The DataFrame representing the current zonal view.
140
+
141
+ Returns:
142
+ pd.DataFrame: The DataFrame containing zone IDs, and
143
+ any added variables. If no variables have been added,
144
+ it returns the base `zone_gdf` without geometries.
145
+ """
146
+ if self._view is None:
147
+ self._view = self.zone_gdf.drop(columns="geometry")
148
+ return self._view
149
+
150
+ def add_variable_to_view(self, data_dict: Dict, column_name: str) -> None:
151
+ """
152
+ Adds a new variable (column) to the zonal view DataFrame.
153
+
154
+ This method takes a dictionary (typically the result of map_points or map_polygons)
155
+ and adds its values as a new column to the internal `_view` (initialized from `zone_gdf` without geometry if not yet set).
156
+ The dictionary keys are expected to be the `zone_id` values.
157
+
158
+ Args:
159
+ data_dict (Dict): A dictionary where keys are `zone_id`s and values are
160
+ the data to be added.
161
+ column_name (str): The name of the new column to be added to the zonal view.
162
+ Raises:
163
+ ValueError: If the `column_name` already exists in the zonal view.
164
+ Mismatched `data_dict` keys do not raise; they are only logged as warnings.
165
+ """
166
+ if self._view is None:
167
+ self._view = self.zone_gdf.drop(columns="geometry")
168
+
169
+ if column_name in self._view.columns:
170
+ raise ValueError(
171
+ f"Column '{column_name}' already exists in the zonal view."
172
+ )
173
+
174
+ # Create a pandas Series from the dictionary, aligning by index (zone_id)
175
+ new_series = pd.Series(data_dict, name=column_name)
176
+
177
+ # Before merging, ensure the zone_ids in data_dict match those in _view
178
+ missing_zones_in_data = set(self._view["zone_id"]) - set(new_series.index)
179
+ extra_zones_in_data = set(new_series.index) - set(self._view["zone_id"])
180
+
181
+ if missing_zones_in_data:
182
+ self.logger.warning(
183
+ f"Warning: {len(missing_zones_in_data)} zone(s) from the zonal view "
184
+ f"are missing in the provided data_dict for column '{column_name}'. "
185
+ f"These zones will have NaN values for '{column_name}'. Missing: {list(missing_zones_in_data)[:5]}..."
186
+ )
187
+ if extra_zones_in_data:
188
+ self.logger.warning(
189
+ f"Warning: {len(extra_zones_in_data)} zone(s) in the provided data_dict "
190
+ f"are not present in the zonal view for column '{column_name}'. "
191
+ f"These will be ignored. Extra: {list(extra_zones_in_data)[:5]}..."
192
+ )
193
+
194
+ # Merge the new series with the _view based on 'zone_id'
195
+ # Using .set_index() for efficient alignment
196
+ original_index_name = self._view.index.name
197
+ self._view = self._view.set_index("zone_id").join(new_series).reset_index()
198
+ if original_index_name: # Restore original index name if it existed
199
+ self._view.index.name = original_index_name
200
+ else: # If it was a default integer index, ensure it's not named 'index'
201
+ self._view.index.name = None
202
+
203
+ self.logger.info(f"Added variable '{column_name}' to the zonal view.")
204
+
137
205
  def map_points(
138
206
  self,
139
207
  points: Union[pd.DataFrame, gpd.GeoDataFrame],
@@ -173,98 +241,144 @@ class ZonalViewGenerator(ABC, Generic[T]):
173
241
  if mapping_function is not None:
174
242
  return mapping_function(self, points, **mapping_kwargs)
175
243
 
176
- else:
244
+ self.logger.warning(
245
+ "Using default points mapping implementation. Consider creating a specialized mapping function."
246
+ )
247
+ result = aggregate_points_to_zones(
248
+ points=points,
249
+ zones=self.zone_gdf,
250
+ value_columns=value_columns,
251
+ aggregation=aggregation,
252
+ point_zone_predicate=predicate,
253
+ zone_id_column="zone_id",
254
+ output_suffix=output_suffix,
255
+ )
256
+
257
+ if isinstance(value_columns, str):
258
+ return result.set_index("zone_id")[value_columns].to_dict()
259
+ elif isinstance(value_columns, list):
260
+ # If multiple value columns, return a dictionary of dictionaries
261
+ # Or, if preferred, a dictionary where values are lists/tuples of results
262
+ # For now, let's return a dict of series, which is common.
263
+ # The previous version implied a single dictionary result from map_points/polygons
264
+ # but with multiple columns, it's usually {zone_id: {col1: val1, col2: val2}}
265
+ # or {col_name: {zone_id: val}}
266
+ # In this version, it'll return a dictionary for each column.
267
+ return {
268
+ col: result.set_index("zone_id")[col].to_dict() for col in value_columns
269
+ }
270
+ else: # If value_columns is None, it should return point_count
177
271
  self.logger.warning(
178
- "Using default points mapping implementation. Consider creating a specialized mapping function."
272
+ "No `value_columns` provided. Mapping point counts. Consider passing `value_columns` and `aggregation` or `mapping_function`."
179
273
  )
180
- result = aggregate_points_to_zones(
181
- points=points,
182
- zones=self.zone_gdf,
183
- value_columns=value_columns,
184
- aggregation=aggregation,
185
- point_zone_predicate=predicate,
186
- zone_id_column="zone_id",
187
- output_suffix=output_suffix,
188
- )
189
-
190
- if not value_columns:
191
- return result["point_count"].to_dict()
192
-
193
- return result[value_columns].to_dict()
274
+ return result.set_index("zone_id")["point_count"].to_dict()
194
275
 
195
276
  def map_polygons(
196
277
  self,
197
- polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
278
+ polygons,
198
279
  value_columns: Optional[Union[str, List[str]]] = None,
199
- aggregation: Union[str, Dict[str, str]] = "sum",
200
- area_weighted: bool = False,
201
- area_column: str = "area_in_meters",
202
- mapping_function: Optional[Callable] = None,
203
- **mapping_kwargs,
280
+ aggregation: Union[str, Dict[str, str]] = "count",
281
+ predicate: str = "intersects",
282
+ **kwargs,
204
283
  ) -> Dict:
205
- """Map polygon data to zones with optional area weighting.
284
+ """
285
+ Maps polygon data to the instance's zones and aggregates values.
206
286
 
207
- Aggregates polygon data to zones based on spatial intersections. Values can be
208
- weighted by the fractional area of intersection between polygons and zones.
287
+ This method leverages `aggregate_polygons_to_zones` to perform a spatial
288
+ aggregation of polygon data onto the zones stored within this object instance.
289
+ It can count polygons, or aggregate their values, based on different spatial
290
+ relationships defined by the `predicate`.
209
291
 
210
292
  Args:
211
- polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): The polygon data to map.
212
- Must contain geometry information if DataFrame.
213
- value_columns (Union[str, List[str]], optional): Column name(s) to aggregate.
214
- If None, only intersection areas will be calculated.
215
- aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use.
216
- Can be a single string ("sum", "mean", "max", "min") or a dictionary
217
- mapping column names to specific aggregation methods. Defaults to "sum".
218
- area_weighted (bool): Whether to weight values by fractional area of
219
- intersection. Defaults to False.
220
- area_column (str): Name of column to store calculated areas. Only used
221
- if area calculation is needed. Defaults to "area_in_meters".
222
- mapping_function (Callable, optional): Custom function for mapping polygons
223
- to zones. If provided, signature should be mapping_function(self, polygons, **mapping_kwargs).
224
- When used, all other parameters except mapping_kwargs are ignored.
225
- **mapping_kwargs: Additional keyword arguments passed to the mapping function.
293
+ polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
294
+ The polygon data to map. Must contain geometry information if a
295
+ DataFrame.
296
+ value_columns (Union[str, List[str]], optional):
297
+ The column name(s) from the `polygons` data to aggregate. If `None`,
298
+ the method will automatically count the number of polygons that
299
+ match the given `predicate` for each zone.
300
+ aggregation (Union[str, Dict[str, str]], optional):
301
+ The aggregation method(s) to use. Can be a single string (e.g., "sum",
302
+ "mean", "max") or a dictionary mapping column names to specific
303
+ aggregation methods. This is ignored and set to "count" if
304
+ `value_columns` is `None`. Defaults to "count".
305
+ predicate (Literal["intersects", "within", "fractional"], optional):
306
+ The spatial relationship to use for aggregation:
307
+ - "intersects": Counts or aggregates values for any polygon that
308
+ intersects a zone.
309
+ - "within": Counts or aggregates values for polygons that are
310
+ entirely contained within a zone.
311
+ - "fractional": Performs area-weighted aggregation. The value of a
312
+ polygon is distributed proportionally to the area of its overlap
313
+ with each zone.
314
+ Defaults to "intersects".
315
+ **kwargs:
316
+ Additional keyword arguments to be passed to the underlying
317
+ `aggregate_polygons_to_zones` function.
226
318
 
227
319
  Returns:
228
- Dict: Dictionary with zone IDs as keys and aggregated values as values.
229
- Returns aggregated values for the specified value_columns.
320
+ Dict:
321
+ A dictionary or a nested dictionary containing the aggregated values,
322
+ with zone IDs as keys. If `value_columns` is a single string, the
323
+ return value is a dictionary mapping zone ID to the aggregated value.
324
+ If `value_columns` is a list, the return value is a nested dictionary
325
+ mapping each column name to its own dictionary of aggregated values.
230
326
 
231
327
  Raises:
232
- TypeError: If polygons cannot be converted to a GeoDataFrame.
328
+ ValueError: If `value_columns` is of an unexpected type after processing.
329
+
330
+ Example:
331
+ >>> # Assuming 'self' is an object with a 'zone_gdf' attribute
332
+ >>> # Count all land parcels that intersect each zone
333
+ >>> parcel_counts = self.map_polygons(landuse_polygons)
334
+ >>>
335
+ >>> # Aggregate total population within zones using area weighting
336
+ >>> population_by_zone = self.map_polygons(
337
+ ... landuse_polygons,
338
+ ... value_columns="population",
339
+ ... predicate="fractional",
340
+ ... aggregation="sum"
341
+ ... )
342
+ >>>
343
+ >>> # Get the sum of residential area and count of buildings within each zone
344
+ >>> residential_stats = self.map_polygons(
345
+ ... building_polygons,
346
+ ... value_columns=["residential_area_sqm", "building_id"],
347
+ ... aggregation={"residential_area_sqm": "sum", "building_id": "count"},
348
+ ... predicate="intersects"
349
+ ... )
233
350
  """
234
- if mapping_function is not None:
235
- return mapping_function(self, polygons, **mapping_kwargs)
236
-
237
- if area_column not in polygons_gdf:
238
- if not isinstance(polygons, gpd.GeoDataFrame):
239
- try:
240
- polygons_gdf = convert_to_geodataframe(polygons)
241
- except:
242
- raise TypeError(
243
- "polygons must be a GeoDataFrame or convertible to one"
244
- )
245
- else:
246
- polygons_gdf = polygons.copy()
247
-
248
- polygons_gdf[area_column] = polygons_gdf.to_crs(
249
- polygons_gdf.estimate_utm_crs()
250
- ).geometry.area
251
351
 
252
352
  if value_columns is None:
253
353
  self.logger.warning(
254
- "Using default polygon mapping implementation. Consider providing value_columns."
354
+ f"No value_columns specified. Defaulting to counting polygons with {predicate} predicate."
255
355
  )
256
- value_columns = area_column
356
+ temp_value_col = "_temp_polygon_count_dummy"
357
+ polygons[temp_value_col] = 1
358
+ actual_value_columns = temp_value_col
359
+ aggregation = "count" # Force count if no value columns
360
+ else:
361
+ actual_value_columns = value_columns
257
362
 
258
363
  result = aggregate_polygons_to_zones(
259
- polygons=polygons_gdf,
364
+ polygons=polygons,
260
365
  zones=self.zone_gdf,
261
- value_columns=value_columns,
366
+ value_columns=actual_value_columns,
262
367
  aggregation=aggregation,
263
- area_weighted=area_weighted,
368
+ predicate=predicate,
264
369
  zone_id_column="zone_id",
265
370
  )
266
371
 
267
- return result[value_columns].to_dict()
372
+ # Convert the result GeoDataFrame to the expected dictionary format
373
+ if isinstance(actual_value_columns, str):
374
+ return result.set_index("zone_id")[actual_value_columns].to_dict()
375
+ elif isinstance(actual_value_columns, list):
376
+ return {
377
+ col: result.set_index("zone_id")[col].to_dict()
378
+ for col in actual_value_columns
379
+ }
380
+ else:
381
+ raise ValueError("Unexpected type for actual_value_columns.")
268
382
 
269
383
  def map_rasters(
270
384
  self,
@@ -291,7 +405,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
291
405
 
292
406
  Returns:
293
407
  Union[np.ndarray, Dict]: By default, returns a NumPy array of sampled values
294
- with shape (n_zones, n_rasters), taking the first non-nodata value encountered.
408
+ with shape (n_zones, 1), taking the first non-nodata value encountered.
295
409
  Custom mapping functions may return different data structures.
296
410
 
297
411
  Note:
@@ -301,10 +415,6 @@ class ZonalViewGenerator(ABC, Generic[T]):
301
415
  if mapping_function is not None:
302
416
  return mapping_function(self, tif_processors, **mapping_kwargs)
303
417
 
304
- self.logger.warning(
305
- "Using default raster mapping implementation. Consider creating a specialized mapping function."
306
- )
307
-
308
418
  raster_crs = tif_processors[0].crs
309
419
 
310
420
  if raster_crs != self.zone_gdf.crs:
@@ -318,7 +428,9 @@ class ZonalViewGenerator(ABC, Generic[T]):
318
428
  tif_processors=tif_processors, polygon_list=zone_geoms, stat=stat
319
429
  )
320
430
 
321
- return sampled_values
431
+ zone_ids = self.get_zone_identifiers()
432
+
433
+ return {zone_id: value for zone_id, value in zip(zone_ids, sampled_values)}
322
434
 
323
435
  @lru_cache(maxsize=32)
324
436
  def _get_transformed_geometries(self, target_crs):
@@ -337,34 +449,78 @@ class ZonalViewGenerator(ABC, Generic[T]):
337
449
 
338
450
  def save_view(
339
451
  self,
340
- view_data: gpd.GeoDataFrame,
341
452
  name: str,
342
453
  output_format: Optional[str] = None,
343
454
  ) -> Path:
344
455
  """Save the generated zonal view to disk.
345
456
 
346
457
  Args:
347
- view_data (gpd.GeoDataFrame): The zonal view data to save.
348
458
  name (str): Base name for the output file (without extension).
349
459
  output_format (str, optional): File format to save in (e.g., "parquet",
350
- "geojson", "shp"). If None, uses the format specified in generator_config.
460
+ "geojson", "shp"). If None, uses the format specified in config.
351
461
 
352
462
  Returns:
353
463
  Path: The full path where the view was saved.
354
464
 
355
465
  Note:
356
- The output directory is determined by the generator_config.base_path setting.
466
+ The output directory is determined by the config.base_path setting.
357
467
  The file extension is automatically added based on the output format.
468
+ This method now saves the internal `self.view`.
358
469
  """
470
+ if self._view is None:
471
+ self.logger.warning(
472
+ "No variables have been added to the zonal view. Saving the base zone_gdf."
473
+ )
474
+ view_to_save = self.zone_gdf
475
+ else:
476
+ view_to_save = self._view
477
+
359
478
  format_to_use = output_format or self.config.output_format
360
479
  output_path = self.config.base_path / f"{name}.{format_to_use}"
361
480
 
362
481
  self.logger.info(f"Saving zonal view to {output_path}")
482
+
483
+ if format_to_use in ["geojson", "shp", "gpkg"]:
484
+ self.logger.warning(
485
+ f"Saving to {format_to_use} requires converting back to GeoDataFrame. Geometry column will be re-added."
486
+ )
487
+ # Re-add geometry for saving to geospatial formats
488
+ view_to_save = self.view.merge(
489
+ self.zone_gdf[["zone_id", "geometry"]], on="zone_id", how="left"
490
+ )
491
+
363
492
  write_dataset(
364
- df=view_data,
493
+ data=view_to_save,
365
494
  path=str(output_path),
366
495
  data_store=self.data_store,
367
- format=format_to_use,
368
496
  )
369
497
 
370
498
  return output_path
499
+
500
+ def to_dataframe(self) -> pd.DataFrame:
501
+ """
502
+ Returns the current zonal view as a DataFrame.
503
+
504
+ This method combines all accumulated variables in the view
505
+
506
+ Returns:
507
+ pd.DataFrame: The current view.
508
+ """
509
+ return self.view
510
+
511
+ def to_geodataframe(self) -> gpd.GeoDataFrame:
512
+ """
513
+ Returns the current zonal view merged with zone geometries as a GeoDataFrame.
514
+
515
+ This method combines all accumulated variables in the view with the corresponding
516
+ zone geometries, providing a spatially-enabled DataFrame for further analysis or export.
517
+
518
+ Returns:
519
+ gpd.GeoDataFrame: The current view merged with zone geometries.
520
+ """
521
+ return gpd.GeoDataFrame(
522
+ (self.view).merge(
523
+ self.zone_gdf[["zone_id", "geometry"]], on="zone_id", how="left"
524
+ ),
525
+ crs=self.zone_gdf.crs,
526
+ )