giga-spatial 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ from gigaspatial.processing.geo import (
18
18
  buffer_geodataframe,
19
19
  detect_coordinate_columns,
20
20
  aggregate_polygons_to_zones,
21
+ get_centroids,
21
22
  )
22
23
  from gigaspatial.processing.tif_processor import (
23
24
  sample_multiple_tifs_by_polygons,
@@ -63,6 +64,7 @@ class PoiViewGenerator:
63
64
  points: Union[
64
65
  List[Tuple[float, float]], List[dict], pd.DataFrame, gpd.GeoDataFrame
65
66
  ],
67
+ poi_id_column: str = "poi_id",
66
68
  config: Optional[PoiViewGeneratorConfig] = None,
67
69
  data_store: Optional[DataStore] = None,
68
70
  logger: logging.Logger = None,
@@ -87,16 +89,21 @@ class PoiViewGenerator:
87
89
  An instance of a data store for managing data access (e.g., LocalDataStore).
88
90
  If None, a default `LocalDataStore` will be used.
89
91
  """
92
+ if hasattr(points, "__len__") and len(points) == 0:
93
+ raise ValueError("Points input cannot be empty")
94
+
90
95
  self.config = config or PoiViewGeneratorConfig()
91
96
  self.data_store = data_store or LocalDataStore()
92
97
  self.logger = logger or global_config.get_logger(self.__class__.__name__)
93
- self._points_gdf = self._init_points_gdf(points)
98
+ self._points_gdf = self._init_points_gdf(points, poi_id_column)
99
+ self._view: pd.DataFrame = self._points_gdf.drop(columns=["geometry"])
94
100
 
95
101
  @staticmethod
96
102
  def _init_points_gdf(
97
103
  points: Union[
98
104
  List[Tuple[float, float]], List[dict], pd.DataFrame, gpd.GeoDataFrame
99
105
  ],
106
+ poi_id_column: str,
100
107
  ) -> gpd.GeoDataFrame:
101
108
  """
102
109
  Internal static method to convert various point input formats into a GeoDataFrame.
@@ -125,8 +132,19 @@ class PoiViewGenerator:
125
132
  points = points.copy()
126
133
  points["latitude"] = points.geometry.y
127
134
  points["longitude"] = points.geometry.x
128
- if "poi_id" not in points.columns:
135
+ if poi_id_column not in points.columns:
129
136
  points["poi_id"] = [f"poi_{i}" for i in range(len(points))]
137
+ else:
138
+ points = points.rename(
139
+ columns={poi_id_column: "poi_id"},
140
+ )
141
+ if points["poi_id"].duplicated().any():
142
+ raise ValueError(
143
+ f"Column '{poi_id_column}' provided as 'poi_id_column' contains duplicate values."
144
+ )
145
+
146
+ if points.crs != "EPSG:4326":
147
+ points = points.to_crs("EPSG:4326")
130
148
  return points
131
149
 
132
150
  elif isinstance(points, pd.DataFrame):
@@ -136,8 +154,16 @@ class PoiViewGenerator:
136
154
  points = points.copy()
137
155
  points["latitude"] = points[lat_col]
138
156
  points["longitude"] = points[lon_col]
139
- if "poi_id" not in points.columns:
157
+ if poi_id_column not in points.columns:
140
158
  points["poi_id"] = [f"poi_{i}" for i in range(len(points))]
159
+ else:
160
+ points = points.rename(
161
+ columns={poi_id_column: "poi_id"},
162
+ )
163
+ if points["poi_id"].duplicated().any():
164
+ raise ValueError(
165
+ f"Column '{poi_id_column}' provided as 'poi_id_column' contains duplicate values."
166
+ )
141
167
  return convert_to_geodataframe(points)
142
168
  except ValueError as e:
143
169
  raise ValueError(
@@ -165,8 +191,16 @@ class PoiViewGenerator:
165
191
  lat_col, lon_col = detect_coordinate_columns(df)
166
192
  df["latitude"] = df[lat_col]
167
193
  df["longitude"] = df[lon_col]
168
- if "poi_id" not in df.columns:
194
+ if poi_id_column not in df.columns:
169
195
  df["poi_id"] = [f"poi_{i}" for i in range(len(points))]
196
+ else:
197
+ df = df.rename(
198
+ columns={poi_id_column: "poi_id"},
199
+ )
200
+ if df["poi_id"].duplicated().any():
201
+ raise ValueError(
202
+ f"Column '{poi_id_column}' provided as 'poi_id_column' contains duplicate values."
203
+ )
170
204
  return convert_to_geodataframe(df)
171
205
  except ValueError as e:
172
206
  raise ValueError(
@@ -180,6 +214,53 @@ class PoiViewGenerator:
180
214
  """Gets the internal GeoDataFrame of points of interest."""
181
215
  return self._points_gdf
182
216
 
217
@property
def view(self) -> pd.DataFrame:
    """Return the DataFrame that holds the current point-of-interest view.

    The object stored in ``self._view`` is handed back as-is; no copy is
    made by this accessor.
    """
    current_view = self._view
    return current_view
221
+
222
def _update_view(self, new_data: pd.DataFrame) -> None:
    """
    Internal helper that merges new per-POI columns into the main view.

    The view keeps every one of its rows (left join on 'poi_id'); POIs that
    do not appear in ``new_data`` receive NaN in the added columns. If
    ``new_data`` carries a column that already exists in the view (for
    example when the same mapping is run twice), the existing column is
    replaced by the new values instead of the join failing on overlapping
    column names. A replaced column is re-appended at the end of the view.

    Args:
        new_data (pd.DataFrame): A DataFrame containing 'poi_id' and the new
            columns to be merged into the main view.

    Raises:
        ValueError: If ``new_data`` has no 'poi_id' column.
    """
    if "poi_id" not in new_data.columns:
        available_cols = list(new_data.columns)
        raise ValueError(
            f"new_data DataFrame must contain 'poi_id' column. "
            f"Available columns: {available_cols}"
        )

    # POIs present in the view but absent from new_data end up as NaN.
    missing_pois = set(self._view["poi_id"]) - set(new_data["poi_id"])
    if missing_pois:
        self.logger.warning(
            f"{len(missing_pois)} POIs will have NaN values for new columns"
        )

    # set_index already returns a new object, so no extra copy is needed.
    new_data_indexed = new_data.set_index("poi_id")

    # DataFrame.join raises on overlapping column names when no suffix is
    # given, so drop the stale columns first and let the new values win.
    overlapping = [
        column
        for column in new_data_indexed.columns
        if column in self._view.columns
    ]
    if overlapping:
        self.logger.info(f"Overwriting existing view columns: {overlapping}")

    self._view = (
        self._view.drop(columns=overlapping)
        .set_index("poi_id")
        .join(new_data_indexed, how="left")
        .reset_index()
    )

    self.logger.debug(
        f"View updated with columns: {list(new_data_indexed.columns)}"
    )
263
+
183
264
  def map_nearest_points(
184
265
  self,
185
266
  points_df: Union[pd.DataFrame, gpd.GeoDataFrame],
@@ -228,7 +309,7 @@ class PoiViewGenerator:
228
309
  # Validate input DataFrame
229
310
  if points_df.empty:
230
311
  self.logger.info("No points found in the input DataFrame")
231
- return self.points_gdf.copy()
312
+ return self.view
232
313
 
233
314
  # Handle GeoDataFrame
234
315
  if isinstance(points_df, gpd.GeoDataFrame):
@@ -275,14 +356,19 @@ class PoiViewGenerator:
275
356
  lat2=df_nearest[lat_column],
276
357
  lon2=df_nearest[lon_column],
277
358
  )
278
- result = points_df_poi.copy()
279
- result[f"{output_prefix}_id"] = df_nearest[id_column].to_numpy()
280
- result[f"{output_prefix}_distance"] = dist
359
+ # Create a temporary DataFrame to hold the results for merging
360
+ temp_result_df = pd.DataFrame(
361
+ {
362
+ "poi_id": points_df_poi["poi_id"],
363
+ f"{output_prefix}_id": points_df.iloc[idx][id_column].values,
364
+ f"{output_prefix}_distance": dist,
365
+ }
366
+ )
367
+ self._update_view(temp_result_df)
281
368
  self.logger.info(
282
369
  f"Nearest points mapping complete with prefix '{output_prefix}'"
283
370
  )
284
- self._points_gdf = result
285
- return result
371
+ return self.view
286
372
 
287
373
  def map_google_buildings(
288
374
  self,
@@ -316,7 +402,7 @@ class PoiViewGenerator:
316
402
  )
317
403
  if buildings_df is None or len(buildings_df) == 0:
318
404
  self.logger.info("No Google buildings data found for the provided POIs")
319
- return self.points_gdf.copy()
405
+ return self.view
320
406
 
321
407
  return self.map_nearest_points(
322
408
  points_df=buildings_df,
@@ -359,16 +445,17 @@ class PoiViewGenerator:
359
445
  self.logger.info("No Microsoft buildings data found for the provided POIs")
360
446
  return self.points_gdf.copy()
361
447
 
448
+ building_centroids = get_centroids(buildings_gdf)
449
+
362
450
  if "building_id" not in buildings_gdf:
363
451
  self.logger.info("Creating building IDs from coordinates")
364
- buildings_gdf = buildings_gdf.copy()
365
- buildings_gdf["building_id"] = buildings_gdf.apply(
452
+ building_centroids["building_id"] = building_centroids.apply(
366
453
  lambda row: f"{row.geometry.y:.6f}_{row.geometry.x:.6f}",
367
454
  axis=1,
368
455
  )
369
456
 
370
457
  return self.map_nearest_points(
371
- points_df=buildings_gdf,
458
+ points_df=building_centroids,
372
459
  id_column="building_id",
373
460
  output_prefix="nearest_ms_building",
374
461
  **kwargs,
@@ -424,35 +511,52 @@ class PoiViewGenerator:
424
511
  ValueError: If no valid data is provided, if parameters are incompatible,
425
512
  or if required parameters (value_column) are missing for polygon data.
426
513
  """
514
+
427
515
  if isinstance(data, list) and all(isinstance(x, TifProcessor) for x in data):
516
+ results_df = pd.DataFrame({"poi_id": self.points_gdf["poi_id"]})
517
+
428
518
  # Handle raster data
429
519
  if not data:
430
520
  self.logger.info("No valid raster data found for the provided POIs")
431
- return self.points_gdf.copy()
521
+ return self.view
522
+
523
+ raster_crs = data[0].crs
524
+
525
+ if not all(tp.crs == raster_crs for tp in data):
526
+ raise ValueError(
527
+ "All TifProcessors must have the same CRS for zonal statistics."
528
+ )
432
529
 
433
530
  if map_radius_meters is not None:
434
531
  self.logger.info(
435
532
  f"Calculating {stat} within {map_radius_meters}m buffers around POIs"
436
533
  )
437
534
  # Create buffers around POIs
438
- polygon_list = buffer_geodataframe(
535
+ buffers_gdf = buffer_geodataframe(
439
536
  self.points_gdf,
440
537
  buffer_distance_meters=map_radius_meters,
441
538
  cap_style="round",
442
- ).geometry
539
+ )
443
540
 
444
541
  # Calculate zonal statistics
445
542
  sampled_values = sample_multiple_tifs_by_polygons(
446
- tif_processors=data, polygon_list=polygon_list, stat=stat, **kwargs
543
+ tif_processors=data,
544
+ polygon_list=buffers_gdf.to_crs(raster_crs).geometry,
545
+ stat=stat,
546
+ **kwargs,
447
547
  )
448
548
  else:
449
549
  self.logger.info(f"Sampling {stat} at POI locations")
450
550
  # Sample directly at POI locations
451
- coord_list = self.points_gdf[["latitude", "longitude"]].to_numpy()
551
+ coord_list = (
552
+ self.points_gdf.to_crs(raster_crs).get_coordinates().to_numpy()
553
+ )
452
554
  sampled_values = sample_multiple_tifs_by_coordinates(
453
555
  tif_processors=data, coordinate_list=coord_list, **kwargs
454
556
  )
455
557
 
558
+ results_df[output_column] = sampled_values
559
+
456
560
  elif isinstance(data, gpd.GeoDataFrame):
457
561
  # Handle polygon data
458
562
  if data.empty:
@@ -465,6 +569,11 @@ class PoiViewGenerator:
465
569
  if value_column is None:
466
570
  raise ValueError("value_column must be provided for polygon data")
467
571
 
572
+ if value_column not in data.columns:
573
+ raise ValueError(
574
+ f"Value column '{value_column}' not found in input polygon GeoDataFrame."
575
+ )
576
+
468
577
  self.logger.info(
469
578
  f"Aggregating {value_column} within {map_radius_meters}m buffers around POIs"
470
579
  )
@@ -477,7 +586,7 @@ class PoiViewGenerator:
477
586
  )
478
587
 
479
588
  # Aggregate polygons to buffers
480
- result = aggregate_polygons_to_zones(
589
+ aggregation_result_gdf = aggregate_polygons_to_zones(
481
590
  polygons=data,
482
591
  zones=buffer_gdf,
483
592
  value_columns=value_column,
@@ -487,19 +596,18 @@ class PoiViewGenerator:
487
596
  **kwargs,
488
597
  )
489
598
 
490
- # Extract values for each POI
491
- sampled_values = result[value_column].values
599
+ results_df = aggregation_result_gdf[["poi_id", value_column]].copy()
492
600
 
493
601
  else:
494
602
  raise ValueError(
495
603
  "data must be either a list of TifProcessor objects or a GeoDataFrame"
496
604
  )
497
605
 
498
- result = self.points_gdf.copy()
499
- result[output_column] = sampled_values
500
- self.logger.info(f"Zonal statistics mapping complete: {output_column}")
501
- self._points_gdf = result
502
- return result
606
+ self._update_view(results_df)
607
+ self.logger.info(
608
+ f"Zonal statistics mapping complete for column(s) derived from '{output_column}' or '{value_column}'"
609
+ )
610
+ return self.view
503
611
 
504
612
  def map_built_s(
505
613
  self,
@@ -539,10 +647,9 @@ class PoiViewGenerator:
539
647
  data_store=self.data_store,
540
648
  **kwargs,
541
649
  )
542
- gdf_points = self.points_gdf.to_crs(handler.config.crs)
543
650
  self.logger.info("Loading GHSL Built Surface raster tiles")
544
651
  tif_processors = handler.load_data(
545
- gdf_points, ensure_available=self.config.ensure_available
652
+ self.points_gdf.copy(), ensure_available=self.config.ensure_available
546
653
  )
547
654
 
548
655
  return self.map_zonal_stats(
@@ -557,7 +664,7 @@ class PoiViewGenerator:
557
664
  self,
558
665
  stat="median",
559
666
  dataset_year=2020,
560
- dataset_resolution=100,
667
+ dataset_resolution=1000,
561
668
  output_column="smod_class",
562
669
  **kwargs,
563
670
  ) -> pd.DataFrame:
@@ -589,10 +696,9 @@ class PoiViewGenerator:
589
696
  **kwargs,
590
697
  )
591
698
 
592
- gdf_points = self.points_gdf.to_crs(handler.config.crs)
593
699
  self.logger.info("Loading GHSL SMOD raster tiles")
594
700
  tif_processors = handler.load_data(
595
- gdf_points, ensure_available=self.config.ensure_available
701
+ self.points_gdf.copy(), ensure_available=self.config.ensure_available
596
702
  )
597
703
 
598
704
  return self.map_zonal_stats(
@@ -608,29 +714,108 @@ class PoiViewGenerator:
608
714
  output_format: Optional[str] = None,
609
715
  ) -> Path:
610
716
  """
611
- Saves the current POI view (the enriched GeoDataFrame) to a file.
717
+ Saves the current POI view (the enriched DataFrame) to a file.
612
718
 
613
- The output path and format are determined by the `generator_config`
719
+ The output path and format are determined by the `config`
614
720
  or overridden by the `output_format` parameter.
615
721
 
616
722
  Args:
617
723
  name (str): The base name for the output file (without extension).
618
724
  output_format (Optional[str]):
619
725
  The desired output format (e.g., "csv", "geojson"). If None,
620
- the `output_format` from `generator_config` will be used.
726
+ the `output_format` from `config` will be used.
621
727
 
622
728
  Returns:
623
729
  Path: The full path to the saved output file.
624
730
  """
625
- format_to_use = output_format or self.generator_config.output_format
626
- output_path = self.generator_config.base_path / f"{name}.{format_to_use}"
731
+ format_to_use = output_format or self.config.output_format
732
+ output_path = self.config.base_path / f"{name}.{format_to_use}"
627
733
 
628
734
  self.logger.info(f"Saving POI view to {output_path}")
629
- write_dataset(
630
- df=self.points_gdf,
631
- path=str(output_path),
632
- data_store=self.data_store,
633
- format=format_to_use,
634
- )
735
+ # Save the current view, which is a pandas DataFrame, not a GeoDataFrame
736
+ # GeoJSON/Shapefile formats would require converting back to GeoDataFrame first.
737
+ # For CSV, Parquet, Feather, this is fine.
738
+ if format_to_use in ["geojson", "shp", "gpkg"]:
739
+ self.logger.warning(
740
+ f"Saving to {format_to_use} requires converting back to GeoDataFrame. Geometry column will be re-added."
741
+ )
742
+ # Re-add geometry for saving to geospatial formats
743
+ view_to_save_gdf = self.view.merge(
744
+ self.points_gdf[["poi_id", "geometry"]], on="poi_id", how="left"
745
+ )
746
+ write_dataset(
747
+ data=view_to_save_gdf,
748
+ path=str(output_path),
749
+ data_store=self.data_store,
750
+ )
751
+ else:
752
+ write_dataset(
753
+ data=self.view, # Use the internal _view DataFrame
754
+ path=str(output_path),
755
+ data_store=self.data_store,
756
+ )
635
757
 
636
758
  return output_path
759
+
760
def to_dataframe(self) -> pd.DataFrame:
    """
    Return the current POI view as a DataFrame.

    This is a method-style alias for the ``view`` property: it returns
    whatever ``self.view`` yields, with all variables accumulated so far.

    Returns:
        pd.DataFrame: The current view.
    """
    frame = self.view
    return frame
770
+
771
def to_geodataframe(self) -> "gpd.GeoDataFrame":
    """
    Return the current POI view joined with the original point geometries.

    The view is a plain pandas DataFrame, and a merge whose left operand is
    a plain DataFrame yields a plain DataFrame even when the right operand
    is a GeoDataFrame. The merged table is therefore wrapped explicitly so
    the caller really receives a GeoDataFrame that carries the CRS of the
    stored points.

    Returns:
        gpd.GeoDataFrame: All view columns plus a 'geometry' column, in the
        CRS of ``self.points_gdf``.
    """
    geometries = self.points_gdf[["poi_id", "geometry"]]
    merged = self.view.merge(geometries, on="poi_id", how="left")
    return gpd.GeoDataFrame(
        merged,
        geometry="geometry",
        crs=self.points_gdf.crs,
    )
784
+
785
def chain_operations(self, operations: List[dict]) -> "PoiViewGenerator":
    """
    Run several mapping operations in sequence and return the generator.

    Each entry names an attribute of this object under 'method' and may
    supply keyword arguments under 'kwargs' (defaults to no arguments).
    Entries are executed in list order; processing stops at the first
    entry whose method name does not exist on the object.

    Args:
        operations: List of dicts with 'method' and 'kwargs' keys

    Returns:
        The same generator instance, so calls can be chained.

    Raises:
        AttributeError: If a named method is not found on this object.

    Example:
        generator.chain_operations([
            {'method': 'map_google_buildings', 'kwargs': {}},
            {'method': 'map_built_s', 'kwargs': {'map_radius_meters': 200}},
        ])
    """
    for step in operations:
        target = step["method"]
        params = step.get("kwargs", {})
        # Guard clause: unknown names fail before anything is invoked.
        if not hasattr(self, target):
            raise AttributeError(f"Method {target} not found")
        getattr(self, target)(**params)
    return self
806
+
807
def validate_data_coverage(self, data_bounds: "gpd.GeoDataFrame") -> dict:
    """
    Report how many POIs fall within the data coverage area.

    The coverage area is the union of all geometries in ``data_bounds``.
    That union is a bare geometry without CRS information, so when both
    inputs declare a CRS and they differ, ``data_bounds`` is first
    reprojected to the CRS of the POIs; otherwise the containment test
    would silently compare coordinates from different systems.

    Args:
        data_bounds: Geometries describing where data is available.

    Returns:
        dict: Coverage statistics with the keys 'total_pois',
        'covered_pois', 'coverage_percentage' (0-100, and 0.0 when there
        are no POIs) and 'uncovered_pois'.
    """
    points = self.points_gdf

    bounds_crs = getattr(data_bounds, "crs", None)
    points_crs = getattr(points, "crs", None)
    if bounds_crs is not None and points_crs is not None and bounds_crs != points_crs:
        data_bounds = data_bounds.to_crs(points_crs)

    poi_within = points.within(data_bounds.union_all())

    total = len(points)
    covered = int(poi_within.sum())  # computed once, as a plain int
    return {
        "total_pois": total,
        "covered_pois": covered,
        "coverage_percentage": (covered / total) * 100 if total else 0.0,
        "uncovered_pois": total - covered,
    }
@@ -1,3 +1,4 @@
1
1
  from gigaspatial.generators.zonal.base import ZonalViewGeneratorConfig
2
2
  from gigaspatial.generators.zonal.geometry import GeometryBasedZonalViewGenerator
3
- from gigaspatial.generators.poi import PoiViewGenerator, PoiViewGeneratorConfig
3
+ from gigaspatial.generators.zonal.mercator import MercatorViewGenerator
4
+ from gigaspatial.generators.zonal.admin import AdminBoundariesViewGenerator
@@ -0,0 +1,84 @@
1
+ from typing import Optional, Union
2
+ from pathlib import Path
3
+
4
+ import logging
5
+
6
+ from gigaspatial.core.io.data_store import DataStore
7
+ from gigaspatial.handlers.boundaries import AdminBoundaries
8
+ from gigaspatial.generators.zonal.base import (
9
+ ZonalViewGeneratorConfig,
10
+ T,
11
+ )
12
+ from gigaspatial.generators.zonal.geometry import GeometryBasedZonalViewGenerator
13
+
14
+
15
class AdminBoundariesViewGenerator(GeometryBasedZonalViewGenerator[T]):
    """
    Zonal view generator whose zones are administrative boundaries.

    On construction the boundaries for one country and one administrative
    level are obtained through the `AdminBoundaries` handler, converted to a
    GeoDataFrame and handed to `GeometryBasedZonalViewGenerator` as the zone
    geometries, with the column "id" used as the zone identifier.

    Attributes:
        country (str): The name or code of the country whose boundaries are loaded.
        admin_level (int): The administrative level to load (e.g., 0 for country,
            1 for states/provinces).
        admin_path (Union[str, Path], optional): Optional path to a local boundaries
            file; it is forwarded unchanged to `AdminBoundaries.create`.
        config (Optional[ZonalViewGeneratorConfig]): Configuration for the zonal
            view generation process.
        data_store (Optional[DataStore]): A DataStore instance for accessing data.
        logger (Optional[logging.Logger]): A logger instance for logging messages.
    """

    def __init__(
        self,
        country: str,
        admin_level: int,
        data_store: Optional[DataStore] = None,
        admin_path: Optional[Union[str, Path]] = None,
        config: Optional[ZonalViewGeneratorConfig] = None,
        logger: logging.Logger = None,
    ):
        """
        Build the generator for the given country and administrative level.

        Args:
            country (str): The name or code of the country (e.g., "USA", "Germany").
            admin_level (int): The administrative level to load (e.g., 0 for country,
                1 for states, 2 for districts).
            data_store (Optional[DataStore]): Data storage interface; passed both to
                the boundary loader and to the parent generator.
            admin_path (Union[str, Path], optional): Path to a local administrative
                boundaries file, forwarded to the boundary loader.
            config (Optional[ZonalViewGeneratorConfig]): Configuration for the zonal
                view generator, forwarded to the parent class.
            logger (Optional[logging.Logger]): Custom logger instance, forwarded to
                the parent class.
        """
        # Zones are resolved first so they can be handed to the parent
        # initializer as ready-made zone data.
        boundary_zones = self._init_zone_data(
            country, admin_level, data_store, admin_path
        )
        super().__init__(
            zone_data=boundary_zones,
            zone_id_column="id",
            config=config,
            data_store=data_store,
            logger=logger,
        )
        # NOTE(review): self.logger is presumably set by the parent
        # initializer - confirm against GeometryBasedZonalViewGenerator.
        self.logger.info(
            f"Initialized AdminBoundariesViewGenerator for {country} (level {admin_level})"
        )

    def _init_zone_data(
        self,
        country,
        admin_level,
        data_store: Optional[DataStore] = None,
        admin_path: Optional[Union[str, Path]] = None,
    ):
        """Load the administrative boundaries and return them as a GeoDataFrame."""
        boundaries = AdminBoundaries.create(
            country, admin_level, data_store, admin_path
        )
        return boundaries.to_geodataframe()