giga-spatial 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47):
  1. giga_spatial-0.6.0.dist-info/METADATA +141 -0
  2. giga_spatial-0.6.0.dist-info/RECORD +47 -0
  3. giga_spatial-0.6.0.dist-info/WHEEL +5 -0
  4. giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
  5. giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
  6. gigaspatial/__init__.py +1 -0
  7. gigaspatial/config.py +226 -0
  8. gigaspatial/core/__init__.py +0 -0
  9. gigaspatial/core/io/__init__.py +5 -0
  10. gigaspatial/core/io/adls_data_store.py +325 -0
  11. gigaspatial/core/io/data_api.py +113 -0
  12. gigaspatial/core/io/data_store.py +147 -0
  13. gigaspatial/core/io/local_data_store.py +92 -0
  14. gigaspatial/core/io/readers.py +265 -0
  15. gigaspatial/core/io/writers.py +128 -0
  16. gigaspatial/core/schemas/__init__.py +0 -0
  17. gigaspatial/core/schemas/entity.py +244 -0
  18. gigaspatial/generators/__init__.py +2 -0
  19. gigaspatial/generators/poi.py +636 -0
  20. gigaspatial/generators/zonal/__init__.py +3 -0
  21. gigaspatial/generators/zonal/base.py +370 -0
  22. gigaspatial/generators/zonal/geometry.py +439 -0
  23. gigaspatial/generators/zonal/mercator.py +78 -0
  24. gigaspatial/grid/__init__.py +1 -0
  25. gigaspatial/grid/mercator_tiles.py +286 -0
  26. gigaspatial/handlers/__init__.py +40 -0
  27. gigaspatial/handlers/base.py +761 -0
  28. gigaspatial/handlers/boundaries.py +305 -0
  29. gigaspatial/handlers/ghsl.py +772 -0
  30. gigaspatial/handlers/giga.py +145 -0
  31. gigaspatial/handlers/google_open_buildings.py +472 -0
  32. gigaspatial/handlers/hdx.py +241 -0
  33. gigaspatial/handlers/mapbox_image.py +208 -0
  34. gigaspatial/handlers/maxar_image.py +291 -0
  35. gigaspatial/handlers/microsoft_global_buildings.py +548 -0
  36. gigaspatial/handlers/ookla_speedtest.py +199 -0
  37. gigaspatial/handlers/opencellid.py +290 -0
  38. gigaspatial/handlers/osm.py +356 -0
  39. gigaspatial/handlers/overture.py +126 -0
  40. gigaspatial/handlers/rwi.py +157 -0
  41. gigaspatial/handlers/unicef_georepo.py +806 -0
  42. gigaspatial/handlers/worldpop.py +266 -0
  43. gigaspatial/processing/__init__.py +4 -0
  44. gigaspatial/processing/geo.py +1054 -0
  45. gigaspatial/processing/sat_images.py +39 -0
  46. gigaspatial/processing/tif_processor.py +477 -0
  47. gigaspatial/processing/utils.py +49 -0
@@ -0,0 +1,370 @@
1
+ from abc import ABC, abstractmethod
2
+ from pydantic import BaseModel, Field
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional, Union, Callable, TypeVar, Generic
5
+ from shapely.geometry import Polygon
6
+
7
+ import geopandas as gpd
8
+ import pandas as pd
9
+ import numpy as np
10
+
11
+ from gigaspatial.core.io.data_store import DataStore
12
+ from gigaspatial.core.io.local_data_store import LocalDataStore
13
+ from gigaspatial.core.io.writers import write_dataset
14
+ from gigaspatial.config import config as global_config
15
+ from gigaspatial.processing.geo import (
16
+ convert_to_geodataframe,
17
+ aggregate_polygons_to_zones,
18
+ aggregate_points_to_zones,
19
+ )
20
+ from gigaspatial.processing.tif_processor import (
21
+ TifProcessor,
22
+ sample_multiple_tifs_by_polygons,
23
+ )
24
+ from functools import lru_cache
25
+ import logging
26
+
27
+
28
class ZonalViewGeneratorConfig(BaseModel):
    """Configuration for zonal view generation.

    Attributes:
        base_path (Path): Base directory path for storing zonal views. Defaults to
            the path returned by ``global_config.get_path("zonal", "views")``.
            The default is evaluated once, when this class is defined.
        output_format (str): Default output format for saved views. Defaults to "parquet".
        ensure_available (bool): Defaults to True. Not referenced anywhere in this
            module; presumably consumed by subclasses or callers -- TODO confirm
            its intended meaning.
    """

    # Directory that save_view() joins with "<name>.<format>" to build the output path.
    base_path: Path = Field(default=global_config.get_path("zonal", "views"))
    # Used by save_view() when no explicit output_format argument is given.
    output_format: str = "parquet"
    # NOTE(review): unused within this file -- confirm where this flag is read.
    ensure_available: bool = True
40
+
41
+
42
T = TypeVar("T")  # Type of a zone identifier; parameterizes ZonalViewGenerator (see get_zone_identifiers)
43
+
44
+
45
class ZonalViewGenerator(ABC, Generic[T]):
    """Base class for mapping data to zonal datasets.

    This class provides the framework for mapping various data sources (points,
    polygons, rasters) to zonal geometries like grid tiles or catchment areas. It is
    an abstract base class: subclasses implement ``get_zonal_geometries`` and
    ``get_zone_identifiers`` to define a specific zonal system.

    The class supports three main types of data mapping:
        - Point data aggregation to zones
        - Polygon data aggregation with optional area weighting
        - Raster data sampling and statistics

    Attributes:
        config (ZonalViewGeneratorConfig): Configuration for the generator.
        data_store (DataStore): The data store used when saving views.
        logger: Logger instance for this class.
    """

    def __init__(
        self,
        config: Optional[ZonalViewGeneratorConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize the ZonalViewGenerator.

        Args:
            config (ZonalViewGeneratorConfig, optional): Configuration for the
                generator. If None, a default ``ZonalViewGeneratorConfig`` is used.
            data_store (DataStore, optional): The data store to use.
                If None, a ``LocalDataStore`` is created.
            logger (logging.Logger, optional): Logger to use. If None, one is
                obtained from the global config, named after the concrete class.
        """
        self.config = config or ZonalViewGeneratorConfig()
        self.data_store = data_store or LocalDataStore()
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

    @abstractmethod
    def get_zonal_geometries(self) -> List[Polygon]:
        """Get the geometries of the zones.

        This method must be implemented by subclasses to return the actual geometric
        shapes of the zones (e.g., grid tiles, catchment boundaries, administrative areas).

        Returns:
            List[Polygon]: A list of Shapely Polygon objects representing zone geometries.
        """
        pass

    @abstractmethod
    def get_zone_identifiers(self) -> List[T]:
        """Get unique identifiers for each zone.

        This method must be implemented by subclasses to return identifiers that
        correspond one-to-one (and in the same order) with the geometries returned
        by ``get_zonal_geometries()``; ``to_geodataframe`` pairs them positionally.

        Returns:
            List[T]: A list of zone identifiers (e.g., quadkeys, H3 indices, tile IDs).
                The type T is determined by the specific zonal system implementation.
        """
        pass

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Convert zones to a GeoDataFrame.

        Builds a new GeoDataFrame from the zone identifiers and geometries, tagged
        with the WGS84 (EPSG:4326) coordinate reference system.

        Returns:
            gpd.GeoDataFrame: A GeoDataFrame with 'zone_id' and 'geometry' columns.
        """
        return gpd.GeoDataFrame(
            {
                "zone_id": self.get_zone_identifiers(),
                "geometry": self.get_zonal_geometries(),
            },
            crs="EPSG:4326",
        )

    @property
    def zone_gdf(self) -> gpd.GeoDataFrame:
        """Cached GeoDataFrame of zones.

        Returns:
            gpd.GeoDataFrame: The result of ``to_geodataframe()``, computed on first
                access and stored on the instance as ``_zone_gdf`` afterwards.
        """
        if not hasattr(self, "_zone_gdf"):
            self._zone_gdf = self.to_geodataframe()
        return self._zone_gdf

    def map_points(
        self,
        points: Union[pd.DataFrame, gpd.GeoDataFrame],
        value_columns: Optional[Union[str, List[str]]] = None,
        aggregation: Union[str, Dict[str, str]] = "count",
        predicate: str = "within",
        output_suffix: str = "",
        mapping_function: Optional[Callable] = None,
        **mapping_kwargs,
    ) -> Dict:
        """Map point data to zones with spatial aggregation.

        Delegates to ``aggregate_points_to_zones`` unless a custom
        ``mapping_function`` is supplied.

        Args:
            points (Union[pd.DataFrame, gpd.GeoDataFrame]): The point data to map.
            value_columns (Union[str, List[str]], optional): Column name(s) containing
                values to aggregate. If None (or empty), only point counts are returned.
            aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use.
                Can be a single string ("count", "mean", "sum", "min", "max", etc.)
                or a dictionary mapping column names to aggregation methods.
            predicate (str): Spatial predicate for the point-to-zone relationship.
                Defaults to "within".
            output_suffix (str): Suffix forwarded to the aggregation helper for output
                column names. Defaults to empty string.
            mapping_function (Callable, optional): Custom function for mapping points
                to zones, called as ``mapping_function(self, points, **mapping_kwargs)``.
                When used, all other parameters except mapping_kwargs are ignored.
            **mapping_kwargs: Additional keyword arguments passed to the mapping
                function. Ignored by the default implementation.

        Returns:
            Dict: ``to_dict()`` of the selected result column(s): the "point_count"
                column when no value_columns are given, otherwise the value_columns.
                NOTE(review): keys are whatever index the aggregation helper returns
                (presumably zone IDs -- confirm); a list of value_columns yields a
                nested ``{column: {index: value}}`` dictionary.
        """
        if mapping_function is not None:
            return mapping_function(self, points, **mapping_kwargs)

        self.logger.warning(
            "Using default points mapping implementation. Consider creating a specialized mapping function."
        )
        result = aggregate_points_to_zones(
            points=points,
            zones=self.zone_gdf,
            value_columns=value_columns,
            aggregation=aggregation,
            point_zone_predicate=predicate,
            zone_id_column="zone_id",
            output_suffix=output_suffix,
        )

        if not value_columns:
            return result["point_count"].to_dict()

        return result[value_columns].to_dict()

    def map_polygons(
        self,
        polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
        value_columns: Optional[Union[str, List[str]]] = None,
        aggregation: Union[str, Dict[str, str]] = "sum",
        area_weighted: bool = False,
        area_column: str = "area_in_meters",
        mapping_function: Optional[Callable] = None,
        **mapping_kwargs,
    ) -> Dict:
        """Map polygon data to zones with optional area weighting.

        Delegates to ``aggregate_polygons_to_zones`` unless a custom
        ``mapping_function`` is supplied. If the input lacks ``area_column``, the
        column is computed on a copy from the geometry areas in an estimated UTM CRS.

        Args:
            polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): The polygon data to map.
                A plain DataFrame is converted with ``convert_to_geodataframe``.
            value_columns (Union[str, List[str]], optional): Column name(s) to aggregate.
                If None, ``area_column`` is aggregated instead.
            aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use.
                Can be a single string ("sum", "mean", "max", "min") or a dictionary
                mapping column names to specific aggregation methods. Defaults to "sum".
            area_weighted (bool): Whether to weight values by fractional area of
                intersection. Defaults to False.
            area_column (str): Name of the column holding polygon areas. Computed
                when missing from the input. Defaults to "area_in_meters".
            mapping_function (Callable, optional): Custom function for mapping polygons
                to zones, called as ``mapping_function(self, polygons, **mapping_kwargs)``.
                When used, all other parameters except mapping_kwargs are ignored.
            **mapping_kwargs: Additional keyword arguments passed to the mapping
                function. Ignored by the default implementation.

        Returns:
            Dict: ``to_dict()`` of the aggregated value_columns.
                NOTE(review): keys are whatever index the aggregation helper returns
                (presumably zone IDs -- confirm).

        Raises:
            TypeError: If polygons cannot be converted to a GeoDataFrame.
        """
        if mapping_function is not None:
            return mapping_function(self, polygons, **mapping_kwargs)

        # Resolve the GeoDataFrame first: the area-column check below needs it.
        # (Previously the check read polygons_gdf before it was ever assigned.)
        if isinstance(polygons, gpd.GeoDataFrame):
            polygons_gdf = polygons
        else:
            try:
                polygons_gdf = convert_to_geodataframe(polygons)
            except Exception as exc:
                raise TypeError(
                    "polygons must be a GeoDataFrame or convertible to one"
                ) from exc

        if area_column not in polygons_gdf.columns:
            # Work on a copy so the caller's frame does not gain a new column.
            polygons_gdf = polygons_gdf.copy()
            # Areas are measured in a projected (UTM) CRS estimated from the data.
            polygons_gdf[area_column] = polygons_gdf.to_crs(
                polygons_gdf.estimate_utm_crs()
            ).geometry.area

        if value_columns is None:
            self.logger.warning(
                "Using default polygon mapping implementation. Consider providing value_columns."
            )
            value_columns = area_column

        result = aggregate_polygons_to_zones(
            polygons=polygons_gdf,
            zones=self.zone_gdf,
            value_columns=value_columns,
            aggregation=aggregation,
            area_weighted=area_weighted,
            zone_id_column="zone_id",
        )

        return result[value_columns].to_dict()

    def map_rasters(
        self,
        tif_processors: List[TifProcessor],
        mapping_function: Optional[Callable] = None,
        stat: str = "mean",
        **mapping_kwargs,
    ) -> Union[np.ndarray, Dict]:
        """Map raster data to zones using zonal statistics.

        Delegates to ``sample_multiple_tifs_by_polygons`` unless a custom
        ``mapping_function`` is supplied. Zone geometries are reprojected to the CRS
        of the first processor when it differs from the zones' CRS.

        Args:
            tif_processors (List[TifProcessor]): TifProcessor objects for accessing
                raster data. Only the first processor's CRS is inspected, so all
                processors should share the same CRS.
            mapping_function (Callable, optional): Custom function for mapping rasters
                to zones, called as ``mapping_function(self, tif_processors, **mapping_kwargs)``.
                When used, stat and other parameters except mapping_kwargs are ignored.
            stat (str): Statistic forwarded to the sampling helper (e.g. "mean",
                "sum", "min", "max", "std"). Defaults to "mean".
            **mapping_kwargs: Additional keyword arguments passed to the mapping
                function. Ignored by the default implementation.

        Returns:
            Union[np.ndarray, Dict]: By default, whatever
                ``sample_multiple_tifs_by_polygons`` returns for the zone geometries
                (presumably one ``stat`` value per zone -- confirm exact shape against
                that helper). Custom mapping functions may return different structures.

        Raises:
            ValueError: If the default implementation is used with an empty
                ``tif_processors`` list.
        """
        if mapping_function is not None:
            return mapping_function(self, tif_processors, **mapping_kwargs)

        if not tif_processors:
            # Fail with a clear message instead of an IndexError on tif_processors[0].
            raise ValueError("tif_processors must contain at least one TifProcessor")

        self.logger.warning(
            "Using default raster mapping implementation. Consider creating a specialized mapping function."
        )

        raster_crs = tif_processors[0].crs

        if raster_crs != self.zone_gdf.crs:
            self.logger.info(f"Projecting zones to raster CRS: {raster_crs}")
            zone_geoms = self._get_transformed_geometries(raster_crs)
        else:
            zone_geoms = self.get_zonal_geometries()

        # Sample raster values
        sampled_values = sample_multiple_tifs_by_polygons(
            tif_processors=tif_processors, polygon_list=zone_geoms, stat=stat
        )

        return sampled_values

    def _get_transformed_geometries(self, target_crs):
        """Get zone geometries transformed to target coordinate reference system.

        Results are memoized per instance and per target CRS. The cache lives on the
        instance (not in a class-level ``lru_cache``) so it is released together
        with the generator instead of keeping the instance alive.

        Args:
            target_crs: Target coordinate reference system for transformation.
                Must be hashable, since it is used as the cache key.

        Returns:
            List[Polygon]: Zone geometries transformed to the target CRS. A fresh
                list is returned on each call so callers cannot alter the cache.
        """
        cache = getattr(self, "_transformed_geometries", None)
        if cache is None:
            cache = self._transformed_geometries = {}

        if target_crs not in cache:
            cache[target_crs] = self.zone_gdf.to_crs(target_crs).geometry.tolist()

        return list(cache[target_crs])

    def save_view(
        self,
        view_data: gpd.GeoDataFrame,
        name: str,
        output_format: Optional[str] = None,
    ) -> Path:
        """Save the generated zonal view through the configured data store.

        Args:
            view_data (gpd.GeoDataFrame): The zonal view data to save.
            name (str): Base name for the output file (without extension).
            output_format (str, optional): File format to save in (e.g., "parquet",
                "geojson", "shp"). If None, uses ``self.config.output_format``.

        Returns:
            Path: The full path where the view was saved.

        Note:
            The output location is ``self.config.base_path / f"{name}.{format}"``;
            the format string doubles as the file extension.
        """
        format_to_use = output_format or self.config.output_format
        output_path = self.config.base_path / f"{name}.{format_to_use}"

        self.logger.info(f"Saving zonal view to {output_path}")
        write_dataset(
            df=view_data,
            path=str(output_path),
            data_store=self.data_store,
            format=format_to_use,
        )

        return output_path