giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from pydantic import BaseModel, Field
3
3
  from pathlib import Path
4
- from typing import Dict, List, Optional, Union, Callable, TypeVar, Generic
4
+ from typing import Dict, List, Optional, Union, TypeVar, Generic
5
5
  from shapely.geometry import Polygon
6
6
 
7
7
  import geopandas as gpd
@@ -16,10 +16,8 @@ from gigaspatial.processing.geo import (
16
16
  aggregate_polygons_to_zones,
17
17
  aggregate_points_to_zones,
18
18
  )
19
- from gigaspatial.processing.tif_processor import (
20
- TifProcessor,
21
- sample_multiple_tifs_by_polygons,
22
- )
19
+ from gigaspatial.processing.tif_processor import TifProcessor
20
+
23
21
  from functools import lru_cache
24
22
  import logging
25
23
 
@@ -209,8 +207,6 @@ class ZonalViewGenerator(ABC, Generic[T]):
209
207
  aggregation: Union[str, Dict[str, str]] = "count",
210
208
  predicate: str = "within",
211
209
  output_suffix: str = "",
212
- mapping_function: Optional[Callable] = None,
213
- **mapping_kwargs,
214
210
  ) -> Dict:
215
211
  """Map point data to zones with spatial aggregation.
216
212
 
@@ -228,18 +224,12 @@ class ZonalViewGenerator(ABC, Generic[T]):
228
224
  predicate (str): Spatial predicate for point-to-zone relationship.
229
225
  Options include "within", "intersects", "contains". Defaults to "within".
230
226
  output_suffix (str): Suffix to add to output column names. Defaults to empty string.
231
- mapping_function (Callable, optional): Custom function for mapping points to zones.
232
- If provided, signature should be mapping_function(self, points, **mapping_kwargs).
233
- When used, all other parameters except mapping_kwargs are ignored.
234
- **mapping_kwargs: Additional keyword arguments passed to the mapping function.
235
227
 
236
228
  Returns:
237
229
  Dict: Dictionary with zone IDs as keys and aggregated values as values.
238
230
  If value_columns is None, returns point counts per zone.
239
231
  If value_columns is specified, returns aggregated values per zone.
240
232
  """
241
- if mapping_function is not None:
242
- return mapping_function(self, points, **mapping_kwargs)
243
233
 
244
234
  self.logger.warning(
245
235
  "Using default points mapping implementation. Consider creating a specialized mapping function."
@@ -382,40 +372,63 @@ class ZonalViewGenerator(ABC, Generic[T]):
382
372
 
383
373
  def map_rasters(
384
374
  self,
385
- tif_processors: List[TifProcessor],
386
- mapping_function: Optional[Callable] = None,
375
+ raster_data: Union[TifProcessor, List[TifProcessor]],
387
376
  stat: str = "mean",
388
- **mapping_kwargs,
389
- ) -> Union[np.ndarray, Dict]:
377
+ **kwargs,
378
+ ) -> Dict:
390
379
  """Map raster data to zones using zonal statistics.
391
380
 
392
381
  Samples raster values within each zone and computes statistics. Automatically
393
382
  handles coordinate reference system transformations between raster and zone data.
394
383
 
395
384
  Args:
396
- tif_processors (List[TifProcessor]): List of TifProcessor objects for
397
- accessing raster data. All processors should have the same CRS.
385
+ raster_data (Union[TifProcessor, List[TifProcessor]]):
386
+ Either a TifProcessor object or a list of TifProcessor objects (which will be merged
387
+ into a single TifProcessor for processing).
398
388
  mapping_function (Callable, optional): Custom function for mapping rasters
399
389
  to zones. If provided, signature should be mapping_function(self, tif_processors, **mapping_kwargs).
400
390
  When used, stat and other parameters except mapping_kwargs are ignored.
401
391
  stat (str): Statistic to calculate when aggregating raster values within
402
392
  each zone. Options include "mean", "sum", "min", "max", "std", etc.
403
393
  Defaults to "mean".
404
- **mapping_kwargs: Additional keyword arguments passed to the mapping function.
394
+ **kwargs: Additional keyword arguments passed to TifProcessor when merging multiple rasters.
405
395
 
406
396
  Returns:
407
- Union[np.ndarray, Dict]: By default, returns a NumPy array of sampled values
408
- with shape (n_zones, 1), taking the first non-nodata value encountered.
409
- Custom mapping functions may return different data structures.
397
+ Dict: By default, returns a dictionary of sampled values
398
+ with zone IDs as keys.
410
399
 
411
400
  Note:
412
401
  If the coordinate reference system of the rasters differs from the zones,
413
402
  the zone geometries will be automatically transformed to match the raster CRS.
414
403
  """
415
- if mapping_function is not None:
416
- return mapping_function(self, tif_processors, **mapping_kwargs)
404
+ raster_processor: Optional[TifProcessor] = None
405
+
406
+ if isinstance(raster_data, TifProcessor):
407
+ raster_processor = raster_data
408
+ elif isinstance(raster_data, list) and all(
409
+ isinstance(x, TifProcessor) for x in raster_data
410
+ ):
411
+ if not raster_data:
412
+ self.logger.info("No valid raster data provided")
413
+ return self.view
414
+
415
+ if len(raster_data) > 1:
416
+ all_source_paths = [tp.dataset_path for tp in raster_data]
417
+
418
+ self.logger.info(
419
+ f"Merging {len(all_source_paths)} rasters into a single TifProcessor for zonal statistics."
420
+ )
421
+ raster_processor = TifProcessor(
422
+ dataset_path=all_source_paths, data_store=self.data_store, **kwargs
423
+ )
424
+ else:
425
+ raster_processor = raster_data[0]
426
+ else:
427
+ raise ValueError(
428
+ "raster_data must be a TifProcessor object or a list of TifProcessor objects."
429
+ )
417
430
 
418
- raster_crs = tif_processors[0].crs
431
+ raster_crs = raster_processor.crs
419
432
 
420
433
  if raster_crs != self.zone_gdf.crs:
421
434
  self.logger.info(f"Projecting zones to raster CRS: {raster_crs}")
@@ -424,8 +437,8 @@ class ZonalViewGenerator(ABC, Generic[T]):
424
437
  zone_geoms = self.get_zonal_geometries()
425
438
 
426
439
  # Sample raster values
427
- sampled_values = sample_multiple_tifs_by_polygons(
428
- tif_processors=tif_processors, polygon_list=zone_geoms, stat=stat
440
+ sampled_values = raster_processor.sample_by_polygons(
441
+ polygon_list=zone_geoms, stat=stat
429
442
  )
430
443
 
431
444
  zone_ids = self.get_zone_identifiers()
@@ -261,13 +261,16 @@ class GeometryBasedZonalViewGenerator(ZonalViewGenerator[T]):
261
261
  f"Mapping {handler.config.product} data (year: {handler.config.year}, resolution: {handler.config.resolution}m)"
262
262
  )
263
263
  tif_processors = handler.load_data(
264
- self.zone_gdf, ensure_available=self.config.ensure_available
264
+ self.zone_gdf,
265
+ ensure_available=self.config.ensure_available,
266
+ merge_rasters=True,
267
+ **kwargs,
265
268
  )
266
269
 
267
270
  self.logger.info(
268
271
  f"Sampling {handler.config.product} data using '{stat}' statistic"
269
272
  )
270
- sampled_values = self.map_rasters(tif_processors=tif_processors, stat=stat)
273
+ sampled_values = self.map_rasters(raster_data=tif_processors, stat=stat)
271
274
 
272
275
  column_name = (
273
276
  output_column
@@ -488,57 +491,97 @@ class GeometryBasedZonalViewGenerator(ZonalViewGenerator[T]):
488
491
  self,
489
492
  country: Union[str, List[str]],
490
493
  resolution=1000,
491
- predicate: Literal["intersects", "fractional"] = "intersects",
494
+ predicate: Literal[
495
+ "centroid_within", "intersects", "fractional"
496
+ ] = "intersects",
492
497
  output_column: str = "population",
493
498
  **kwargs,
494
499
  ):
495
- if isinstance(country, str):
496
- country = [country]
500
+
501
+ # Ensure country is always a list for consistent handling
502
+ countries_list = [country] if isinstance(country, str) else country
497
503
 
498
504
  handler = WPPopulationHandler(
499
- project="pop", resolution=resolution, data_store=self.data_store, **kwargs
505
+ resolution=resolution,
506
+ data_store=self.data_store,
507
+ **kwargs,
500
508
  )
501
509
 
510
+ # Restrict to single country for age_structures project
511
+ if handler.config.project == "age_structures" and len(countries_list) > 1:
512
+ raise ValueError(
513
+ "For 'age_structures' project, only a single country can be processed at a time."
514
+ )
515
+
502
516
  self.logger.info(
503
517
  f"Mapping WorldPop Population data (year: {handler.config.year}, resolution: {handler.config.resolution}m)"
504
518
  )
505
519
 
506
- if predicate == "fractional":
507
- if resolution == 100:
508
- self.logger.warning(
509
- "Fractional aggregations only supported for datasets with 1000m resolution. Using `intersects` as predicate"
520
+ if predicate == "fractional" and resolution == 100:
521
+ self.logger.warning(
522
+ "Fractional aggregations only supported for datasets with 1000m resolution. Using `intersects` as predicate"
523
+ )
524
+ predicate = "intersects"
525
+
526
+ if predicate == "centroid_within":
527
+ if handler.config.project == "age_structures":
528
+ # Load individual tif processors for the single country
529
+ all_tif_processors = handler.load_data(
530
+ countries_list[0],
531
+ ensure_available=self.config.ensure_available,
532
+ **kwargs,
510
533
  )
511
- predicate = "intersects"
534
+
535
+ # Sum results from each tif_processor separately
536
+ all_results_by_zone = {
537
+ zone_id: 0 for zone_id in self.get_zone_identifiers()
538
+ }
539
+ self.logger.info(
540
+ f"Sampling individual age_structures rasters using 'sum' statistic and summing per zone."
541
+ )
542
+ for tif_processor in all_tif_processors:
543
+ single_raster_result = self.map_rasters(
544
+ raster_data=tif_processor, stat="sum"
545
+ )
546
+ for zone_id, value in single_raster_result.items():
547
+ all_results_by_zone[zone_id] += value
548
+ result = all_results_by_zone
512
549
  else:
513
- gdf_pop = pd.concat(
514
- [
515
- handler.load_into_geodataframe(
516
- c, ensure_available=self.config.ensure_available
550
+ # Existing behavior for non-age_structures projects or if merging is fine
551
+ tif_processors = []
552
+ for c in countries_list:
553
+ tif_processors.extend(
554
+ handler.load_data(
555
+ c,
556
+ ensure_available=self.config.ensure_available,
557
+ **kwargs,
517
558
  )
518
- for c in country
519
- ],
520
- ignore_index=True,
521
- )
522
-
523
- result = self.map_polygons(
524
- gdf_pop,
525
- value_columns="pixel_value",
526
- aggregation="sum",
527
- predicate=predicate,
559
+ )
560
+ self.logger.info(
561
+ f"Sampling WorldPop Population data using 'sum' statistic"
528
562
  )
529
-
530
- self.add_variable_to_view(result, output_column)
531
- return self.view
532
-
533
- tif_processors = []
534
- for c in country:
535
- tif_processors.extend(
536
- handler.load_data(c, ensure_available=self.config.ensure_available)
563
+ result = self.map_rasters(raster_data=tif_processors, stat="sum")
564
+ else:
565
+ gdf_pop = pd.concat(
566
+ [
567
+ handler.load_into_geodataframe(
568
+ c,
569
+ ensure_available=self.config.ensure_available,
570
+ **kwargs,
571
+ )
572
+ for c in countries_list
573
+ ],
574
+ ignore_index=True,
537
575
  )
538
576
 
539
- self.logger.info(f"Sampling WorldPop Population data using 'sum' statistic")
540
- sampled_values = self.map_rasters(tif_processors=tif_processors, stat="sum")
577
+ self.logger.info(f"Aggregating WorldPop Population data to the zones.")
578
+ result = self.map_polygons(
579
+ gdf_pop,
580
+ value_columns="pixel_value",
581
+ aggregation="sum",
582
+ predicate=predicate,
583
+ )
541
584
 
542
- self.add_variable_to_view(sampled_values, output_column)
585
+ self.add_variable_to_view(result, output_column)
543
586
 
544
587
  return self.view
@@ -328,21 +328,33 @@ class BaseHandlerReader(ABC):
328
328
  )
329
329
 
330
330
  def _load_raster_data(
331
- self, raster_paths: List[Union[str, Path]]
332
- ) -> List[TifProcessor]:
331
+ self,
332
+ raster_paths: List[Union[str, Path]],
333
+ merge_rasters: bool = False,
334
+ **kwargs,
335
+ ) -> Union[List[TifProcessor], TifProcessor]:
333
336
  """
334
337
  Load raster data from file paths.
335
338
 
336
339
  Args:
337
340
  raster_paths (List[Union[str, Path]]): List of file paths to raster files.
341
+ merge_rasters (bool): If True, all rasters will be merged into a single TifProcessor.
342
+ Defaults to False.
338
343
 
339
344
  Returns:
340
- List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
345
+ Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects or a single
346
+ TifProcessor if merge_rasters is True.
341
347
  """
342
- return [
343
- TifProcessor(data_path, self.data_store, mode="single")
344
- for data_path in raster_paths
345
- ]
348
+ if merge_rasters and len(raster_paths) > 1:
349
+ self.logger.info(
350
+ f"Merging {len(raster_paths)} rasters into a single TifProcessor."
351
+ )
352
+ return TifProcessor(raster_paths, self.data_store, **kwargs)
353
+ else:
354
+ return [
355
+ TifProcessor(data_path, self.data_store, **kwargs)
356
+ for data_path in raster_paths
357
+ ]
346
358
 
347
359
  def _load_tabular_data(
348
360
  self, file_paths: List[Union[str, Path]], read_function: Callable = read_dataset
@@ -619,7 +631,9 @@ class BaseHandler(ABC):
619
631
  # Download logic
620
632
  if data_units is not None:
621
633
  # Map data_units to their paths and select only those that are missing
622
- unit_to_path = dict(zip(data_paths,data_units)) #units might be dicts, cannot be used as key
634
+ unit_to_path = dict(
635
+ zip(data_paths, data_units)
636
+ ) # units might be dicts, cannot be used as key
623
637
  if force_download:
624
638
  # Download all units if force_download
625
639
  self.downloader.download_data_units(data_units, **kwargs)
@@ -610,19 +610,27 @@ class GHSLDataReader(BaseHandlerReader):
610
610
  super().__init__(config=config, data_store=data_store, logger=logger)
611
611
 
612
612
  def load_from_paths(
613
- self, source_data_path: List[Union[str, Path]], **kwargs
614
- ) -> List[TifProcessor]:
613
+ self,
614
+ source_data_path: List[Union[str, Path]],
615
+ merge_rasters: bool = False,
616
+ **kwargs,
617
+ ) -> Union[List[TifProcessor], TifProcessor]:
615
618
  """
616
619
  Load TifProcessors from GHSL dataset.
617
620
  Args:
618
621
  source_data_path: List of file paths to load
622
+ merge_rasters: If True, all rasters will be merged into a single TifProcessor.
623
+ Defaults to False.
619
624
  Returns:
620
- List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
625
+ Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
626
+ TifProcessor if merge_rasters is True.
621
627
  """
622
- return self._load_raster_data(raster_paths=source_data_path)
628
+ return self._load_raster_data(
629
+ raster_paths=source_data_path, merge_rasters=merge_rasters
630
+ )
623
631
 
624
- def load(self, source, **kwargs):
625
- return super().load(source=source, file_ext=".tif")
632
+ def load(self, source, merge_rasters: bool = False, **kwargs):
633
+ return super().load(source=source, file_ext=".tif", merge_rasters=merge_rasters)
626
634
 
627
635
 
628
636
  class GHSLDataHandler(BaseHandler):
@@ -763,6 +771,7 @@ class GHSLDataHandler(BaseHandler):
763
771
  List[Union[str, Path]], # list of paths
764
772
  ],
765
773
  ensure_available: bool = True,
774
+ merge_rasters: bool = False,
766
775
  **kwargs,
767
776
  ):
768
777
  return super().load_data(
@@ -771,6 +780,7 @@ class GHSLDataHandler(BaseHandler):
771
780
  file_ext=".tif",
772
781
  extract=True,
773
782
  file_pattern=r".*\.tif$",
783
+ merge_rasters=merge_rasters,
774
784
  **kwargs,
775
785
  )
776
786
 
@@ -801,8 +811,10 @@ class GHSLDataHandler(BaseHandler):
801
811
  tif_processors = self.load_data(
802
812
  source=source, ensure_available=ensure_available, **kwargs
803
813
  )
814
+ if isinstance(tif_processors, TifProcessor):
815
+ return tif_processors.to_dataframe(**kwargs)
804
816
  return pd.concat(
805
- [tp.to_dataframe() for tp in tif_processors], ignore_index=True
817
+ [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
806
818
  )
807
819
 
808
820
  def load_into_geodataframe(
@@ -832,8 +844,10 @@ class GHSLDataHandler(BaseHandler):
832
844
  tif_processors = self.load_data(
833
845
  source=source, ensure_available=ensure_available, **kwargs
834
846
  )
847
+ if isinstance(tif_processors, TifProcessor):
848
+ return tif_processors.to_geodataframe(**kwargs)
835
849
  return pd.concat(
836
- [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
850
+ [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
837
851
  )
838
852
 
839
853
  def get_available_data_info(
@@ -8,6 +8,7 @@ from shapely.geometry import Point
8
8
  import pycountry
9
9
  from typing import Optional, Union
10
10
  import logging
11
+ import geopandas as gpd
11
12
 
12
13
  from gigaspatial.config import config as global_config
13
14
 
@@ -40,11 +41,14 @@ class GigaSchoolLocationFetcher:
40
41
  if self.logger is None:
41
42
  self.logger = global_config.get_logger(self.__class__.__name__)
42
43
 
43
- def fetch_locations(self, **kwargs) -> pd.DataFrame:
44
+ def fetch_locations(
45
+ self, process_geospatial: bool = False, **kwargs
46
+ ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
44
47
  """
45
48
  Fetch and process school locations.
46
49
 
47
50
  Args:
51
+ process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
48
52
  **kwargs: Additional parameters for customization
49
53
  - page_size: Override default page size
50
54
  - sleep_time: Override default sleep time between requests
@@ -122,11 +126,12 @@ class GigaSchoolLocationFetcher:
122
126
 
123
127
  df = pd.DataFrame(all_data)
124
128
 
125
- df = self._process_geospatial_data(df)
129
+ if process_geospatial:
130
+ df = self._process_geospatial_data(df)
126
131
 
127
132
  return df
128
133
 
129
- def _process_geospatial_data(self, df: pd.DataFrame) -> pd.DataFrame:
134
+ def _process_geospatial_data(self, df: pd.DataFrame) -> gpd.GeoDataFrame:
130
135
  """
131
136
  Process and enhance the DataFrame with geospatial information.
132
137
 
@@ -144,7 +149,7 @@ class GigaSchoolLocationFetcher:
144
149
  )
145
150
  self.logger.info(f"Created geometry for all {len(df)} records")
146
151
 
147
- return df
152
+ return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
148
153
 
149
154
 
150
155
  @dataclass(config=ConfigDict(arbitrary_types_allowed=True))