giga-spatial 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,8 @@ from shapely.geometry.base import BaseGeometry
19
19
  from shapely.geometry import Point
20
20
  from tqdm import tqdm
21
21
  import logging
22
+ import zipfile
23
+ import tempfile
22
24
 
23
25
  from gigaspatial.core.io.data_store import DataStore
24
26
  from gigaspatial.processing.tif_processor import TifProcessor
@@ -563,6 +565,132 @@ class WPPopulationConfig(BaseHandlerConfig):
563
565
  """
564
566
  return self.base_path / unit.split("GIS/")[1]
565
567
 
568
def get_data_unit_paths(self, units: Union[List[str], str], **kwargs) -> list:
    """
    Given WP file url(s), return the corresponding local file paths.

    - For school_age age_structures (zip resources): if extracted .tif files are
      present in the target directory, return those that pass the optional
      filters; otherwise (directory missing, listing failed, or no .tif found)
      return the zip path so the downloader can fetch and extract it.
    - For non-school_age age_structures (individual .tif URLs): filter the URLs
      by sex and age before mapping them to local paths.
    - For every other project: map each URL to its local path unchanged.

    Args:
        units: A single URL or a list of URLs.
        **kwargs: Optional filters.
            sex: Value or collection of values, compared case-insensitively.
                School-age names are matched on "F", "M" and "F_M" (combined);
                "F" or "M" never implicitly includes the combined raster.
            education_level / level: School-age only. Value or collection of
                values; a file is kept when any of them is a substring of its
                upper-cased file name.
            ages: Non-school-age only. Exact age or collection of ages.
            min_age / max_age: Non-school-age only. Inclusive age bounds.

    Returns:
        list: Local paths, in input order, for the units that pass the filters.
    """
    if not isinstance(units, list):
        units = [units]

    def _to_upper_set(value):
        # Normalise a scalar or collection into a set of upper-cased strings;
        # None means "no filter requested".
        if value is None:
            return None
        if isinstance(value, (list, tuple, set)):
            return {str(item).upper() for item in value}
        return {str(value).upper()}

    sex_filters = _to_upper_set(kwargs.get("sex"))

    if self.project != "age_structures":
        # Default behavior
        return [self.get_data_unit_path(unit) for unit in units]

    # 1) School-age branch (zip -> extracted tifs)
    if self.school_age:
        level_filters = _to_upper_set(
            kwargs.get("education_level") or kwargs.get("level")
        )

        def _keep_extracted(name: str) -> bool:
            # `name` is the upper-cased tif file name.
            if sex_filters:
                # Explicit matching: F matches only F-only; M matches only
                # M-only; F_M matches only combined.
                is_combined = "_F_M_" in name
                sex_ok = (
                    ("F_M" in sex_filters and is_combined)
                    or ("F" in sex_filters and "_F_" in name and not is_combined)
                    or ("M" in sex_filters and "_M_" in name and not is_combined)
                )
                if not sex_ok:
                    return False
            if level_filters and not any(lvl in name for lvl in level_filters):
                return False
            return True

        resolved_paths: List[Path] = []
        for url in units:
            zip_path = self.get_data_unit_path(url)
            output_dir = str(zip_path.parent)

            if not self.data_store.is_dir(output_dir):
                resolved_paths.append(zip_path)
                continue

            try:
                extracted = [
                    Path(f)
                    for f in self.data_store.list_files(output_dir)
                    if f.lower().endswith(".tif")
                ]
            except Exception:
                # Deliberate best-effort: if the directory cannot be listed,
                # fall back to the zip path. Collecting into a local list
                # first guarantees no partial tif results leak out as well.
                resolved_paths.append(zip_path)
                continue

            if not extracted:
                # The directory exists but nothing has been extracted into it
                # yet; hand back the zip path so the download is triggered.
                resolved_paths.append(zip_path)
                continue

            # Tifs exist: an empty match here means the filters excluded
            # everything, which a re-download could not change.
            resolved_paths.extend(
                p for p in extracted if _keep_extracted(p.name.upper())
            )

        return resolved_paths

    # 2) Non-school_age age_structures (individual tif URLs) with sex/age filters
    ages_filter = kwargs.get("ages")
    if ages_filter is not None:
        if isinstance(ages_filter, (list, tuple, set)):
            ages_filter = {int(a) for a in ages_filter}
        else:
            ages_filter = {int(ages_filter)}

    # Convert the bounds once instead of once per unit.
    min_age = kwargs.get("min_age")
    max_age = kwargs.get("max_age")
    min_age = None if min_age is None else int(min_age)
    max_age = None if max_age is None else int(max_age)

    def _parse_meta(u: str):
        # Expected basename pattern: ISO3_SEX_AGE_YEAR.tif
        # e.g., RWA_F_25_2020.tif -> ("F", 25)
        # Heuristic: country(0), sex(1), age(2), year(3+)
        stem = os.path.splitext(os.path.basename(u))[0]
        parts = stem.split("_")
        if len(parts) < 4:
            return None, None
        try:
            age_val = int(parts[2])
        except ValueError:
            age_val = None
        return parts[1].upper(), age_val

    filtered_units = []
    for u in units:
        sex_val, age_val = _parse_meta(u)

        # A unit whose sex cannot be parsed never satisfies a sex filter.
        if sex_filters and sex_val not in sex_filters:
            continue

        # Age filters only apply when an age could be parsed; units without
        # a parseable age pass through unfiltered.
        if age_val is not None:
            if ages_filter is not None and age_val not in ages_filter:
                continue
            if min_age is not None and age_val < min_age:
                continue
            if max_age is not None and age_val > max_age:
                continue

        filtered_units.append(u)

    return [self.get_data_unit_path(unit) for unit in filtered_units]
693
+
566
694
  def __repr__(self) -> str:
567
695
 
568
696
  return (
@@ -601,7 +729,70 @@ class WPPopulationDownloader(BaseHandlerDownloader):
601
729
  super().__init__(config=config, data_store=data_store, logger=logger)
602
730
 
603
731
  def download_data_unit(self, url, **kwargs):
604
- """Download data file for a url."""
732
+ """Download data file for a url. If a zip, extract contained .tif files."""
733
+ # If the resource is a zip (e.g., school age datasets), download to temp and extract .tif files
734
+ if url.lower().endswith(".zip"):
735
+ temp_downloaded_path: Optional[Path] = None
736
+ try:
737
+ with tempfile.NamedTemporaryFile(
738
+ delete=False, suffix=".zip"
739
+ ) as temp_file:
740
+ temp_downloaded_path = Path(temp_file.name)
741
+ response = self.config.client.session.get(
742
+ url, stream=True, timeout=self.config.client.timeout
743
+ )
744
+ response.raise_for_status()
745
+
746
+ total_size = int(response.headers.get("content-length", 0))
747
+
748
+ with tqdm(
749
+ total=total_size,
750
+ unit="B",
751
+ unit_scale=True,
752
+ desc=f"Downloading {os.path.basename(temp_downloaded_path)}",
753
+ ) as pbar:
754
+ for chunk in response.iter_content(chunk_size=8192):
755
+ if chunk:
756
+ temp_file.write(chunk)
757
+ pbar.update(len(chunk))
758
+
759
+ extracted_files: List[Path] = []
760
+ output_dir = self.config.get_data_unit_path(url).parent
761
+ with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
762
+ members = [
763
+ m for m in zip_ref.namelist() if m.lower().endswith(".tif")
764
+ ]
765
+ for member in members:
766
+ extracted_path = output_dir / Path(member).name
767
+ with zip_ref.open(member) as source:
768
+ file_content = source.read()
769
+ self.data_store.write_file(
770
+ str(extracted_path), file_content
771
+ )
772
+ extracted_files.append(extracted_path)
773
+ self.logger.info(f"Extracted {member} to {extracted_path}")
774
+
775
+ return extracted_files
776
+
777
+ except requests.RequestException as e:
778
+ self.logger.error(f"Failed to download {url}: {e}")
779
+ return None
780
+ except zipfile.BadZipFile:
781
+ self.logger.error("Downloaded file is not a valid zip archive.")
782
+ return None
783
+ except Exception as e:
784
+ self.logger.error(f"Unexpected error processing zip for {url}: {e}")
785
+ return None
786
+ finally:
787
+ if temp_downloaded_path and temp_downloaded_path.exists():
788
+ try:
789
+ temp_downloaded_path.unlink()
790
+ except OSError as e:
791
+ self.logger.warning(
792
+ f"Could not delete temporary file {temp_downloaded_path}: {e}"
793
+ )
794
+
795
+ # Otherwise, download as a regular file (e.g., .tif)
605
796
  try:
606
797
  response = self.config.client.session.get(
607
798
  url, stream=True, timeout=self.config.client.timeout
@@ -636,12 +827,13 @@ class WPPopulationDownloader(BaseHandlerDownloader):
636
827
  def download_data_units(
637
828
  self,
638
829
  urls: List[str],
830
+ **kwargs,
639
831
  ) -> List[str]:
640
832
  """Download data files for multiple urls."""
641
833
 
642
834
  with multiprocessing.Pool(self.config.n_workers) as pool:
643
835
  download_func = functools.partial(self.download_data_unit)
644
- file_paths = list(
836
+ results = list(
645
837
  tqdm(
646
838
  pool.imap(download_func, urls),
647
839
  total=len(urls),
@@ -649,7 +841,17 @@ class WPPopulationDownloader(BaseHandlerDownloader):
649
841
  )
650
842
  )
651
843
 
652
- return [path for path in file_paths if path is not None]
844
+ # Flatten results and filter out None
845
+ flattened: List[Path] = []
846
+ for item in results:
847
+ if item is None:
848
+ continue
849
+ if isinstance(item, list):
850
+ flattened.extend(item)
851
+ else:
852
+ flattened.append(item)
853
+
854
+ return flattened
653
855
 
654
856
  def download(self, source: str, **kwargs) -> List[str]:
655
857
  """Download data for a source"""
@@ -681,16 +883,27 @@ class WPPopulationReader(BaseHandlerReader):
681
883
  super().__init__(config=config, data_store=data_store, logger=logger)
682
884
 
683
885
  def load_from_paths(
684
- self, source_data_path: List[Union[str, Path]], **kwargs
685
- ) -> List[TifProcessor]:
886
+ self,
887
+ source_data_path: List[Union[str, Path]],
888
+ merge_rasters: bool = False,
889
+ **kwargs,
890
+ ) -> Union[List[TifProcessor], TifProcessor]:
686
891
  """
687
892
  Load TifProcessors of WP datasets.
688
893
  Args:
689
894
  source_data_path: List of file paths to load
895
+ merge_rasters: If True, all rasters will be merged into a single TifProcessor.
896
+ Defaults to False.
690
897
  Returns:
691
- List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
898
+ Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
899
+ TifProcessor if merge_rasters is True.
692
900
  """
693
- return self._load_raster_data(raster_paths=source_data_path)
901
+ return self._load_raster_data(
902
+ raster_paths=source_data_path, merge_rasters=merge_rasters
903
+ )
904
+
905
def load(self, source, merge_rasters: bool = False, **kwargs):
    """
    Load data for ``source`` by delegating to the parent class ``load``.

    Args:
        source: Passed through unchanged as the ``source`` keyword.
        merge_rasters: Forwarded explicitly as the ``merge_rasters`` keyword.
            Defaults to False.
        **kwargs: Any further keyword arguments, forwarded as-is.

    Returns:
        Whatever the parent ``load`` returns.
    """
    parent_load = super().load
    return parent_load(source=source, merge_rasters=merge_rasters, **kwargs)
694
907
 
695
908
 
696
909
  class WPPopulationHandler(BaseHandler):
@@ -822,8 +1035,11 @@ class WPPopulationHandler(BaseHandler):
822
1035
  tif_processors = self.load_data(
823
1036
  source=source, ensure_available=ensure_available, **kwargs
824
1037
  )
1038
+ if isinstance(tif_processors, TifProcessor):
1039
+ return tif_processors.to_dataframe(**kwargs)
1040
+
825
1041
  return pd.concat(
826
- [tp.to_dataframe() for tp in tif_processors], ignore_index=True
1042
+ [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
827
1043
  )
828
1044
 
829
1045
  def load_into_geodataframe(
@@ -846,6 +1062,9 @@ class WPPopulationHandler(BaseHandler):
846
1062
  tif_processors = self.load_data(
847
1063
  source=source, ensure_available=ensure_available, **kwargs
848
1064
  )
1065
+ if isinstance(tif_processors, TifProcessor):
1066
+ return tif_processors.to_geodataframe(**kwargs)
1067
+
849
1068
  return pd.concat(
850
- [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
1069
+ [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
851
1070
  )
@@ -604,7 +604,7 @@ def aggregate_points_to_zones(
604
604
  - Single string: Use same method for all columns ("count", "mean", "sum", "min", "max")
605
605
  - Dict: Map column names to aggregation methods
606
606
  point_zone_predicate (str): Spatial predicate for point-to-zone relationship
607
- Options: "within", "intersects", "contains"
607
+ Options: "within", "intersects"
608
608
  zone_id_column (str): Column in zones containing zone identifiers
609
609
  output_suffix (str): Suffix to add to output column names
610
610
  drop_geometry (bool): Whether to drop the geometry column from output