giga-spatial 0.6.9 (py3-none-any.whl) → 0.7.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -610,19 +610,27 @@ class GHSLDataReader(BaseHandlerReader):
         super().__init__(config=config, data_store=data_store, logger=logger)
 
     def load_from_paths(
-        self, source_data_path: List[Union[str, Path]], **kwargs
-    ) -> List[TifProcessor]:
+        self,
+        source_data_path: List[Union[str, Path]],
+        merge_rasters: bool = False,
+        **kwargs,
+    ) -> Union[List[TifProcessor], TifProcessor]:
         """
         Load TifProcessors from GHSL dataset.
         Args:
             source_data_path: List of file paths to load
+            merge_rasters: If True, all rasters will be merged into a single TifProcessor.
+                Defaults to False.
         Returns:
-            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
+            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing
+                the raster data, or a single TifProcessor if merge_rasters is True.
         """
-        return self._load_raster_data(raster_paths=source_data_path)
+        return self._load_raster_data(
+            raster_paths=source_data_path, merge_rasters=merge_rasters
+        )
 
-    def load(self, source, **kwargs):
-        return super().load(source=source, file_ext=".tif")
+    def load(self, source, merge_rasters: bool = False, **kwargs):
+        return super().load(source=source, file_ext=".tif", merge_rasters=merge_rasters)
 
 
 class GHSLDataHandler(BaseHandler):
@@ -763,6 +771,7 @@ class GHSLDataHandler(BaseHandler):
             List[Union[str, Path]],  # list of paths
         ],
         ensure_available: bool = True,
+        merge_rasters: bool = False,
         **kwargs,
     ):
         return super().load_data(
@@ -771,6 +780,7 @@ class GHSLDataHandler(BaseHandler):
             file_ext=".tif",
             extract=True,
             file_pattern=r".*\.tif$",
+            merge_rasters=merge_rasters,
             **kwargs,
         )
 
@@ -801,8 +811,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_dataframe(**kwargs)
         return pd.concat(
-            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
         )
 
     def load_into_geodataframe(
@@ -832,8 +844,10 @@ class GHSLDataHandler(BaseHandler):
         tif_processors = self.load_data(
             source=source, ensure_available=ensure_available, **kwargs
         )
+        if isinstance(tif_processors, TifProcessor):
+            return tif_processors.to_geodataframe(**kwargs)
         return pd.concat(
-            [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
+            [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )
 
     def get_available_data_info(
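Taken together, the GHSL hunks thread a new `merge_rasters` flag from `GHSLDataHandler.load_data` down through the reader, and teach `load_into_dataframe` / `load_into_geodataframe` to short-circuit when a single merged `TifProcessor` comes back (they also now forward `**kwargs` to `to_dataframe` / `to_geodataframe`, which 0.6.9 dropped). A minimal usage sketch; the constructor arguments and the `source` value are illustrative assumptions, not taken from this diff:

```python
# Sketch under assumptions: the constructor arguments and the "MWI"
# source value are hypothetical, not confirmed by this diff.
from gigaspatial.handlers import GHSLDataHandler  # assumed export

handler = GHSLDataHandler(product="GHS_BUILT_S", year=2020)  # hypothetical args

# Default (matches 0.6.9): one TifProcessor per tile, each converted
# to a DataFrame and concatenated.
df_tiles = handler.load_into_dataframe(source="MWI")

# New in 0.7.1: merge tiles into a single TifProcessor first; the
# handler then calls to_dataframe() once on the merged raster.
df_merged = handler.load_into_dataframe(source="MWI", merge_rasters=True)
```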
@@ -8,6 +8,7 @@ from shapely.geometry import Point
 import pycountry
 from typing import Optional, Union
 import logging
+import geopandas as gpd
 
 from gigaspatial.config import config as global_config
 
@@ -40,11 +41,14 @@ class GigaSchoolLocationFetcher:
         if self.logger is None:
             self.logger = global_config.get_logger(self.__class__.__name__)
 
-    def fetch_locations(self, **kwargs) -> pd.DataFrame:
+    def fetch_locations(
+        self, process_geospatial: bool = False, **kwargs
+    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
         """
         Fetch and process school locations.
 
         Args:
+            process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
             **kwargs: Additional parameters for customization
                 - page_size: Override default page size
                 - sleep_time: Override default sleep time between requests
@@ -122,11 +126,12 @@ class GigaSchoolLocationFetcher:
 
         df = pd.DataFrame(all_data)
 
-        df = self._process_geospatial_data(df)
+        if process_geospatial:
+            df = self._process_geospatial_data(df)
 
         return df
 
-    def _process_geospatial_data(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _process_geospatial_data(self, df: pd.DataFrame) -> gpd.GeoDataFrame:
         """
         Process and enhance the DataFrame with geospatial information.
 
@@ -144,7 +149,7 @@
         )
         self.logger.info(f"Created geometry for all {len(df)} records")
 
-        return df
+        return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
 
 
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
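Two behavioral changes land in this file: `fetch_locations` no longer runs geospatial processing by default (it is now opt-in via `process_geospatial`), and when it does run, `_process_geospatial_data` returns a true `gpd.GeoDataFrame` with CRS EPSG:4326 rather than a plain DataFrame carrying a geometry column. A sketch of both call paths; the constructor argument is a guess from the class name, not shown in these hunks:

```python
# Sketch under assumptions: the constructor signature is guessed from
# the class name; only fetch_locations() appears in this diff.
from gigaspatial.handlers import GigaSchoolLocationFetcher  # assumed export

fetcher = GigaSchoolLocationFetcher(country="BRA")  # hypothetical arg

# 0.7.1 default: plain pandas DataFrame, no geometry built.
df = fetcher.fetch_locations()

# Opt in to the processing that 0.6.9 always performed; now returns a
# GeoDataFrame with point geometry in EPSG:4326.
gdf = fetcher.fetch_locations(process_geospatial=True)
```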
@@ -0,0 +1,350 @@
+import requests
+import pandas as pd
+import geopandas as gpd
+import time
+from typing import List, Optional, Union, Tuple
+from pydantic.dataclasses import dataclass, Field
+from pydantic import ConfigDict
+import pycountry
+
+from gigaspatial.config import config
+from gigaspatial.handlers import OSMLocationFetcher
+
+
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class HealthSitesFetcher:
+    """
+    Fetch and process health facility location data from the Healthsites.io API.
+    """
+
+    country: Optional[str] = Field(default=None, description="Country to filter")
+    api_url: str = Field(
+        default="https://healthsites.io/api/v3/facilities/",
+        description="Base URL for the Healthsites API",
+    )
+    api_key: str = config.HEALTHSITES_API_KEY
+    extent: Optional[Tuple[float, float, float, float]] = Field(
+        default=None, description="Bounding box as (minLng, minLat, maxLng, maxLat)"
+    )
+    page_size: int = Field(default=100, description="Number of records per API page")
+    flat_properties: bool = Field(
+        default=True, description="Show properties in flat format"
+    )
+    tag_format: str = Field(default="osm", description="Tag format (osm/hxl)")
+    output_format: str = Field(
+        default="geojson", description="Output format (json/geojson)"
+    )
+    sleep_time: float = Field(
+        default=0.2, description="Sleep time between API requests"
+    )
+
+    def __post_init__(self):
+        self.logger = config.get_logger(self.__class__.__name__)
+        # Convert country code to OSM English name if provided
+        if self.country:
+            self.country = self._convert_country(self.country)
+
+    def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
+        """
+        Fetch and process health facility locations.
+
+        Args:
+            **kwargs: Additional parameters for customization
+                - country: Override country filter
+                - extent: Override extent filter
+                - from_date: Get data modified from this timestamp (datetime or string)
+                - to_date: Get data modified to this timestamp (datetime or string)
+                - page_size: Override default page size
+                - sleep_time: Override default sleep time between requests
+                - max_pages: Limit the number of pages to fetch
+                - output_format: Override output format ('json' or 'geojson')
+                - flat_properties: Override flat properties setting
+
+        Returns:
+            Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
+                Returns GeoDataFrame for geojson format, DataFrame for json format.
+        """
+        # Override defaults with kwargs if provided
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+        page_size = kwargs.get("page_size", self.page_size)
+        sleep_time = kwargs.get("sleep_time", self.sleep_time)
+        max_pages = kwargs.get("max_pages", None)
+        output_format = kwargs.get("output_format", self.output_format)
+        flat_properties = kwargs.get("flat_properties", self.flat_properties)
+
+        # Convert country if provided in kwargs
+        if country:
+            country = self._convert_country(country)
+
+        # Prepare base parameters
+        base_params = {
+            "api-key": self.api_key,
+            "tag-format": self.tag_format,
+            "output": output_format,
+        }
+
+        # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
+        if flat_properties:
+            base_params["flat-properties"] = "true"
+
+        # Add optional filters
+        if country:
+            base_params["country"] = country
+
+        if extent:
+            if len(extent) != 4:
+                raise ValueError(
+                    "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
+                )
+            base_params["extent"] = ",".join(map(str, extent))
+
+        if from_date:
+            base_params["from"] = self._format_timestamp(from_date)
+
+        if to_date:
+            base_params["to"] = self._format_timestamp(to_date)
+
+        all_data = []
+        page = 1
+
+        self.logger.info(
+            f"Starting to fetch health facilities for country: {country or 'all countries'}"
+        )
+        self.logger.info(
+            f"Output format: {output_format}, Flat properties: {flat_properties}"
+        )
+
+        while True:
+            # Check if we've reached max_pages limit
+            if max_pages and page > max_pages:
+                self.logger.info(f"Reached maximum pages limit: {max_pages}")
+                break
+
+            # Add page parameter
+            params = base_params.copy()
+            params["page"] = page
+
+            try:
+                self.logger.debug(f"Fetching page {page} with params: {params}")
+                response = requests.get(self.api_url, params=params)
+                response.raise_for_status()
+
+                parsed = response.json()
+
+                # Handle different response structures based on output format
+                if output_format == "geojson":
+                    # GeoJSON returns FeatureCollection with features list
+                    data = parsed.get("features", [])
+                else:
+                    # JSON returns direct list
+                    data = parsed if isinstance(parsed, list) else []
+
+            except requests.exceptions.RequestException as e:
+                self.logger.error(f"Request failed on page {page}: {e}")
+                break
+            except ValueError as e:
+                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
+                break
+
+            # Check if we got any data
+            if not data or not isinstance(data, list):
+                self.logger.info(f"No data on page {page}. Stopping.")
+                break
+
+            all_data.extend(data)
+            self.logger.info(f"Fetched page {page} with {len(data)} records")
+
+            # If we got fewer records than page_size, we've reached the end
+            if len(data) < page_size:
+                self.logger.info("Reached end of data (partial page received)")
+                break
+
+            page += 1
+
+            # Sleep to be respectful to the API
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")
+
+        # Convert to DataFrame/GeoDataFrame based on format
+        if not all_data:
+            self.logger.warning("No data fetched, returning empty DataFrame")
+            if output_format == "geojson":
+                return gpd.GeoDataFrame()
+            return pd.DataFrame()
+
+        if output_format == "geojson":
+            # Use GeoDataFrame.from_features for GeoJSON format
+            gdf = gpd.GeoDataFrame.from_features(all_data)
+            self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
+            return gdf
+        else:
+            # For JSON format, handle nested structure if flat_properties is False
+            if not flat_properties:
+                df = self._process_json_with_centroid(all_data)
+            else:
+                df = pd.DataFrame(all_data)
+
+            self.logger.info(f"Created DataFrame with {len(df)} records")
+            return df
+
+    def fetch_statistics(self, **kwargs) -> dict:
+        """
+        Fetch statistics for health facilities.
+
+        Args:
+            **kwargs: Same filtering parameters as fetch_facilities
+
+        Returns:
+            dict: Statistics data
+        """
+        country = kwargs.get("country", self.country)
+        extent = kwargs.get("extent", self.extent)
+        from_date = kwargs.get("from_date", None)
+        to_date = kwargs.get("to_date", None)
+
+        # Convert country if provided
+        if country:
+            country = self._convert_country(country)
+
+        params = {
+            "api-key": self.api_key,
+        }
+
+        # Add optional filters
+        if country:
+            params["country"] = country
+        if extent:
+            params["extent"] = ",".join(map(str, extent))
+        if from_date:
+            params["from"] = self._format_timestamp(from_date)
+        if to_date:
+            params["to"] = self._format_timestamp(to_date)
+
+        try:
+            response = requests.get(f"{self.api_url}/statistic/", params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for statistics: {e}")
+            raise
+
+    def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
+        """
+        Fetch a specific facility by OSM type and ID.
+
+        Args:
+            osm_type: OSM type (node, way, relation)
+            osm_id: OSM ID
+
+        Returns:
+            dict: Facility details
+        """
+        params = {"api-key": self.api_key}
+
+        try:
+            url = f"{self.api_url}/{osm_type}/{osm_id}"
+            response = requests.get(url, params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
+            raise
+
+    def _create_dataframe(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Create DataFrame from API response data.
+
+        Args:
+            data: List of facility records
+
+        Returns:
+            pd.DataFrame: Processed DataFrame
+        """
+        if self.output_format == "geojson":
+            # Handle GeoJSON format
+            records = []
+            for feature in data:
+                record = feature.get("properties", {}).copy()
+                geometry = feature.get("geometry", {})
+                coordinates = geometry.get("coordinates", [])
+
+                if coordinates and len(coordinates) >= 2:
+                    record["longitude"] = coordinates[0]
+                    record["latitude"] = coordinates[1]
+
+                records.append(record)
+            return pd.DataFrame(records)
+        else:
+            # Handle regular JSON format
+            return pd.DataFrame(data)
+
+    def _process_json_with_centroid(self, data: List[dict]) -> pd.DataFrame:
+        """
+        Process JSON data to flatten 'attributes' and 'centroid' fields,
+        and extract longitude/latitude from centroid.
+
+        Args:
+            data: List of facility records, where each record might contain
+                nested 'attributes' and 'centroid' dictionaries.
+
+        Returns:
+            pd.DataFrame: Processed DataFrame with flattened data.
+        """
+        processed_records = []
+        for record in data:
+            new_record = {}
+
+            # Flatten top-level keys
+            for key, value in record.items():
+                if key not in ["attributes", "centroid"]:
+                    new_record[key] = value
+
+            # Flatten 'attributes'
+            attributes = record.get("attributes", {})
+            for attr_key, attr_value in attributes.items():
+                new_record[f"{attr_key}"] = attr_value
+
+            # Extract centroid coordinates
+            centroid = record.get("centroid", {})
+            coordinates = centroid.get("coordinates", [])
+            if coordinates and len(coordinates) == 2:
+                new_record["longitude"] = coordinates[0]
+                new_record["latitude"] = coordinates[1]
+            else:
+                new_record["longitude"] = None
+                new_record["latitude"] = None
+
+            processed_records.append(new_record)
+
+        return pd.DataFrame(processed_records)
+
+    def _convert_country(self, country: str) -> str:
+        try:
+            # First convert to ISO3 format if needed
+            country_obj = pycountry.countries.lookup(country)
+            iso3_code = country_obj.alpha_3
+
+            # Get OSM English name using OSMLocationFetcher
+            osm_data = OSMLocationFetcher.get_osm_countries(iso3_code=iso3_code)
+            osm_name_en = osm_data.get("name:en")
+
+            if not osm_name_en:
+                raise ValueError(
+                    f"Could not find OSM English name for country: {country}"
+                )
+
+            self.logger.info(
+                f"Converted country code to OSM English name: {osm_name_en}"
+            )
+
+            return osm_name_en
+
+        except LookupError:
+            raise ValueError(f"Invalid country code provided: {country}")
+        except Exception as e:
+            raise ValueError(f"Failed to get OSM English name: {e}")