giga-spatial 0.6.0__py3-none-any.whl
This diff shows the content of package versions publicly released to a supported registry. It is provided for informational purposes only and reflects changes between versions as they appear in their respective public registries.
- giga_spatial-0.6.0.dist-info/METADATA +141 -0
- giga_spatial-0.6.0.dist-info/RECORD +47 -0
- giga_spatial-0.6.0.dist-info/WHEEL +5 -0
- giga_spatial-0.6.0.dist-info/licenses/LICENSE +661 -0
- giga_spatial-0.6.0.dist-info/top_level.txt +1 -0
- gigaspatial/__init__.py +1 -0
- gigaspatial/config.py +226 -0
- gigaspatial/core/__init__.py +0 -0
- gigaspatial/core/io/__init__.py +5 -0
- gigaspatial/core/io/adls_data_store.py +325 -0
- gigaspatial/core/io/data_api.py +113 -0
- gigaspatial/core/io/data_store.py +147 -0
- gigaspatial/core/io/local_data_store.py +92 -0
- gigaspatial/core/io/readers.py +265 -0
- gigaspatial/core/io/writers.py +128 -0
- gigaspatial/core/schemas/__init__.py +0 -0
- gigaspatial/core/schemas/entity.py +244 -0
- gigaspatial/generators/__init__.py +2 -0
- gigaspatial/generators/poi.py +636 -0
- gigaspatial/generators/zonal/__init__.py +3 -0
- gigaspatial/generators/zonal/base.py +370 -0
- gigaspatial/generators/zonal/geometry.py +439 -0
- gigaspatial/generators/zonal/mercator.py +78 -0
- gigaspatial/grid/__init__.py +1 -0
- gigaspatial/grid/mercator_tiles.py +286 -0
- gigaspatial/handlers/__init__.py +40 -0
- gigaspatial/handlers/base.py +761 -0
- gigaspatial/handlers/boundaries.py +305 -0
- gigaspatial/handlers/ghsl.py +772 -0
- gigaspatial/handlers/giga.py +145 -0
- gigaspatial/handlers/google_open_buildings.py +472 -0
- gigaspatial/handlers/hdx.py +241 -0
- gigaspatial/handlers/mapbox_image.py +208 -0
- gigaspatial/handlers/maxar_image.py +291 -0
- gigaspatial/handlers/microsoft_global_buildings.py +548 -0
- gigaspatial/handlers/ookla_speedtest.py +199 -0
- gigaspatial/handlers/opencellid.py +290 -0
- gigaspatial/handlers/osm.py +356 -0
- gigaspatial/handlers/overture.py +126 -0
- gigaspatial/handlers/rwi.py +157 -0
- gigaspatial/handlers/unicef_georepo.py +806 -0
- gigaspatial/handlers/worldpop.py +266 -0
- gigaspatial/processing/__init__.py +4 -0
- gigaspatial/processing/geo.py +1054 -0
- gigaspatial/processing/sat_images.py +39 -0
- gigaspatial/processing/tif_processor.py +477 -0
- gigaspatial/processing/utils.py +49 -0
gigaspatial/handlers/osm.py
@@ -0,0 +1,356 @@
import requests
import pandas as pd
from typing import List, Dict, Union, Optional, Literal
from dataclasses import dataclass
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from requests.exceptions import RequestException
from shapely.geometry import Polygon, Point
import pycountry

from gigaspatial.config import config


@dataclass
class OSMLocationFetcher:
    """
    A class to fetch and process location data from OpenStreetMap using the Overpass API.

    This class supports fetching various OSM location types including amenities, buildings,
    shops, and other POI categories.
    """

    country: str
    location_types: Union[List[str], Dict[str, List[str]]]
    base_url: str = "http://overpass-api.de/api/interpreter"
    timeout: int = 600
    max_retries: int = 3
    retry_delay: int = 5

    def __post_init__(self):
        """Validate inputs, normalize location_types, and set up logging."""
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        # Normalize location_types to always be a dictionary
        if isinstance(self.location_types, list):
            self.location_types = {"amenity": self.location_types}
        elif not isinstance(self.location_types, dict):
            raise TypeError(
                "location_types must be a list of strings or a dictionary mapping categories to type lists"
            )

        self.logger = config.get_logger(self.__class__.__name__)

    def _build_queries(self, since_year: Optional[int] = None) -> List[str]:
        """
        Construct separate Overpass QL queries for different element types and categories.
        Returns list of [nodes_relations_query, ways_query]
        """
        if since_year:
            date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
        else:
            date_filter = ""

        # Query for nodes and relations (with center output)
        nodes_relations_queries = []
        for category, types in self.location_types.items():
            nodes_relations_queries.extend(
                [
                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                ]
            )

        nodes_relations_queries = "\n".join(nodes_relations_queries)

        nodes_relations_query = f"""
        [out:json][timeout:{self.timeout}];
        area["ISO3166-1"={self.country}]->.searchArea;
        (
        {nodes_relations_queries}
        );
        out center;
        """

        # Query for ways (with geometry output)
        ways_queries = []
        for category, types in self.location_types.items():
            ways_queries.append(
                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
            )

        ways_queries = "\n".join(ways_queries)

        ways_query = f"""
        [out:json][timeout:{self.timeout}];
        area["ISO3166-1"={self.country}]->.searchArea;
        (
        {ways_queries}
        );
        out geom;
        """

        return [nodes_relations_query, ways_query]

    def _make_request(self, query: str) -> Dict:
        """Make HTTP request to Overpass API with retry mechanism."""
        for attempt in range(self.max_retries):
            try:
                self.logger.debug(f"Executing query:\n{query}")
                response = requests.get(
                    self.base_url, params={"data": query}, timeout=self.timeout
                )
                response.raise_for_status()
                return response.json()
            except RequestException as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < self.max_retries - 1:
                    sleep(self.retry_delay)
                else:
                    raise RuntimeError(
                        f"Failed to fetch data after {self.max_retries} attempts"
                    ) from e

    def _extract_matching_categories(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract all matching categories and their values from the tags.
        Returns:
            Dict mapping each matching category to its value
        """
        matches = {}
        for category, types in self.location_types.items():
            if category in tags and tags[category] in types:
                matches[category] = tags[category]
        return matches

    def _process_node_relation(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a node or relation element.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matching_categories = self._extract_matching_categories(tags)

            if not matching_categories:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            _lat = element.get("lat") or element["center"]["lat"]
            _lon = element.get("lon") or element["center"]["lon"]
            point_geom = Point(_lon, _lat)

            # for each matching category, create a separate element
            results = []
            for category, value in matching_categories.items():
                results.append(
                    {
                        "source_id": element["id"],
                        "category": category,
                        "category_value": value,
                        "name": tags.get("name", ""),
                        "name_en": tags.get("name:en", ""),
                        "type": element["type"],
                        "geometry": point_geom,
                        "latitude": _lat,
                        "longitude": _lon,
                        "matching_categories": list(matching_categories.keys()),
                    }
                )

            return results

        except KeyError as e:
            self.logger.error(f"Corrupt data received for node/relation: {str(e)}")
            return []

    def _process_way(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a way element with geometry.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matching_categories = self._extract_matching_categories(tags)

            if not matching_categories:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            # Create polygon from geometry points
            polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
            centroid = polygon.centroid

            # For each matching category, create a separate element
            results = []
            for category, value in matching_categories.items():
                results.append(
                    {
                        "source_id": element["id"],
                        "category": category,
                        "category_value": value,
                        "name": tags.get("name", ""),
                        "name_en": tags.get("name:en", ""),
                        "type": element["type"],
                        "geometry": polygon,
                        "latitude": centroid.y,
                        "longitude": centroid.x,
                        "matching_categories": list(matching_categories.keys()),
                    }
                )

            return results
        except (KeyError, ValueError) as e:
            self.logger.error(f"Error processing way geometry: {str(e)}")
            return []

    def fetch_locations(
        self,
        since_year: Optional[int] = None,
        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
    ) -> pd.DataFrame:
        """
        Fetch and process OSM locations.

        Args:
            since_year (int, optional): Filter for locations added/modified since this year.
            handle_duplicates (str): How to handle objects matching multiple categories:
                - 'separate': Create separate entries for each category (default)
                - 'combine': Use a single entry with a list of matching categories
                - 'primary': Keep only the first matching category

        Returns:
            pd.DataFrame: Processed OSM locations
        """
        if handle_duplicates not in ("separate", "combine", "primary"):
            raise ValueError(
                "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
            )

        self.logger.info(
            f"Fetching OSM locations from Overpass API for country: {self.country}"
        )
        self.logger.info(f"Location types: {self.location_types}")
        self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")

        # Get queries for different element types
        nodes_relations_query, ways_query = self._build_queries(since_year)

        # Fetch nodes and relations
        nodes_relations_response = self._make_request(nodes_relations_query)
        nodes_relations = nodes_relations_response.get("elements", [])

        # Fetch ways
        ways_response = self._make_request(ways_query)
        ways = ways_response.get("elements", [])

        if not nodes_relations and not ways:
            self.logger.warning("No locations found for the specified criteria")
            return pd.DataFrame()

        self.logger.info(
            f"Processing {len(nodes_relations)} nodes/relations and {len(ways)} ways..."
        )

        # Process nodes and relations
        with ThreadPoolExecutor() as executor:
            processed_nodes_relations = [
                item
                for sublist in executor.map(
                    self._process_node_relation, nodes_relations
                )
                for item in sublist
            ]

        # Process ways
        with ThreadPoolExecutor() as executor:
            processed_ways = [
                item
                for sublist in executor.map(self._process_way, ways)
                for item in sublist
            ]

        # Combine all processed elements
        all_elements = processed_nodes_relations + processed_ways

        if not all_elements:
            self.logger.warning("No matching elements found after processing")
            return pd.DataFrame()

        # Handle duplicates based on the specified strategy
        if handle_duplicates != "separate":
            # Group by source_id
            grouped_elements = {}
            for elem in all_elements:
                source_id = elem["source_id"]
                if source_id not in grouped_elements:
                    grouped_elements[source_id] = elem
                elif handle_duplicates == "combine":
                    # Combine matching categories
                    if grouped_elements[source_id]["category"] != elem["category"]:
                        if isinstance(grouped_elements[source_id]["category"], str):
                            grouped_elements[source_id]["category"] = [
                                grouped_elements[source_id]["category"]
                            ]
                            grouped_elements[source_id]["category_value"] = [
                                grouped_elements[source_id]["category_value"]
                            ]

                        if (
                            elem["category"]
                            not in grouped_elements[source_id]["category"]
                        ):
                            grouped_elements[source_id]["category"].append(
                                elem["category"]
                            )
                            grouped_elements[source_id]["category_value"].append(
                                elem["category_value"]
                            )
                # For 'primary', just keep the first one we encountered

            all_elements = list(grouped_elements.values())

        locations = pd.DataFrame(all_elements)

        # Log element type distribution
        type_counts = locations["type"].value_counts()
        self.logger.info("\nElement type distribution:")
        for element_type, count in type_counts.items():
            self.logger.info(f"{element_type}: {count}")

        # Log category distribution
        if handle_duplicates == "combine":
            # Count each category separately when they're in lists
            category_counts = {}
            for cats in locations["category"]:
                if isinstance(cats, list):
                    for cat in cats:
                        category_counts[cat] = category_counts.get(cat, 0) + 1
                else:
                    category_counts[cats] = category_counts.get(cats, 0) + 1

            self.logger.info("\nCategory distribution:")
            for category, count in category_counts.items():
                self.logger.info(f"{category}: {count}")
        else:
            category_counts = locations["category"].value_counts()
            self.logger.info("\nCategory distribution:")
            for category, count in category_counts.items():
                self.logger.info(f"{category}: {count}")

        # Log elements with multiple matching categories
        multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
        if multi_category:
            self.logger.info(
                f"\n{len(multi_category)} elements matched multiple categories"
            )

        self.logger.info(f"Successfully processed {len(locations)} locations")
        return locations
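For orientation, a minimal usage sketch of the OSMLocationFetcher class added above. The country, tag keys, and tag values are illustrative, and the call assumes the public Overpass endpoint (the class default) is reachable; this sketch is not part of the package.

from gigaspatial.handlers.osm import OSMLocationFetcher

# Fetch schools and hospitals tagged under "amenity", plus supermarkets under "shop",
# for one country; the country string is normalized to an ISO alpha-2 code internally.
fetcher = OSMLocationFetcher(
    country="KE",
    location_types={"amenity": ["school", "hospital"], "shop": ["supermarket"]},
)

# Returns a pandas DataFrame with one row per element/category match;
# handle_duplicates controls elements that match more than one category.
locations = fetcher.fetch_locations(since_year=2020, handle_duplicates="combine")
print(locations[["source_id", "category", "category_value", "latitude", "longitude"]].head())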
gigaspatial/handlers/overture.py
@@ -0,0 +1,126 @@
# import requests
import geopandas as gpd
from typing import List, Optional, Union
from pydantic.dataclasses import dataclass, Field
from pydantic import ConfigDict
from shapely.geometry import Polygon, MultiPolygon
from shapely.strtree import STRtree
from pathlib import Path
import pycountry
import duckdb

from gigaspatial.config import config
from gigaspatial.handlers.boundaries import AdminBoundaries
from gigaspatial.core.io.data_store import DataStore


@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class OvertureAmenityFetcher:
    """
    A class to fetch and process amenity locations from Overture.
    """

    # constants
    release: Optional[str] = "2024-12-18.0"
    base_url: Optional[str] = (
        "s3://overturemaps-us-west-2/release/{release}/theme=places/*/*"
    )

    # user config
    country: str = Field(...)
    amenity_types: List[str] = Field(..., description="List of amenity types to fetch")
    geom: Union[Polygon, MultiPolygon] = None

    # config for country boundary access from data storage
    # if None GADM boundaries will be used
    data_store: DataStore = None
    country_geom_path: Optional[Union[str, Path]] = None

    def __post_init__(self):
        """Validate inputs and set up logging."""
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        self.base_url = self.base_url.format(release=self.release)
        self.logger = config.get_logger(self.__class__.__name__)

        self.connection = self._set_connection()

    def _set_connection(self):
        """Set the connection to the DB"""
        db = duckdb.connect()
        db.install_extension("spatial")
        db.load_extension("spatial")
        return db

    def _load_country_geometry(
        self,
    ) -> Union[Polygon, MultiPolygon]:
        """Load country boundary geometry from DataStore or GADM."""

        gdf_admin0 = AdminBoundaries.create(
            country_code=pycountry.countries.lookup(self.country).alpha_3,
            admin_level=0,
            data_store=self.data_store,
            path=self.country_geom_path,
        ).to_geodataframe()

        return gdf_admin0.geometry.iloc[0]

    def _build_query(self, match_pattern: bool = False, **kwargs) -> str:
        """Constructs and returns the query"""

        if match_pattern:
            amenity_query = " OR ".join(
                [f"category ilike '%{amenity}%'" for amenity in self.amenity_types]
            )
        else:
            amenity_query = " OR ".join(
                [f"category == '{amenity}'" for amenity in self.amenity_types]
            )

        query = """
        SELECT id,
               names.primary AS name,
               ROUND(confidence,2) as confidence,
               categories.primary AS category,
               ST_AsText(geometry) as geometry,
        FROM read_parquet('s3://overturemaps-us-west-2/release/2024-12-18.0/theme=places/type=place/*',
                          hive_partitioning=1)
        WHERE bbox.xmin > {}
        AND bbox.ymin > {}
        AND bbox.xmax < {}
        AND bbox.ymax < {}
        AND ({})
        """

        if not self.geom:
            self.geom = self._load_country_geometry()

        return query.format(*self.geom.bounds, amenity_query)

    def fetch_locations(
        self, match_pattern: bool = False, **kwargs
    ) -> gpd.GeoDataFrame:
        """Fetch and process amenity locations."""
        self.logger.info("Fetching amenity locations from Overture DB...")

        query = self._build_query(match_pattern=match_pattern, **kwargs)

        df = self.connection.execute(query).df()

        self.logger.info("Processing geometries")
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="EPSG:4326"
        )

        # filter by geometry boundary
        s = STRtree(gdf.geometry)
        result = s.query(self.geom, predicate="intersects")

        locations = gdf.iloc[result].reset_index(drop=True)

        self.logger.info(f"Successfully processed {len(locations)} amenity locations")
        return locations
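A minimal usage sketch of the OvertureAmenityFetcher class added above. The country and amenity types are illustrative; the call assumes network access both to the public Overture S3 bucket (read through DuckDB's spatial extension) and to the boundary source used by AdminBoundaries when no DataStore is configured. This sketch is not part of the package.

from gigaspatial.handlers.overture import OvertureAmenityFetcher

# Match any Overture place category containing "school" inside the country boundary.
fetcher = OvertureAmenityFetcher(country="KE", amenity_types=["school"])

# match_pattern=True switches the SQL filter from equality to ILIKE '%...%' matching.
gdf = fetcher.fetch_locations(match_pattern=True)
print(gdf[["name", "category", "confidence"]].head())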
gigaspatial/handlers/rwi.py
@@ -0,0 +1,157 @@
import logging
from pathlib import Path
from typing import List, Optional, Union, Dict, Any, Literal
import pycountry
import tempfile

from pydantic import Field, field_validator

from gigaspatial.core.io.data_store import DataStore
from gigaspatial.handlers.hdx import HDXConfig, HDXDownloader


class RWIConfig(HDXConfig):
    """Configuration for Relative Wealth Index data access"""

    # Override dataset_name to be fixed for RWI
    dataset_name: Literal["relative-wealth-index"] = Field(
        default="relative-wealth-index"
    )

    # Additional RWI-specific configurations
    country: Optional[str] = Field(
        default=None, description="Country ISO code to filter data for"
    )

    @field_validator("country")
    def validate_country(cls, value: str) -> str:
        try:
            return pycountry.countries.lookup(value).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {value}")


class RelativeWealthIndexDownloader(HDXDownloader):
    """Specialized downloader for the Relative Wealth Index dataset from HDX"""

    def __init__(
        self,
        config: Union[RWIConfig, dict] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        if config is None:
            config = RWIConfig()
        elif isinstance(config, dict):
            config = RWIConfig(**config)

        super().__init__(config=config, data_store=data_store, logger=logger)

    @classmethod
    def from_config(
        cls,
        country: Optional[str] = None,
        **kwargs,
    ):
        """Create a downloader with RWI-specific configurations"""
        config = RWIConfig(country=country, **kwargs)
        return cls(config=config)

    def download_dataset(self) -> List[str]:
        """Download RWI dataset, optionally filtering for a specific country.

        If country is specified, attempts to find and download only the resources
        relevant to that country. Otherwise, downloads all RWI resources.

        Returns:
            List of paths to the downloaded files
        """
        # If no country specified, download all resources
        if self.config.country is None:
            return super().download_dataset()

        # Get all resources from the dataset
        try:
            resources = self.get_dataset_resources()
            if not resources:
                self.logger.warning(f"No resources found for RWI dataset")
                return []

            # Prepare country identifiers for matching
            country_code = self.config.country.lower()
            country_name = pycountry.countries.lookup(self.config.country).name.lower()
            country_alpha2 = pycountry.countries.lookup(
                self.config.country
            ).alpha_2.lower()

            # Try different matching patterns
            country_patterns = [
                f"/{country_code}_",  # URL path with ISO3 prefix
                f"/{country_code}.",  # URL path with ISO3 followed by extension
                f"_{country_code}_",  # Filename with ISO3 in middle
                f"_{country_code}.",  # Filename with ISO3 at end
                f"/{country_name.replace(' ', '')}_",  # URL with no spaces
                f"/{country_name.replace(' ', '-')}_",  # URL with hyphens
                f"/{country_alpha2}_",  # URL with ISO2 code
                country_name,  # Country name anywhere in URL
            ]

            # Find matching resources
            matching_resources = []
            for resource in resources:
                # Get the URL safely
                resource_url = resource.get("url", "")
                if not resource_url:
                    continue

                resource_url = resource_url.lower()

                # Check for matches with our patterns
                if any(pattern in resource_url for pattern in country_patterns):
                    matching_resources.append(resource)

            if not matching_resources:
                self.logger.warning(
                    f"No resources matching country '{self.config.country}' were found. "
                    f"Consider downloading the full dataset with country=None and filtering afterwards."
                )
                return []

            # Download the matching resources
            downloaded_paths = []
            for res in matching_resources:
                try:
                    resource_name = res.get("name", "Unknown")
                    self.logger.info(f"Downloading resource: {resource_name}")

                    # Download to a temporary directory
                    with tempfile.TemporaryDirectory() as tmpdir:
                        url, local_path = res.download(folder=tmpdir)
                        # Read the file and write to the DataStore
                        with open(local_path, "rb") as f:
                            data = f.read()
                        # Compose the target path in the DataStore
                        target_path = str(
                            self.config.output_dir_path / Path(local_path).name
                        )
                        self.data_store.write_file(target_path, data)
                        downloaded_paths.append(target_path)

                        self.logger.info(
                            f"Downloaded resource: {resource_name} to {target_path}"
                        )

                except Exception as e:
                    resource_name = res.get("name", "Unknown")
                    self.logger.error(
                        f"Error downloading resource {resource_name}: {str(e)}"
                    )

            return downloaded_paths

        except Exception as e:
            self.logger.error(f"Error during country-filtered download: {str(e)}")

            # Fall back to downloading all resources
            self.logger.info("Falling back to downloading all RWI resources")
            return super().download_dataset()
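A minimal usage sketch of the RelativeWealthIndexDownloader class added above. The country code is illustrative; the call assumes HDX is reachable and that the default DataStore and output_dir_path inherited from the HDXConfig/HDXDownloader base classes are acceptable. This sketch is not part of the package.

from gigaspatial.handlers.rwi import RelativeWealthIndexDownloader

# Download only the RWI resources whose URLs match the country's ISO codes or name;
# with country=None the full relative-wealth-index dataset is downloaded instead.
downloader = RelativeWealthIndexDownloader.from_config(country="KEN")
paths = downloader.download_dataset()
print(paths)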