giga-spatial 0.6.9-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/METADATA +30 -4
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/RECORD +22 -20
- gigaspatial/__init__.py +1 -1
- gigaspatial/config.py +1 -0
- gigaspatial/core/io/adls_data_store.py +104 -11
- gigaspatial/core/io/local_data_store.py +8 -0
- gigaspatial/generators/poi.py +226 -82
- gigaspatial/generators/zonal/base.py +41 -28
- gigaspatial/generators/zonal/geometry.py +91 -41
- gigaspatial/grid/h3.py +417 -0
- gigaspatial/grid/mercator_tiles.py +1 -1
- gigaspatial/handlers/base.py +22 -8
- gigaspatial/handlers/ghsl.py +22 -8
- gigaspatial/handlers/giga.py +9 -4
- gigaspatial/handlers/healthsites.py +350 -0
- gigaspatial/handlers/osm.py +325 -105
- gigaspatial/handlers/worldpop.py +228 -9
- gigaspatial/processing/geo.py +11 -6
- gigaspatial/processing/tif_processor.py +1183 -496
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/WHEEL +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/licenses/LICENSE +0 -0
- {giga_spatial-0.6.9.dist-info → giga_spatial-0.7.1.dist-info}/top_level.txt +0 -0
gigaspatial/handlers/osm.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor
 from requests.exceptions import RequestException
 from shapely.geometry import Polygon, Point
 import pycountry
+from datetime import datetime
 
 from gigaspatial.config import config
 
```
```diff
@@ -112,56 +113,100 @@ class OSMLocationFetcher:
             names.append(name)
         return sorted(set(names))
 
+    @staticmethod
+    def get_osm_countries(
+        iso3_code: Optional[str] = None, include_names: bool = True, timeout: int = 1000
+    ) -> Union[str, Dict[str, str], List[str], List[Dict[str, str]]]:
         """
-            date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
-        else:
-            date_filter = ""
+        Fetch countries from OpenStreetMap database.
 
-        for category, types in self.location_types.items():
-            nodes_relations_queries.extend(
-                [
-                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
-                ]
-            )
+        This queries the actual OSM database for country boundaries and returns
+        country names as they appear in OSM, including various name translations.
 
+        Args:
+            iso3_code (str, optional): ISO 3166-1 alpha-3 code to fetch a specific country.
+                If provided, returns single country data.
+                If None, returns all countries.
+            include_names (bool): If True, return dict with multiple name variants.
+                If False, return only the primary name.
+            timeout (int): Timeout for the Overpass API request (default: 1000).
 
+        Returns:
+            When iso3_code is provided:
+                - If include_names=False: Single country name (str)
+                - If include_names=True: Dict with name variants
+            When iso3_code is None:
+                - If include_names=False: List of country names
+                - If include_names=True: List of dicts with name variants including:
+                    name, name:en, ISO3166-1 codes, and other name translations
+
+        Raises:
+            ValueError: If iso3_code is provided but country not found in OSM.
+        """
+        if iso3_code:
+            # Filter for the specific ISO3 code provided
+            iso3_upper = iso3_code.upper()
+            country_filter = f'["ISO3166-1:alpha3"="{iso3_upper}"]'
+        else:
+            # Filter for the *existence* of an ISO3 code tag to limit results to actual countries
+            country_filter = '["ISO3166-1:alpha3"]'
+
+        # Query OSM for country-level boundaries
+        query = f"""
+        [out:json][timeout:{timeout}];
         (
+            relation["boundary"="administrative"]["admin_level"="2"]{country_filter};
         );
-        out
+        out tags;
         """
 
-                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
-            )
+        url = "http://overpass-api.de/api/interpreter"
+        response = requests.get(url, params={"data": query}, timeout=timeout)
+        response.raise_for_status()
+        data = response.json()
 
+        countries = []
+        for element in data.get("elements", []):
+            tags = element.get("tags", {})
 
+            if include_names:
+                country_info = {
+                    "name": tags.get("name", ""),
+                    "name:en": tags.get("name:en", ""),
+                    "official_name": tags.get("official_name", ""),
+                    "official_name:en": tags.get("official_name:en", ""),
+                    "ISO3166-1": tags.get("ISO3166-1", ""),
+                    "ISO3166-1:alpha2": tags.get("ISO3166-1:alpha2", ""),
+                    "ISO3166-1:alpha3": tags.get("ISO3166-1:alpha3", ""),
+                }
+
+                # Add any other name:* tags (translations)
+                for key, value in tags.items():
+                    if key.startswith("name:") and key not in country_info:
+                        country_info[key] = value
+
+                # Remove empty string values
+                country_info = {k: v for k, v in country_info.items() if v}
+
+                if country_info.get("name"):  # Only add if has a name
+                    countries.append(country_info)
+            else:
+                name = tags.get("name")
+                if name:
+                    countries.append(name)
+
+        # If looking for a specific country, return single result or raise error
+        if iso3_code:
+            if not countries:
+                raise ValueError(
+                    f"Country with ISO3 code '{iso3_code}' not found in OSM database"
+                )
+            return countries[0]  # Return single country, not a list
 
+        # Return sorted list for all countries
+        return sorted(
+            countries, key=lambda x: x if isinstance(x, str) else x.get("name", "")
+        )
 
     def _make_request(self, query: str) -> Dict:
         """Make HTTP request to Overpass API with retry mechanism."""
```
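For orientation, here is a minimal usage sketch of the new `get_osm_countries` static method added above. The ISO3 code is illustrative; the return shapes follow the docstring in this diff:

```python
from gigaspatial.handlers.osm import OSMLocationFetcher

# All countries as a sorted list of plain names
names = OSMLocationFetcher.get_osm_countries(include_names=False)

# One country as a dict of name variants (include_names defaults to True);
# raises ValueError if the ISO3 code is not found in OSM
sen = OSMLocationFetcher.get_osm_countries(iso3_code="SEN")
print(sen.get("name:en"), sen.get("ISO3166-1:alpha2"))
```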
```diff
@@ -213,23 +258,33 @@ class OSMLocationFetcher:
             _lon = element.get("lon") or element["center"]["lon"]
             point_geom = Point(_lon, _lat)
 
-            #
+            # Extract metadata if available
+            metadata = {}
+            if "timestamp" in element:
+                metadata["timestamp"] = element["timestamp"]
+                metadata["version"] = element.get("version")
+                metadata["changeset"] = element.get("changeset")
+                metadata["user"] = element.get("user")
+                metadata["uid"] = element.get("uid")
+
+            # For each matching category, create a separate element
             results = []
             for category, value in matching_categories.items():
-                )
+                result = {
+                    "source_id": element["id"],
+                    "category": category,
+                    "category_value": value,
+                    "name": tags.get("name", ""),
+                    "name_en": tags.get("name:en", ""),
+                    "type": element["type"],
+                    "geometry": point_geom,
+                    "latitude": _lat,
+                    "longitude": _lon,
+                    "matching_categories": list(matching_categories.keys()),
+                }
+                # Add metadata if available
+                result.update(metadata)
+                results.append(result)
 
             return results
```
```diff
@@ -256,36 +311,121 @@ class OSMLocationFetcher:
             polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
             centroid = polygon.centroid
 
+            # Extract metadata if available
+            metadata = {}
+            if "timestamp" in element:
+                metadata["timestamp"] = element["timestamp"]
+                metadata["version"] = element.get("version")
+                metadata["changeset"] = element.get("changeset")
+                metadata["user"] = element.get("user")
+                metadata["uid"] = element.get("uid")
+
             # For each matching category, create a separate element
             results = []
             for category, value in matching_categories.items():
-                )
+                result = {
+                    "source_id": element["id"],
+                    "category": category,
+                    "category_value": value,
+                    "name": tags.get("name", ""),
+                    "name_en": tags.get("name:en", ""),
+                    "type": element["type"],
+                    "geometry": polygon,
+                    "latitude": centroid.y,
+                    "longitude": centroid.x,
+                    "matching_categories": list(matching_categories.keys()),
+                }
+                # Add metadata if available
+                result.update(metadata)
+                results.append(result)
 
             return results
         except (KeyError, ValueError) as e:
             self.logger.error(f"Error processing way geometry: {str(e)}")
             return []
 
+    def _build_queries(
+        self,
+        date_filter_type: Optional[Literal["newer", "changed"]] = None,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        include_metadata: bool = False,
+    ) -> List[str]:
+        """
+        Construct Overpass QL queries with optional date filtering and metadata.
+
+        Args:
+            date_filter_type: Type of date filter ('newer' or 'changed')
+            start_date: Start date in ISO 8601 format
+            end_date: End date in ISO 8601 format (required for 'changed')
+            include_metadata: If True, include change metadata (timestamp, version, changeset, user)
+
+        Returns:
+            List[str]: List of [nodes_relations_query, ways_query]
+        """
+        # Build the date filter based on type
+        if date_filter_type == "newer" and start_date:
+            date_filter = f'(newer:"{start_date}")'
+        elif date_filter_type == "changed" and start_date and end_date:
+            date_filter = f'(changed:"{start_date}","{end_date}")'
+        else:
+            date_filter = ""
+
+        # Determine output mode
+        output_mode = "center meta" if include_metadata else "center"
+        output_mode_geom = "geom meta" if include_metadata else "geom"
+
+        # Query for nodes and relations
+        nodes_relations_queries = []
+        for category, types in self.location_types.items():
+            nodes_relations_queries.extend(
+                [
+                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
+                ]
+            )
+
+        nodes_relations_queries = "\n".join(nodes_relations_queries)
+
+        nodes_relations_query = f"""
+        [out:json][timeout:{self.timeout}];
+        {self.area_query}
+        (
+        {nodes_relations_queries}
+        );
+        out {output_mode};
+        """
+
+        # Query for ways
+        ways_queries = []
+        for category, types in self.location_types.items():
+            ways_queries.append(
+                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
+            )
+
+        ways_queries = "\n".join(ways_queries)
+
+        ways_query = f"""
+        [out:json][timeout:{self.timeout}];
+        {self.area_query}
+        (
+        {ways_queries}
+        );
+        out {output_mode_geom};
+        """
+
+        return [nodes_relations_query, ways_query]
+
     def fetch_locations(
         self,
+        since_date: Optional[Union[str, datetime]] = None,
         handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+        include_metadata: bool = False,
     ) -> pd.DataFrame:
         """
-        Fetch
+        Fetch OSM locations, optionally filtered by 'since' date.
+
+        Use this for incremental updates or getting all current locations.
 
         Args:
             since_year (int, optional): Filter for locations added/modified since this year.
```
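To make the new `_build_queries` output concrete, here is roughly what the nodes/relations query would look like for a hypothetical fetcher configured with `location_types={"amenity": ["school", "college"]}`, a "newer" date filter, and `include_metadata=True`. The `area[...]` line and the timeout value stand in for `self.area_query` and `self.timeout`, which this diff does not show:

```
[out:json][timeout:600];
area["ISO3166-1:alpha3"="SEN"]->.searchArea;
(
node["amenity"~"^(school|college)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
relation["amenity"~"^(school|college)"](newer:"2024-01-01T00:00:00Z")(area.searchArea);
);
out center meta;
```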
```diff
@@ -293,6 +433,8 @@ class OSMLocationFetcher:
                 - 'separate': Create separate entries for each category (default)
                 - 'combine': Use a single entry with a list of matching categories
                 - 'primary': Keep only the first matching category
+            include_metadata: If True, include change tracking metadata
+                (timestamp, version, changeset, user, uid)
 
         Returns:
             pd.DataFrame: Processed OSM locations
```
```diff
@@ -306,10 +448,118 @@ class OSMLocationFetcher:
             f"Fetching OSM locations from Overpass API for country: {self.country}"
         )
         self.logger.info(f"Location types: {self.location_types}")
-        self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")
 
-        #
+        # Normalize date if provided
+        since_str = self._normalize_date(since_date) if since_date else None
+
+        if since_str:
+            self.logger.info(f"Filtering for changes since: {since_str}")
+
+        queries = self._build_queries(
+            date_filter_type="newer" if since_str else None,
+            start_date=since_str,
+            include_metadata=include_metadata,
+        )
+
+        return self._execute_and_process_queries(queries, handle_duplicates)
+
+    def fetch_locations_changed_between(
+        self,
+        start_date: Union[str, datetime],
+        end_date: Union[str, datetime],
+        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
+        include_metadata: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Fetch OSM locations that changed within a specific date range.
+
+        Use this for historical analysis or tracking changes in a specific period.
+
+        Args:
+            start_date: Start date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                or datetime object. Changes after this date will be included.
+            end_date: End date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ")
+                or datetime object. Changes before this date will be included.
+            handle_duplicates: How to handle objects matching multiple categories:
+                - 'separate': Create separate entries for each category (default)
+                - 'combine': Use a single entry with a list of matching categories
+                - 'primary': Keep only the first matching category
+            include_metadata: If True, include change tracking metadata
+                (timestamp, version, changeset, user, uid)
+                Defaults to True since change tracking is the main use case.
+
+        Returns:
+            pd.DataFrame: Processed OSM locations that changed within the date range
+
+        Raises:
+            ValueError: If dates are invalid or start_date is after end_date
+        """
+        start_str = self._normalize_date(start_date)
+        end_str = self._normalize_date(end_date)
+
+        if start_str >= end_str:
+            raise ValueError(
+                f"start_date must be before end_date (got {start_str} >= {end_str})"
+            )
+
+        queries = self._build_queries(
+            date_filter_type="changed",
+            start_date=start_str,
+            end_date=end_str,
+            include_metadata=include_metadata,
+        )
+
+        return self._execute_and_process_queries(queries, handle_duplicates)
+
+    def _normalize_date(self, date_input: Union[str, datetime]) -> str:
+        """
+        Convert date input to ISO 8601 format string.
+
+        Args:
+            date_input: Either a string in ISO 8601 format or a datetime object
+
+        Returns:
+            str: Date in format "YYYY-MM-DDThh:mm:ssZ"
+
+        Raises:
+            ValueError: If string format is invalid
+        """
+        from datetime import datetime
+
+        if isinstance(date_input, datetime):
+            # Convert datetime to ISO 8601 string with Z (UTC) timezone
+            return date_input.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        elif isinstance(date_input, str):
+            # Validate the string format
+            try:
+                # Try to parse it to ensure it's valid
+                datetime.strptime(date_input, "%Y-%m-%dT%H:%M:%SZ")
+                return date_input
+            except ValueError:
+                raise ValueError(
+                    f"Invalid date format: '{date_input}'. "
+                    "Expected format: 'YYYY-MM-DDThh:mm:ssZ' (e.g., '2024-03-15T14:30:00Z')"
+                )
+        else:
+            raise TypeError(
+                f"date_input must be str or datetime, got {type(date_input).__name__}"
+            )
+
+    def _execute_and_process_queries(
+        self, queries: List[str], handle_duplicates: str
+    ) -> pd.DataFrame:
+        """
+        Execute queries and process results (extracted from fetch_locations).
+
+        Args:
+            queries: List of [nodes_relations_query, ways_query]
+            handle_duplicates: Strategy for handling duplicate categories
+
+        Returns:
+            pd.DataFrame: Processed locations
+        """
+        nodes_relations_query, ways_query = queries
 
         # Fetch nodes and relations
         nodes_relations_response = self._make_request(nodes_relations_query)
```
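A hedged end-to-end sketch of the two fetch paths wired up above. The method signatures come from this diff; the constructor arguments are assumptions, since the diff only shows that the fetcher carries `country`, `location_types`, `timeout`, and `area_query` attributes:

```python
from datetime import datetime
from gigaspatial.handlers.osm import OSMLocationFetcher

# Constructor arguments are assumed here for illustration
fetcher = OSMLocationFetcher(country="SEN", location_types={"amenity": ["school"]})

# Incremental update: everything added/modified since a date (str or datetime)
recent = fetcher.fetch_locations(
    since_date="2024-01-01T00:00:00Z", include_metadata=True
)

# Historical window: objects changed between two instants (metadata on by default)
changed = fetcher.fetch_locations_changed_between(
    start_date=datetime(2024, 1, 1), end_date=datetime(2024, 6, 30)
)
```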
```diff
@@ -352,16 +602,14 @@ class OSMLocationFetcher:
             self.logger.warning("No matching elements found after processing")
             return pd.DataFrame()
 
-        # Handle duplicates
+        # Handle duplicates (reuse existing logic from fetch_locations)
         if handle_duplicates != "separate":
-            # Group by source_id
             grouped_elements = {}
             for elem in all_elements:
                 source_id = elem["source_id"]
                 if source_id not in grouped_elements:
                     grouped_elements[source_id] = elem
                 elif handle_duplicates == "combine":
-                    # Combine matching categories
                     if grouped_elements[source_id]["category"] != elem["category"]:
                         if isinstance(grouped_elements[source_id]["category"], str):
                             grouped_elements[source_id]["category"] = [
```
```diff
@@ -381,44 +629,16 @@ class OSMLocationFetcher:
                             grouped_elements[source_id]["category_value"].append(
                                 elem["category_value"]
                             )
-            # For 'primary', just keep the first one we encountered
 
             all_elements = list(grouped_elements.values())
 
         locations = pd.DataFrame(all_elements)
 
-        # Log
+        # Log statistics
         type_counts = locations["type"].value_counts()
         self.logger.info("\nElement type distribution:")
         for element_type, count in type_counts.items():
             self.logger.info(f"{element_type}: {count}")
 
-        # Log category distribution
-        if handle_duplicates == "combine":
-            # Count each category separately when they're in lists
-            category_counts = {}
-            for cats in locations["category"]:
-                if isinstance(cats, list):
-                    for cat in cats:
-                        category_counts[cat] = category_counts.get(cat, 0) + 1
-                else:
-                    category_counts[cats] = category_counts.get(cats, 0) + 1
-
-            self.logger.info("\nCategory distribution:")
-            for category, count in category_counts.items():
-                self.logger.info(f"{category}: {count}")
-        else:
-            category_counts = locations["category"].value_counts()
-            self.logger.info("\nCategory distribution:")
-            for category, count in category_counts.items():
-                self.logger.info(f"{category}: {count}")
-
-        # Log elements with multiple matching categories
-        multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
-        if multi_category:
-            self.logger.info(
-                f"\n{len(multi_category)} elements matched multiple categories"
-            )
-
         self.logger.info(f"Successfully processed {len(locations)} locations")
         return locations
```
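Finally, continuing from the fetcher sketched above, a short sketch of how the retained `handle_duplicates` strategies shape the resulting DataFrame. Column names are taken from the element dicts built in this diff; the filtering line is illustrative:

```python
# 'separate' (default): one row per matching category
# 'combine': one row; `category`/`category_value` hold lists when an element
#            matched several configured categories
# 'primary': one row; the first matching category wins
df = fetcher.fetch_locations(handle_duplicates="combine")

# `matching_categories` always lists every match, so multi-category
# elements can be isolated regardless of strategy:
multi = df[df["matching_categories"].apply(len) > 1]
```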