PyPI - giga-spatial - Versions diffs - 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl - Mend

giga-spatial 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

{giga_spatial-0.6.3.dist-info → giga_spatial-0.6.5.dist-info}/METADATA +2 -1
giga_spatial-0.6.5.dist-info/RECORD +50 -0
gigaspatial/__init__.py +1 -1
gigaspatial/config.py +35 -4
gigaspatial/core/io/__init__.py +1 -0
gigaspatial/core/io/database.py +316 -0
gigaspatial/generators/__init__.py +5 -1
gigaspatial/generators/poi.py +228 -43
gigaspatial/generators/zonal/__init__.py +2 -1
gigaspatial/generators/zonal/admin.py +84 -0
gigaspatial/generators/zonal/base.py +221 -64
gigaspatial/generators/zonal/geometry.py +74 -31
gigaspatial/generators/zonal/mercator.py +50 -19
gigaspatial/grid/__init__.py +1 -1
gigaspatial/grid/mercator_tiles.py +33 -10
gigaspatial/handlers/__init__.py +5 -1
gigaspatial/handlers/boundaries.py +226 -48
gigaspatial/handlers/ghsl.py +79 -14
gigaspatial/handlers/giga.py +641 -0
gigaspatial/handlers/hdx.py +50 -51
gigaspatial/handlers/maxar_image.py +1 -2
gigaspatial/handlers/rwi.py +5 -2
gigaspatial/processing/algorithms.py +188 -0
gigaspatial/processing/geo.py +87 -25
gigaspatial/processing/tif_processor.py +220 -45
giga_spatial-0.6.3.dist-info/RECORD +0 -47
{giga_spatial-0.6.3.dist-info → giga_spatial-0.6.5.dist-info}/WHEEL +0 -0
{giga_spatial-0.6.3.dist-info → giga_spatial-0.6.5.dist-info}/licenses/LICENSE +0 -0
{giga_spatial-0.6.3.dist-info → giga_spatial-0.6.5.dist-info}/top_level.txt +0 -0

gigaspatial/handlers/hdx.py CHANGED Viewed

@@ -1,13 +1,9 @@
-import os
 import logging
 from tqdm import tqdm
 from pathlib import Path
-from typing import List, Optional, Tuple, Union, Dict, Any, Iterable
+from typing import List, Optional, Union, Dict, Any, Iterable
 import tempfile
-import functools
-import multiprocessing
-import pandas as pd
 import geopandas as gpd
 from pydantic import Field, ConfigDict
 from pydantic.dataclasses import dataclass
@@ -50,6 +46,48 @@ class HDXConfig(BaseHandlerConfig):
     _hdx_configured: bool = Field(default=False, init=False)
     dataset: Optional[Dataset] = Field(default=None, init=False)
+    @staticmethod
+    def search_datasets(
+        query: str,
+        rows: int = None,
+        sort: str = "relevance asc, metadata_modified desc",
+        hdx_site: str = "prod",
+        user_agent: str = "gigaspatial",
+    ) -> List[Dict]:
+        """Search for datasets in HDX before initializing the class.
+        Args:
+            query: Search query string
+            rows: Number of results per page. Defaults to all datasets (sys.maxsize).
+            sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
+            hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
+            user_agent: User agent for HDX API requests (default: 'gigaspatial')
+        Returns:
+            List of dataset dictionaries containing search results
+        Example:
+            >>> results = HDXConfig.search_datasets("population", rows=5)
+            >>> for dataset in results:
+            >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
+        """
+        try:
+            Configuration.create(
+                hdx_site=hdx_site,
+                user_agent=user_agent,
+                hdx_read_only=True,
+            )
+        except:
+            pass
+        try:
+            results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)
+            return results
+        except Exception as e:
+            logging.error(f"Error searching HDX datasets: {str(e)}")
+            raise
     def __post_init__(self):
         super().__post_init__()
         try:
@@ -85,7 +123,11 @@ class HDXConfig(BaseHandlerConfig):
             self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
             dataset = Dataset.read_from_hdx(self.dataset_name)
             if not dataset:
-                raise ValueError(f"Dataset '{self.dataset_name}' not found on HDX")
+                raise ValueError(
+                    f"Dataset '{self.dataset_name}' not found on HDX. "
+                    "Please verify the dataset name or use search_datasets() "
+                    "to find available datasets."
+                )
             return dataset
         except Exception as e:
             self.logger.error(f"Error fetching HDX dataset: {str(e)}")
@@ -386,9 +428,9 @@ class HDXReader(BaseHandlerReader):
         self, source_data_path: List[Union[str, Path]], **kwargs
     ) -> Any:
         """Load data from paths"""
-        if len(source_data_path)==1:
+        if len(source_data_path) == 1:
             return read_dataset(self.data_store, source_data_path[0])
         all_data = {}
         for file_path in source_data_path:
             try:
@@ -401,49 +443,6 @@ class HDXReader(BaseHandlerReader):
         resources = self.config.list_resources()
         return self.load_from_paths(resources)
-    # def read_resource(
-    #     self, resource_file: str
-    # ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
-    #     """Read a specific resource file from the dataset using the data_store."""
-    #     if not self.dataset_path:
-    #         raise ValueError("No dataset path configured")
-    #     file_path = str(self.dataset_path / resource_file)
-    #     if not self.data_store.file_exists(file_path):
-    #         raise FileNotFoundError(
-    #             f"Resource file {resource_file} not found in dataset"
-    #         )
-    #     try:
-    #         return read_dataset(self.data_store, file_path)
-    #     except Exception as e:
-    #         raise ValueError(f"Could not read file {file_path}: {str(e)}")
-    # def read_all_resources(self) -> Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]:
-    #     """Read all resources in the dataset directory using the data_store."""
-    #     resources = self.list_resources()
-    #     result = {}
-    #     for resource in resources:
-    #         try:
-    #             result[resource] = self.read_resource(resource)
-    #         except Exception as e:
-    #             self.logger.warning(f"Could not read resource {resource}: {str(e)}")
-    #     return result
-    # def load_from_paths(
-    #     self, source_data_path: List[Union[str, Path]], **kwargs
-    # ) -> Union[
-    #     pd.DataFrame, gpd.GeoDataFrame, Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame]]
-    # ]:
-    #     """Load data from paths"""
-    #     if len(source_data_path) == 1:
-    #         return self.read_resource(str(source_data_path[0]))
-    #     else:
-    #         return self.read_all_resources()
 class HDXHandler(BaseHandler):
     """Handler for HDX datasets"""

gigaspatial/handlers/maxar_image.py CHANGED Viewed

@@ -14,7 +14,6 @@ from gigaspatial.processing.geo import (
     convert_to_geodataframe,
     buffer_geodataframe,
 )
-from gigaspatial.processing.sat_images import calculate_pixels_at_location
 from gigaspatial.config import config as global_config
@@ -142,7 +141,7 @@ class MaxarImageDownloader:
                 self.logger.warning(
                     f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
                 )
-                if attempt < self.max_retries - 1:
+                if attempt < self.config.max_retries - 1:
                     sleep(self.config.retry_delay)
                 else:
                     self.logger.warning(

gigaspatial/handlers/rwi.py CHANGED Viewed

@@ -2,6 +2,7 @@ import logging
 from typing import List, Optional, Union, Literal
 from pydantic.dataclasses import dataclass
 from datetime import datetime
+import pycountry
 from hdx.data.resource import Resource
@@ -36,8 +37,10 @@ class RWIConfig(HDXConfig):
         self, country: str, **kwargs
     ) -> List[Resource]:
         """Get relevant data units for a country, optionally filtering for latest version"""
-        resources = super().get_relevant_data_units_by_country(
-            country=country, key="url"
+        country = pycountry.countries.lookup(country)
+        values = [country.alpha_3]
+        resources = self.get_dataset_resources(
+            filter={"url": values},
         )
         if self.latest_only and len(resources) > 1:

gigaspatial/processing/algorithms.py ADDED Viewed

@@ -0,0 +1,188 @@
+import sys, os
+import numpy as np
+from typing import Literal, List, Tuple, Union, Optional
+import geopandas as gpd
+import pandas as pd
+from scipy.spatial import cKDTree
+import networkx as nx
+from gigaspatial.processing.geo import (
+    convert_to_geodataframe,
+)
+from gigaspatial.config import config
+LOGGER = config.get_logger("GigaSpatialProcessing")
+def build_distance_graph(
+    left_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+    right_df: Union[pd.DataFrame, gpd.GeoDataFrame],
+    distance_threshold: float,
+    max_k: int = 100,
+    return_dataframe: bool = False,
+    verbose: bool = True,
+    exclude_same_index: Optional[bool] = None,
+) -> Union[nx.Graph, Tuple[nx.Graph, pd.DataFrame]]:
+    """
+    Build a graph of spatial matches between two dataframes using KD-tree.
+    Args:
+        left_df: Left dataframe to match from
+        right_df: Right dataframe to match to
+        distance_threshold: Maximum distance for matching (in meters)
+        max_k: Maximum number of neighbors to consider per point (default: 100)
+        return_dataframe: If True, also return the matches DataFrame
+        verbose: If True, print statistics about the graph
+        exclude_same_index: If True, exclude self-matches. If None, auto-detect based on df equality
+    Returns:
+        NetworkX Graph, or tuple of (Graph, DataFrame) if return_dataframe=True
+    Raises:
+        ValueError: If distance_threshold is negative or max_k is not positive
+    """
+    # Input validation
+    if distance_threshold < 0:
+        raise ValueError("distance_threshold must be non-negative")
+    if max_k <= 0:
+        raise ValueError("max_k must be positive")
+    if left_df.empty or right_df.empty:
+        if verbose:
+            LOGGER.warning("Warning: One or both dataframes are empty")
+        G = nx.Graph()
+        return (G, pd.DataFrame()) if return_dataframe else G
+    def get_utm_coordinates(df: Union[pd.DataFrame, gpd.GeoDataFrame]) -> np.ndarray:
+        """Extract coordinates as numpy array in UTM projection."""
+        if isinstance(df, pd.DataFrame):
+            gdf = convert_to_geodataframe(df)
+        else:
+            gdf = df.copy()
+        # More robust UTM CRS estimation
+        try:
+            gdf_utm = gdf.to_crs(gdf.estimate_utm_crs())
+        except Exception as e:
+            if verbose:
+                LOGGER.warning(
+                    f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+                )
+            gdf_utm = gdf.to_crs("EPSG:3857")  # Fallback to Web Mercator
+        return gdf_utm.get_coordinates().to_numpy()
+    # Auto-detect same dataframe case
+    if exclude_same_index is None:
+        exclude_same_index = left_df.equals(right_df)
+        if verbose and exclude_same_index:
+            LOGGER.info("Auto-detected same dataframe - excluding self-matches")
+    # Get coordinates
+    left_coords = get_utm_coordinates(left_df)
+    right_coords = (
+        get_utm_coordinates(right_df) if not exclude_same_index else left_coords
+    )
+    # Build KD-tree and query
+    kdtree = cKDTree(right_coords)
+    # Use the provided max_k parameter, but don't exceed available points
+    k_to_use = min(max_k, len(right_coords))
+    if verbose and k_to_use < max_k:
+        LOGGER.info(
+            f"Note: max_k ({max_k}) reduced to {k_to_use} (number of available points)"
+        )
+    # Note: Distance calculations here are based on Euclidean distance in UTM projection.
+    # This can introduce errors up to ~50 cm for a 50 meter threshold, especially near the poles where distortion increases.
+    distances, indices = kdtree.query(
+        left_coords, k=k_to_use, distance_upper_bound=distance_threshold
+    )
+    # Handle single k case (when k_to_use = 1, results are 1D)
+    if distances.ndim == 1:
+        distances = distances.reshape(-1, 1)
+        indices = indices.reshape(-1, 1)
+    # Extract valid pairs using vectorized operations
+    left_indices = np.arange(len(distances))[:, np.newaxis]
+    left_indices = np.broadcast_to(left_indices, distances.shape)
+    valid_mask = np.isfinite(distances)
+    if exclude_same_index:
+        same_index_mask = left_indices == indices
+        valid_mask = valid_mask & ~same_index_mask
+    valid_left = left_indices[valid_mask]
+    valid_right = indices[valid_mask]
+    valid_distances = distances[valid_mask]
+    # Map back to original indices
+    valid_left_indices = left_df.index.values[valid_left]
+    valid_right_indices = right_df.index.values[valid_right]
+    # Create matches DataFrame
+    matches_df = pd.DataFrame(
+        {
+            "left_idx": valid_left_indices,
+            "right_idx": valid_right_indices,
+            "distance": valid_distances,
+        }
+    )
+    # Build graph more efficiently
+    G = nx.from_pandas_edgelist(
+        matches_df,
+        source="left_idx",
+        target="right_idx",
+        edge_attr="distance",
+        create_using=nx.Graph(),
+    )
+    # Add isolated nodes (nodes without any matches within threshold)
+    # This ensures all original indices are represented in the graph
+    all_left_nodes = set(left_df.index.values)
+    all_right_nodes = set(right_df.index.values)
+    if not exclude_same_index:
+        all_nodes = all_left_nodes | all_right_nodes
+    else:
+        all_nodes = all_left_nodes  # Same dataframe, so same node set
+    # Add nodes that don't have edges
+    existing_nodes = set(G.nodes())
+    isolated_nodes = all_nodes - existing_nodes
+    G.add_nodes_from(isolated_nodes)
+    # Print statistics
+    if verbose:
+        print(
+            f"Total potential matches: {len(left_df)} × {len(right_df)} = {len(left_df) * len(right_df):,}"
+        )
+        print(f"Matches found within {distance_threshold}m: {len(matches_df):,}")
+        print(f"Graph nodes: {G.number_of_nodes():,}")
+        print(f"Graph edges: {G.number_of_edges():,}")
+        components = list(nx.connected_components(G))
+        print(f"Connected components: {len(components):,}")
+        if len(components) > 1:
+            component_sizes = [len(c) for c in components]
+            print(f"Largest component size: {max(component_sizes):,}")
+            print(
+                f"Isolated nodes: {sum(1 for size in component_sizes if size == 1):,}"
+            )
+        if len(matches_df) > 0:
+            print(
+                f"Distance stats - min: {matches_df['distance'].min():.1f}m, "
+                f"max: {matches_df['distance'].max():.1f}m, "
+                f"mean: {matches_df['distance'].mean():.1f}m"
+            )
+    return (G, matches_df) if return_dataframe else G

gigaspatial/processing/geo.py CHANGED Viewed

@@ -272,8 +272,13 @@ def buffer_geodataframe(
     input_crs = gdf_work.crs
     try:
-        # Create a custom UTM CRS based on the calculated UTM zone
-        utm_crs = gdf_work.estimate_utm_crs()
+        try:
+            utm_crs = gdf_work.estimate_utm_crs()
+        except Exception as e:
+            LOGGER.warning(
+                f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+            )
+            utm_crs = "EPSG:3857"  # Fallback to Web Mercator
         # Transform to UTM, create buffer, and transform back
         gdf_work = gdf_work.to_crs(utm_crs)
@@ -452,7 +457,13 @@ def add_area_in_meters(
     gdf_with_area = gdf.copy()
     # Calculate the UTM CRS for accurate area calculation
-    utm_crs = gdf_with_area.estimate_utm_crs()
+    try:
+        utm_crs = gdf_with_area.estimate_utm_crs()
+    except Exception as e:
+        LOGGER.warning(
+            f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+        )
+        utm_crs = "EPSG:3857"  # Fallback to Web Mercator
     # Transform to UTM CRS and calculate the area in square meters
     gdf_with_area[area_column_name] = gdf_with_area.to_crs(utm_crs).geometry.area
@@ -858,39 +869,79 @@ def aggregate_polygons_to_zones(
     zones: gpd.GeoDataFrame,
     value_columns: Union[str, List[str]],
     aggregation: Union[str, Dict[str, str]] = "sum",
-    area_weighted: bool = True,
+    predicate: Literal["intersects", "within", "fractional"] = "intersects",
     zone_id_column: str = "zone_id",
     output_suffix: str = "",
     drop_geometry: bool = False,
 ) -> gpd.GeoDataFrame:
     """
-    Aggregate polygon data to zones with area-weighted values.
+    Aggregates polygon data to zones based on a specified spatial relationship.
-    This function maps polygon data to zones, weighting values by the
-    fractional area of overlap between polygons and zones.
+    This function performs a spatial join between polygons and zones and then
+    aggregates values from the polygons to their corresponding zones. The aggregation
+    method depends on the `predicate` parameter, which determines the nature of the
+    spatial relationship.
     Args:
-        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): Polygon data to aggregate
-        zones (gpd.GeoDataFrame): Zones to aggregate polygons to
-        value_columns (Union[str, List[str]]): Column(s) containing values to aggregate
-        aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use:
-            - Single string: Use same method for all columns ("sum", "mean", "max", etc.)
-            - Dict: Map column names to aggregation methods
-        area_weighted (bool): Whether to weight values by fractional area overlap
-            If False, values are not weighted before aggregation
-        zone_id_column (str): Column in zones containing zone identifiers
-        output_suffix (str): Suffix to add to output column names
-        drop_geometry (bool): Whether to drop the geometry column from output
+        polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
+            Polygon data to aggregate. Must be a GeoDataFrame or convertible to one.
+        zones (gpd.GeoDataFrame):
+            The target zones to which the polygon data will be aggregated.
+        value_columns (Union[str, List[str]]):
+            The column(s) in `polygons` containing the numeric values to aggregate.
+        aggregation (Union[str, Dict[str, str]], optional):
+            The aggregation method(s) to use. Can be a single string (e.g., "sum",
+            "mean", "max") to apply the same method to all columns, or a dictionary
+            mapping column names to aggregation methods (e.g., `{'population': 'sum'}`).
+            Defaults to "sum".
+        predicate (Literal["intersects", "within", "fractional"], optional):
+            The spatial relationship to use for aggregation:
+            - "intersects": Aggregates values for any polygon that intersects a zone.
+            - "within": Aggregates values for polygons entirely contained within a zone.
+            - "fractional": Performs area-weighted aggregation. The value of a polygon
+              is distributed proportionally to the area of its overlap with each zone.
+              This requires calculating a UTM CRS for accurate area measurements.
+            Defaults to "intersects".
+        zone_id_column (str, optional):
+            The name of the column in `zones` that contains the unique zone identifiers.
+            Defaults to "zone_id".
+        output_suffix (str, optional):
+            A suffix to add to the names of the new aggregated columns in the output
+            GeoDataFrame. Defaults to "".
+        drop_geometry (bool, optional):
+            If True, the geometry column will be dropped from the output GeoDataFrame.
+            Defaults to False.
     Returns:
-        gpd.GeoDataFrame: Zones with aggregated polygon values
+        gpd.GeoDataFrame:
+            The `zones` GeoDataFrame with new columns containing the aggregated values.
+            Zones with no intersecting or contained polygons will have `0` values.
+    Raises:
+        TypeError: If `zones` is not a GeoDataFrame or `polygons` cannot be converted.
+        ValueError: If `zone_id_column` or any `value_columns` are not found, or
+                    if the geometry types in `polygons` are not polygons.
+        RuntimeError: If an error occurs during the area-weighted aggregation process.
     Example:
-        >>> landuse_stats = aggregate_polygons_to_zones(
+        >>> import geopandas as gpd
+        >>> # Assuming 'landuse_polygons' and 'grid_zones' are GeoDataFrames
+        >>> # Aggregate total population within each grid zone using area-weighting
+        >>> pop_by_zone = aggregate_polygons_to_zones(
+        ...     landuse_polygons,
+        ...     grid_zones,
+        ...     value_columns="population",
+        ...     predicate="fractional",
+        ...     aggregation="sum",
+        ...     output_suffix="_pop"
+        ... )
+        >>> # Aggregate the count of landuse parcels intersecting each zone
+        >>> count_by_zone = aggregate_polygons_to_zones(
         ...     landuse_polygons,
         ...     grid_zones,
-        ...     value_columns=["area", "population"],
-        ...     aggregation="sum"
+        ...     value_columns="parcel_id",
+        ...     predicate="intersects",
+        ...     aggregation="count"
         ... )
     """
     # Input validation
@@ -900,6 +951,11 @@ def aggregate_polygons_to_zones(
     if zone_id_column not in zones.columns:
         raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")
+    if predicate not in ["intersects", "within", "fractional"]:
+        raise ValueError(
+            f"Unsupported predicate: {predicate}. Predicate can be one of `intersects`, `within`, `fractional`"
+        )
     # Convert polygons to GeoDataFrame if necessary
     if not isinstance(polygons, gpd.GeoDataFrame):
         try:
@@ -956,11 +1012,17 @@ def aggregate_polygons_to_zones(
     # Create a copy of the zones
     result = zones.copy()
-    if area_weighted:
+    if predicate == "fractional":
         # Use area-weighted aggregation with polygon overlay
         try:
             # Compute UTM CRS for accurate area calculations
-            overlay_utm_crs = polygons_gdf.estimate_utm_crs()
+            try:
+                overlay_utm_crs = polygons_gdf.estimate_utm_crs()
+            except Exception as e:
+                LOGGER.warning(
+                    f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
+                )
+                overlay_utm_crs = "EPSG:3857"  # Fallback to Web Mercator
             # Prepare polygons for overlay
             polygons_utm = polygons_gdf.to_crs(overlay_utm_crs)
@@ -1020,7 +1082,7 @@ def aggregate_polygons_to_zones(
     else:
         # Non-weighted aggregation - simpler approach
         # Perform spatial join
-        joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate="intersects")
+        joined = gpd.sjoin(polygons_gdf, zones, how="inner", predicate=predicate)
         # Remove geometry column for aggregation
         if "geometry" in joined.columns:

giga-spatial 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

giga-spatial 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl