PyPI - giga-spatial - Versions diffs - 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl - Mend

giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/METADATA +3 -1
giga_spatial-0.6.6.dist-info/RECORD +50 -0
gigaspatial/__init__.py +1 -1
gigaspatial/config.py +29 -4
gigaspatial/core/io/__init__.py +1 -0
gigaspatial/core/io/data_api.py +3 -1
gigaspatial/core/io/database.py +319 -0
gigaspatial/generators/__init__.py +5 -1
gigaspatial/generators/poi.py +300 -52
gigaspatial/generators/zonal/__init__.py +2 -1
gigaspatial/generators/zonal/admin.py +84 -0
gigaspatial/generators/zonal/base.py +237 -81
gigaspatial/generators/zonal/geometry.py +151 -53
gigaspatial/generators/zonal/mercator.py +50 -19
gigaspatial/grid/__init__.py +1 -1
gigaspatial/grid/mercator_tiles.py +33 -10
gigaspatial/handlers/__init__.py +8 -1
gigaspatial/handlers/base.py +26 -6
gigaspatial/handlers/boundaries.py +93 -18
gigaspatial/handlers/ghsl.py +92 -15
gigaspatial/handlers/rwi.py +5 -2
gigaspatial/handlers/worldpop.py +771 -186
gigaspatial/processing/algorithms.py +188 -0
gigaspatial/processing/geo.py +204 -102
gigaspatial/processing/tif_processor.py +220 -45
giga_spatial-0.6.4.dist-info/RECORD +0 -47
{giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/WHEEL +0 -0
{giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/licenses/LICENSE +0 -0
{giga_spatial-0.6.4.dist-info → giga_spatial-0.6.6.dist-info}/top_level.txt +0 -0

gigaspatial/generators/zonal/base.py CHANGED Viewed

@@ -13,7 +13,6 @@ from gigaspatial.core.io.local_data_store import LocalDataStore
 from gigaspatial.core.io.writers import write_dataset
 from gigaspatial.config import config as global_config
 from gigaspatial.processing.geo import (
-    convert_to_geodataframe,
     aggregate_polygons_to_zones,
     aggregate_points_to_zones,
 )
@@ -77,6 +76,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
         self.config = config or ZonalViewGeneratorConfig()
         self.data_store = data_store or LocalDataStore()
         self.logger = logger or global_config.get_logger(self.__class__.__name__)
+        self._view: Optional[pd.DataFrame] = None
     @abstractmethod
     def get_zonal_geometries(self) -> List[Polygon]:
@@ -103,7 +103,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
         """
         pass
-    def to_geodataframe(self) -> gpd.GeoDataFrame:
+    def get_zone_geodataframe(self) -> gpd.GeoDataFrame:
         """Convert zones to a GeoDataFrame.
         Creates a GeoDataFrame containing zone identifiers and their corresponding
@@ -131,9 +131,77 @@ class ZonalViewGenerator(ABC, Generic[T]):
                 and identifiers.
         """
         if not hasattr(self, "_zone_gdf"):
-            self._zone_gdf = self.to_geodataframe()
+            self._zone_gdf = self.get_zone_geodataframe()
         return self._zone_gdf
+    @property
+    def view(self) -> pd.DataFrame:
+        """The DataFrame representing the current zonal view.
+        Returns:
+            pd.DataFrame: The DataFrame containing zone IDs, and
+                              any added variables. If no variables have been added,
+                              it returns the base `zone_gdf` without geometries.
+        """
+        if self._view is None:
+            self._view = self.zone_gdf.drop(columns="geometry")
+        return self._view
+    def add_variable_to_view(self, data_dict: Dict, column_name: str) -> None:
+        """
+        Adds a new variable (column) to the zonal view GeoDataFrame.
+        This method takes a dictionary (typically the result of map_points or map_polygons)
+        and adds its values as a new column to the internal `_view` (or `zone_gdf` if not yet initialized).
+        The dictionary keys are expected to be the `zone_id` values.
+        Args:
+            data_dict (Dict): A dictionary where keys are `zone_id`s and values are
+                              the data to be added.
+            column_name (str): The name of the new column to be added to the GeoDataFrame.
+        Raises:
+            ValueError: If the `data_dict` keys do not match the `zone_id`s in the zonal view.
+                        If the `column_name` already exists in the zonal view.
+        """
+        if self._view is None:
+            self._view = self.zone_gdf.drop(columns="geometry")
+        if column_name in self._view.columns:
+            raise ValueError(
+                f"Column '{column_name}' already exists in the zonal view."
+            )
+        # Create a pandas Series from the dictionary, aligning by index (zone_id)
+        new_series = pd.Series(data_dict, name=column_name)
+        # Before merging, ensure the zone_ids in data_dict match those in _view
+        missing_zones_in_data = set(self._view["zone_id"]) - set(new_series.index)
+        extra_zones_in_data = set(new_series.index) - set(self._view["zone_id"])
+        if missing_zones_in_data:
+            self.logger.warning(
+                f"Warning: {len(missing_zones_in_data)} zone(s) from the zonal view "
+                f"are missing in the provided data_dict for column '{column_name}'. "
+                f"These zones will have NaN values for '{column_name}'. Missing: {list(missing_zones_in_data)[:5]}..."
+            )
+        if extra_zones_in_data:
+            self.logger.warning(
+                f"Warning: {len(extra_zones_in_data)} zone(s) in the provided data_dict "
+                f"are not present in the zonal view for column '{column_name}'. "
+                f"These will be ignored. Extra: {list(extra_zones_in_data)[:5]}..."
+            )
+        # Merge the new series with the _view based on 'zone_id'
+        # Using .set_index() for efficient alignment
+        original_index_name = self._view.index.name
+        self._view = self._view.set_index("zone_id").join(new_series).reset_index()
+        if original_index_name:  # Restore original index name if it existed
+            self._view.index.name = original_index_name
+        else:  # If it was a default integer index, ensure it's not named 'index'
+            self._view.index.name = None
+        self.logger.info(f"Added variable '{column_name}' to the zonal view.")
     def map_points(
         self,
         points: Union[pd.DataFrame, gpd.GeoDataFrame],
@@ -173,98 +241,144 @@ class ZonalViewGenerator(ABC, Generic[T]):
         if mapping_function is not None:
             return mapping_function(self, points, **mapping_kwargs)
-        else:
+        self.logger.warning(
+            "Using default points mapping implementation. Consider creating a specialized mapping function."
+        )
+        result = aggregate_points_to_zones(
+            points=points,
+            zones=self.zone_gdf,
+            value_columns=value_columns,
+            aggregation=aggregation,
+            point_zone_predicate=predicate,
+            zone_id_column="zone_id",
+            output_suffix=output_suffix,
+        )
+        if isinstance(value_columns, str):
+            return result.set_index("zone_id")[value_columns].to_dict()
+        elif isinstance(value_columns, list):
+            # If multiple value columns, return a dictionary of dictionaries
+            # Or, if preferred, a dictionary where values are lists/tuples of results
+            # For now, let's return a dict of series, which is common.
+            # The previous version implied a single dictionary result from map_points/polygons
+            # but with multiple columns, it's usually {zone_id: {col1: val1, col2: val2}}
+            # or {col_name: {zone_id: val}}
+            # In this version, it'll return a dictionary for each column.
+            return {
+                col: result.set_index("zone_id")[col].to_dict() for col in value_columns
+            }
+        else:  # If value_columns is None, it should return point_count
             self.logger.warning(
-                "Using default points mapping implementation. Consider creating a specialized mapping function."
+                "No `value_columns` provided. Mapping point counts. Consider passing `value_columns` and `aggregation` or `mapping_function`."
             )
-            result = aggregate_points_to_zones(
-                points=points,
-                zones=self.zone_gdf,
-                value_columns=value_columns,
-                aggregation=aggregation,
-                point_zone_predicate=predicate,
-                zone_id_column="zone_id",
-                output_suffix=output_suffix,
-            )
-            if not value_columns:
-                return result["point_count"].to_dict()
-            return result[value_columns].to_dict()
+            return result.set_index("zone_id")["point_count"].to_dict()
     def map_polygons(
         self,
-        polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
+        polygons,
         value_columns: Optional[Union[str, List[str]]] = None,
-        aggregation: Union[str, Dict[str, str]] = "sum",
-        area_weighted: bool = False,
-        area_column: str = "area_in_meters",
-        mapping_function: Optional[Callable] = None,
-        **mapping_kwargs,
+        aggregation: Union[str, Dict[str, str]] = "count",
+        predicate: str = "intersects",
+        **kwargs,
     ) -> Dict:
-        """Map polygon data to zones with optional area weighting.
+        """
+        Maps polygon data to the instance's zones and aggregates values.
-        Aggregates polygon data to zones based on spatial intersections. Values can be
-        weighted by the fractional area of intersection between polygons and zones.
+        This method leverages `aggregate_polygons_to_zones` to perform a spatial
+        aggregation of polygon data onto the zones stored within this object instance.
+        It can count polygons, or aggregate their values, based on different spatial
+        relationships defined by the `predicate`.
         Args:
-            polygons (Union[pd.DataFrame, gpd.GeoDataFrame]): The polygon data to map.
-                Must contain geometry information if DataFrame.
-            value_columns (Union[str, List[str]], optional): Column name(s) to aggregate.
-                If None, only intersection areas will be calculated.
-            aggregation (Union[str, Dict[str, str]]): Aggregation method(s) to use.
-                Can be a single string ("sum", "mean", "max", "min") or a dictionary
-                mapping column names to specific aggregation methods. Defaults to "sum".
-            area_weighted (bool): Whether to weight values by fractional area of
-                intersection. Defaults to False.
-            area_column (str): Name of column to store calculated areas. Only used
-                if area calculation is needed. Defaults to "area_in_meters".
-            mapping_function (Callable, optional): Custom function for mapping polygons
-                to zones. If provided, signature should be mapping_function(self, polygons, **mapping_kwargs).
-                When used, all other parameters except mapping_kwargs are ignored.
-            **mapping_kwargs: Additional keyword arguments passed to the mapping function.
+            polygons (Union[pd.DataFrame, gpd.GeoDataFrame]):
+                The polygon data to map. Must contain geometry information if a
+                DataFrame.
+            value_columns (Union[str, List[str]], optional):
+                The column name(s) from the `polygons` data to aggregate. If `None`,
+                the method will automatically count the number of polygons that
+                match the given `predicate` for each zone.
+            aggregation (Union[str, Dict[str, str]], optional):
+                The aggregation method(s) to use. Can be a single string (e.g., "sum",
+                "mean", "max") or a dictionary mapping column names to specific
+                aggregation methods. This is ignored and set to "count" if
+                `value_columns` is `None`. Defaults to "count".
+            predicate (Literal["intersects", "within", "fractional"], optional):
+                The spatial relationship to use for aggregation:
+                - "intersects": Counts or aggregates values for any polygon that
+                  intersects a zone.
+                - "within": Counts or aggregates values for polygons that are
+                  entirely contained within a zone.
+                - "fractional": Performs area-weighted aggregation. The value of a
+                  polygon is distributed proportionally to the area of its overlap
+                  with each zone.
+                Defaults to "intersects".
+            **kwargs:
+                Additional keyword arguments to be passed to the underlying
+                `aggregate_polygons_to_zones_new` function.
         Returns:
-            Dict: Dictionary with zone IDs as keys and aggregated values as values.
-                Returns aggregated values for the specified value_columns.
+            Dict:
+                A dictionary or a nested dictionary containing the aggregated values,
+                with zone IDs as keys. If `value_columns` is a single string, the
+                return value is a dictionary mapping zone ID to the aggregated value.
+                If `value_columns` is a list, the return value is a nested dictionary
+                mapping each column name to its own dictionary of aggregated values.
         Raises:
-            TypeError: If polygons cannot be converted to a GeoDataFrame.
+            ValueError: If `value_columns` is of an unexpected type after processing.
+        Example:
+            >>> # Assuming 'self' is an object with a 'zone_gdf' attribute
+            >>> # Count all land parcels that intersect each zone
+            >>> parcel_counts = self.map_polygons(landuse_polygons)
+            >>>
+            >>> # Aggregate total population within zones using area weighting
+            >>> population_by_zone = self.map_polygons(
+            ...     landuse_polygons,
+            ...     value_columns="population",
+            ...     predicate="fractional",
+            ...     aggregation="sum"
+            ... )
+            >>>
+            >>> # Get the sum of residential area and count of buildings within each zone
+            >>> residential_stats = self.map_polygons(
+            ...     building_polygons,
+            ...     value_columns=["residential_area_sqm", "building_id"],
+            ...     aggregation={"residential_area_sqm": "sum", "building_id": "count"},
+            ...     predicate="intersects"
+            ... )
         """
-        if mapping_function is not None:
-            return mapping_function(self, polygons, **mapping_kwargs)
-        if area_column not in polygons_gdf:
-            if not isinstance(polygons, gpd.GeoDataFrame):
-                try:
-                    polygons_gdf = convert_to_geodataframe(polygons)
-                except:
-                    raise TypeError(
-                        "polygons must be a GeoDataFrame or convertible to one"
-                    )
-            else:
-                polygons_gdf = polygons.copy()
-            polygons_gdf[area_column] = polygons_gdf.to_crs(
-                polygons_gdf.estimate_utm_crs()
-            ).geometry.area
         if value_columns is None:
             self.logger.warning(
-                "Using default polygon mapping implementation. Consider providing value_columns."
+                f"No value_columns specified. Defaulting to counting polygons with {predicate} predicate."
             )
-            value_columns = area_column
+            temp_value_col = "_temp_polygon_count_dummy"
+            polygons[temp_value_col] = 1
+            actual_value_columns = temp_value_col
+            aggregation = "count"  # Force count if no value columns
+        else:
+            actual_value_columns = value_columns
         result = aggregate_polygons_to_zones(
-            polygons=polygons_gdf,
+            polygons=polygons,
             zones=self.zone_gdf,
-            value_columns=value_columns,
+            value_columns=actual_value_columns,
             aggregation=aggregation,
-            area_weighted=area_weighted,
+            predicate=predicate,
             zone_id_column="zone_id",
         )
-        return result[value_columns].to_dict()
+        # Convert the result GeoDataFrame to the expected dictionary format
+        if isinstance(actual_value_columns, str):
+            return result.set_index("zone_id")[actual_value_columns].to_dict()
+        elif isinstance(actual_value_columns, list):
+            return {
+                col: result.set_index("zone_id")[col].to_dict()
+                for col in actual_value_columns
+            }
+        else:
+            raise ValueError("Unexpected type for actual_value_columns.")
     def map_rasters(
         self,
@@ -291,7 +405,7 @@ class ZonalViewGenerator(ABC, Generic[T]):
         Returns:
             Union[np.ndarray, Dict]: By default, returns a NumPy array of sampled values
-                with shape (n_zones, n_rasters), taking the first non-nodata value encountered.
+                with shape (n_zones, 1), taking the first non-nodata value encountered.
                 Custom mapping functions may return different data structures.
         Note:
@@ -301,10 +415,6 @@ class ZonalViewGenerator(ABC, Generic[T]):
         if mapping_function is not None:
             return mapping_function(self, tif_processors, **mapping_kwargs)
-        self.logger.warning(
-            "Using default raster mapping implementation. Consider creating a specialized mapping function."
-        )
         raster_crs = tif_processors[0].crs
         if raster_crs != self.zone_gdf.crs:
@@ -318,7 +428,9 @@ class ZonalViewGenerator(ABC, Generic[T]):
             tif_processors=tif_processors, polygon_list=zone_geoms, stat=stat
         )
-        return sampled_values
+        zone_ids = self.get_zone_identifiers()
+        return {zone_id: value for zone_id, value in zip(zone_ids, sampled_values)}
     @lru_cache(maxsize=32)
     def _get_transformed_geometries(self, target_crs):
@@ -337,34 +449,78 @@ class ZonalViewGenerator(ABC, Generic[T]):
     def save_view(
         self,
-        view_data: gpd.GeoDataFrame,
         name: str,
         output_format: Optional[str] = None,
     ) -> Path:
         """Save the generated zonal view to disk.
         Args:
-            view_data (gpd.GeoDataFrame): The zonal view data to save.
             name (str): Base name for the output file (without extension).
             output_format (str, optional): File format to save in (e.g., "parquet",
-                "geojson", "shp"). If None, uses the format specified in generator_config.
+                "geojson", "shp"). If None, uses the format specified in config.
         Returns:
             Path: The full path where the view was saved.
         Note:
-            The output directory is determined by the generator_config.base_path setting.
+            The output directory is determined by the config.base_path setting.
             The file extension is automatically added based on the output format.
+            This method now saves the internal `self.view`.
         """
+        if self._view is None:
+            self.logger.warning(
+                "No variables have been added to the zonal view. Saving the base zone_gdf."
+            )
+            view_to_save = self.zone_gdf
+        else:
+            view_to_save = self._view
         format_to_use = output_format or self.config.output_format
         output_path = self.config.base_path / f"{name}.{format_to_use}"
         self.logger.info(f"Saving zonal view to {output_path}")
+        if format_to_use in ["geojson", "shp", "gpkg"]:
+            self.logger.warning(
+                f"Saving to {format_to_use} requires converting back to GeoDataFrame. Geometry column will be re-added."
+            )
+            # Re-add geometry for saving to geospatial formats
+            view_to_save = self.view.merge(
+                self.zone_gdf[["zone_id", "geometry"]], on="zone_id", how="left"
+            )
         write_dataset(
-            df=view_data,
+            data=view_to_save,
             path=str(output_path),
             data_store=self.data_store,
-            format=format_to_use,
         )
         return output_path
+    def to_dataframe(self) -> pd.DataFrame:
+        """
+        Returns the current zonal view as a DataFrame.
+        This method combines all accumulated variables in the view
+        Returns:
+            pd.DataFrame: The current view.
+        """
+        return self.view
+    def to_geodataframe(self) -> gpd.GeoDataFrame:
+        """
+        Returns the current zonal view merged with zone geometries as a GeoDataFrame.
+        This method combines all accumulated variables in the view with the corresponding
+        zone geometries, providing a spatially-enabled DataFrame for further analysis or export.
+        Returns:
+            gpd.GeoDataFrame: The current view merged with zone geometries.
+        """
+        return gpd.GeoDataFrame(
+            (self.view).merge(
+                self.zone_gdf[["zone_id", "geometry"]], on="zone_id", how="left"
+            ),
+            crs=self.zone_gdf.crs,
+        )

giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl

giga-spatial 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl