PyPI - ssb-sgis - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

ssb-sgis 0.2.1.tar.gz → 0.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

{ssb_sgis-0.2.1 → ssb_sgis-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ssb-sgis
-Version: 0.2.1
+Version: 0.2.3
 Summary: GIS functions used at Statistics Norway.
 Home-page: https://github.com/statisticsnorway/ssb-sgis
 License: MIT

{ssb_sgis-0.2.1 → ssb_sgis-0.2.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-sgis"
-version = "0.2.1"
+version = "0.2.3"
 description = "GIS functions used at Statistics Norway."
 authors = ["Statistics Norway <ort@ssb.no>"]
 license = "MIT"

{ssb_sgis-0.2.1 → ssb_sgis-0.2.3}/src/sgis/__init__.py RENAMED Viewed

@@ -38,6 +38,9 @@ from .geopandas_tools.point_operations import snap_all, snap_within_distance
 from .geopandas_tools.polygon_operations import (
     close_all_holes,
     close_small_holes,
+    eliminate_by_largest,
+    eliminate_by_longest,
+    eliminate_by_smallest,
     get_overlapping_polygon_indices,
     get_overlapping_polygon_product,
     get_overlapping_polygons,

{ssb_sgis-0.2.1 → ssb_sgis-0.2.3}/src/sgis/geopandas_tools/polygon_operations.py RENAMED Viewed

@@ -15,13 +15,177 @@ from shapely import (
 )
 from shapely.ops import unary_union
-from .general import _push_geom_col
+from .general import _push_geom_col, to_lines
 from .neighbors import get_neighbor_indices
 from .overlay import clean_overlay
+def eliminate_by_longest(
+    gdf: GeoDataFrame,
+    min_area: int | float,
+    ignore_index: bool = False,
+    aggfunc: str | dict | list = "first",
+    **kwargs,
+) -> GeoDataFrame:
+    """Dissolves small polygons with the longest bordering neighbor polygon.
+    Eliminates small geometries by dissolving them with the neighboring
+    polygon with the longest shared border. The index and column values of the
+    large polygons will be kept, unless otherwise specified.
+    Args:
+        gdf: GeoDataFrame with polygon geometries.
+        min_area: Polygons with an area at or below this value are eliminated.
+        ignore_index: If False (default), the resulting GeoDataFrame will keep the
+            index of the large polygons. If True, the resulting axis will be labeled
+            0, 1, …, n - 1.
+        aggfunc: Aggregation function(s) to use when dissolving. Defaults to 'first',
+            meaning the column values of the large polygons are kept.
+        kwargs: Keyword arguments passed to the dissolve method.
+    Returns:
+        The GeoDataFrame with the small polygons dissolved into the large polygons.
+    """
+    if not ignore_index:
+        idx_mapper = {i: idx for i, idx in enumerate(gdf.index)}
+        idx_name = gdf.index.name
+    gdf = gdf.reset_index(drop=True)
+    small = gdf.loc[gdf.area <= min_area].assign(small_idx=lambda x: x.index)
+    large = gdf.loc[gdf.area > min_area].assign(large_idx=lambda x: x.index)
+    lines = to_lines(small[["small_idx", "geometry"]], large[["large_idx", "geometry"]])
+    lines = lines[lines["small_idx"].notna()]
+    lines["length__"] = lines.length
+    longest = lines.sort_values("length__", ascending=False).drop_duplicates(
+        "small_idx"
+    )
+    small_to_large = longest.set_index("small_idx")["large_idx"]
+    small["dissolve_idx"] = small["small_idx"].map(small_to_large)
+    large["dissolve_idx"] = large["large_idx"]
+    kwargs.pop("as_index", None)
+    eliminated = (
+        pd.concat([large, small])
+        .dissolve("dissolve_idx", aggfunc=aggfunc, **kwargs)
+        .drop(
+            ["length__", "small_idx", "large_idx"],
+            axis=1,
+            errors="ignore",
+        )
+    )
+    if ignore_index:
+        return eliminated.reset_index(drop=True)
+    eliminated.index = eliminated.index.map(idx_mapper)
+    eliminated.index.name = idx_name
+    return eliminated
+def eliminate_by_largest(
+    gdf: GeoDataFrame,
+    min_area: int | float,
+    ignore_index: bool = False,
+    aggfunc: str | dict | list = "first",
+    **kwargs,
+) -> GeoDataFrame:
+    """Dissolves small polygons with the largest neighbor polygon.
+    Eliminates small geometries by dissolving them with the neighboring
+    polygon with the largest area. The index and column values of the
+    large polygons will be kept, unless otherwise specified.
+    Args:
+        gdf: GeoDataFrame with polygon geometries.
+        min_area: Polygons with an area at or below this value are eliminated.
+        ignore_index: If False (default), the resulting GeoDataFrame will keep the
+            index of the large polygons. If True, the resulting axis will be labeled
+            0, 1, …, n - 1.
+        aggfunc: Aggregation function(s) to use when dissolving. Defaults to 'first',
+            meaning the column values of the large polygons are kept.
+        kwargs: Keyword arguments passed to the dissolve method.
+    Returns:
+        The GeoDataFrame with the small polygons dissolved into the large polygons.
+    """
+    return _eliminate_by_area(
+        gdf,
+        min_area=min_area,
+        ignore_index=ignore_index,
+        sort_ascending=False,
+        aggfunc=aggfunc,
+        **kwargs,
+    )
+def eliminate_by_smallest(
+    gdf: GeoDataFrame,
+    min_area: int | float,
+    ignore_index: bool = False,
+    aggfunc: str | dict | list = "first",
+    **kwargs,
+) -> GeoDataFrame:
+    return _eliminate_by_area(
+        gdf,
+        min_area=min_area,
+        ignore_index=ignore_index,
+        sort_ascending=True,
+        aggfunc=aggfunc,
+        **kwargs,
+    )
+def _eliminate_by_area(
+    gdf: GeoDataFrame,
+    min_area: int | float,
+    sort_ascending: bool,
+    ignore_index: bool = False,
+    aggfunc="first",
+    **kwargs,
+) -> GeoDataFrame:
+    if not ignore_index:
+        idx_mapper = {i: idx for i, idx in enumerate(gdf.index)}
+        idx_name = gdf.index.name
+    gdf = gdf.reset_index(drop=True)
+    small = gdf.loc[gdf.area <= min_area]
+    large = gdf.loc[gdf.area > min_area]
+    large["area__"] = large.area
+    joined = small.sjoin(
+        large[["area__", "geometry"]], predicate="touches"
+    ).sort_values("area__", ascending=sort_ascending)
+    largest = joined[~joined.index.duplicated()]
+    large = large.assign(index_right=lambda x: x.index)
+    kwargs.pop("as_index", None)
+    eliminated = (
+        pd.concat([large, largest])
+        .dissolve("index_right", aggfunc=aggfunc, **kwargs)
+        .drop(["area__"], axis=1, errors="ignore")
+    )
+    if ignore_index:
+        return eliminated.reset_index(drop=True)
+    eliminated.index = eliminated.index.map(idx_mapper)
+    eliminated.index.name = idx_name
+    return eliminated
 def get_polygon_clusters(
-    *gdfs: GeoDataFrame | GeoSeries, cluster_col: str = "cluster", explode: bool = True
+    *gdfs: GeoDataFrame | GeoSeries,
+    cluster_col: str = "cluster",
+    allow_multipart: bool = False,
 ) -> GeoDataFrame | tuple[GeoDataFrame]:
     """Find which polygons overlap without dissolving.
@@ -38,8 +202,8 @@ def get_polygon_clusters(
     Args:
         gdfs: One or more GeoDataFrames of polygons.
         cluster_col: Name of the resulting cluster column.
-        explode: Whether to explode the geometries to singlepart before the spatial
-            join. Defaults to True. Index will be preserved.
+        allow_multipart: Whether to allow multipart geometries in the gdfs.
+            Defaults to False to avoid confusing results.
     Returns:
         One or more GeoDataFrames (same amount as was given) with a new cluster column.
@@ -47,8 +211,7 @@ def get_polygon_clusters(
     Examples
     --------
-    Create polygon geometries where row 0, 1 and 2 overlap, 3 and 4 overlap
-    and 6 is on its own.
+    Create geometries with three clusters of overlapping polygons.
     >>> import sgis as sg
     >>> gdf = sg.to_gdf([(0, 0), (1, 1), (0, 1), (4, 4), (4, 3), (7, 7)])
@@ -62,7 +225,7 @@ def get_polygon_clusters(
     4  POLYGON ((5.00000 3.00000, 4.99951 2.96859, 4....
     5  POLYGON ((8.00000 7.00000, 7.99951 6.96859, 7....
-    This will add a cluster column to the GeoDataFrame:
+    Add a cluster column to the GeoDataFrame:
     >>> gdf = sg.get_polygon_clusters(gdf, cluster_col="cluster")
     >>> gdf
@@ -80,7 +243,7 @@ def get_polygon_clusters(
     >>> gdf2 = sg.to_gdf([(0, 0), (7, 7)])
     >>> gdf, gdf2 = sg.get_polygon_clusters(gdf, gdf2, cluster_col="cluster")
     >>> gdf2
-    cluster                 geometry
+       cluster                 geometry
     0        0  POINT (0.00000 0.00000)
     1        2  POINT (7.00000 7.00000)
     >>> gdf
@@ -101,28 +264,12 @@ def get_polygon_clusters(
     0        0  POLYGON ((0.99951 -0.03141, 0.99803 -0.06279, ...
     1        1  POLYGON ((4.99951 2.96859, 4.99803 2.93721, 4....
     2        2  POLYGON ((8.00000 7.00000, 7.99951 6.96859, 7....
-    Which is equivelen to this in straigt geopandas:
-    >>> dissolved2 = gdf.dissolve().explode(ignore_index=True).assign(cluster=lambda x: x.index)
-    >>> dissolved2
-       cluster                                           geometry
-    0        0  POLYGON ((0.99803 -0.06279, 0.99556 -0.09411, ...
-    1        1  POLYGON ((4.99803 2.93721, 4.99556 2.90589, 4....
-    2        2  POLYGON ((7.99556 6.90589, 7.99211 6.87467, 7....
-    Note that the order of the coordinates is different, and there is
-    some deviations in the rounding on microscopic levels.
-    >>> dissolved.area.sum()
-    15.016909720698278
-    >>> dissolved2.area.sum()
-    15.016909720698285
     """
     if isinstance(gdfs[-1], str):
         *gdfs, cluster_col = gdfs
     concated = pd.DataFrame()
+    orig_indices = ()
     for i, gdf in enumerate(gdfs):
         if isinstance(gdf, GeoSeries):
             gdf = gdf.to_frame()
@@ -130,11 +277,15 @@ def get_polygon_clusters(
         if not isinstance(gdf, GeoDataFrame):
             raise TypeError("'gdfs' should be one or more GeoDataFrames or GeoSeries.")
-        if explode:
-            gdf = gdf.explode(index_parts=False)
+        if not allow_multipart and len(gdf) != len(gdf.explode(index_parts=False)):
+            raise ValueError(
+                "All geometries should be exploded to singlepart "
+                "in order to get correct polygon clusters. "
+                "To allow multipart geometries, set allow_multipart=True"
+            )
-        gdf["orig_idx___"] = gdf.index
-        gdf["_i___"] = i
+        orig_indices = orig_indices + (gdf.index,)
+        gdf["i__"] = i
         concated = pd.concat([concated, gdf], ignore_index=True)
@@ -151,27 +302,28 @@ def get_polygon_clusters(
         for j in component
     }
-    concated[cluster_col] = concated.index.map(component_mapper)
-    concated.index = concated["orig_idx___"].values
+    concated[cluster_col] = component_mapper
     concated = _push_geom_col(concated)
-    _i___ = concated["_i___"].unique()
+    n_gdfs = concated["i__"].unique()
-    if len(_i___) == 1:
-        return concated.drop(["_i___", "orig_idx___"], axis=1)
+    if len(n_gdfs) == 1:
+        concated.index = orig_indices[0]
+        return concated.drop(["i__"], axis=1)
     unconcated = ()
-    for i in _i___:
-        gdf = concated[concated["_i___"] == i].drop(["_i___", "orig_idx___"], axis=1)
+    for i in n_gdfs:
+        gdf = concated[concated["i__"] == i]
+        gdf.index = orig_indices[i]
+        gdf = gdf.drop(["i__"], axis=1)
         unconcated = unconcated + (gdf,)
     return unconcated
 def get_overlapping_polygons(
-    gdf: GeoDataFrame | GeoSeries, ignore_index=False
+    gdf: GeoDataFrame | GeoSeries, ignore_index: bool = False
 ) -> GeoDataFrame | GeoSeries:
     """Find the areas that overlap.
@@ -210,14 +362,6 @@ def get_overlapping_polygons(
 def get_overlapping_polygon_indices(gdf: GeoDataFrame | GeoSeries) -> pd.Index:
-    """Get the index of the rows that contain overlapping geometries.
-    Args:
-        gdf: GeoDataFrame of polygons.
-    Returns:
-        A pandas Index with the overlapping polygon indices.
-    """
     if not gdf.index.is_unique:
         raise ValueError(
             "Index must be unique in order to correctly find "