PyPI - ssb-sgis - Versions diffs - 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

ssb-sgis 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

sgis/__init__.py +4 -0
sgis/conf.py +56 -4
sgis/geopandas_tools/buffer_dissolve_explode.py +24 -47
sgis/geopandas_tools/conversion.py +18 -25
sgis/geopandas_tools/duplicates.py +47 -60
sgis/geopandas_tools/general.py +8 -84
sgis/geopandas_tools/overlay.py +190 -260
sgis/geopandas_tools/polygon_operations.py +67 -88
sgis/geopandas_tools/runners.py +277 -0
sgis/geopandas_tools/sfilter.py +40 -24
sgis/geopandas_tools/utils.py +37 -0
sgis/helpers.py +1 -1
sgis/io/dapla_functions.py +5 -7
sgis/maps/map.py +3 -1
sgis/parallel/parallel.py +32 -24
sgis/raster/image_collection.py +184 -162
sgis/raster/indices.py +0 -1
{ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/METADATA +1 -1
{ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/RECORD +21 -19
{ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/LICENSE +0 -0
{ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/WHEEL +0 -0

sgis/geopandas_tools/polygon_operations.py CHANGED Viewed

@@ -26,17 +26,14 @@ from shapely.errors import GEOSException
 from shapely.geometry import LinearRing
 from shapely.ops import SplitOp
+from ..conf import config
 from ..debug_config import _DEBUG_CONFIG
 from ..debug_config import _try_debug_print
 from ..maps.maps import explore_locals
 from .conversion import to_gdf
 from .conversion import to_geoseries
 from .duplicates import _get_intersecting_geometries
-from .general import _grouped_unary_union
-from .general import _parallel_unary_union
-from .general import _parallel_unary_union_geoseries
 from .general import _push_geom_col
-from .general import _unary_union_for_notna
 from .general import clean_geoms
 from .general import extend_lines
 from .general import get_grouped_centroids
@@ -46,11 +43,13 @@ from .geometry_types import get_geom_type
 from .geometry_types import make_all_singlepart
 from .geometry_types import to_single_geom_type
 from .neighbors import get_neighbor_indices
-from .overlay import _try_difference
 from .overlay import clean_overlay
 from .polygons_as_rings import PolygonsAsRings
+from .runners import OverlayRunner
+from .runners import UnionRunner
 from .sfilter import sfilter
 from .sfilter import sfilter_inverse
+from .utils import _unary_union_for_notna
 PRECISION = 1e-3
 _BUFFER = False
@@ -232,6 +231,8 @@ def eliminate_by_longest(
     aggfunc: str | dict | list | None = None,
     grid_size=None,
     n_jobs: int = 1,
+    union_runner: UnionRunner | None = None,
+    overlay_runner: OverlayRunner | None = None,
     **kwargs,
 ) -> tuple[GeoDataFrame]:
     """Dissolves selected polygons with the longest bordering neighbor polygon.
@@ -259,6 +260,10 @@ def eliminate_by_longest(
             (if aggfunc="first").
         grid_size: Rounding of the coordinates. Defaults to None.
         n_jobs: Number of threads to use. Defaults to 1.
+        union_runner: Optionally debug/manipulate the spatial union operations.
+            See the 'runners' module for example implementations.
+        overlay_runner: Optionally debug/manipulate the spatial overlay operations.
+            See the 'runners' module for example implementations.
         **kwargs: Keyword arguments passed to the dissolve method.
     Returns:
@@ -350,6 +355,7 @@ def eliminate_by_longest(
         keep_geom_type=False,
         grid_size=grid_size,
         n_jobs=n_jobs,
+        overlay_runner=overlay_runner,
     ).loc[lambda x: x["_eliminate_idx"].notna()]
     borders["_length"] = borders.length
@@ -390,6 +396,8 @@ def eliminate_by_longest(
         fix_double,
         grid_size=grid_size,
         n_jobs=n_jobs,
+        union_runner=union_runner,
+        overlay_runner=overlay_runner,
         **kwargs,
     )
@@ -434,6 +442,8 @@ def eliminate_by_longest(
             ignore_index=ignore_index,
             aggfunc=aggfunc,
             grid_size=grid_size,
+            union_runner=union_runner,
+            overlay_runner=overlay_runner,
             n_jobs=n_jobs,
         )
@@ -494,6 +504,8 @@ def eliminate_by_largest(
     predicate: str = "intersects",
     grid_size=None,
     n_jobs: int = 1,
+    union_runner: UnionRunner | None = None,
+    overlay_runner: OverlayRunner | None = None,
     **kwargs,
 ) -> tuple[GeoDataFrame]:
     """Dissolves selected polygons with the largest neighbor polygon.
@@ -522,6 +534,10 @@ def eliminate_by_largest(
         predicate: Binary predicate passed to sjoin. Defaults to "intersects".
         grid_size: Rounding of the coordinates. Defaults to None.
         n_jobs: Number of threads to use. Defaults to 1.
+        union_runner: Optionally debug/manipulate the spatial union operations.
+            See the 'runners' module for example implementations.
+        overlay_runner: Optionally debug/manipulate the spatial overlay operations.
+            See the 'runners' module for example implementations.
         **kwargs: Keyword arguments passed to the dissolve method.
     Returns:
@@ -566,6 +582,8 @@ def eliminate_by_largest(
         fix_double=fix_double,
         grid_size=grid_size,
         n_jobs=n_jobs,
+        union_runner=union_runner,
+        overlay_runner=overlay_runner,
         **kwargs,
     )
@@ -581,6 +599,8 @@ def eliminate_by_smallest(
     fix_double: bool = True,
     grid_size=None,
     n_jobs: int = 1,
+    union_runner: UnionRunner | None = None,
+    overlay_runner: OverlayRunner | None = None,
     **kwargs,
 ) -> tuple[GeoDataFrame]:
     return _eliminate_by_area(
@@ -594,6 +614,8 @@ def eliminate_by_smallest(
         fix_double=fix_double,
         grid_size=grid_size,
         n_jobs=n_jobs,
+        union_runner=union_runner,
+        overlay_runner=overlay_runner,
         **kwargs,
     )
@@ -603,12 +625,14 @@ def _eliminate_by_area(
     to_eliminate: GeoDataFrame,
     max_distance: int | float | None,
     sort_ascending: bool,
-    ignore_index: bool = False,
-    aggfunc: str | dict | list | None = None,
-    predicate="intersects",
-    fix_double: bool = True,
-    grid_size=None,
-    n_jobs: int = 1,
+    ignore_index: bool,
+    aggfunc: str | dict | list | None,
+    predicate: str,
+    fix_double: bool,
+    grid_size,
+    n_jobs: int,
+    union_runner: UnionRunner,
+    overlay_runner: OverlayRunner,
     **kwargs,
 ) -> GeoDataFrame:
     _recurse = kwargs.pop("_recurse", False)
@@ -667,6 +691,8 @@ def _eliminate_by_area(
         fix_double=fix_double,
         grid_size=grid_size,
         n_jobs=n_jobs,
+        union_runner=union_runner,
+        overlay_runner=overlay_runner,
         **kwargs,
     )
@@ -717,18 +743,14 @@ def _eliminate_by_area(
             ignore_index=ignore_index,
             aggfunc=aggfunc,
             grid_size=grid_size,
+            union_runner=union_runner,
+            overlay_runner=overlay_runner,
             n_jobs=n_jobs,
         )
     if not was_multiple_gdfs:
         return out, isolated
-    for k, v in locals().items():
-        try:
-            print(k, v.columns)
-        except Exception:
-            pass
     gdfs = ()
     for i, cols in enumerate(original_cols):
         df = out.loc[out["_df_idx"] == i, cols]
@@ -738,11 +760,26 @@ def _eliminate_by_area(
 def _eliminate(
-    gdf, to_eliminate, aggfunc, crs, fix_double, grid_size, n_jobs, **kwargs
+    gdf,
+    to_eliminate,
+    aggfunc,
+    crs,
+    fix_double,
+    grid_size,
+    n_jobs,
+    overlay_runner,
+    union_runner,
+    **kwargs,
 ):
     if not len(to_eliminate):
         return gdf
+    if union_runner is None:
+        union_runner = config.get_instance("union_runner", n_jobs)
+    if overlay_runner is None:
+        overlay_runner = config.get_instance("overlay_runner", n_jobs)
     gdf["_range_idx_elim"] = range(len(gdf))
     in_to_eliminate = gdf["_dissolve_idx"].isin(to_eliminate["_dissolve_idx"])
@@ -798,16 +835,6 @@ def _eliminate(
         # all_geoms: pd.Series = gdf.set_index("_dissolve_idx").geometry
         all_geoms: pd.Series = gdf.geometry
-        # more_than_one = get_num_geometries(all_geoms.values) > 1
-        # all_geoms.loc[more_than_one] = all_geoms.loc[more_than_one].apply(
-        #     _unary_union_for_notna
-        # )
-        # more_than_one = get_num_geometries(to_be_eliminated.values) > 1
-        # to_be_eliminated.loc[more_than_one, "geometry"] = to_be_eliminated.loc[
-        #     more_than_one, "geometry"
-        # ].apply(_unary_union_for_notna)
         # create DataFrame of intersection pairs
         tree = STRtree(all_geoms.values)
         left, right = tree.query(
@@ -819,8 +846,6 @@ def _eliminate(
             dict(enumerate(to_be_eliminated.index))
         )
-        # pairs = pairs.loc[lambda x: x["right"] != x["_dissolve_idx"]]
         soon_erased = to_be_eliminated.iloc[pairs.index]
         intersecting = all_geoms.iloc[pairs["right"]]
@@ -829,61 +854,31 @@ def _eliminate(
         intersecting = intersecting[shoud_not_erase]
         missing = to_be_eliminated.loc[
-            # (~to_be_eliminated.index.isin(soon_erased.index))
-            # |
-            (~to_be_eliminated["_row_idx"].isin(soon_erased["_row_idx"])),
-            # | (~to_be_eliminated["_row_idx"].isin(soon_erased.index)),
-            "geometry",
+            (~to_be_eliminated["_row_idx"].isin(soon_erased["_row_idx"])), "geometry"
         ]
         # align and aggregate by dissolve index to not get duplicates in difference
         intersecting.index = soon_erased.index
-        soon_erased = _grouped_unary_union(soon_erased, level=0, grid_size=grid_size)
-        intersecting = _grouped_unary_union(intersecting, level=0, grid_size=grid_size)
+        soon_erased = union_runner.run(soon_erased, level=0, grid_size=grid_size)
+        intersecting = union_runner.run(intersecting, level=0, grid_size=grid_size)
         assert soon_erased.index.equals(soon_erased.index)
-        # soon_erased = soon_erased.geometry.groupby(level=0).agg(
-        #     lambda x: unary_union(x, grid_size=grid_size)
-        # )
-        # intersecting = intersecting.groupby(level=0).agg(
-        #     lambda x: unary_union(x, grid_size=grid_size)
-        # )
-        # explore_locals(center=_DEBUG_CONFIG["center"])
-        soon_erased.loc[:] = _try_difference(
+        soon_erased.loc[:] = overlay_runner.run(
+            difference,
             soon_erased.to_numpy(),
             intersecting.to_numpy(),
             grid_size=grid_size,
-            n_jobs=n_jobs,
             geom_type="polygon",
         )
-        missing = _grouped_unary_union(missing, level=0, grid_size=grid_size)
+        missing = union_runner.run(missing, level=0, grid_size=grid_size)
         missing = make_all_singlepart(missing).loc[lambda x: x.area > 0]
         soon_erased = make_all_singlepart(soon_erased).loc[lambda x: x.area > 0]
-        if 0:
-            tree = STRtree(soon_erased.values)
-            left, right = tree.query(missing.values, predicate="intersects")
-            explore_locals(
-                missing2=to_gdf(missing.to_numpy()[left], 25833),
-                soon_erased2=to_gdf(soon_erased.to_numpy()[right], 25833),
-                center=_DEBUG_CONFIG["center"],
-            )
-            missing = pd.Series(
-                difference(
-                    missing.to_numpy()[left],
-                    soon_erased.to_numpy()[right],
-                    grid_size=grid_size,
-                ),
-                index=left,
-            ).loc[lambda x: (x.notna()) & (~is_empty(x))]
         soon_eliminated = pd.concat([eliminators, soon_erased, missing])
         more_than_one = get_num_geometries(soon_eliminated.values) > 1
@@ -891,29 +886,13 @@ def _eliminate(
             _unary_union_for_notna
         )
-        if n_jobs > 1:
-            eliminated["geometry"] = GeoSeries(
-                _parallel_unary_union_geoseries(
-                    soon_eliminated,
-                    level=0,
-                    grid_size=grid_size,
-                    n_jobs=n_jobs,
-                ),
-                index=eliminated.index,
-            )
-        else:
-            eliminated["geometry"] = _grouped_unary_union(soon_eliminated, level=0)
-            # eliminated["geometry"] = soon_eliminated.groupby(level=0).agg(
-            #     lambda x: make_valid(unary_union(x))
-            # )
+        eliminated["geometry"] = union_runner.run(
+            soon_eliminated, level=0, grid_size=grid_size
+        )
     else:
-        if n_jobs > 1:
-            eliminated["geometry"] = _parallel_unary_union(
-                many_hits, by="_dissolve_idx", grid_size=grid_size, n_jobs=n_jobs
-            )
-        else:
-            eliminated["geometry"] = _grouped_unary_union(many_hits, by="_dissolve_idx")
+        eliminated["geometry"] = union_runner.run(
+            many_hits, by="_dissolve_idx", grid_size=grid_size, n_jobs=n_jobs
+        )
     # setting crs on the GeometryArrays to avoid warning in concat
     not_to_dissolve.geometry.values.crs = crs

sgis/geopandas_tools/runners.py ADDED Viewed

@@ -0,0 +1,277 @@
+import functools
+from abc import ABC
+from abc import abstractmethod
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
+import joblib
+import numpy as np
+import pandas as pd
+from geopandas import GeoDataFrame
+from geopandas import GeoSeries
+from shapely import STRtree
+from shapely import get_parts
+from shapely import make_valid
+from shapely import union_all
+from shapely.errors import GEOSException
+from .utils import _unary_union_for_notna
+from .utils import make_valid_and_keep_geom_type
+@dataclass
+class AbstractRunner(ABC):
+    """Blueprint for 'runner' classes.
+    Subclasses must implement a 'run' method.
+    Args:
+        n_jobs: Number of workers.
+        backend: Backend for the workers.
+    """
+    n_jobs: int
+    backend: str | None = None
+    @abstractmethod
+    def run(self, *args, **kwargs) -> Any:
+        """Abstract run method."""
+@dataclass
+class UnionRunner(AbstractRunner):
+    """Run shapely.union_all with pandas.groupby.
+    Subclasses must implement a 'run' method that takes the arguments
+    'df' (GeoDataFrame or GeoSeries), 'by' (optional column to group by), 'grid_size'
+    (passed to shapely.union_all) and **kwargs passed to pandas.DataFrame.groupby.
+    Defaults to None, meaning the default runner with number of workers set
+    to 'n_jobs'.
+    Args:
+        n_jobs: Number of workers.
+        backend: Backend for the workers.
+    """
+    n_jobs: int
+    backend: str | None = None
+    def run(
+        self,
+        df: GeoDataFrame | GeoSeries | pd.DataFrame | pd.Series,
+        by: str | list[str] | None = None,
+        grid_size: float | int | None = None,
+        **kwargs,
+    ) -> GeoSeries | GeoDataFrame:
+        """Run groupby on geometries in parallel (if n_jobs > 1)."""
+        # assume geometry column is 'geometry' if input is pandas.Series or pandas.DataFrame
+        try:
+            geom_col = df.geometry.name
+        except AttributeError:
+            try:
+                geom_col = df.name
+                if geom_col is None:
+                    geom_col = "geometry"
+            except AttributeError:
+                geom_col = "geometry"
+        try:
+            crs = df.crs
+        except AttributeError:
+            crs = None
+        unary_union_for_grid_size = functools.partial(
+            _unary_union_for_notna, grid_size=grid_size
+        )
+        as_index = kwargs.pop("as_index", True)
+        if by is None and "level" not in kwargs:
+            by = np.zeros(len(df), dtype="int64")
+        try:
+            # DataFrame
+            groupby_obj = df.groupby(by, **kwargs)[geom_col]
+        except KeyError:
+            # Series
+            groupby_obj = df.groupby(by, **kwargs)
+        if self.n_jobs is None or self.n_jobs == 1:
+            results = groupby_obj.agg(unary_union_for_grid_size)
+            index = results.index
+        else:
+            backend = self.backend or "loky"
+            with joblib.Parallel(n_jobs=self.n_jobs, backend=backend) as parallel:
+                results = parallel(
+                    joblib.delayed(unary_union_for_grid_size)(geoms)
+                    for _, geoms in groupby_obj
+                )
+            index = groupby_obj.size().index
+        agged = GeoSeries(results, index=index, name=geom_col, crs=crs)
+        if not as_index:
+            return agged.reset_index()
+        return agged
+def _strtree_query(arr1, arr2, **kwargs):
+    tree = STRtree(arr2)
+    return tree.query(arr1, **kwargs)
+@dataclass
+class RTreeQueryRunner(AbstractRunner):
+    """Run shapely.STRtree chunkwise.
+    Subclasses must implement a 'query' method that takes a numpy.ndarray
+    of geometries as 0th and 1st argument and **kwargs passed to the query method,
+    chiefly 'predicate' and 'distance'. The 'query' method should return a tuple
+    of two arrays representing the spatial index pairs of the left and right input arrays.
+    Defaults to None, meaning the default runner with number of workers set
+    to 'n_jobs'.
+    Args:
+        n_jobs: Number of workers.
+        backend: Backend for the workers.
+    """
+    n_jobs: int
+    backend: str = "loky"
+    def run(
+        self, arr1: np.ndarray, arr2: np.ndarray, **kwargs
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Run a spatial rtree query and return indices of hits from arr1 and arr2 in a tuple of two arrays."""
+        # if (
+        #     self.n_jobs > 1
+        #     and len(arr1) / self.n_jobs > 1000
+        #     # and len(arr1) / len(arr2) > 3
+        # ):
+        #     chunks = np.array_split(np.arange(len(arr1)), self.n_jobs)
+        #     assert sum(len(x) for x in chunks) == len(arr1)
+        #     with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+        #         results = parallel(
+        #             joblib.delayed(_strtree_query)(arr1[chunk], arr2, **kwargs)
+        #             for chunk in chunks
+        #         )
+        #     left = np.concatenate([x[0] for x in results])
+        #     right = np.concatenate([x[1] for x in results])
+        #     return left, right
+        # elif (
+        #     self.n_jobs > 1
+        #     and len(arr2) / self.n_jobs > 1000
+        #     and len(arr2) / len(arr1) > 3
+        # ):
+        #     chunks = np.array_split(np.arange(len(arr2)), self.n_jobs)
+        #     with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+        #         results = parallel(
+        #             joblib.delayed(_strtree_query)(arr1, arr2[chunk], **kwargs)
+        #             for chunk in chunks
+        #         )
+        #     left = np.concatenate([x[0] for x in results])
+        #     right = np.concatenate([x[1] for x in results])
+        #     return left, right
+        return _strtree_query(arr1, arr2, **kwargs)
+@dataclass
+class OverlayRunner(AbstractRunner):
+    """Run a vectorized shapely overlay operation on two equal-length numpy arrays.
+    Subclasses must implement a 'run' method that takes an overlay function (shapely.intersection, shapely.difference etc.)
+    as 0th argument and two numpy.ndarrays of same length as 1st and 2nd argument.
+    The 'run' method should also take the argument 'grid_size' to be passed to the overlay function
+    and the argument 'geom_type' which is used to keep only relevant geometries (polygon, line or point)
+    in cases of GEOSExceptions caused by geometry type mismatch.
+    Defaults to an instance of OverlayRunner, which is run sequentially (no n_jobs)
+    because the vectorized shapely functions are usually faster than any attempt to parallelize.
+    """
+    n_jobs: None = None
+    backend: None = None
+    @staticmethod
+    def run(
+        func: Callable,
+        arr1: np.ndarray,
+        arr2: np.ndarray,
+        grid_size: int | float | None,
+        geom_type: str | None,
+    ) -> np.ndarray:
+        """Run the overlay operation (func) with fallback.
+        First tries to run func, then, if GEOSException, geometries are made valid
+        and only geometries with correct geom_type (point, line, polygon) are kept
+        in GeometryCollections.
+        """
+        try:
+            return func(arr1, arr2, grid_size=grid_size)
+        except GEOSException:
+            arr1 = make_valid_and_keep_geom_type(arr1, geom_type=geom_type)
+            arr2 = make_valid_and_keep_geom_type(arr2, geom_type=geom_type)
+            arr1 = arr1.loc[lambda x: x.index.isin(arr2.index)].to_numpy()
+            arr2 = arr2.loc[lambda x: x.index.isin(arr1.index)].to_numpy()
+            return func(arr1, arr2, grid_size=grid_size)
+@dataclass
+class GridSizeOverlayRunner(OverlayRunner):
+    """Run a shapely overlay operation rowwise for different grid_sizes until success."""
+    n_jobs: int
+    backend: str | None
+    grid_sizes: list[float] | None = None
+    def __post_init__(self) -> None:
+        """Check that grid_sizes is passed."""
+        if self.grid_sizes is None:
+            raise ValueError(
+                f"must set 'grid_sizes' in the {self.__class__.__name__} initialiser."
+            )
+    def run(
+        self,
+        func: Callable,
+        arr1: np.ndarray,
+        arr2: np.ndarray,
+        grid_size: int | float | None = None,
+        geom_type: str | None = None,
+    ) -> np.ndarray:
+        """Run the overlay operation rowwise with fallback.
+        The overlay operation (func) is looped for each row in arr1 and arr2
+        as 0th and 1st argument to 'func' and 'grid_size' as keyword argument. If a GEOSException is thrown,
+        geometries are made valid and GeometryCollections are forced to either
+        (Multi)Point, (Multi)Polygon or (Multi)LineString, depending on the value in "geom_type".
+        Then, if another GEOSException is thrown, the overlay operation is looped for the grid_sizes given
+        in the instance's 'grid_sizes' attribute.
+        """
+        kwargs = dict(
+            grid_size=grid_size, geom_type=geom_type.lower(), grid_sizes=self.grid_sizes
+        )
+        with joblib.Parallel(self.n_jobs, backend="threading") as parallel:
+            return parallel(
+                joblib.delayed(_run_overlay_rowwise)(func, g1, g2, **kwargs)
+                for g1, g2 in zip(arr1, arr2, strict=True)
+            )
+def _run_overlay_rowwise(func, geom1, geom2, grid_size, geom_type, grid_sizes):
+    try:
+        return func(geom1, geom2, grid_size=grid_size)
+    except GEOSException:
+        pass
+    geom1 = get_parts(make_valid(geom1))
+    geom2 = get_parts(make_valid(geom2))
+    geom1 = union_all([g for g in geom1 if pd.notna(g) and geom_type in g.geom_type])
+    geom2 = union_all([g for g in geom2 if pd.notna(g) and geom_type in g.geom_type])
+    try:
+        return func(geom1, geom2)
+    except GEOSException:
+        pass
+    for i, grid_size in enumerate(grid_sizes):
+        try:
+            return func(geom1, geom2, grid_size=grid_size)
+        except GEOSException as e:
+            if i == len(grid_sizes) - 1:
+                raise e

ssb-sgis 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl

ssb-sgis 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl