ssb-sgis 1.1.17__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,10 @@ import numpy as np
 import pandas as pd
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
-from geopandas import __version__ as geopandas_version
 from shapely import Geometry
-from shapely import STRtree
 
 from .conversion import to_gdf
+from .runners import RTreeQueryRunner
 
 gdf_type_error_message = "'gdf' should be of type GeoDataFrame or GeoSeries."
 
@@ -18,6 +17,8 @@ def sfilter(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame:
     """Filter a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -33,6 +34,9 @@ def sfilter(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows matching the
@@ -80,7 +84,9 @@ def sfilter(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return gdf.iloc[indices]
 
@@ -90,6 +96,8 @@ def sfilter_split(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> tuple[GeoDataFrame, GeoDataFrame]:
     """Split a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -101,6 +109,9 @@ def sfilter_split(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A tuple of GeoDataFrames, one with the rows that match the spatial predicate
@@ -151,7 +162,9 @@ def sfilter_split(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return (
         gdf.iloc[indices],
@@ -164,6 +177,8 @@ def sfilter_inverse(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame | GeoSeries:
     """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
 
@@ -174,6 +189,9 @@ def sfilter_inverse(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows that do not match the
@@ -215,11 +233,10 @@ def sfilter_inverse(
     """
     if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
         raise TypeError(gdf_type_error_message)
-
     other = _sfilter_checks(other, crs=gdf.crs)
-
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
-
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
     return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
 
 
@@ -252,6 +269,8 @@ def _get_sfilter_indices(
     right: GeoDataFrame | GeoSeries | Geometry,
     predicate: str,
     distance: int | float | None,
+    n_jobs: int,
+    rtree_runner: RTreeQueryRunner | None,
 ) -> np.ndarray:
     """Compute geometric comparisons and get matching indices.
 
@@ -264,6 +283,9 @@ def _get_sfilter_indices(
     right : GeoDataFrame
     predicate : string
         Binary predicate to query.
+    n_jobs: Number of workers.
+    rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+        See the 'runners' module for example implementations.
 
     Returns:
     -------
@@ -273,6 +295,9 @@ def _get_sfilter_indices(
     """
     original_predicate = predicate
 
+    if rtree_runner is None:
+        rtree_runner = RTreeQueryRunner(n_jobs)
+
     with warnings.catch_warnings():
         # We don't need to show our own warning here
         # TODO remove this once the deprecation has been enforced
@@ -285,25 +310,16 @@ def _get_sfilter_indices(
             # contains is a faster predicate
             # see discussion at https://github.com/geopandas/geopandas/pull/1421
             predicate = "contains"
-            sindex, kwargs = _get_spatial_tree(left)
-            input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
+            arr1 = right.geometry.values
+            arr2 = left.geometry.values
         else:
             # all other predicates are symmetric
             # keep them the same
-            sindex, kwargs = _get_spatial_tree(right)
-            input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
+            arr1 = left.geometry.values
+            arr2 = right.geometry.values
 
-    l_idx, r_idx = sindex.query(
-        input_geoms, predicate=predicate, distance=distance, **kwargs
-    )
+    left, right = rtree_runner.run(arr1, arr2, predicate=predicate, distance=distance)
 
     if original_predicate == "within":
-        return np.sort(np.unique(r_idx))
-
-    return np.sort(np.unique(l_idx))
-
-
-def _get_spatial_tree(df):
-    if int(geopandas_version[0]) >= 1:
-        return df.sindex, {"sort": False}
-    return STRtree(df.geometry.values), {}
+        return np.sort(np.unique(right))
+    return np.sort(np.unique(left))
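A hypothetical usage sketch of the new keyword arguments (not part of the diff). The import paths, the top-level sfilter export and the RTreeQueryRunner constructor/run signatures are inferred from the hunks above and may differ from the released API:

# Hypothetical sketch: the new 'n_jobs' and 'rtree_runner' arguments to sfilter.
import geopandas as gpd
from shapely.geometry import Point

from sgis import sfilter  # assumed top-level export
from sgis.runners import RTreeQueryRunner  # assumed import path

points = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(5, 5)], crs=25833)
mask = gpd.GeoDataFrame(geometry=[Point(0, 0).buffer(1)], crs=25833)

# Default behaviour: _get_sfilter_indices builds RTreeQueryRunner(n_jobs) itself.
hits = sfilter(points, mask, predicate="intersects")

# Spread the rtree queries over several workers.
hits = sfilter(points, mask, n_jobs=4)


class LoggingRunner(RTreeQueryRunner):
    """Hooks into the query step; 'run' mirrors the call site in _get_sfilter_indices."""

    def run(self, arr1, arr2, **kwargs):
        print(f"rtree query: {len(arr1)} x {len(arr2)} geometries")
        return super().run(arr1, arr2, **kwargs)


# n_jobs passed positionally, as in _get_sfilter_indices.
hits = sfilter(points, mask, rtree_runner=LoggingRunner(1))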
@@ -0,0 +1,37 @@
+import numpy as np
+import pandas as pd
+from geopandas import GeoSeries
+from shapely import make_valid
+from shapely import union_all
+
+from .geometry_types import to_single_geom_type
+
+
+def _unary_union_for_notna(geoms, **kwargs):
+    try:
+        return make_valid(union_all(geoms, **kwargs))
+    except TypeError:
+        return make_valid(union_all([geom for geom in geoms.dropna().values], **kwargs))
+
+
+def make_valid_and_keep_geom_type(geoms: np.ndarray, geom_type: str) -> GeoSeries:
+    """Make GeometryCollections into (Multi)Polygons, (Multi)LineStrings or (Multi)Points.
+
+    Because GeometryCollections might appear after dissolving (union_all),
+    and this makes shapely difference/intersection fail.
+
+    Args:
+        geoms: Array of geometries.
+        geom_type: Geometry type to be kept.
+    """
+    geoms = GeoSeries(geoms)
+    geoms.index = range(len(geoms))
+    geoms.loc[:] = make_valid(geoms.to_numpy())
+    geoms_with_correct_type = geoms.explode(index_parts=False).pipe(
+        to_single_geom_type, geom_type
+    )
+    only_one = geoms_with_correct_type.groupby(level=0).transform("size") == 1
+    one_hit = geoms_with_correct_type[only_one]
+    many_hits = geoms_with_correct_type[~only_one].groupby(level=0).agg(union_all)
+    geoms_with_wrong_type = geoms.loc[~geoms.index.isin(geoms_with_correct_type.index)]
+    return pd.concat([one_hit, many_hits, geoms_with_wrong_type]).sort_index()
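To make the intent of the new helper concrete, here is a hypothetical illustration (not part of the diff). The new module's filename is not shown above, so the import of make_valid_and_keep_geom_type is omitted; shapely 2.x and the "polygon" spelling accepted by to_single_geom_type are assumptions:

# Hypothetical illustration: after union_all, row 0 ends up as a GeometryCollection
# mixing a polygon and a line. make_valid_and_keep_geom_type keeps only the polygon
# parts of such rows, while rows without any polygon part are returned unchanged.
import numpy as np
from shapely import GeometryCollection, LineString, Point, Polygon

geoms = np.array(
    [
        GeometryCollection(
            [Polygon([(0, 0), (1, 0), (1, 1)]), LineString([(0, 0), (2, 2)])]
        ),
        Point(0, 0),
    ],
    dtype=object,
)

cleaned = make_valid_and_keep_geom_type(geoms, geom_type="polygon")
# cleaned.iloc[0] is the Polygon part only; cleaned.iloc[1] is still the Point.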
sgis/helpers.py CHANGED
@@ -198,7 +198,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
 
 
 def return_two_vals(
-    vals: tuple[str, str] | list[str] | str | int | float
+    vals: tuple[str, str] | list[str] | str | int | float,
 ) -> tuple[str | int | float, str | int | float]:
     """Return a two-length tuple from a str/int/float or list/tuple of length 1 or 2.
 
@@ -779,14 +779,12 @@ def _read_partitioned_parquet(
     if all(isinstance(x, DataFrame) for x in results):
         return pd.concat(results)
     else:
-        geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
-        return _arrow_to_geopandas(
-            pyarrow.concat_tables(
-                results,
-                promote_options="permissive",
-            ),
-            geo_metadata,
+        results = pyarrow.concat_tables(
+            results,
+            promote_options="permissive",
         )
+        geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
+        return _arrow_to_geopandas(results, geo_metadata)
 
     # add columns to empty DataFrame
     first_path = next(iter(child_paths + [path]))
sgis/maps/map.py CHANGED
@@ -307,7 +307,9 @@ class Map:
         notna = array[array.notna()]
         isna = array[array.isna()]
 
-        unique_multiplied = (notna * self._multiplier).astype(np.int64)
+        unique_multiplied = (notna.astype(np.float64) * self._multiplier).astype(
+            np.int64
+        )
 
         return pd.concat([unique_multiplied, isna]).sort_index()
 
sgis/parallel/parallel.py CHANGED
@@ -75,13 +75,15 @@ def parallel_overlay(
     Returns:
         A GeoDataFrame containing the result of the overlay operation.
     """
+    if how != "intersection":
+        raise ValueError("parallel_overlay only supports how='intersection'.")
     return pd.concat(
         chunkwise(
             _clean_overlay_with_print,
             df1,
             kwargs={
                 "df2": df2,
-                # "to_print": to_print,
+                "to_print": to_print,
                 "how": how,
             }
             | kwargs,
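A brief hypothetical note on the new guard (df1 and df2 are placeholder GeoDataFrames, not from the diff):

# parallel_overlay now fails fast instead of silently mishandling other overlay types.
intersected = parallel_overlay(df1, df2, how="intersection")  # supported
unioned = parallel_overlay(df1, df2, how="union")             # raises ValueError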
@@ -672,7 +674,7 @@ class Parallel:
     def chunkwise(
         self,
         func: Callable,
-        iterable: Collection[Iterable[Any]],
+        *iterables: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
         max_rows_per_chunk: int | None = None,
@@ -682,8 +684,8 @@ class Parallel:
         Args:
             func: Function to run chunkwise. It should take
                 (a chunk of) the iterable as first argument.
-            iterable: Iterable to split in chunks and passed
-                as first argument to 'func'.
+            iterables: Iterable(s) to split into chunks and pass
+                as first argument(s) to 'func'. The iterables must have the same length.
             args: Positional arguments in 'func' after the DataFrame.
             kwargs: Additional keyword arguments in 'func'.
             max_rows_per_chunk: Alternatively decide number of chunks
@@ -691,7 +693,7 @@ class Parallel:
         """
         return chunkwise(
             func,
-            iterable,
+            *iterables,
             args=args,
             kwargs=kwargs,
             processes=self.processes,
@@ -1067,7 +1069,7 @@ def _fix_missing_muni_numbers(
 
 def chunkwise(
     func: Callable,
-    iterable: Collection[Iterable[Any]],
+    *iterables: Collection[Iterable[Any]],
     args: tuple | None = None,
     kwargs: dict | None = None,
     processes: int = 1,
@@ -1082,7 +1084,7 @@ def chunkwise(
     Args:
         func: The function to apply to each chunk. This function must accept a DataFrame as
             its first argument and return a DataFrame.
-        iterable: Iterable to be chunked and processed.
+        iterables: Iterable(s) to be chunked and processed. Must all have the same length.
         args: Additional positional arguments to pass to 'func'.
         kwargs: Keyword arguments to pass to 'func'.
         processes: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
@@ -1096,30 +1098,36 @@ def chunkwise(
     args = args or ()
     kwargs = kwargs or {}
 
+    if len({len(x) for x in iterables}) not in [0, 1]:
+        raise ValueError(
+            f"iterables must have same length. Got {', '.join(str(len(x)) for x in iterables)}"
+        )
+
     if max_rows_per_chunk is None:
         n_chunks: int = processes
     else:
-        n_chunks: int = len(iterable) // max_rows_per_chunk
-
+        n_chunks: int = len(next(iter(iterables))) // max_rows_per_chunk
     if n_chunks <= 1:
-        return [func(iterable, *args, **kwargs)]
+        return [func(*iterables, *args, **kwargs)]
 
-    chunks = np.array_split(np.arange(len(iterable)), n_chunks)
+    chunks = np.array_split(np.arange(len(next(iter(iterables)))), n_chunks)
 
-    if hasattr(iterable, "iloc"):
-        iterable_chunked: list[pd.DataFrame | pd.Series] = [
-            iterable.iloc[chunk] for chunk in chunks
-        ]
-    elif is_array_like(iterable):
-        iterable_chunked: list[np.ndarray] = [iterable[chunk] for chunk in chunks]
-    else:
-        to_type: type = iterable.__class__
-        iterable_chunked: list[Iterable] = [
-            to_type(chunk) for chunk in np.array_split(list(iterable), n_chunks)
-        ]
-    return Parallel(processes, backend=backend).map(
+    def get_chunk(iterable, chunk):
+        if hasattr(iterable, "iloc"):
+            return iterable.iloc[chunk]
+        elif is_array_like(iterable):
+            return iterable[chunk]
+        else:
+            to_type: type = iterable.__class__
+            return to_type([x for i, x in enumerate(iterable) if i in chunk])
+
+    iterables_chunked: list[list[Iterable[Any]]] = [
+        [get_chunk(iterable, chunk) for iterable in iterables] for chunk in chunks
+    ]
+
+    return Parallel(processes, backend=backend).starmap(
         func,
-        iterable_chunked,
+        iterables_chunked,
         args=args,
         kwargs=kwargs,
     )
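A hypothetical sketch (not part of the diff) of what the new *iterables signature allows. The import path and the example function are assumptions; chunkwise's docstring above documents the real contract:

# Hypothetical sketch: chunkwise now splits several equal-length iterables and
# passes one chunk of each as leading positional arguments to 'func' (via starmap).
import pandas as pd

from sgis.parallel.parallel import chunkwise  # assumed import path


def add_columns(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Called once per chunk pair; rows line up because both frames are
    # split with the same positional chunks.
    out = left.copy()
    out["b"] = right["b"].to_numpy()
    return out


df1 = pd.DataFrame({"a": range(100)})
df2 = pd.DataFrame({"b": range(100)})

results = chunkwise(add_columns, df1, df2, processes=4)
combined = pd.concat(results)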