ssb-sgis 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sgis/__init__.py +107 -121
  2. sgis/exceptions.py +5 -3
  3. sgis/geopandas_tools/__init__.py +1 -0
  4. sgis/geopandas_tools/bounds.py +86 -47
  5. sgis/geopandas_tools/buffer_dissolve_explode.py +62 -39
  6. sgis/geopandas_tools/centerlines.py +53 -44
  7. sgis/geopandas_tools/cleaning.py +87 -104
  8. sgis/geopandas_tools/conversion.py +164 -107
  9. sgis/geopandas_tools/duplicates.py +33 -19
  10. sgis/geopandas_tools/general.py +84 -52
  11. sgis/geopandas_tools/geometry_types.py +24 -10
  12. sgis/geopandas_tools/neighbors.py +23 -11
  13. sgis/geopandas_tools/overlay.py +136 -53
  14. sgis/geopandas_tools/point_operations.py +11 -10
  15. sgis/geopandas_tools/polygon_operations.py +53 -61
  16. sgis/geopandas_tools/polygons_as_rings.py +121 -78
  17. sgis/geopandas_tools/sfilter.py +17 -17
  18. sgis/helpers.py +116 -58
  19. sgis/io/dapla_functions.py +32 -23
  20. sgis/io/opener.py +13 -6
  21. sgis/io/read_parquet.py +2 -2
  22. sgis/maps/examine.py +55 -28
  23. sgis/maps/explore.py +471 -112
  24. sgis/maps/httpserver.py +12 -12
  25. sgis/maps/legend.py +285 -134
  26. sgis/maps/map.py +248 -129
  27. sgis/maps/maps.py +123 -119
  28. sgis/maps/thematicmap.py +260 -94
  29. sgis/maps/tilesources.py +3 -8
  30. sgis/networkanalysis/_get_route.py +5 -4
  31. sgis/networkanalysis/_od_cost_matrix.py +44 -1
  32. sgis/networkanalysis/_points.py +10 -4
  33. sgis/networkanalysis/_service_area.py +5 -2
  34. sgis/networkanalysis/closing_network_holes.py +22 -64
  35. sgis/networkanalysis/cutting_lines.py +58 -46
  36. sgis/networkanalysis/directednetwork.py +16 -8
  37. sgis/networkanalysis/finding_isolated_networks.py +6 -5
  38. sgis/networkanalysis/network.py +15 -13
  39. sgis/networkanalysis/networkanalysis.py +79 -61
  40. sgis/networkanalysis/networkanalysisrules.py +21 -17
  41. sgis/networkanalysis/nodes.py +2 -3
  42. sgis/networkanalysis/traveling_salesman.py +6 -3
  43. sgis/parallel/parallel.py +372 -142
  44. sgis/raster/base.py +9 -3
  45. sgis/raster/cube.py +331 -213
  46. sgis/raster/cubebase.py +15 -29
  47. sgis/raster/image_collection.py +2560 -0
  48. sgis/raster/indices.py +17 -12
  49. sgis/raster/raster.py +356 -275
  50. sgis/raster/sentinel_config.py +104 -0
  51. sgis/raster/zonal.py +38 -14
  52. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/LICENSE +1 -1
  53. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/METADATA +87 -16
  54. ssb_sgis-1.0.3.dist-info/RECORD +61 -0
  55. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/WHEEL +1 -1
  56. sgis/raster/bands.py +0 -48
  57. sgis/raster/gradient.py +0 -78
  58. sgis/raster/methods_as_functions.py +0 -124
  59. sgis/raster/torchgeo.py +0 -150
  60. ssb_sgis-1.0.1.dist-info/RECORD +0 -63
sgis/parallel/parallel.py CHANGED
@@ -2,12 +2,14 @@ import functools
 import inspect
 import itertools
 import multiprocessing
+import pickle
 import warnings
-from collections.abc import Callable, Collection, Iterable
+from collections.abc import Callable
+from collections.abc import Collection
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any

-
 try:
     import dapla as dp
 except ImportError:
@@ -16,18 +18,19 @@ except ImportError:
 import joblib
 import numpy as np
 import pandas as pd
-from geopandas import GeoDataFrame, GeoSeries
+from geopandas import GeoDataFrame
 from pandas import DataFrame
-from shapely.geometry import MultiPolygon, Polygon
+from pandas import Series

-from ..geopandas_tools.general import clean_clip, clean_geoms
 from ..geopandas_tools.neighbors import get_neighbor_indices
 from ..geopandas_tools.overlay import clean_overlay
-from ..helpers import LocalFunctionError, dict_zip, dict_zip_union, in_jupyter
-
+from ..helpers import LocalFunctionError
+from ..helpers import dict_zip_union
+from ..helpers import in_jupyter

 try:
-    from ..io.dapla_functions import exists, read_geopandas, write_geopandas
+    from ..io.dapla_functions import read_geopandas
+    from ..io.dapla_functions import write_geopandas

     # from ..io.write_municipality_data import write_municipality_data
 except ImportError:
@@ -35,16 +38,13 @@ except ImportError:


 try:
-    from dapla import read_pandas, write_pandas
+    from dapla import read_pandas
+    from dapla import write_pandas
+    from dapla.gcs import GCSFileSystem
 except ImportError:
-    pass
-

-def turn_args_into_kwargs(func: Callable, args: tuple, index_start: int):
-    if not isinstance(args, tuple):
-        raise TypeError("args should be a tuple (it should not be unpacked with *)")
-    argnames = inspect.getfullargspec(func).args[index_start:]
-    return {name: value for value, name in zip(args, argnames, strict=False)}
+    class GCSFileSystem:
+        """Placeholder."""


 class Parallel:
@@ -87,10 +87,23 @@ class Parallel:
         backend: str = "multiprocessing",
         context: str = "spawn",
         maxtasksperchild: int = 10,
+        chunksize: int = 1,
         **kwargs,
-    ):
+    ) -> None:
+        """Initialize a Parallel instance with specified settings for parallel execution.
+
+        Args:
+            processes: Number of parallel processes. Set to 1 to run without parallelization.
+            backend: The backend to use for parallel execution. Defaults to 'multiprocessing'.
+            context: The context setting for multiprocessing. Defaults to 'spawn'.
+            maxtasksperchild: The maximum number of tasks a worker process can complete
+                before it is replaced. Defaults to 10.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
+            **kwargs: Additional keyword arguments passed to the underlying parallel execution backend.
+        """
         self.processes = int(processes)
         self.maxtasksperchild = maxtasksperchild
+        self.chunksize = chunksize
         self.backend = backend
         self.context = context
         self.kwargs = kwargs
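A minimal usage sketch of the changed constructor, where chunksize is now configured on the instance instead of per map()/starmap() call (the function and data below are illustrative, and Parallel is assumed to be importable from the package root):

    import sgis as sg

    def double(x):
        return x * 2

    # chunksize is now an attribute of the instance, reused by map() and starmap()
    p = sg.Parallel(processes=3, backend="multiprocessing", context="spawn", chunksize=1)
    results = p.map(double, [1, 2, 3])
    print(results)  # [2, 4, 6]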
@@ -103,7 +116,6 @@ class Parallel:
         iterable: Collection,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel with items of an iterable as 0th arguemnt.

@@ -111,7 +123,7 @@
             func: Function to be run.
             iterable: An iterable where each item will be passed to func as
                 0th positional argument.
-            Args: Positional arguments passed to 'func' starting from the 1st argument.
+            args: Positional arguments passed to 'func' starting from the 1st argument.
                 The 0th argument will be reserved for the values of 'iterable'.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -120,8 +132,8 @@
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
-        --------
+        Examples:
+        ---------
         Multiply each list element by 2.

         >>> iterable = [1, 2, 3]
@@ -159,21 +171,20 @@
         ... print(results)
         [2, 4, 6]
         """
-
         if args:
             # start at index 1, meaning the 0th argument (the iterable) is still available
-            args_as_kwargs = turn_args_into_kwargs(func, args, index_start=1)
+            args_as_kwargs = _turn_args_into_kwargs(func, args, index_start=1)
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

         func_with_kwargs = functools.partial(func, **kwargs)

         if self.processes == 1:
-            return list(map(func_with_kwargs, iterable))
+            return [func_with_kwargs(item) for item in iterable]

         iterable = list(iterable)

@@ -182,21 +193,42 @@

         if not processes:
             return []
+        elif processes == 1:
+            return [func_with_kwargs(item) for item in iterable]

-        if self.backend == "multiprocessing":
-            with multiprocessing.get_context(self.context).Pool(
-                processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
-            ) as pool:
-                try:
-                    return pool.map(func_with_kwargs, iterable, chunksize=chunksize)
-                except Exception as e:
-                    pool.terminate()
-                    raise e
+        try:
+            if self.backend == "multiprocessing":
+                with multiprocessing.get_context(self.context).Pool(
+                    processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
+                ) as pool:
+                    try:
+                        return pool.map(
+                            func_with_kwargs, iterable, chunksize=self.chunksize
+                        )
+                    except Exception as e:
+                        pool.terminate()
+                        raise e

-        with joblib.Parallel(
-            n_jobs=processes, backend=self.backend, **self.kwargs
-        ) as parallel:
-            return parallel(joblib.delayed(func)(item, **kwargs) for item in iterable)
+            with joblib.Parallel(
+                n_jobs=processes, backend=self.backend, **self.kwargs
+            ) as parallel:
+                return parallel(
+                    joblib.delayed(func)(item, **kwargs) for item in iterable
+                )
+        except pickle.PickleError as e:
+            unpicklable = []
+            for k, v in locals().items():
+                try:
+                    pickle.dumps(v)
+                except pickle.PickleError:
+                    unpicklable.append(k)
+                except TypeError:
+                    pass
+            if unpicklable:
+                raise pickle.PickleError(
+                    f"Cannot unpickle objects: {unpicklable}"
+                ) from e
+            raise e

     def starmap(
         self,
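The new outer try/except exists because the 'spawn' multiprocessing context pickles the function and its arguments before sending them to worker processes; when that fails, the added handler lists which local objects could not be pickled. A small sketch of the constraint (names are illustrative):

    import sgis as sg

    def add_one(x):  # a module-level function pickles fine
        return x + 1

    p = sg.Parallel(processes=2)
    p.map(add_one, [1, 2, 3])  # -> [2, 3, 4]

    # A lambda or locally defined closure typically cannot be pickled under the
    # 'spawn' context; this is the kind of failure the new PickleError handler
    # is meant to explain instead of only re-raising.
    # p.map(lambda x: x + 1, [1, 2, 3])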
@@ -204,7 +236,6 @@
         iterable: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel where items of the iterable are unpacked.

@@ -215,7 +246,7 @@
             func: Function to be run.
             iterable: An iterable of iterables, where each item will be
                 unpacked as positional argument to the function.
-            Args: Positional arguments passed to 'func' starting at argument position
+            args: Positional arguments passed to 'func' starting at argument position
                 n + 1, where n is the length of the iterables inside the iterable.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -224,8 +255,8 @@
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
-        --------
+        Examples:
+        ---------
         Multiply each list element by 2.

         >>> iterable = [(1, 2), (2, 3), (3, 4)]
@@ -262,13 +293,13 @@
         if args:
             # starting the count at the length of the iterables inside the iterables
             iterable = list(iterable)
-            args_as_kwargs = turn_args_into_kwargs(
+            args_as_kwargs = _turn_args_into_kwargs(
                 func, args, index_start=len(iterable[0])
             )
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

@@ -290,7 +321,9 @@
                 processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
             ) as pool:
                 try:
-                    return pool.starmap(func_with_kwargs, iterable, chunksize=chunksize)
+                    return pool.starmap(
+                        func_with_kwargs, iterable, chunksize=self.chunksize
+                    )
                 except Exception as e:
                     pool.terminate()
                     raise e
@@ -320,10 +353,10 @@
         Returns:
             A DataFrame, or a list of DataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-
-        res = self.map(dp.read_pandas, files, kwargs=kwargs)
+        if strict:
+            res = self.map(read_pandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_pandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -342,14 +375,19 @@
             concat: Whether to concat the results to a GeoDataFrame.
             ignore_index: Defaults to True.
             strict: If True (default), all files must exist.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
             **kwargs: Keyword arguments passed to sgis.read_geopandas.

         Returns:
             A GeoDataFrame, or a list of GeoDataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-        res = self.map(read_geopandas, files, kwargs=kwargs)
+        if "file_system" not in kwargs:
+            kwargs["file_system"] = dp.FileClient.get_gcs_file_system()
+
+        if strict:
+            res = self.map(read_geopandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_geopandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -367,10 +405,14 @@
         clip: bool = True,
         max_rows_per_chunk: int = 150_000,
         processes_in_clip: int = 1,
-    ):
+        verbose: bool = True,
+    ) -> None:
         """Split multiple datasets into municipalities and write as separate files.

         The files will be named as the municipality number.
+        Each dataset in 'in_data' is intersected with 'municipalities'
+        in parallel. The intersections themselves can also be run in parallel
+        with the 'processes_in_clip' argument.

         Args:
             in_data: Dictionary with dataset names as keys and file paths or
@@ -397,7 +439,12 @@
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-            clip: If True (default), the data will be clipped.
+            clip: If True (default), the data will be clipped. If False, the data will
+                be spatial joined.
+            max_rows_per_chunk: Number of rows per data chunk for processing.
+            processes_in_clip: Number of parallel processes for data clipping.
+            verbose: Whether to print during execution.
+
         """
         shared_kwds = {
             "municipalities": municipalities,
@@ -409,6 +456,7 @@
             "max_rows_per_chunk": max_rows_per_chunk,
             "processes_in_clip": processes_in_clip,
             "strict": strict,
+            "verbose": verbose,
         }

         if isinstance(out_data, (str, Path)):
@@ -417,10 +465,12 @@
         if funcdict is None:
             funcdict = {}

-        zip_func = dict_zip if strict else dict_zip_union
+        fs = dp.FileClient.get_gcs_file_system()

-        for _, data, folder, postfunc in zip_func(in_data, out_data, funcdict):
-            if data is None:
+        for _, data, folder, postfunc in dict_zip_union(in_data, out_data, funcdict):
+            if data is None or (
+                not strict and isinstance(data, (str | Path)) and not fs.exists(data)
+            ):
                 continue

             kwds = shared_kwds | {
@@ -439,15 +489,33 @@
         df: GeoDataFrame,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        max_rows_per_chunk: int = 150_000,
-        n_chunks: int = None,
-        concat: bool = False,
+        n_chunks: int | None = None,
+        max_rows_per_chunk: int | None = None,
+        concat: bool = True,
     ) -> GeoDataFrame:
-        if len(df) < max_rows_per_chunk:
-            return func(df, *args, **kwargs)
+        """Run a function in parallel on chunks of a (Geo)DataFrame.

-        if n_chunks is None:
-            n_chunks = len(df) // max_rows_per_chunk
+        Args:
+            func: Function to run chunkwise. It should take
+                a (Geo)DataFrame as first argument.
+            df: (Geo)DataFrame to split in n_chunks and passed
+                as first argument to 'func'.
+            args: Positional arguments in 'func' after the DataFrame.
+            kwargs: Additional keyword arguments in 'func'.
+            n_chunks: Optionally set number of chunks to split
+                'df' into. Defaults to the 'processes' attribute
+                of the Parallel instance.
+            max_rows_per_chunk: Alternatively decide number of chunks
+                by a maximum number of rows per chunk.
+            concat: Whether to use pd.concat on the results.
+                Defaults to True.
+        """
+        if max_rows_per_chunk is None and n_chunks is None:
+            n_chunks: int = self.processes
+        elif n_chunks is None:
+            n_chunks: int = len(df) // max_rows_per_chunk
+        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
+            return func(df, *args, **kwargs)

         chunks = np.array_split(np.arange(len(df)), n_chunks)

@@ -464,7 +532,7 @@
         else:
             return out

-    def validate_execution(self, func):
+    def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.

         Raising Exception to avoid confusion.
@@ -478,8 +546,8 @@
             raise LocalFunctionError(func)

     @staticmethod
-    def _validate_kwargs(kwargs) -> dict:
-        """Make sure kwargs is a dict (not ** unpacked or None)"""
+    def _validate_kwargs(kwargs: dict) -> dict:
+        """Make sure kwargs is a dict (not ** unpacked or None)."""
         if kwargs is None:
             kwargs = {}
         elif not isinstance(kwargs, dict):
@@ -487,7 +555,7 @@
         return kwargs

     def _execute(self) -> list[Any]:
-        [self.validate_execution(func) for func in self.funcs]
+        [self._validate_execution(func) for func in self.funcs]

         if self.processes == 1:
             return [func() for func in self.funcs]
@@ -513,7 +581,8 @@
             results = [pool.apply_async(func) for func in self.funcs]
             return [result.get() for result in results]

-    def __repr__(self):
+    def __repr__(self) -> str:
+        """String representation."""
         return (
             f"{self.__class__.__name__}(processes={self.processes}, "
             f"backend='{self.backend}', context='{self.context}')"
@@ -523,7 +592,7 @@
 def write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     with_neighbors: bool = False,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
@@ -533,13 +602,39 @@ def write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
+    """Splits and writes data into municipality-specific files.
+
+    Args:
+        data: Path to the data file or a GeoDataFrame.
+        out_folder: Path to the output directory where the municipality data
+            is written.
+        municipalities: Either a sequence of municipality numbers or a GeoDataFrame
+            of municipality polygons and municipality numbers in the column 'muni_number_col'.
+            Defaults to None.
+        with_neighbors: If True, include data from neighboring municipalities
+            for each municipality.
+        muni_number_col: Column name for municipality codes in 'municipalities'.
+        file_type: Format of the output file.
+        func: Function to process data before writing.
+        write_empty: If True, write empty files for municipalities without data.
+        clip: If True, clip the data to municipality boundaries. If False
+            the data is spatial joined.
+        max_rows_per_chunk: Maximum number of rows in each processed chunk.
+        processes_in_clip: Number of processes to use for clipping.
+        strict: If True (default) and the data has a municipality column,
+            all municipality numbers in 'data' must be present in 'municipalities'.
+        verbose: Whether to print during execution.
+
+    Returns:
+        None. The function writes files directly.
+    """
     write_func = (
         _write_neighbor_municipality_data
         if with_neighbors
         else _write_municipality_data
     )
-
     return write_func(
         data=data,
         out_folder=out_folder,
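A hedged call sketch for the expanded signature (the paths and the municipality GeoDataFrame are placeholders):

    from sgis.parallel.parallel import write_municipality_data

    write_municipality_data(
        data="gs://bucket/input/roads.parquet",         # hypothetical path
        out_folder="gs://bucket/output/roads_by_muni",  # hypothetical path
        municipalities=municipality_polygons,           # GeoDataFrame with a "KOMMUNENR" column
        muni_number_col="KOMMUNENR",
        file_type="parquet",
        clip=True,
        verbose=True,
    )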
@@ -552,27 +647,34 @@
         max_rows_per_chunk=max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        verbose=verbose,
     )


-def _validate_data(data: str | list[str]) -> str:
-    if isinstance(data, (str, Path)):
-        return data
+def _validate_data(
+    data: str | list[str] | DataFrame | GeoDataFrame,
+) -> DataFrame | GeoDataFrame:
     if hasattr(data, "__iter__") and len(data) == 1:
-        return data[0]
-    elif not isinstance(data, GeoDataFrame):
-        raise TypeError("'data' Must be a file path or a GeoDataFrame. Got", type(data))
+        data = data[0]
+    if isinstance(data, (str, Path)):
+        try:
+            return read_geopandas(str(data))
+        except ValueError as e:
+            try:
+                return read_pandas(str(data))
+            except ValueError as e2:
+                raise e.__class__(e, data) from e2
     return data


-def _get_out_path(out_folder, muni, file_type):
+def _get_out_path(out_folder: str | Path, muni: str, file_type: str) -> str:
     return str(Path(out_folder) / f"{muni}.{file_type.strip('.')}")


 def _write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
     func: Callable | None = None,
@@ -581,21 +683,15 @@ def _write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
-
-    if isinstance(data, (str, Path)):
-        try:
-            gdf = read_geopandas(str(data))
-        except ValueError as e:
-            try:
-                gdf = read_pandas(str(data))
-            except ValueError:
-                raise e.__class__(e, data)
-    elif isinstance(data, DataFrame):
-        gdf = data
+    if verbose:
+        to_print = out_folder
+        print(to_print)
     else:
-        raise TypeError(type(data))
+        to_print = None
+
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -608,22 +704,29 @@
         max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

-    for muni in municipalities[muni_number_col]:
-        print(muni)
-        out = _get_out_path(out_folder, muni, file_type)
-
-        gdf_muni = gdf.loc[gdf[muni_number_col] == muni]
-
-        if not len(gdf_muni):
-            if write_empty:
-                gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
-                gdf_muni["geometry"] = None
-                write_pandas(gdf_muni, out)
-            continue
-
-        write_geopandas(gdf_muni, out)
+    if municipalities is None:
+        muni_numbers = gdf[muni_number_col]
+    elif not isinstance(municipalities, DataFrame):
+        muni_numbers = municipalities
+    else:
+        muni_numbers = municipalities[muni_number_col]
+
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni,
+        muni_numbers,
+        kwargs=dict(
+            gdf=gdf,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )


 def _write_neighbor_municipality_data(
@@ -638,11 +741,15 @@ def _write_neighbor_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
+    if verbose:
+        to_print = out_folder
+        print("out_folder:", to_print)
+    else:
+        to_print = None

-    if isinstance(data, (str, Path)):
-        gdf = read_geopandas(str(data))
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -655,6 +762,7 @@
         max_rows_per_chunk,
         processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

     if municipalities.index.name != muni_number_col:
@@ -664,43 +772,97 @@
         municipalities, municipalities, max_distance=1
     )

-    for muni in municipalities.index:
-        out = _get_out_path(out_folder, muni, file_type)
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni_with_neighbors,
+        municipalities.index,
+        kwargs=dict(
+            gdf=gdf,
+            neighbor_munis=neighbor_munis,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )

-        muni_and_neighbors = neighbor_munis.loc[[muni]]
-        gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]

-        if not len(gdf_neighbor):
-            if write_empty:
-                gdf_neighbor["geometry"] = gdf_neighbor["geometry"].astype(str)
-                write_pandas(gdf_neighbor, out)
-            continue
+def _write_one_muni(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    gdf_muni = gdf.loc[gdf[muni_number_col] == muni_number]
+
+    if not len(gdf_muni):
+        if write_empty:
+            gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
+            gdf_muni["geometry"] = None
+            write_pandas(gdf_muni, out)
+        return

-        write_geopandas(gdf_neighbor, out)
+    write_geopandas(gdf_muni, out)
+
+
+def _write_one_muni_with_neighbors(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    neighbor_munis: Series,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    muni_and_neighbors: Series = neighbor_munis.loc[[muni_number]]
+    gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]
+
+    if not len(gdf_neighbor):
+        if write_empty:
+            gdf_neighbor = gdf_neighbor.drop(columns="geometry", errors="ignore")
+            gdf_neighbor["geometry"] = None
+            write_pandas(gdf_neighbor, out)
+        return
+
+    write_geopandas(gdf_neighbor, out)


 def _fix_missing_muni_numbers(
-    gdf,
-    municipalities,
-    muni_number_col,
-    clip,
-    max_rows_per_chunk,
-    processes_in_clip,
-    strict,
-):
+    gdf: GeoDataFrame,
+    municipalities: GeoDataFrame,
+    muni_number_col: str,
+    clip: bool,
+    max_rows_per_chunk: int,
+    processes_in_clip: int,
+    strict: bool,
+    to_print: str,
+) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
             return gdf
         if diffs := set(gdf[muni_number_col].values).difference(
             set(municipalities[muni_number_col].values)
         ):
-            message = (
-                f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore"
-            )
+            message = f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore."
             if strict:
                 raise ValueError(message)
             else:
-                warnings.warn(message)
+                warnings.warn(message, stacklevel=1)
             return gdf

     if municipalities is None:
@@ -717,7 +879,10 @@ def _fix_missing_muni_numbers(
             "GeoDataFrame to clip the geometries by."
         )

-    municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    try:
+        municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    except Exception as e:
+        raise e.__class__(e, to_print) from e

     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
@@ -732,6 +897,7 @@
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
             processes=processes_in_clip,
             max_rows_per_chunk=max_rows_per_chunk,
+            to_print=to_print,
         )

         return pd.concat([notna, notna_anymore], ignore_index=True)
@@ -744,25 +910,42 @@
         municipalities[[muni_number_col, municipalities._geometry_column_name]],
         processes=processes_in_clip,
         max_rows_per_chunk=max_rows_per_chunk,
+        to_print=to_print,
     )


 def parallel_overlay(
     df1: GeoDataFrame,
     df2: GeoDataFrame,
-    # muni_number_col: str,
     processes: int,
     max_rows_per_chunk: int,
     backend: str = "loky",
+    to_print: str | None = None,
     **kwargs,
 ) -> GeoDataFrame:
-    # df2 = df2[[muni_number_col, df2._geometry_column_name]]
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.

+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
     if len(df1) < max_rows_per_chunk:
         return clean_overlay(df1, df2, **kwargs)

-    # df2 = df2.dissolve(by=muni_number_col, as_index=False)
-
     n_chunks = len(df1) // max_rows_per_chunk
     chunks = np.array_split(np.arange(len(df1)), n_chunks)

@@ -778,26 +961,50 @@
     out = Parallel(processes, backend=backend).map(
         _clean_intersection,
         df1_chunked,
-        args=(df2,),
+        args=(df2, to_print) if to_print else (df2,),
     )
     return pd.concat(out, ignore_index=True)


-def _clean_intersection(df1, df2):
-    print(len(df1))
+def _clean_intersection(
+    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str = ""
+) -> GeoDataFrame:
+    print(to_print, "- intersection chunk len:", len(df1))
     return clean_overlay(df1, df2, how="intersection")


 def chunkwise(
     func: Callable,
-    df: GeoDataFrame,
+    df: GeoDataFrame | pd.DataFrame,
     max_rows_per_chunk: int = 150_000,
-    n_chunks: int = None,
+    n_chunks: int | None = None,
     args: tuple | None = None,
     kwargs: dict | None = None,
     n_jobs: int = 1,
     backend: str = "loky",
-) -> GeoDataFrame:
+) -> GeoDataFrame | pd.DataFrame:
+    """Run a function in parallel on chunks of a DataFrame.
+
+    This method is used to process large (Geo)DataFrames in manageable pieces,
+    optionally in parallel.
+
+    Args:
+        func: The function to apply to each chunk. This function must accept a DataFrame as
+            its first argument and return a DataFrame.
+        df: The DataFrame to be chunked and processed.
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
+        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
+            calculated based on 'max_rows_per_chunk'.
+        args: Additional positional arguments to pass to 'func'.
+        kwargs: Keyword arguments to pass to 'func'.
+        n_jobs: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
+
+    Returns:
+        GeoDataFrame: A GeoDataFrame resulting from concatenating the results of applying 'func'
+            to each chunk of the original GeoDataFrame.
+
+    """
     if len(df) < max_rows_per_chunk:
         return func(df, *args, **kwargs)

@@ -815,3 +1022,26 @@
         kwargs=kwargs,
     )
     return pd.concat(out, ignore_index=True)
+
+
+def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict:
+    if not isinstance(args, tuple):
+        raise TypeError("args should be a tuple (it should not be unpacked with *)")
+    argnames = inspect.getfullargspec(func).args[index_start:]
+    return {name: value for value, name in zip(args, argnames, strict=False)}
+
+
+def _try_to_read_geopandas(path: str, **kwargs) -> GeoDataFrame | DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_geopandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
+
+
+def _try_to_read_pandas(path: str, **kwargs) -> DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_pandas(path, **kwargs)
+    except FileNotFoundError:
+        return None