ssb-sgis 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. sgis/__init__.py +20 -9
  2. sgis/debug_config.py +24 -0
  3. sgis/exceptions.py +2 -2
  4. sgis/geopandas_tools/bounds.py +33 -36
  5. sgis/geopandas_tools/buffer_dissolve_explode.py +136 -35
  6. sgis/geopandas_tools/centerlines.py +4 -91
  7. sgis/geopandas_tools/cleaning.py +1576 -583
  8. sgis/geopandas_tools/conversion.py +38 -19
  9. sgis/geopandas_tools/duplicates.py +29 -8
  10. sgis/geopandas_tools/general.py +263 -100
  11. sgis/geopandas_tools/geometry_types.py +4 -4
  12. sgis/geopandas_tools/neighbors.py +19 -15
  13. sgis/geopandas_tools/overlay.py +2 -2
  14. sgis/geopandas_tools/point_operations.py +5 -5
  15. sgis/geopandas_tools/polygon_operations.py +510 -105
  16. sgis/geopandas_tools/polygons_as_rings.py +40 -8
  17. sgis/geopandas_tools/sfilter.py +29 -12
  18. sgis/helpers.py +3 -3
  19. sgis/io/dapla_functions.py +238 -19
  20. sgis/io/read_parquet.py +1 -1
  21. sgis/maps/examine.py +27 -12
  22. sgis/maps/explore.py +450 -65
  23. sgis/maps/legend.py +177 -76
  24. sgis/maps/map.py +206 -103
  25. sgis/maps/maps.py +178 -105
  26. sgis/maps/thematicmap.py +243 -83
  27. sgis/networkanalysis/_service_area.py +6 -1
  28. sgis/networkanalysis/closing_network_holes.py +2 -2
  29. sgis/networkanalysis/cutting_lines.py +15 -8
  30. sgis/networkanalysis/directednetwork.py +1 -1
  31. sgis/networkanalysis/finding_isolated_networks.py +15 -8
  32. sgis/networkanalysis/networkanalysis.py +17 -19
  33. sgis/networkanalysis/networkanalysisrules.py +1 -1
  34. sgis/networkanalysis/traveling_salesman.py +1 -1
  35. sgis/parallel/parallel.py +64 -27
  36. sgis/raster/__init__.py +0 -6
  37. sgis/raster/base.py +208 -0
  38. sgis/raster/cube.py +54 -8
  39. sgis/raster/image_collection.py +3257 -0
  40. sgis/raster/indices.py +17 -5
  41. sgis/raster/raster.py +138 -243
  42. sgis/raster/sentinel_config.py +120 -0
  43. sgis/raster/zonal.py +0 -1
  44. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/METADATA +6 -7
  45. ssb_sgis-1.0.4.dist-info/RECORD +62 -0
  46. sgis/raster/methods_as_functions.py +0 -0
  47. sgis/raster/torchgeo.py +0 -171
  48. ssb_sgis-1.0.2.dist-info/RECORD +0 -61
  49. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/LICENSE +0 -0
  50. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/WHEEL +0 -0
@@ -8,6 +8,7 @@ from geopandas import GeoSeries
8
8
  from geopandas.array import GeometryArray
9
9
  from numpy.typing import NDArray
10
10
  from pyproj import CRS
11
+ from shapely import difference
11
12
  from shapely import get_coordinates
12
13
  from shapely import get_exterior_ring
13
14
  from shapely import get_interior_ring
@@ -320,14 +321,14 @@ class PolygonsAsRings:
320
321
 
321
322
  exterior = self.rings.loc[self.is_exterior].sort_index()
322
323
  assert exterior.shape == (len(self.gdf),)
324
+ nonempty_exteriors = exterior.loc[lambda x: x.notna()]
325
+ empty_exteriors = exterior.loc[lambda x: x.isna()]
323
326
 
324
327
  nonempty_interiors = self.rings.loc[self.is_interior]
325
328
 
326
329
  if not len(nonempty_interiors):
327
- try:
328
- return make_valid(polygons(exterior.values))
329
- except Exception:
330
- return _geoms_to_linearrings_fallback(exterior).values
330
+ nonempty_exteriors.loc[:] = make_valid(polygons(nonempty_exteriors.values))
331
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
331
332
 
332
333
  empty_interiors = pd.Series(
333
334
  [None for _ in range(len(self.gdf) * self.max_rings)],
@@ -343,10 +344,41 @@ class PolygonsAsRings:
343
344
  )
344
345
  assert interiors.shape == (len(self.gdf), self.max_rings), interiors.shape
345
346
 
346
- try:
347
- return make_valid(polygons(exterior.values, interiors.values))
348
- except Exception:
349
- return _geoms_to_linearrings_fallback(exterior, interiors).values
347
+ interiors = interiors.loc[
348
+ interiors.index.get_level_values(1).isin(
349
+ nonempty_exteriors.index.get_level_values(1)
350
+ )
351
+ ]
352
+ assert interiors.index.get_level_values(1).equals(
353
+ nonempty_exteriors.index.get_level_values(1)
354
+ )
355
+
356
+ # nan gives TypeError in shapely.polygons. None does not.
357
+ for i, _ in enumerate(interiors.columns):
358
+ interiors.loc[interiors.iloc[:, i].isna(), i] = None
359
+ nonempty_exteriors.loc[nonempty_exteriors.isna()] = None
360
+
361
+ # construct polygons with holes
362
+ polys = make_valid(
363
+ polygons(
364
+ nonempty_exteriors.values,
365
+ interiors.values,
366
+ )
367
+ )
368
+
369
+ # interiors might have moved (e.g. snapped) so that they are not within the exterior
370
+ # these interiors will not be holes, so we need to erase them manually
371
+ interiors_as_polys = make_valid(polygons(interiors.values))
372
+ # merge interior polygons into 1d array
373
+ interiors_as_polys = np.array(
374
+ [
375
+ make_valid(unary_union(interiors_as_polys[i, :]))
376
+ for i in range(interiors_as_polys.shape[0])
377
+ ]
378
+ )
379
+ # erase rowwise
380
+ nonempty_exteriors.loc[:] = make_valid(difference(polys, interiors_as_polys))
381
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
350
382
 
351
383
 
352
384
  def get_linearring_series(geoms: GeoDataFrame | GeoSeries) -> pd.Series:
@@ -4,7 +4,9 @@ import numpy as np
4
4
  import pandas as pd
5
5
  from geopandas import GeoDataFrame
6
6
  from geopandas import GeoSeries
7
+ from geopandas import __version__ as geopandas_version
7
8
  from shapely import Geometry
9
+ from shapely import STRtree
8
10
 
9
11
  from .conversion import to_gdf
10
12
 
@@ -15,6 +17,7 @@ def sfilter(
15
17
  gdf: GeoDataFrame | GeoSeries,
16
18
  other: GeoDataFrame | GeoSeries | Geometry,
17
19
  predicate: str = "intersects",
20
+ distance: int | float | None = None,
18
21
  ) -> GeoDataFrame:
19
22
  """Filter a GeoDataFrame or GeoSeries by spatial predicate.
20
23
 
@@ -29,13 +32,14 @@ def sfilter(
29
32
  gdf: The GeoDataFrame.
30
33
  other: The geometry object to filter 'gdf' by.
31
34
  predicate: Spatial predicate to use. Defaults to 'intersects'.
35
+ distance: Max distance to allow if predicate=="dwithin".
32
36
 
33
37
  Returns:
34
38
  A copy of 'gdf' with only the rows matching the
35
39
  spatial predicate with 'other'.
36
40
 
37
41
  Examples:
38
- --------
42
+ ---------
39
43
  >>> import sgis as sg
40
44
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
41
45
  >>> df1
@@ -66,7 +70,7 @@ def sfilter(
66
70
  Also equivalent to using the intersects method, which
67
71
  is often a lot slower since df2 must be dissolved:
68
72
 
69
- >>> df1.loc[df1.intersects(df2.unary_union)]
73
+ >>> df1.loc[df1.intersects(df2.union_all())]
70
74
  geometry
71
75
  0 POINT (0.00000 0.00000)
72
76
 
@@ -76,7 +80,7 @@ def sfilter(
76
80
 
77
81
  other = _sfilter_checks(other, crs=gdf.crs)
78
82
 
79
- indices = _get_sfilter_indices(gdf, other, predicate)
83
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
80
84
 
81
85
  return gdf.iloc[indices]
82
86
 
@@ -85,6 +89,7 @@ def sfilter_split(
85
89
  gdf: GeoDataFrame | GeoSeries,
86
90
  other: GeoDataFrame | GeoSeries | Geometry,
87
91
  predicate: str = "intersects",
92
+ distance: int | float | None = None,
88
93
  ) -> tuple[GeoDataFrame, GeoDataFrame]:
89
94
  """Split a GeoDataFrame or GeoSeries by spatial predicate.
90
95
 
@@ -95,13 +100,14 @@ def sfilter_split(
95
100
  gdf: The GeoDataFrame.
96
101
  other: The geometry object to filter 'gdf' by.
97
102
  predicate: Spatial predicate to use. Defaults to 'intersects'.
103
+ distance: Max distance to allow if predicate=="dwithin".
98
104
 
99
105
  Returns:
100
106
  A tuple of GeoDataFrames, one with the rows that match the spatial predicate
101
107
  and one with the rows that do not.
102
108
 
103
109
  Examples:
104
- --------
110
+ ---------
105
111
  >>> import sgis as sg
106
112
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
107
113
  >>> df1
@@ -135,7 +141,7 @@ def sfilter_split(
135
141
  Also equivalent to using the intersects method, which
136
142
  is often slower since df2 must be dissolved:
137
143
 
138
- >>> filt = df1.intersects(df2.unary_union)
144
+ >>> filt = df1.intersects(df2.union_all())
139
145
  >>> intersecting = df1.loc[filt]
140
146
  >>> not_intersecting = df1.loc[~filt]
141
147
 
@@ -145,7 +151,7 @@ def sfilter_split(
145
151
 
146
152
  other = _sfilter_checks(other, crs=gdf.crs)
147
153
 
148
- indices = _get_sfilter_indices(gdf, other, predicate)
154
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
149
155
 
150
156
  return (
151
157
  gdf.iloc[indices],
@@ -157,6 +163,7 @@ def sfilter_inverse(
157
163
  gdf: GeoDataFrame | GeoSeries,
158
164
  other: GeoDataFrame | GeoSeries | Geometry,
159
165
  predicate: str = "intersects",
166
+ distance: int | float | None = None,
160
167
  ) -> GeoDataFrame | GeoSeries:
161
168
  """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
162
169
 
@@ -166,13 +173,14 @@ def sfilter_inverse(
166
173
  gdf: The GeoDataFrame or GeoSeries.
167
174
  other: The geometry object to filter 'gdf' by.
168
175
  predicate: Spatial predicate to use. Defaults to 'intersects'.
176
+ distance: Max distance to allow if predicate=="dwithin".
169
177
 
170
178
  Returns:
171
179
  A copy of 'gdf' with only the rows that do not match the
172
180
  spatial predicate with 'other'.
173
181
 
174
182
  Examples:
175
- --------
183
+ ---------
176
184
  >>> import sgis as sg
177
185
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
178
186
  >>> df1
@@ -202,7 +210,7 @@ def sfilter_inverse(
202
210
  Also equivelent to using the intersects method, which
203
211
  is often slower since df2 must be dissolved:
204
212
 
205
- >>> not_intersecting = df1.loc[~df1.intersects(df2.unary_union)]
213
+ >>> not_intersecting = df1.loc[~df1.intersects(df2.union_all())]
206
214
 
207
215
  """
208
216
  if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
@@ -210,7 +218,7 @@ def sfilter_inverse(
210
218
 
211
219
  other = _sfilter_checks(other, crs=gdf.crs)
212
220
 
213
- indices = _get_sfilter_indices(gdf, other, predicate)
221
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
214
222
 
215
223
  return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
216
224
 
@@ -243,6 +251,7 @@ def _get_sfilter_indices(
243
251
  left: GeoDataFrame | GeoSeries,
244
252
  right: GeoDataFrame | GeoSeries | Geometry,
245
253
  predicate: str,
254
+ distance: int | float | None,
246
255
  ) -> np.ndarray:
247
256
  """Compute geometric comparisons and get matching indices.
248
257
 
@@ -276,17 +285,25 @@ def _get_sfilter_indices(
276
285
  # contains is a faster predicate
277
286
  # see discussion at https://github.com/geopandas/geopandas/pull/1421
278
287
  predicate = "contains"
279
- sindex = left.sindex
288
+ sindex, kwargs = _get_spatial_tree(left)
280
289
  input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
281
290
  else:
282
291
  # all other predicates are symmetric
283
292
  # keep them the same
284
- sindex = right.sindex
293
+ sindex, kwargs = _get_spatial_tree(right)
285
294
  input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
286
295
 
287
- l_idx, r_idx = sindex.query(input_geoms, predicate=predicate, sort=False)
296
+ l_idx, r_idx = sindex.query(
297
+ input_geoms, predicate=predicate, distance=distance, **kwargs
298
+ )
288
299
 
289
300
  if original_predicate == "within":
290
301
  return np.sort(np.unique(r_idx))
291
302
 
292
303
  return np.sort(np.unique(l_idx))
304
+
305
+
306
+ def _get_spatial_tree(df):
307
+ if int(geopandas_version[0]) >= 1:
308
+ return df.sindex, {"sort": False}
309
+ return STRtree(df.geometry.values), {}
sgis/helpers.py CHANGED
@@ -141,7 +141,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
141
141
  A list of file paths.
142
142
  """
143
143
  if not recursive:
144
- return [path for path in glob.glob(str(Path(root)) + "/*")]
144
+ return [path for path in glob.glob(str(Path(root)) + "/**")]
145
145
  paths = []
146
146
  for root_dir, _, files in os.walk(root):
147
147
  for file in files:
@@ -205,7 +205,7 @@ def unit_is_degrees(gdf: GeoDataFrame) -> bool:
205
205
 
206
206
  def get_object_name(
207
207
  var: object, start: int = 2, stop: int = 7, ignore_self: bool = True
208
- ) -> str | None:
208
+ ) -> str:
209
209
  frame = inspect.currentframe() # frame can be FrameType or None
210
210
  if frame:
211
211
  try:
@@ -230,7 +230,7 @@ def get_object_name(
230
230
  finally:
231
231
  if frame:
232
232
  del frame # Explicitly delete frame reference to assist with garbage collection
233
- return None
233
+ raise ValueError(f"Couldn't find name for {var}")
234
234
 
235
235
 
236
236
  def make_namedict(gdfs: tuple[GeoDataFrame]) -> dict[int, str]:
@@ -1,24 +1,37 @@
1
1
  """Functions for reading and writing GeoDataFrames in Statistics Norway's GCS Dapla."""
2
2
 
3
+ import json
4
+ import multiprocessing
5
+ import os
6
+ from collections.abc import Iterable
3
7
  from pathlib import Path
4
8
 
5
9
  import dapla as dp
6
10
  import geopandas as gpd
7
11
  import joblib
8
12
  import pandas as pd
13
+ import pyarrow
14
+ import shapely
9
15
  from geopandas import GeoDataFrame
16
+ from geopandas import GeoSeries
10
17
  from geopandas.io.arrow import _geopandas_to_arrow
11
18
  from pandas import DataFrame
12
- from pyarrow import parquet
19
+ from pyarrow import ArrowInvalid
20
+
21
+ from ..geopandas_tools.sfilter import sfilter
22
+
23
+ PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
13
24
 
14
25
 
15
26
  def read_geopandas(
16
- gcs_path: str | Path | list[str | Path],
27
+ gcs_path: str | Path | list[str | Path] | tuple[str | Path] | GeoSeries,
17
28
  pandas_fallback: bool = False,
18
29
  file_system: dp.gcs.GCSFileSystem | None = None,
30
+ mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
31
+ threads: int | None = None,
19
32
  **kwargs,
20
33
  ) -> GeoDataFrame | DataFrame:
21
- """Reads geoparquet or other geodata from a file on GCS.
34
+ """Reads geoparquet or other geodata from one or more files on GCS.
22
35
 
23
36
  If the file has 0 rows, the contents will be returned as a pandas.DataFrame,
24
37
  since geopandas does not read and write empty tables.
@@ -33,6 +46,11 @@ def read_geopandas(
33
46
  not be read with geopandas and the number of rows is more than 0. If True,
34
47
  the file will be read with pandas if geopandas fails.
35
48
  file_system: Optional file system.
49
+ mask: Optional geometry mask to keep only intersecting geometries.
50
+ If 'gcs_path' is an iterable of multiple paths, only the files
51
+ with a bbox that intersects the mask are read, then filtered by location.
52
+ threads: Number of threads to use if reading multiple files. Defaults to
53
+ the number of files to read or the number of available threads (if lower).
36
54
  **kwargs: Additional keyword arguments passed to geopandas' read_parquet
37
55
  or read_file, depending on the file type.
38
56
 
@@ -42,14 +60,52 @@ def read_geopandas(
42
60
  if file_system is None:
43
61
  file_system = dp.FileClient.get_gcs_file_system()
44
62
 
45
- if isinstance(gcs_path, (list, tuple)):
63
+ if not isinstance(gcs_path, (str | Path | os.PathLike)):
46
64
  kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
65
+
66
+ if mask is not None:
67
+ if not isinstance(gcs_path, GeoSeries):
68
+ bounds_series: GeoSeries = get_bounds_series(
69
+ gcs_path,
70
+ file_system,
71
+ threads=threads,
72
+ pandas_fallback=pandas_fallback,
73
+ )
74
+ else:
75
+ bounds_series = gcs_path
76
+ new_bounds_series = sfilter(bounds_series, mask)
77
+ if not len(new_bounds_series):
78
+ if isinstance(kwargs.get("columns"), Iterable):
79
+ cols = {col: [] for col in kwargs["columns"]}
80
+ else:
81
+ cols = {}
82
+ for path in bounds_series.index:
83
+ try:
84
+ cols |= {col: [] for col in _get_columns(path, file_system)}
85
+ except ArrowInvalid as e:
86
+ if file_system.isfile(path):
87
+ raise ArrowInvalid(e, path) from e
88
+
89
+ return GeoDataFrame(cols | {"geometry": []})
90
+ paths = list(new_bounds_series.index)
91
+ else:
92
+ if isinstance(gcs_path, GeoSeries):
93
+ paths = list(gcs_path.index)
94
+ else:
95
+ paths = list(gcs_path)
96
+
97
+ if threads is None:
98
+ threads = min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
99
+
47
100
  # recursive read with threads
48
- with joblib.Parallel(n_jobs=len(gcs_path), backend="threading") as parallel:
101
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
49
102
  dfs: list[GeoDataFrame] = parallel(
50
- joblib.delayed(read_geopandas)(x, **kwargs) for x in gcs_path
103
+ joblib.delayed(read_geopandas)(x, **kwargs) for x in paths
51
104
  )
52
- return pd.concat(dfs)
105
+ df = pd.concat(dfs)
106
+ if mask is not None:
107
+ return sfilter(df, mask)
108
+ return df
53
109
 
54
110
  if not isinstance(gcs_path, str):
55
111
  try:
@@ -60,20 +116,26 @@ def read_geopandas(
60
116
  if "parquet" in gcs_path or "prqt" in gcs_path:
61
117
  with file_system.open(gcs_path, mode="rb") as file:
62
118
  try:
63
- return gpd.read_parquet(file, **kwargs)
119
+ df = gpd.read_parquet(file, **kwargs)
64
120
  except ValueError as e:
65
121
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
66
- raise e
122
+ raise e.__class__(
123
+ f"{e.__class__.__name__}: {e} for {gcs_path}."
124
+ ) from e
67
125
  df = dp.read_pandas(gcs_path, **kwargs)
68
126
 
69
127
  if pandas_fallback or not len(df):
70
128
  return df
71
129
  else:
72
- raise e
130
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
131
+ raise e.__class__(
132
+ f"{e.__class__.__name__}: {e} for {df}." + more_txt
133
+ ) from e
134
+
73
135
  else:
74
136
  with file_system.open(gcs_path, mode="rb") as file:
75
137
  try:
76
- return gpd.read_file(file, **kwargs)
138
+ df = gpd.read_file(file, **kwargs)
77
139
  except ValueError as e:
78
140
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
79
141
  raise e
@@ -82,7 +144,144 @@ def read_geopandas(
82
144
  if pandas_fallback or not len(df):
83
145
  return df
84
146
  else:
85
- raise e
147
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
148
+ raise e.__class__(
149
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
150
+ ) from e
151
+
152
+ if mask is not None:
153
+ return sfilter(df, mask)
154
+ return df
155
+
156
+
157
+ def _get_bounds_parquet(
158
+ path: str | Path, file_system: dp.gcs.GCSFileSystem, pandas_fallback: bool = False
159
+ ) -> tuple[list[float], dict] | tuple[None, None]:
160
+ with file_system.open(path) as f:
161
+ try:
162
+ num_rows = pyarrow.parquet.read_metadata(f).num_rows
163
+ except ArrowInvalid as e:
164
+ if not file_system.isfile(f):
165
+ return None, None
166
+ raise ArrowInvalid(e, path) from e
167
+ if not num_rows:
168
+ return None, None
169
+ meta = pyarrow.parquet.read_schema(f).metadata
170
+ try:
171
+ meta = json.loads(meta[b"geo"])["columns"]["geometry"]
172
+ except KeyError as e:
173
+ if pandas_fallback:
174
+ return None, None
175
+ raise KeyError(
176
+ f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
177
+ # f"{num_rows=}",
178
+ # meta,
179
+ ) from e
180
+ return meta["bbox"], meta["crs"]
181
+
182
+
183
+ def _get_columns(path: str | Path, file_system: dp.gcs.GCSFileSystem) -> pd.Index:
184
+ with file_system.open(path) as f:
185
+ schema = pyarrow.parquet.read_schema(f)
186
+ index_cols = _get_index_cols(schema)
187
+ return pd.Index(schema.names).difference(index_cols)
188
+
189
+
190
+ def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
191
+ cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
192
+ return [x for x in cols if not isinstance(x, dict)]
193
+
194
+
195
+ def get_bounds_series(
196
+ paths: list[str | Path] | tuple[str | Path],
197
+ file_system: dp.gcs.GCSFileSystem | None = None,
198
+ threads: int | None = None,
199
+ pandas_fallback: bool = False,
200
+ ) -> GeoSeries:
201
+ """Get a GeoSeries with file paths as indexes and the file's bounds as values.
202
+
203
+ The returned GeoSeries can be used as the first argument of 'read_geopandas'
204
+ along with the 'mask' keyword.
205
+
206
+ Args:
207
+ paths: Iterable of file paths in gcs.
208
+ file_system: Optional instance of dp.gcs.GCSFileSystem.
209
+ If None, an instance is created within the function.
210
+ Note that this is slower in long loops.
211
+ threads: Number of threads to use if reading multiple files. Defaults to
212
+ the number of files to read or the number of available threads (if lower).
213
+ pandas_fallback: If False (default), an exception is raised if the file has
214
+ no geo metadata. If True, the geometry value is set to None for this file.
215
+
216
+ Returns:
217
+ A geopandas.GeoSeries with file paths as indexes and bounds as values.
218
+
219
+ Examples:
220
+ ---------
221
+ >>> import sgis as sg
222
+ >>> import dapla as dp
223
+ >>> file_system = dp.FileClient.get_gcs_file_system()
224
+ >>> all_paths = file_system.ls("...")
225
+
226
+ Get the bounds of all your file paths, indexed by path.
227
+
228
+ >>> bounds_series = sg.get_bounds_series(all_paths, file_system)
229
+ >>> bounds_series
230
+ .../0301.parquet POLYGON ((273514.334 6638380.233, 273514.334 6...
231
+ .../1101.parquet POLYGON ((6464.463 6503547.192, 6464.463 65299...
232
+ .../1103.parquet POLYGON ((-6282.301 6564097.347, -6282.301 660...
233
+ .../1106.parquet POLYGON ((-46359.891 6622984.385, -46359.891 6...
234
+ .../1108.parquet POLYGON ((30490.798 6551661.467, 30490.798 658...
235
+ ...
236
+ .../5628.parquet POLYGON ((1019391.867 7809550.777, 1019391.867...
237
+ .../5630.parquet POLYGON ((1017907.145 7893398.317, 1017907.145...
238
+ .../5632.parquet POLYGON ((1075687.587 7887714.263, 1075687.587...
239
+ .../5634.parquet POLYGON ((1103447.451 7874551.663, 1103447.451...
240
+ .../5636.parquet POLYGON ((1024129.618 7838961.91, 1024129.618 ...
241
+ Length: 357, dtype: geometry
242
+
243
+ Make a grid around the total bounds of the files,
244
+ and read geometries intersecting with the mask in a loop.
245
+
246
+ >>> grid = sg.make_grid(bounds_series, 10_000)
247
+ >>> for mask in grid.geometry:
248
+ ... df = sg.read_geopandas(
249
+ ... bounds_series,
250
+ ... mask=mask,
251
+ ... file_system=file_system,
252
+ ... )
253
+
254
+ """
255
+ if file_system is None:
256
+ file_system = dp.FileClient.get_gcs_file_system()
257
+
258
+ if threads is None:
259
+ threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
260
+
261
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
262
+ bounds: list[tuple[list[float], dict]] = parallel(
263
+ joblib.delayed(_get_bounds_parquet)(
264
+ path, file_system=file_system, pandas_fallback=pandas_fallback
265
+ )
266
+ for path in paths
267
+ )
268
+ crss = {json.dumps(x[1]) for x in bounds}
269
+ crss = {
270
+ crs
271
+ for crs in crss
272
+ if not any(str(crs).lower() == txt for txt in ["none", "null"])
273
+ }
274
+ if not crss:
275
+ crs = None
276
+ elif len(crss) == 1:
277
+ crs = next(iter(crss))
278
+ else:
279
+ raise ValueError(f"crs mismatch: {crss}")
280
+ return GeoSeries(
281
+ [shapely.box(*bbox[0]) if bbox[0] is not None else None for bbox in bounds],
282
+ index=paths,
283
+ crs=crs,
284
+ )
86
285
 
87
286
 
88
287
  def write_geopandas(
@@ -91,6 +290,7 @@ def write_geopandas(
91
290
  overwrite: bool = True,
92
291
  pandas_fallback: bool = False,
93
292
  file_system: dp.gcs.GCSFileSystem | None = None,
293
+ write_covering_bbox: bool = False,
94
294
  **kwargs,
95
295
  ) -> None:
96
296
  """Writes a GeoDataFrame to the specified format.
@@ -106,6 +306,13 @@ def write_geopandas(
106
306
  not be written with geopandas and the number of rows is more than 0. If True,
107
307
  the file will be written without geo-metadata if >0 rows.
108
308
  file_system: Optional file system.
309
+ write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
310
+ Writing a bbox column can be computationally expensive, but allows you to specify
311
+ a bbox in :func:`read_parquet` for filtered reading.
312
+ Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
313
+ considered as experimental. While writing the column is backwards compatible, using it
314
+ for filtering may not be supported by all readers.
315
+
109
316
  **kwargs: Additional keyword arguments passed to parquet.write_table
110
317
  (for parquet) or geopandas' to_file method (if not parquet).
111
318
  """
@@ -118,25 +325,37 @@ def write_geopandas(
118
325
  if not overwrite and exists(gcs_path):
119
326
  raise ValueError("File already exists.")
120
327
 
121
- if file_system is None:
122
- file_system = dp.FileClient.get_gcs_file_system()
123
-
124
328
  if not isinstance(df, GeoDataFrame):
125
329
  raise ValueError("DataFrame must be GeoDataFrame.")
126
330
 
331
+ if file_system is None:
332
+ file_system = dp.FileClient.get_gcs_file_system()
333
+
127
334
  if not len(df):
128
335
  if pandas_fallback:
129
- df.geometry = df.geometry.astype(str)
130
336
  df = pd.DataFrame(df)
131
- dp.write_pandas(df, gcs_path, **kwargs)
337
+ df.geometry = df.geometry.astype(str)
338
+ df.geometry = None
339
+ try:
340
+ dp.write_pandas(df, gcs_path, **kwargs)
341
+ except Exception as e:
342
+ more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
343
+ raise e.__class__(
344
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
345
+ ) from e
132
346
  return
133
347
 
134
348
  file_system = dp.FileClient.get_gcs_file_system()
135
349
 
136
350
  if ".parquet" in gcs_path or "prqt" in gcs_path:
137
351
  with file_system.open(gcs_path, mode="wb") as buffer:
138
- table = _geopandas_to_arrow(df, index=df.index, schema_version=None)
139
- parquet.write_table(table, buffer, compression="snappy", **kwargs)
352
+ table = _geopandas_to_arrow(
353
+ df,
354
+ index=df.index,
355
+ schema_version=None,
356
+ write_covering_bbox=write_covering_bbox,
357
+ )
358
+ pyarrow.parquet.write_table(table, buffer, compression="snappy", **kwargs)
140
359
  return
141
360
 
142
361
  layer = kwargs.pop("layer", None)
sgis/io/read_parquet.py CHANGED
@@ -15,7 +15,7 @@ def read_parquet_url(url: str) -> GeoDataFrame:
15
15
  A GeoDataFrame.
16
16
 
17
17
  Examples:
18
- --------
18
+ ---------
19
19
  >>> from sgis import read_parquet_url
20
20
  >>> url = "https://media.githubusercontent.com/media/statisticsnorway/ssb-sgis/main/tests/testdata/points_oslo.parquet"
21
21
  >>> points = read_parquet_url(url)