ssb-sgis 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +2 -5
- sgis/conf.py +72 -0
- sgis/geopandas_tools/cleaning.py +583 -1577
- sgis/geopandas_tools/duplicates.py +17 -3
- sgis/helpers.py +22 -0
- sgis/io/__init__.py +6 -0
- sgis/io/dapla_functions.py +415 -74
- sgis/maps/explore.py +23 -5
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/RECORD +12 -10
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/WHEEL +0 -0
sgis/io/dapla_functions.py
CHANGED
@@ -2,30 +2,46 @@
 
 from __future__ import annotations
 
+import functools
+import glob
 import json
 import multiprocessing
 import os
+import shutil
+import uuid
+from collections.abc import Callable
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
 from pathlib import Path
 
-import dapla as dp
 import geopandas as gpd
 import joblib
 import pandas as pd
 import pyarrow
+import pyarrow.dataset
+import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 import shapely
-from gcsfs import GCSFileSystem
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid
 
+from ..conf import config
+from ..geopandas_tools.conversion import to_shapely
 from ..geopandas_tools.general import get_common_crs
 from ..geopandas_tools.sfilter import sfilter
+from ..helpers import _get_file_system
+
+try:
+    from gcsfs import GCSFileSystem
+except ImportError:
+    pass
 
 PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
+NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
 
 
 def read_geopandas(
@@ -34,6 +50,7 @@ def read_geopandas(
     file_system: GCSFileSystem | None = None,
     mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
     threads: int | None = None,
+    filters: pyarrow.dataset.Expression | None = None,
     **kwargs,
 ) -> GeoDataFrame | DataFrame:
     """Reads geoparquet or other geodata from one or more files on GCS.
@@ -56,18 +73,18 @@ def read_geopandas(
             with a bbox that intersects the mask are read, then filtered by location.
         threads: Number of threads to use if reading multiple files. Defaults to
             the number of files to read or the number of available threads (if lower).
+        filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
+            structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
+            More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
         **kwargs: Additional keyword arguments passed to geopandas' read_parquet
             or read_file, depending on the file type.
 
     Returns:
         A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
     """
-
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = _get_file_system(file_system, kwargs)
 
     if not isinstance(gcs_path, (str | Path | os.PathLike)):
-        kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
-
         cols = {}
         if mask is not None:
             if not isinstance(gcs_path, GeoSeries):
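The new `filters` argument is forwarded to the underlying parquet readers, so it accepts the same forms as `pandas.read_parquet`/pyarrow. A minimal sketch of how a call might look; the bucket path and the column names (`year`, `kommune`) are purely illustrative and not taken from the diff:

    import pyarrow.compute as pc
    import sgis as sg

    # list-of-tuples form, as described in the docstring above
    df = sg.read_geopandas(
        "gs://bucket/data.parquet",  # hypothetical path
        filters=[("year", "==", 2024), ("kommune", "in", ["0301", "3005"])],
    )

    # or an explicit pyarrow expression
    df = sg.read_geopandas(
        "gs://bucket/data.parquet",  # hypothetical path
        filters=(pc.field("year") == 2024) & (pc.field("kommune").isin(["0301", "3005"])),
    )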
@@ -106,7 +123,16 @@ def read_geopandas(
         # recursive read with threads
         with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
             dfs: list[GeoDataFrame] = parallel(
-                joblib.delayed(read_geopandas)(
+                joblib.delayed(read_geopandas)(
+                    x,
+                    filters=filters,
+                    file_system=file_system,
+                    pandas_fallback=pandas_fallback,
+                    mask=mask,
+                    threads=threads,
+                    **kwargs,
+                )
+                for x in paths
             )
 
         if dfs:
@@ -124,22 +150,35 @@ def read_geopandas(
             return sfilter(df, mask)
         return df
 
-
-
-
-
-
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=_read_geopandas,
+                file_system=file_system,
+                mask=mask,
+                pandas_fallback=pandas_fallback,
+                filters=filters,
+                child_paths=child_paths,
+                **kwargs,
+            )
+        )
 
     if "parquet" in gcs_path or "prqt" in gcs_path:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_parquet(
+                df = gpd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e.__class__(
                         f"{e.__class__.__name__}: {e} for {gcs_path}."
                     ) from e
-                df = pd.read_parquet(
+                df = pd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             if pandas_fallback or not len(df):
                 return df
             else:
@@ -153,11 +192,16 @@ def read_geopandas(
     else:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_file(
+                df = gpd.read_file(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e
-
+                file_type: str = Path(gcs_path).suffix.strip(".")
+                df = getattr(pd, f"read_{file_type}")(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
 
             if pandas_fallback or not len(df):
                 return df
@@ -179,31 +223,42 @@ def read_geopandas(
 def _get_bounds_parquet(
     path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
-    with file_system.open(path) as
+    with file_system.open(path, "rb") as file:
+        return _get_bounds_parquet_from_open_file(file, file_system)
+
+
+def _get_bounds_parquet_from_open_file(
+    file, file_system
+) -> tuple[list[float], dict] | tuple[None, None]:
+    geo_metadata = _get_geo_metadata(file, file_system)
+    if not geo_metadata:
+        return None, None
+    return geo_metadata["bbox"], geo_metadata["crs"]
+
+
+def _get_geo_metadata(file, file_system) -> dict:
+    meta = pq.read_schema(file).metadata
+    geo_metadata = json.loads(meta[b"geo"])
+    try:
+        primary_column = geo_metadata["primary_column"]
+    except KeyError as e:
+        raise KeyError(e, geo_metadata) from e
+    try:
+        return geo_metadata["columns"][primary_column]
+    except KeyError as e:
         try:
-            num_rows = pq.read_metadata(
+            num_rows = pq.read_metadata(file).num_rows
         except ArrowInvalid as e:
-            if not file_system.isfile(
-                return
-            raise ArrowInvalid(e,
+            if not file_system.isfile(file):
+                return {}
+            raise ArrowInvalid(e, file) from e
         if not num_rows:
-            return
-
-    try:
-        meta = json.loads(meta[b"geo"])["columns"]["geometry"]
-    except KeyError as e:
-        if pandas_fallback:
-            return None, None
-        raise KeyError(
-            f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
-            # f"{num_rows=}",
-            # meta,
-        ) from e
-    return meta["bbox"], meta["crs"]
+            return {}
+        return {}
 
 
 def _get_columns(path: str | Path, file_system: GCSFileSystem) -> pd.Index:
-    with file_system.open(path) as f:
+    with file_system.open(path, "rb") as f:
         schema = pq.read_schema(f)
         index_cols = _get_index_cols(schema)
         return pd.Index(schema.names).difference(index_cols)
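The new bounds helpers above read only the parquet schema footer and its GeoParquet "geo" metadata, so no row data is downloaded. A minimal sketch of the same idea with plain pyarrow, assuming a local file written by geopandas (the file name is illustrative):

    import json

    import pyarrow.parquet as pq
    import shapely

    schema = pq.read_schema("data.parquet")  # hypothetical file; reads only the footer
    geo = json.loads(schema.metadata[b"geo"])
    primary = geo["primary_column"]           # usually "geometry"
    col_meta = geo["columns"][primary]
    bbox = col_meta["bbox"]                   # [minx, miny, maxx, maxy], if written
    crs = col_meta["crs"]                     # PROJJSON dict (or None)
    print(shapely.box(*bbox))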
@@ -242,8 +297,7 @@ def get_bounds_series(
     ---------
     >>> import sgis as sg
     >>> import dapla as dp
-    >>>
-    >>> all_paths = file_system.ls("...")
+    >>> all_paths = GCSFileSystem().ls("...")
 
     Get the bounds of all your file paths, indexed by path.
 
@@ -274,8 +328,7 @@ def get_bounds_series(
     ... )
 
     """
-
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = _get_file_system(file_system, {})
 
     if threads is None:
         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -308,7 +361,8 @@ def write_geopandas(
     overwrite: bool = True,
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
-
+    partition_cols=None,
+    existing_data_behavior: str = "error",
     **kwargs,
 ) -> None:
     """Writes a GeoDataFrame to the speficied format.
@@ -324,13 +378,9 @@ def write_geopandas(
             not be written with geopandas and the number of rows is more than 0. If True,
             the file will be written without geo-metadata if >0 rows.
         file_system: Optional file sustem.
-
-
-
-        Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
-        considered as experimental. While writing the column is backwards compatible, using it
-        for filtering may not be supported by all readers.
-
+        partition_cols: Column(s) to partition by. Only for parquet files.
+        existing_data_behavior : 'error' | 'overwrite_or_ignore' | 'delete_matching'.
+            Defaults to 'error'. More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html
         **kwargs: Additional keyword arguments passed to parquet.write_table
             (for parquet) or geopandas' to_file method (if not parquet).
     """
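A minimal sketch of a partitioned write with the new arguments, given some GeoDataFrame `gdf`; the dataset path and partition column are illustrative, not taken from the diff:

    import sgis as sg

    sg.write_geopandas(
        gdf,
        "gs://bucket/dataset.parquet",   # hypothetical directory-style dataset
        partition_cols=["kommune"],      # becomes kommune=<value>/<uuid>.parquet files
        existing_data_behavior="delete_matching",  # replace partitions that are rewritten
    )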
@@ -340,22 +390,25 @@ def write_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
-
+    file_system = _get_file_system(file_system, kwargs)
+
+    if not overwrite and file_system.exists(gcs_path):
         raise ValueError("File already exists.")
 
     if not isinstance(df, GeoDataFrame):
-        raise ValueError("DataFrame must be GeoDataFrame.")
+        raise ValueError(f"DataFrame must be GeoDataFrame. Got {type(df)}.")
 
-    if
-
-
-
+    if not len(df) and has_partitions(gcs_path, file_system):
+        # no need to write empty df
+        return
+    elif not len(df):
         if pandas_fallback:
             df = pd.DataFrame(df)
             df.geometry = df.geometry.astype(str)
             df.geometry = None
         try:
-
+            with file_system.open(gcs_path, "wb") as file:
+                df.to_parquet(file, **kwargs)
         except Exception as e:
             more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
             raise e.__class__(
@@ -363,17 +416,19 @@ def write_geopandas(
         ) from e
         return
 
-    file_system = dp.FileClient.get_gcs_file_system()
-
     if ".parquet" in gcs_path or "prqt" in gcs_path:
-
-
+        if partition_cols is not None:
+            return _write_partitioned_geoparquet(
                 df,
-
-
-
+                gcs_path,
+                partition_cols,
+                file_system,
+                existing_data_behavior=existing_data_behavior,
+                write_func=_to_geopandas,
+                **kwargs,
             )
-
+        with file_system.open(gcs_path, mode="wb") as file:
+            df.to_parquet(file, **kwargs)
         return
 
     layer = kwargs.pop("layer", None)
@@ -389,21 +444,307 @@ def write_geopandas(
     else:
         driver = None
 
-    with
-        df.to_file(
+    with BytesIO() as buffer:
+        df.to_file(buffer, driver=driver)
+        buffer.seek(0)  # Rewind the buffer to the beginning
 
+        # Upload buffer content to the desired storage
+        with file_system.open(gcs_path, "wb") as file:
+            file.write(buffer.read())
 
-def exists(path: str | Path) -> bool:
-    """Returns True if the path exists, and False if it doesn't.
 
-
-
+def _to_geopandas(df, path, **kwargs) -> None:
+    table = _geopandas_to_arrow(
+        df,
+        index=df.index,
+        schema_version=None,
+    )
 
-
-
+    if "schema" in kwargs:
+        schema = kwargs.pop("schema")
+
+        # make sure to get the actual metadata
+        schema = pyarrow.schema(
+            [(schema.field(col).name, schema.field(col).type) for col in schema.names],
+            metadata=table.schema.metadata,
+        )
+        table = table.select(schema.names).cast(schema)
+
+    pq.write_table(table, path, compression="snappy", **kwargs)
+
+
+def _remove_file(path, file_system) -> None:
+    try:
+        file_system.rm_file(str(path))
+    except (AttributeError, TypeError, PermissionError) as e:
+        print(path, type(e), e)
+        try:
+            shutil.rmtree(path)
+        except NotADirectoryError:
+            try:
+                os.remove(path)
+            except PermissionError:
+                pass
+
+
+def _write_partitioned_geoparquet(
+    df,
+    path,
+    partition_cols,
+    file_system=None,
+    write_func: Callable = _to_geopandas,
+    existing_data_behavior: str = "error",
+    **kwargs,
+):
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
+
+    file_system = _get_file_system(file_system, kwargs)
+
+    path = Path(path)
+    unique_id = uuid.uuid4()
+
+    for col in partition_cols:
+        if df[col].isna().all() and not kwargs.get("schema"):
+            raise ValueError("Must specify 'schema' when all rows are NA.")
+
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    args: list[tuple[Path, DataFrame]] = []
+    dirs: list[Path] = set()
+    for group, rows in df.groupby(partition_cols, dropna=False):
+        name = (
+            "/".join(
+                f"{col}={value if not pd.isna(value) else NULL_VALUE}"
+                for col, value in zip(partition_cols, group, strict=True)
+            )
+            + f"/{unique_id}.parquet"
+        )
+
+        dirs.add((path / name).parent)
+        args.append((path / name, rows))
+
+    if file_system.exists(path) and file_system.isfile(path):
+        _remove_file(path, file_system)
+
+    if kwargs.get("schema"):
+        schema = kwargs.pop("schema")
+    elif isinstance(df, GeoDataFrame):
+        geom_name = df.geometry.name
+        pandas_columns = [col for col in df if col != geom_name]
+        schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+        index_columns = _get_index_cols(schema)
+        schema = pyarrow.schema(
+            [
+                (
+                    (schema.field(col).name, schema.field(col).type)
+                    if col != geom_name
+                    else (geom_name, pyarrow.binary())
+                )
+                for col in [*df.columns, *index_columns]
+                # for col in df.columns
+            ]
+        )
+    else:
+        schema = pyarrow.Schema.from_pandas(df, preserve_index=True)
+
+    def get_siblings(path: str, paths: list[str]) -> list[str]:
+        parts = path.parts
+        return {x for x in paths if all(part in parts for part in x.parts)}
+
+    def threaded_write(path_rows):
+        new_path, rows = path_rows
+        # for sibling_path in get_siblings(new_path, child_paths):
+        for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
+            if not paths_are_equal(sibling_path, Path(new_path).parent):
+                if existing_data_behavior == "delete_matching":
+                    _remove_file(sibling_path, file_system)
+                elif existing_data_behavior == "error":
+                    raise pyarrow.ArrowInvalid(
+                        f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                    )
+        try:
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)
+        except FileNotFoundError:
+            file_system.makedirs(str(Path(new_path).parent), exist_ok=True)
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        list(executor.map(threaded_write, args))
+
+
+def _filters_to_expression(filters) -> list[ds.Expression]:
+    if filters is None:
+        return None
+    elif isinstance(filters, pyarrow.dataset.Expression):
+        return filters
+
+    for filt in filters:
+        if "in" in filt and isinstance(filt[-1], str):
+            raise ValueError(
+                "Using strings with 'in' is ambigous. Use a list of strings."
+            )
+    try:
+        return pq.core.filters_to_expression(filters)
+    except ValueError as e:
+        raise ValueError(f"{e}: {filters}") from e
+
+
+def expression_match_path(expression: ds.Expression, path: str) -> bool:
+    """Check if a file path match a pyarrow Expression.
+
+    Examples:
+    --------
+    >>> import pyarrow.compute as pc
+    >>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
+    >>> expression_match_path(path, expression)
+    True
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 5)
+    >>> expression_match_path(path, expression)
+    False
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("z") == 10)
+    >>> expression_match_path(path, expression)
+    False
     """
-
-
+    if NULL_VALUE in path:
+        return True
+    # build a one lengthed pyarrow.Table of the partitioning in the file path
+    values = []
+    names = []
+    for part in Path(path).parts:
+        if part.count("=") != 1:
+            continue
+        name, value = part.split("=")
+        values.append([value])
+        names.append(name)
+    table = pyarrow.Table.from_arrays(values, names=names)
+    try:
+        table = table.filter(expression)
+    except pyarrow.ArrowInvalid as e:
+        if "No match for FieldRef" not in str(e):
+            raise e
+        # cannot determine if the expression match without reading the file
+        return True
+    return bool(len(table))
+
+
+def _read_geopandas(file, pandas_fallback: bool, **kwargs):
+    try:
+        return gpd.read_parquet(file, **kwargs)
+    except Exception as e:
+        if not pandas_fallback:
+            raise e
+        df = pd.read_parquet(file, **kwargs)
+        if len(df):
+            raise e
+        return df
+
+
+def _read_pandas(gcs_path: str, **kwargs):
+    file_system = _get_file_system(None, kwargs)
+
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=pd.read_parquet,
+                file_system=file_system,
+                mask=None,
+                child_paths=child_paths,
+                **kwargs,
+            )
+        )
+
+    with file_system.open(gcs_path, "rb") as file:
+        return pd.read_parquet(file, **kwargs)
+
+
+def _read_partitioned_parquet(
+    path: str,
+    read_func: Callable,
+    filters=None,
+    file_system=None,
+    mask=None,
+    child_paths: list[str] | None = None,
+    **kwargs,
+):
+    file_system = _get_file_system(file_system, kwargs)
+
+    if child_paths is None:
+        try:
+            glob_func = functools.partial(file_system.glob)
+        except AttributeError:
+            glob_func = functools.partial(glob.glob, recursive=True)
+        child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))
+
+    filters = _filters_to_expression(filters)
+
+    def intersects(file, mask) -> bool:
+        bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+        return shapely.box(*bbox).intersects(to_shapely(mask))
+
+    def read(path) -> GeoDataFrame | None:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask):
+                return
+
+            schema = kwargs.get("schema", pq.read_schema(file))
+            # copy kwargs because mutable
+            new_kwargs = {
+                key: value for key, value in kwargs.items() if key != "schema"
+            }
+
+            return read_func(file, schema=schema, filters=filters, **new_kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        results = [
+            x
+            for x in (
+                executor.map(
+                    read,
+                    (
+                        path
+                        for path in child_paths
+                        if filters is None or expression_match_path(filters, path)
+                    ),
+                )
+            )
+            if x is not None
+        ]
+    if results:
+        if mask is not None:
+            return sfilter(pd.concat(results), mask)
+        return pd.concat(results)
+
+    # add columns to empty DataFrame
+    first_path = next(iter(child_paths + [path]))
+    return pd.DataFrame(
+        columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
+    )
+
+
+def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
+    return Path(path1).parts == Path(path2).parts
+
+
+def has_partitions(path, file_system) -> list[str]:
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    return [
+        x
+        for x in glob_func(str(Path(path) / "**/*.parquet"))
+        if not paths_are_equal(x, path)
+    ]
 
 
 def check_files(
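`_write_partitioned_geoparquet` lays the dataset out hive-style, one `col=value` directory per partition value (with `__HIVE_DEFAULT_PARTITION__` for missing values) and a UUID-named parquet file per group, and `_read_partitioned_parquet` uses `expression_match_path` to skip files whose path segments cannot satisfy the filter. A small sketch of that pruning step with made-up paths; note that values parsed from the path are strings, so the expression compares against strings:

    import pyarrow.compute as pc

    from sgis.io.dapla_functions import expression_match_path

    paths = [
        "dataset.parquet/year=2023/kommune=0301/a.parquet",
        "dataset.parquet/year=2024/kommune=0301/b.parquet",
        "dataset.parquet/year=2024/kommune=3005/c.parquet",
    ]
    expression = (pc.field("year") == "2024") & (pc.field("kommune") == "0301")

    # keeps only the second path; files that cannot match are never opened
    to_read = [p for p in paths if expression_match_path(expression, p)]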
@@ -419,7 +760,7 @@ def check_files(
         within_minutes: Optionally include only files that were updated in the
             last n minutes.
     """
-    file_system =
+    file_system = config["file_system"]()
 
     # (recursive doesn't work, so doing recursive search below)
     info = file_system.ls(folder, detail=True, recursive=True)
@@ -474,7 +815,7 @@ def check_files(
 
 
 def _get_files_in_subfolders(folderinfo: list[dict]) -> list[tuple]:
-    file_system =
+    file_system = config["file_system"]()
 
     fileinfo = []
 