vector2dggs 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vector2dggs/__init__.py CHANGED
@@ -1 +1 @@
- __version__: str = "0.9.0"
+ __version__: str = "0.10.0"
vector2dggs/common.py CHANGED
@@ -6,13 +6,14 @@ import click_log
  import sqlalchemy
  import shutil
  import pyproj
+ from uuid import uuid4

  import pandas as pd
  import geopandas as gpd
  import dask.dataframe as dd
  import dask_geopandas as dgpd

- from typing import Union, Callable
+ from typing import Union, Callable, Iterable
  from pathlib import Path, PurePath
  from urllib.parse import urlparse
  from tqdm import tqdm
@@ -36,6 +37,12 @@ class ParentResolutionException(Exception):
      pass


+ class IdFieldError(ValueError):
+     """Raised when an invalid or missing ID field is provided."""
+
+     pass
+
+
  def check_resolutions(resolution: int, parent_res: int) -> None:
      if parent_res is not None and not int(parent_res) < int(resolution):
          raise ParentResolutionException(
@@ -45,6 +52,73 @@ def check_resolutions(resolution: int, parent_res: int) -> None:
          )


+ def check_compaction_requirements(compact: bool, id_field: Union[str, None]) -> None:
+     if compact and not id_field:
+         raise IdFieldError(
+             "An id_field is required for compaction, in order to handle the potential for overlapping features"
+         )
+
+
+ def compaction(
+     df: pd.DataFrame,
+     res: int,
+     id_field: str,
+     col_order: list[str],
+     dggs_col: str,
+     compact_func: Callable[[Iterable[Union[str, int]]], Iterable[Union[str, int]]],
+     cell_to_child_func: Callable[[Union[str, int], int], Union[str, int]],
+ ):
+     """
+     Compacts a dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     df = df.reset_index(drop=False)
+
+     feature_cell_groups = (
+         df.groupby(id_field)[dggs_col].apply(lambda x: set(x)).to_dict()
+     )
+     feature_cell_compact = {
+         id: set(compact_func(cells)) for id, cells in feature_cell_groups.items()
+     }
+
+     uncompressable = {
+         id: feature_cell_groups[id] & feature_cell_compact[id]
+         for id in feature_cell_groups.keys()
+     }
+     compressable = {
+         id: feature_cell_compact[id] - feature_cell_groups[id]
+         for id in feature_cell_groups.keys()
+     }
+
+     # Get rows that cannot be compressed
+     mask = pd.Series([False] * len(df), index=df.index)  # Init bool mask
+     for key, value_set in uncompressable.items():
+         mask |= (df[id_field] == key) & (df[dggs_col].isin(value_set))
+     uncompressable_df = df[mask].set_index(dggs_col)
+
+     # Get rows that can be compressed
+     # Convert each compressed (coarser resolution) cell into a cell at
+     # the original resolution (usu using centre child as reference)
+     compression_mapping = {
+         (id, cell_to_child_func(cell, res)): cell
+         for id, cells in compressable.items()
+         if cells
+         for cell in cells
+     }
+     mask = pd.Series([False] * len(df), index=df.index)
+     composite_key = f"composite_key_{uuid4()}"
+     # Update mask for compressible rows and prepare for replacement
+     get_composite_key = lambda row: (row[id_field], row[dggs_col])
+     df[composite_key] = df.apply(get_composite_key, axis=1)
+     mask |= df[composite_key].isin(compression_mapping)
+     compressable_df = df[mask].copy()
+     compressable_df[dggs_col] = compressable_df[composite_key].map(
+         compression_mapping
+     )  # Replace DGGS cell ID with compressed representation
+     compressable_df = compressable_df.set_index(dggs_col)
+
+     return pd.concat([compressable_df, uncompressable_df])[col_order]
+
+
  def db_conn_and_input_path(
      vector_input: Union[str, Path],
  ) -> tuple[SQLConnectionType, Union[str, Path]]:
@@ -137,27 +211,59 @@ def parent_partitioning(
      dggs: str,
      input_dir: Path,
      output_dir: Path,
+     compaction_func: Union[Callable, None],
      resolution: int,
      parent_res: int,
+     id_field: str,
      **kwargs,
  ) -> None:
      partition_col = f"{dggs}_{parent_res:02}"
+     dggs_col = f"{dggs}_{resolution:02}"
+
+     # Read the parquet files into a Dask DataFrame
+     ddf = dd.read_parquet(input_dir, engine="pyarrow")
+     meta = ddf._meta
+
+     with TqdmCallback(
+         desc=f"Parent partitioning, writing {'compacted ' if compaction_func else ''}output"
+     ):
+         if compaction_func:
+             # Apply the compaction function to each partition
+             unique_parents = sorted(
+                 [v for v in ddf[partition_col].unique().compute() if pd.notna(v)]
+             )
+             divisions = unique_parents + [unique_parents[-1]]
+             ddf = (
+                 ddf.reset_index(drop=False)
+                 .dropna(subset=[partition_col])
+                 .set_index(partition_col)
+                 .repartition(divisions=divisions)
+                 .map_partitions(
+                     compaction_func,
+                     resolution,
+                     meta.columns.to_list(),  # Column order to be returned
+                     dggs_col,
+                     id_field,
+                     meta=meta,
+                 )
+             )

-     with TqdmCallback(desc="Repartitioning"):
-         dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+         ddf.to_parquet(
              output_dir,
              overwrite=kwargs.get("overwrite", False),
              engine=kwargs.get("engine", "pyarrow"),
-             partition_on=partition_col,
+             partition_on=[partition_col],
              compression=kwargs.get("compression", "ZSTD"),
+             # **kwargs
          )
-     LOGGER.debug("Parent cell repartitioning complete")

-     # Rename output to just be the partition key, suffix .parquet
+     LOGGER.debug("Parent cell partitioning complete")
+
+     # Append a .parquet suffix
      for f in os.listdir(output_dir):
          os.rename(
              os.path.join(output_dir, f),
-             os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+             os.path.join(output_dir, f.replace(f"{partition_col}=", "")),
          )

      return
@@ -172,6 +278,7 @@ def polyfill(
      resolution: int,
      parent_res: int,
      output_directory: str,
+     compression: str = "snappy",
  ) -> None:
      """
      Reads a geoparquet, performs polyfilling (for Polygon),
@@ -198,7 +305,7 @@
          df = secondary_index_func(df, parent_res)

      df.to_parquet(
-         PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+         PurePath(output_directory, pq_in.name), engine="auto", compression=compression
      )
      return None

@@ -211,6 +318,7 @@ def index(
      dggs: str,
      dggsfunc: Callable,
      secondary_index_func: Callable,
+     compaction_func: Union[Callable, None],
      input_file: Union[Path, str],
      output_directory: Union[Path, str],
      resolution: int,
@@ -220,6 +328,7 @@
      spatial_sorting: str,
      cut_threshold: int,
      processes: int,
+     compression: str = "snappy",
      id_field: str = None,
      cut_crs: pyproj.CRS = None,
      con: SQLConnectionType = None,
@@ -245,7 +354,7 @@
          )
      else:
          # Read file
-         df = gpd.read_file(input_file)
+         df = gpd.read_file(input_file, layer=layer)

      if cut_crs:
          df = df.to_crs(cut_crs)
@@ -329,6 +438,7 @@ def index(
                  resolution,
                  parent_res,
                  tmpdir2,
+                 compression,
              )
              for filepath in filepaths
          ]
@@ -344,9 +454,12 @@
          dggs,
          Path(tmpdir2),
          output_directory,
+         compaction_func,
          resolution,
          parent_res,
+         id_field,
          overwrite=overwrite,
+         compression=compression,
      )

      return output_directory
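
The new `common.compaction` helper is DGGS-agnostic: each driver supplies a per-feature compaction function plus a function that re-expresses a coarse cell as a child at the working resolution, which is how the replacement rows are keyed. Below is a minimal sketch of its behaviour using a toy base-4 hierarchy in place of a real DGGS; the toy data, the `fid`/`toy_01` names and the two stub functions are illustrative assumptions, while the call signature follows the diff above.

```python
# Sketch: common.compaction with a toy base-4 hierarchy ("A0".."A3" are the
# four children of "A"). Assumes vector2dggs 0.10.0 is installed.
import pandas as pd
from vector2dggs.common import compaction

def toy_compact(cells):
    # Fold any complete set of four siblings into their parent
    # (one pass suffices for this tiny example).
    cells = set(cells)
    for p in {c[:-1] for c in cells}:
        children = {p + d for d in "0123"}
        if children <= cells:
            cells = (cells - children) | {p}
    return cells

def toy_center_child(cell, res):
    # Represent a coarse cell by a descendant at resolution `res`
    # (a real DGGS driver uses the centre child).
    return cell.ljust(res + 1, "0")

# One row per (feature, cell), as written by polyfill: feature 1 covers all
# of "A"; feature 2 overlaps it with a single cell.
df = pd.DataFrame(
    {"fid": [1, 1, 1, 1, 2], "toy_01": ["A0", "A1", "A2", "A3", "A0"]}
).set_index("toy_01")

out = compaction(df, 1, "fid", ["fid"], "toy_01", toy_compact, toy_center_child)
# Feature 1's four rows collapse to a single row indexed "A" (the redundant
# sibling rows are dropped); feature 2 keeps "A0" — overlapping features are
# handled independently, so the output index need not be unique.
print(out)
```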
vector2dggs/constants.py CHANGED
@@ -16,6 +16,7 @@ DEFAULTS = {
      "crs": None,
      "c": 5000,
      "t": (multiprocessing.cpu_count() - 1),
+     "cp": "snappy",
      "lyr": None,
      "g": "geom",
      "tempdir": tempfile.tempdir,
vector2dggs/geohash.py CHANGED
@@ -19,6 +19,8 @@ import vector2dggs.common as common

  from vector2dggs import __version__

+ GEOHASH_BASE32_SET = set("0123456789bcdefghjkmnpqrstuvwxyz")
+

  def gh_secondary_index(df: pd.DataFrame, parent_level: int) -> pd.DataFrame:
      df[f"geohash_{parent_level:02}"] = df.index.to_series().str[:parent_level]
@@ -73,6 +75,82 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def gh_children(geohash: str, desired_resolution: int) -> int:
+     """
+     Determine the number of children in the geohash refinement, determined by the additional character levels.
+     """
+     current_resolution = len(geohash)
+     additional_length = desired_resolution - current_resolution
+     return 32**additional_length  # Each new character increases resolution by 32
+
+
+ def compact(cells: set[str]) -> set[str]:
+     """
+     Compact a set of geohash cells.
+     Cells must be at the same resolution.
+     """
+     current_set = set(cells)
+     while True:
+         parent_map = {}
+         for gh in current_set:
+             parent = gh[:-1]
+             if parent not in parent_map:
+                 parent_map[parent] = set()
+             parent_map[parent].add(gh)
+
+         next_set = set()
+         for parent, siblings in parent_map.items():
+             if len(siblings) == 32:
+                 next_set.add(parent)
+             else:
+                 next_set.update(siblings)
+
+         if next_set == current_set:
+             break
+         current_set = next_set
+
+     return current_set
+
+
+ def get_central_child(geohash: str, precision: int):
+     """
+     Return an approximate central child of the geohash.
+     NB if only an arbitrary child is needed, use get_child_geohash
+     """
+     lat, lon = decode(geohash)
+     return encode(lat, lon, precision=precision)
+
+
+ def get_child_geohash(geohash: str, desired_length: int, child: str = "0"):
+     """
+     Get a child geohash of the specified length by extending the input geohash.
+     Child geohash is
+     """
+     if child not in GEOHASH_BASE32_SET:
+         raise ValueError(
+             f"Invalid child character '{child}'. Must be one of {''.join(GEOHASH_BASE32_SET)}."
+         )
+
+     if len(geohash) >= desired_length:
+         return geohash
+     return geohash.ljust(desired_length, child)
+
+
+ def gh_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts a geohash dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df, res, id_field, col_order, dggs_col, compact, get_child_geohash
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -133,7 +211,7 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -154,6 +232,15 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -178,6 +265,12 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the geohash cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def geohash(
@@ -192,9 +285,11 @@ def geohash(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -218,6 +313,7 @@
          "geohash",
          gh_polyfill,
          gh_secondary_index,
+         gh_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -227,6 +323,7 @@
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
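
The geohash implementation works purely on the base-32 string hierarchy: `compact` repeatedly folds any complete set of 32 siblings into their parent, while `get_child_geohash` descends by padding with a child character. A quick check of that round trip, assuming vector2dggs 0.10.0 is installed (the cell `9q8y` is arbitrary):

```python
from vector2dggs.geohash import compact, get_child_geohash, gh_children

base32 = "0123456789bcdefghjkmnpqrstuvwxyz"

# All 32 children of "9q8y" fold back into their parent...
children = {f"9q8y{c}" for c in base32}
assert compact(children) == {"9q8y"}

# ...but an incomplete sibling set is left untouched.
partial = set(list(children)[:31])
assert compact(partial) == partial

# A cell has 32**n descendants n levels down; padding with "0" picks one
# representative child at the requested length.
assert gh_children("9q8y", 6) == 32**2
assert get_child_geohash("9q8y", 6) == "9q8y00"
```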
vector2dggs/h3.py CHANGED
@@ -4,6 +4,7 @@ import click_log
  import tempfile
  import pyproj

+ import h3 as h3py
  import h3pandas  # Necessary import despite lack of explicit use

  import pandas as pd
@@ -50,6 +51,27 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ def h3compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an H3 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         h3py.compact_cells,
+         h3py.cell_to_center_child,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -108,7 +130,7 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -129,6 +151,15 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -153,6 +184,12 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the H3 cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def h3(
@@ -167,9 +204,11 @@ def h3(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -181,6 +220,7 @@ def h3(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -193,6 +233,7 @@ def h3(
          "h3",
          h3polyfill,
          h3_secondary_index,
+         h3compaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -202,6 +243,7 @@ def h3(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
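
`h3compaction` plugs two h3 v4 API calls into `common.compaction`: `compact_cells` performs the merge, and `cell_to_center_child` re-expresses each merged (coarser) cell at the working resolution. A small illustration of those two calls in isolation (the coordinates are arbitrary; assumes the `h3` package with the v4 API, as imported above):

```python
import h3 as h3py

res = 9
parent = h3py.latlng_to_cell(-41.3, 174.8, res - 1)  # a res-8 cell
children = h3py.cell_to_children(parent, res)        # its seven res-9 children

# A complete set of children compacts to the parent cell...
assert set(h3py.compact_cells(children)) == {parent}

# ...and the parent is represented back at res 9 by its centre child, which
# is how compaction keys the rows it replaces.
center = h3py.cell_to_center_child(parent, res)
assert center in children
```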
vector2dggs/rHP.py CHANGED
@@ -11,8 +11,14 @@ import geopandas as gpd

  from typing import Union
  from pathlib import Path
+ from rhealpixdggs.conversion import compress_order_cells
  from rhppandas.util.const import COLUMNS

+ # from rhealpixdggs.rhp_wrappers import rhp_to_center_child, rhp_is_valid
+ from rhealpixdggs.rhp_wrappers import rhp_is_valid
+ from rhealpixdggs.dggs import RHEALPixDGGS
+ from rhealpixdggs.dggs import WGS84_003
+
  import vector2dggs.constants as const
  import vector2dggs.common as common

@@ -27,7 +33,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      df_polygon = df[df.geom_type == "Polygon"]
      if len(df_polygon.index) > 0:
          df_polygon = df_polygon.rhp.polyfill_resample(
-             resolution, return_geometry=False
+             resolution, return_geometry=False, compress=False
          ).drop(columns=["index"])

      df_linestring = df[df.geom_type == "LineString"]
@@ -51,6 +57,90 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ # TODO replace when merged https://github.com/manaakiwhenua/rhealpixdggs-py/pull/37
+ def rhp_to_center_child(
+     rhpindex: str, res: int = None, dggs: RHEALPixDGGS = WGS84_003
+ ) -> str:
+     """
+     Returns central child of rhpindex at resolution res (immediate central
+     child if res == None).
+
+     Returns None if the cell index is invalid.
+
+     Returns None if the DGGS has an even number of cells on a side.
+
+     EXAMPLES::
+
+         >>> rhp_to_center_child('S001450634')
+         'S0014506344'
+         >>> rhp_to_center_child('S001450634', res=13)
+         'S001450634444'
+         >>> rhp_to_center_child('INVALID')
+     """
+     # Stop early if the cell index is invalid
+     if not rhp_is_valid(rhpindex, dggs):
+         return None
+
+     # DGGSs with even numbers of cells on a side never have a cell at the centre
+     if (dggs.N_side % 2) == 0:
+         return None
+
+     # Handle mismatch between cell resolution and requested child resolution
+     parent_res = len(rhpindex) - 1
+     if res is not None and res < parent_res:
+         return rhpindex
+
+     # Standard case (including parent_res == res)
+     else:
+         # res == None returns the central child from one level down (by convention)
+         added_levels = 1 if res is None else res - parent_res
+
+         # Derive index of centre child and append that to rhpindex
+         # NOTE: only works for odd values of N_side
+         c_index = int((dggs.N_side**2 - 1) / 2)
+
+         # Append the required number of child digits to cell index
+         child_index = rhpindex + "".join(str(c_index) for _ in range(0, added_levels))
+
+         return child_index
+
+
+ def compact_cells(cells: set[str]) -> set[str]:
+     """
+     Compact a set of rHEALPix DGGS cells.
+     Cells must be at the same resolution.
+     See https://github.com/manaakiwhenua/rhealpixdggs-py/issues/35#issuecomment-3186073554
+     """
+     previous_result = set(cells)
+     while True:
+         current_result = set(compress_order_cells(previous_result))
+         if previous_result == current_result:
+             break
+         previous_result = current_result
+     return previous_result
+
+
+ def rhpcompaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an rHP dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_cells,
+         rhp_to_center_child,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -109,7 +199,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -130,6 +220,15 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -154,6 +253,12 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def rhp(
@@ -168,9 +273,11 @@ def rhp(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -182,6 +289,7 @@ def rhp(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -194,6 +302,7 @@ def rhp(
          "rhp",
          rhppolyfill,
          rhp_secondary_index,
+         rhpcompaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -203,6 +312,7 @@ def rhp(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
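
For rHEALPix, the vendored `rhp_to_center_child` (pending the upstream PR linked above) appends the centre-cell digit, which for an `N_side == 3` DGGS such as `WGS84_003` is `(3**2 - 1) // 2 == 4`; `compact_cells` re-applies `compress_order_cells` until a fixed point, since one pass can expose newly complete parents. A sketch exercising both, with expected outputs inferred from the docstring and that logic (assumes vector2dggs 0.10.0 and rhealpixdggs-py are installed):

```python
from vector2dggs.rHP import compact_cells, rhp_to_center_child

print(rhp_to_center_child("S001450634"))  # expected: 'S0014506344'
print(rhp_to_center_child("INVALID"))     # expected: None

# The nine children (digits 0..8 when N_side == 3) of a cell should fold
# back into their parent.
children = {f"S00145063{d}" for d in "012345678"}
print(compact_cells(children))            # expected: {'S00145063'}
```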
vector2dggs/s2.py CHANGED
@@ -182,6 +182,54 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def compact_tokens(tokens: set[str]) -> set[str]:
+     """
+     Compact a set of S2 DGGS cells.
+     Cells must be at the same resolution.
+     """
+     cell_ids: list[S2.S2CellId] = [
+         S2.S2CellId.FromToken(token, len(token)) for token in tokens
+     ]
+     cell_union: S2.S2CellUnion = S2.S2CellUnion(
+         cell_ids
+     )  # Vector of sorted, non-overlapping S2CellId
+     cell_union.NormalizeS2CellUnion()  # Mutates; 'normalize' == 'compact'
+     return {c.ToToken() for c in cell_union.cell_ids()}
+
+
+ def token_to_child_token(token: str, level: int) -> str:
+     """
+     Returns first child (as string token) of a cell (also represented as a string
+     token) at a specific level.
+     """
+     cell: S2.S2CellId = S2.S2CellId.FromToken(token, len(token))
+     if level <= cell.level():
+         raise ValueError("Level must be greater than the current level of the cell.")
+     # Get the child cell iterator
+     return cell.child_begin(level).ToToken()
+
+
+ def s2_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an S2 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_tokens,
+         token_to_child_token,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -263,6 +311,15 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -287,6 +344,12 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def s2(
@@ -301,9 +364,11 @@ def s2(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -327,6 +392,7 @@ def s2(
          "s2",
          s2_polyfill,
          s2_secondary_index,
+         s2_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -336,6 +402,7 @@ def s2(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
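
S2 has compaction built into the library: `compact_tokens` wraps `S2CellUnion` normalisation, and `token_to_child_token` descends via the `child_begin` iterator. A sketch of the token round trip, importing the same `S2` binding that `vector2dggs.s2` uses internally (`FromDegrees`, `parent` and `next` are standard S2CellId API assumed to be exposed by that binding; the coordinates are arbitrary):

```python
from vector2dggs.s2 import S2, compact_tokens, token_to_child_token

# A level-5 cell containing Wellington, NZ, and its token.
cell = S2.S2CellId(S2.S2LatLng.FromDegrees(-41.3, 174.8)).parent(5)
token = cell.ToToken()

# Gather its four level-6 children with the same iterator that
# token_to_child_token uses.
kids, c = set(), cell.child_begin(6)
for _ in range(4):
    kids.add(c.ToToken())
    c = c.next()

print(compact_tokens(kids))            # expected: {token}
print(token_to_child_token(token, 6))  # expected: a member of kids
```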
{vector2dggs-0.9.0.dist-info → vector2dggs-0.10.0.dist-info}/METADATA RENAMED
@@ -1,8 +1,7 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: vector2dggs
- Version: 0.9.0
+ Version: 0.10.0
  Summary: CLI DGGS indexer for vector geospatial data
- Home-page: https://github.com/manaakiwhenua/vector2dggs
  License: LGPL-3.0-or-later
  Keywords: dggs,vector,h3,rHEALPix,cli
  Author: James Ardo
@@ -14,13 +13,14 @@ Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or l
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering
  Classifier: Topic :: Scientific/Engineering :: GIS
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Requires-Dist: click (>=8.1.7,<9.0.0)
  Requires-Dist: click-log (>=0.4.0,<0.5.0)
  Requires-Dist: dask (>=2025.1,<2026.0)
- Requires-Dist: dask-geopandas (>=0.4,<0.5)
+ Requires-Dist: dask-geopandas (>=0.5,<0.6)
  Requires-Dist: gdal (>=3.8,<4.0)
  Requires-Dist: geopandas (>=1.0.1,<2.0.0)
  Requires-Dist: h3pandas (>=0.3,<0.4)
@@ -43,7 +43,7 @@ Description-Content-Type: text/markdown

  [![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)

- Python-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.
+ Python-based CLI tool to index vector files to DGGS in parallel, writing out to Parquet.

  This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).

@@ -57,7 +57,7 @@ Currently this tool supports the following DGGSs:

  - [Geohash](https://en.wikipedia.org/wiki/Geohash) (points, polygons)

- Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
+ Contributions (especially for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.

  ![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")

@@ -114,23 +114,29 @@
                                   use when spatially partioning. Adjusting
                                   this number will trade off memory use and
                                   time.  [default: 50; required]
-   -s, --spatial_sorting [hilbert|morton|geohash]
+   -s, --spatial_sorting [hilbert|morton|geohash|none]
                                   Spatial sorting method when perfoming
-                                  spatial partitioning.  [default: hilbert]
+                                  spatial partitioning.  [default: none]
    -crs, --cut_crs INTEGER        Set the coordinate reference system (CRS)
-                                  used for cutting large polygons (see `--cur-
-                                  threshold`). Defaults to the same CRS as the
-                                  input. Should be a valid EPSG code.
-   -c, --cut_threshold INTEGER    Cutting up large polygons into smaller
-                                  pieces based on a target length. Units are
-                                  assumed to match the input CRS units unless
-                                  the `--cut_crs` is also given, in which case
-                                  units match the units of the supplied CRS.
-                                  [default: 5000; required]
+                                  used for cutting large geometries (see
+                                  `--cut_threshold`). Defaults to the same CRS
+                                  as the input. Should be a valid EPSG code.
+   -c, --cut_threshold INTEGER    Cutting up large geometries into smaller
+                                  geometries based on a target length. Units
+                                  are assumed to match the input CRS units
+                                  unless the `--cut_crs` is also given, in
+                                  which case units match the units of the
+                                  supplied CRS.  [default: 5000; required]
    -t, --threads INTEGER          Amount of threads used for operation
-                                  [default: 7]
-   -lyr, --layer TEXT             Name of the layer or table to read when using a
-                                  an input that supports layers or tables
+                                  [default: NUM_CPUS - 1]
+   -cp, --compression TEXT        Compression method to use for the output
+                                  Parquet files. Options include 'snappy',
+                                  'gzip', 'brotli', 'lz4', 'zstd', etc. Use
+                                  'none' for no compression.  [default:
+                                  snappy]
+   -lyr, --layer TEXT             Name of the layer or table to read when
+                                  using an input that supports layers or
+                                  tables
    -g, --geom_col TEXT            Column name to use when using a spatial
                                   database connection as input  [default:
                                   geom]
@@ -138,6 +144,8 @@
                                   execution of this program. This parameter
                                   allows you to control where this data will
                                   be written.
+   -co, --compact                 Compact the H3 cells up to the parent
+                                  resolution. Compaction requires an id_field.
    -o, --overwrite
    --version                      Show the version and exit.
    --help                         Show this message and exit.
@@ -187,7 +195,6 @@ from shapely.geometry import Polygon

  RES = 18
  df = pd.read_parquet(f'~/output-data/ponds-with-holes.s2.{RES}.pq')
- df = df.reset_index()

  def s2id_to_polygon(s2_id_hex):
      cell_id = s2sphere.CellId.from_token(s2_id_hex)
@@ -199,11 +206,17 @@ def s2id_to_polygon(s2_id_hex):
          vertices.append((lat_lng.lng().degrees, lat_lng.lat().degrees))  # (lon, lat)
      return Polygon(vertices)

- df['geometry'] = df[f's2_{RES}'].apply(s2id_to_polygon)
+ df['geometry'] = df.index.to_series().apply(s2id_to_polygon)
  df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')  # WGS84
  df.to_parquet(f'sample-{RES}.parquet')
  ```

+ ## Compaction
+
+ Compaction is supported with the `-co/--compact` argument. The result respects overlapping polygons by considering each feature independently. (In the below example output for rHEALPix, cells are shown with opacity; overlap is visible where there is a darker shade.) This does mean that the index of the result is not necessarily unique (unless your input is a vector _coverage_, i.e. it does not have overlaps.)
+
+ ![Example of compaction of overlapping vector features with the rHEALPix DGGS](docs/imgs/rhp-compaction-example.png)
+
  ### For development

  In brief, to get started:
@@ -248,14 +261,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -lyr topo50_lake
      title={{vector2dggs}},
      author={Ardo, James and Law, Richard},
      url={https://github.com/manaakiwhenua/vector2dggs},
-     version={0.9.0},
+     version={0.10.0},
      date={2023-04-20}
  }
  ```

  APA/Harvard

- > Ardo, J., & Law, R. (2023). vector2dggs (0.9.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.10.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs

  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

vector2dggs-0.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ vector2dggs/__init__.py,sha256=qK7__omM0NPcz4bMM5qUWKeZsOEhxeVWv_P38IPNVnw,28
+ vector2dggs/cli.py,sha256=d_4skD62k6pXUWgDdVHbDwpe4A4yo62ZFx8Cp_6GpBA,767
+ vector2dggs/common.py,sha256=rQL1_rFr1VTyILffOZgdwPzZS1JThn4TBPswfCkMjbM,14471
+ vector2dggs/constants.py,sha256=KdmBQCP_GCygzvDLtS8AMQM9i6QqOkf-9YQkh_AzrKc,1779
+ vector2dggs/geohash.py,sha256=PVLkaaSVLgzDZNfuL0y3Xioh4pyvom845HuyLIAsLUY,10398
+ vector2dggs/h3.py,sha256=Juvc8g4QWfDIco9RQHaX8p9S9rkW5QvusxpyO-G7eSs,7408
+ vector2dggs/katana.py,sha256=v4BRzVCsroC6RzIYdxLfrr9eFOdmXb5S9jXBMs5tgSo,3571
+ vector2dggs/rHP.py,sha256=E03dQngbT3LtksZkaM6QSJv983ZGpLeXRedjqsEQZZI,9869
+ vector2dggs/s2.py,sha256=SOXMHQQq86bM88MDgBBemGiXIbuEIbrhLSgPwLKceLY,12809
+ vector2dggs-0.10.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ vector2dggs-0.10.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+ vector2dggs-0.10.0.dist-info/METADATA,sha256=LuNEa06-KpDdXDdbDynhkxUXp6nuaOQ7q9ycIrJODKs,12606
+ vector2dggs-0.10.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ vector2dggs-0.10.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+ vector2dggs-0.10.0.dist-info/RECORD,,
{vector2dggs-0.9.0.dist-info → vector2dggs-0.10.0.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: poetry-core 2.1.3
  Root-Is-Purelib: true
  Tag: py3-none-any
vector2dggs-0.9.0.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- vector2dggs/__init__.py,sha256=L8qKCe-XFylNfRXefZ1yGESlLF24qwQQ87szPZJO6Zg,27
- vector2dggs/cli.py,sha256=d_4skD62k6pXUWgDdVHbDwpe4A4yo62ZFx8Cp_6GpBA,767
- vector2dggs/common.py,sha256=l5koOX1Ps0v5D7MgzHtK1t99hXnGA7b6I82n2rBOldE,10496
- vector2dggs/constants.py,sha256=_cj3Pf52gsXfWwvpsbekE8h1yD_1jS9xqzRg2mRCq3w,1759
- vector2dggs/geohash.py,sha256=t90FlZRQCH8lmtTHe2kPMcLTIf1nrrf2j-m95xk4xPc,7534
- vector2dggs/h3.py,sha256=Bu_4T1WIDuTv_tJWTS8BgPmHRiCozfUUh2CxBwk98Gw,6310
- vector2dggs/katana.py,sha256=v4BRzVCsroC6RzIYdxLfrr9eFOdmXb5S9jXBMs5tgSo,3571
- vector2dggs/rHP.py,sha256=tC4LvqRPMmgUd36BppkvYeq94pPBhO1vBDQ-aaiHUg4,6410
- vector2dggs/s2.py,sha256=HEpFTEL4UaZLjybKZ_q06QFjPuQ48MDLeg_qGc0NMEw,10835
- vector2dggs-0.9.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- vector2dggs-0.9.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
- vector2dggs-0.9.0.dist-info/METADATA,sha256=7y97ZXmDNqUQ-n8M-BgOE2XLG-pJ6f_aNGjzVlCUFzc,11534
- vector2dggs-0.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- vector2dggs-0.9.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
- vector2dggs-0.9.0.dist-info/RECORD,,