vector2dggs 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector2dggs/__init__.py +1 -1
- vector2dggs/cli.py +2 -0
- vector2dggs/common.py +354 -0
- vector2dggs/constants.py +26 -0
- vector2dggs/h3.py +35 -296
- vector2dggs/rHP.py +217 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/METADATA +37 -17
- vector2dggs-0.6.3.dist-info/RECORD +13 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/WHEEL +1 -1
- vector2dggs-0.6.1.dist-info/RECORD +0 -10
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING +0 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING.LESSER +0 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/entry_points.txt +0 -0
vector2dggs/__init__.py
CHANGED
@@ -1 +1 @@
-__version__: str = "0.6.1"
+__version__: str = "0.6.3"
vector2dggs/cli.py
CHANGED
@@ -2,6 +2,7 @@ import click
 
 from vector2dggs import __version__
 from vector2dggs.h3 import h3
+from vector2dggs.rHP import rhp
 
 # If the program does terminal interaction, make it output a short
 # notice like this when it starts in an interactive mode:
@@ -19,6 +20,7 @@ def cli():
 
 
 cli.add_command(h3)
+cli.add_command(rhp)
 
 
 def main():
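The change to `cli.py` is the only registration step required: `rhp` is an ordinary `click` command object, so adding it to the group makes it immediately discoverable. A quick (hypothetical) session to confirm both commands are wired up:

```bash
# Hypothetical session; the exact help text comes from the click definitions
vector2dggs --help        # should now list both subcommands: h3 and rhp
vector2dggs rhp --help    # shows the rHEALPix-specific options
```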
vector2dggs/common.py
ADDED
@@ -0,0 +1,354 @@
+import os
+import errno
+import logging
+import tempfile
+import click_log
+import sqlalchemy
+import shutil
+import pyproj
+
+import pandas as pd
+import geopandas as gpd
+import dask.dataframe as dd
+import dask_geopandas as dgpd
+
+from typing import Union, Callable
+from pathlib import Path, PurePath
+from urllib.parse import urlparse
+from tqdm import tqdm
+from tqdm.dask import TqdmCallback
+from multiprocessing.dummy import Pool
+from shapely.geometry import GeometryCollection
+
+import vector2dggs.constants as const
+
+from . import katana
+
+SQLConnectionType = Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine]
+
+
+LOGGER = logging.getLogger(__name__)
+click_log.basic_config(LOGGER)
+click_log.ColorFormatter.colors["info"] = dict(fg="green")
+
+
+class ParentResolutionException(Exception):
+    pass
+
+
+def check_resolutions(resolution: int, parent_res: int) -> None:
+    if parent_res is not None and not int(parent_res) < int(resolution):
+        raise ParentResolutionException(
+            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
+                pr=parent_res, r=resolution
+            )
+        )
+
+
+def db_conn_and_input_path(
+    vector_input: Union[str, Path],
+) -> tuple[SQLConnectionType, Union[str, Path]]:
+    con: sqlalchemy.engine.Connection = None
+    scheme: str = urlparse(vector_input).scheme
+
+    if bool(scheme) and scheme != "file":
+        # Assume database connection
+        con = sqlalchemy.create_engine(vector_input)
+
+    elif not Path(vector_input).exists():
+        if not scheme:
+            LOGGER.error(
+                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
+            )
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
+            )
+        vector_input = str(vector_input)
+
+    else:
+        vector_input = Path(vector_input)
+
+    return (con, vector_input)
+
+
+def resolve_output_path(
+    output_directory: Union[str, Path], overwrite: bool
+) -> Union[str, Path]:
+    output_directory = Path(output_directory)
+    outputexists = os.path.exists(output_directory)
+
+    if outputexists and not overwrite:
+        raise FileExistsError(
+            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
+        )
+
+    elif outputexists and overwrite:
+        LOGGER.warning(f"Overwriting the contents of {output_directory}")
+        shutil.rmtree(output_directory)
+
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    return output_directory
+
+
+def drop_condition(
+    df: pd.DataFrame,
+    drop_index: pd.Index,
+    log_statement: str,
+    warning_threshold: float = 0.01,
+):
+    LOGGER.debug(log_statement)
+    _before = len(df)
+    df = df.drop(drop_index)
+    _after = len(df)
+    _diff = _before - _after
+    if _diff:
+        log_method = (
+            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
+        )
+        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
+    return df
+
+
+def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int):
+    """
+    Uses a parent resolution,
+    OR,
+    Given a target resolution, returns our recommended parent resolution.
+
+    Used for intermediate re-partioning.
+    """
+    if dggs == "h3":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_H3, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    elif dggs == "rhp":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_RHP, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    else:
+        raise RuntimeError(
+            "Unknown dggs {dggs}) - must be one of [ 'h3', 'rhp' ]".format(dggs=dggs)
+        )
+
+
+def parent_partitioning(
+    dggs: str,
+    input_dir: Path,
+    output_dir: Path,
+    resolution: int,
+    parent_res: Union[None, int],
+    **kwargs,
+) -> None:
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    partition_col = f"{dggs}_{parent_res:02}"
+
+    with TqdmCallback(desc="Repartitioning"):
+        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+            output_dir,
+            overwrite=kwargs.get("overwrite", False),
+            engine=kwargs.get("engine", "pyarrow"),
+            partition_on=partition_col,
+            compression=kwargs.get("compression", "ZSTD"),
+        )
+    LOGGER.debug("Parent cell repartitioning complete")
+
+    # Rename output to just be the partition key, suffix .parquet
+    for f in os.listdir(output_dir):
+        os.rename(
+            os.path.join(output_dir, f),
+            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+        )
+
+    return
+
+
+def polyfill(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    pq_in: Path,
+    spatial_sort_col: str,
+    resolution: int,
+    parent_res: Union[None, int],
+    output_directory: str,
+) -> None:
+    """
+    Reads a geoparquet, performs polyfilling (for Polygon),
+    linetracing (for LineString), and writes out to parquet.
+    """
+    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
+    if len(df.index) == 0:
+        # Input is empty, nothing to polyfill
+        return None
+
+    # DGGS specific polyfill
+    df = dggsfunc(df, resolution)
+
+    if len(df.index) == 0:
+        # Polyfill resulted in empty output (e.g. large cell, small feature)
+        return None
+
+    df.index.rename(f"{dggs}_{resolution:02}", inplace=True)
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    # print(parent_res)
+    # print(df.index)
+    # print(df.columns)
+
+    # Secondary (parent) index, used later for partitioning
+    df = secondary_index_func(df, parent_res)
+
+    df.to_parquet(
+        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+    )
+    return None
+
+
+def polyfill_star(args) -> None:
+    return polyfill(*args)
+
+
+def index(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    input_file: Union[Path, str],
+    output_directory: Union[Path, str],
+    resolution: int,
+    parent_res: Union[None, int],
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_threshold: int,
+    processes: int,
+    id_field: str = None,
+    cut_crs: pyproj.CRS = None,
+    con: SQLConnectionType = None,
+    table: str = None,
+    geom_col: str = "geom",
+    overwrite: bool = False,
+) -> Path:
+    """
+    Performs multi-threaded polyfilling on (multi)polygons.
+    """
+
+    if table and con:
+        # Database connection
+        if keep_attributes:
+            q = sqlalchemy.text(f"SELECT * FROM {table}")
+        elif id_field and not keep_attributes:
+            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
+        else:
+            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
+        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
+            "geometry"
+        )
+    else:
+        # Read file
+        df = gpd.read_file(input_file)
+
+    if cut_crs:
+        df = df.to_crs(cut_crs)
+    LOGGER.debug("Cutting with CRS: %s", df.crs)
+
+    if id_field:
+        df = df.set_index(id_field)
+    else:
+        df = df.reset_index()
+        df = df.rename(columns={"index": "fid"}).set_index("fid")
+
+    if not keep_attributes:
+        # Remove all attributes except the geometry
+        df = df.loc[:, ["geometry"]]
+
+    LOGGER.debug("Cutting large geometries")
+    with tqdm(total=df.shape[0], desc="Splitting") as pbar:
+        for index, row in df.iterrows():
+            df.loc[index, "geometry"] = GeometryCollection(
+                katana.katana(row.geometry, cut_threshold)
+            )
+            pbar.update(1)
+
+    LOGGER.debug("Exploding geometry collections and multipolygons")
+    df = (
+        df.to_crs(4326)
+        .explode(index_parts=False)  # Explode from GeometryCollection
+        .explode(index_parts=False)  # Explode multipolygons to polygons
+    ).reset_index()
+
+    drop_conditions = [
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.is_empty | frame.geometry.isna())
+            ],
+            "message": "Considering empty or null geometries",
+        },
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.geom_type != "Polygon")
+                & (frame.geometry.geom_type != "LineString")
+            ],  # NB currently points and other types are lost; in principle, these could be indexed
+            "message": "Considering unsupported geometries",
+        },
+    ]
+    for condition in drop_conditions:
+        df = drop_condition(df, condition["index"](df).index, condition["message"])
+
+    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
+
+    LOGGER.debug("Spatially sorting and partitioning (%s)", spatial_sorting)
+    ddf = ddf.spatial_shuffle(by=spatial_sorting)
+    spatial_sort_col = (
+        spatial_sorting
+        if spatial_sorting == "geohash"
+        else f"{spatial_sorting}_distance"
+    )
+
+    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
+        with TqdmCallback(desc=f"Spatially partitioning"):
+            ddf.to_parquet(tmpdir, overwrite=True)
+
+        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
+
+        # Multithreaded polyfilling
+        LOGGER.debug(
+            "Indexing on spatial partitions by polyfill with resolution: %d",
+            resolution,
+        )
+        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
+            with Pool(processes=processes) as pool:
+                args = [
+                    (
+                        dggs,
+                        dggsfunc,
+                        secondary_index_func,
+                        filepath,
+                        spatial_sort_col,
+                        resolution,
+                        parent_res,
+                        tmpdir2,
+                    )
+                    for filepath in filepaths
+                ]
+                list(
+                    tqdm(
+                        pool.imap(polyfill_star, args),
+                        total=len(args),
+                        desc="DGGS indexing",
+                    )
+                )
+
+            parent_partitioning(
+                dggs,
+                tmpdir2,
+                output_directory,
+                resolution,
+                parent_res,
+                overwrite=overwrite,
+            )
+
+    return output_directory
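This new `common.py` module turns the former H3-only pipeline into a DGGS-agnostic driver: `index()` handles reading, cutting, exploding, spatial sorting and repartitioning, while the DGGS-specific work is injected as two callables, `dggsfunc` (geometries in, one row per cell out) and `secondary_index_func` (adds the parent-cell column used for partitioning). A sketch of the contract a hypothetical third DGGS adapter would have to satisfy (the `s2*` names are invented for illustration and do not exist in the package):

```python
# Sketch only: illustrates the shape of the two callables that
# common.polyfill()/common.index() expect from a DGGS adapter.
import pandas as pd
import geopandas as gpd


def s2polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
    # Hypothetical: must return a DataFrame indexed by cell ID (one row per
    # cell), with the geometry column dropped; common.polyfill() then renames
    # the index to f"s2_{resolution:02}".
    ...


def s2_secondary_index(df: pd.DataFrame, parent_res: int) -> pd.DataFrame:
    # Hypothetical: must add a column named f"s2_{parent_res:02}" holding each
    # cell's parent; common.parent_partitioning() partitions on that column.
    ...

# common.index("s2", s2polyfill, s2_secondary_index, ...) would then reuse the
# whole read -> cut -> sort -> polyfill -> repartition pipeline unchanged.
```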
vector2dggs/constants.py
ADDED
@@ -0,0 +1,26 @@
+import multiprocessing
+import warnings
+import tempfile
+
+
+MIN_H3, MAX_H3 = 0, 15
+MIN_RHP, MAX_RHP = 0, 15
+
+DEFAULTS = {
+    "id": None,
+    "k": False,
+    "ch": 50,
+    "s": "hilbert",
+    "crs": None,
+    "c": 5000,
+    "t": (multiprocessing.cpu_count() - 1),
+    "tbl": None,
+    "g": "geom",
+    "tempdir": tempfile.tempdir,
+}
+
+DEFAULT_PARENT_OFFSET = 6
+
+warnings.filterwarnings(
+    "ignore"
+)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
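`DEFAULT_PARENT_OFFSET = 6` encodes the fallback used by `common.get_parent_res`: when no `-pr/--parent_res` is supplied, output partitions are keyed on cells six levels coarser than the target resolution, floored at the minimum resolution. Worked through:

```python
# Worked example of the default parent-resolution rule in
# common.get_parent_res() (the same arithmetic applies to H3 and rHEALPix).
MIN_H3 = 0
DEFAULT_PARENT_OFFSET = 6

for resolution in (4, 9, 15):
    parent = max(MIN_H3, resolution - DEFAULT_PARENT_OFFSET)
    print(f"resolution {resolution} -> parent {parent}")
    # resolution 4 -> parent 0; 9 -> 3; 15 -> 9
```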
vector2dggs/h3.py
CHANGED
@@ -1,80 +1,28 @@
-import errno
-import logging
-import os
-import multiprocessing
-from multiprocessing.dummy import Pool
-from pathlib import Path, PurePath
-import shutil
 import sys
-import tempfile
-from typing import Union
-from urllib.parse import urlparse
-import warnings
-
 import click
 import click_log
-import dask.dataframe as dd
-import dask_geopandas as dgpd
-import geopandas as gpd
-import h3pandas
-import pandas as pd
+import tempfile
 import pyproj
-from shapely.geometry import GeometryCollection
-import sqlalchemy
-from tqdm import tqdm
-from tqdm.dask import TqdmCallback
-
-from . import katana
-from vector2dggs import __version__
-
-LOGGER = logging.getLogger(__name__)
-click_log.basic_config(LOGGER)
-MIN_H3, MAX_H3 = 0, 15
 
-warnings.filterwarnings(
-    "ignore"
-)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
-
-
-DEFAULT_PARENT_OFFSET = 6
-DEFAULT_CHUNK_SIZE = 50
+import h3pandas  # Necessary import despite lack of explicit use
 
+import pandas as pd
+import geopandas as gpd
 
-class ParentResolutionException(Exception):
-    pass
+from typing import Union
+from pathlib import Path
 
+import vector2dggs.constants as const
+import vector2dggs.common as common
 
-def _get_parent_res(parent_res: Union[None, int], resolution: int):
-    """
-    Uses a parent resolution,
-    OR,
-    Given a target resolution, returns our recommended parent resolution.
+from vector2dggs import __version__
 
-    Used for intermediate re-partioning.
-    """
-    return (
-        int(parent_res)
-        if parent_res is not None
-        else max(MIN_H3, (resolution - DEFAULT_PARENT_OFFSET))
-    )
 
+def h3_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.h3.h3_to_parent(parent_res)
 
-def polyfill(
-    pq_in: Path,
-    spatial_sort_col: str,
-    resolution: int,
-    parent_res: Union[None, int],
-    output_directory: str,
-) -> None:
-    """
-    Reads a geoparquet, performs H3 polyfilling (for Polygon),
-    linetracing (for LineString), and writes out to parquet.
-    """
-    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
-    if len(df.index) == 0:
-        # Input is empty, nothing to polyfill
-        return None
 
+def h3polyfill(df: gpd.GeoDataFrame, resolution: int):
     df_polygon = df[df.geom_type == "Polygon"]
     if len(df_polygon.index) > 0:
         df_polygon = df_polygon.h3.polyfill_resample(
@@ -90,207 +38,23 @@ def polyfill(
         )
         df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
 
-    df = pd.concat(
+    return pd.concat(
         map(
             lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
             [df_polygon, df_linestring],
         )
     )
 
-    if len(df.index) == 0:
-        # Polyfill resulted in empty output (e.g. large cell, small feature)
-        return None
-
-    df.index.rename(f"h3_{resolution:02}", inplace=True)
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    # Secondary (parent) H3 index, used later for partitioning
-    df.h3.h3_to_parent(parent_res).to_parquet(
-        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
-    )
-    return None
-
-
-def polyfill_star(args) -> None:
-    return polyfill(*args)
-
-
-def _parent_partitioning(
-    input_dir: Path,
-    output_dir: Path,
-    resolution: int,
-    parent_res: Union[None, int],
-    **kwargs,
-) -> None:
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    partition_col = f"h3_{parent_res:02}"
-
-    with TqdmCallback(desc="Repartitioning"):
-        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
-            output_dir,
-            overwrite=kwargs.get("overwrite", False),
-            engine=kwargs.get("engine", "pyarrow"),
-            partition_on=partition_col,
-            compression=kwargs.get("compression", "ZSTD"),
-        )
-    LOGGER.debug("Parent cell repartitioning complete")
-
-    # Rename output to just be the partition key, suffix .parquet
-    for f in os.listdir(output_dir):
-        os.rename(
-            os.path.join(output_dir, f),
-            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
-        )
-
-    return
-
-
-def drop_condition(
-    df: pd.DataFrame,
-    drop_index: pd.Index,
-    log_statement: str,
-    warning_threshold: float = 0.01,
-):
-    LOGGER.info(log_statement)
-    _before = len(df)
-    df = df.drop(drop_index)
-    _after = len(df)
-    _diff = _before - _after
-    if _diff:
-        log_method = (
-            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
-        )
-        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
-    return df
-
-
-def _index(
-    input_file: Union[Path, str],
-    output_directory: Union[Path, str],
-    resolution: int,
-    parent_res: Union[None, int],
-    keep_attributes: bool,
-    chunksize: int,
-    spatial_sorting: str,
-    cut_threshold: int,
-    processes: int,
-    id_field: str = None,
-    cut_crs: pyproj.CRS = None,
-    con: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine] = None,
-    table: str = None,
-    geom_col: str = "geom",
-    overwrite: bool = False,
-) -> Path:
-    """
-    Performs multi-threaded H3 polyfilling on (multi)polygons.
-    """
-
-    if table and con:
-        # Database connection
-        if keep_attributes:
-            q = sqlalchemy.text(f"SELECT * FROM {table}")
-        elif id_field and not keep_attributes:
-            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
-        else:
-            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
-        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
-            "geometry"
-        )
-    else:
-        # Read file
-        df = gpd.read_file(input_file)
-
-    if cut_crs:
-        df = df.to_crs(cut_crs)
-    LOGGER.info("Cutting with CRS: %s", df.crs)
-
-    if id_field:
-        df = df.set_index(id_field)
-    else:
-        df = df.reset_index()
-        df = df.rename(columns={"index": "fid"}).set_index("fid")
-
-    if not keep_attributes:
-        # Remove all attributes except the geometry
-        df = df.loc[:, ["geometry"]]
-
-    LOGGER.info("Cutting large geometries")
-    with tqdm(total=df.shape[0]) as pbar:
-        for index, row in df.iterrows():
-            df.loc[index, "geometry"] = GeometryCollection(
-                katana.katana(row.geometry, cut_threshold)
-            )
-            pbar.update(1)
-
-    LOGGER.info("Exploding geometry collections and multipolygons")
-    df = (
-        df.to_crs(4326)
-        .explode(index_parts=False)  # Explode from GeometryCollection
-        .explode(index_parts=False)  # Explode multipolygons to polygons
-    ).reset_index()
-
-    drop_conditions = [
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.is_empty | frame.geometry.isna())
-            ],
-            "message": "Dropping empty or null geometries",
-        },
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.geom_type != "Polygon")
-                & (frame.geometry.geom_type != "LineString")
-            ],  # NB currently points and other types are lost; in principle, these could be indexed
-            "message": "Dropping unsupported geometries",
-        },
-    ]
-    for condition in drop_conditions:
-        df = drop_condition(df, condition["index"](df).index, condition["message"])
-
-    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
-
-    LOGGER.info("Spatially sorting and partitioning (%s)", spatial_sorting)
-    ddf = ddf.spatial_shuffle(by=spatial_sorting)
-    spatial_sort_col = (
-        spatial_sorting
-        if spatial_sorting == "geohash"
-        else f"{spatial_sorting}_distance"
-    )
-
-    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
-        with TqdmCallback():
-            ddf.to_parquet(tmpdir, overwrite=True)
-
-        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
-
-        # Multithreaded polyfilling
-        LOGGER.info(
-            "H3 Indexing on spatial partitions by polyfill with H3 resolution: %d",
-            resolution,
-        )
-        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
-            with Pool(processes=processes) as pool:
-                args = [
-                    (filepath, spatial_sort_col, resolution, parent_res, tmpdir2)
-                    for filepath in filepaths
-                ]
-                list(tqdm(pool.imap(polyfill_star, args), total=len(args)))
-
-            _parent_partitioning(
-                tmpdir2, output_directory, resolution, parent_res, overwrite=overwrite
-            )
-
-    return output_directory
-
 
 @click.command(context_settings={"show_default": True})
-@click_log.simple_verbosity_option(LOGGER)
+@click_log.simple_verbosity_option(common.LOGGER)
 @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
 @click.argument("output_directory", required=True, type=click.Path(), nargs=1)
 @click.option(
     "-r",
     "--resolution",
     required=True,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 resolution to index",
     nargs=1,
 )
@@ -298,14 +62,14 @@ def _index(
     "-pr",
     "--parent_res",
     required=False,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
 )
 @click.option(
     "-id",
     "--id_field",
     required=False,
-    default=None,
+    default=const.DEFAULTS["id"],
     type=str,
     help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
     nargs=1,
@@ -315,7 +79,7 @@ def _index(
     "--keep_attributes",
     is_flag=True,
     show_default=True,
-    default=False,
+    default=const.DEFAULTS["k"],
    help="Retain attributes in output. The default is to create an output that only includes H3 cell ID and the ID given by the -id field (or the default index ID).",
 )
 @click.option(
@@ -323,7 +87,7 @@ def _index(
     "--chunksize",
     required=True,
     type=int,
-    default=DEFAULT_CHUNK_SIZE,
+    default=const.DEFAULTS["ch"],
     help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
     nargs=1,
 )
@@ -331,14 +95,14 @@ def _index(
     "-s",
     "--spatial_sorting",
     type=click.Choice(["hilbert", "morton", "geohash"]),
-    default="hilbert",
+    default=const.DEFAULTS["s"],
     help="Spatial sorting method when perfoming spatial partitioning.",
 )
 @click.option(
     "-crs",
     "--cut_crs",
     required=False,
-    default=None,
+    default=const.DEFAULTS["crs"],
     type=int,
     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
     nargs=1,
@@ -347,7 +111,7 @@ def _index(
     "-c",
     "--cut_threshold",
     required=True,
-    default=5000,
+    default=const.DEFAULTS["c"],
     type=int,
     help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
     nargs=1,
@@ -356,7 +120,7 @@ def _index(
     "-t",
     "--threads",
     required=False,
-    default=(multiprocessing.cpu_count() - 1),
+    default=const.DEFAULTS["t"],
     type=int,
     help="Amount of threads used for operation",
     nargs=1,
@@ -365,7 +129,7 @@ def _index(
     "-tbl",
     "--table",
     required=False,
-    default=None,
+    default=const.DEFAULTS["tbl"],
     type=str,
     help="Name of the table to read when using a spatial database connection as input",
     nargs=1,
@@ -374,14 +138,14 @@ def _index(
     "-g",
     "--geom_col",
     required=False,
-    default="geom",
+    default=const.DEFAULTS["g"],
     type=str,
     help="Column name to use when using a spatial database connection as input",
     nargs=1,
 )
 @click.option(
     "--tempdir",
-    default=tempfile.tempdir,
+    default=const.DEFAULTS["tempdir"],
     type=click.Path(),
     help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
 )
@@ -410,46 +174,21 @@ def h3(
     VECTOR_INPUT is the path to input vector geospatial data.
     OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
     """
-    tempfile.tempdir = tempdir
-    if parent_res is not None and not int(parent_res) < int(resolution):
-        raise ParentResolutionException(
-            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
-                pr=parent_res, r=resolution
-            )
-        )
-    con: sqlalchemy.engine.Connection = None
-    scheme: str = urlparse(vector_input).scheme
-    if bool(scheme) and scheme != "file":
-        # Assume database connection
-        con = sqlalchemy.create_engine(vector_input)
-    elif not Path(vector_input).exists():
-        if not scheme:
-            LOGGER.warning(
-                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
-            )
-            raise FileNotFoundError(
-                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
-            )
-        vector_input = str(vector_input)
-    else:
-        vector_input = Path(vector_input)
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
 
-    output_directory = Path(output_directory)
-    outputexists = os.path.exists(output_directory)
-    if outputexists and not overwrite:
-        raise FileExistsError(
-            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
-        )
-    elif outputexists and overwrite:
-        LOGGER.info(f"Overwriting the contents of {output_directory}")
-        shutil.rmtree(output_directory)
-    output_directory.mkdir(parents=True, exist_ok=True)
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
 
     if cut_crs is not None:
         cut_crs = pyproj.CRS.from_user_input(cut_crs)
 
     try:
-        _index(
+        common.index(
+            "h3",
+            h3polyfill,
+            h3_secondary_index,
             vector_input,
             output_directory,
             int(resolution),
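With the refactor, `h3polyfill` is a plain module-level function, so it can be exercised outside the CLI. A minimal sketch (not from the package; assumes `h3pandas` is installed and, as in the pipeline, the geometry is already exploded and in EPSG:4326):

```python
# Sketch: calling h3polyfill directly on a single-polygon GeoDataFrame.
import geopandas as gpd
from shapely.geometry import Polygon

from vector2dggs.h3 import h3polyfill

gdf = gpd.GeoDataFrame(
    {"name": ["example"]},
    geometry=[Polygon([(174.7, -36.9), (174.8, -36.9), (174.8, -36.8), (174.7, -36.8)])],
    crs=4326,
)
cells = h3polyfill(gdf, 7)  # one row per H3 cell covering the polygon, cell ID as index
print(len(cells))
```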
vector2dggs/rHP.py
ADDED
@@ -0,0 +1,217 @@
+import sys
+import click
+import click_log
+import tempfile
+import pyproj
+
+import rhppandas  # Necessary import despite lack of explicit use
+
+import pandas as pd
+import geopandas as gpd
+
+from typing import Union
+from pathlib import Path
+
+import vector2dggs.constants as const
+import vector2dggs.common as common
+
+from vector2dggs import __version__
+
+
+def rhp_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.rhp.rhp_to_parent(parent_res)
+
+
+def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
+    df_polygon = df[df.geom_type == "Polygon"]
+    if len(df_polygon.index) > 0:
+        df_polygon = df_polygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    df_multipolygon = df[df.geom_type == "MultiPolygon"]
+    if len(df_multipolygon.index) > 0:
+        df_multipolygon = df_multipolygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    # df_linestring = df[df.geom_type == "LineString"]
+    # if len(df_linestring.index) > 0:
+    #     df_linestring = (
+    #         df_linestring.h3.linetrace(resolution)
+    #         .explode("h3_linetrace")
+    #         .set_index("h3_linetrace")
+    #     )
+    # df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
+
+    return pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_multipolygon],  # df_linestring],
+        )
+    )
+
+
+@click.command(context_settings={"show_default": True})
+@click_log.simple_verbosity_option(common.LOGGER)
+@click.argument("vector_input", required=True, type=click.Path(), nargs=1)
+@click.argument("output_directory", required=True, type=click.Path(), nargs=1)
+@click.option(
+    "-r",
+    "--resolution",
+    required=True,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 resolution to index",
+    nargs=1,
+)
+@click.option(
+    "-pr",
+    "--parent_res",
+    required=False,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
+)
+@click.option(
+    "-id",
+    "--id_field",
+    required=False,
+    default=const.DEFAULTS["id"],
+    type=str,
+    help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
+    nargs=1,
+)
+@click.option(
+    "-k",
+    "--keep_attributes",
+    is_flag=True,
+    show_default=True,
+    default=const.DEFAULTS["k"],
+    help="Retain attributes in output. The default is to create an output that only includes rHEALPix cell ID and the ID given by the -id field (or the default index ID).",
+)
+@click.option(
+    "-ch",
+    "--chunksize",
+    required=True,
+    type=int,
+    default=const.DEFAULTS["ch"],
+    help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
+    nargs=1,
+)
+@click.option(
+    "-s",
+    "--spatial_sorting",
+    type=click.Choice(["hilbert", "morton", "geohash"]),
+    default=const.DEFAULTS["s"],
+    help="Spatial sorting method when perfoming spatial partitioning.",
+)
+@click.option(
+    "-crs",
+    "--cut_crs",
+    required=False,
+    default=const.DEFAULTS["crs"],
+    type=int,
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    nargs=1,
+)
+@click.option(
+    "-c",
+    "--cut_threshold",
+    required=True,
+    default=const.DEFAULTS["c"],
+    type=int,
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    nargs=1,
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=const.DEFAULTS["t"],
+    type=int,
+    help="Amount of threads used for operation",
+    nargs=1,
+)
+@click.option(
+    "-tbl",
+    "--table",
+    required=False,
+    default=const.DEFAULTS["tbl"],
+    type=str,
+    help="Name of the table to read when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "-g",
+    "--geom_col",
+    required=False,
+    default=const.DEFAULTS["g"],
+    type=str,
+    help="Column name to use when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "--tempdir",
+    default=const.DEFAULTS["tempdir"],
+    type=click.Path(),
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
+)
+@click.option("-o", "--overwrite", is_flag=True)
+@click.version_option(version=__version__)
+def rhp(
+    vector_input: Union[str, Path],
+    output_directory: Union[str, Path],
+    resolution: str,
+    parent_res: str,
+    id_field: str,
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_crs: int,
+    cut_threshold: int,
+    threads: int,
+    table: str,
+    geom_col: str,
+    tempdir: Union[str, Path],
+    overwrite: bool,
+):
+    """
+    Ingest a vector dataset and index it to the rHEALPix DGGS.
+
+    VECTOR_INPUT is the path to input vector geospatial data.
+    OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
+    """
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
+
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
+
+    if cut_crs is not None:
+        cut_crs = pyproj.CRS.from_user_input(cut_crs)
+
+    try:
+        common.index(
+            "rhp",
+            rhppolyfill,
+            rhp_secondary_index,
+            vector_input,
+            output_directory,
+            int(resolution),
+            parent_res,
+            keep_attributes,
+            chunksize,
+            spatial_sorting,
+            cut_threshold,
+            threads,
+            cut_crs=cut_crs,
+            id_field=id_field,
+            con=con,
+            table=table,
+            geom_col=geom_col,
+            overwrite=overwrite,
+        )
+    except:
+        raise
+    else:
+        sys.exit(0)
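Since `rhp` reuses `common.index`, its invocation mirrors the `h3` command shown in the README; something like the following (hypothetical paths and values; only polygons and multipolygons will be indexed):

```bash
# Hypothetical invocation; the flags correspond to the click options above
vector2dggs rhp -r 9 -id ogc_fid -t 4 --overwrite lakes.gpkg ./lakes_rhp_9.parquet
```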
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/METADATA
CHANGED
@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: vector2dggs
-Version: 0.6.1
+Version: 0.6.3
 Summary: CLI DGGS indexer for vector geospatial data
 Home-page: https://github.com/manaakiwhenua/vector2dggs
 License: LGPL-3.0-or-later
-Keywords: dggs,vector,h3,cli
+Keywords: dggs,vector,h3,rHEALPix,cli
 Author: James Ardo
 Author-email: ardoj@landcareresearch.co.nz
 Maintainer: Richard Law
@@ -13,23 +13,27 @@ Requires-Python: >=3.11,<4.0
 Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Scientific/Engineering :: GIS
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: click-log (>=0.4.0,<0.5.0)
-Requires-Dist: dask (>=
-Requires-Dist: dask-geopandas (>=0.4
-Requires-Dist: gdal (
+Requires-Dist: dask (>=2025.1,<2026.0)
+Requires-Dist: dask-geopandas (>=0.4,<0.5)
+Requires-Dist: gdal (>=3.8,<4.0)
 Requires-Dist: geopandas (>=1.0.1,<2.0.0)
-Requires-Dist: h3pandas (>=0.
-Requires-Dist: numpy (
+Requires-Dist: h3pandas (>=0.3,<0.4)
+Requires-Dist: numpy (>=2,<3)
+Requires-Dist: pillow (>=11.2.1,<12.0.0)
 Requires-Dist: psycopg2 (>=2.9.9,<3.0.0)
-Requires-Dist: pyarrow (>=
-Requires-Dist: pyproj (>=3.
-Requires-Dist:
+Requires-Dist: pyarrow (>=20.0,<21.0)
+Requires-Dist: pyproj (>=3.7,<4.0)
+Requires-Dist: rhealpixdggs (>=0.5.5,<0.6.0)
+Requires-Dist: rhppandas (>=0.1.2,<0.2.0)
+Requires-Dist: shapely (>=2.1,<3.0)
 Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
-Requires-Dist: tqdm (>=4.
+Requires-Dist: tqdm (>=4.67,<5.0)
 Project-URL: Repository, https://github.com/manaakiwhenua/vector2dggs
 Description-Content-Type: text/markdown
 
@@ -41,9 +45,12 @@ Python-based CLI tool to index raster files to DGGS in parallel, writing out to
 
 This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).
 
-Currently
+Currently this tool supports the following DGGSs:
 
-
+- H3 (polygons, linestrings)
+- rHEALPix (polygons)
+
+Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
 
 
 
@@ -55,6 +62,19 @@ pip install vector2dggs
 
 ## Usage
 
+```bash
+vector2dggs --help [11:22:14]
+Usage: vector2dggs [OPTIONS] COMMAND [ARGS]...
+
+Options:
+  --version  Show the version and exit.
+  --help     Show this message and exit.
+
+Commands:
+  h3   Ingest a vector dataset and index it to the H3 DGGS.
+  rhp  Ingest a vector dataset and index it to the rHEALPix DGGS.
+```
+
 ```bash
 vector2dggs h3 --help
 Usage: vector2dggs h3 [OPTIONS] VECTOR_INPUT OUTPUT_DIRECTORY
@@ -153,13 +173,13 @@ In brief, to get started:
 - Install [Poetry](https://python-poetry.org/docs/basic-usage/)
 - Install [GDAL](https://gdal.org/)
   - If you're on Windows, `pip install gdal` may be necessary before running the subsequent commands.
-  - On Linux, install GDAL 3.
+  - On Linux, install GDAL 3.8+ according to your platform-specific instructions, including development headers, i.e. `libgdal-dev`.
 - Create the virtual environment with `poetry init`. This will install necessary dependencies.
 - Subsequently, the virtual environment can be re-activated with `poetry shell`.
 
 If you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.
 
-
+Alternatively, it is also possible to install using pip with `pip install -e .`, and bypass Poetry.
 
 #### Code formatting
 
@@ -189,14 +209,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
     title={{vector2dggs}},
     author={Ardo, James and Law, Richard},
     url={https://github.com/manaakiwhenua/vector2dggs},
-    version={0.6.1},
+    version={0.6.3},
     date={2023-04-20}
 }
 ```
 
 APA/Harvard
 
-> Ardo, J., & Law, R. (2023). vector2dggs (0.6.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.6.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 
 [](https://github.com/manaakiwhenua/manaakiwhenua-standards)
 
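The dependency floor changes above (e.g. dask, pyarrow, numpy 2) mean 0.6.3 may force substantial upgrades in an existing environment, so pinning the exact release makes the change deliberate:

```bash
# Pin the release under review (ordinary pip usage, nothing package-specific)
pip install "vector2dggs==0.6.3"
```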
vector2dggs-0.6.3.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+vector2dggs/__init__.py,sha256=75x2g9bnxuIHCJ-8fmoB5K2n_V2ayIxdRNvYsNUaMhs,27
+vector2dggs/cli.py,sha256=HoPp7Bwk2kZghAms6wNepx-bFhoAuHH7WXACMIy3MuM,652
+vector2dggs/common.py,sha256=DL3ohG-QQyI-phyxeO6Fi2BOwWnFct-I_Y87_XC2SRQ,10578
+vector2dggs/constants.py,sha256=u6n6XNvEVLUexn9Sb2rc22s2B4Rrg_VXFJaM7uEy-9Q,536
+vector2dggs/h3.py,sha256=GgiGOVbsXXNp95KWKKmJZvDxGFj91TTWl575OaPZ6yk,6145
+vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
+vector2dggs/rHP.py,sha256=Y36tPbtY-tYBUFILHD-xnUxa2yKlYotGP6043Bg5nZc,6450
+vector2dggs-0.6.3.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+vector2dggs-0.6.3.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+vector2dggs-0.6.3.dist-info/METADATA,sha256=sFXXSXOutJzbTLRviHXXn7RkN51SCfbD6ZRBeledcmY,10223
+vector2dggs-0.6.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vector2dggs-0.6.3.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+vector2dggs-0.6.3.dist-info/RECORD,,
vector2dggs-0.6.1.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-vector2dggs/__init__.py,sha256=8sqy-gBl8LCgOL8GSQBSJg6UWO0eWPpvb3gdHmGQvbg,27
-vector2dggs/cli.py,sha256=tL4NJ99uQsqoVinwYadna1a4ko5v2sdZaFaeDAj6QNE,599
-vector2dggs/h3.py,sha256=AMH9VdspvKu26VhFmuWf48xm4VEDKxmNuvOeb_I2nmI,14310
-vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
-vector2dggs-0.6.1.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-vector2dggs-0.6.1.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-vector2dggs-0.6.1.dist-info/METADATA,sha256=djCsEsjEqHZp2iTyMhBaTVOs0VtNAORURR7s9N8cs0U,9846
-vector2dggs-0.6.1.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-vector2dggs-0.6.1.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
-vector2dggs-0.6.1.dist-info/RECORD,,
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING
File without changes
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING.LESSER
File without changes
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/entry_points.txt
File without changes