vector2dggs 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector2dggs/__init__.py +1 -1
- vector2dggs/cli.py +2 -0
- vector2dggs/common.py +354 -0
- vector2dggs/constants.py +26 -0
- vector2dggs/h3.py +37 -300
- vector2dggs/katana.py +10 -3
- vector2dggs/rHP.py +217 -0
- {vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/METADATA +24 -18
- vector2dggs-0.6.2.dist-info/RECORD +13 -0
- {vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/WHEEL +1 -1
- vector2dggs-0.6.0.dist-info/RECORD +0 -10
- {vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/COPYING +0 -0
- {vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/COPYING.LESSER +0 -0
- {vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/entry_points.txt +0 -0
vector2dggs/__init__.py
CHANGED
@@ -1 +1 @@
-__version__: str = "0.6.0"
+__version__: str = "0.6.2"
vector2dggs/cli.py
CHANGED
@@ -2,6 +2,7 @@ import click
 
 from vector2dggs import __version__
 from vector2dggs.h3 import h3
+from vector2dggs.rHP import rhp
 
 # If the program does terminal interaction, make it output a short
 # notice like this when it starts in an interactive mode:
@@ -19,6 +20,7 @@ def cli():
 
 
 cli.add_command(h3)
+cli.add_command(rhp)
 
 
 def main():
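
The only substantive change above is that the click group gains the new `rhp` subcommand. A minimal sketch (assuming vector2dggs 0.6.2 is installed in the current environment) that checks both subcommands are registered, using click's own test runner:

```python
# Sketch: confirm the CLI group exposes both `h3` and the new `rhp` command.
from click.testing import CliRunner

from vector2dggs.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["--help"])
assert "h3" in result.output and "rhp" in result.output
print(result.output)
```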
vector2dggs/common.py
ADDED
@@ -0,0 +1,354 @@
+import os
+import errno
+import logging
+import tempfile
+import click_log
+import sqlalchemy
+import shutil
+import pyproj
+
+import pandas as pd
+import geopandas as gpd
+import dask.dataframe as dd
+import dask_geopandas as dgpd
+
+from typing import Union, Callable
+from pathlib import Path, PurePath
+from urllib.parse import urlparse
+from tqdm import tqdm
+from tqdm.dask import TqdmCallback
+from multiprocessing.dummy import Pool
+from shapely.geometry import GeometryCollection
+
+import vector2dggs.constants as const
+
+from . import katana
+
+SQLConnectionType = Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine]
+
+
+LOGGER = logging.getLogger(__name__)
+click_log.basic_config(LOGGER)
+click_log.ColorFormatter.colors["info"] = dict(fg="green")
+
+
+class ParentResolutionException(Exception):
+    pass
+
+
+def check_resolutions(resolution: int, parent_res: int) -> None:
+    if parent_res is not None and not int(parent_res) < int(resolution):
+        raise ParentResolutionException(
+            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
+                pr=parent_res, r=resolution
+            )
+        )
+
+
+def db_conn_and_input_path(
+    vector_input: Union[str, Path],
+) -> tuple[SQLConnectionType, Union[str, Path]]:
+    con: sqlalchemy.engine.Connection = None
+    scheme: str = urlparse(vector_input).scheme
+
+    if bool(scheme) and scheme != "file":
+        # Assume database connection
+        con = sqlalchemy.create_engine(vector_input)
+
+    elif not Path(vector_input).exists():
+        if not scheme:
+            LOGGER.error(
+                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
+            )
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
+            )
+        vector_input = str(vector_input)
+
+    else:
+        vector_input = Path(vector_input)
+
+    return (con, vector_input)
+
+
+def resolve_output_path(
+    output_directory: Union[str, Path], overwrite: bool
+) -> Union[str, Path]:
+    output_directory = Path(output_directory)
+    outputexists = os.path.exists(output_directory)
+
+    if outputexists and not overwrite:
+        raise FileExistsError(
+            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
+        )
+
+    elif outputexists and overwrite:
+        LOGGER.warning(f"Overwriting the contents of {output_directory}")
+        shutil.rmtree(output_directory)
+
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    return output_directory
+
+
+def drop_condition(
+    df: pd.DataFrame,
+    drop_index: pd.Index,
+    log_statement: str,
+    warning_threshold: float = 0.01,
+):
+    LOGGER.debug(log_statement)
+    _before = len(df)
+    df = df.drop(drop_index)
+    _after = len(df)
+    _diff = _before - _after
+    if _diff:
+        log_method = (
+            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
+        )
+        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
+    return df
+
+
+def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int):
+    """
+    Uses a parent resolution,
+    OR,
+    Given a target resolution, returns our recommended parent resolution.
+
+    Used for intermediate re-partioning.
+    """
+    if dggs == "h3":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_H3, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    elif dggs == "rhp":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_RHP, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    else:
+        raise RuntimeError(
+            "Unknown dggs {dggs}) - must be one of [ 'h3', 'rhp' ]".format(dggs=dggs)
+        )
+
+
+def parent_partitioning(
+    dggs: str,
+    input_dir: Path,
+    output_dir: Path,
+    resolution: int,
+    parent_res: Union[None, int],
+    **kwargs,
+) -> None:
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    partition_col = f"{dggs}_{parent_res:02}"
+
+    with TqdmCallback(desc="Repartitioning"):
+        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+            output_dir,
+            overwrite=kwargs.get("overwrite", False),
+            engine=kwargs.get("engine", "pyarrow"),
+            partition_on=partition_col,
+            compression=kwargs.get("compression", "ZSTD"),
+        )
+    LOGGER.debug("Parent cell repartitioning complete")
+
+    # Rename output to just be the partition key, suffix .parquet
+    for f in os.listdir(output_dir):
+        os.rename(
+            os.path.join(output_dir, f),
+            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+        )
+
+    return
+
+
+def polyfill(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    pq_in: Path,
+    spatial_sort_col: str,
+    resolution: int,
+    parent_res: Union[None, int],
+    output_directory: str,
+) -> None:
+    """
+    Reads a geoparquet, performs polyfilling (for Polygon),
+    linetracing (for LineString), and writes out to parquet.
+    """
+    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
+    if len(df.index) == 0:
+        # Input is empty, nothing to polyfill
+        return None
+
+    # DGGS specific polyfill
+    df = dggsfunc(df, resolution)
+
+    if len(df.index) == 0:
+        # Polyfill resulted in empty output (e.g. large cell, small feature)
+        return None
+
+    df.index.rename(f"{dggs}_{resolution:02}", inplace=True)
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    # print(parent_res)
+    # print(df.index)
+    # print(df.columns)
+
+    # Secondary (parent) index, used later for partitioning
+    df = secondary_index_func(df, parent_res)
+
+    df.to_parquet(
+        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+    )
+    return None
+
+
+def polyfill_star(args) -> None:
+    return polyfill(*args)
+
+
+def index(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    input_file: Union[Path, str],
+    output_directory: Union[Path, str],
+    resolution: int,
+    parent_res: Union[None, int],
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_threshold: int,
+    processes: int,
+    id_field: str = None,
+    cut_crs: pyproj.CRS = None,
+    con: SQLConnectionType = None,
+    table: str = None,
+    geom_col: str = "geom",
+    overwrite: bool = False,
+) -> Path:
+    """
+    Performs multi-threaded polyfilling on (multi)polygons.
+    """
+
+    if table and con:
+        # Database connection
+        if keep_attributes:
+            q = sqlalchemy.text(f"SELECT * FROM {table}")
+        elif id_field and not keep_attributes:
+            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
+        else:
+            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
+        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
+            "geometry"
+        )
+    else:
+        # Read file
+        df = gpd.read_file(input_file)
+
+    if cut_crs:
+        df = df.to_crs(cut_crs)
+    LOGGER.debug("Cutting with CRS: %s", df.crs)
+
+    if id_field:
+        df = df.set_index(id_field)
+    else:
+        df = df.reset_index()
+        df = df.rename(columns={"index": "fid"}).set_index("fid")
+
+    if not keep_attributes:
+        # Remove all attributes except the geometry
+        df = df.loc[:, ["geometry"]]
+
+    LOGGER.debug("Cutting large geometries")
+    with tqdm(total=df.shape[0], desc="Splitting") as pbar:
+        for index, row in df.iterrows():
+            df.loc[index, "geometry"] = GeometryCollection(
+                katana.katana(row.geometry, cut_threshold)
+            )
+            pbar.update(1)
+
+    LOGGER.debug("Exploding geometry collections and multipolygons")
+    df = (
+        df.to_crs(4326)
+        .explode(index_parts=False)  # Explode from GeometryCollection
+        .explode(index_parts=False)  # Explode multipolygons to polygons
+    ).reset_index()
+
+    drop_conditions = [
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.is_empty | frame.geometry.isna())
+            ],
+            "message": "Considering empty or null geometries",
+        },
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.geom_type != "Polygon")
+                & (frame.geometry.geom_type != "LineString")
+            ],  # NB currently points and other types are lost; in principle, these could be indexed
+            "message": "Considering unsupported geometries",
+        },
+    ]
+    for condition in drop_conditions:
+        df = drop_condition(df, condition["index"](df).index, condition["message"])
+
+    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
+
+    LOGGER.debug("Spatially sorting and partitioning (%s)", spatial_sorting)
+    ddf = ddf.spatial_shuffle(by=spatial_sorting)
+    spatial_sort_col = (
+        spatial_sorting
+        if spatial_sorting == "geohash"
+        else f"{spatial_sorting}_distance"
+    )
+
+    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
+        with TqdmCallback(desc=f"Spatially partitioning"):
+            ddf.to_parquet(tmpdir, overwrite=True)
+
+        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
+
+        # Multithreaded polyfilling
+        LOGGER.debug(
+            "Indexing on spatial partitions by polyfill with resolution: %d",
+            resolution,
+        )
+        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
+            with Pool(processes=processes) as pool:
+                args = [
+                    (
+                        dggs,
+                        dggsfunc,
+                        secondary_index_func,
+                        filepath,
+                        spatial_sort_col,
+                        resolution,
+                        parent_res,
+                        tmpdir2,
+                    )
+                    for filepath in filepaths
+                ]
+                list(
+                    tqdm(
+                        pool.imap(polyfill_star, args),
+                        total=len(args),
+                        desc="DGGS indexing",
+                    )
+                )
+
+            parent_partitioning(
+                dggs,
+                tmpdir2,
+                output_directory,
+                resolution,
+                parent_res,
+                overwrite=overwrite,
+            )
+
+    return output_directory
vector2dggs/constants.py
ADDED
@@ -0,0 +1,26 @@
+import multiprocessing
+import warnings
+import tempfile
+
+
+MIN_H3, MAX_H3 = 0, 15
+MIN_RHP, MAX_RHP = 0, 15
+
+DEFAULTS = {
+    "id": None,
+    "k": False,
+    "ch": 50,
+    "s": "hilbert",
+    "crs": None,
+    "c": 5000,
+    "t": (multiprocessing.cpu_count() - 1),
+    "tbl": None,
+    "g": "geom",
+    "tempdir": tempfile.tempdir,
+}
+
+DEFAULT_PARENT_OFFSET = 6
+
+warnings.filterwarnings(
+    "ignore"
+)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
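
These module-level constants centralise what were previously literals in `h3.py`, so both commands share one set of CLI defaults. A minimal sketch (assuming vector2dggs 0.6.2) of reading them:

```python
# Sketch: the shared CLI defaults used by both the h3 and rhp commands.
import multiprocessing

import vector2dggs.constants as const

assert const.DEFAULTS["ch"] == 50                              # --chunksize
assert const.DEFAULTS["t"] == multiprocessing.cpu_count() - 1  # --threads
assert const.DEFAULT_PARENT_OFFSET == 6
```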
vector2dggs/h3.py
CHANGED
@@ -1,82 +1,28 @@
-import errno
-import logging
-import os
-import multiprocessing
-from multiprocessing.dummy import Pool
-from pathlib import Path, PurePath
-import shutil
 import sys
-import tempfile
-from typing import Union
-from urllib.parse import urlparse
-import warnings
-
-os.environ["USE_PYGEOS"] = "0"
-
 import click
 import click_log
-import dask.dataframe as dd
-import dask_geopandas as dgpd
-import geopandas as gpd
-import h3pandas
-import pandas as pd
+import tempfile
 import pyproj
-from shapely.geometry import GeometryCollection
-import sqlalchemy
-from tqdm import tqdm
-from tqdm.dask import TqdmCallback
-
-from . import katana
-from vector2dggs import __version__
 
-LOGGER = logging.getLogger(__name__)
-click_log.basic_config(LOGGER)
-MIN_H3, MAX_H3 = 0, 15
-
-warnings.filterwarnings(
-    "ignore"
-)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
-
-
-DEFAULT_PARENT_OFFSET = 6
-DEFAULT_CHUNK_SIZE = 50
+import h3pandas  # Necessary import despite lack of explicit use
 
+import pandas as pd
+import geopandas as gpd
 
-
-
+from typing import Union
+from pathlib import Path
 
+import vector2dggs.constants as const
+import vector2dggs.common as common
 
-def _get_parent_res(parent_res: Union[None, int], resolution: int):
-    """
-    Uses a parent resolution,
-    OR,
-    Given a target resolution, returns our recommended parent resolution.
+from vector2dggs import __version__
 
-    Used for intermediate re-partioning.
-    """
-    return (
-        int(parent_res)
-        if parent_res is not None
-        else max(MIN_H3, (resolution - DEFAULT_PARENT_OFFSET))
-    )
 
+def h3_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.h3.h3_to_parent(parent_res)
 
-def polyfill(
-    pq_in: Path,
-    spatial_sort_col: str,
-    resolution: int,
-    parent_res: Union[None, int],
-    output_directory: str,
-) -> None:
-    """
-    Reads a geoparquet, performs H3 polyfilling (for polygons),
-    linetracing (for linestrings), and writes out to parquet.
-    """
-    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
-    if len(df.index) == 0:
-        # Input is empty, nothing to polyfill
-        return None
 
+def h3polyfill(df: gpd.GeoDataFrame, resolution: int):
     df_polygon = df[df.geom_type == "Polygon"]
     if len(df_polygon.index) > 0:
         df_polygon = df_polygon.h3.polyfill_resample(
@@ -92,207 +38,23 @@ def polyfill(
         )
     df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
 
-    df = pd.concat(
+    return pd.concat(
         map(
             lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
             [df_polygon, df_linestring],
         )
     )
 
-    if len(df.index) == 0:
-        # Polyfill resulted in empty output (e.g. large cell, small feature)
-        return None
-
-    df.index.rename(f"h3_{resolution:02}", inplace=True)
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    # Secondary (parent) H3 index, used later for partitioning
-    df.h3.h3_to_parent(parent_res).to_parquet(
-        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
-    )
-    return None
-
-
-def polyfill_star(args) -> None:
-    return polyfill(*args)
-
-
-def _parent_partitioning(
-    input_dir: Path,
-    output_dir: Path,
-    resolution: int,
-    parent_res: Union[None, int],
-    **kwargs,
-) -> None:
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    partition_col = f"h3_{parent_res:02}"
-
-    with TqdmCallback(desc="Repartitioning"):
-        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
-            output_dir,
-            overwrite=kwargs.get("overwrite", False),
-            engine=kwargs.get("engine", "pyarrow"),
-            partition_on=partition_col,
-            compression=kwargs.get("compression", "ZSTD"),
-        )
-    LOGGER.debug("Parent cell repartitioning complete")
-
-    # Rename output to just be the partition key, suffix .parquet
-    for f in os.listdir(output_dir):
-        os.rename(
-            os.path.join(output_dir, f),
-            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
-        )
-
-    return
-
-
-def drop_condition(
-    df: pd.DataFrame,
-    drop_index: pd.Index,
-    log_statement: str,
-    warning_threshold: float = 0.01,
-):
-    LOGGER.info(log_statement)
-    _before = len(df)
-    df = df.drop(drop_index)
-    _after = len(df)
-    _diff = _before - _after
-    if _diff:
-        log_method = (
-            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
-        )
-        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
-    return df
-
-
-def _index(
-    input_file: Union[Path, str],
-    output_directory: Union[Path, str],
-    resolution: int,
-    parent_res: Union[None, int],
-    keep_attributes: bool,
-    chunksize: int,
-    spatial_sorting: str,
-    cut_threshold: int,
-    processes: int,
-    id_field: str = None,
-    cut_crs: pyproj.CRS = None,
-    con: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine] = None,
-    table: str = None,
-    geom_col: str = "geom",
-    overwrite: bool = False,
-) -> Path:
-    """
-    Performs multi-threaded H3 polyfilling on (multi)polygons.
-    """
-
-    if table and con:
-        # Database connection
-        if keep_attributes:
-            q = sqlalchemy.text(f"SELECT * FROM {table}")
-        elif id_field and not keep_attributes:
-            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
-        else:
-            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
-        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
-            "geometry"
-        )
-    else:
-        # Read file
-        df = gpd.read_file(input_file)
-
-    if cut_crs:
-        df = df.to_crs(cut_crs)
-    LOGGER.info("Cutting with CRS: %s", df.crs)
-
-    if id_field:
-        df = df.set_index(id_field)
-    else:
-        df = df.reset_index()
-        df = df.rename(columns={"index": "fid"}).set_index("fid")
-
-    if not keep_attributes:
-        # Remove all attributes except the geometry
-        df = df.loc[:, ["geometry"]]
-
-    LOGGER.info("Watch out for ninjas! (Cutting polygons)")
-    with tqdm(total=df.shape[0]) as pbar:
-        for index, row in df.iterrows():
-            df.loc[index, "geometry"] = GeometryCollection(
-                katana.katana(row.geometry, cut_threshold)
-            )
-            pbar.update(1)
-
-    LOGGER.info("Exploding geometry collections and multipolygons")
-    df = (
-        df.to_crs(4326)
-        .explode(index_parts=False)  # Explode from GeometryCollection
-        .explode(index_parts=False)  # Explode multipolygons to polygons
-    ).reset_index()
-
-    drop_conditions = [
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.is_empty | frame.geometry.isna())
-            ],
-            "message": "Dropping empty or null geometries",
-        },
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.geom_type != "Polygon")
-                & (frame.geometry.geom_type != "LineString")
-            ],  # NB currently points and other types are lost; in principle, these could be indexed
-            "message": "Dropping non-polygonal geometries",
-        },
-    ]
-    for condition in drop_conditions:
-        df = drop_condition(df, condition["index"](df).index, condition["message"])
-
-    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
-
-    LOGGER.info("Spatially sorting and partitioning (%s)", spatial_sorting)
-    ddf = ddf.spatial_shuffle(by=spatial_sorting)
-    spatial_sort_col = (
-        spatial_sorting
-        if spatial_sorting == "geohash"
-        else f"{spatial_sorting}_distance"
-    )
-
-    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
-        with TqdmCallback():
-            ddf.to_parquet(tmpdir, overwrite=True)
-
-        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
-
-        # Multithreaded polyfilling
-        LOGGER.info(
-            "H3 Indexing on spatial partitions by polyfill with H3 resolution: %d",
-            resolution,
-        )
-        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
-            with Pool(processes=processes) as pool:
-                args = [
-                    (filepath, spatial_sort_col, resolution, parent_res, tmpdir2)
-                    for filepath in filepaths
-                ]
-                list(tqdm(pool.imap(polyfill_star, args), total=len(args)))
-
-            _parent_partitioning(
-                tmpdir2, output_directory, resolution, parent_res, overwrite=overwrite
-            )
-
-    return output_directory
-
 
 @click.command(context_settings={"show_default": True})
-@click_log.simple_verbosity_option(LOGGER)
+@click_log.simple_verbosity_option(common.LOGGER)
 @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
 @click.argument("output_directory", required=True, type=click.Path(), nargs=1)
 @click.option(
     "-r",
     "--resolution",
     required=True,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 resolution to index",
     nargs=1,
 )
@@ -300,14 +62,14 @@ def _index(
     "-pr",
     "--parent_res",
     required=False,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
 )
 @click.option(
     "-id",
     "--id_field",
     required=False,
-    default=None,
+    default=const.DEFAULTS["id"],
     type=str,
     help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
     nargs=1,
@@ -317,7 +79,7 @@ def _index(
     "--keep_attributes",
     is_flag=True,
     show_default=True,
-    default=False,
+    default=const.DEFAULTS["k"],
     help="Retain attributes in output. The default is to create an output that only includes H3 cell ID and the ID given by the -id field (or the default index ID).",
 )
 @click.option(
@@ -325,7 +87,7 @@ def _index(
     "--chunksize",
     required=True,
     type=int,
-    default=DEFAULT_CHUNK_SIZE,
+    default=const.DEFAULTS["ch"],
     help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
     nargs=1,
 )
@@ -333,32 +95,32 @@ def _index(
     "-s",
     "--spatial_sorting",
     type=click.Choice(["hilbert", "morton", "geohash"]),
-    default="hilbert",
+    default=const.DEFAULTS["s"],
    help="Spatial sorting method when perfoming spatial partitioning.",
 )
 @click.option(
     "-crs",
     "--cut_crs",
     required=False,
-    default=None,
+    default=const.DEFAULTS["crs"],
     type=int,
-    help="Set the coordinate reference system (CRS) used for cutting large polygons (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
     nargs=1,
 )
 @click.option(
     "-c",
     "--cut_threshold",
     required=True,
-    default=5000,
+    default=const.DEFAULTS["c"],
     type=int,
-    help="Cutting up large polygons into smaller polygons based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
     nargs=1,
 )
 @click.option(
     "-t",
     "--threads",
     required=False,
-    default=(multiprocessing.cpu_count() - 1),
+    default=const.DEFAULTS["t"],
     type=int,
     help="Amount of threads used for operation",
     nargs=1,
@@ -367,7 +129,7 @@ def _index(
     "-tbl",
     "--table",
     required=False,
-    default=None,
+    default=const.DEFAULTS["tbl"],
     type=str,
     help="Name of the table to read when using a spatial database connection as input",
     nargs=1,
@@ -376,14 +138,14 @@ def _index(
     "-g",
     "--geom_col",
     required=False,
-    default="geom",
+    default=const.DEFAULTS["g"],
     type=str,
     help="Column name to use when using a spatial database connection as input",
     nargs=1,
 )
 @click.option(
     "--tempdir",
-    default=tempfile.tempdir,
+    default=const.DEFAULTS["tempdir"],
     type=click.Path(),
     help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
 )
@@ -412,46 +174,21 @@ def h3(
     VECTOR_INPUT is the path to input vector geospatial data.
     OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
     """
-    tempfile.tempdir = tempdir
-    if parent_res is not None and not int(parent_res) < int(resolution):
-        raise ParentResolutionException(
-            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
-                pr=parent_res, r=resolution
-            )
-        )
-    con: sqlalchemy.engine.Connection = None
-    scheme: str = urlparse(vector_input).scheme
-    if bool(scheme) and scheme != "file":
-        # Assume database connection
-        con = sqlalchemy.create_engine(vector_input)
-    elif not Path(vector_input).exists():
-        if not scheme:
-            LOGGER.warning(
-                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
-            )
-            raise FileNotFoundError(
-                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
-            )
-        vector_input = str(vector_input)
-    else:
-        vector_input = Path(vector_input)
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
 
-    output_directory = Path(output_directory)
-    outputexists = os.path.exists(output_directory)
-    if outputexists and not overwrite:
-        raise FileExistsError(
-            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
-        )
-    elif outputexists and overwrite:
-        LOGGER.info(f"Overwriting the contents of {output_directory}")
-        shutil.rmtree(output_directory)
-    output_directory.mkdir(parents=True, exist_ok=True)
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
 
     if cut_crs is not None:
         cut_crs = pyproj.CRS.from_user_input(cut_crs)
 
     try:
-        _index(
+        common.index(
+            "h3",
+            h3polyfill,
+            h3_secondary_index,
             vector_input,
             output_directory,
             int(resolution),
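
After this refactor, `h3.py` reduces to the two H3-specific callables (`h3polyfill`, `h3_secondary_index`) plus the click command, with everything else delegated to `common.index`. A rough sketch (assuming vector2dggs 0.6.2 with h3pandas installed; the sample square is illustrative) of the per-partition polyfill in isolation:

```python
# Sketch: fill a 1-degree square with resolution-3 H3 cells; the result is a
# plain DataFrame indexed by H3 cell ID, with the geometry column dropped.
import geopandas as gpd
from shapely.geometry import box

from vector2dggs.h3 import h3polyfill

gdf = gpd.GeoDataFrame(
    {"fid": [1]}, geometry=[box(174.0, -41.0, 175.0, -40.0)], crs=4326
)
cells = h3polyfill(gdf, 3)
print(len(cells), cells.head())
```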
vector2dggs/katana.py
CHANGED
@@ -11,13 +11,20 @@ Redistribution and use in source and binary forms, with or without modification,
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 
-from shapely.geometry import box, Polygon, MultiPolygon, GeometryCollection
+from shapely.geometry import (
+    box,
+    Polygon,
+    MultiPolygon,
+    LineString,
+    MultiLineString,
+    GeometryCollection,
+)
 from shapely.validation import explain_validity, make_valid
 
 
 def katana(geometry, threshold, count=0) -> GeometryCollection:
     """
-    Split a Polygon into two parts across its shortest dimension.
+    Split a geometry into two parts across its shortest dimension.
     Invalid input `geometry` will silently be made valid (if possible).
     """
     if geometry is None:
@@ -53,7 +60,7 @@ def katana(geometry, threshold, count=0) -> GeometryCollection:
         if not isinstance(c, GeometryCollection):
             c = GeometryCollection([c])
         for e in c.geoms:
-            if isinstance(e, (Polygon, MultiPolygon)):
+            if isinstance(e, (Polygon, MultiPolygon, LineString, MultiLineString)):
                 result.extend(katana(e, threshold, count + 1))
     if count > 0:
         return result
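
The katana change extends the recursive splitter to LineString/MultiLineString parts, matching the pipeline's new support for cutting line features. A short sketch (assuming vector2dggs 0.6.2) of the splitter on a polygon:

```python
# Sketch: a 10 x 10 square with threshold 5 is recursively split in two until
# each piece fits; the top-level call returns a GeometryCollection of pieces.
from shapely.geometry import box

from vector2dggs.katana import katana

pieces = katana(box(0.0, 0.0, 10.0, 10.0), 5.0)
print(len(pieces.geoms))                          # e.g. 4 quadrant-sized pieces
print(all(p.area <= 25.0 + 1e-9 for p in pieces.geoms))
```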
vector2dggs/rHP.py
ADDED
@@ -0,0 +1,217 @@
+import sys
+import click
+import click_log
+import tempfile
+import pyproj
+
+import rhppandas  # Necessary import despite lack of explicit use
+
+import pandas as pd
+import geopandas as gpd
+
+from typing import Union
+from pathlib import Path
+
+import vector2dggs.constants as const
+import vector2dggs.common as common
+
+from vector2dggs import __version__
+
+
+def rhp_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.rhp.rhp_to_parent(parent_res)
+
+
+def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
+    df_polygon = df[df.geom_type == "Polygon"]
+    if len(df_polygon.index) > 0:
+        df_polygon = df_polygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    df_multipolygon = df[df.geom_type == "MultiPolygon"]
+    if len(df_multipolygon.index) > 0:
+        df_multipolygon = df_multipolygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    # df_linestring = df[df.geom_type == "LineString"]
+    # if len(df_linestring.index) > 0:
+    #     df_linestring = (
+    #         df_linestring.h3.linetrace(resolution)
+    #         .explode("h3_linetrace")
+    #         .set_index("h3_linetrace")
+    #     )
+    # df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
+
+    return pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_multipolygon],  # df_linestring],
+        )
+    )
+
+
+@click.command(context_settings={"show_default": True})
+@click_log.simple_verbosity_option(common.LOGGER)
+@click.argument("vector_input", required=True, type=click.Path(), nargs=1)
+@click.argument("output_directory", required=True, type=click.Path(), nargs=1)
+@click.option(
+    "-r",
+    "--resolution",
+    required=True,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 resolution to index",
+    nargs=1,
+)
+@click.option(
+    "-pr",
+    "--parent_res",
+    required=False,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
+)
+@click.option(
+    "-id",
+    "--id_field",
+    required=False,
+    default=const.DEFAULTS["id"],
+    type=str,
+    help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
+    nargs=1,
+)
+@click.option(
+    "-k",
+    "--keep_attributes",
+    is_flag=True,
+    show_default=True,
+    default=const.DEFAULTS["k"],
+    help="Retain attributes in output. The default is to create an output that only includes rHEALPix cell ID and the ID given by the -id field (or the default index ID).",
+)
+@click.option(
+    "-ch",
+    "--chunksize",
+    required=True,
+    type=int,
+    default=const.DEFAULTS["ch"],
+    help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
+    nargs=1,
+)
+@click.option(
+    "-s",
+    "--spatial_sorting",
+    type=click.Choice(["hilbert", "morton", "geohash"]),
+    default=const.DEFAULTS["s"],
+    help="Spatial sorting method when perfoming spatial partitioning.",
+)
+@click.option(
+    "-crs",
+    "--cut_crs",
+    required=False,
+    default=const.DEFAULTS["crs"],
+    type=int,
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    nargs=1,
+)
+@click.option(
+    "-c",
+    "--cut_threshold",
+    required=True,
+    default=const.DEFAULTS["c"],
+    type=int,
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    nargs=1,
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=const.DEFAULTS["t"],
+    type=int,
+    help="Amount of threads used for operation",
+    nargs=1,
+)
+@click.option(
+    "-tbl",
+    "--table",
+    required=False,
+    default=const.DEFAULTS["tbl"],
+    type=str,
+    help="Name of the table to read when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "-g",
+    "--geom_col",
+    required=False,
+    default=const.DEFAULTS["g"],
+    type=str,
+    help="Column name to use when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "--tempdir",
+    default=const.DEFAULTS["tempdir"],
+    type=click.Path(),
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
+)
+@click.option("-o", "--overwrite", is_flag=True)
+@click.version_option(version=__version__)
+def rhp(
+    vector_input: Union[str, Path],
+    output_directory: Union[str, Path],
+    resolution: str,
+    parent_res: str,
+    id_field: str,
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_crs: int,
+    cut_threshold: int,
+    threads: int,
+    table: str,
+    geom_col: str,
+    tempdir: Union[str, Path],
+    overwrite: bool,
+):
+    """
+    Ingest a vector dataset and index it to the rHEALPix DGGS.
+
+    VECTOR_INPUT is the path to input vector geospatial data.
+    OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
+    """
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
+
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
+
+    if cut_crs is not None:
+        cut_crs = pyproj.CRS.from_user_input(cut_crs)
+
+    try:
+        common.index(
+            "rhp",
+            rhppolyfill,
+            rhp_secondary_index,
+            vector_input,
+            output_directory,
+            int(resolution),
+            parent_res,
+            keep_attributes,
+            chunksize,
+            spatial_sorting,
+            cut_threshold,
+            threads,
+            cut_crs=cut_crs,
+            id_field=id_field,
+            con=con,
+            table=table,
+            geom_col=geom_col,
+            overwrite=overwrite,
+        )
+    except:
+        raise
+    else:
+        sys.exit(0)
{vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/METADATA
CHANGED
@@ -1,35 +1,39 @@
 Metadata-Version: 2.1
 Name: vector2dggs
-Version: 0.6.0
+Version: 0.6.2
 Summary: CLI DGGS indexer for vector geospatial data
 Home-page: https://github.com/manaakiwhenua/vector2dggs
 License: LGPL-3.0-or-later
-Keywords: dggs,vector,h3,cli
+Keywords: dggs,vector,h3,rHEALPix,cli
 Author: James Ardo
 Author-email: ardoj@landcareresearch.co.nz
 Maintainer: Richard Law
 Maintainer-email: lawr@landcareresearch.co.nz
-Requires-Python: >=3.10,<4.0
+Requires-Python: >=3.11,<4.0
 Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Scientific/Engineering :: GIS
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
-Requires-Dist: click (>=8.1.
-Requires-Dist: dask (>=
-Requires-Dist: dask-geopandas (>=0.
-Requires-Dist: gdal (>=3.8
-Requires-Dist: geopandas (>=0.
-Requires-Dist: h3pandas (>=0.
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: click-log (>=0.4.0,<0.5.0)
+Requires-Dist: dask (>=2025.1,<2026.0)
+Requires-Dist: dask-geopandas (>=0.4,<0.5)
+Requires-Dist: gdal (>=3.8,<4.0)
+Requires-Dist: geopandas (>=1.0.1,<2.0.0)
+Requires-Dist: h3pandas (>=0.3,<0.4)
+Requires-Dist: numpy (>=2,<3)
+Requires-Dist: pillow (>=11.2.1,<12.0.0)
+Requires-Dist: psycopg2 (>=2.9.9,<3.0.0)
+Requires-Dist: pyarrow (>=20.0,<21.0)
+Requires-Dist: pyproj (>=3.7,<4.0)
+Requires-Dist: rhealpixdggs (>=0.5.5,<0.6.0)
+Requires-Dist: rhppandas (>=0.1.2,<0.2.0)
+Requires-Dist: shapely (>=2.1,<3.0)
+Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
+Requires-Dist: tqdm (>=4.67,<5.0)
 Project-URL: Repository, https://github.com/manaakiwhenua/vector2dggs
 Description-Content-Type: text/markdown
 
@@ -159,6 +163,8 @@ In brief, to get started:
 
 If you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.
 
+Alternatively, it is also possible to install using pip with `pip install -e .`, and bypass Poetry.
+
 #### Code formatting
 
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
@@ -187,14 +193,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
   title={{vector2dggs}},
   author={Ardo, James and Law, Richard},
   url={https://github.com/manaakiwhenua/vector2dggs},
-  version={0.6.0},
+  version={0.6.2},
   date={2023-04-20}
 }
 ```
 
 APA/Harvard
 
-> Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.6.2) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 
 [![manaakiwhenua-standards](https://github.com/manaakiwhenua/manaakiwhenua-standards/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)
 
vector2dggs-0.6.2.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+vector2dggs/__init__.py,sha256=w9t1Aj5a_f__PKPw_C7bWnZmWL3_GHrtgVrGYGX1wfk,27
+vector2dggs/cli.py,sha256=HoPp7Bwk2kZghAms6wNepx-bFhoAuHH7WXACMIy3MuM,652
+vector2dggs/common.py,sha256=DL3ohG-QQyI-phyxeO6Fi2BOwWnFct-I_Y87_XC2SRQ,10578
+vector2dggs/constants.py,sha256=u6n6XNvEVLUexn9Sb2rc22s2B4Rrg_VXFJaM7uEy-9Q,536
+vector2dggs/h3.py,sha256=GgiGOVbsXXNp95KWKKmJZvDxGFj91TTWl575OaPZ6yk,6145
+vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
+vector2dggs/rHP.py,sha256=Y36tPbtY-tYBUFILHD-xnUxa2yKlYotGP6043Bg5nZc,6450
+vector2dggs-0.6.2.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+vector2dggs-0.6.2.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+vector2dggs-0.6.2.dist-info/METADATA,sha256=kNT2Iyd8irBMo2Tq0_CwnORVNeCc1ekjO1TlMwBp6qY,10014
+vector2dggs-0.6.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vector2dggs-0.6.2.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+vector2dggs-0.6.2.dist-info/RECORD,,
vector2dggs-0.6.0.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-vector2dggs/__init__.py,sha256=8xTECrwGH36hEfgoQ2Zcq4dfigWVZmIFK3OHqNOg-FQ,27
-vector2dggs/cli.py,sha256=tL4NJ99uQsqoVinwYadna1a4ko5v2sdZaFaeDAj6QNE,599
-vector2dggs/h3.py,sha256=kX3S630l-LOm04pe5YT-5g99DIV0t32GFYUEs0Hc5ZQ,14354
-vector2dggs/katana.py,sha256=xx5R9lDuraWLK5bfGkiDQDC2r2naj_sKZlYeB52_xwc,3320
-vector2dggs-0.6.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
-vector2dggs-0.6.0.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
-vector2dggs-0.6.0.dist-info/METADATA,sha256=d8dvx5_wXFO-ZMKcdXmc2TuXQv5B5UFRo1hq4fghe8w,9777
-vector2dggs-0.6.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-vector2dggs-0.6.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-vector2dggs-0.6.0.dist-info/RECORD,,
{vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/COPYING: File without changes
{vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/COPYING.LESSER: File without changes
{vector2dggs-0.6.0.dist-info → vector2dggs-0.6.2.dist-info}/entry_points.txt: File without changes