vector2dggs 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector2dggs/__init__.py +1 -1
- vector2dggs/cli.py +4 -0
- vector2dggs/common.py +23 -27
- vector2dggs/constants.py +47 -0
- vector2dggs/geohash.py +240 -0
- vector2dggs/h3.py +8 -4
- vector2dggs/katana.py +1 -1
- vector2dggs/rHP.py +12 -14
- vector2dggs/s2.py +349 -0
- {vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/METADATA +64 -8
- vector2dggs-0.8.0.dist-info/RECORD +15 -0
- vector2dggs-0.6.2.dist-info/RECORD +0 -13
- {vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/COPYING +0 -0
- {vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/COPYING.LESSER +0 -0
- {vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/WHEEL +0 -0
- {vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/entry_points.txt +0 -0
vector2dggs/__init__.py
CHANGED
@@ -1 +1 @@
-__version__: str = "0.6.2"
+__version__: str = "0.8.0"
vector2dggs/cli.py
CHANGED
@@ -3,6 +3,8 @@ import click
 from vector2dggs import __version__
 from vector2dggs.h3 import h3
 from vector2dggs.rHP import rhp
+from vector2dggs.s2 import s2
+from vector2dggs.geohash import geohash

 # If the program does terminal interaction, make it output a short
 # notice like this when it starts in an interactive mode:
@@ -21,6 +23,8 @@ def cli():

 cli.add_command(h3)
 cli.add_command(rhp)
+cli.add_command(s2)
+cli.add_command(geohash)


 def main():
vector2dggs/common.py
CHANGED
@@ -104,7 +104,9 @@ def drop_condition(
     _diff = _before - _after
     if _diff:
         log_method = (
-            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warning
+            LOGGER.info
+            if (_diff / float(_before)) < warning_threshold
+            else LOGGER.warning
         )
         log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
     return df
@@ -118,22 +120,17 @@ def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int):

     Used for intermediate re-partioning.
     """
-    if dggs == "h3":
-        return (
-            parent_res
-            if parent_res is not None
-            else max(const.MIN_H3, (resolution - const.DEFAULT_PARENT_OFFSET))
-        )
-    elif dggs == "rhp":
-        return (
-            parent_res
-            if parent_res is not None
-            else max(const.MIN_RHP, (resolution - const.DEFAULT_PARENT_OFFSET))
-        )
-    else:
+    if not dggs in const.DEFAULT_DGGS_PARENT_RES.keys():
         raise RuntimeError(
-            "Unknown dggs {dggs}) - must be one of [ 'h3', 'rhp' ]".format(dggs=dggs)
+            "Unknown dggs {dggs}) - must be one of [ {options} ]".format(
+                dggs=dggs, options=", ".join(const.DEFAULT_DGGS_PARENT_RES.keys())
+            )
         )
+    return (
+        parent_res
+        if parent_res is not None
+        else const.DEFAULT_DGGS_PARENT_RES[dggs](resolution)
+    )


 def parent_partitioning(
@@ -179,25 +176,23 @@ def polyfill(
 ) -> None:
     """
     Reads a geoparquet, performs polyfilling (for Polygon),
-    linetracing (for LineString), and writes out to parquet.
+    linetracing (for LineString), or indexing (for Point),
+    and writes out to parquet.
     """
     df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
     if len(df.index) == 0:
-        # Input is empty, nothing to polyfill
+        # Input is empty, nothing to convert
         return None

-    # DGGS specific
+    # DGGS specific conversion
     df = dggsfunc(df, resolution)

     if len(df.index) == 0:
-        #
+        # Conversion resulted in empty output (e.g. large cell, small feature)
         return None

     df.index.rename(f"{dggs}_{resolution:02}", inplace=True)
     parent_res: int = get_parent_res(dggs, parent_res, resolution)
-    # print(parent_res)
-    # print(df.index)
-    # print(df.columns)

     # Secondary (parent) index, used later for partitioning
     df = secondary_index_func(df, parent_res)
@@ -233,7 +228,7 @@ def index(
     overwrite: bool = False,
 ) -> Path:
     """
-    Performs multi-threaded polyfilling
+    Performs multi-threaded DGGS indexing on geometries (including multipart and collections).
     """

     if table and con:
@@ -291,7 +286,8 @@ def index(
             "index": lambda frame: frame[
                 (frame.geometry.geom_type != "Polygon")
                 & (frame.geometry.geom_type != "LineString")
-            ],
+                & (frame.geometry.geom_type != "Point")
+            ],
             "message": "Considering unsupported geometries",
         },
     ]
@@ -314,9 +310,9 @@ def index(

     filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))

-    # Multithreaded polyfilling
+    # Multithreaded DGGS indexing
     LOGGER.debug(
-        "Polyfilling by spatial partitions with resolution: %d",
+        "DGGS indexing by spatial partitions with resolution: %d",
         resolution,
     )
     with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
@@ -344,7 +340,7 @@ def index(

     parent_partitioning(
         dggs,
-        tmpdir2,
+        Path(tmpdir2),
         output_directory,
         resolution,
         parent_res,
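The substantive change in `common.py` is that `get_parent_res` now resolves per-DGGS default parent resolutions through the `const.DEFAULT_DGGS_PARENT_RES` lookup table (added in `constants.py`, next section) instead of an `if`/`elif` chain per DGGS. A minimal behavioural sketch, with the table inlined — this standalone framing is an illustration, not the packaged module:

```python
# Sketch of the refactored get_parent_res behaviour, with the constants.py
# lookup table inlined. Names follow the diff; illustration only.
DEFAULT_PARENT_OFFSET = 6
MIN_H3 = 0

DEFAULT_DGGS_PARENT_RES = {
    "h3": lambda resolution: max(MIN_H3, resolution - DEFAULT_PARENT_OFFSET),
}

def get_parent_res(dggs, parent_res, resolution):
    if dggs not in DEFAULT_DGGS_PARENT_RES:
        raise RuntimeError(f"Unknown dggs {dggs}")
    # An explicit parent resolution wins; otherwise use the per-DGGS default
    return (
        parent_res
        if parent_res is not None
        else DEFAULT_DGGS_PARENT_RES[dggs](resolution)
    )

assert get_parent_res("h3", None, 9) == 3  # 9 - 6
assert get_parent_res("h3", None, 4) == 0  # clamped at MIN_H3
assert get_parent_res("h3", 5, 9) == 5     # explicit value respected
```

Because the table holds lambdas, evaluation is deferred to call time — which is also why `constants.py` can define the table before `DEFAULT_PARENT_OFFSET` is assigned a few lines later.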
vector2dggs/constants.py
CHANGED
@@ -5,6 +5,8 @@ import tempfile

 MIN_H3, MAX_H3 = 0, 15
 MIN_RHP, MAX_RHP = 0, 15
+MIN_S2, MAX_S2 = 0, 30
+MIN_GEOHASH, MAX_GEOHASH = 0, 12

 DEFAULTS = {
     "id": None,
@@ -19,8 +21,53 @@ DEFAULTS = {
     "tempdir": tempfile.tempdir,
 }

+DEFAULT_DGGS_PARENT_RES = {
+    "h3": lambda resolution: max(MIN_H3, (resolution - DEFAULT_PARENT_OFFSET)),
+    "rhp": lambda resolution: max(MIN_RHP, (resolution - DEFAULT_PARENT_OFFSET)),
+    "geohash": lambda resolution: max(
+        MIN_GEOHASH, (resolution - DEFAULT_PARENT_OFFSET)
+    ),
+    "s2": lambda resolution: max(MIN_S2, (resolution - DEFAULT_PARENT_OFFSET)),
+}
+
 DEFAULT_PARENT_OFFSET = 6

+# http://s2geometry.io/resources/s2cell_statistics.html
+S2_CELLS_MAX_AREA_M2_BY_LEVEL = {
+    0: 85011012.19 * 1e6,
+    1: 21252753.05 * 1e6,
+    2: 6026521.16 * 1e6,
+    3: 1646455.50 * 1e6,
+    4: 413918.15 * 1e6,
+    5: 104297.91 * 1e6,
+    6: 26113.30 * 1e6,
+    7: 6529.09 * 1e6,
+    8: 1632.45 * 1e6,
+    9: 408.12 * 1e6,
+    10: 102.03 * 1e6,
+    11: 25.51 * 1e6,
+    12: 6.38 * 1e6,
+    13: 1.59 * 1e6,
+    14: 0.40 * 1e6,
+    15: 99638.93,
+    16: 24909.73,
+    17: 6227.43,
+    18: 1556.86,
+    19: 389.22,
+    20: 97.30,
+    21: 24.33,
+    22: 6.08,
+    23: 1.52,
+    24: 0.38,
+    25: 950.23 * 1e-4,
+    26: 237.56 * 1e-4,
+    27: 59.39 * 1e-4,
+    28: 14.85 * 1e-4,
+    29: 3.71 * 1e-4,
+    30: 0.93 * 1e-4,
+}
+
+
 warnings.filterwarnings(
     "ignore"
 )  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
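A note on the units in `S2_CELLS_MAX_AREA_M2_BY_LEVEL`: the upstream statistics table quotes levels 0–14 in km² (hence the `* 1e6` factors), levels 15–24 in plain m², and levels 25–30 in cm² (hence `* 1e-4`), all normalising to m². Average cell area shrinks roughly 4× per level, so a quick sanity check across the unit boundaries — a sketch, not part of the package:

```python
# Sanity check (illustrative): adjacent levels differ ~4x, including across
# the table's unit boundaries (km^2 -> m^2 at level 15, m^2 -> cm^2 at 25).
S2_AREA_M2 = {14: 0.40 * 1e6, 15: 99638.93, 16: 24909.73, 24: 0.38, 25: 950.23 * 1e-4}

for hi, lo in [(14, 15), (15, 16), (24, 25)]:
    ratio = S2_AREA_M2[hi] / S2_AREA_M2[lo]
    assert 3.5 < ratio < 4.5, (hi, lo, ratio)
```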
vector2dggs/geohash.py
ADDED
@@ -0,0 +1,240 @@
+import sys
+import click
+import click_log
+import tempfile
+import pyproj
+
+from geohash_polygon import polygon_to_geohashes  # rusty-polygon-geohasher
+from geohash import encode, decode  # python-geohash
+
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Point, Polygon
+
+from typing import Union
+from pathlib import Path
+
+import vector2dggs.constants as const
+import vector2dggs.common as common
+
+from vector2dggs import __version__
+
+
+def gh_secondary_index(df: pd.DataFrame, parent_level: int) -> pd.DataFrame:
+    df[f"geohash_{parent_level:02}"] = df.index.to_series().str[:parent_level]
+    return df
+
+
+# NB this implements a point-inside hash, but geohash_polygon only supports "within" or "intersects" (on the basis of geohashes as _polygon_ geometries) which means we have to perform additional computation to support "polyfill" as defined by H3
+# A future version of vector2dggs may support within/intersects modality, at which point that would just be outer/inner with no further computation
+def _polygon_to_geohashes(polygon: Polygon, level: int) -> set[str]:
+    # Function to compute geohash set for one polygon geometry
+    outer: set[str] = polygon_to_geohashes(polygon, level, inner=False)
+    inner: set[str] = polygon_to_geohashes(polygon, level, inner=True)
+    edge: set[str] = {
+        h
+        for h in (outer - inner)  # All edge cells
+        if Point(*reversed(decode(h))).within(polygon)
+    }  # Edge cells with a center within the polygon
+    return edge | inner
+
+
+def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
+    gh_col = f"geohash"
+    df_polygon = df[df.geom_type == "Polygon"].copy()
+    if not df_polygon.empty:
+        df_polygon = (
+            df_polygon.assign(
+                **{
+                    gh_col: df_polygon.geometry.apply(
+                        lambda geom: _polygon_to_geohashes(geom, level)
+                    )
+                }
+            )
+            .explode(gh_col, ignore_index=True)
+            .set_index(gh_col)
+        )
+
+    # TODO linestring support
+    # e.g. JS implementation https://github.com/alrico88/geohashes-along
+
+    df_point = df[df.geom_type == "Point"].copy()
+    if len(df_point.index) > 0:
+        df_point[gh_col] = df_point.geometry.apply(
+            lambda geom: encode(geom.y, geom.x, precision=level)
+        )
+        df_point = df_point.set_index(gh_col)
+
+    return pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_point],
+        )
+    )
+
+
+@click.command(context_settings={"show_default": True})
+@click_log.simple_verbosity_option(common.LOGGER)
+@click.argument("vector_input", required=True, type=click.Path(), nargs=1)
+@click.argument("output_directory", required=True, type=click.Path(), nargs=1)
+@click.option(
+    "-r",
+    "--resolution",
+    "level",
+    required=True,
+    type=click.Choice(list(map(str, range(const.MIN_GEOHASH, const.MAX_GEOHASH + 1)))),
+    help="Geohash level to index",
+    nargs=1,
+)
+@click.option(
+    "-pr",
+    "--parent_res",
+    "parent_level",
+    required=False,
+    type=click.Choice(list(map(str, range(const.MIN_GEOHASH, const.MAX_GEOHASH + 1)))),
+    help="Geohash parent level for the output partition. Defaults to resolution - 6",
+)
+@click.option(
+    "-id",
+    "--id_field",
+    required=False,
+    default=const.DEFAULTS["id"],
+    type=str,
+    help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
+    nargs=1,
+)
+@click.option(
+    "-k",
+    "--keep_attributes",
+    is_flag=True,
+    show_default=True,
+    default=const.DEFAULTS["k"],
+    help="Retain attributes in output. The default is to create an output that only includes Geohash cell ID and the ID given by the -id field (or the default index ID).",
+)
+@click.option(
+    "-ch",
+    "--chunksize",
+    required=True,
+    type=int,
+    default=const.DEFAULTS["ch"],
+    help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
+    nargs=1,
+)
+@click.option(
+    "-s",
+    "--spatial_sorting",
+    type=click.Choice(["hilbert", "morton", "geohash"]),
+    default=const.DEFAULTS["s"],
+    help="Spatial sorting method when perfoming spatial partitioning.",
+)
+@click.option(
+    "-crs",
+    "--cut_crs",
+    required=False,
+    default=const.DEFAULTS["crs"],
+    type=int,
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    nargs=1,
+)
+@click.option(
+    "-c",
+    "--cut_threshold",
+    required=True,
+    default=const.DEFAULTS["c"],
+    type=int,
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    nargs=1,
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=const.DEFAULTS["t"],
+    type=int,
+    help="Amount of threads used for operation",
+    nargs=1,
+)
+@click.option(
+    "-tbl",
+    "--table",
+    required=False,
+    default=const.DEFAULTS["tbl"],
+    type=str,
+    help="Name of the table to read when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "-g",
+    "--geom_col",
+    required=False,
+    default=const.DEFAULTS["g"],
+    type=str,
+    help="Column name to use when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "--tempdir",
+    default=const.DEFAULTS["tempdir"],
+    type=click.Path(),
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
+)
+@click.option("-o", "--overwrite", is_flag=True)
+@click.version_option(version=__version__)
+def geohash(
+    vector_input: Union[str, Path],
+    output_directory: Union[str, Path],
+    level: str,
+    parent_level: str,
+    id_field: str,
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_crs: int,
+    cut_threshold: int,
+    threads: int,
+    table: str,
+    geom_col: str,
+    tempdir: Union[str, Path],
+    overwrite: bool,
+):
+    """
+    Ingest a vector dataset and index it using the Geohash geocode system.
+
+    VECTOR_INPUT is the path to input vector geospatial data.
+    OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
+    """
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
+
+    common.check_resolutions(level, parent_level)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
+
+    if cut_crs is not None:
+        cut_crs = pyproj.CRS.from_user_input(cut_crs)
+
+    try:
+        common.index(
+            "geohash",
+            gh_polyfill,
+            gh_secondary_index,
+            vector_input,
+            output_directory,
+            int(level),
+            parent_level,
+            keep_attributes,
+            chunksize,
+            spatial_sorting,
+            cut_threshold,
+            threads,
+            cut_crs=cut_crs,
+            id_field=id_field,
+            con=con,
+            table=table,
+            geom_col=geom_col,
+            overwrite=overwrite,
+        )
+    except:
+        raise
+    else:
+        sys.exit(0)
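The `_polygon_to_geohashes` helper above exists because `geohash_polygon` only offers "within" (`inner=True`) and "intersects" (`inner=False`) coverings, while vector2dggs wants the H3-style "cell centre inside" polyfill. A compact sketch of that rule in isolation — the library calls are the same ones used in the diff, but the square polygon is a made-up input:

```python
# "Centre inside" polyfill: keep all fully-contained cells, plus edge cells
# whose centre point lies within the polygon. Toy input; calls as in the diff.
from geohash_polygon import polygon_to_geohashes  # rusty-polygon-geohasher
from geohash import decode  # python-geohash; decode(h) returns (lat, lon)
from shapely.geometry import Point, Polygon

square = Polygon([(174.0, -41.0), (174.1, -41.0), (174.1, -40.9), (174.0, -40.9)])
level = 6

outer = polygon_to_geohashes(square, level, inner=False)  # "intersects" cover
inner = polygon_to_geohashes(square, level, inner=True)   # "within" cover
edge = {h for h in outer - inner if Point(*reversed(decode(h))).within(square)}
cells = inner | edge
```

`gh_secondary_index` is also worth noting: Geohash is hierarchical by prefix, so the parent cell at level *k* is simply the first *k* characters of the child's hash — no library call needed.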
vector2dggs/h3.py
CHANGED
@@ -18,13 +18,13 @@ import vector2dggs.common as common
 from vector2dggs import __version__


-def h3_secondary_index(df: pd.DataFrame, parent_res: int):
+def h3_secondary_index(df: pd.DataFrame, parent_res: int) -> pd.DataFrame:
     return df.h3.h3_to_parent(parent_res)


-def h3polyfill(df: gpd.GeoDataFrame, resolution: int):
+def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
     df_polygon = df[df.geom_type == "Polygon"]
-    if len(df_polygon.index) > 0:
+    if not df_polygon.empty:
         df_polygon = df_polygon.h3.polyfill_resample(
             resolution, return_geometry=False
         ).drop(columns=["index"])
@@ -38,10 +38,14 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int):
     )
     df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]

+    df_point = df[df.geom_type == "Point"]
+    if len(df_point.index) > 0:
+        df_point = df_point.h3.geo_to_h3(resolution, set_index=True)
+
     return pd.concat(
         map(
             lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
-            [df_polygon, df_linestring],
+            [df_polygon, df_linestring, df_point],
         )
     )

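The `h3.py` change adds Point support by delegating to h3pandas' `geo_to_h3`, alongside the existing Polygon (`polyfill_resample`) and LineString branches. A minimal usage sketch — the GeoDataFrame and coordinates here are illustrative:

```python
# Sketch: indexing point geometries the way the new Point branch does.
import geopandas as gpd
import h3pandas  # noqa: F401 -- registers the .h3 accessor
from shapely.geometry import Point

pts = gpd.GeoDataFrame(
    {"name": ["wellington"]},
    geometry=[Point(174.7762, -41.2865)],
    crs="EPSG:4326",
)
indexed = pts.h3.geo_to_h3(9, set_index=True)  # index becomes the H3 cell ID
print(indexed.index[0])  # a resolution-9 H3 cell token
```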
vector2dggs/katana.py
CHANGED
@@ -33,7 +33,7 @@ def katana(geometry, threshold, count=0) -> GeometryCollection:
     if not geometry.is_valid:
         # print(explain_validity(geometry))
         geometry = make_valid(geometry)
-    if geometry.geometryType() == "GeometryCollection":
+    if geometry.geom_type == "GeometryCollection":
         geometry.normalize()
         geometry = geometry.buffer(0)
     bounds = geometry.bounds
vector2dggs/rHP.py
CHANGED
@@ -18,36 +18,34 @@ import vector2dggs.common as common
 from vector2dggs import __version__


-def rhp_secondary_index(df: pd.DataFrame, parent_res: int):
+def rhp_secondary_index(df: pd.date_range, parent_res: int) -> pd.DataFrame:
     return df.rhp.rhp_to_parent(parent_res)


-def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
+def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
     df_polygon = df[df.geom_type == "Polygon"]
     if len(df_polygon.index) > 0:
         df_polygon = df_polygon.rhp.polyfill_resample(
             resolution, return_geometry=False
         ).drop(columns=["index"])

-    df_multipolygon = df[df.geom_type == "MultiPolygon"]
-    if len(df_multipolygon.index) > 0:
-        df_multipolygon = df_multipolygon.rhp.polyfill_resample(
-            resolution, return_geometry=False
-        ).drop(columns=["index"])
-
     # df_linestring = df[df.geom_type == "LineString"]
     # if len(df_linestring.index) > 0:
     #     df_linestring = (
-    #         df_linestring.
-    #         .explode("
-    #         .set_index("
+    #         df_linestring.rhp.linetrace(resolution)
+    #         .explode("rhp_linetrace")
+    #         .set_index("rhp_linetrace")
     #     )
     # df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]

+    df_point = df[df.geom_type == "Point"]
+    if len(df_point.index) > 0:
+        df_point = df_point.rhp.geo_to_rhp(resolution, set_index=True)
+
     return pd.concat(
         map(
             lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
-            [df_polygon, df_multipolygon],
+            [df_polygon, df_point],
         )
     )

@@ -61,7 +59,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
     "--resolution",
     required=True,
     type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
-    help="H3 resolution to index",
+    help="rHEALPix resolution to index",
     nargs=1,
 )
 @click.option(
@@ -69,7 +67,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
     "--parent_res",
     required=False,
     type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
-    help="Parent resolution for the output partition. Defaults to resolution - 6",
+    help="rHEALPix Parent resolution for the output partition. Defaults to resolution - 6",
 )
 @click.option(
     "-id",
vector2dggs/s2.py
ADDED
@@ -0,0 +1,349 @@
+import sys
+import click
+import click_log
+import tempfile
+import pyproj
+from math import ceil
+
+from s2geometry import pywraps2 as S2
+
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import box, Polygon, LineString, Point
+from shapely.ops import transform
+from pyproj import CRS, Transformer
+
+from typing import Union
+from pathlib import Path
+
+import vector2dggs.constants as const
+import vector2dggs.common as common
+
+from vector2dggs import __version__
+
+
+def s2_secondary_index(df: pd.DataFrame, parent_level: int) -> pd.DataFrame:
+    # NB also converts the index to S2 cell tokens
+    index_series = df.index.to_series().astype(object)
+    df[f"s2_{parent_level:02}"] = index_series.map(
+        lambda cell_id: cell_id.parent(parent_level).ToToken()
+    )
+    df.index = index_series.map(lambda cell_id: cell_id.ToToken())
+    return df
+
+
+def bbox_area_in_m2(
+    geom: Polygon,
+    src_crs: Union[str, CRS] = "EPSG:4326",
+    dst_crs: Union[str, CRS] = "EPSG:6933",
+) -> float:
+    """
+    Calculate the area of the bounding box of a geometry in square meters.
+    """
+    minx, miny, maxx, maxy = geom.bounds
+    bbox = box(minx, miny, maxx, maxy)
+    transformer = Transformer.from_crs(src_crs, dst_crs, always_xy=True)
+    projected_bbox = transform(transformer.transform, bbox)
+    return projected_bbox.area
+
+
+def max_cells_for_geom(
+    geom: Union[Polygon, LineString], level: int, margin: float = 1.02
+) -> int:
+    """
+    Calculate the maximum number of S2 cells that are appropriate for the given geometry and level.
+    This is based on the area of the geometry's bounding box,
+    and the maximum area of S2 cells at the given level.
+    """
+    area = bbox_area_in_m2(geom)
+    max_cells = ceil(max(1, area / const.S2_CELLS_MAX_AREA_M2_BY_LEVEL[level]))
+    return ceil(max_cells * margin)
+
+
+def cell_center_is_inside_polygon(cell: S2.S2CellId, polygon: S2.S2Polygon) -> bool:
+    """Determines if the center of the S2 cell is inside the polygon"""
+    cell_center = S2.S2Cell(cell).GetCenter()
+    return polygon.Contains(cell_center)
+
+
+def s2_polyfill_polygons(df: gpd.GeoDataFrame, level: int) -> gpd.GeoDataFrame:
+
+    def generate_s2_covering(
+        geom: Polygon, level: int, centroid_inside: bool = True
+    ) -> set[S2.S2CellId]:
+        # Prepare loops: first the exterior loop, then the interior loops
+        loops = []
+        # Exterior ring
+        latlngs = [
+            S2.S2LatLng.FromDegrees(lat, lon) for lon, lat in geom.exterior.coords
+        ]
+        s2loop = S2.S2Loop([latlng.ToPoint() for latlng in latlngs])
+        s2loop.Normalize()
+        loops.append(s2loop)
+
+        # Interior rings (polygon holes)
+        for interior in geom.interiors:
+            interior_latlngs = [
+                S2.S2LatLng.FromDegrees(lat, lon) for lon, lat in interior.coords
+            ]
+            s2interior_loop = S2.S2Loop(
+                [latlng.ToPoint() for latlng in interior_latlngs]
+            )
+            s2interior_loop.Normalize()
+            loops.append(s2interior_loop)
+
+        # Build an S2Polygon from the loops
+        s2polygon = S2.S2Polygon()
+        s2polygon.InitNested(loops)
+
+        # Use S2RegionCoverer to get the cell IDs at the specified level
+        coverer = S2.S2RegionCoverer()
+
+        max_cells = max_cells_for_geom(geom, level)
+        coverer.set_max_cells(max_cells)
+        coverer.set_min_level(level)
+        coverer.set_max_level(level)
+
+        covering: list[S2.S2CellId] = coverer.GetCovering(s2polygon)
+
+        if centroid_inside:
+            # Coverings are "intersects" modality, polyfill is "centre inside" modality
+            # ergo, filter out covering cells that are not inside the polygon
+            covering = {
+                cell
+                for cell in covering
+                if cell_center_is_inside_polygon(cell, s2polygon)
+            }
+        else:
+            set(covering)
+
+        return covering
+
+    df["s2index"] = df["geometry"].apply(lambda geom: generate_s2_covering(geom, level))
+    df = df[
+        df["s2index"].map(lambda x: len(x) > 0)
+    ]  # Remove rows with no covering at this level
+
+    return df
+
+
+def s2_cell_ids_from_linestring(
+    linestring: LineString, level: int
+) -> list[S2.S2CellId]:
+    latlngs = [S2.S2LatLng.FromDegrees(lat, lon) for lon, lat in linestring.coords]
+    polyline = S2.S2Polyline(latlngs)
+
+    coverer = S2.S2RegionCoverer()
+    max_cells = max_cells_for_geom(linestring, level)
+    coverer.set_max_cells(max_cells)
+    coverer.set_min_level(level)
+    coverer.set_max_level(level)
+
+    return coverer.GetCovering(polyline)
+
+
+def s2_cell_id_from_point(geom: Point, level: int) -> S2.S2CellId:
+    """
+    Convert a point geometry to an S2 cell at the specified level.
+    """
+    latlng = S2.S2LatLng.FromDegrees(geom.y, geom.x)
+    return S2.S2CellId(latlng).parent(level)
+
+
+def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
+
+    df_polygon = df[df.geom_type == "Polygon"].copy()
+    if len(df_polygon.index) > 0:
+        df_polygon = (
+            s2_polyfill_polygons(df_polygon, level)
+            .explode("s2index")
+            .set_index("s2index")
+        )
+
+    df_linestring = df[df.geom_type == "LineString"].copy()
+    if len(df_linestring.index) > 0:
+        df_linestring["s2index"] = df_linestring.geometry.apply(
+            lambda geom: s2_cell_ids_from_linestring(geom, level)
+        )
+        df_linestring = df_linestring.explode("s2index").set_index("s2index")
+
+    df_point = df[df.geom_type == "Point"].copy()
+    if len(df_point.index) > 0:
+        df_point["s2index"] = df_point.geometry.apply(
+            lambda geom: s2_cell_id_from_point(geom, level)
+        )
+        df_point = df_point.set_index("s2index")
+
+    return pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_linestring, df_point],
+        )
+    )
+
+
+@click.command(context_settings={"show_default": True})
+@click_log.simple_verbosity_option(common.LOGGER)
+@click.argument("vector_input", required=True, type=click.Path(), nargs=1)
+@click.argument("output_directory", required=True, type=click.Path(), nargs=1)
+@click.option(
+    "-r",
+    "--resolution",
+    "level",
+    required=True,
+    type=click.Choice(list(map(str, range(const.MIN_S2, const.MAX_S2 + 1)))),
+    help="S2 level to index",
+    nargs=1,
+)
+@click.option(
+    "-pr",
+    "--parent_res",
+    "parent_level",
+    required=False,
+    type=click.Choice(list(map(str, range(const.MIN_S2, const.MAX_S2 + 1)))),
+    help="S2 parent level for the output partition. Defaults to resolution - 6",
+)
+@click.option(
+    "-id",
+    "--id_field",
+    required=False,
+    default=const.DEFAULTS["id"],
+    type=str,
+    help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
+    nargs=1,
+)
+@click.option(
+    "-k",
+    "--keep_attributes",
+    is_flag=True,
+    show_default=True,
+    default=const.DEFAULTS["k"],
+    help="Retain attributes in output. The default is to create an output that only includes S2 cell ID and the ID given by the -id field (or the default index ID).",
+)
+@click.option(
+    "-ch",
+    "--chunksize",
+    required=True,
+    type=int,
+    default=const.DEFAULTS["ch"],
+    help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
+    nargs=1,
+)
+@click.option(
+    "-s",
+    "--spatial_sorting",
+    type=click.Choice(["hilbert", "morton", "geohash"]),
+    default=const.DEFAULTS["s"],
+    help="Spatial sorting method when perfoming spatial partitioning.",
+)
+@click.option(
+    "-crs",
+    "--cut_crs",
+    required=False,
+    default=const.DEFAULTS["crs"],
+    type=int,
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    nargs=1,
+)
+@click.option(
+    "-c",
+    "--cut_threshold",
+    required=True,
+    default=const.DEFAULTS["c"],
+    type=int,
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    nargs=1,
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=const.DEFAULTS["t"],
+    type=int,
+    help="Amount of threads used for operation",
+    nargs=1,
+)
+@click.option(
+    "-tbl",
+    "--table",
+    required=False,
+    default=const.DEFAULTS["tbl"],
+    type=str,
+    help="Name of the table to read when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "-g",
+    "--geom_col",
+    required=False,
+    default=const.DEFAULTS["g"],
+    type=str,
+    help="Column name to use when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "--tempdir",
+    default=const.DEFAULTS["tempdir"],
+    type=click.Path(),
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
+)
+@click.option("-o", "--overwrite", is_flag=True)
+@click.version_option(version=__version__)
+def s2(
+    vector_input: Union[str, Path],
+    output_directory: Union[str, Path],
+    level: str,
+    parent_level: str,
+    id_field: str,
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_crs: int,
+    cut_threshold: int,
+    threads: int,
+    table: str,
+    geom_col: str,
+    tempdir: Union[str, Path],
+    overwrite: bool,
+):
+    """
+    Ingest a vector dataset and index it to the S2 DGGS.
+
+    VECTOR_INPUT is the path to input vector geospatial data.
+    OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
+    """
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
+
+    common.check_resolutions(level, parent_level)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
+
+    if cut_crs is not None:
+        cut_crs = pyproj.CRS.from_user_input(cut_crs)
+
+    try:
+        common.index(
+            "s2",
+            s2_polyfill,
+            s2_secondary_index,
+            vector_input,
+            output_directory,
+            int(level),
+            parent_level,
+            keep_attributes,
+            chunksize,
+            spatial_sorting,
+            cut_threshold,
+            threads,
+            cut_crs=cut_crs,
+            id_field=id_field,
+            con=con,
+            table=table,
+            geom_col=geom_col,
+            overwrite=overwrite,
+        )
+    except:
+        raise
+    else:
+        sys.exit(0)
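`S2RegionCoverer` requires a `max_cells` budget, so `max_cells_for_geom` estimates one from the geometry's bounding-box area (reprojected to the equal-area EPSG:6933) divided by the largest cell area at the target level, padded by a small margin. The arithmetic, worked as a standalone sketch with one `constants.py` entry inlined — the 50 km² bounding box is an invented input:

```python
# Worked sketch of the max-cells budget used above; one constants.py entry
# inlined. The 50 km^2 bounding-box area is illustrative.
from math import ceil

S2_CELLS_MAX_AREA_M2_BY_LEVEL = {12: 6.38 * 1e6}  # level 12: ~6.38 km^2

def max_cells(bbox_area_m2: float, level: int, margin: float = 1.02) -> int:
    cells = ceil(max(1, bbox_area_m2 / S2_CELLS_MAX_AREA_M2_BY_LEVEL[level]))
    return ceil(cells * margin)  # slight headroom over the naive bound

print(max_cells(50_000_000, 12))  # -> 9 cells allowed for a 50 km^2 bbox
```

Also note `s2_secondary_index`: it converts both indexes from `S2CellId` objects to hex tokens via `ToToken()`, which is what lets the README example further below rebuild cell geometry with `s2sphere.CellId.from_token`.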
{vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vector2dggs
-Version: 0.6.2
+Version: 0.8.0
 Summary: CLI DGGS indexer for vector geospatial data
 Home-page: https://github.com/manaakiwhenua/vector2dggs
 License: LGPL-3.0-or-later
@@ -29,8 +29,11 @@ Requires-Dist: pillow (>=11.2.1,<12.0.0)
 Requires-Dist: psycopg2 (>=2.9.9,<3.0.0)
 Requires-Dist: pyarrow (>=20.0,<21.0)
 Requires-Dist: pyproj (>=3.7,<4.0)
+Requires-Dist: python-geohash (>=0.8.5,<0.9.0)
 Requires-Dist: rhealpixdggs (>=0.5.5,<0.6.0)
 Requires-Dist: rhppandas (>=0.1.2,<0.2.0)
+Requires-Dist: rusty-polygon-geohasher (>=0.2.3,<0.3.0)
+Requires-Dist: s2geometry (>=0.9.0,<0.10.0)
 Requires-Dist: shapely (>=2.1,<3.0)
 Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
 Requires-Dist: tqdm (>=4.67,<5.0)
@@ -45,9 +48,17 @@ Python-based CLI tool to index raster files to DGGS in parallel, writing out to

 This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).

-Currently
+Currently this tool supports the following DGGSs:

-
+- [H3](https://h3geo.org/)
+- [rHEALPix](https://datastore.landcareresearch.co.nz/dataset/rhealpix-discrete-global-grid-system) (points, polygons)
+- [S2](https://s2geometry.io/)
+
+... and the following geocode systems:
+
+- [Geohash](https://en.wikipedia.org/wiki/Geohash) (points, polygons)
+
+Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.

 

@@ -59,6 +70,22 @@ pip install vector2dggs

 ## Usage

+```bash
+vector2dggs --help
+
+Usage: vector2dggs [OPTIONS] COMMAND [ARGS]...
+
+Options:
+  --version  Show the version and exit.
+  --help     Show this message and exit.
+
+Commands:
+  geohash  Ingest a vector dataset and index it using the Geohash geocode...
+  h3       Ingest a vector dataset and index it to the H3 DGGS.
+  rhp      Ingest a vector dataset and index it to the rHEALPix DGGS.
+  s2       Ingest a vector dataset and index it to the S2 DGGS.
+```
+
 ```bash
 vector2dggs h3 --help
 Usage: vector2dggs h3 [OPTIONS] VECTOR_INPUT OUTPUT_DIRECTORY
@@ -121,9 +148,9 @@ Options:

 Output is in the Apache Parquet format, a directory with one file per partition.

-For a quick view of your output, you can read Apache Parquet with pandas, and then use h3-pandas and geopandas to convert this into a GeoPackage or GeoParquet for visualisation in a desktop GIS, such as QGIS. The Apache Parquet output is indexed by an ID column (which you can specify), so it should be ready for two intended use-cases:
+For a quick view of your output, you can read Apache Parquet with pandas, and then use tools like h3-pandas and geopandas to convert this into a GeoPackage or GeoParquet for visualisation in a desktop GIS, such as QGIS. The Apache Parquet output is indexed by an ID column (which you can specify), so it should be ready for two intended use-cases:
 - Joining attribute data from the original feature-level data onto computer DGGS cells.
-- Joining other data to this output on the
+- Joining other data to this output on the DGGS cell ID. (The output has a column like `{dggs}_\d`, e.g. `h3_09` or `h3_12` according to the target resolution, zero-padded to account for the maximum resolution of the DGGS)

 Geoparquet output (hexagon boundaries):

@@ -150,6 +177,34 @@ h3_12
 >>> g.to_parquet('./output-data/parcels.12.geo.parquet')
 ```

+An example for S2 output (using `s2sphere`):
+
+
+```python
+import pandas as pd
+import geopandas as gpd
+import s2sphere
+from shapely.geometry import Polygon
+
+RES = 18
+df = pd.read_parquet(f'~/output-data/ponds-with-holes.s2.{RES}.pq')
+df = df.reset_index()
+
+def s2id_to_polygon(s2_id_hex):
+    cell_id = s2sphere.CellId.from_token(s2_id_hex)
+    cell = s2sphere.Cell(cell_id)
+    vertices = []
+    for i in range(4):
+        vertex = cell.get_vertex(i)
+        lat_lng = s2sphere.LatLng.from_point(vertex)
+        vertices.append((lat_lng.lng().degrees, lat_lng.lat().degrees))  # (lon, lat)
+    return Polygon(vertices)
+
+df['geometry'] = df[f's2_{RES}'].apply(s2id_to_polygon)
+df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')  # WGS84
+df.to_parquet(f'sample-{RES}.parquet')
+```
+
 ### For development

 In brief, to get started:
@@ -157,8 +212,9 @@ In brief, to get started:
 - Install [Poetry](https://python-poetry.org/docs/basic-usage/)
 - Install [GDAL](https://gdal.org/)
   - If you're on Windows, `pip install gdal` may be necessary before running the subsequent commands.
-  - On Linux, install GDAL 3.
+  - On Linux, install GDAL 3.8+ according to your platform-specific instructions, including development headers, i.e. `libgdal-dev`.
 - Create the virtual environment with `poetry init`. This will install necessary dependencies.
+  - If the installation of `s2geometry` fails, you may require SWIG to build it. (A command like `conda install swig` or `sudo dnf install swig` depending on your platform).
 - Subsequently, the virtual environment can be re-activated with `poetry shell`.

 If you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.
@@ -193,14 +249,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
     title={{vector2dggs}},
     author={Ardo, James and Law, Richard},
     url={https://github.com/manaakiwhenua/vector2dggs},
-    version={0.6.2},
+    version={0.8.0},
     date={2023-04-20}
 }
 ```

 APA/Harvard

-> Ardo, J., & Law, R. (2023). vector2dggs (0.6.2) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.8.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs

 [![manaakiwhenua-standards](https://github.com/manaakiwhenua/manaakiwhenua-standards/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

vector2dggs-0.8.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+vector2dggs/__init__.py,sha256=MBs_zNH91Wn_gMAiPSI9xF7r-us5pgElKJBoafyySfc,27
+vector2dggs/cli.py,sha256=d_4skD62k6pXUWgDdVHbDwpe4A4yo62ZFx8Cp_6GpBA,767
+vector2dggs/common.py,sha256=yLYIWAbvlgY_tcTCRIzeqDjkT95LbIUqpHhrvm6TREE,10458
+vector2dggs/constants.py,sha256=I89XzsKO5vQlMkVFwDnVS2nO5MtLGRYeCOSunLU9Dfg,1694
+vector2dggs/geohash.py,sha256=BfBpgMYpZqH0j7hGf00qo0Q484DeRt7Afil5QJO8ZGI,7527
+vector2dggs/h3.py,sha256=LYZe4GgGjp0kBoqm_L14yYtgLLBS3lnkHi_pEbd7tYM,6303
+vector2dggs/katana.py,sha256=Z3RFB92DsIZ069Bz0mKyYBKtOxd2dPxYvRy6M-MyRsM,3412
+vector2dggs/rHP.py,sha256=oURBnbVTR5vZtcxN1LVI67A0vNbzPicogaYB_9Hwdvg,6353
+vector2dggs/s2.py,sha256=cyhvdXZjYKPCIgLJMG2ZChrbvsMKzLJcIa3bdwMdmVc,10828
+vector2dggs-0.8.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+vector2dggs-0.8.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+vector2dggs-0.8.0.dist-info/METADATA,sha256=Dq8-j2-IeQZa92xRRmN7RC8-OJTTIKjZTv7WZL7Noz8,11586
+vector2dggs-0.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vector2dggs-0.8.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+vector2dggs-0.8.0.dist-info/RECORD,,
vector2dggs-0.6.2.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
-vector2dggs/__init__.py,sha256=w9t1Aj5a_f__PKPw_C7bWnZmWL3_GHrtgVrGYGX1wfk,27
-vector2dggs/cli.py,sha256=HoPp7Bwk2kZghAms6wNepx-bFhoAuHH7WXACMIy3MuM,652
-vector2dggs/common.py,sha256=DL3ohG-QQyI-phyxeO6Fi2BOwWnFct-I_Y87_XC2SRQ,10578
-vector2dggs/constants.py,sha256=u6n6XNvEVLUexn9Sb2rc22s2B4Rrg_VXFJaM7uEy-9Q,536
-vector2dggs/h3.py,sha256=GgiGOVbsXXNp95KWKKmJZvDxGFj91TTWl575OaPZ6yk,6145
-vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
-vector2dggs/rHP.py,sha256=Y36tPbtY-tYBUFILHD-xnUxa2yKlYotGP6043Bg5nZc,6450
-vector2dggs-0.6.2.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-vector2dggs-0.6.2.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-vector2dggs-0.6.2.dist-info/METADATA,sha256=kNT2Iyd8irBMo2Tq0_CwnORVNeCc1ekjO1TlMwBp6qY,10014
-vector2dggs-0.6.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-vector2dggs-0.6.2.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
-vector2dggs-0.6.2.dist-info/RECORD,,
{vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/COPYING
File without changes
{vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/COPYING.LESSER
File without changes
{vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/WHEEL
File without changes
{vector2dggs-0.6.2.dist-info → vector2dggs-0.8.0.dist-info}/entry_points.txt
File without changes