PyPI - vector2dggs - Versions diffs - 0.5.3__tar.gz → 0.6.0__tar.gz - Mend

vector2dggs 0.5.3.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vector2dggs
-Version: 0.5.3
+Version: 0.6.0
 Summary: CLI DGGS indexer for vector geospatial data
 Home-page: https://github.com/manaakiwhenua/vector2dggs
 License: LGPL-3.0-or-later
@@ -21,12 +21,12 @@ Requires-Dist: click (>=8.1.3,<9.0.0)
 Requires-Dist: click-log (>=0.4.0,<0.5.0)
 Requires-Dist: dask (>=2023.3.0,<2024.0.0)
 Requires-Dist: dask-geopandas (>=0.3.0,<0.4.0)
-Requires-Dist: gdal (>=3.6.4,<4.0.0)
+Requires-Dist: gdal (>=3.8.0,<4.0.0)
 Requires-Dist: geopandas (>=0.12.2,<0.13.0)
-Requires-Dist: h3pandas (>=0.2.3,<0.3.0)
+Requires-Dist: h3pandas (>=0.2.6,<0.3.0)
 Requires-Dist: psycopg2 (>=2.9.6,<3.0.0)
-Requires-Dist: pyarrow (>=11.0.0,<12.0.0)
-Requires-Dist: pygeos (>=0.14,<0.15)
+Requires-Dist: pyarrow (>=14.0.1,<15.0.0)
+Requires-Dist: pygeos (>=0.13,<0.14)
 Requires-Dist: pyproj (>=3.5.0,<4.0.0)
 Requires-Dist: sqlalchemy (>=2.0.10,<3.0.0)
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
@@ -187,14 +187,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
   title={{vector2dggs}},
   author={Ardo, James and Law, Richard},
   url={https://github.com/manaakiwhenua/vector2dggs},
-  version={0.5.3},
+  version={0.6.0},
   date={2023-04-20}
 }
 ```
 APA/Harvard
-> Ardo, J., & Law, R. (2023). vector2dggs (0.5.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/README.md RENAMED Viewed

@@ -152,13 +152,13 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
   title={{vector2dggs}},
   author={Ardo, James and Law, Richard},
   url={https://github.com/manaakiwhenua/vector2dggs},
-  version={0.5.3},
+  version={0.6.0},
   date={2023-04-20}
 }
 ```
 APA/Harvard
-> Ardo, J., & Law, R. (2023). vector2dggs (0.5.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vector2dggs"
-version = "0.5.3"
+version = "0.6.0"
 description = "CLI DGGS indexer for vector geospatial data"
 authors = ["James Ardo <ardoj@landcareresearch.co.nz>"]
 maintainers = ["Richard Law <lawr@landcareresearch.co.nz>"]
@@ -16,16 +16,16 @@ classifiers = [
 [tool.poetry.dependencies]
 python = "^3.10"
-gdal = "^3.6.4"
+gdal = "^3.8.0"
 geopandas = "^0.12.2"
-h3pandas = "^0.2.3"
+h3pandas = "^0.2.6"
 dask-geopandas = "^0.3.0"
 dask = "^2023.3.0"
 click = "^8.1.3"
 tqdm = "^4.65.0"
 click-log = "^0.4.0"
-pyarrow = "^11.0.0"
-pygeos = "^0.14"
+pyarrow = "^14.0.1"
+pygeos = "^0.13"
 pyproj = "^3.5.0"
 sqlalchemy = "^2.0.10"
 psycopg2 = "^2.9.6"

vector2dggs-0.6.0/setup.py ADDED Viewed

@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+from setuptools import setup
+packages = \
+['vector2dggs']
+package_data = \
+{'': ['*']}
+install_requires = \
+['click-log>=0.4.0,<0.5.0',
+ 'click>=8.1.3,<9.0.0',
+ 'dask-geopandas>=0.3.0,<0.4.0',
+ 'dask>=2023.3.0,<2024.0.0',
+ 'gdal>=3.8.0,<4.0.0',
+ 'geopandas>=0.12.2,<0.13.0',
+ 'h3pandas>=0.2.6,<0.3.0',
+ 'psycopg2>=2.9.6,<3.0.0',
+ 'pyarrow>=14.0.1,<15.0.0',
+ 'pygeos>=0.13,<0.14',
+ 'pyproj>=3.5.0,<4.0.0',
+ 'sqlalchemy>=2.0.10,<3.0.0',
+ 'tqdm>=4.65.0,<5.0.0']
+entry_points = \
+{'console_scripts': ['vector2dggs = vector2dggs.cli:main']}
+setup_kwargs = {
+    'name': 'vector2dggs',
+    'version': '0.6.0',
+    'description': 'CLI DGGS indexer for vector geospatial data',
+    'long_description': '# vector2dggs\n\n[![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)\n\nPython-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.\n\nThis is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).\n\nCurrently only supports H3 DGGS, and probably has other limitations since it has been developed for a specific internal use case, though it is intended as a general-purpose abstraction. Contributions, suggestions, bug reports and strongly worded letters are all welcome.\n\nCurrently only supports polygons; but both coverages (strictly non-overlapping polygons), and sets of polygons that do/may overlap, are supported. Overlapping polygons are captured by ensuring that DGGS cell IDs may be non-unique (repeated) in the output.\n\n![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")\n\n## Installation\n\n```bash\npip install vector2dggs\n```\n\n## Usage\n\n```bash\nvector2dggs h3 --help\nUsage: vector2dggs h3 [OPTIONS] VECTOR_INPUT OUTPUT_DIRECTORY\n\n  Ingest a vector dataset and index it to the H3 DGGS.\n\n  VECTOR_INPUT is the path to input vector geospatial data. OUTPUT_DIRECTORY\n  should be a directory, not a file or database table, as it will instead be\n  the write location for an Apache Parquet data store.\n\nOptions:\n  -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or\n                                  DEBUG  [default: INFO]\n  -r, --resolution [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15]\n                                  H3 resolution to index  [required]\n  -pr, --parent_res [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15]\n                                  H3 Parent resolution for the output\n                                  partition. 
Defaults to resolution - 6\n  -id, --id_field TEXT            Field to use as an ID; defaults to a\n                                  constructed single 0...n index on the\n                                  original feature order.\n  -k, --keep_attributes           Retain attributes in output. The default is\n                                  to create an output that only includes H3\n                                  cell ID and the ID given by the -id field\n                                  (or the default index ID).\n  -ch, --chunksize INTEGER        The number of rows per index partition to\n                                  use when spatially partioning. Adjusting\n                                  this number will trade off memory use and\n                                  time.  [default: 50; required]\n  -s, --spatial_sorting [hilbert|morton|geohash]\n                                  Spatial sorting method when perfoming\n                                  spatial partitioning.  [default: hilbert]\n  -crs, --cut_crs INTEGER         Set the coordinate reference system (CRS)\n                                  used for cutting large polygons (see `--cur-\n                                  threshold`). Defaults to the same CRS as the\n                                  input. Should be a valid EPSG code.\n  -c, --cut_threshold INTEGER     Cutting up large polygons into smaller\n                                  pieces based on a target length. 
Units are\n                                  assumed to match the input CRS units unless\n                                  the `--cut_crs` is also given, in which case\n                                  units match the units of the supplied CRS.\n                                  [default: 5000; required]\n  -t, --threads INTEGER           Amount of threads used for operation\n                                  [default: 7]\n  -tbl, --table TEXT              Name of the table to read when using a\n                                  spatial database connection as input\n  -g, --geom_col TEXT             Column name to use when using a spatial\n                                  database connection as input  [default:\n                                  geom]\n  --tempdir PATH                  Temporary data is created during the\n                                  execution of this program. This parameter\n                                  allows you to control where this data will\n                                  be written.\n  -o, --overwrite\n  --version                       Show the version and exit.\n  --help                          Show this message and exit.\n```\n\n## Visualising output\n\nOutput is in the Apache Parquet format, a directory with one file per partition.\n\nFor a quick view of your output, you can read Apache Parquet with pandas, and then use h3-pandas and geopandas to convert this into a GeoPackage or GeoParquet for visualisation in a desktop GIS, such as QGIS. The Apache Parquet output is indexed by an ID column (which you can specify), so it should be ready for two intended use-cases:\n- Joining attribute data from the original feature-level data onto computer DGGS cells.\n- Joining other data to this output on the H3 cell ID. (The output has a column like `h3_\\d{2}`, e.g. 
`h3_09` or `h3_12` according to the target resolution.)\n\nGeoparquet output (hexagon boundaries):\n\n```python\n>>> import pandas as pd\n>>> import h3pandas\n>>> g = pd.read_parquet(\'./output-data/nz-property-titles.12.parquet\').h3.h3_to_geo_boundary()\n>>> g\n                  title_no                                           geometry\nh3_12                                                                        \n8cbb53a734553ff  NA94D/635  POLYGON ((174.28483 -35.69315, 174.28482 -35.6...\n8cbb53a734467ff  NA94D/635  POLYGON ((174.28454 -35.69333, 174.28453 -35.6...\n8cbb53a734445ff  NA94D/635  POLYGON ((174.28416 -35.69368, 174.28415 -35.6...\n8cbb53a734551ff  NA94D/635  POLYGON ((174.28496 -35.69329, 174.28494 -35.6...\n8cbb53a734463ff  NA94D/635  POLYGON ((174.28433 -35.69335, 174.28432 -35.6...\n...                    ...                                                ...\n8cbb53a548b2dff  NA62D/324  POLYGON ((174.30249 -35.69369, 174.30248 -35.6...\n8cbb53a548b61ff  NA62D/324  POLYGON ((174.30232 -35.69402, 174.30231 -35.6...\n8cbb53a548b11ff  NA57C/785  POLYGON ((174.30140 -35.69348, 174.30139 -35.6...\n8cbb53a548b15ff  NA57C/785  POLYGON ((174.30161 -35.69346, 174.30160 -35.6...\n8cbb53a548b17ff  NA57C/785  POLYGON ((174.30149 -35.69332, 174.30147 -35.6...\n\n[52736 rows x 2 columns]\n>>> g.to_parquet(\'./output-data/parcels.12.geo.parquet\')\n```\n\n### For development\n\nIn brief, to get started:\n\n- Install [Poetry](https://python-poetry.org/docs/basic-usage/)\n- Install [GDAL](https://gdal.org/)\n    - If you\'re on Windows, `pip install gdal` may be necessary before running the subsequent commands.\n    - On Linux, install GDAL 3.6+ according to your platform-specific instructions, including development headers, i.e. `libgdal-dev`.\n- Create the virtual environment with `poetry init`. 
This will install necessary dependencies.\n- Subsequently, the virtual environment can be re-activated with `poetry shell`.\n\nIf you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.\n\n#### Code formatting\n\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n\nPlease run `black .` before committing.\n\n## Example commands\n\nWith a local GPKG:\n\n```bash\nvector2dggs h3 -v DEBUG -id title_no -r 12 -o ~/Downloads/nz-property-titles.gpkg ~/Downloads/nz-property-titles.parquet\n\n```\n\nWith a PostgreSQL/PostGIS connection:\n\n```bash\nvector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake postgresql://user:password@host:port/db ./topo50_lake.parquet\n```\n\n## Citation\n\n```bibtex\n@software{vector2dggs,\n  title={{vector2dggs}},\n  author={Ardo, James and Law, Richard},\n  url={https://github.com/manaakiwhenua/vector2dggs},\n  version={0.6.0},\n  date={2023-04-20}\n}\n```\n\nAPA/Harvard\n\n> Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs\n\n[![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)\n',
+    'author': 'James Ardo',
+    'author_email': 'ardoj@landcareresearch.co.nz',
+    'maintainer': 'Richard Law',
+    'maintainer_email': 'lawr@landcareresearch.co.nz',
+    'url': 'https://github.com/manaakiwhenua/vector2dggs',
+    'packages': packages,
+    'package_data': package_data,
+    'install_requires': install_requires,
+    'entry_points': entry_points,
+    'python_requires': '>=3.10,<4.0',
+}
+setup(**setup_kwargs)

vector2dggs-0.6.0/vector2dggs/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__: str = "0.6.0"

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/vector2dggs/h3.py RENAMED Viewed

@@ -41,6 +41,7 @@ warnings.filterwarnings(
 DEFAULT_PARENT_OFFSET = 6
 DEFAULT_CHUNK_SIZE = 50
 class ParentResolutionException(Exception):
     pass
@@ -68,29 +69,45 @@ def polyfill(
     output_directory: str,
 ) -> None:
     """
-    Reads a geoparquet, performs H3 polyfilling,
-    and writes out to parquet.
+    Reads a geoparquet, performs H3 polyfilling (for polygons),
+    linetracing (for linestrings), and writes out to parquet.
     """
-    df = (
-        gpd.read_parquet(pq_in)
-        .reset_index()
-        .drop(columns=[spatial_sort_col])
-    )
+    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
     if len(df.index) == 0:
         # Input is empty, nothing to polyfill
         return None
-    df = df.h3.polyfill_resample(resolution, return_geometry=False)
+    df_polygon = df[df.geom_type == "Polygon"]
+    if len(df_polygon.index) > 0:
+        df_polygon = df_polygon.h3.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+    df_linestring = df[df.geom_type == "LineString"]
+    if len(df_linestring.index) > 0:
+        df_linestring = (
+            df_linestring.h3.linetrace(resolution)
+            .explode("h3_linetrace")
+            .set_index("h3_linetrace")
+        )
+        df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
+    df = pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_linestring],
+        )
+    )
     if len(df.index) == 0:
         # Polyfill resulted in empty output (e.g. large cell, small feature)
         return None
-    df = pd.DataFrame(df).drop(columns=["index", "geometry"])
     df.index.rename(f"h3_{resolution:02}", inplace=True)
     parent_res: int = _get_parent_res(parent_res, resolution)
     # Secondary (parent) H3 index, used later for partitioning
     df.h3.h3_to_parent(parent_res).to_parquet(
-        PurePath(output_directory, pq_in.name),
-        engine="auto",
-        compression="ZSTD"
+        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
     )
     return None
@@ -102,7 +119,7 @@ def polyfill_star(args) -> None:
 def _parent_partitioning(
     input_dir: Path,
     output_dir: Path,
-    resolution,
+    resolution: int,
     parent_res: Union[None, int],
     **kwargs,
 ) -> None:
@@ -129,6 +146,25 @@ def _parent_partitioning(
     return
+def drop_condition(
+    df: pd.DataFrame,
+    drop_index: pd.Index,
+    log_statement: str,
+    warning_threshold: float = 0.01,
+):
+    LOGGER.info(log_statement)
+    _before = len(df)
+    df = df.drop(drop_index)
+    _after = len(df)
+    _diff = _before - _after
+    if _diff:
+        log_method = (
+            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
+        )
+        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
+    return df
 def _index(
     input_file: Union[Path, str],
     output_directory: Union[Path, str],
@@ -193,11 +229,24 @@ def _index(
         .explode(index_parts=False)  # Explode from GeometryCollection
         .explode(index_parts=False)  # Explode multipolygons to polygons
     ).reset_index()
-    LOGGER.info("Dropping empty or null geometries")
-    df = (
-        df.drop(df[(df.geometry.is_empty|df.geometry.isna())].index)
-        .reset_index()
-    )
+    drop_conditions = [
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.is_empty | frame.geometry.isna())
+            ],
+            "message": "Dropping empty or null geometries",
+        },
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.geom_type != "Polygon")
+                & (frame.geometry.geom_type != "LineString")
+            ],  # NB currently points and other types are lost; in principle, these could be indexed
+            "message": "Dropping non-polygonal geometries",
+        },
+    ]
+    for condition in drop_conditions:
+        df = drop_condition(df, condition["index"](df).index, condition["message"])
     ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
@@ -336,7 +385,7 @@ def _index(
     "--tempdir",
     default=tempfile.tempdir,
     type=click.Path(),
-    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written."
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
 )
 @click.option("-o", "--overwrite", is_flag=True)
 @click.version_option(version=__version__)

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/vector2dggs/katana.py RENAMED Viewed

@@ -26,7 +26,7 @@ def katana(geometry, threshold, count=0) -> GeometryCollection:
     if not geometry.is_valid:
         # print(explain_validity(geometry))
         geometry = make_valid(geometry)
-        if geometry.type == 'GeometryCollection':
+        if geometry.type == "GeometryCollection":
             geometry.normalize()
         geometry = geometry.buffer(0)
     bounds = geometry.bounds

vector2dggs-0.5.3/vector2dggs/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-__version__: str = "0.5.3"

{vector2dggs-0.5.3 → vector2dggs-0.6.0}/vector2dggs/cli.py RENAMED Viewed

File without changes

vector2dggs 0.5.3__tar.gz → 0.6.0__tar.gz

vector2dggs 0.5.3.tar.gz → 0.6.0.tar.gz