vector2dggs 0.5.3__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vector2dggs
3
- Version: 0.5.3
3
+ Version: 0.6.0
4
4
  Summary: CLI DGGS indexer for vector geospatial data
5
5
  Home-page: https://github.com/manaakiwhenua/vector2dggs
6
6
  License: LGPL-3.0-or-later
@@ -21,12 +21,12 @@ Requires-Dist: click (>=8.1.3,<9.0.0)
21
21
  Requires-Dist: click-log (>=0.4.0,<0.5.0)
22
22
  Requires-Dist: dask (>=2023.3.0,<2024.0.0)
23
23
  Requires-Dist: dask-geopandas (>=0.3.0,<0.4.0)
24
- Requires-Dist: gdal (>=3.6.4,<4.0.0)
24
+ Requires-Dist: gdal (>=3.8.0,<4.0.0)
25
25
  Requires-Dist: geopandas (>=0.12.2,<0.13.0)
26
- Requires-Dist: h3pandas (>=0.2.3,<0.3.0)
26
+ Requires-Dist: h3pandas (>=0.2.6,<0.3.0)
27
27
  Requires-Dist: psycopg2 (>=2.9.6,<3.0.0)
28
- Requires-Dist: pyarrow (>=11.0.0,<12.0.0)
29
- Requires-Dist: pygeos (>=0.14,<0.15)
28
+ Requires-Dist: pyarrow (>=14.0.1,<15.0.0)
29
+ Requires-Dist: pygeos (>=0.13,<0.14)
30
30
  Requires-Dist: pyproj (>=3.5.0,<4.0.0)
31
31
  Requires-Dist: sqlalchemy (>=2.0.10,<3.0.0)
32
32
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
@@ -187,14 +187,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
187
187
  title={{vector2dggs}},
188
188
  author={Ardo, James and Law, Richard},
189
189
  url={https://github.com/manaakiwhenua/vector2dggs},
190
- version={0.5.3},
190
+ version={0.6.0},
191
191
  date={2023-04-20}
192
192
  }
193
193
  ```
194
194
 
195
195
  APA/Harvard
196
196
 
197
- > Ardo, J., & Law, R. (2023). vector2dggs (0.5.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
197
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
198
198
 
199
199
  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)
200
200
 
@@ -152,13 +152,13 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
152
152
  title={{vector2dggs}},
153
153
  author={Ardo, James and Law, Richard},
154
154
  url={https://github.com/manaakiwhenua/vector2dggs},
155
- version={0.5.3},
155
+ version={0.6.0},
156
156
  date={2023-04-20}
157
157
  }
158
158
  ```
159
159
 
160
160
  APA/Harvard
161
161
 
162
- > Ardo, J., & Law, R. (2023). vector2dggs (0.5.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
162
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
163
163
 
164
164
  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "vector2dggs"
3
- version = "0.5.3"
3
+ version = "0.6.0"
4
4
  description = "CLI DGGS indexer for vector geospatial data"
5
5
  authors = ["James Ardo <ardoj@landcareresearch.co.nz>"]
6
6
  maintainers = ["Richard Law <lawr@landcareresearch.co.nz>"]
@@ -16,16 +16,16 @@ classifiers = [
16
16
 
17
17
  [tool.poetry.dependencies]
18
18
  python = "^3.10"
19
- gdal = "^3.6.4"
19
+ gdal = "^3.8.0"
20
20
  geopandas = "^0.12.2"
21
- h3pandas = "^0.2.3"
21
+ h3pandas = "^0.2.6"
22
22
  dask-geopandas = "^0.3.0"
23
23
  dask = "^2023.3.0"
24
24
  click = "^8.1.3"
25
25
  tqdm = "^4.65.0"
26
26
  click-log = "^0.4.0"
27
- pyarrow = "^11.0.0"
28
- pygeos = "^0.14"
27
+ pyarrow = "^14.0.1"
28
+ pygeos = "^0.13"
29
29
  pyproj = "^3.5.0"
30
30
  sqlalchemy = "^2.0.10"
31
31
  psycopg2 = "^2.9.6"
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ from setuptools import setup
3
+
4
+ packages = \
5
+ ['vector2dggs']
6
+
7
+ package_data = \
8
+ {'': ['*']}
9
+
10
+ install_requires = \
11
+ ['click-log>=0.4.0,<0.5.0',
12
+ 'click>=8.1.3,<9.0.0',
13
+ 'dask-geopandas>=0.3.0,<0.4.0',
14
+ 'dask>=2023.3.0,<2024.0.0',
15
+ 'gdal>=3.8.0,<4.0.0',
16
+ 'geopandas>=0.12.2,<0.13.0',
17
+ 'h3pandas>=0.2.6,<0.3.0',
18
+ 'psycopg2>=2.9.6,<3.0.0',
19
+ 'pyarrow>=14.0.1,<15.0.0',
20
+ 'pygeos>=0.13,<0.14',
21
+ 'pyproj>=3.5.0,<4.0.0',
22
+ 'sqlalchemy>=2.0.10,<3.0.0',
23
+ 'tqdm>=4.65.0,<5.0.0']
24
+
25
+ entry_points = \
26
+ {'console_scripts': ['vector2dggs = vector2dggs.cli:main']}
27
+
28
+ setup_kwargs = {
29
+ 'name': 'vector2dggs',
30
+ 'version': '0.6.0',
31
+ 'description': 'CLI DGGS indexer for vector geospatial data',
32
+ 'long_description': '# vector2dggs\n\n[![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)\n\nPython-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.\n\nThis is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).\n\nCurrently only supports H3 DGGS, and probably has other limitations since it has been developed for a specific internal use case, though it is intended as a general-purpose abstraction. Contributions, suggestions, bug reports and strongly worded letters are all welcome.\n\nCurrently only supports polygons; but both coverages (strictly non-overlapping polygons), and sets of polygons that do/may overlap, are supported. Overlapping polygons are captured by ensuring that DGGS cell IDs may be non-unique (repeated) in the output.\n\n![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")\n\n## Installation\n\n```bash\npip install vector2dggs\n```\n\n## Usage\n\n```bash\nvector2dggs h3 --help\nUsage: vector2dggs h3 [OPTIONS] VECTOR_INPUT OUTPUT_DIRECTORY\n\n Ingest a vector dataset and index it to the H3 DGGS.\n\n VECTOR_INPUT is the path to input vector geospatial data. OUTPUT_DIRECTORY\n should be a directory, not a file or database table, as it will instead be\n the write location for an Apache Parquet data store.\n\nOptions:\n -v, --verbosity LVL Either CRITICAL, ERROR, WARNING, INFO or\n DEBUG [default: INFO]\n -r, --resolution [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15]\n H3 resolution to index [required]\n -pr, --parent_res [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15]\n H3 Parent resolution for the output\n partition. Defaults to resolution - 6\n -id, --id_field TEXT Field to use as an ID; defaults to a\n constructed single 0...n index on the\n original feature order.\n -k, --keep_attributes Retain attributes in output. The default is\n to create an output that only includes H3\n cell ID and the ID given by the -id field\n (or the default index ID).\n -ch, --chunksize INTEGER The number of rows per index partition to\n use when spatially partioning. Adjusting\n this number will trade off memory use and\n time. [default: 50; required]\n -s, --spatial_sorting [hilbert|morton|geohash]\n Spatial sorting method when perfoming\n spatial partitioning. [default: hilbert]\n -crs, --cut_crs INTEGER Set the coordinate reference system (CRS)\n used for cutting large polygons (see `--cur-\n threshold`). Defaults to the same CRS as the\n input. Should be a valid EPSG code.\n -c, --cut_threshold INTEGER Cutting up large polygons into smaller\n pieces based on a target length. Units are\n assumed to match the input CRS units unless\n the `--cut_crs` is also given, in which case\n units match the units of the supplied CRS.\n [default: 5000; required]\n -t, --threads INTEGER Amount of threads used for operation\n [default: 7]\n -tbl, --table TEXT Name of the table to read when using a\n spatial database connection as input\n -g, --geom_col TEXT Column name to use when using a spatial\n database connection as input [default:\n geom]\n --tempdir PATH Temporary data is created during the\n execution of this program. This parameter\n allows you to control where this data will\n be written.\n -o, --overwrite\n --version Show the version and exit.\n --help Show this message and exit.\n```\n\n## Visualising output\n\nOutput is in the Apache Parquet format, a directory with one file per partition.\n\nFor a quick view of your output, you can read Apache Parquet with pandas, and then use h3-pandas and geopandas to convert this into a GeoPackage or GeoParquet for visualisation in a desktop GIS, such as QGIS. The Apache Parquet output is indexed by an ID column (which you can specify), so it should be ready for two intended use-cases:\n- Joining attribute data from the original feature-level data onto computer DGGS cells.\n- Joining other data to this output on the H3 cell ID. (The output has a column like `h3_\\d{2}`, e.g. `h3_09` or `h3_12` according to the target resolution.)\n\nGeoparquet output (hexagon boundaries):\n\n```python\n>>> import pandas as pd\n>>> import h3pandas\n>>> g = pd.read_parquet(\'./output-data/nz-property-titles.12.parquet\').h3.h3_to_geo_boundary()\n>>> g\n title_no geometry\nh3_12 \n8cbb53a734553ff NA94D/635 POLYGON ((174.28483 -35.69315, 174.28482 -35.6...\n8cbb53a734467ff NA94D/635 POLYGON ((174.28454 -35.69333, 174.28453 -35.6...\n8cbb53a734445ff NA94D/635 POLYGON ((174.28416 -35.69368, 174.28415 -35.6...\n8cbb53a734551ff NA94D/635 POLYGON ((174.28496 -35.69329, 174.28494 -35.6...\n8cbb53a734463ff NA94D/635 POLYGON ((174.28433 -35.69335, 174.28432 -35.6...\n... ... ...\n8cbb53a548b2dff NA62D/324 POLYGON ((174.30249 -35.69369, 174.30248 -35.6...\n8cbb53a548b61ff NA62D/324 POLYGON ((174.30232 -35.69402, 174.30231 -35.6...\n8cbb53a548b11ff NA57C/785 POLYGON ((174.30140 -35.69348, 174.30139 -35.6...\n8cbb53a548b15ff NA57C/785 POLYGON ((174.30161 -35.69346, 174.30160 -35.6...\n8cbb53a548b17ff NA57C/785 POLYGON ((174.30149 -35.69332, 174.30147 -35.6...\n\n[52736 rows x 2 columns]\n>>> g.to_parquet(\'./output-data/parcels.12.geo.parquet\')\n```\n\n### For development\n\nIn brief, to get started:\n\n- Install [Poetry](https://python-poetry.org/docs/basic-usage/)\n- Install [GDAL](https://gdal.org/)\n - If you\'re on Windows, `pip install gdal` may be necessary before running the subsequent commands.\n - On Linux, install GDAL 3.6+ according to your platform-specific instructions, including development headers, i.e. `libgdal-dev`.\n- Create the virtual environment with `poetry init`. This will install necessary dependencies.\n- Subsequently, the virtual environment can be re-activated with `poetry shell`.\n\nIf you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.\n\n#### Code formatting\n\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n\nPlease run `black .` before committing.\n\n## Example commands\n\nWith a local GPKG:\n\n```bash\nvector2dggs h3 -v DEBUG -id title_no -r 12 -o ~/Downloads/nz-property-titles.gpkg ~/Downloads/nz-property-titles.parquet\n\n```\n\nWith a PostgreSQL/PostGIS connection:\n\n```bash\nvector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake postgresql://user:password@host:port/db ./topo50_lake.parquet\n```\n\n## Citation\n\n```bibtex\n@software{vector2dggs,\n title={{vector2dggs}},\n author={Ardo, James and Law, Richard},\n url={https://github.com/manaakiwhenua/vector2dggs},\n version={0.6.0},\n date={2023-04-20}\n}\n```\n\nAPA/Harvard\n\n> Ardo, J., & Law, R. (2023). vector2dggs (0.6.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs\n\n[![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)\n',
33
+ 'author': 'James Ardo',
34
+ 'author_email': 'ardoj@landcareresearch.co.nz',
35
+ 'maintainer': 'Richard Law',
36
+ 'maintainer_email': 'lawr@landcareresearch.co.nz',
37
+ 'url': 'https://github.com/manaakiwhenua/vector2dggs',
38
+ 'packages': packages,
39
+ 'package_data': package_data,
40
+ 'install_requires': install_requires,
41
+ 'entry_points': entry_points,
42
+ 'python_requires': '>=3.10,<4.0',
43
+ }
44
+
45
+
46
+ setup(**setup_kwargs)
@@ -0,0 +1 @@
1
+ __version__: str = "0.6.0"
@@ -41,6 +41,7 @@ warnings.filterwarnings(
41
41
  DEFAULT_PARENT_OFFSET = 6
42
42
  DEFAULT_CHUNK_SIZE = 50
43
43
 
44
+
44
45
  class ParentResolutionException(Exception):
45
46
  pass
46
47
 
@@ -68,29 +69,45 @@ def polyfill(
68
69
  output_directory: str,
69
70
  ) -> None:
70
71
  """
71
- Reads a geoparquet, performs H3 polyfilling,
72
- and writes out to parquet.
72
+ Reads a geoparquet, performs H3 polyfilling (for polygons),
73
+ linetracing (for linestrings), and writes out to parquet.
73
74
  """
74
- df = (
75
- gpd.read_parquet(pq_in)
76
- .reset_index()
77
- .drop(columns=[spatial_sort_col])
78
- )
75
+ df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
79
76
  if len(df.index) == 0:
80
77
  # Input is empty, nothing to polyfill
81
78
  return None
82
- df = df.h3.polyfill_resample(resolution, return_geometry=False)
79
+
80
+ df_polygon = df[df.geom_type == "Polygon"]
81
+ if len(df_polygon.index) > 0:
82
+ df_polygon = df_polygon.h3.polyfill_resample(
83
+ resolution, return_geometry=False
84
+ ).drop(columns=["index"])
85
+
86
+ df_linestring = df[df.geom_type == "LineString"]
87
+ if len(df_linestring.index) > 0:
88
+ df_linestring = (
89
+ df_linestring.h3.linetrace(resolution)
90
+ .explode("h3_linetrace")
91
+ .set_index("h3_linetrace")
92
+ )
93
+ df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
94
+
95
+ df = pd.concat(
96
+ map(
97
+ lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
98
+ [df_polygon, df_linestring],
99
+ )
100
+ )
101
+
83
102
  if len(df.index) == 0:
84
103
  # Polyfill resulted in empty output (e.g. large cell, small feature)
85
104
  return None
86
- df = pd.DataFrame(df).drop(columns=["index", "geometry"])
105
+
87
106
  df.index.rename(f"h3_{resolution:02}", inplace=True)
88
107
  parent_res: int = _get_parent_res(parent_res, resolution)
89
108
  # Secondary (parent) H3 index, used later for partitioning
90
109
  df.h3.h3_to_parent(parent_res).to_parquet(
91
- PurePath(output_directory, pq_in.name),
92
- engine="auto",
93
- compression="ZSTD"
110
+ PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
94
111
  )
95
112
  return None
96
113
 
@@ -102,7 +119,7 @@ def polyfill_star(args) -> None:
102
119
  def _parent_partitioning(
103
120
  input_dir: Path,
104
121
  output_dir: Path,
105
- resolution,
122
+ resolution: int,
106
123
  parent_res: Union[None, int],
107
124
  **kwargs,
108
125
  ) -> None:
@@ -129,6 +146,25 @@ def _parent_partitioning(
129
146
  return
130
147
 
131
148
 
149
+ def drop_condition(
150
+ df: pd.DataFrame,
151
+ drop_index: pd.Index,
152
+ log_statement: str,
153
+ warning_threshold: float = 0.01,
154
+ ):
155
+ LOGGER.info(log_statement)
156
+ _before = len(df)
157
+ df = df.drop(drop_index)
158
+ _after = len(df)
159
+ _diff = _before - _after
160
+ if _diff:
161
+ log_method = (
162
+ LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
163
+ )
164
+ log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
165
+ return df
166
+
167
+
132
168
  def _index(
133
169
  input_file: Union[Path, str],
134
170
  output_directory: Union[Path, str],
@@ -193,11 +229,24 @@ def _index(
193
229
  .explode(index_parts=False) # Explode from GeometryCollection
194
230
  .explode(index_parts=False) # Explode multipolygons to polygons
195
231
  ).reset_index()
196
- LOGGER.info("Dropping empty or null geometries")
197
- df = (
198
- df.drop(df[(df.geometry.is_empty|df.geometry.isna())].index)
199
- .reset_index()
200
- )
232
+
233
+ drop_conditions = [
234
+ {
235
+ "index": lambda frame: frame[
236
+ (frame.geometry.is_empty | frame.geometry.isna())
237
+ ],
238
+ "message": "Dropping empty or null geometries",
239
+ },
240
+ {
241
+ "index": lambda frame: frame[
242
+ (frame.geometry.geom_type != "Polygon")
243
+ & (frame.geometry.geom_type != "LineString")
244
+ ], # NB currently points and other types are lost; in principle, these could be indexed
245
+ "message": "Dropping non-polygonal geometries",
246
+ },
247
+ ]
248
+ for condition in drop_conditions:
249
+ df = drop_condition(df, condition["index"](df).index, condition["message"])
201
250
 
202
251
  ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
203
252
 
@@ -336,7 +385,7 @@ def _index(
336
385
  "--tempdir",
337
386
  default=tempfile.tempdir,
338
387
  type=click.Path(),
339
- help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written."
388
+ help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
340
389
  )
341
390
  @click.option("-o", "--overwrite", is_flag=True)
342
391
  @click.version_option(version=__version__)
@@ -26,7 +26,7 @@ def katana(geometry, threshold, count=0) -> GeometryCollection:
26
26
  if not geometry.is_valid:
27
27
  # print(explain_validity(geometry))
28
28
  geometry = make_valid(geometry)
29
- if geometry.type == 'GeometryCollection':
29
+ if geometry.type == "GeometryCollection":
30
30
  geometry.normalize()
31
31
  geometry = geometry.buffer(0)
32
32
  bounds = geometry.bounds
@@ -1 +0,0 @@
1
- __version__: str = "0.5.3"