vector2dggs 0.9.1__tar.gz → 0.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: vector2dggs
- Version: 0.9.1
+ Version: 0.10.1
  Summary: CLI DGGS indexer for vector geospatial data
- Home-page: https://github.com/manaakiwhenua/vector2dggs
  License: LGPL-3.0-or-later
  Keywords: dggs,vector,h3,rHEALPix,cli
  Author: James Ardo
@@ -14,6 +13,7 @@ Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or l
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering
  Classifier: Topic :: Scientific/Engineering :: GIS
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
@@ -30,6 +30,7 @@ Requires-Dist: psycopg2 (>=2.9.9,<3.0.0)
  Requires-Dist: pyarrow (>=20.0,<21.0)
  Requires-Dist: pyproj (>=3.7,<4.0)
  Requires-Dist: python-geohash (>=0.8.5,<0.9.0)
+ Requires-Dist: rhealpixdggs (>=0.5.12,<0.6.0)
  Requires-Dist: rhppandas (>=0.2.0,<0.3.0)
  Requires-Dist: rusty-polygon-geohasher (>=0.2.3,<0.3.0)
  Requires-Dist: s2geometry (>=0.9.0,<0.10.0)
@@ -43,7 +44,7 @@ Description-Content-Type: text/markdown

  [![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)

- Python-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.
+ Python-based CLI tool to index vector files to DGGS in parallel, writing out to Parquet.

  This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).

@@ -57,7 +58,7 @@ Currently this tool supports the following DGGSs:

  - [Geohash](https://en.wikipedia.org/wiki/Geohash) (points, polygons)

- Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
+ Contributions (especially for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.

  ![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")

@@ -114,23 +115,29 @@ Options:
                                  use when spatially partitioning. Adjusting
                                  this number will trade off memory use and
                                  time. [default: 50; required]
- -s, --spatial_sorting [hilbert|morton|geohash]
+ -s, --spatial_sorting [hilbert|morton|geohash|none]
                                  Spatial sorting method when performing
-                                 spatial partitioning. [default: hilbert]
+                                 spatial partitioning. [default: none]
  -crs, --cut_crs INTEGER         Set the coordinate reference system (CRS)
-                                 used for cutting large polygons (see `--cur-
-                                 threshold`). Defaults to the same CRS as the
-                                 input. Should be a valid EPSG code.
- -c, --cut_threshold INTEGER     Cutting up large polygons into smaller
-                                 pieces based on a target length. Units are
-                                 assumed to match the input CRS units unless
-                                 the `--cut_crs` is also given, in which case
-                                 units match the units of the supplied CRS.
-                                 [default: 5000; required]
+                                 used for cutting large geometries (see
+                                 `--cut_threshold`). Defaults to the same CRS
+                                 as the input. Should be a valid EPSG code.
+ -c, --cut_threshold INTEGER     Cutting up large geometries into smaller
+                                 geometries based on a target length. Units
+                                 are assumed to match the input CRS units
+                                 unless the `--cut_crs` is also given, in
+                                 which case units match the units of the
+                                 supplied CRS. [default: 5000; required]
  -t, --threads INTEGER           Amount of threads used for operation
-                                 [default: 7]
- -lyr, --layer TEXT              Name of the layer or table to read when using a
-                                 an input that supports layers or tables
+                                 [default: NUM_CPUS - 1]
+ -cp, --compression TEXT         Compression method to use for the output
+                                 Parquet files. Options include 'snappy',
+                                 'gzip', 'brotli', 'lz4', 'zstd', etc. Use
+                                 'none' for no compression. [default:
+                                 snappy]
+ -lyr, --layer TEXT              Name of the layer or table to read when
+                                 using an input that supports layers or
+                                 tables
  -g, --geom_col TEXT             Column name to use when using a spatial
                                  database connection as input [default:
                                  geom]
@@ -138,6 +145,8 @@ Options:
                                  execution of this program. This parameter
                                  allows you to control where this data will
                                  be written.
+ -co, --compact                  Compact the H3 cells up to the parent
+                                 resolution. Compaction requires an id_field.
  -o, --overwrite
  --version                       Show the version and exit.
  --help                          Show this message and exit.
@@ -187,7 +196,6 @@ from shapely.geometry import Polygon

  RES = 18
  df = pd.read_parquet(f'~/output-data/ponds-with-holes.s2.{RES}.pq')
- df = df.reset_index()

  def s2id_to_polygon(s2_id_hex):
      cell_id = s2sphere.CellId.from_token(s2_id_hex)
@@ -199,11 +207,17 @@ def s2id_to_polygon(s2_id_hex):
          vertices.append((lat_lng.lng().degrees, lat_lng.lat().degrees))  # (lon, lat)
      return Polygon(vertices)

- df['geometry'] = df[f's2_{RES}'].apply(s2id_to_polygon)
+ df['geometry'] = df.index.to_series().apply(s2id_to_polygon)
  df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326') # WGS84
  df.to_parquet(f'sample-{RES}.parquet')
  ```

+ ## Compaction
+
+ Compaction is supported with the `-co/--compact` argument. The result respects overlapping polygons by considering each feature independently. (In the example output below for rHEALPix, cells are shown with opacity; overlap is visible where there is a darker shade.) This does mean that the index of the result is not necessarily unique (unless your input is a vector _coverage_, i.e. it has no overlaps).
+
+ ![Example of compaction of overlapping vector features with the rHEALPix DGGS](docs/imgs/rhp-compaction-example.png)
+
  ### For development

  In brief, to get started:
@@ -248,14 +262,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -lyr topo50_lake
      title={{vector2dggs}},
      author={Ardo, James and Law, Richard},
      url={https://github.com/manaakiwhenua/vector2dggs},
-     version={0.9.1},
+     version={0.10.1},
      date={2023-04-20}
  }
  ```

  APA/Harvard

- > Ardo, J., & Law, R. (2023). vector2dggs (0.9.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.10.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs

  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

@@ -2,7 +2,7 @@

  [![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)

- Python-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.
+ Python-based CLI tool to index vector files to DGGS in parallel, writing out to Parquet.

  This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).

@@ -16,7 +16,7 @@ Currently this tool supports the following DGGSs:

  - [Geohash](https://en.wikipedia.org/wiki/Geohash) (points, polygons)

- Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
+ Contributions (especially for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.

  ![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")

@@ -73,23 +73,29 @@ Options:
                                  use when spatially partitioning. Adjusting
                                  this number will trade off memory use and
                                  time. [default: 50; required]
- -s, --spatial_sorting [hilbert|morton|geohash]
+ -s, --spatial_sorting [hilbert|morton|geohash|none]
                                  Spatial sorting method when performing
-                                 spatial partitioning. [default: hilbert]
+                                 spatial partitioning. [default: none]
  -crs, --cut_crs INTEGER         Set the coordinate reference system (CRS)
-                                 used for cutting large polygons (see `--cur-
-                                 threshold`). Defaults to the same CRS as the
-                                 input. Should be a valid EPSG code.
- -c, --cut_threshold INTEGER     Cutting up large polygons into smaller
-                                 pieces based on a target length. Units are
-                                 assumed to match the input CRS units unless
-                                 the `--cut_crs` is also given, in which case
-                                 units match the units of the supplied CRS.
-                                 [default: 5000; required]
+                                 used for cutting large geometries (see
+                                 `--cut_threshold`). Defaults to the same CRS
+                                 as the input. Should be a valid EPSG code.
+ -c, --cut_threshold INTEGER     Cutting up large geometries into smaller
+                                 geometries based on a target length. Units
+                                 are assumed to match the input CRS units
+                                 unless the `--cut_crs` is also given, in
+                                 which case units match the units of the
+                                 supplied CRS. [default: 5000; required]
  -t, --threads INTEGER           Amount of threads used for operation
-                                 [default: 7]
- -lyr, --layer TEXT              Name of the layer or table to read when using a
-                                 an input that supports layers or tables
+                                 [default: NUM_CPUS - 1]
+ -cp, --compression TEXT         Compression method to use for the output
+                                 Parquet files. Options include 'snappy',
+                                 'gzip', 'brotli', 'lz4', 'zstd', etc. Use
+                                 'none' for no compression. [default:
+                                 snappy]
+ -lyr, --layer TEXT              Name of the layer or table to read when
+                                 using an input that supports layers or
+                                 tables
  -g, --geom_col TEXT             Column name to use when using a spatial
                                  database connection as input [default:
                                  geom]
@@ -97,6 +103,8 @@ Options:
                                  execution of this program. This parameter
                                  allows you to control where this data will
                                  be written.
+ -co, --compact                  Compact the H3 cells up to the parent
+                                 resolution. Compaction requires an id_field.
  -o, --overwrite
  --version                       Show the version and exit.
  --help                          Show this message and exit.
@@ -146,7 +154,6 @@ from shapely.geometry import Polygon

  RES = 18
  df = pd.read_parquet(f'~/output-data/ponds-with-holes.s2.{RES}.pq')
- df = df.reset_index()

  def s2id_to_polygon(s2_id_hex):
      cell_id = s2sphere.CellId.from_token(s2_id_hex)
@@ -158,11 +165,17 @@ def s2id_to_polygon(s2_id_hex):
          vertices.append((lat_lng.lng().degrees, lat_lng.lat().degrees))  # (lon, lat)
      return Polygon(vertices)

- df['geometry'] = df[f's2_{RES}'].apply(s2id_to_polygon)
+ df['geometry'] = df.index.to_series().apply(s2id_to_polygon)
  df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326') # WGS84
  df.to_parquet(f'sample-{RES}.parquet')
  ```

+ ## Compaction
+
+ Compaction is supported with the `-co/--compact` argument. The result respects overlapping polygons by considering each feature independently. (In the example output below for rHEALPix, cells are shown with opacity; overlap is visible where there is a darker shade.) This does mean that the index of the result is not necessarily unique (unless your input is a vector _coverage_, i.e. it has no overlaps).
+
+ ![Example of compaction of overlapping vector features with the rHEALPix DGGS](docs/imgs/rhp-compaction-example.png)
+
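To make that caveat concrete, here is a minimal sketch (hypothetical feature ids and rHEALPix cell ids, assuming only pandas) of the duplicate-index situation described above:

```python
import pandas as pd

# Two overlapping features can both emit the same compacted cell, and
# compaction is applied per feature, so the cell-id index may repeat.
df = pd.DataFrame(
    {"fid": [1, 2]},
    index=pd.Index(["R88304", "R88304"], name="rhp_06"),
)
assert not df.index.is_unique
```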
  ### For development

  In brief, to get started:
@@ -207,13 +220,13 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -lyr topo50_lake
      title={{vector2dggs}},
      author={Ardo, James and Law, Richard},
      url={https://github.com/manaakiwhenua/vector2dggs},
-     version={0.9.1},
+     version={0.10.1},
      date={2023-04-20}
  }
  ```

  APA/Harvard

- > Ardo, J., & Law, R. (2023). vector2dggs (0.9.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.10.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs

  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "vector2dggs"
- version = "0.9.1"
+ version = "0.10.1"
  description = "CLI DGGS indexer for vector geospatial data"
  authors = ["James Ardo <ardoj@landcareresearch.co.nz>"]
  maintainers = ["Richard Law <lawr@landcareresearch.co.nz>"]
@@ -35,6 +35,7 @@ pillow = "^11.2.1"
  s2geometry = "^0.9.0"
  rusty-polygon-geohasher = "^0.2.3"
  python-geohash = "^0.8.5"
+ rhealpixdggs = "^0.5.12"

  [tool.poetry.group.dev.dependencies]
  pytest = "^7.2.2"
@@ -0,0 +1 @@
+ __version__: str = "0.10.1"
@@ -6,13 +6,14 @@ import click_log
  import sqlalchemy
  import shutil
  import pyproj
+ from uuid import uuid4

  import pandas as pd
  import geopandas as gpd
  import dask.dataframe as dd
  import dask_geopandas as dgpd

- from typing import Union, Callable
+ from typing import Union, Callable, Iterable
  from pathlib import Path, PurePath
  from urllib.parse import urlparse
  from tqdm import tqdm
@@ -36,6 +37,12 @@ class ParentResolutionException(Exception):
      pass


+ class IdFieldError(ValueError):
+     """Raised when an invalid or missing ID field is provided."""
+
+     pass
+
+
  def check_resolutions(resolution: int, parent_res: int) -> None:
      if parent_res is not None and not int(parent_res) < int(resolution):
          raise ParentResolutionException(
@@ -45,6 +52,73 @@ def check_resolutions(resolution: int, parent_res: int) -> None:
          )


+ def check_compaction_requirements(compact: bool, id_field: Union[str, None]) -> None:
+     if compact and not id_field:
+         raise IdFieldError(
+             "An id_field is required for compaction, in order to handle the potential for overlapping features"
+         )
+
+
+ def compaction(
+     df: pd.DataFrame,
+     res: int,
+     id_field: str,
+     col_order: list[str],
+     dggs_col: str,
+     compact_func: Callable[[Iterable[Union[str, int]]], Iterable[Union[str, int]]],
+     cell_to_child_func: Callable[[Union[str, int], int], Union[str, int]],
+ ):
+     """
+     Compacts a dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     df = df.reset_index(drop=False)
+
+     feature_cell_groups = (
+         df.groupby(id_field)[dggs_col].apply(lambda x: set(x)).to_dict()
+     )
+     feature_cell_compact = {
+         id: set(compact_func(cells)) for id, cells in feature_cell_groups.items()
+     }
+
+     uncompressable = {
+         id: feature_cell_groups[id] & feature_cell_compact[id]
+         for id in feature_cell_groups.keys()
+     }
+     compressable = {
+         id: feature_cell_compact[id] - feature_cell_groups[id]
+         for id in feature_cell_groups.keys()
+     }
+
+     # Get rows that cannot be compressed
+     mask = pd.Series([False] * len(df), index=df.index)  # Init bool mask
+     for key, value_set in uncompressable.items():
+         mask |= (df[id_field] == key) & (df[dggs_col].isin(value_set))
+     uncompressable_df = df[mask].set_index(dggs_col)
+
+     # Get rows that can be compressed
+     # Convert each compressed (coarser resolution) cell into a cell at
+     # the original resolution (usually using the centre child as reference)
+     compression_mapping = {
+         (id, cell_to_child_func(cell, res)): cell
+         for id, cells in compressable.items()
+         if cells
+         for cell in cells
+     }
+     mask = pd.Series([False] * len(df), index=df.index)
+     composite_key = f"composite_key_{uuid4()}"
+     # Update mask for compressible rows and prepare for replacement
+     get_composite_key = lambda row: (row[id_field], row[dggs_col])
+     df[composite_key] = df.apply(get_composite_key, axis=1)
+     mask |= df[composite_key].isin(compression_mapping)
+     compressable_df = df[mask].copy()
+     compressable_df[dggs_col] = compressable_df[composite_key].map(
+         compression_mapping
+     )  # Replace DGGS cell ID with compressed representation
+     compressable_df = compressable_df.set_index(dggs_col)
+
+     return pd.concat([compressable_df, uncompressable_df])[col_order]
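The per-feature pattern implemented above can be seen end-to-end in a toy example (a minimal sketch, assuming h3-py v4; coordinates and names are illustrative): cells are grouped per feature, compacted independently, and each coarser cell is re-keyed to the original resolution via its centre child.

```python
import h3  # h3-py v4
import pandas as pd

res = 8
parent = h3.latlng_to_cell(-41.29, 174.78, res - 1)
df = pd.DataFrame({"fid": "f1", "cell": h3.cell_to_children(parent, res)})

for fid, cells in df.groupby("fid")["cell"]:
    original = set(cells)
    compacted = set(h3.compact_cells(list(original)))
    unchanged = original & compacted           # cells left at resolution `res`
    coarser = compacted - original             # merged, coarser cells
    rekey = {h3.cell_to_center_child(c, res): c for c in coarser}
    print(fid, len(unchanged), rekey)          # f1 0 {<centre child>: <parent>}
```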
+
+
  def db_conn_and_input_path(
      vector_input: Union[str, Path],
  ) -> tuple[SQLConnectionType, Union[str, Path]]:
@@ -137,27 +211,59 @@ def parent_partitioning(
      dggs: str,
      input_dir: Path,
      output_dir: Path,
+     compaction_func: Union[Callable, None],
      resolution: int,
      parent_res: int,
+     id_field: str,
      **kwargs,
  ) -> None:
      partition_col = f"{dggs}_{parent_res:02}"
+     dggs_col = f"{dggs}_{resolution:02}"
+
+     # Read the parquet files into a Dask DataFrame
+     ddf = dd.read_parquet(input_dir, engine="pyarrow")
+     meta = ddf._meta
+
+     with TqdmCallback(
+         desc=f"Parent partitioning, writing {'compacted ' if compaction_func else ''}output"
+     ):
+         if compaction_func:
+             # Apply the compaction function to each partition
+             unique_parents = sorted(
+                 [v for v in ddf[partition_col].unique().compute() if pd.notna(v)]
+             )
+             divisions = unique_parents + [unique_parents[-1]]
+             ddf = (
+                 ddf.reset_index(drop=False)
+                 .dropna(subset=[partition_col])
+                 .set_index(partition_col)
+                 .repartition(divisions=divisions)
+                 .map_partitions(
+                     compaction_func,
+                     resolution,
+                     meta.columns.to_list(),  # Column order to be returned
+                     dggs_col,
+                     id_field,
+                     meta=meta,
+                 )
+             )

-     with TqdmCallback(desc="Repartitioning"):
-         dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+         ddf.to_parquet(
              output_dir,
              overwrite=kwargs.get("overwrite", False),
              engine=kwargs.get("engine", "pyarrow"),
-             partition_on=partition_col,
+             partition_on=[partition_col],
              compression=kwargs.get("compression", "ZSTD"),
+             # **kwargs
          )
-     LOGGER.debug("Parent cell repartitioning complete")

-     # Rename output to just be the partition key, suffix .parquet
+     LOGGER.debug("Parent cell partitioning complete")
+
+     # Append a .parquet suffix
      for f in os.listdir(output_dir):
          os.rename(
              os.path.join(output_dir, f),
-             os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+             os.path.join(output_dir, f.replace(f"{partition_col}=", "")),
          )

      return
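The `divisions` trick above relies on Dask semantics worth spelling out: divisions are the sorted index boundaries of the partitions, with the last division repeated so the final partition is inclusive, yielding exactly one partition per parent cell. A minimal sketch with toy data (assumes dask[dataframe] is installed):

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"parent": ["a", "a", "b", "b", "c"], "v": range(5)})
ddf = dd.from_pandas(pdf, npartitions=1).set_index("parent")

parents = sorted(pdf["parent"].unique())           # ["a", "b", "c"]
ddf = ddf.repartition(divisions=parents + [parents[-1]])
print(ddf.npartitions)                             # 3: one partition per parent
```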
@@ -172,6 +278,7 @@ def polyfill(
      resolution: int,
      parent_res: int,
      output_directory: str,
+     compression: str = "snappy",
  ) -> None:
      """
      Reads a geoparquet, performs polyfilling (for Polygon),
@@ -198,7 +305,7 @@ def polyfill(
          df = secondary_index_func(df, parent_res)

      df.to_parquet(
-         PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+         PurePath(output_directory, pq_in.name), engine="auto", compression=compression
      )
      return None
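The codec string received from `-cp/--compression` is handed straight through to pandas/pyarrow here; a quick sanity check of the accepted names (hypothetical output path, assuming a pyarrow build with all codecs):

```python
import pandas as pd

df = pd.DataFrame({"cell": ["89c25a31", "89c25a33"], "value": [1, 2]})
for codec in ("snappy", "gzip", "brotli", "lz4", "zstd"):
    df.to_parquet(f"/tmp/demo.{codec}.parquet", engine="pyarrow", compression=codec)
```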
@@ -211,6 +318,7 @@ def index(
      dggs: str,
      dggsfunc: Callable,
      secondary_index_func: Callable,
+     compaction_func: Union[Callable, None],
      input_file: Union[Path, str],
      output_directory: Union[Path, str],
      resolution: int,
@@ -220,6 +328,7 @@ def index(
      spatial_sorting: str,
      cut_threshold: int,
      processes: int,
+     compression: str = "snappy",
      id_field: str = None,
      cut_crs: pyproj.CRS = None,
      con: SQLConnectionType = None,
@@ -245,7 +354,7 @@ def index(
          )
      else:
          # Read file
-         df = gpd.read_file(input_file)
+         df = gpd.read_file(input_file, layer=layer)

      if cut_crs:
          df = df.to_crs(cut_crs)
@@ -329,6 +438,7 @@ def index(
                  resolution,
                  parent_res,
                  tmpdir2,
+                 compression,
              )
              for filepath in filepaths
          ]
@@ -344,9 +454,12 @@ def index(
          dggs,
          Path(tmpdir2),
          output_directory,
+         compaction_func,
          resolution,
          parent_res,
+         id_field,
          overwrite=overwrite,
+         compression=compression,
      )

      return output_directory
@@ -16,6 +16,7 @@ DEFAULTS = {
      "crs": None,
      "c": 5000,
      "t": (multiprocessing.cpu_count() - 1),
+     "cp": "snappy",
      "lyr": None,
      "g": "geom",
      "tempdir": tempfile.tempdir,
@@ -19,6 +19,8 @@ import vector2dggs.common as common

  from vector2dggs import __version__

+ GEOHASH_BASE32_SET = set("0123456789bcdefghjkmnpqrstuvwxyz")
+

  def gh_secondary_index(df: pd.DataFrame, parent_level: int) -> pd.DataFrame:
      df[f"geohash_{parent_level:02}"] = df.index.to_series().str[:parent_level]
@@ -73,6 +75,82 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def gh_children(geohash: str, desired_resolution: int) -> int:
+     """
+     Determine the number of children in the geohash refinement, given the number of additional character levels.
+     """
+     current_resolution = len(geohash)
+     additional_length = desired_resolution - current_resolution
+     return 32**additional_length  # Each additional character multiplies the number of cells by 32
+
+
+ def compact(cells: set[str]) -> set[str]:
+     """
+     Compact a set of geohash cells.
+     Cells must be at the same resolution.
+     """
+     current_set = set(cells)
+     while True:
+         parent_map = {}
+         for gh in current_set:
+             parent = gh[:-1]
+             if parent not in parent_map:
+                 parent_map[parent] = set()
+             parent_map[parent].add(gh)
+
+         next_set = set()
+         for parent, siblings in parent_map.items():
+             if len(siblings) == 32:
+                 next_set.add(parent)
+             else:
+                 next_set.update(siblings)
+
+         if next_set == current_set:
+             break
+         current_set = next_set
+
+     return current_set
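Two quick checks of the loop above (using the `compact` function as defined here): a complete set of 32 sibling geohashes collapses to their parent, while an incomplete set is left untouched:

```python
base32 = "0123456789bcdefghjkmnpqrstuvwxyz"

assert compact({"u4pr" + c for c in base32}) == {"u4pr"}   # full sibling set merges
assert compact({"u4pr0", "u4pr1"}) == {"u4pr0", "u4pr1"}   # partial set unchanged
```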
+
+
+ def get_central_child(geohash: str, precision: int):
+     """
+     Return an approximate central child of the geohash.
+     NB if only an arbitrary child is needed, use get_child_geohash
+     """
+     lat, lon = decode(geohash)
+     return encode(lat, lon, precision=precision)
+
+
+ def get_child_geohash(geohash: str, desired_length: int, child: str = "0"):
+     """
+     Get a child geohash of the specified length by extending the input geohash.
+     The child geohash is formed by right-padding with the given child character.
+     """
+     if child not in GEOHASH_BASE32_SET:
+         raise ValueError(
+             f"Invalid child character '{child}'. Must be one of {''.join(GEOHASH_BASE32_SET)}."
+         )
+
+     if len(geohash) >= desired_length:
+         return geohash
+     return geohash.ljust(desired_length, child)
+
+
+ def gh_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts a geohash dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df, res, id_field, col_order, dggs_col, compact, get_child_geohash
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -133,7 +211,7 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -154,6 +232,15 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -178,6 +265,12 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the geohash cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def geohash(
@@ -192,9 +285,11 @@ def geohash(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -218,6 +313,7 @@ def geohash(
          "geohash",
          gh_polyfill,
          gh_secondary_index,
+         gh_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -227,6 +323,7 @@ def geohash(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
@@ -4,6 +4,7 @@ import click_log
  import tempfile
  import pyproj

+ import h3 as h3py
  import h3pandas  # Necessary import despite lack of explicit use

  import pandas as pd
@@ -50,6 +51,27 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ def h3compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an H3 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         h3py.compact_cells,
+         h3py.cell_to_center_child,
+     )
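The two h3-py (v4) primitives plugged in above are easy to sanity-check in isolation: compacting the complete set of children of a cell yields that cell, and the centre child maps it back to the finer resolution:

```python
import h3  # h3-py v4

cell = h3.latlng_to_cell(-41.29, 174.78, 9)
children = h3.cell_to_children(cell, 10)

assert set(h3.compact_cells(children)) == {cell}
assert h3.cell_to_center_child(cell, 10) in children
```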
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -108,7 +130,7 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -129,6 +151,15 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -153,6 +184,12 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the H3 cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def h3(
@@ -167,9 +204,11 @@ def h3(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -181,6 +220,7 @@ def h3(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -193,6 +233,7 @@ def h3(
          "h3",
          h3polyfill,
          h3_secondary_index,
+         h3compaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -202,6 +243,7 @@ def h3(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
@@ -11,6 +11,8 @@ import geopandas as gpd

  from typing import Union
  from pathlib import Path
+ from rhealpixdggs.conversion import compress_order_cells
+ from rhealpixdggs.rhp_wrappers import rhp_to_center_child
  from rhppandas.util.const import COLUMNS

  import vector2dggs.constants as const
@@ -27,7 +29,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      df_polygon = df[df.geom_type == "Polygon"]
      if len(df_polygon.index) > 0:
          df_polygon = df_polygon.rhp.polyfill_resample(
-             resolution, return_geometry=False
+             resolution, return_geometry=False, compress=False
          ).drop(columns=["index"])

      df_linestring = df[df.geom_type == "LineString"]
@@ -51,6 +53,42 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ def compact_cells(cells: set[str]) -> set[str]:
+     """
+     Compact a set of rHEALPix DGGS cells.
+     Cells must be at the same resolution.
+     See https://github.com/manaakiwhenua/rhealpixdggs-py/issues/35#issuecomment-3186073554
+     """
+     previous_result = set(cells)
+     while True:
+         current_result = set(compress_order_cells(previous_result))
+         if previous_result == current_result:
+             break
+         previous_result = current_result
+     return previous_result
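The loop above is a plain fixed-point iteration: a single pass of `compress_order_cells` can leave newly mergeable parents (see the linked issue), so it is reapplied until the cell set stops changing. The same shape, written generically:

```python
from typing import Callable, TypeVar

T = TypeVar("T")

def fixed_point(step: Callable[[T], T], value: T) -> T:
    """Apply `step` repeatedly until the value stops changing."""
    while True:
        nxt = step(value)
        if nxt == value:
            return value
        value = nxt
```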
+
+
+ def rhpcompaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an rHP dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_cells,
+         rhp_to_center_child,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -109,7 +147,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -130,6 +168,15 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -154,6 +201,12 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def rhp(
@@ -168,9 +221,11 @@ def rhp(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -182,6 +237,7 @@ def rhp(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -194,6 +250,7 @@ def rhp(
          "rhp",
          rhppolyfill,
          rhp_secondary_index,
+         rhpcompaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -203,6 +260,7 @@ def rhp(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
@@ -182,6 +182,54 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def compact_tokens(tokens: set[str]) -> set[str]:
+     """
+     Compact a set of S2 DGGS cells.
+     Cells must be at the same resolution.
+     """
+     cell_ids: list[S2.S2CellId] = [
+         S2.S2CellId.FromToken(token, len(token)) for token in tokens
+     ]
+     cell_union: S2.S2CellUnion = S2.S2CellUnion(
+         cell_ids
+     )  # Vector of sorted, non-overlapping S2CellId
+     cell_union.NormalizeS2CellUnion()  # Mutates; 'normalize' == 'compact'
+     return {c.ToToken() for c in cell_union.cell_ids()}
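For readers without the s2geometry bindings, the token/level round-trip that this code depends on can be illustrated with s2sphere, the pure-Python library already used in the README's S2 example (a sketch, assuming s2sphere's CellId API):

```python
import s2sphere

cell = s2sphere.CellId.from_lat_lng(
    s2sphere.LatLng.from_degrees(-41.29, 174.78)
).parent(10)
token = cell.to_token()

child = s2sphere.CellId.from_token(token).child_begin(18)
assert child.level() == 18
assert child.parent(10) == cell
```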
+
+
+ def token_to_child_token(token: str, level: int) -> str:
+     """
+     Returns first child (as string token) of a cell (also represented as a string
+     token) at a specific level.
+     """
+     cell: S2.S2CellId = S2.S2CellId.FromToken(token, len(token))
+     if level <= cell.level():
+         raise ValueError("Level must be greater than the current level of the cell.")
+     # Get the child cell iterator
+     return cell.child_begin(level).ToToken()
+
+
+ def s2_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an S2 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_tokens,
+         token_to_child_token,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -263,6 +311,15 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -287,6 +344,12 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the S2 cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def s2(
@@ -301,9 +364,11 @@ def s2(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -327,6 +392,7 @@ def s2(
          "s2",
          s2_polyfill,
          s2_secondary_index,
+         s2_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -336,6 +402,7 @@ def s2(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
@@ -1 +0,0 @@
- __version__: str = "0.9.1"