vector2dggs 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vector2dggs/__init__.py CHANGED
@@ -1 +1 @@
- __version__: str = "0.9.0"
+ __version__: str = "0.10.0"
vector2dggs/common.py CHANGED
@@ -6,13 +6,14 @@ import click_log
  import sqlalchemy
  import shutil
  import pyproj
+ from uuid import uuid4

  import pandas as pd
  import geopandas as gpd
  import dask.dataframe as dd
  import dask_geopandas as dgpd

- from typing import Union, Callable
+ from typing import Union, Callable, Iterable
  from pathlib import Path, PurePath
  from urllib.parse import urlparse
  from tqdm import tqdm
@@ -36,6 +37,12 @@ class ParentResolutionException(Exception):
      pass


+ class IdFieldError(ValueError):
+     """Raised when an invalid or missing ID field is provided."""
+
+     pass
+
+
  def check_resolutions(resolution: int, parent_res: int) -> None:
      if parent_res is not None and not int(parent_res) < int(resolution):
          raise ParentResolutionException(
@@ -45,6 +52,73 @@ def check_resolutions(resolution: int, parent_res: int) -> None:
          )


+ def check_compaction_requirements(compact: bool, id_field: Union[str, None]) -> None:
+     if compact and not id_field:
+         raise IdFieldError(
+             "An id_field is required for compaction, in order to handle the potential for overlapping features"
+         )
+
+
+ def compaction(
+     df: pd.DataFrame,
+     res: int,
+     id_field: str,
+     col_order: list[str],
+     dggs_col: str,
+     compact_func: Callable[[Iterable[Union[str, int]]], Iterable[Union[str, int]]],
+     cell_to_child_func: Callable[[Union[str, int], int], Union[str, int]],
+ ):
+     """
+     Compacts a dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     df = df.reset_index(drop=False)
+
+     feature_cell_groups = (
+         df.groupby(id_field)[dggs_col].apply(lambda x: set(x)).to_dict()
+     )
+     feature_cell_compact = {
+         id: set(compact_func(cells)) for id, cells in feature_cell_groups.items()
+     }
+
+     uncompressable = {
+         id: feature_cell_groups[id] & feature_cell_compact[id]
+         for id in feature_cell_groups.keys()
+     }
+     compressable = {
+         id: feature_cell_compact[id] - feature_cell_groups[id]
+         for id in feature_cell_groups.keys()
+     }
+
+     # Get rows that cannot be compressed
+     mask = pd.Series([False] * len(df), index=df.index)  # Init bool mask
+     for key, value_set in uncompressable.items():
+         mask |= (df[id_field] == key) & (df[dggs_col].isin(value_set))
+     uncompressable_df = df[mask].set_index(dggs_col)
+
+     # Get rows that can be compressed
+     # Convert each compressed (coarser resolution) cell into a cell at
+     # the original resolution (usu using centre child as reference)
+     compression_mapping = {
+         (id, cell_to_child_func(cell, res)): cell
+         for id, cells in compressable.items()
+         if cells
+         for cell in cells
+     }
+     mask = pd.Series([False] * len(df), index=df.index)
+     composite_key = f"composite_key_{uuid4()}"
+     # Update mask for compressible rows and prepare for replacement
+     get_composite_key = lambda row: (row[id_field], row[dggs_col])
+     df[composite_key] = df.apply(get_composite_key, axis=1)
+     mask |= df[composite_key].isin(compression_mapping)
+     compressable_df = df[mask].copy()
+     compressable_df[dggs_col] = compressable_df[composite_key].map(
+         compression_mapping
+     )  # Replace DGGS cell ID with compressed representation
+     compressable_df = compressable_df.set_index(dggs_col)
+
+     return pd.concat([compressable_df, uncompressable_df])[col_order]
+
+
  def db_conn_and_input_path(
      vector_input: Union[str, Path],
  ) -> tuple[SQLConnectionType, Union[str, Path]]:
@@ -137,27 +211,59 @@ def parent_partitioning(
      dggs: str,
      input_dir: Path,
      output_dir: Path,
+     compaction_func: Union[Callable, None],
      resolution: int,
      parent_res: int,
+     id_field: str,
      **kwargs,
  ) -> None:
      partition_col = f"{dggs}_{parent_res:02}"
+     dggs_col = f"{dggs}_{resolution:02}"
+
+     # Read the parquet files into a Dask DataFrame
+     ddf = dd.read_parquet(input_dir, engine="pyarrow")
+     meta = ddf._meta
+
+     with TqdmCallback(
+         desc=f"Parent partitioning, writing {'compacted ' if compaction_func else ''}output"
+     ):
+         if compaction_func:
+             # Apply the compaction function to each partition
+             unique_parents = sorted(
+                 [v for v in ddf[partition_col].unique().compute() if pd.notna(v)]
+             )
+             divisions = unique_parents + [unique_parents[-1]]
+             ddf = (
+                 ddf.reset_index(drop=False)
+                 .dropna(subset=[partition_col])
+                 .set_index(partition_col)
+                 .repartition(divisions=divisions)
+                 .map_partitions(
+                     compaction_func,
+                     resolution,
+                     meta.columns.to_list(),  # Column order to be returned
+                     dggs_col,
+                     id_field,
+                     meta=meta,
+                 )
+             )

-     with TqdmCallback(desc="Repartitioning"):
-         dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+         ddf.to_parquet(
              output_dir,
              overwrite=kwargs.get("overwrite", False),
              engine=kwargs.get("engine", "pyarrow"),
-             partition_on=partition_col,
+             partition_on=[partition_col],
              compression=kwargs.get("compression", "ZSTD"),
+             # **kwargs
          )
-     LOGGER.debug("Parent cell repartitioning complete")

-     # Rename output to just be the partition key, suffix .parquet
+     LOGGER.debug("Parent cell partitioning complete")
+
+     # Append a .parquet suffix
      for f in os.listdir(output_dir):
          os.rename(
              os.path.join(output_dir, f),
-             os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+             os.path.join(output_dir, f.replace(f"{partition_col}=", "")),
          )

      return
@@ -172,6 +278,7 @@ def polyfill(
      resolution: int,
      parent_res: int,
      output_directory: str,
+     compression: str = "snappy",
  ) -> None:
      """
      Reads a geoparquet, performs polyfilling (for Polygon),
@@ -198,7 +305,7 @@
          df = secondary_index_func(df, parent_res)

      df.to_parquet(
-         PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+         PurePath(output_directory, pq_in.name), engine="auto", compression=compression
      )
      return None

@@ -211,6 +318,7 @@ def index(
      dggs: str,
      dggsfunc: Callable,
      secondary_index_func: Callable,
+     compaction_func: Union[Callable, None],
      input_file: Union[Path, str],
      output_directory: Union[Path, str],
      resolution: int,
@@ -220,6 +328,7 @@
      spatial_sorting: str,
      cut_threshold: int,
      processes: int,
+     compression: str = "snappy",
      id_field: str = None,
      cut_crs: pyproj.CRS = None,
      con: SQLConnectionType = None,
@@ -245,7 +354,7 @@
          )
      else:
          # Read file
-         df = gpd.read_file(input_file)
+         df = gpd.read_file(input_file, layer=layer)

      if cut_crs:
          df = df.to_crs(cut_crs)
@@ -329,6 +438,7 @@ def index(
                  resolution,
                  parent_res,
                  tmpdir2,
+                 compression,
              )
              for filepath in filepaths
          ]
@@ -344,9 +454,12 @@
          dggs,
          Path(tmpdir2),
          output_directory,
+         compaction_func,
          resolution,
          parent_res,
+         id_field,
          overwrite=overwrite,
+         compression=compression,
      )

      return output_directory
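
The new `common.compaction` helper is DGGS-agnostic: each driver supplies a per-feature compaction function plus a function that re-expresses a coarse cell as a child at the working resolution, which is how the replacement rows are keyed. Below is a minimal sketch of its behaviour using a toy base-4 hierarchy in place of a real DGGS; the toy data, the `fid`/`toy_01` names and the two stub functions are illustrative assumptions, while the call signature follows the diff above.

```python
# Sketch: common.compaction with a toy base-4 hierarchy ("A0".."A3" are the
# four children of "A"). Assumes vector2dggs 0.10.0 is installed.
import pandas as pd
from vector2dggs.common import compaction

def toy_compact(cells):
    # Fold any complete set of four siblings into their parent
    # (one pass suffices for this tiny example).
    cells = set(cells)
    for p in {c[:-1] for c in cells}:
        children = {p + d for d in "0123"}
        if children <= cells:
            cells = (cells - children) | {p}
    return cells

def toy_center_child(cell, res):
    # Represent a coarse cell by a descendant at resolution `res`
    # (a real DGGS driver uses the centre child).
    return cell.ljust(res + 1, "0")

# One row per (feature, cell), as written by polyfill: feature 1 covers all
# of "A"; feature 2 overlaps it with a single cell.
df = pd.DataFrame(
    {"fid": [1, 1, 1, 1, 2], "toy_01": ["A0", "A1", "A2", "A3", "A0"]}
).set_index("toy_01")

out = compaction(df, 1, "fid", ["fid"], "toy_01", toy_compact, toy_center_child)
# Feature 1's four rows collapse to a single row indexed "A" (the redundant
# sibling rows are dropped); feature 2 keeps "A0" — overlapping features are
# handled independently, so the output index need not be unique.
print(out)
```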
vector2dggs/constants.py CHANGED
@@ -16,6 +16,7 @@ DEFAULTS = {
      "crs": None,
      "c": 5000,
      "t": (multiprocessing.cpu_count() - 1),
+     "cp": "snappy",
      "lyr": None,
      "g": "geom",
      "tempdir": tempfile.tempdir,
vector2dggs/geohash.py CHANGED
@@ -19,6 +19,8 @@ import vector2dggs.common as common

  from vector2dggs import __version__

+ GEOHASH_BASE32_SET = set("0123456789bcdefghjkmnpqrstuvwxyz")
+

  def gh_secondary_index(df: pd.DataFrame, parent_level: int) -> pd.DataFrame:
      df[f"geohash_{parent_level:02}"] = df.index.to_series().str[:parent_level]
@@ -73,6 +75,82 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def gh_children(geohash: str, desired_resolution: int) -> int:
+     """
+     Determine the number of children in the geohash refinement, determined by the additional character levels.
+     """
+     current_resolution = len(geohash)
+     additional_length = desired_resolution - current_resolution
+     return 32**additional_length  # Each new character increases resolution by 32
+
+
+ def compact(cells: set[str]) -> set[str]:
+     """
+     Compact a set of geohash cells.
+     Cells must be at the same resolution.
+     """
+     current_set = set(cells)
+     while True:
+         parent_map = {}
+         for gh in current_set:
+             parent = gh[:-1]
+             if parent not in parent_map:
+                 parent_map[parent] = set()
+             parent_map[parent].add(gh)
+
+         next_set = set()
+         for parent, siblings in parent_map.items():
+             if len(siblings) == 32:
+                 next_set.add(parent)
+             else:
+                 next_set.update(siblings)
+
+         if next_set == current_set:
+             break
+         current_set = next_set
+
+     return current_set
+
+
+ def get_central_child(geohash: str, precision: int):
+     """
+     Return an approximate central child of the geohash.
+     NB if only an arbitrary child is needed, use get_child_geohash
+     """
+     lat, lon = decode(geohash)
+     return encode(lat, lon, precision=precision)
+
+
+ def get_child_geohash(geohash: str, desired_length: int, child: str = "0"):
+     """
+     Get a child geohash of the specified length by extending the input geohash.
+     Child geohash is
+     """
+     if child not in GEOHASH_BASE32_SET:
+         raise ValueError(
+             f"Invalid child character '{child}'. Must be one of {''.join(GEOHASH_BASE32_SET)}."
+         )
+
+     if len(geohash) >= desired_length:
+         return geohash
+     return geohash.ljust(desired_length, child)
+
+
+ def gh_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts a geohash dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df, res, id_field, col_order, dggs_col, compact, get_child_geohash
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -133,7 +211,7 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -154,6 +232,15 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -178,6 +265,12 @@ def gh_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the geohash cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def geohash(
@@ -192,9 +285,11 @@ def geohash(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -218,6 +313,7 @@
          "geohash",
          gh_polyfill,
          gh_secondary_index,
+         gh_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -227,6 +323,7 @@
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
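
The geohash implementation works purely on the base-32 string hierarchy: `compact` repeatedly folds any complete set of 32 siblings into their parent, while `get_child_geohash` descends by padding with a child character. A quick check of that round trip, assuming vector2dggs 0.10.0 is installed (the cell `9q8y` is arbitrary):

```python
from vector2dggs.geohash import compact, get_child_geohash, gh_children

base32 = "0123456789bcdefghjkmnpqrstuvwxyz"

# All 32 children of "9q8y" fold back into their parent...
children = {f"9q8y{c}" for c in base32}
assert compact(children) == {"9q8y"}

# ...but an incomplete sibling set is left untouched.
partial = set(list(children)[:31])
assert compact(partial) == partial

# A cell has 32**n descendants n levels down; padding with "0" picks one
# representative child at the requested length.
assert gh_children("9q8y", 6) == 32**2
assert get_child_geohash("9q8y", 6) == "9q8y00"
```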
vector2dggs/h3.py CHANGED
@@ -4,6 +4,7 @@ import click_log
  import tempfile
  import pyproj

+ import h3 as h3py
  import h3pandas  # Necessary import despite lack of explicit use

  import pandas as pd
@@ -50,6 +51,27 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ def h3compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an H3 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         h3py.compact_cells,
+         h3py.cell_to_center_child,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -108,7 +130,7 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -129,6 +151,15 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -153,6 +184,12 @@ def h3polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the H3 cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def h3(
@@ -167,9 +204,11 @@ def h3(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -181,6 +220,7 @@ def h3(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -193,6 +233,7 @@ def h3(
          "h3",
          h3polyfill,
          h3_secondary_index,
+         h3compaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -202,6 +243,7 @@ def h3(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
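
`h3compaction` plugs two h3 v4 API calls into `common.compaction`: `compact_cells` performs the merge, and `cell_to_center_child` re-expresses each merged (coarser) cell at the working resolution. A small illustration of those two calls in isolation (the coordinates are arbitrary; assumes the `h3` package with the v4 API, as imported above):

```python
import h3 as h3py

res = 9
parent = h3py.latlng_to_cell(-41.3, 174.8, res - 1)  # a res-8 cell
children = h3py.cell_to_children(parent, res)        # its seven res-9 children

# A complete set of children compacts to the parent cell...
assert set(h3py.compact_cells(children)) == {parent}

# ...and the parent is represented back at res 9 by its centre child, which
# is how compaction keys the rows it replaces.
center = h3py.cell_to_center_child(parent, res)
assert center in children
```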
vector2dggs/rHP.py CHANGED
@@ -11,8 +11,14 @@ import geopandas as gpd

  from typing import Union
  from pathlib import Path
+ from rhealpixdggs.conversion import compress_order_cells
  from rhppandas.util.const import COLUMNS

+ # from rhealpixdggs.rhp_wrappers import rhp_to_center_child, rhp_is_valid
+ from rhealpixdggs.rhp_wrappers import rhp_is_valid
+ from rhealpixdggs.dggs import RHEALPixDGGS
+ from rhealpixdggs.dggs import WGS84_003
+
  import vector2dggs.constants as const
  import vector2dggs.common as common

@@ -27,7 +33,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      df_polygon = df[df.geom_type == "Polygon"]
      if len(df_polygon.index) > 0:
          df_polygon = df_polygon.rhp.polyfill_resample(
-             resolution, return_geometry=False
+             resolution, return_geometry=False, compress=False
          ).drop(columns=["index"])

      df_linestring = df[df.geom_type == "LineString"]
@@ -51,6 +57,90 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      )


+ # TODO replace when merged https://github.com/manaakiwhenua/rhealpixdggs-py/pull/37
+ def rhp_to_center_child(
+     rhpindex: str, res: int = None, dggs: RHEALPixDGGS = WGS84_003
+ ) -> str:
+     """
+     Returns central child of rhpindex at resolution res (immediate central
+     child if res == None).
+
+     Returns None if the cell index is invalid.
+
+     Returns None if the DGGS has an even number of cells on a side.
+
+     EXAMPLES::
+
+         >>> rhp_to_center_child('S001450634')
+         'S0014506344'
+         >>> rhp_to_center_child('S001450634', res=13)
+         'S001450634444'
+         >>> rhp_to_center_child('INVALID')
+     """
+     # Stop early if the cell index is invalid
+     if not rhp_is_valid(rhpindex, dggs):
+         return None
+
+     # DGGSs with even numbers of cells on a side never have a cell at the centre
+     if (dggs.N_side % 2) == 0:
+         return None
+
+     # Handle mismatch between cell resolution and requested child resolution
+     parent_res = len(rhpindex) - 1
+     if res is not None and res < parent_res:
+         return rhpindex
+
+     # Standard case (including parent_res == res)
+     else:
+         # res == None returns the central child from one level down (by convention)
+         added_levels = 1 if res is None else res - parent_res
+
+         # Derive index of centre child and append that to rhpindex
+         # NOTE: only works for odd values of N_side
+         c_index = int((dggs.N_side**2 - 1) / 2)
+
+         # Append the required number of child digits to cell index
+         child_index = rhpindex + "".join(str(c_index) for _ in range(0, added_levels))
+
+         return child_index
+
+
+ def compact_cells(cells: set[str]) -> set[str]:
+     """
+     Compact a set of rHEALPix DGGS cells.
+     Cells must be at the same resolution.
+     See https://github.com/manaakiwhenua/rhealpixdggs-py/issues/35#issuecomment-3186073554
+     """
+     previous_result = set(cells)
+     while True:
+         current_result = set(compress_order_cells(previous_result))
+         if previous_result == current_result:
+             break
+         previous_result = current_result
+     return previous_result
+
+
+ def rhpcompaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an rHP dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_cells,
+         rhp_to_center_child,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -109,7 +199,7 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      required=False,
      default=const.DEFAULTS["crs"],
      type=int,
-     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
      nargs=1,
  )
  @click.option(
@@ -130,6 +220,15 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -154,6 +253,12 @@ def rhppolyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def rhp(
@@ -168,9 +273,11 @@ def rhp(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -182,6 +289,7 @@ def rhp(
      tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir

      common.check_resolutions(resolution, parent_res)
+     common.check_compaction_requirements(compact, id_field)

      con, vector_input = common.db_conn_and_input_path(vector_input)
      output_directory = common.resolve_output_path(output_directory, overwrite)
@@ -194,6 +302,7 @@ def rhp(
          "rhp",
          rhppolyfill,
          rhp_secondary_index,
+         rhpcompaction if compact else None,
          vector_input,
          output_directory,
          int(resolution),
@@ -203,6 +312,7 @@ def rhp(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
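
For rHEALPix, the vendored `rhp_to_center_child` (pending the upstream PR linked above) appends the centre-cell digit, which for an `N_side == 3` DGGS such as `WGS84_003` is `(3**2 - 1) // 2 == 4`; `compact_cells` re-applies `compress_order_cells` until a fixed point, since one pass can expose newly complete parents. A sketch exercising both, with expected outputs inferred from the docstring and that logic (assumes vector2dggs 0.10.0 and rhealpixdggs-py are installed):

```python
from vector2dggs.rHP import compact_cells, rhp_to_center_child

print(rhp_to_center_child("S001450634"))  # expected: 'S0014506344'
print(rhp_to_center_child("INVALID"))     # expected: None

# The nine children (digits 0..8 when N_side == 3) of a cell should fold
# back into their parent.
children = {f"S00145063{d}" for d in "012345678"}
print(compact_cells(children))            # expected: {'S00145063'}
```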
vector2dggs/s2.py CHANGED
@@ -182,6 +182,54 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      )


+ def compact_tokens(tokens: set[str]) -> set[str]:
+     """
+     Compact a set of S2 DGGS cells.
+     Cells must be at the same resolution.
+     """
+     cell_ids: list[S2.S2CellId] = [
+         S2.S2CellId.FromToken(token, len(token)) for token in tokens
+     ]
+     cell_union: S2.S2CellUnion = S2.S2CellUnion(
+         cell_ids
+     )  # Vector of sorted, non-overlapping S2CellId
+     cell_union.NormalizeS2CellUnion()  # Mutates; 'normalize' == 'compact'
+     return {c.ToToken() for c in cell_union.cell_ids()}
+
+
+ def token_to_child_token(token: str, level: int) -> str:
+     """
+     Returns first child (as string token) of a cell (also represented as a string
+     token) at a specific level.
+     """
+     cell: S2.S2CellId = S2.S2CellId.FromToken(token, len(token))
+     if level <= cell.level():
+         raise ValueError("Level must be greater than the current level of the cell.")
+     # Get the child cell iterator
+     return cell.child_begin(level).ToToken()
+
+
+ def s2_compaction(
+     df: pd.DataFrame,
+     res: int,
+     col_order: list,
+     dggs_col: str,
+     id_field: str,
+ ) -> pd.DataFrame:
+     """
+     Compacts an S2 dataframe up to a given low resolution (parent_res), from an existing maximum resolution (res).
+     """
+     return common.compaction(
+         df,
+         res,
+         id_field,
+         col_order,
+         dggs_col,
+         compact_tokens,
+         token_to_child_token,
+     )
+
+
  @click.command(context_settings={"show_default": True})
  @click_log.simple_verbosity_option(common.LOGGER)
  @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
@@ -263,6 +311,15 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      help="Amount of threads used for operation",
      nargs=1,
  )
+ @click.option(
+     "-cp",
+     "--compression",
+     required=False,
+     default=const.DEFAULTS["cp"],
+     type=str,
+     help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.",
+     nargs=1,
+ )
  @click.option(
      "-lyr",
      "--layer",
@@ -287,6 +344,12 @@ def s2_polyfill(df: gpd.GeoDataFrame, level: int) -> pd.DataFrame:
      type=click.Path(),
      help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
  )
+ @click.option(
+     "-co",
+     "--compact",
+     is_flag=True,
+     help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.",
+ )
  @click.option("-o", "--overwrite", is_flag=True)
  @click.version_option(version=__version__)
  def s2(
@@ -301,9 +364,11 @@ def s2(
      cut_crs: int,
      cut_threshold: int,
      threads: int,
+     compression: str,
      layer: str,
      geom_col: str,
      tempdir: Union[str, Path],
+     compact: bool,
      overwrite: bool,
  ):
      """
@@ -327,6 +392,7 @@ def s2(
          "s2",
          s2_polyfill,
          s2_secondary_index,
+         s2_compaction if compact else None,
          vector_input,
          output_directory,
          int(level),
@@ -336,6 +402,7 @@ def s2(
          spatial_sorting,
          cut_threshold,
          threads,
+         compression=compression,
          cut_crs=cut_crs,
          id_field=id_field,
          con=con,
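
S2 has compaction built into the library: `compact_tokens` wraps `S2CellUnion` normalisation, and `token_to_child_token` descends via the `child_begin` iterator. A sketch of the token round trip, importing the same `S2` binding that `vector2dggs.s2` uses internally (`FromDegrees`, `parent` and `next` are standard S2CellId API assumed to be exposed by that binding; the coordinates are arbitrary):

```python
from vector2dggs.s2 import S2, compact_tokens, token_to_child_token

# A level-5 cell containing Wellington, NZ, and its token.
cell = S2.S2CellId(S2.S2LatLng.FromDegrees(-41.3, 174.8)).parent(5)
token = cell.ToToken()

# Gather its four level-6 children with the same iterator that
# token_to_child_token uses.
kids, c = set(), cell.child_begin(6)
for _ in range(4):
    kids.add(c.ToToken())
    c = c.next()

print(compact_tokens(kids))            # expected: {token}
print(token_to_child_token(token, 6))  # expected: a member of kids
```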
{vector2dggs-0.9.0.dist-info → vector2dggs-0.10.0.dist-info}/METADATA RENAMED
@@ -1,8 +1,7 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: vector2dggs
- Version: 0.9.0
+ Version: 0.10.0
  Summary: CLI DGGS indexer for vector geospatial data
- Home-page: https://github.com/manaakiwhenua/vector2dggs
  License: LGPL-3.0-or-later
  Keywords: dggs,vector,h3,rHEALPix,cli
  Author: James Ardo
@@ -14,13 +13,14 @@ Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or l
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Scientific/Engineering
  Classifier: Topic :: Scientific/Engineering :: GIS
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Requires-Dist: click (>=8.1.7,<9.0.0)
  Requires-Dist: click-log (>=0.4.0,<0.5.0)
  Requires-Dist: dask (>=2025.1,<2026.0)
- Requires-Dist: dask-geopandas (>=0.4,<0.5)
+ Requires-Dist: dask-geopandas (>=0.5,<0.6)
  Requires-Dist: gdal (>=3.8,<4.0)
  Requires-Dist: geopandas (>=1.0.1,<2.0.0)
  Requires-Dist: h3pandas (>=0.3,<0.4)
@@ -43,7 +43,7 @@ Description-Content-Type: text/markdown

  [![pypi](https://img.shields.io/pypi/v/vector2dggs?label=vector2dggs)](https://pypi.org/project/vector2dggs/)

- Python-based CLI tool to index raster files to DGGS in parallel, writing out to Parquet.
+ Python-based CLI tool to index vector files to DGGS in parallel, writing out to Parquet.

  This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).

@@ -57,7 +57,7 @@ Currently this tool supports the following DGGSs:

  - [Geohash](https://en.wikipedia.org/wiki/Geohash) (points, polygons)

- Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
+ Contributions (especially for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.

  ![Example use case for vector2dggs, showing parcels indexed to a high H3 resolution](./docs/imgs/vector2dggs-example.png "Example use case for vector2dggs, showing parcels indexed to a high H3 resolution")

@@ -114,23 +114,29 @@
                                   use when spatially partioning. Adjusting
                                   this number will trade off memory use and
                                   time.  [default: 50; required]
-   -s, --spatial_sorting [hilbert|morton|geohash]
+   -s, --spatial_sorting [hilbert|morton|geohash|none]
                                   Spatial sorting method when perfoming
-                                  spatial partitioning.  [default: hilbert]
+                                  spatial partitioning.  [default: none]
    -crs, --cut_crs INTEGER        Set the coordinate reference system (CRS)
-                                  used for cutting large polygons (see `--cur-
-                                  threshold`). Defaults to the same CRS as the
-                                  input. Should be a valid EPSG code.
-   -c, --cut_threshold INTEGER    Cutting up large polygons into smaller
-                                  pieces based on a target length. Units are
-                                  assumed to match the input CRS units unless
-                                  the `--cut_crs` is also given, in which case
-                                  units match the units of the supplied CRS.
-                                  [default: 5000; required]
+                                  used for cutting large geometries (see
+                                  `--cut_threshold`). Defaults to the same CRS
+                                  as the input. Should be a valid EPSG code.
+   -c, --cut_threshold INTEGER    Cutting up large geometries into smaller
+                                  geometries based on a target length. Units
+                                  are assumed to match the input CRS units
+                                  unless the `--cut_crs` is also given, in
+                                  which case units match the units of the
+                                  supplied CRS.  [default: 5000; required]
    -t, --threads INTEGER          Amount of threads used for operation
-                                  [default: 7]
-   -lyr, --layer TEXT             Name of the layer or table to read when using a
-                                  an input that supports layers or tables
+                                  [default: NUM_CPUS - 1]
+   -cp, --compression TEXT        Compression method to use for the output
+                                  Parquet files. Options include 'snappy',
+                                  'gzip', 'brotli', 'lz4', 'zstd', etc. Use
+                                  'none' for no compression.  [default:
+                                  snappy]
+   -lyr, --layer TEXT             Name of the layer or table to read when
+                                  using an input that supports layers or
+                                  tables
    -g, --geom_col TEXT            Column name to use when using a spatial
                                   database connection as input  [default:
                                   geom]
@@ -138,6 +144,8 @@
                                   execution of this program. This parameter
                                   allows you to control where this data will
                                   be written.
+   -co, --compact                 Compact the H3 cells up to the parent
+                                  resolution. Compaction requires an id_field.
    -o, --overwrite
    --version                      Show the version and exit.
    --help                         Show this message and exit.
@@ -187,7 +195,6 @@ from shapely.geometry import Polygon

  RES = 18
  df = pd.read_parquet(f'~/output-data/ponds-with-holes.s2.{RES}.pq')
- df = df.reset_index()

  def s2id_to_polygon(s2_id_hex):
      cell_id = s2sphere.CellId.from_token(s2_id_hex)
@@ -199,11 +206,17 @@ def s2id_to_polygon(s2_id_hex):
          vertices.append((lat_lng.lng().degrees, lat_lng.lat().degrees))  # (lon, lat)
      return Polygon(vertices)

- df['geometry'] = df[f's2_{RES}'].apply(s2id_to_polygon)
+ df['geometry'] = df.index.to_series().apply(s2id_to_polygon)
  df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')  # WGS84
  df.to_parquet(f'sample-{RES}.parquet')
  ```

+ ## Compaction
+
+ Compaction is supported with the `-co/--compact` argument. The result respects overlapping polygons by considering each feature independently. (In the below example output for rHEALPix, cells are shown with opacity; overlap is visible where there is a darker shade.) This does mean that the index of the result is not necessarily unique (unless your input is a vector _coverage_, i.e. it does not have overlaps.)
+
+ ![Example of compaction of overlapping vector features with the rHEALPix DGGS](docs/imgs/rhp-compaction-example.png)
+
  ### For development

  In brief, to get started:
@@ -248,14 +261,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -lyr topo50_lake
      title={{vector2dggs}},
      author={Ardo, James and Law, Richard},
      url={https://github.com/manaakiwhenua/vector2dggs},
-     version={0.9.0},
+     version={0.10.0},
      date={2023-04-20}
  }
  ```

  APA/Harvard

- > Ardo, J., & Law, R. (2023). vector2dggs (0.9.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+ > Ardo, J., & Law, R. (2023). vector2dggs (0.10.0) [Computer software]. https://github.com/manaakiwhenua/vector2dggs

  [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)

vector2dggs-0.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ vector2dggs/__init__.py,sha256=qK7__omM0NPcz4bMM5qUWKeZsOEhxeVWv_P38IPNVnw,28
+ vector2dggs/cli.py,sha256=d_4skD62k6pXUWgDdVHbDwpe4A4yo62ZFx8Cp_6GpBA,767
+ vector2dggs/common.py,sha256=rQL1_rFr1VTyILffOZgdwPzZS1JThn4TBPswfCkMjbM,14471
+ vector2dggs/constants.py,sha256=KdmBQCP_GCygzvDLtS8AMQM9i6QqOkf-9YQkh_AzrKc,1779
+ vector2dggs/geohash.py,sha256=PVLkaaSVLgzDZNfuL0y3Xioh4pyvom845HuyLIAsLUY,10398
+ vector2dggs/h3.py,sha256=Juvc8g4QWfDIco9RQHaX8p9S9rkW5QvusxpyO-G7eSs,7408
+ vector2dggs/katana.py,sha256=v4BRzVCsroC6RzIYdxLfrr9eFOdmXb5S9jXBMs5tgSo,3571
+ vector2dggs/rHP.py,sha256=E03dQngbT3LtksZkaM6QSJv983ZGpLeXRedjqsEQZZI,9869
+ vector2dggs/s2.py,sha256=SOXMHQQq86bM88MDgBBemGiXIbuEIbrhLSgPwLKceLY,12809
+ vector2dggs-0.10.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ vector2dggs-0.10.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+ vector2dggs-0.10.0.dist-info/METADATA,sha256=LuNEa06-KpDdXDdbDynhkxUXp6nuaOQ7q9ycIrJODKs,12606
+ vector2dggs-0.10.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ vector2dggs-0.10.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+ vector2dggs-0.10.0.dist-info/RECORD,,
{vector2dggs-0.9.0.dist-info → vector2dggs-0.10.0.dist-info}/WHEEL RENAMED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: poetry-core 2.1.3
  Root-Is-Purelib: true
  Tag: py3-none-any
vector2dggs-0.9.0.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- vector2dggs/__init__.py,sha256=L8qKCe-XFylNfRXefZ1yGESlLF24qwQQ87szPZJO6Zg,27
- vector2dggs/cli.py,sha256=d_4skD62k6pXUWgDdVHbDwpe4A4yo62ZFx8Cp_6GpBA,767
- vector2dggs/common.py,sha256=l5koOX1Ps0v5D7MgzHtK1t99hXnGA7b6I82n2rBOldE,10496
- vector2dggs/constants.py,sha256=_cj3Pf52gsXfWwvpsbekE8h1yD_1jS9xqzRg2mRCq3w,1759
- vector2dggs/geohash.py,sha256=t90FlZRQCH8lmtTHe2kPMcLTIf1nrrf2j-m95xk4xPc,7534
- vector2dggs/h3.py,sha256=Bu_4T1WIDuTv_tJWTS8BgPmHRiCozfUUh2CxBwk98Gw,6310
- vector2dggs/katana.py,sha256=v4BRzVCsroC6RzIYdxLfrr9eFOdmXb5S9jXBMs5tgSo,3571
- vector2dggs/rHP.py,sha256=tC4LvqRPMmgUd36BppkvYeq94pPBhO1vBDQ-aaiHUg4,6410
- vector2dggs/s2.py,sha256=HEpFTEL4UaZLjybKZ_q06QFjPuQ48MDLeg_qGc0NMEw,10835
- vector2dggs-0.9.0.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- vector2dggs-0.9.0.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
- vector2dggs-0.9.0.dist-info/METADATA,sha256=7y97ZXmDNqUQ-n8M-BgOE2XLG-pJ6f_aNGjzVlCUFzc,11534
- vector2dggs-0.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- vector2dggs-0.9.0.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
- vector2dggs-0.9.0.dist-info/RECORD,,