vector2dggs 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector2dggs/__init__.py +1 -1
- vector2dggs/cli.py +2 -0
- vector2dggs/common.py +354 -0
- vector2dggs/constants.py +26 -0
- vector2dggs/h3.py +35 -296
- vector2dggs/rHP.py +217 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/METADATA +37 -17
- vector2dggs-0.6.3.dist-info/RECORD +13 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/WHEEL +1 -1
- vector2dggs-0.6.1.dist-info/RECORD +0 -10
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING +0 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING.LESSER +0 -0
- {vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/entry_points.txt +0 -0
vector2dggs/__init__.py
CHANGED
@@ -1 +1 @@
-__version__: str = "0.6.1"
+__version__: str = "0.6.3"
vector2dggs/cli.py
CHANGED
@@ -2,6 +2,7 @@ import click
 
 from vector2dggs import __version__
 from vector2dggs.h3 import h3
+from vector2dggs.rHP import rhp
 
 # If the program does terminal interaction, make it output a short
 # notice like this when it starts in an interactive mode:
@@ -19,6 +20,7 @@ def cli():
 
 
 cli.add_command(h3)
+cli.add_command(rhp)
 
 
 def main():
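The change to `cli.py` is the only registration step required: `rhp` is an ordinary `click` command object, so adding it to the group makes it immediately discoverable. A quick (hypothetical) session to confirm both commands are wired up:

```bash
# Hypothetical session; the exact help text comes from the click definitions
vector2dggs --help        # should now list both subcommands: h3 and rhp
vector2dggs rhp --help    # shows the rHEALPix-specific options
```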
vector2dggs/common.py
ADDED
@@ -0,0 +1,354 @@
+import os
+import errno
+import logging
+import tempfile
+import click_log
+import sqlalchemy
+import shutil
+import pyproj
+
+import pandas as pd
+import geopandas as gpd
+import dask.dataframe as dd
+import dask_geopandas as dgpd
+
+from typing import Union, Callable
+from pathlib import Path, PurePath
+from urllib.parse import urlparse
+from tqdm import tqdm
+from tqdm.dask import TqdmCallback
+from multiprocessing.dummy import Pool
+from shapely.geometry import GeometryCollection
+
+import vector2dggs.constants as const
+
+from . import katana
+
+SQLConnectionType = Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine]
+
+
+LOGGER = logging.getLogger(__name__)
+click_log.basic_config(LOGGER)
+click_log.ColorFormatter.colors["info"] = dict(fg="green")
+
+
+class ParentResolutionException(Exception):
+    pass
+
+
+def check_resolutions(resolution: int, parent_res: int) -> None:
+    if parent_res is not None and not int(parent_res) < int(resolution):
+        raise ParentResolutionException(
+            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
+                pr=parent_res, r=resolution
+            )
+        )
+
+
+def db_conn_and_input_path(
+    vector_input: Union[str, Path],
+) -> tuple[SQLConnectionType, Union[str, Path]]:
+    con: sqlalchemy.engine.Connection = None
+    scheme: str = urlparse(vector_input).scheme
+
+    if bool(scheme) and scheme != "file":
+        # Assume database connection
+        con = sqlalchemy.create_engine(vector_input)
+
+    elif not Path(vector_input).exists():
+        if not scheme:
+            LOGGER.error(
+                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
+            )
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
+            )
+        vector_input = str(vector_input)
+
+    else:
+        vector_input = Path(vector_input)
+
+    return (con, vector_input)
+
+
+def resolve_output_path(
+    output_directory: Union[str, Path], overwrite: bool
+) -> Union[str, Path]:
+    output_directory = Path(output_directory)
+    outputexists = os.path.exists(output_directory)
+
+    if outputexists and not overwrite:
+        raise FileExistsError(
+            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
+        )
+
+    elif outputexists and overwrite:
+        LOGGER.warning(f"Overwriting the contents of {output_directory}")
+        shutil.rmtree(output_directory)
+
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    return output_directory
+
+
+def drop_condition(
+    df: pd.DataFrame,
+    drop_index: pd.Index,
+    log_statement: str,
+    warning_threshold: float = 0.01,
+):
+    LOGGER.debug(log_statement)
+    _before = len(df)
+    df = df.drop(drop_index)
+    _after = len(df)
+    _diff = _before - _after
+    if _diff:
+        log_method = (
+            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
+        )
+        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
+    return df
+
+
+def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int):
+    """
+    Uses a parent resolution,
+    OR,
+    Given a target resolution, returns our recommended parent resolution.
+
+    Used for intermediate re-partioning.
+    """
+    if dggs == "h3":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_H3, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    elif dggs == "rhp":
+        return (
+            parent_res
+            if parent_res is not None
+            else max(const.MIN_RHP, (resolution - const.DEFAULT_PARENT_OFFSET))
+        )
+    else:
+        raise RuntimeError(
+            "Unknown dggs {dggs}) - must be one of [ 'h3', 'rhp' ]".format(dggs=dggs)
+        )
+
+
+def parent_partitioning(
+    dggs: str,
+    input_dir: Path,
+    output_dir: Path,
+    resolution: int,
+    parent_res: Union[None, int],
+    **kwargs,
+) -> None:
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    partition_col = f"{dggs}_{parent_res:02}"
+
+    with TqdmCallback(desc="Repartitioning"):
+        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
+            output_dir,
+            overwrite=kwargs.get("overwrite", False),
+            engine=kwargs.get("engine", "pyarrow"),
+            partition_on=partition_col,
+            compression=kwargs.get("compression", "ZSTD"),
+        )
+    LOGGER.debug("Parent cell repartitioning complete")
+
+    # Rename output to just be the partition key, suffix .parquet
+    for f in os.listdir(output_dir):
+        os.rename(
+            os.path.join(output_dir, f),
+            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
+        )
+
+    return
+
+
+def polyfill(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    pq_in: Path,
+    spatial_sort_col: str,
+    resolution: int,
+    parent_res: Union[None, int],
+    output_directory: str,
+) -> None:
+    """
+    Reads a geoparquet, performs polyfilling (for Polygon),
+    linetracing (for LineString), and writes out to parquet.
+    """
+    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
+    if len(df.index) == 0:
+        # Input is empty, nothing to polyfill
+        return None
+
+    # DGGS specific polyfill
+    df = dggsfunc(df, resolution)
+
+    if len(df.index) == 0:
+        # Polyfill resulted in empty output (e.g. large cell, small feature)
+        return None
+
+    df.index.rename(f"{dggs}_{resolution:02}", inplace=True)
+    parent_res: int = get_parent_res(dggs, parent_res, resolution)
+    # print(parent_res)
+    # print(df.index)
+    # print(df.columns)
+
+    # Secondary (parent) index, used later for partitioning
+    df = secondary_index_func(df, parent_res)
+
+    df.to_parquet(
+        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
+    )
+    return None
+
+
+def polyfill_star(args) -> None:
+    return polyfill(*args)
+
+
+def index(
+    dggs: str,
+    dggsfunc: Callable,
+    secondary_index_func: Callable,
+    input_file: Union[Path, str],
+    output_directory: Union[Path, str],
+    resolution: int,
+    parent_res: Union[None, int],
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_threshold: int,
+    processes: int,
+    id_field: str = None,
+    cut_crs: pyproj.CRS = None,
+    con: SQLConnectionType = None,
+    table: str = None,
+    geom_col: str = "geom",
+    overwrite: bool = False,
+) -> Path:
+    """
+    Performs multi-threaded polyfilling on (multi)polygons.
+    """
+
+    if table and con:
+        # Database connection
+        if keep_attributes:
+            q = sqlalchemy.text(f"SELECT * FROM {table}")
+        elif id_field and not keep_attributes:
+            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
+        else:
+            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
+        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
+            "geometry"
+        )
+    else:
+        # Read file
+        df = gpd.read_file(input_file)
+
+    if cut_crs:
+        df = df.to_crs(cut_crs)
+    LOGGER.debug("Cutting with CRS: %s", df.crs)
+
+    if id_field:
+        df = df.set_index(id_field)
+    else:
+        df = df.reset_index()
+        df = df.rename(columns={"index": "fid"}).set_index("fid")
+
+    if not keep_attributes:
+        # Remove all attributes except the geometry
+        df = df.loc[:, ["geometry"]]
+
+    LOGGER.debug("Cutting large geometries")
+    with tqdm(total=df.shape[0], desc="Splitting") as pbar:
+        for index, row in df.iterrows():
+            df.loc[index, "geometry"] = GeometryCollection(
+                katana.katana(row.geometry, cut_threshold)
+            )
+            pbar.update(1)
+
+    LOGGER.debug("Exploding geometry collections and multipolygons")
+    df = (
+        df.to_crs(4326)
+        .explode(index_parts=False)  # Explode from GeometryCollection
+        .explode(index_parts=False)  # Explode multipolygons to polygons
+    ).reset_index()
+
+    drop_conditions = [
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.is_empty | frame.geometry.isna())
+            ],
+            "message": "Considering empty or null geometries",
+        },
+        {
+            "index": lambda frame: frame[
+                (frame.geometry.geom_type != "Polygon")
+                & (frame.geometry.geom_type != "LineString")
+            ],  # NB currently points and other types are lost; in principle, these could be indexed
+            "message": "Considering unsupported geometries",
+        },
+    ]
+    for condition in drop_conditions:
+        df = drop_condition(df, condition["index"](df).index, condition["message"])
+
+    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
+
+    LOGGER.debug("Spatially sorting and partitioning (%s)", spatial_sorting)
+    ddf = ddf.spatial_shuffle(by=spatial_sorting)
+    spatial_sort_col = (
+        spatial_sorting
+        if spatial_sorting == "geohash"
+        else f"{spatial_sorting}_distance"
+    )
+
+    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
+        with TqdmCallback(desc=f"Spatially partitioning"):
+            ddf.to_parquet(tmpdir, overwrite=True)
+
+        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
+
+        # Multithreaded polyfilling
+        LOGGER.debug(
+            "Indexing on spatial partitions by polyfill with resolution: %d",
+            resolution,
+        )
+        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
+            with Pool(processes=processes) as pool:
+                args = [
+                    (
+                        dggs,
+                        dggsfunc,
+                        secondary_index_func,
+                        filepath,
+                        spatial_sort_col,
+                        resolution,
+                        parent_res,
+                        tmpdir2,
+                    )
+                    for filepath in filepaths
+                ]
+                list(
+                    tqdm(
+                        pool.imap(polyfill_star, args),
+                        total=len(args),
+                        desc="DGGS indexing",
+                    )
+                )
+
+            parent_partitioning(
+                dggs,
+                tmpdir2,
+                output_directory,
+                resolution,
+                parent_res,
+                overwrite=overwrite,
+            )
+
+    return output_directory
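This new `common.py` module turns the former H3-only pipeline into a DGGS-agnostic driver: `index()` handles reading, cutting, exploding, spatial sorting and repartitioning, while the DGGS-specific work is injected as two callables, `dggsfunc` (geometries in, one row per cell out) and `secondary_index_func` (adds the parent-cell column used for partitioning). A sketch of the contract a hypothetical third DGGS adapter would have to satisfy (the `s2*` names are invented for illustration and do not exist in the package):

```python
# Sketch only: illustrates the shape of the two callables that
# common.polyfill()/common.index() expect from a DGGS adapter.
import pandas as pd
import geopandas as gpd


def s2polyfill(df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame:
    # Hypothetical: must return a DataFrame indexed by cell ID (one row per
    # cell), with the geometry column dropped; common.polyfill() then renames
    # the index to f"s2_{resolution:02}".
    ...


def s2_secondary_index(df: pd.DataFrame, parent_res: int) -> pd.DataFrame:
    # Hypothetical: must add a column named f"s2_{parent_res:02}" holding each
    # cell's parent; common.parent_partitioning() partitions on that column.
    ...

# common.index("s2", s2polyfill, s2_secondary_index, ...) would then reuse the
# whole read -> cut -> sort -> polyfill -> repartition pipeline unchanged.
```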
vector2dggs/constants.py
ADDED
@@ -0,0 +1,26 @@
+import multiprocessing
+import warnings
+import tempfile
+
+
+MIN_H3, MAX_H3 = 0, 15
+MIN_RHP, MAX_RHP = 0, 15
+
+DEFAULTS = {
+    "id": None,
+    "k": False,
+    "ch": 50,
+    "s": "hilbert",
+    "crs": None,
+    "c": 5000,
+    "t": (multiprocessing.cpu_count() - 1),
+    "tbl": None,
+    "g": "geom",
+    "tempdir": tempfile.tempdir,
+}
+
+DEFAULT_PARENT_OFFSET = 6
+
+warnings.filterwarnings(
+    "ignore"
+)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
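`DEFAULT_PARENT_OFFSET = 6` encodes the fallback used by `common.get_parent_res`: when no `-pr/--parent_res` is supplied, output partitions are keyed on cells six levels coarser than the target resolution, floored at the minimum resolution. Worked through:

```python
# Worked example of the default parent-resolution rule in
# common.get_parent_res() (the same arithmetic applies to H3 and rHEALPix).
MIN_H3 = 0
DEFAULT_PARENT_OFFSET = 6

for resolution in (4, 9, 15):
    parent = max(MIN_H3, resolution - DEFAULT_PARENT_OFFSET)
    print(f"resolution {resolution} -> parent {parent}")
    # resolution 4 -> parent 0; 9 -> 3; 15 -> 9
```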
vector2dggs/h3.py
CHANGED
@@ -1,80 +1,28 @@
-import errno
-import logging
-import os
-import multiprocessing
-from multiprocessing.dummy import Pool
-from pathlib import Path, PurePath
-import shutil
 import sys
-import tempfile
-from typing import Union
-from urllib.parse import urlparse
-import warnings
-
 import click
 import click_log
-import dask.dataframe as dd
-import dask_geopandas as dgpd
-import geopandas as gpd
-import h3pandas
-import pandas as pd
+import tempfile
 import pyproj
-from shapely.geometry import GeometryCollection
-import sqlalchemy
-from tqdm import tqdm
-from tqdm.dask import TqdmCallback
-
-from . import katana
-from vector2dggs import __version__
-
-LOGGER = logging.getLogger(__name__)
-click_log.basic_config(LOGGER)
-MIN_H3, MAX_H3 = 0, 15
 
-warnings.filterwarnings(
-    "ignore"
-)  # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
-
-
-DEFAULT_PARENT_OFFSET = 6
-DEFAULT_CHUNK_SIZE = 50
+import h3pandas  # Necessary import despite lack of explicit use
 
+import pandas as pd
+import geopandas as gpd
 
-class ParentResolutionException(Exception):
-    pass
+from typing import Union
+from pathlib import Path
 
+import vector2dggs.constants as const
+import vector2dggs.common as common
 
-def _get_parent_res(parent_res: Union[None, int], resolution: int):
-    """
-    Uses a parent resolution,
-    OR,
-    Given a target resolution, returns our recommended parent resolution.
+from vector2dggs import __version__
 
-    Used for intermediate re-partioning.
-    """
-    return (
-        int(parent_res)
-        if parent_res is not None
-        else max(MIN_H3, (resolution - DEFAULT_PARENT_OFFSET))
-    )
 
+def h3_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.h3.h3_to_parent(parent_res)
 
-def polyfill(
-    pq_in: Path,
-    spatial_sort_col: str,
-    resolution: int,
-    parent_res: Union[None, int],
-    output_directory: str,
-) -> None:
-    """
-    Reads a geoparquet, performs H3 polyfilling (for Polygon),
-    linetracing (for LineString), and writes out to parquet.
-    """
-    df = gpd.read_parquet(pq_in).reset_index().drop(columns=[spatial_sort_col])
-    if len(df.index) == 0:
-        # Input is empty, nothing to polyfill
-        return None
 
+def h3polyfill(df: gpd.GeoDataFrame, resolution: int):
     df_polygon = df[df.geom_type == "Polygon"]
     if len(df_polygon.index) > 0:
         df_polygon = df_polygon.h3.polyfill_resample(
@@ -90,207 +38,23 @@ def polyfill(
         )
         df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
 
-    df = pd.concat(
+    return pd.concat(
         map(
             lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
             [df_polygon, df_linestring],
         )
     )
 
-    if len(df.index) == 0:
-        # Polyfill resulted in empty output (e.g. large cell, small feature)
-        return None
-
-    df.index.rename(f"h3_{resolution:02}", inplace=True)
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    # Secondary (parent) H3 index, used later for partitioning
-    df.h3.h3_to_parent(parent_res).to_parquet(
-        PurePath(output_directory, pq_in.name), engine="auto", compression="ZSTD"
-    )
-    return None
-
-
-def polyfill_star(args) -> None:
-    return polyfill(*args)
-
-
-def _parent_partitioning(
-    input_dir: Path,
-    output_dir: Path,
-    resolution: int,
-    parent_res: Union[None, int],
-    **kwargs,
-) -> None:
-    parent_res: int = _get_parent_res(parent_res, resolution)
-    partition_col = f"h3_{parent_res:02}"
-
-    with TqdmCallback(desc="Repartitioning"):
-        dd.read_parquet(input_dir, engine="pyarrow").to_parquet(
-            output_dir,
-            overwrite=kwargs.get("overwrite", False),
-            engine=kwargs.get("engine", "pyarrow"),
-            partition_on=partition_col,
-            compression=kwargs.get("compression", "ZSTD"),
-        )
-    LOGGER.debug("Parent cell repartitioning complete")
-
-    # Rename output to just be the partition key, suffix .parquet
-    for f in os.listdir(output_dir):
-        os.rename(
-            os.path.join(output_dir, f),
-            os.path.join(output_dir, f.replace(f"{partition_col}=", "") + ".parquet"),
-        )
-
-    return
-
-
-def drop_condition(
-    df: pd.DataFrame,
-    drop_index: pd.Index,
-    log_statement: str,
-    warning_threshold: float = 0.01,
-):
-    LOGGER.info(log_statement)
-    _before = len(df)
-    df = df.drop(drop_index)
-    _after = len(df)
-    _diff = _before - _after
-    if _diff:
-        log_method = (
-            LOGGER.info if (_diff / float(_before)) < warning_threshold else LOGGER.warn
-        )
-        log_method(f"Dropped {_diff} rows ({_diff/float(_before)*100:.2f}%)")
-    return df
-
-
-def _index(
-    input_file: Union[Path, str],
-    output_directory: Union[Path, str],
-    resolution: int,
-    parent_res: Union[None, int],
-    keep_attributes: bool,
-    chunksize: int,
-    spatial_sorting: str,
-    cut_threshold: int,
-    processes: int,
-    id_field: str = None,
-    cut_crs: pyproj.CRS = None,
-    con: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine] = None,
-    table: str = None,
-    geom_col: str = "geom",
-    overwrite: bool = False,
-) -> Path:
-    """
-    Performs multi-threaded H3 polyfilling on (multi)polygons.
-    """
-
-    if table and con:
-        # Database connection
-        if keep_attributes:
-            q = sqlalchemy.text(f"SELECT * FROM {table}")
-        elif id_field and not keep_attributes:
-            q = sqlalchemy.text(f"SELECT {id_field}, {geom_col} FROM {table}")
-        else:
-            q = sqlalchemy.text(f"SELECT {geom_col} FROM {table}")
-        df = gpd.read_postgis(q, con.connect(), geom_col=geom_col).rename_geometry(
-            "geometry"
-        )
-    else:
-        # Read file
-        df = gpd.read_file(input_file)
-
-    if cut_crs:
-        df = df.to_crs(cut_crs)
-    LOGGER.info("Cutting with CRS: %s", df.crs)
-
-    if id_field:
-        df = df.set_index(id_field)
-    else:
-        df = df.reset_index()
-        df = df.rename(columns={"index": "fid"}).set_index("fid")
-
-    if not keep_attributes:
-        # Remove all attributes except the geometry
-        df = df.loc[:, ["geometry"]]
-
-    LOGGER.info("Cutting large geometries")
-    with tqdm(total=df.shape[0]) as pbar:
-        for index, row in df.iterrows():
-            df.loc[index, "geometry"] = GeometryCollection(
-                katana.katana(row.geometry, cut_threshold)
-            )
-            pbar.update(1)
-
-    LOGGER.info("Exploding geometry collections and multipolygons")
-    df = (
-        df.to_crs(4326)
-        .explode(index_parts=False)  # Explode from GeometryCollection
-        .explode(index_parts=False)  # Explode multipolygons to polygons
-    ).reset_index()
-
-    drop_conditions = [
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.is_empty | frame.geometry.isna())
-            ],
-            "message": "Dropping empty or null geometries",
-        },
-        {
-            "index": lambda frame: frame[
-                (frame.geometry.geom_type != "Polygon")
-                & (frame.geometry.geom_type != "LineString")
-            ],  # NB currently points and other types are lost; in principle, these could be indexed
-            "message": "Dropping unsupported geometries",
-        },
-    ]
-    for condition in drop_conditions:
-        df = drop_condition(df, condition["index"](df).index, condition["message"])
-
-    ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True)
-
-    LOGGER.info("Spatially sorting and partitioning (%s)", spatial_sorting)
-    ddf = ddf.spatial_shuffle(by=spatial_sorting)
-    spatial_sort_col = (
-        spatial_sorting
-        if spatial_sorting == "geohash"
-        else f"{spatial_sorting}_distance"
-    )
-
-    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
-        with TqdmCallback():
-            ddf.to_parquet(tmpdir, overwrite=True)
-
-        filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*")))
-
-        # Multithreaded polyfilling
-        LOGGER.info(
-            "H3 Indexing on spatial partitions by polyfill with H3 resolution: %d",
-            resolution,
-        )
-        with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
-            with Pool(processes=processes) as pool:
-                args = [
-                    (filepath, spatial_sort_col, resolution, parent_res, tmpdir2)
-                    for filepath in filepaths
-                ]
-                list(tqdm(pool.imap(polyfill_star, args), total=len(args)))
-
-            _parent_partitioning(
-                tmpdir2, output_directory, resolution, parent_res, overwrite=overwrite
-            )
-
-    return output_directory
-
 
 @click.command(context_settings={"show_default": True})
-@click_log.simple_verbosity_option(LOGGER)
+@click_log.simple_verbosity_option(common.LOGGER)
 @click.argument("vector_input", required=True, type=click.Path(), nargs=1)
 @click.argument("output_directory", required=True, type=click.Path(), nargs=1)
 @click.option(
     "-r",
     "--resolution",
     required=True,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 resolution to index",
     nargs=1,
 )
@@ -298,14 +62,14 @@ def _index(
     "-pr",
     "--parent_res",
     required=False,
-    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))),
     help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
 )
 @click.option(
     "-id",
     "--id_field",
     required=False,
-    default=None,
+    default=const.DEFAULTS["id"],
     type=str,
     help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
     nargs=1,
@@ -315,7 +79,7 @@ def _index(
     "--keep_attributes",
     is_flag=True,
     show_default=True,
-    default=False,
+    default=const.DEFAULTS["k"],
    help="Retain attributes in output. The default is to create an output that only includes H3 cell ID and the ID given by the -id field (or the default index ID).",
 )
 @click.option(
@@ -323,7 +87,7 @@ def _index(
     "--chunksize",
     required=True,
     type=int,
-    default=DEFAULT_CHUNK_SIZE,
+    default=const.DEFAULTS["ch"],
     help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
     nargs=1,
 )
@@ -331,14 +95,14 @@ def _index(
     "-s",
     "--spatial_sorting",
     type=click.Choice(["hilbert", "morton", "geohash"]),
-    default="hilbert",
+    default=const.DEFAULTS["s"],
     help="Spatial sorting method when perfoming spatial partitioning.",
 )
 @click.option(
     "-crs",
     "--cut_crs",
     required=False,
-    default=None,
+    default=const.DEFAULTS["crs"],
     type=int,
     help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
     nargs=1,
@@ -347,7 +111,7 @@ def _index(
     "-c",
     "--cut_threshold",
     required=True,
-    default=5000,
+    default=const.DEFAULTS["c"],
     type=int,
     help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
     nargs=1,
@@ -356,7 +120,7 @@ def _index(
     "-t",
     "--threads",
     required=False,
-    default=(multiprocessing.cpu_count() - 1),
+    default=const.DEFAULTS["t"],
     type=int,
     help="Amount of threads used for operation",
     nargs=1,
@@ -365,7 +129,7 @@ def _index(
     "-tbl",
     "--table",
     required=False,
-    default=None,
+    default=const.DEFAULTS["tbl"],
     type=str,
     help="Name of the table to read when using a spatial database connection as input",
     nargs=1,
@@ -374,14 +138,14 @@ def _index(
     "-g",
     "--geom_col",
     required=False,
-    default="geom",
+    default=const.DEFAULTS["g"],
     type=str,
     help="Column name to use when using a spatial database connection as input",
     nargs=1,
 )
 @click.option(
     "--tempdir",
-    default=tempfile.tempdir,
+    default=const.DEFAULTS["tempdir"],
     type=click.Path(),
     help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
 )
@@ -410,46 +174,21 @@ def h3(
     VECTOR_INPUT is the path to input vector geospatial data.
     OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
     """
-    tempfile.tempdir = tempdir
-    if parent_res is not None and not int(parent_res) < int(resolution):
-        raise ParentResolutionException(
-            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
-                pr=parent_res, r=resolution
-            )
-        )
-    con: sqlalchemy.engine.Connection = None
-    scheme: str = urlparse(vector_input).scheme
-    if bool(scheme) and scheme != "file":
-        # Assume database connection
-        con = sqlalchemy.create_engine(vector_input)
-    elif not Path(vector_input).exists():
-        if not scheme:
-            LOGGER.warning(
-                f"Input vector {vector_input} does not exist, and is not recognised as a remote URI"
-            )
-            raise FileNotFoundError(
-                errno.ENOENT, os.strerror(errno.ENOENT), vector_input
-            )
-        vector_input = str(vector_input)
-    else:
-        vector_input = Path(vector_input)
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
 
-    output_directory = Path(output_directory)
-    outputexists = os.path.exists(output_directory)
-    if outputexists and not overwrite:
-        raise FileExistsError(
-            f"{output_directory} already exists; if you want to overwrite this, use the -o/--overwrite flag"
-        )
-    elif outputexists and overwrite:
-        LOGGER.info(f"Overwriting the contents of {output_directory}")
-        shutil.rmtree(output_directory)
-    output_directory.mkdir(parents=True, exist_ok=True)
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
 
     if cut_crs is not None:
         cut_crs = pyproj.CRS.from_user_input(cut_crs)
 
     try:
-        _index(
+        common.index(
+            "h3",
+            h3polyfill,
+            h3_secondary_index,
             vector_input,
             output_directory,
             int(resolution),
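With the refactor, `h3polyfill` is a plain module-level function, so it can be exercised outside the CLI. A minimal sketch (not from the package; assumes `h3pandas` is installed and, as in the pipeline, the geometry is already exploded and in EPSG:4326):

```python
# Sketch: calling h3polyfill directly on a single-polygon GeoDataFrame.
import geopandas as gpd
from shapely.geometry import Polygon

from vector2dggs.h3 import h3polyfill

gdf = gpd.GeoDataFrame(
    {"name": ["example"]},
    geometry=[Polygon([(174.7, -36.9), (174.8, -36.9), (174.8, -36.8), (174.7, -36.8)])],
    crs=4326,
)
cells = h3polyfill(gdf, 7)  # one row per H3 cell covering the polygon, cell ID as index
print(len(cells))
```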
vector2dggs/rHP.py
ADDED
@@ -0,0 +1,217 @@
+import sys
+import click
+import click_log
+import tempfile
+import pyproj
+
+import rhppandas  # Necessary import despite lack of explicit use
+
+import pandas as pd
+import geopandas as gpd
+
+from typing import Union
+from pathlib import Path
+
+import vector2dggs.constants as const
+import vector2dggs.common as common
+
+from vector2dggs import __version__
+
+
+def rhp_secondary_index(df: gpd.GeoDataFrame, parent_res: int) -> gpd.GeoDataFrame:
+    return df.rhp.rhp_to_parent(parent_res)
+
+
+def rhppolyfill(df: gpd.GeoDataFrame, resolution: int):
+    df_polygon = df[df.geom_type == "Polygon"]
+    if len(df_polygon.index) > 0:
+        df_polygon = df_polygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    df_multipolygon = df[df.geom_type == "MultiPolygon"]
+    if len(df_multipolygon.index) > 0:
+        df_multipolygon = df_multipolygon.rhp.polyfill_resample(
+            resolution, return_geometry=False
+        ).drop(columns=["index"])
+
+    # df_linestring = df[df.geom_type == "LineString"]
+    # if len(df_linestring.index) > 0:
+    #     df_linestring = (
+    #         df_linestring.h3.linetrace(resolution)
+    #         .explode("h3_linetrace")
+    #         .set_index("h3_linetrace")
+    #     )
+    # df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")]
+
+    return pd.concat(
+        map(
+            lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])),
+            [df_polygon, df_multipolygon],  # df_linestring],
+        )
+    )
+
+
+@click.command(context_settings={"show_default": True})
+@click_log.simple_verbosity_option(common.LOGGER)
+@click.argument("vector_input", required=True, type=click.Path(), nargs=1)
+@click.argument("output_directory", required=True, type=click.Path(), nargs=1)
+@click.option(
+    "-r",
+    "--resolution",
+    required=True,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 resolution to index",
+    nargs=1,
+)
+@click.option(
+    "-pr",
+    "--parent_res",
+    required=False,
+    type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))),
+    help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
+)
+@click.option(
+    "-id",
+    "--id_field",
+    required=False,
+    default=const.DEFAULTS["id"],
+    type=str,
+    help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.",
+    nargs=1,
+)
+@click.option(
+    "-k",
+    "--keep_attributes",
+    is_flag=True,
+    show_default=True,
+    default=const.DEFAULTS["k"],
+    help="Retain attributes in output. The default is to create an output that only includes rHEALPix cell ID and the ID given by the -id field (or the default index ID).",
+)
+@click.option(
+    "-ch",
+    "--chunksize",
+    required=True,
+    type=int,
+    default=const.DEFAULTS["ch"],
+    help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.",
+    nargs=1,
+)
+@click.option(
+    "-s",
+    "--spatial_sorting",
+    type=click.Choice(["hilbert", "morton", "geohash"]),
+    default=const.DEFAULTS["s"],
+    help="Spatial sorting method when perfoming spatial partitioning.",
+)
+@click.option(
+    "-crs",
+    "--cut_crs",
+    required=False,
+    default=const.DEFAULTS["crs"],
+    type=int,
+    help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.",
+    nargs=1,
+)
+@click.option(
+    "-c",
+    "--cut_threshold",
+    required=True,
+    default=const.DEFAULTS["c"],
+    type=int,
+    help="Cutting up large geometries into smaller geometries based on a target length. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS.",
+    nargs=1,
+)
+@click.option(
+    "-t",
+    "--threads",
+    required=False,
+    default=const.DEFAULTS["t"],
+    type=int,
+    help="Amount of threads used for operation",
+    nargs=1,
+)
+@click.option(
+    "-tbl",
+    "--table",
+    required=False,
+    default=const.DEFAULTS["tbl"],
+    type=str,
+    help="Name of the table to read when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "-g",
+    "--geom_col",
+    required=False,
+    default=const.DEFAULTS["g"],
+    type=str,
+    help="Column name to use when using a spatial database connection as input",
+    nargs=1,
+)
+@click.option(
+    "--tempdir",
+    default=const.DEFAULTS["tempdir"],
+    type=click.Path(),
+    help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
+)
+@click.option("-o", "--overwrite", is_flag=True)
+@click.version_option(version=__version__)
+def rhp(
+    vector_input: Union[str, Path],
+    output_directory: Union[str, Path],
+    resolution: str,
+    parent_res: str,
+    id_field: str,
+    keep_attributes: bool,
+    chunksize: int,
+    spatial_sorting: str,
+    cut_crs: int,
+    cut_threshold: int,
+    threads: int,
+    table: str,
+    geom_col: str,
+    tempdir: Union[str, Path],
+    overwrite: bool,
+):
+    """
+    Ingest a vector dataset and index it to the rHEALPix DGGS.
+
+    VECTOR_INPUT is the path to input vector geospatial data.
+    OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
+    """
+    tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir
+
+    common.check_resolutions(resolution, parent_res)
+
+    con, vector_input = common.db_conn_and_input_path(vector_input)
+    output_directory = common.resolve_output_path(output_directory, overwrite)
+
+    if cut_crs is not None:
+        cut_crs = pyproj.CRS.from_user_input(cut_crs)
+
+    try:
+        common.index(
+            "rhp",
+            rhppolyfill,
+            rhp_secondary_index,
+            vector_input,
+            output_directory,
+            int(resolution),
+            parent_res,
+            keep_attributes,
+            chunksize,
+            spatial_sorting,
+            cut_threshold,
+            threads,
+            cut_crs=cut_crs,
+            id_field=id_field,
+            con=con,
+            table=table,
+            geom_col=geom_col,
+            overwrite=overwrite,
+        )
+    except:
+        raise
+    else:
+        sys.exit(0)
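Since `rhp` reuses `common.index`, its invocation mirrors the `h3` command shown in the README; something like the following (hypothetical paths and values; only polygons and multipolygons will be indexed):

```bash
# Hypothetical invocation; the flags correspond to the click options above
vector2dggs rhp -r 9 -id ogc_fid -t 4 --overwrite lakes.gpkg ./lakes_rhp_9.parquet
```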
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/METADATA
CHANGED
@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: vector2dggs
-Version: 0.6.1
+Version: 0.6.3
 Summary: CLI DGGS indexer for vector geospatial data
 Home-page: https://github.com/manaakiwhenua/vector2dggs
 License: LGPL-3.0-or-later
-Keywords: dggs,vector,h3,cli
+Keywords: dggs,vector,h3,rHEALPix,cli
 Author: James Ardo
 Author-email: ardoj@landcareresearch.co.nz
 Maintainer: Richard Law
@@ -13,23 +13,27 @@ Requires-Python: >=3.11,<4.0
 Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Scientific/Engineering :: GIS
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: click-log (>=0.4.0,<0.5.0)
-Requires-Dist: dask (>=
-Requires-Dist: dask-geopandas (>=0.4
-Requires-Dist: gdal (
+Requires-Dist: dask (>=2025.1,<2026.0)
+Requires-Dist: dask-geopandas (>=0.4,<0.5)
+Requires-Dist: gdal (>=3.8,<4.0)
 Requires-Dist: geopandas (>=1.0.1,<2.0.0)
-Requires-Dist: h3pandas (>=0.
-Requires-Dist: numpy (
+Requires-Dist: h3pandas (>=0.3,<0.4)
+Requires-Dist: numpy (>=2,<3)
+Requires-Dist: pillow (>=11.2.1,<12.0.0)
 Requires-Dist: psycopg2 (>=2.9.9,<3.0.0)
-Requires-Dist: pyarrow (>=
-Requires-Dist: pyproj (>=3.
-Requires-Dist:
+Requires-Dist: pyarrow (>=20.0,<21.0)
+Requires-Dist: pyproj (>=3.7,<4.0)
+Requires-Dist: rhealpixdggs (>=0.5.5,<0.6.0)
+Requires-Dist: rhppandas (>=0.1.2,<0.2.0)
+Requires-Dist: shapely (>=2.1,<3.0)
 Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
-Requires-Dist: tqdm (>=4.
+Requires-Dist: tqdm (>=4.67,<5.0)
 Project-URL: Repository, https://github.com/manaakiwhenua/vector2dggs
 Description-Content-Type: text/markdown
 
@@ -41,9 +45,12 @@ Python-based CLI tool to index raster files to DGGS in parallel, writing out to
 
 This is the vector equivalent of [raster2dggs](https://github.com/manaakiwhenua/raster2dggs).
 
-Currently
+Currently this tool supports the following DGGSs:
 
-
+- H3 (polygons, linestrings)
+- rHEALPix (polygons)
+
+Contributions (espeically for other DGGSs), suggestions, bug reports and strongly worded letters are all welcome.
 
 
 
@@ -55,6 +62,19 @@ pip install vector2dggs
 
 ## Usage
 
+```bash
+vector2dggs --help [11:22:14]
+Usage: vector2dggs [OPTIONS] COMMAND [ARGS]...
+
+Options:
+  --version  Show the version and exit.
+  --help     Show this message and exit.
+
+Commands:
+  h3   Ingest a vector dataset and index it to the H3 DGGS.
+  rhp  Ingest a vector dataset and index it to the rHEALPix DGGS.
+```
+
 ```bash
 vector2dggs h3 --help
 Usage: vector2dggs h3 [OPTIONS] VECTOR_INPUT OUTPUT_DIRECTORY
@@ -153,13 +173,13 @@ In brief, to get started:
 - Install [Poetry](https://python-poetry.org/docs/basic-usage/)
 - Install [GDAL](https://gdal.org/)
   - If you're on Windows, `pip install gdal` may be necessary before running the subsequent commands.
-  - On Linux, install GDAL 3.
+  - On Linux, install GDAL 3.8+ according to your platform-specific instructions, including development headers, i.e. `libgdal-dev`.
 - Create the virtual environment with `poetry init`. This will install necessary dependencies.
 - Subsequently, the virtual environment can be re-activated with `poetry shell`.
 
 If you run `poetry install`, the CLI tool will be aliased so you can simply use `vector2dggs` rather than `poetry run vector2dggs`, which is the alternative if you do not `poetry install`.
 
-
+Alternatively, it is also possible to install using pip with `pip install -e .`, and bypass Poetry.
 
 #### Code formatting
 
@@ -189,14 +209,14 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
     title={{vector2dggs}},
     author={Ardo, James and Law, Richard},
     url={https://github.com/manaakiwhenua/vector2dggs},
-    version={0.6.1},
+    version={0.6.3},
     date={2023-04-20}
 }
 ```
 
 APA/Harvard
 
-> Ardo, J., & Law, R. (2023). vector2dggs (0.6.1) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.6.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 
 [](https://github.com/manaakiwhenua/manaakiwhenua-standards)
 
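The dependency floor changes above (e.g. dask, pyarrow, numpy 2) mean 0.6.3 may force substantial upgrades in an existing environment, so pinning the exact release makes the change deliberate:

```bash
# Pin the release under review (ordinary pip usage, nothing package-specific)
pip install "vector2dggs==0.6.3"
```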
vector2dggs-0.6.3.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+vector2dggs/__init__.py,sha256=75x2g9bnxuIHCJ-8fmoB5K2n_V2ayIxdRNvYsNUaMhs,27
+vector2dggs/cli.py,sha256=HoPp7Bwk2kZghAms6wNepx-bFhoAuHH7WXACMIy3MuM,652
+vector2dggs/common.py,sha256=DL3ohG-QQyI-phyxeO6Fi2BOwWnFct-I_Y87_XC2SRQ,10578
+vector2dggs/constants.py,sha256=u6n6XNvEVLUexn9Sb2rc22s2B4Rrg_VXFJaM7uEy-9Q,536
+vector2dggs/h3.py,sha256=GgiGOVbsXXNp95KWKKmJZvDxGFj91TTWl575OaPZ6yk,6145
+vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
+vector2dggs/rHP.py,sha256=Y36tPbtY-tYBUFILHD-xnUxa2yKlYotGP6043Bg5nZc,6450
+vector2dggs-0.6.3.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+vector2dggs-0.6.3.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+vector2dggs-0.6.3.dist-info/METADATA,sha256=sFXXSXOutJzbTLRviHXXn7RkN51SCfbD6ZRBeledcmY,10223
+vector2dggs-0.6.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+vector2dggs-0.6.3.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
+vector2dggs-0.6.3.dist-info/RECORD,,
vector2dggs-0.6.1.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-vector2dggs/__init__.py,sha256=8sqy-gBl8LCgOL8GSQBSJg6UWO0eWPpvb3gdHmGQvbg,27
-vector2dggs/cli.py,sha256=tL4NJ99uQsqoVinwYadna1a4ko5v2sdZaFaeDAj6QNE,599
-vector2dggs/h3.py,sha256=AMH9VdspvKu26VhFmuWf48xm4VEDKxmNuvOeb_I2nmI,14310
-vector2dggs/katana.py,sha256=pgVWy032NkT5yilUO0d0IKH4NUvY7DJLjmfsxhBiF08,3407
-vector2dggs-0.6.1.dist-info/COPYING,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-vector2dggs-0.6.1.dist-info/COPYING.LESSER,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-vector2dggs-0.6.1.dist-info/METADATA,sha256=djCsEsjEqHZp2iTyMhBaTVOs0VtNAORURR7s9N8cs0U,9846
-vector2dggs-0.6.1.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-vector2dggs-0.6.1.dist-info/entry_points.txt,sha256=5h8LB9L2oOE5u_N7FRGtu4JDwa553iPs4u0XhcLeLZU,52
-vector2dggs-0.6.1.dist-info/RECORD,,
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING
File without changes
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/COPYING.LESSER
File without changes
{vector2dggs-0.6.1.dist-info → vector2dggs-0.6.3.dist-info}/entry_points.txt
File without changes