rq_geo_toolkit 2025.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. rq_geo_toolkit-2025.4.0/PKG-INFO +22 -0
  2. rq_geo_toolkit-2025.4.0/README.md +1 -0
  3. rq_geo_toolkit-2025.4.0/pyproject.toml +78 -0
  4. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/__init__.py +1 -0
  5. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/_exceptions.py +1 -0
  6. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/_geopandas_api_version.py +4 -0
  7. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/constants.py +5 -0
  8. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/duckdb.py +29 -0
  9. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/geocode.py +70 -0
  10. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/geoparquet_compression.py +187 -0
  11. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/geoparquet_sorting.py +314 -0
  12. rq_geo_toolkit-2025.4.0/rq_geo_toolkit/rich_utils.py +6 -0
  13. rq_geo_toolkit-2025.4.0/tests/__init__.py +0 -0
  14. rq_geo_toolkit-2025.4.0/tests/conftest.py +24 -0
  15. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/062dda15f8838576019a50b01bdf29cc5046008bbfcfaa292350ccd382b51ec9.json +1 -0
  16. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/0f106a896123036ea3127fe6cc4fd4cca5a3648c.json +1 -0
  17. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/11f5ee14b142c0dd7044269acdccaa83f7fec5f0d211a7df2413931d367fa7f3.json +1 -0
  18. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/123221ee49d7ad51158c1ff8e648c4aa910247db.json +1 -0
  19. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/2061ddb327c93e1264c93c5d3b8c666765bc3cf3.json +1 -0
  20. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/28885ee83bb696765e29c4369d9a30e4906066b7.json +1 -0
  21. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/2de3f64cc4643acc0e573956e10cff5f99912308.json +1 -0
  22. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/3e340be4de18e6f6df4404109ac61d3329ab3b73b8745b3d3cac74c41919cd20.json +1 -0
  23. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/6af732d2784f1a7414e32a72e9335557c2eb34d7b43cb49bd2633ed720a66dbc.json +1 -0
  24. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/8c1c66a7ccfe126587800ffae960e60c47c5a3a0377713808ec01f29f410929e.json +1 -0
  25. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512ff098eaa56a4d5fb19.json +1 -0
  26. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/935018cc4153102a020fc749e7565e9df5c51a4ddeba6f74694f0ebac1ebe5c6.json +1 -0
  27. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/a66b6c078db27b114adbc579902d58b3bb8a8c471f33c69071f9a4bb197f1393.json +1 -0
  28. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/c229000b41b163c7d643397a0ee62bf332c16826.json +1 -0
  29. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/c60a970e61adaa2132e455f6728b0da92bb15ade.json +1 -0
  30. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/e9c722cbefc2f055ae60b4e2cbe73a2d99537eab0c37f3bc2dd9e0854278b970.json +1 -0
  31. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/eba02a989b1016eb1a39b907c1a7de66d0303226.json +1 -0
  32. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/ef428fa2750ed260ac05d964b13ad8468b2065840b7ef9bd37cc8c600c32a785.json +1 -0
  33. rq_geo_toolkit-2025.4.0/tests/test_files/geocoding_cache/fd7c2180ad72034cc0f5d3059237faef308318ddb3e8f4ec14519eefe730e021.json +1 -0
  34. rq_geo_toolkit-2025.4.0/tests/test_geocoding.py +49 -0
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.1
2
+ Name: rq_geo_toolkit
3
+ Version: 2025.4.0
4
+ Summary: Collection of geo related functions for reuse in other libraries
5
+ Author: Kamil Raczycki
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: geopandas>=1.0
9
+ Requires-Dist: shapely>=2.0.6
10
+ Requires-Dist: pyarrow>=16.0.0
11
+ Requires-Dist: rich>=12.0.0
12
+ Requires-Dist: geoarrow-rust-core>=0.3.0
13
+ Requires-Dist: pooch>=1.6.0
14
+ Requires-Dist: geopy>=2.0.0
15
+ Requires-Dist: numpy>=1.26.0
16
+ Requires-Dist: duckdb>=1.1.2
17
+ Requires-Dist: psutil>=5.6.2
18
+ Requires-Dist: polars>=1.9
19
+ Requires-Dist: packaging>=17.0
20
+ Description-Content-Type: text/markdown
21
+
22
+ # RQ Geo Toolkit
@@ -0,0 +1 @@
1
+ # RQ Geo Toolkit
@@ -0,0 +1,78 @@
1
+ [project]
2
+ name = "rq_geo_toolkit"
3
+ version = "2025.4.0"
4
+ description = "Collection of geo related functions for reuse in other libraries"
5
+ authors = [
6
+ { name = "Kamil Raczycki", email = "" },
7
+ ]
8
+ dependencies = [
9
+ "geopandas>=1.0",
10
+ "shapely>=2.0.6",
11
+ "pyarrow>=16.0.0",
12
+ "rich>=12.0.0",
13
+ "geoarrow-rust-core>=0.3.0",
14
+ "pooch>=1.6.0",
15
+ "geopy>=2.0.0",
16
+ "numpy>=1.26.0",
17
+ "duckdb>=1.1.2",
18
+ "psutil>=5.6.2",
19
+ "polars>=1.9",
20
+ "packaging>=17.0",
21
+ ]
22
+ requires-python = ">=3.9"
23
+ readme = "README.md"
24
+
25
+ [project.license]
26
+ text = "MIT"
27
+
28
+ [build-system]
29
+ requires = [
30
+ "pdm-backend",
31
+ ]
32
+ build-backend = "pdm.backend"
33
+
34
+ [tool.pdm]
35
+ distribution = true
36
+
37
+ [tool.pdm.dev-dependencies]
38
+ dev = [
39
+ "bumpver",
40
+ "types-requests",
41
+ "setuptools>=45.0.0",
42
+ ]
43
+ lint = [
44
+ "pre-commit>=4",
45
+ "mypy>=1",
46
+ "docformatter[tomli]",
47
+ "ruff>=0.1.0",
48
+ ]
49
+ test = [
50
+ "pytest>=7.0.0",
51
+ "tox-pdm>=0.7.2",
52
+ "pytest-mock>=3.3.0",
53
+ "requests-mock>=1.12.1",
54
+ "pytest-check>=2.3.1",
55
+ "pytest-parametrization>=2022.2.1",
56
+ "pytest-doctestplus>=1.2.1",
57
+ "osmnx>=1.3.0",
58
+ ]
59
+
60
+ [tool.pdm.scripts]
61
+ post_install = "pre-commit install"
62
+
63
+ [tool.bumpver]
64
+ current_version = "2025.4.0"
65
+ version_pattern = "YYYY.MM.INC0"
66
+ commit_message = "chore(CI/CD): bump version {old_version} -> {new_version}"
67
+ commit = true
68
+ tag = false
69
+ push = false
70
+
71
+ [tool.bumpver.file_patterns]
72
+ "pyproject.toml" = [
73
+ "^current_version = \"{version}\"$",
74
+ "^version = \"{version}\"$",
75
+ ]
76
+ "rq_geo_toolkit/__init__.py" = [
77
+ "^__version__ = \"{version}\"$",
78
+ ]
@@ -0,0 +1 @@
1
+ __version__ = "2025.4.0"
@@ -0,0 +1 @@
1
class QueryNotGeocodedError(ValueError):
    """Raised when a geocoding query cannot be resolved to a polygon geometry."""
@@ -0,0 +1,4 @@
1
+ import geopandas as gpd
2
+ from packaging import version
3
+
4
# True when the installed GeoPandas version is 1.0.0 or newer.
+ GEOPANDAS_NEW_API = version.parse(gpd.__version__) >= version.parse("1.0.0")
@@ -0,0 +1,5 @@
1
# NOTE(review): presumably the default geometry column name; not referenced in the visible code.
+ GEOMETRY_COLUMN = "geometry"
2
+
3
# Parquet writer settings interpolated into DuckDB's COPY ... TO options
# (ROW_GROUP_SIZE, COMPRESSION, COMPRESSION_LEVEL) by the compression module.
+ PARQUET_ROW_GROUP_SIZE = 100_000
4
+ PARQUET_COMPRESSION = "zstd"
5
+ PARQUET_COMPRESSION_LEVEL = 3
@@ -0,0 +1,29 @@
1
+ """Helper functions for DuckDB."""
2
+
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import duckdb
7
+
8
+
9
def sql_escape(value: str) -> str:
    """Double every single quote so ``value`` can sit inside a SQL string literal."""
    return "''".join(value.split("'"))
12
+
13
+
14
def set_up_duckdb_connection(
    tmp_dir_path: Union[str, Path], preserve_insertion_order: bool = False
) -> "duckdb.DuckDBPyConnection":
    """Create a file-backed DuckDB connection with the spatial extension loaded.

    Args:
        tmp_dir_path (Union[str, Path]): Directory in which the ``db.duckdb``
            database file is created.
        preserve_insertion_order (bool, optional): Value passed to DuckDB's
            ``preserve_insertion_order`` configuration option. Defaults to False.

    Returns:
        duckdb.DuckDBPyConnection: Open connection with progress bars disabled
            and the ``spatial`` extension installed and loaded.
    """
    database_path = Path(tmp_dir_path) / "db.duckdb"
    connection = duckdb.connect(
        database=str(database_path),
        config={"preserve_insertion_order": preserve_insertion_order},
    )

    try:
        connection.sql("SET enable_progress_bar = false;")
        connection.sql("SET enable_progress_bar_print = false;")

        connection.install_extension("spatial")
        connection.load_extension("spatial")
    except Exception:
        # Do not leave the database file open when the set-up fails
        # (e.g. the extension cannot be installed).
        connection.close()
        raise

    return connection
@@ -0,0 +1,70 @@
1
+ """Geocoding module for getting a geometry from query using Nominatim."""
2
+
3
+ import hashlib
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Union, cast, overload
7
+
8
+ from geopy.geocoders.nominatim import Nominatim
9
+ from geopy.location import Location
10
+ from shapely.geometry import shape
11
+ from shapely.geometry.base import BaseGeometry
12
+ from shapely.ops import unary_union
13
+
14
+ from rq_geo_toolkit._exceptions import QueryNotGeocodedError
15
+
16
+ USER_AGENT = "RQ Geo Toolkit Python package (https://github.com/kraina-ai/rq_geo_toolkit)"
17
+
18
@overload
def geocode_to_geometry(query: str) -> BaseGeometry: ...


@overload
def geocode_to_geometry(query: list[str]) -> BaseGeometry: ...


def geocode_to_geometry(query: Union[str, list[str]]) -> BaseGeometry:
    """Resolve a query (or a list of queries) to a polygon geometry using Nominatim.

    A list of queries is geocoded element by element and the results are merged
    with ``unary_union``. For a single query, the selected GeoJSON geometry is
    stored in ``cache/<sha256 of the query>.json`` (relative to the current
    working directory) and loaded from there on later calls instead of asking
    Nominatim again.

    Raises:
        QueryNotGeocodedError: If Nominatim returns no results, or none of the
            results is a Polygon or MultiPolygon.
    """
    if not isinstance(query, str):
        geometries = [geocode_to_geometry(single_query) for single_query in query]
        return unary_union(geometries)

    digest = hashlib.sha256(query.encode()).hexdigest()
    cache_file = Path("cache").resolve() / f"{digest}.json"

    if cache_file.exists():
        geojson = json.loads(cache_file.read_text())
    else:
        geocoder = Nominatim(user_agent=USER_AGENT)
        candidates = geocoder.geocode(query, geometry="geojson", exactly_one=False)

        if not candidates:
            raise QueryNotGeocodedError(f"Zero results from Nominatim for query '{query}'.")

        geojson = _get_first_polygon(candidates)

        if not geojson:
            raise QueryNotGeocodedError(f"No polygon found for query '{query}'.")

        # Persist the chosen geometry so the same query is not sent again
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(json.dumps(geojson))

    return unary_union(shape(geojson))
55
+
56
+
57
def _get_first_polygon(results: list[Location]) -> Optional[dict[str, Any]]:
    """Return the GeoJSON dict of the first (Multi)Polygon result, if any.

    Results are inspected in order through ``result.raw["geojson"]``; ``None``
    is returned when no result has the type ``Polygon`` or ``MultiPolygon``.

    Inspired by OSMnx implementation.
    """
    accepted_types = {"Polygon", "MultiPolygon"}

    geojson_dicts = (cast(dict[str, Any], location.raw["geojson"]) for location in results)
    return next(
        (geojson for geojson in geojson_dicts if geojson["type"] in accepted_types),
        None,
    )
@@ -0,0 +1,187 @@
1
+ """Module for compressing GeoParquet files."""
2
+
3
+ import multiprocessing
4
+ import tempfile
5
+ from collections.abc import Callable
6
+ from functools import partial
7
+ from math import ceil
8
+ from pathlib import Path
9
+ from time import sleep
10
+ from typing import TYPE_CHECKING, Any, Optional, Union
11
+
12
+ import duckdb
13
+ import psutil
14
+ import pyarrow.parquet as pq
15
+ from rich import print as rprint
16
+
17
+ from rq_geo_toolkit.constants import PARQUET_COMPRESSION, PARQUET_COMPRESSION_LEVEL, PARQUET_ROW_GROUP_SIZE
18
+ from rq_geo_toolkit.duckdb import set_up_duckdb_connection
19
+
20
+ if TYPE_CHECKING: # pragma: no cover
21
+ from rq_geo_toolkit.rich_utils import VERBOSITY_MODE
22
+
23
+ MEMORY_1GB = 1024**3
24
+
25
+
26
def compress_parquet_with_duckdb(
    input_file_path: Path,
    output_file_path: Path,
    working_directory: Union[str, Path] = "files",
    parquet_metadata: Optional[pq.FileMetaData] = None,
    verbosity_mode: "VERBOSITY_MODE" = "transient",
) -> Path:
    """Compress a GeoParquet file while keeping its key-value metadata.

    The file is rewritten with DuckDB in a separate process; resource limits
    are lowered and the operation retried when it runs out of memory. An input
    file with zero rows is not rewritten: it is moved (renamed) to
    ``output_file_path``, so the input path no longer exists afterwards.

    Args:
        input_file_path (Path): Input GeoParquet file path.
        output_file_path (Path): Output GeoParquet file path. Must differ from
            ``input_file_path``.
        working_directory (Union[str, Path], optional): Directory in which the
            temporary working files are created. It is created if missing.
            Defaults to "files".
        parquet_metadata (Optional[pq.FileMetaData], optional): GeoParquet file metadata used to
            copy. If not provided, will load the metadata from the input file. Defaults to None.
        verbosity_mode (Literal["silent", "transient", "verbose"], optional): Set progress
            verbosity mode. Can be one of: silent, transient and verbose. Silent disables
            output completely. Transient tracks progress, but removes output after finished.
            Verbose leaves all progress outputs in the stdout. Defaults to "transient".

    Returns:
        Path: Path of the written (or moved) output file.
    """
    assert (
        input_file_path.resolve().as_posix() != output_file_path.resolve().as_posix()
    ), "Input and output file paths must be different."

    working_directory_path = Path(working_directory)
    working_directory_path.mkdir(parents=True, exist_ok=True)

    # Read the footer once: it serves both the emptiness check and, when no
    # metadata was passed in, as the source of the key-value metadata to copy.
    input_metadata = pq.read_metadata(input_file_path)

    if input_metadata.num_rows == 0:
        return input_file_path.rename(output_file_path)

    with tempfile.TemporaryDirectory(dir=working_directory_path.resolve()) as tmp_dir_name:
        tmp_dir_path = Path(tmp_dir_name)

        original_metadata_string = _parquet_schema_metadata_to_duckdb_kv_metadata(
            parquet_metadata or input_metadata
        )

        _run_query_with_memory_limit(
            tmp_dir_path=tmp_dir_path,
            verbosity_mode=verbosity_mode,
            current_memory_gb_limit=None,
            current_threads_limit=None,
            function=_compress_with_memory_limit,
            args=(input_file_path, output_file_path, original_metadata_string),
        )

    return output_file_path
71
+
72
+
73
def _compress_with_memory_limit(
    input_file_path: Union[list[Path], Path],
    output_file_path: Path,
    original_metadata_string: str,
    current_memory_gb_limit: float,
    current_threads_limit: int,
    tmp_dir_path: Path,
) -> None:
    """Rewrite Parquet input as a compressed Parquet file using DuckDB.

    Args:
        input_file_path (Union[list[Path], Path]): One Parquet file, or a list
            of files that are read together.
        output_file_path (Path): Destination Parquet file.
        original_metadata_string (str): DuckDB struct literal passed as
            ``KV_METADATA`` of the written file.
        current_memory_gb_limit (float): Value for DuckDB's ``memory_limit``, in GB.
        current_threads_limit (int): Value for DuckDB's ``threads`` setting.
        tmp_dir_path (Path): Directory in which the DuckDB database file is created.
    """

    def _sql_string(path: Path) -> str:
        # Double single quotes so a path containing "'" cannot break the literal
        return "'" + str(path).replace("'", "''") + "'"

    if isinstance(input_file_path, Path):
        sql_input_str = _sql_string(input_file_path)
    else:
        mapped_paths = ", ".join(_sql_string(path) for path in input_file_path)
        sql_input_str = f"[{mapped_paths}]"

    connection = set_up_duckdb_connection(tmp_dir_path, preserve_insertion_order=True)

    try:
        # Existing metadata is copied verbatim through KV_METADATA below
        connection.execute("SET enable_geoparquet_conversion = false;")
        connection.execute(f"SET memory_limit = '{current_memory_gb_limit}GB';")
        connection.execute(f"SET threads = {current_threads_limit};")

        connection.execute(
            f"""
            COPY (
                SELECT original_data.*
                FROM read_parquet({sql_input_str}, hive_partitioning=false) original_data
            ) TO {_sql_string(output_file_path)} (
                FORMAT parquet,
                COMPRESSION {PARQUET_COMPRESSION},
                COMPRESSION_LEVEL {PARQUET_COMPRESSION_LEVEL},
                ROW_GROUP_SIZE {PARQUET_ROW_GROUP_SIZE},
                KV_METADATA {original_metadata_string}
            );
            """
        )
    finally:
        # Release the database file even when the query fails (e.g. out of memory)
        connection.close()
109
+
110
+
111
def _run_query_with_memory_limit(
    tmp_dir_path: Path,
    verbosity_mode: "VERBOSITY_MODE",
    current_memory_gb_limit: Optional[float],
    current_threads_limit: Optional[int],
    function: Callable[..., None],
    args: Any,
) -> tuple[float, int]:
    """Run ``function`` in a worker process, lowering resource limits after each memory failure.

    ``function`` is called in a freshly spawned process as
    ``function(*args, current_memory_gb_limit=..., current_threads_limit=...,
    tmp_dir_path=...)``, where the passed ``tmp_dir_path`` is a temporary
    directory created inside the given one for that attempt. While the call
    runs, system memory usage is polled every 0.5 seconds and the attempt is
    aborted with ``MemoryError`` when usage exceeds the threshold: 95 %, or
    "all but 1 GB" on machines where 5 % of total memory is more than 1 GB.
    After a memory failure both limits are halved and the call is retried.

    Args:
        tmp_dir_path (Path): Directory in which per-attempt temporary
            directories are created.
        verbosity_mode (Literal["silent", "transient", "verbose"]): Retry
            messages are printed unless the mode is "silent".
        current_memory_gb_limit (Optional[float]): Starting memory limit in GB.
            Falls back to the total system memory (rounded up) when not set.
        current_threads_limit (Optional[int]): Starting thread limit. Falls
            back to the CPU count when not set.
        function (Callable[..., None]): Callable executed in the worker process.
        args (Any): Positional arguments passed to ``function``.

    Returns:
        tuple[float, int]: Memory limit (GB) and thread limit of the attempt
            that succeeded.

    Raises:
        RuntimeError: If the call still runs out of memory with a limit below 1 GB.
    """
    current_memory_gb_limit = current_memory_gb_limit or ceil(
        psutil.virtual_memory().total / MEMORY_1GB
    )
    current_threads_limit = current_threads_limit or multiprocessing.cpu_count()

    while current_memory_gb_limit > 0:
        try:
            with (
                tempfile.TemporaryDirectory(dir=Path(tmp_dir_path).resolve()) as tmp_dir_name,
                # A single task is submitted per attempt, so one worker is enough;
                # the default would spawn one idle process per CPU.
                multiprocessing.get_context("spawn").Pool(processes=1) as pool,
            ):
                nested_tmp_dir_path = Path(tmp_dir_name)
                r = pool.apply_async(
                    func=partial(
                        function,
                        current_memory_gb_limit=current_memory_gb_limit,
                        current_threads_limit=current_threads_limit,
                        tmp_dir_path=nested_tmp_dir_path,
                    ),
                    args=args,
                )
                actual_memory = psutil.virtual_memory()
                percentage_threshold = 95
                if (actual_memory.total * 0.05) > MEMORY_1GB:
                    # On large machines abort only when less than 1 GB is left
                    percentage_threshold = (
                        100 * (actual_memory.total - MEMORY_1GB) / actual_memory.total
                    )
                while not r.ready():
                    actual_memory = psutil.virtual_memory()
                    if actual_memory.percent > percentage_threshold:
                        # Leaving the ``with`` block terminates the worker
                        raise MemoryError()

                    sleep(0.5)
                # Re-raises any exception raised inside the worker
                r.get()
            return current_memory_gb_limit, current_threads_limit
        except (duckdb.OutOfMemoryException, MemoryError) as ex:
            if current_memory_gb_limit < 1:
                raise RuntimeError(
                    "Not enough memory to run the ordering query. Please rerun without sorting."
                ) from ex

            if current_memory_gb_limit == 1:
                # Allow one final attempt with half a gigabyte
                current_memory_gb_limit /= 2
            else:
                current_memory_gb_limit = ceil(current_memory_gb_limit / 2)

            current_threads_limit = ceil(current_threads_limit / 2)

            if verbosity_mode != "silent":
                rprint(
                    f"Encountered {ex.__class__.__name__} during operation."
                    " Retrying with lower number of resources"
                    f" ({current_memory_gb_limit:.2f}GB, {current_threads_limit} threads)."
                )

    raise RuntimeError("Not enough memory to run the query. Please rerun without sorting.")
175
+
176
+
177
def _parquet_schema_metadata_to_duckdb_kv_metadata(parquet_file_metadata: pq.FileMetaData) -> str:
    """Render Parquet key-value metadata as a DuckDB struct literal.

    Keys and values are decoded from bytes, single quotes are doubled, and the
    pairs are joined into a ``{ 'key': 'value', ... }`` string, which the
    compression query passes as the ``KV_METADATA`` option.
    """

    def as_sql_literal(raw: bytes) -> str:
        return "'" + raw.decode().replace("'", "''") + "'"

    entries = [
        f"{as_sql_literal(raw_key)}: {as_sql_literal(raw_value)}"
        for raw_key, raw_value in parquet_file_metadata.metadata.items()
    ]

    return "{ " + ", ".join(entries) + " }"