ssb-sgis 1.2.0.tar.gz → 1.2.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/PKG-INFO +1 -1
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/pyproject.toml +1 -1
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/__init__.py +1 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/conf.py +13 -47
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/buffer_dissolve_explode.py +2 -1
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/duplicates.py +4 -3
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/general.py +61 -30
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/neighbors.py +25 -4
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/overlay.py +4 -3
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/polygon_operations.py +3 -2
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/runners.py +94 -45
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/sfilter.py +4 -2
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/dapla_functions.py +98 -106
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/LICENSE +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/README.md +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/debug_config.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/exceptions.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/bounds.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/centerlines.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/cleaning.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/conversion.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/geocoding.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/geometry_types.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/point_operations.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/polygons_as_rings.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/utils.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/helpers.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/_is_dapla.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/opener.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/read_parquet.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/examine.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/explore.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/httpserver.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/legend.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/map.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/maps.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/norge_i_bilder.json +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/thematicmap.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/tilesources.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/wms.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_get_route.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_od_cost_matrix.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_points.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_service_area.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/closing_network_holes.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/cutting_lines.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/directednetwork.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/finding_isolated_networks.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/network.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/networkanalysis.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/networkanalysisrules.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/nodes.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/traveling_salesman.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/parallel/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/parallel/parallel.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/py.typed +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/__init__.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/base.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/image_collection.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/indices.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/regex.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/sentinel_config.py +0 -0
- {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/zonal.py +0 -0
--- ssb_sgis-1.2.0/src/sgis/__init__.py
+++ ssb_sgis-1.2.2/src/sgis/__init__.py
@@ -42,6 +42,7 @@ from .geopandas_tools.general import make_lines_between_points
 from .geopandas_tools.general import points_in_bounds
 from .geopandas_tools.general import random_points
 from .geopandas_tools.general import random_points_in_polygons
+from .geopandas_tools.general import random_points_norway
 from .geopandas_tools.general import sort_large_first
 from .geopandas_tools.general import sort_long_first
 from .geopandas_tools.general import sort_short_first
--- ssb_sgis-1.2.0/src/sgis/conf.py
+++ ssb_sgis-1.2.2/src/sgis/conf.py
@@ -1,4 +1,3 @@
-from collections.abc import Iterable
 from typing import Any
 
 try:
@@ -74,51 +73,18 @@ from .geopandas_tools.runners import RTreeQueryRunner
 from .geopandas_tools.runners import UnionRunner
 
 
-
-    """
+def _get_instance(data: dict, key: str, **kwargs) -> Any:
+    """Get the dict value and call it if callable."""
+    x = data[key]
+    if callable(x):
+        return x(**kwargs)
+    return x
 
-    def __init__(self, data: dict) -> None:
-        """Initialise with dict."""
-        self.data = data
 
-
-
-
-
-
-
-
-    def __getattr__(self, attr: str) -> Any:
-        """Get dict attribute."""
-        return getattr(self.data, attr)
-
-    def __getitem__(self, key: str) -> Any:
-        """Get dict value."""
-        return self.data[key]
-
-    def __setitem__(self, key: str, value) -> None:
-        """Set dict value."""
-        self.data[key] = value
-
-    def __iter__(self) -> Iterable[str]:
-        """Iterate over dict keys."""
-        return iter(self.data)
-
-    def __len__(self) -> int:
-        """Length of dict."""
-        return len(self.data)
-
-    def __str__(self) -> str:
-        """String representation of dict."""
-        return str(self.data)
-
-
-config = Config(
-    {
-        "n_jobs": 1,
-        "file_system": file_system,
-        "rtree_runner": RTreeQueryRunner,
-        "overlay_runner": OverlayRunner,
-        "union_runner": UnionRunner,
-    }
-)
+config = {
+    "n_jobs": 1,
+    "file_system": file_system,
+    "rtree_runner": RTreeQueryRunner,
+    "overlay_runner": OverlayRunner,
+    "union_runner": UnionRunner,
+}
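A note on the pattern above: `config` is no longer a `Config` wrapper object but a plain dict, and a value may be either a finished object or a callable (a class or factory); `_get_instance` instantiates callables on demand with whatever keyword arguments the call site supplies. A minimal sketch of the pattern; `DummyRunner` is a hypothetical stand-in for the real runner classes:

    from typing import Any


    def _get_instance(data: dict, key: str, **kwargs) -> Any:
        """Return data[key], instantiating it first if it is callable (body as in the new conf.py)."""
        x = data[key]
        if callable(x):
            return x(**kwargs)
        return x


    class DummyRunner:  # hypothetical stand-in for e.g. RTreeQueryRunner
        def __init__(self, n_jobs: int | None = None) -> None:
            self.n_jobs = n_jobs


    config = {"n_jobs": 1, "runner": DummyRunner}

    runner = _get_instance(config, "runner", n_jobs=4)  # a fresh DummyRunner(n_jobs=4)
    n_jobs = _get_instance(config, "n_jobs")            # not callable, returned unchanged

The call sites changed throughout the modules below all follow this shape: `_get_instance(config, "<runner key>", n_jobs=n_jobs)`.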
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/buffer_dissolve_explode.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/buffer_dissolve_explode.py
@@ -23,6 +23,7 @@ from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from shapely import get_num_geometries
 
+from ..conf import _get_instance
 from ..conf import config
 from ..parallel.parallel import Parallel
 from .geometry_types import make_all_singlepart
@@ -197,7 +198,7 @@ def _dissolve(
         return gdf
 
     if union_runner is None:
-        union_runner = config
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
 
     geom_col = gdf.geometry.name
     by = dissolve_kwargs.pop("by", None)
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/duplicates.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/duplicates.py
@@ -8,6 +8,7 @@ from shapely import STRtree
 from shapely import difference
 from shapely import simplify
 
+from ..conf import _get_instance
 from ..conf import config
 from .general import _determine_geom_type_args
 from .general import _push_geom_col
@@ -106,11 +107,11 @@ def update_geometries(
         return gdf
 
     if rtree_runner is None:
-        rtree_runner = config
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
     if union_runner is None:
-        union_runner = config
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
     if overlay_runner is None:
-        overlay_runner = config
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     if geom_type == "polygon" or get_geom_type(gdf) == "polygon":
         gdf.geometry = gdf.buffer(0)
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/general.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/general.py
@@ -21,7 +21,6 @@ from shapely import get_coordinates
 from shapely import get_parts
 from shapely import linestrings
 from shapely import make_valid
-from shapely import points as shapely_points
 from shapely.geometry import LineString
 from shapely.geometry import MultiPoint
 from shapely.geometry import Point
@@ -34,6 +33,7 @@ from .geometry_types import get_geom_type
 from .geometry_types import make_all_singlepart
 from .geometry_types import to_single_geom_type
 from .neighbors import get_k_nearest_neighbors
+from .sfilter import sfilter
 from .sfilter import sfilter_split
 
 
@@ -416,7 +416,7 @@ def random_points(n: int, loc: float | int = 0.5) -> GeoDataFrame:
     """Creates a GeoDataFrame with n random points.
 
     Args:
-        n: Number of points
+        n: Number of points to create.
         loc: Mean ('centre') of the distribution.
 
     Returns:
@@ -459,47 +459,78 @@ def random_points(n: int, loc: float | int = 0.5) -> GeoDataFrame:
     9999  POINT (134.503 168.155)
     [10000 rows x 1 columns]
     """
-    if isinstance(n, (str, float)):
-        n = int(n)
-
     x = np.random.rand(n) * float(loc) * 2
     y = np.random.rand(n) * float(loc) * 2
+    return GeoDataFrame(shapely.points(x, y=y), columns=["geometry"])
 
-    return GeoDataFrame(
-        (Point(x, y) for x, y in zip(x, y, strict=True)), columns=["geometry"]
-    )
 
+def random_points_norway(size: int, *, seed: int | None = None) -> GeoDataFrame:
+    """Creates a GeoDataFrame with crs=25833 n random points aprox. within the borders of mainland Norway.
+
+    Args:
+        size: Number of points to create.
+        seed: Optional random seed.
 
-
+    Returns:
+        A GeoDataFrame of points with n rows.
+    """
+    return random_points_in_polygons(
+        [
+            shapely.wkt.loads(x)
+            for x in [
+                "POLYGON ((546192 7586393, 546191 7586393, 526598 7592425, 526597 7592425, 526596 7592425, 526595 7592426, 526594 7592426, 525831 7593004, 525830 7593005, 525327 7593495, 525326 7593496, 525326 7593497, 525325 7593498, 525325 7593499, 525324 7593500, 525192 7594183, 525192 7594184, 524157 7606517, 524157 7606518, 524157 7606519, 524157 7606520, 524157 7606521, 526235 7613535, 526236 7613536, 559423 7676952, 559424 7676953, 559511 7677088, 579978 7708379, 636963 7792940, 636963 7792941, 636964 7792942, 636965 7792943, 641013 7795664, 823514 7912323, 823515 7912323, 823516 7912323, 882519 7931958, 882520 7931959, 882521 7931959, 953896 7939985, 953897 7939985, 973544 7939988, 973545 7939988, 973546 7939988, 975510 7939467, 1051029 7913762, 1051030 7913762, 1055067 7912225, 1055068 7912224, 1056725 7911491, 1098379 7890321, 1098380 7890320, 1098381 7890320, 1099197 7889670, 1099198 7889669, 1099442 7889429, 1099443 7889429, 1099444 7889428, 1099444 7889427, 1099445 7889426, 1099445 7889425, 1099445 7889424, 1099446 7889423, 1114954 7799458, 1115106 7797736, 1115106 7797735, 1115106 7797734, 1115106 7797733, 1115106 7797732, 1115105 7797731, 1115105 7797730, 1114774 7797199, 1112876 7794451, 1057595 7720320, 1057112 7719702, 1057112 7719701, 1057111 7719701, 1057110 7719700, 1057109 7719699, 902599 7637176, 902598 7637176, 902597 7637175, 902596 7637175, 702394 7590633, 702393 7590633, 702392 7590633, 546193 7586393, 546192 7586393))",
+                "POLYGON ((60672 6448410, 60671 6448411, 57185 6448783, 39229 6451077, 39228 6451077, 39227 6451077, 27839 6454916, 27838 6454916, 27808 6454929, 27807 6454929, 8939 6465625, 8938 6465626, 7449 6466699, 7448 6466700, 6876 6467215, 6876 6467216, -31966 6512038, -31968 6512040, -32554 6512779, -32554 6512780, -40259 6524877, -42041 6527698, -42217 6528008, -42546 6528677, -42547 6528678, -77251 6614452, -77252 6614453, -77252 6614454, -77252 6614455, -77252 6614456, -77206 6615751, -77206 6615752, -65669 6811422, -65669 6811423, -65608 6812139, -65608 6812140, -65608 6812141, -50907 6879624, -50907 6879625, -50907 6879626, -50906 6879627, -50889 6879658, -50889 6879659, -16217 6934790, -16217 6934791, -16216 6934792, -2958 6949589, -2957 6949590, 55128 6995098, 144915 7064393, 144915 7064394, 144916 7064395, 144958 7064418, 144959 7064418, 144960 7064418, 144961 7064419, 144962 7064419, 144963 7064419, 150493 7064408, 150494 7064408, 150495 7064408, 150770 7064370, 150771 7064370, 150772 7064370, 188559 7048106, 188560 7048105, 188664 7048054, 188665 7048054, 188666 7048053, 357806 6914084, 357807 6914083, 357808 6914082, 357809 6914081, 357809 6914080, 357810 6914079, 357810 6914078, 359829 6906908, 386160 6804356, 386160 6804355, 386160 6804354, 386160 6804353, 386160 6804352, 386160 6804351, 368140 6699014, 368140 6699013, 363725 6675483, 363725 6675482, 361041 6665071, 361040 6665070, 361040 6665069, 308721 6537573, 308720 6537572, 307187 6534433, 307187 6534432, 307186 6534431, 307185 6534430, 307184 6534429, 307183 6534429, 307182 6534428, 303562 6532881, 300420 6531558, 99437 6459510, 99436 6459510, 67654 6449332, 65417 6448682, 65416 6448682, 65415 6448682, 60673 6448410, 60672 6448410))",
+                "POLYGON ((219870 6914350, 219869 6914350, 219868 6914351, 219867 6914351, 194827 6928565, 194826 6928566, 193100 6929790, 193099 6929790, 193098 6929791, 193098 6929792, 193097 6929793, 157353 7006877, 157353 7006878, 154402 7017846, 154402 7017847, 154392 7017923, 154392 7017924, 154392 7017925, 154392 7017926, 166616 7077346, 166617 7077347, 169164 7087256, 169165 7087257, 170277 7089848, 173146 7096147, 173147 7096148, 174684 7098179, 174685 7098180, 314514 7253805, 314515 7253805, 314515 7253806, 314516 7253806, 314517 7253807, 314518 7253807, 314519 7253808, 314520 7253808, 314521 7253808, 314522 7253808, 314523 7253808, 314524 7253808, 332374.8847495829 7250200.016409928, 327615 7280207, 327615 7280208, 327615 7280209, 327615 7280210, 328471 7285637, 364549 7480637, 364549 7480638, 367030 7488919, 367030 7488920, 367045 7488948, 367045 7488949, 367046 7488950, 419493 7560257, 472291 7626092, 506326 7665544, 506327 7665545, 506328 7665546, 541847 7692387, 541848 7692388, 541849 7692388, 541850 7692389, 541851 7692389, 541852 7692389, 545852 7692619, 546265 7692617, 546266 7692617, 546267 7692617, 546268 7692617, 546269 7692616, 546270 7692616, 546270 7692615, 546271 7692615, 546272 7692614, 623027 7613734, 623028 7613733, 623029 7613732, 627609 7605928, 627610 7605928, 627610 7605927, 627610 7605926, 627611 7605925, 627611 7605924, 630573 7568363, 630573 7568362, 630573 7568361, 630573 7568360, 630573 7568359, 628567 7562381, 621356 7542293, 621356 7542292, 468368 7221876.188770507, 468368 7221876, 459071 7119021, 459071 7119020, 459071 7119019, 459070 7119018, 459070 7119017, 454728 7109371, 451784 7102984, 449525 7098307, 357809 6914071, 357808 6914070, 357808 6914069, 357807 6914068, 357806 6914068, 357806 6914067, 357805 6914067, 357804 6914066, 353158 6912240, 353157 6912239, 353156 6912239, 351669 6911974, 351668 6911974, 351667 6911974, 219871 6914350, 219870 6914350))",
+            ]
+        ],
+        size=size,
+        crs=25833,
+        seed=seed,
+    ).sample(size)
+
+
+def random_points_in_polygons(
+    polygons: Geometry | GeoDataFrame | GeoSeries,
+    size: int,
+    *,
+    seed: int | None = None,
+    crs: Any = 25833,
+) -> GeoDataFrame:
     """Creates a GeoDataFrame with n random points within the geometries of 'gdf'.
 
     Args:
-
-
-        seed: Optional random
+        polygons: A GeoDataFrame or GeoSeries of polygons. Or a single polygon.
+        size: Number of points to create.
+        seed: Optional random seed.
+        crs: Optional crs of the output GeoDataFrame if input is shapely.Geometry.
 
     Returns:
         A GeoDataFrame of points with n rows.
     """
-
-
+    if crs is None:
+        try:
+            crs = polygons.crs
+        except AttributeError:
+            pass
     rng = np.random.default_rng(seed)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    polygons = to_gdf(polygons, crs=crs).geometry
+    bounds = polygons.bounds
+    minx = np.repeat(bounds["minx"].values, size)
+    maxx = np.repeat(bounds["maxx"].values, size)
+    miny = np.repeat(bounds["miny"].values, size)
+    maxy = np.repeat(bounds["maxy"].values, size)
+    index = np.repeat(np.arange(len(polygons)), size)
+    length = len(index)
+    out = []
+    while sum(len(df) for df in out) < size * len(polygons):
+        xs = rng.uniform(low=minx, high=maxx, size=length)
+        ys = rng.uniform(low=miny, high=maxy, size=length)
+        out.append(
+            GeoDataFrame(
+                shapely.points(xs, y=ys), index=index, columns=["geometry"], crs=crs
+            ).pipe(sfilter, polygons)
+        )
+    return pd.concat(out).groupby(level=0).sample(size, replace=True).sort_index()
 
 
 def polygons_to_lines(
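The new `random_points_in_polygons` is a vectorized rejection sampler: it draws uniform coordinates inside each polygon's bounding box (`size` candidates per polygon per round), keeps the draws that actually land inside a polygon via `sfilter`, and loops until enough points have accumulated; `groupby(level=0).sample(size, replace=True)` then returns exactly `size` points per polygon, with `replace=True` covering polygons whose own hit count is still below `size`. A self-contained, single-polygon version of the same idea, using only numpy and shapely:

    import numpy as np
    import shapely
    from shapely.geometry import Polygon


    def random_points_in_polygon(polygon: Polygon, size: int, seed: int | None = None):
        """Rejection sampling: draw in the bbox, keep what lands inside the polygon."""
        rng = np.random.default_rng(seed)
        minx, miny, maxx, maxy = polygon.bounds
        kept: list = []
        while len(kept) < size:
            xs = rng.uniform(minx, maxx, size)
            ys = rng.uniform(miny, maxy, size)
            candidates = shapely.points(xs, ys)
            # keep only the candidates contained in the polygon
            kept.extend(candidates[shapely.contains(polygon, candidates)])
        return kept[:size]


    triangle = Polygon([(0, 0), (1, 0), (0, 1)])
    points = random_points_in_polygon(triangle, 10, seed=42)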
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/neighbors.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/neighbors.py
@@ -17,8 +17,11 @@ from pandas import MultiIndex
 from pandas import Series
 from sklearn.neighbors import NearestNeighbors
 
+from ..conf import _get_instance
+from ..conf import config
 from .conversion import coordinate_array
 from .geometry_types import get_geom_type
+from .runners import RTreeQueryRunner
 
 
 def get_neighbor_indices(
@@ -26,6 +29,8 @@ def get_neighbor_indices(
     neighbors: GeoDataFrame | GeoSeries,
     max_distance: int = 0,
     predicate: str = "intersects",
+    rtree_runner: RTreeQueryRunner | None = None,
+    n_jobs: int | None = None,
 ) -> Series:
     """Creates a pandas Series with the index of 'gdf' and values of 'neighbors'.
 
@@ -41,6 +46,9 @@ def get_neighbor_indices(
         predicate: Spatial predicate to use in sjoin. Defaults to "intersects", meaning
             the geometry itself and geometries within will be considered neighbors if
            they are part of the 'neighbors' GeoDataFrame.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
+        n_jobs: Number of workers.
 
     Returns:
         A pandas Series with values of the intersecting 'neighbors' indices.
@@ -103,6 +111,9 @@ def get_neighbor_indices(
     if gdf.crs != neighbors.crs:
         raise ValueError(f"'crs' mismatch. Got {gdf.crs} and {neighbors.crs}")
 
+    if rtree_runner is None:
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
+
     if isinstance(neighbors, GeoSeries):
         neighbors = neighbors.to_frame()
     else:
@@ -119,11 +130,21 @@ def get_neighbor_indices(
 
     if predicate == "nearest":
         max_distance = None if max_distance == 0 else max_distance
-
+        left, right = rtree_runner.run(
+            gdf.geometry.values,
+            neighbors.geometry.values,
+            method="query_nearest",
+            max_distance=max_distance,
+        )
     else:
-
-
-
+        left, right = rtree_runner.run(
+            gdf.geometry.values, neighbors.geometry.values, predicate=predicate
+        )
+    index_mapper1 = {i: x for i, x in enumerate(gdf.index)}
+    left = np.array([index_mapper1[i] for i in left])
+    index_mapper2 = {i: x for i, x in enumerate(neighbors.index)}
+    right = np.array([index_mapper2[i] for i in right])
+    return Series(right, index=left, name="neighbor_index")
 
 
 def get_neighbor_dfs(
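In the rewritten body above, `rtree_runner.run` returns positional indices into the two geometry arrays, and the dict-based remapping converts those positions back into the original pandas index labels. The same remapping shown against a bare `shapely.STRtree`, which is what the default runner wraps:

    import numpy as np
    import shapely
    from shapely import STRtree

    geoms = shapely.points([0, 1, 2], [0, 0, 0])
    labels = np.array(["a", "b", "c"])  # stand-in for a GeoDataFrame index

    tree = STRtree(geoms)
    left, right = tree.query(geoms, predicate="intersects")  # positional indices

    mapper = {i: x for i, x in enumerate(labels)}
    right_labels = np.array([mapper[i] for i in right])  # labels, as in the returned Series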
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/overlay.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/overlay.py
@@ -23,6 +23,7 @@ from shapely import is_empty
 from shapely import make_valid
 from shapely import union_all
 
+from ..conf import _get_instance
 from ..conf import config
 from .general import _determine_geom_type_args
 from .general import clean_geoms
@@ -109,11 +110,11 @@ def clean_overlay(
         raise ValueError(f"'crs' mismatch. Got {df1.crs} and {df2.crs}")
 
     if rtree_runner is None:
-        rtree_runner = config
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
     if union_runner is None:
-        union_runner = config
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
     if overlay_runner is None:
-        overlay_runner = config
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     crs = df1.crs
 
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/polygon_operations.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/polygon_operations.py
@@ -26,6 +26,7 @@ from shapely.errors import GEOSException
 from shapely.geometry import LinearRing
 from shapely.ops import SplitOp
 
+from ..conf import _get_instance
 from ..conf import config
 from ..debug_config import _DEBUG_CONFIG
 from ..debug_config import _try_debug_print
@@ -776,9 +777,9 @@ def _eliminate(
         return gdf
 
     if union_runner is None:
-        union_runner = config
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
     if overlay_runner is None:
-        overlay_runner = config
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     gdf["_range_idx_elim"] = range(len(gdf))
 
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/runners.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/runners.py
@@ -10,6 +10,7 @@ import numpy as np
 import pandas as pd
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
+from shapely import Geometry
 from shapely import STRtree
 from shapely import get_parts
 from shapely import make_valid
@@ -38,6 +39,12 @@ class AbstractRunner(ABC):
     def run(self, *args, **kwargs) -> Any:
         """Abstract run method."""
 
+    def __str__(self) -> str:
+        """String representation."""
+        return (
+            f"{self.__class__.__name__}(n_jobs={self.n_jobs}, backend='{self.backend}')"
+        )
+
 
 @dataclass
 class UnionRunner(AbstractRunner):
@@ -68,10 +75,10 @@ class UnionRunner(AbstractRunner):
         """Run groupby on geometries in parallel (if n_jobs > 1)."""
         # assume geometry column is 'geometry' if input is pandas.Series og pandas.DataFrame
         try:
-            geom_col = df.geometry.name
+            geom_col: str = df.geometry.name
         except AttributeError:
             try:
-                geom_col = df.name
+                geom_col: str | None = df.name
                 if geom_col is None:
                     geom_col = "geometry"
             except AttributeError:
@@ -90,10 +97,10 @@ class UnionRunner(AbstractRunner):
         by = np.zeros(len(df), dtype="int64")
 
         try:
-            # DataFrame
+            # (Geo)DataFrame
            groupby_obj = df.groupby(by, **kwargs)[geom_col]
         except KeyError:
-            # Series
+            # (Geo)Series
             groupby_obj = df.groupby(by, **kwargs)
 
         if self.n_jobs is None or self.n_jobs == 1:
@@ -113,9 +120,24 @@
         return agged
 
 
-def _strtree_query(
+def _strtree_query(
+    arr1: np.ndarray,
+    arr2: np.ndarray,
+    method: str,
+    indices1: np.ndarray | None = None,
+    indices2: np.ndarray | None = None,
+    **kwargs,
+):
     tree = STRtree(arr2)
-
+    func = getattr(tree, method)
+    left, right = func(arr1, **kwargs)
+    if indices1 is not None:
+        index_mapper1 = {i: x for i, x in enumerate(indices1)}
+        left = np.array([index_mapper1[i] for i in left])
+    if indices2 is not None:
+        index_mapper2 = {i: x for i, x in enumerate(indices2)}
+        right = np.array([index_mapper2[i] for i in right])
+    return left, right
 
 
 @dataclass
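`_strtree_query` now receives the tree method as a string and dispatches with `getattr`, so a single helper serves both `STRtree.query` (predicate queries) and `STRtree.query_nearest` (the "nearest" path in neighbors.py); with array input, both return a pair of positional index arrays, so the `left, right` unpacking works unchanged. The dispatch in isolation:

    import shapely
    from shapely import STRtree

    tree = STRtree(shapely.points([0, 1, 2], [0, 0, 0]))
    queries = shapely.points([0.1, 1.9], [0, 0])

    # getattr-based dispatch, as in _strtree_query:
    left, right = getattr(tree, "query_nearest")(queries)
    # equivalent to: tree.query_nearest(queries)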
@@ -138,39 +160,52 @@ class RTreeQueryRunner(AbstractRunner):
     backend: str = "loky"
 
     def run(
-        self, arr1: np.ndarray, arr2: np.ndarray, **kwargs
+        self, arr1: np.ndarray, arr2: np.ndarray, method: str = "query", **kwargs
     ) -> tuple[np.ndarray, np.ndarray]:
         """Run a spatial rtree query and return indices of hits from arr1 and arr2 in a tuple of two arrays."""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if (
+            (self.n_jobs or 1) > 1
+            and len(arr1) / self.n_jobs > 10_000
+            and len(arr1) / len(arr2)
+        ):
+            chunks = np.array_split(np.arange(len(arr1)), self.n_jobs)
+            assert sum(len(x) for x in chunks) == len(arr1)
+            with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+                results = parallel(
+                    joblib.delayed(_strtree_query)(
+                        arr1[chunk],
+                        arr2,
+                        method=method,
+                        indices1=chunk,
+                        **kwargs,
+                    )
+                    for chunk in chunks
+                )
+            left = np.concatenate([x[0] for x in results])
+            right = np.concatenate([x[1] for x in results])
+            return left, right
+        elif (
+            (self.n_jobs or 1) > 1
+            and len(arr2) / self.n_jobs > 10_000
+            and len(arr2) / len(arr1)
+        ):
+            chunks = np.array_split(np.arange(len(arr2)), self.n_jobs)
+            with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+                results = parallel(
+                    joblib.delayed(_strtree_query)(
+                        arr1,
+                        arr2[chunk],
+                        method=method,
+                        indices2=chunk,
+                        **kwargs,
+                    )
+                    for chunk in chunks
+                )
+            left = np.concatenate([x[0] for x in results])
+            right = np.concatenate([x[1] for x in results])
+            return left, right
+
+        return _strtree_query(arr1, arr2, method=method, **kwargs)
 
 
 @dataclass
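The new `run` body parallelizes only when it should pay off: the larger of the two arrays is split into `n_jobs` chunks (and only if a worker would get more than 10,000 geometries), each chunk is queried against the full other array in its own joblib task, and the chunk positions travel along as `indices1`/`indices2` so the concatenated result still refers to positions in the full arrays. A reduced sketch of the chunked query, with the position remapping inlined instead of going through `_strtree_query`:

    import joblib
    import numpy as np
    import shapely
    from shapely import STRtree


    def query_chunk(chunk_geoms, tree_geoms, chunk_positions):
        tree = STRtree(tree_geoms)
        left, right = tree.query(chunk_geoms)
        # remap chunk-local positions back to positions in the full array
        return chunk_positions[left], right


    arr1 = shapely.points(np.arange(100_000), np.zeros(100_000))
    arr2 = shapely.points(np.arange(100), np.zeros(100))

    n_jobs = 4
    chunks = np.array_split(np.arange(len(arr1)), n_jobs)
    with joblib.Parallel(n_jobs, backend="loky") as parallel:
        results = parallel(
            joblib.delayed(query_chunk)(arr1[chunk], arr2, chunk) for chunk in chunks
        )
    left = np.concatenate([r[0] for r in results])
    right = np.concatenate([r[1] for r in results])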
@@ -189,8 +224,8 @@ class OverlayRunner(AbstractRunner):
     n_jobs: None = None
     backend: None = None
 
-    @staticmethod
     def run(
+        self,
         func: Callable,
         arr1: np.ndarray,
         arr2: np.ndarray,
@@ -219,7 +254,7 @@ class GridSizeOverlayRunner(OverlayRunner):
 
     n_jobs: int
     backend: str | None
-    grid_sizes: list[float] | None = None
+    grid_sizes: list[float | int] | None = None
 
     def __post_init__(self) -> None:
         """Check that grid_sizes is passed."""
@@ -247,7 +282,9 @@ class GridSizeOverlayRunner(OverlayRunner):
 
         """
         kwargs = dict(
-            grid_size=grid_size,
+            grid_size=grid_size,
+            geom_type=geom_type.lower() if geom_type is not None else None,
+            grid_sizes=self.grid_sizes,
         )
         with joblib.Parallel(self.n_jobs, backend="threading") as parallel:
             return parallel(
@@ -256,15 +293,27 @@ class GridSizeOverlayRunner(OverlayRunner):
             )
 
 
-def
+def _fix_gemetry_fast(geom: Geometry, geom_type: str | None) -> Geometry:
+    geom = make_valid(geom)
+    if geom.geom_type == geom_type or geom_type is None:
+        return geom
+    return union_all([g for g in get_parts(geom) if geom_type in g.geom_type])
+
+
+def _run_overlay_rowwise(
+    func: Callable,
+    geom1: Geometry,
+    geom2: Geometry,
+    grid_size: float | int | None,
+    geom_type: str | None,
+    grid_sizes: list[float | int],
+) -> Geometry:
     try:
         return func(geom1, geom2, grid_size=grid_size)
     except GEOSException:
         pass
-    geom1 =
-    geom2 =
-    geom1 = union_all([g for g in geom1 if pd.notna(g) and geom_type in g.geom_type])
-    geom2 = union_all([g for g in geom2 if pd.notna(g) and geom_type in g.geom_type])
+    geom1 = _fix_gemetry_fast(geom1, geom_type)
+    geom2 = _fix_gemetry_fast(geom2, geom_type)
     try:
         return func(geom1, geom2)
     except GEOSException:
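`_run_overlay_rowwise` is a staged retry around a GEOS overlay call: try with the requested `grid_size`; if GEOS raises, repair both inputs with `make_valid` and reduce them to parts of the wanted geometry type (`_fix_gemetry_fast`), then try again (the `grid_sizes` parameter suggests further fallbacks outside the shown lines). The core retry pattern, reduced to a minimal standalone function:

    import shapely
    from shapely import make_valid
    from shapely.errors import GEOSException


    def safe_intersection(geom1, geom2, grid_size=None):
        """Try the overlay as-is; on GEOSException, make the inputs valid and retry."""
        try:
            return shapely.intersection(geom1, geom2, grid_size=grid_size)
        except GEOSException:
            pass
        return shapely.intersection(make_valid(geom1), make_valid(geom2))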
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/sfilter.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/sfilter.py
@@ -6,6 +6,8 @@ from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from shapely import Geometry
 
+from ..conf import _get_instance
+from ..conf import config
 from .conversion import to_gdf
 from .runners import RTreeQueryRunner
 
@@ -17,7 +19,7 @@ def sfilter(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
-    n_jobs: int =
+    n_jobs: int | None = None,
     rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame:
     """Filter a GeoDataFrame or GeoSeries by spatial predicate.
@@ -296,7 +298,7 @@ def _get_sfilter_indices(
     original_predicate = predicate
 
     if rtree_runner is None:
-        rtree_runner =
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
 
     with warnings.catch_warnings():
         # We don't need to show our own warning here
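With the widened signature, callers can hand `n_jobs` directly to `sfilter`, and the default `RTreeQueryRunner` is built from the global config. A short usage sketch, assuming `sfilter` and `random_points` are exported at the package top level as the `__init__.py` imports suggest:

    import sgis as sg

    points = sg.random_points(1_000)
    areas = points.buffer(0.1).iloc[:10]

    inside = sg.sfilter(points, areas, predicate="intersects", n_jobs=1)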
--- ssb_sgis-1.2.0/src/sgis/io/dapla_functions.py
+++ ssb_sgis-1.2.2/src/sgis/io/dapla_functions.py
@@ -110,7 +110,8 @@ def read_geopandas(
     try:
         expression = "".join(next(iter(filters))).replace("==", "=")
         glob_func = _get_glob_func(file_system)
-
+        suffix: str = Path(gcs_path).suffix
+        paths = glob_func(str(Path(gcs_path) / expression / f"*{suffix}"))
         if paths:
             return _read_geopandas_from_iterable(
                 paths,
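The two added lines turn the first equality filter into a hive-style partition glob: joining a pyarrow-style filter tuple and replacing `==` with `=` yields e.g. `year=2024`, which is appended to the dataset root together with the dataset's own file suffix. The string handling in isolation, with a hypothetical path and filter:

    from pathlib import Path

    gcs_path = "bucket/data.parquet"      # hypothetical partitioned dataset root
    filters = [("year", "==", "2024")]    # hypothetical pyarrow-style filter

    expression = "".join(next(iter(filters))).replace("==", "=")  # 'year=2024'
    suffix = Path(gcs_path).suffix                                # '.parquet'
    pattern = str(Path(gcs_path) / expression / f"*{suffix}")
    # 'bucket/data.parquet/year=2024/*.parquet'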
@@ -128,11 +129,11 @@ def read_geopandas(
         return gpd.GeoDataFrame(
             _read_partitioned_parquet(
                 gcs_path,
-                read_func=pq.read_table,
                 file_system=file_system,
                 mask=mask,
                 filters=filters,
                 child_paths=child_paths,
+                use_threads=use_threads,
                 **kwargs,
             )
         )
@@ -145,7 +146,7 @@ def read_geopandas(
     read_func = gpd.read_file
 
     with file_system.open(gcs_path, mode="rb") as file:
-        return
+        return _read_geopandas_single_path(
             file,
             read_func=read_func,
             file_format=file_format,
@@ -163,18 +164,10 @@ def _read_geopandas_from_iterable(
         paths = list(paths.index)
     elif mask is None:
         paths = list(paths)
-
-
-
-
-            file_system,
-            use_threads=use_threads,
-            pandas_fallback=pandas_fallback,
-        )
-    else:
-        bounds_series = paths
-        new_bounds_series = sfilter(bounds_series, mask)
-        if not len(new_bounds_series):
+    elif isinstance(paths, GeoSeries):
+        bounds_series = sfilter(paths, mask)
+        if not len(bounds_series):
+            # return GeoDataFrame with correct columns
             if isinstance(kwargs.get("columns"), Iterable):
                 cols = {col: [] for col in kwargs["columns"]}
             else:
@@ -186,29 +179,14 @@ def _read_geopandas_from_iterable(
             if file_system.isfile(path):
                 raise ArrowInvalid(e, path) from e
             return GeoDataFrame(cols | {"geometry": []})
-        paths = list(
+        paths = list(bounds_series.index)
 
-
-
-        min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+    results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+        paths, file_system=file_system, mask=mask, use_threads=use_threads, **kwargs
     )
-
-    dfs: list[GeoDataFrame] = parallel(
-        joblib.delayed(read_geopandas)(
-            x,
-            file_system=file_system,
-            pandas_fallback=pandas_fallback,
-            mask=mask,
-            use_threads=use_threads,
-            **kwargs,
-        )
-        for x in paths
-    )
-
-    if dfs:
-        df = pd.concat(dfs, ignore_index=True)
+    if results:
         try:
-
+            return _concat_pyarrow_to_geopandas(results, paths, file_system)
         except Exception as e:
             if not pandas_fallback:
                 print(e)
@@ -219,6 +197,49 @@ def _read_geopandas_from_iterable(
     return df
 
 
+def _read_pyarrow_with_treads(
+    paths: list[str | Path | os.PathLike], file_system, use_threads, mask, **kwargs
+) -> list[pyarrow.Table]:
+    read_partial = functools.partial(
+        _read_pyarrow, mask=mask, file_system=file_system, **kwargs
+    )
+    if not use_threads:
+        return [x for x in map(read_partial, paths) if x is not None]
+    with ThreadPoolExecutor() as executor:
+        return [x for x in executor.map(read_partial, paths) if x is not None]
+
+
+def intersects(file, mask, file_system) -> bool:
+    bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+    return shapely.box(*bbox).intersects(to_shapely(mask))
+
+
+def _read_pyarrow(path: str, file_system, mask=None, **kwargs) -> pyarrow.Table | None:
+    try:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask, file_system):
+                return
+
+            # 'get' instead of 'pop' because dict is mutable
+            schema = kwargs.get("schema", pq.read_schema(file))
+            new_kwargs = {
+                key: value for key, value in kwargs.items() if key != "schema"
+            }
+
+            return pq.read_table(file, schema=schema, **new_kwargs)
+    except ArrowInvalid as e:
+        glob_func = _get_glob_func(file_system)
+        if not len(
+            {
+                x
+                for x in glob_func(str(Path(path) / "**"))
+                if not paths_are_equal(path, x)
+            }
+        ):
+            raise e
+        # allow not being able to read empty directories that are hard to delete in gcs
+
+
 def _get_bounds_parquet(
     path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
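`_read_pyarrow_with_treads` binds the shared arguments with `functools.partial` and fans the paths out over a `ThreadPoolExecutor` (threads suffice since the work is I/O-bound), dropping the `None` results that `_read_pyarrow` returns for files whose stored bounds miss the mask. The same skeleton with a hypothetical `read_one` in place of `_read_pyarrow`:

    import functools
    from concurrent.futures import ThreadPoolExecutor

    import pyarrow.parquet as pq


    def read_one(path, columns=None):
        """Read one parquet file; a real reader may return None to filter it out."""
        return pq.read_table(path, columns=columns)


    def read_many(paths, use_threads=True, **kwargs):
        read_partial = functools.partial(read_one, **kwargs)
        if not use_threads:
            return [t for t in map(read_partial, paths) if t is not None]
        with ThreadPoolExecutor() as executor:
            return [t for t in executor.map(read_partial, paths) if t is not None]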
@@ -662,10 +683,10 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
     return bool(len(table))
 
 
-def
+def _read_geopandas_single_path(
     file,
-    read_func: Callable
-    file_format: str
+    read_func: Callable,
+    file_format: str,
     **kwargs,
 ):
     try:
@@ -681,32 +702,29 @@ def _read_geopandas(
         raise e.__class__(f"{e.__class__.__name__}: {e} for {file}.") from e
 
 
-def _read_pandas(gcs_path: str, **kwargs):
+def _read_pandas(gcs_path: str, use_threads: bool = True, **kwargs):
     file_system = _get_file_system(None, kwargs)
 
     if not isinstance(gcs_path, (str | Path | os.PathLike)):
-
-
-
-
-
+        results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+            gcs_path,
+            file_system=file_system,
+            mask=None,
+            use_threads=use_threads,
+            **kwargs,
         )
-
-
-        parallel(
-            joblib.delayed(_read_pandas)(x, file_system=file_system, **kwargs)
-            for x in gcs_path
-        )
-    )
+        results = pyarrow.concat_tables(results, promote_options="permissive")
+        return results.to_pandas()
 
     child_paths = get_child_paths(gcs_path, file_system)
     if child_paths:
         return _read_partitioned_parquet(
             gcs_path,
-            read_func=pd.read_parquet,
             file_system=file_system,
             mask=None,
             child_paths=child_paths,
+            use_threads=use_threads,
+            to_geopandas=False,
             **kwargs,
         )
 
@@ -716,11 +734,12 @@ def _read_pandas(gcs_path: str, **kwargs):
 
 def _read_partitioned_parquet(
     path: str,
-    read_func: Callable,
     filters=None,
     file_system=None,
     mask=None,
     child_paths: list[str] | None = None,
+    use_threads: bool = True,
+    to_geopandas: bool = True,
     **kwargs,
 ):
     file_system = _get_file_system(file_system, kwargs)
@@ -731,60 +750,22 @@ def _read_partitioned_parquet(
 
     filters = _filters_to_expression(filters)
 
-
-
-
-
-
-
-
-
-
-
-
-            schema = kwargs.get("schema", pq.read_schema(file))
-            new_kwargs = {
-                key: value for key, value in kwargs.items() if key != "schema"
-            }
-
-            return read_func(file, schema=schema, filters=filters, **new_kwargs)
-        except ArrowInvalid as e:
-            if not len(
-                {
-                    x
-                    for x in glob_func(str(Path(child_path) / "**"))
-                    if not paths_are_equal(child_path, x)
-                }
-            ):
-                raise e
-            # allow not being able to read hard-to-delete empty directories
+    results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+        (
+            path
+            for path in child_paths
+            if filters is None or expression_match_path(filters, path)
+        ),
+        file_system=file_system,
+        mask=mask,
+        use_threads=use_threads,
+        **kwargs,
+    )
 
-
-    results
-
-
-            executor.map(
-                read,
-                (
-                    path
-                    for path in child_paths
-                    if filters is None or expression_match_path(filters, path)
-                ),
-            )
-        )
-        if df is not None
-    ]
-
-    if results:
-        if all(isinstance(x, DataFrame) for x in results):
-            return pd.concat(results)
-        else:
-            results = pyarrow.concat_tables(
-                results,
-                promote_options="permissive",
-            )
-            geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
-            return _arrow_to_geopandas(results, geo_metadata)
+    if results and to_geopandas:
+        return _concat_pyarrow_to_geopandas(results, child_paths, file_system)
+    elif results:
+        return pyarrow.concat_tables(results, promote_options="permissive").to_pandas()
 
     # add columns to empty DataFrame
     first_path = next(iter(child_paths + [path]))
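The rewritten function prunes before reading: child paths that fail the pyarrow filter expression are skipped via `expression_match_path`, the survivors go through the threaded `_read_pyarrow_with_treads` (which additionally bbox-filters against the mask), and the tables are merged with `promote_options="permissive"` so partitions whose schemas drifted apart can still be concatenated. What permissive promotion does, in isolation:

    import pyarrow as pa

    t1 = pa.table({"a": [1, 2]})
    t2 = pa.table({"a": [3], "b": ["x"]})  # one partition has an extra column

    # missing columns are filled with nulls instead of raising
    # (promote_options requires a recent pyarrow)
    combined = pa.concat_tables([t1, t2], promote_options="permissive")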
@@ -794,6 +775,17 @@ def _read_partitioned_parquet(
     return df
 
 
+def _concat_pyarrow_to_geopandas(
+    results: list[pyarrow.Table], paths: list[str], file_system: Any
+):
+    results = pyarrow.concat_tables(
+        results,
+        promote_options="permissive",
+    )
+    geo_metadata = _get_geo_metadata(next(iter(paths)), file_system)
+    return _arrow_to_geopandas(results, geo_metadata)
+
+
 def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
     return Path(path1).parts == Path(path2).parts
 