ssb-sgis 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +4 -0
- sgis/conf.py +56 -4
- sgis/geopandas_tools/buffer_dissolve_explode.py +24 -47
- sgis/geopandas_tools/conversion.py +18 -25
- sgis/geopandas_tools/duplicates.py +47 -60
- sgis/geopandas_tools/general.py +8 -84
- sgis/geopandas_tools/overlay.py +190 -260
- sgis/geopandas_tools/polygon_operations.py +67 -88
- sgis/geopandas_tools/runners.py +277 -0
- sgis/geopandas_tools/sfilter.py +40 -24
- sgis/geopandas_tools/utils.py +37 -0
- sgis/helpers.py +1 -1
- sgis/io/dapla_functions.py +5 -7
- sgis/maps/map.py +3 -1
- sgis/parallel/parallel.py +32 -24
- sgis/raster/image_collection.py +184 -162
- sgis/raster/indices.py +0 -1
- {ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/RECORD +21 -19
- {ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.16.dist-info → ssb_sgis-1.2.0.dist-info}/WHEEL +0 -0
|
@@ -26,17 +26,14 @@ from shapely.errors import GEOSException
|
|
|
26
26
|
from shapely.geometry import LinearRing
|
|
27
27
|
from shapely.ops import SplitOp
|
|
28
28
|
|
|
29
|
+
from ..conf import config
|
|
29
30
|
from ..debug_config import _DEBUG_CONFIG
|
|
30
31
|
from ..debug_config import _try_debug_print
|
|
31
32
|
from ..maps.maps import explore_locals
|
|
32
33
|
from .conversion import to_gdf
|
|
33
34
|
from .conversion import to_geoseries
|
|
34
35
|
from .duplicates import _get_intersecting_geometries
|
|
35
|
-
from .general import _grouped_unary_union
|
|
36
|
-
from .general import _parallel_unary_union
|
|
37
|
-
from .general import _parallel_unary_union_geoseries
|
|
38
36
|
from .general import _push_geom_col
|
|
39
|
-
from .general import _unary_union_for_notna
|
|
40
37
|
from .general import clean_geoms
|
|
41
38
|
from .general import extend_lines
|
|
42
39
|
from .general import get_grouped_centroids
|
|
@@ -46,11 +43,13 @@ from .geometry_types import get_geom_type
|
|
|
46
43
|
from .geometry_types import make_all_singlepart
|
|
47
44
|
from .geometry_types import to_single_geom_type
|
|
48
45
|
from .neighbors import get_neighbor_indices
|
|
49
|
-
from .overlay import _try_difference
|
|
50
46
|
from .overlay import clean_overlay
|
|
51
47
|
from .polygons_as_rings import PolygonsAsRings
|
|
48
|
+
from .runners import OverlayRunner
|
|
49
|
+
from .runners import UnionRunner
|
|
52
50
|
from .sfilter import sfilter
|
|
53
51
|
from .sfilter import sfilter_inverse
|
|
52
|
+
from .utils import _unary_union_for_notna
|
|
54
53
|
|
|
55
54
|
PRECISION = 1e-3
|
|
56
55
|
_BUFFER = False
|
|
@@ -232,6 +231,8 @@ def eliminate_by_longest(
|
|
|
232
231
|
aggfunc: str | dict | list | None = None,
|
|
233
232
|
grid_size=None,
|
|
234
233
|
n_jobs: int = 1,
|
|
234
|
+
union_runner: UnionRunner | None = None,
|
|
235
|
+
overlay_runner: OverlayRunner | None = None,
|
|
235
236
|
**kwargs,
|
|
236
237
|
) -> tuple[GeoDataFrame]:
|
|
237
238
|
"""Dissolves selected polygons with the longest bordering neighbor polygon.
|
|
@@ -259,6 +260,10 @@ def eliminate_by_longest(
|
|
|
259
260
|
(if aggfunc="first").
|
|
260
261
|
grid_size: Rounding of the coordinates. Defaults to None.
|
|
261
262
|
n_jobs: Number of threads to use. Defaults to 1.
|
|
263
|
+
union_runner: Optionally debug/manipulate the spatial union operations.
|
|
264
|
+
See the 'runners' module for example implementations.
|
|
265
|
+
overlay_runner: Optionally debug/manipulate the spatial overlay operations.
|
|
266
|
+
See the 'runners' module for example implementations.
|
|
262
267
|
**kwargs: Keyword arguments passed to the dissolve method.
|
|
263
268
|
|
|
264
269
|
Returns:
|
|
@@ -350,6 +355,7 @@ def eliminate_by_longest(
|
|
|
350
355
|
keep_geom_type=False,
|
|
351
356
|
grid_size=grid_size,
|
|
352
357
|
n_jobs=n_jobs,
|
|
358
|
+
overlay_runner=overlay_runner,
|
|
353
359
|
).loc[lambda x: x["_eliminate_idx"].notna()]
|
|
354
360
|
|
|
355
361
|
borders["_length"] = borders.length
|
|
@@ -390,6 +396,8 @@ def eliminate_by_longest(
|
|
|
390
396
|
fix_double,
|
|
391
397
|
grid_size=grid_size,
|
|
392
398
|
n_jobs=n_jobs,
|
|
399
|
+
union_runner=union_runner,
|
|
400
|
+
overlay_runner=overlay_runner,
|
|
393
401
|
**kwargs,
|
|
394
402
|
)
|
|
395
403
|
|
|
@@ -434,6 +442,8 @@ def eliminate_by_longest(
|
|
|
434
442
|
ignore_index=ignore_index,
|
|
435
443
|
aggfunc=aggfunc,
|
|
436
444
|
grid_size=grid_size,
|
|
445
|
+
union_runner=union_runner,
|
|
446
|
+
overlay_runner=overlay_runner,
|
|
437
447
|
n_jobs=n_jobs,
|
|
438
448
|
)
|
|
439
449
|
|
|
@@ -494,6 +504,8 @@ def eliminate_by_largest(
|
|
|
494
504
|
predicate: str = "intersects",
|
|
495
505
|
grid_size=None,
|
|
496
506
|
n_jobs: int = 1,
|
|
507
|
+
union_runner: UnionRunner | None = None,
|
|
508
|
+
overlay_runner: OverlayRunner | None = None,
|
|
497
509
|
**kwargs,
|
|
498
510
|
) -> tuple[GeoDataFrame]:
|
|
499
511
|
"""Dissolves selected polygons with the largest neighbor polygon.
|
|
@@ -522,6 +534,10 @@ def eliminate_by_largest(
|
|
|
522
534
|
predicate: Binary predicate passed to sjoin. Defaults to "intersects".
|
|
523
535
|
grid_size: Rounding of the coordinates. Defaults to None.
|
|
524
536
|
n_jobs: Number of threads to use. Defaults to 1.
|
|
537
|
+
union_runner: Optionally debug/manipulate the spatial union operations.
|
|
538
|
+
See the 'runners' module for example implementations.
|
|
539
|
+
overlay_runner: Optionally debug/manipulate the spatial overlay operations.
|
|
540
|
+
See the 'runners' module for example implementations.
|
|
525
541
|
**kwargs: Keyword arguments passed to the dissolve method.
|
|
526
542
|
|
|
527
543
|
Returns:
|
|
@@ -566,6 +582,8 @@ def eliminate_by_largest(
|
|
|
566
582
|
fix_double=fix_double,
|
|
567
583
|
grid_size=grid_size,
|
|
568
584
|
n_jobs=n_jobs,
|
|
585
|
+
union_runner=union_runner,
|
|
586
|
+
overlay_runner=overlay_runner,
|
|
569
587
|
**kwargs,
|
|
570
588
|
)
|
|
571
589
|
|
|
@@ -581,6 +599,8 @@ def eliminate_by_smallest(
|
|
|
581
599
|
fix_double: bool = True,
|
|
582
600
|
grid_size=None,
|
|
583
601
|
n_jobs: int = 1,
|
|
602
|
+
union_runner: UnionRunner | None = None,
|
|
603
|
+
overlay_runner: OverlayRunner | None = None,
|
|
584
604
|
**kwargs,
|
|
585
605
|
) -> tuple[GeoDataFrame]:
|
|
586
606
|
return _eliminate_by_area(
|
|
@@ -594,6 +614,8 @@ def eliminate_by_smallest(
|
|
|
594
614
|
fix_double=fix_double,
|
|
595
615
|
grid_size=grid_size,
|
|
596
616
|
n_jobs=n_jobs,
|
|
617
|
+
union_runner=union_runner,
|
|
618
|
+
overlay_runner=overlay_runner,
|
|
597
619
|
**kwargs,
|
|
598
620
|
)
|
|
599
621
|
|
|
@@ -603,12 +625,14 @@ def _eliminate_by_area(
|
|
|
603
625
|
to_eliminate: GeoDataFrame,
|
|
604
626
|
max_distance: int | float | None,
|
|
605
627
|
sort_ascending: bool,
|
|
606
|
-
ignore_index: bool
|
|
607
|
-
aggfunc: str | dict | list | None
|
|
608
|
-
predicate
|
|
609
|
-
fix_double: bool
|
|
610
|
-
grid_size
|
|
611
|
-
n_jobs: int
|
|
628
|
+
ignore_index: bool,
|
|
629
|
+
aggfunc: str | dict | list | None,
|
|
630
|
+
predicate: str,
|
|
631
|
+
fix_double: bool,
|
|
632
|
+
grid_size,
|
|
633
|
+
n_jobs: int,
|
|
634
|
+
union_runner: UnionRunner,
|
|
635
|
+
overlay_runner: OverlayRunner,
|
|
612
636
|
**kwargs,
|
|
613
637
|
) -> GeoDataFrame:
|
|
614
638
|
_recurse = kwargs.pop("_recurse", False)
|
|
@@ -667,6 +691,8 @@ def _eliminate_by_area(
|
|
|
667
691
|
fix_double=fix_double,
|
|
668
692
|
grid_size=grid_size,
|
|
669
693
|
n_jobs=n_jobs,
|
|
694
|
+
union_runner=union_runner,
|
|
695
|
+
overlay_runner=overlay_runner,
|
|
670
696
|
**kwargs,
|
|
671
697
|
)
|
|
672
698
|
|
|
@@ -717,18 +743,14 @@ def _eliminate_by_area(
|
|
|
717
743
|
ignore_index=ignore_index,
|
|
718
744
|
aggfunc=aggfunc,
|
|
719
745
|
grid_size=grid_size,
|
|
746
|
+
union_runner=union_runner,
|
|
747
|
+
overlay_runner=overlay_runner,
|
|
720
748
|
n_jobs=n_jobs,
|
|
721
749
|
)
|
|
722
750
|
|
|
723
751
|
if not was_multiple_gdfs:
|
|
724
752
|
return out, isolated
|
|
725
753
|
|
|
726
|
-
for k, v in locals().items():
|
|
727
|
-
try:
|
|
728
|
-
print(k, v.columns)
|
|
729
|
-
except Exception:
|
|
730
|
-
pass
|
|
731
|
-
|
|
732
754
|
gdfs = ()
|
|
733
755
|
for i, cols in enumerate(original_cols):
|
|
734
756
|
df = out.loc[out["_df_idx"] == i, cols]
|
|
@@ -738,11 +760,26 @@ def _eliminate_by_area(
|
|
|
738
760
|
|
|
739
761
|
|
|
740
762
|
def _eliminate(
|
|
741
|
-
gdf,
|
|
763
|
+
gdf,
|
|
764
|
+
to_eliminate,
|
|
765
|
+
aggfunc,
|
|
766
|
+
crs,
|
|
767
|
+
fix_double,
|
|
768
|
+
grid_size,
|
|
769
|
+
n_jobs,
|
|
770
|
+
overlay_runner,
|
|
771
|
+
union_runner,
|
|
772
|
+
**kwargs,
|
|
742
773
|
):
|
|
774
|
+
|
|
743
775
|
if not len(to_eliminate):
|
|
744
776
|
return gdf
|
|
745
777
|
|
|
778
|
+
if union_runner is None:
|
|
779
|
+
union_runner = config.get_instance("union_runner", n_jobs)
|
|
780
|
+
if overlay_runner is None:
|
|
781
|
+
overlay_runner = config.get_instance("overlay_runner", n_jobs)
|
|
782
|
+
|
|
746
783
|
gdf["_range_idx_elim"] = range(len(gdf))
|
|
747
784
|
|
|
748
785
|
in_to_eliminate = gdf["_dissolve_idx"].isin(to_eliminate["_dissolve_idx"])
|
|
@@ -798,16 +835,6 @@ def _eliminate(
|
|
|
798
835
|
# all_geoms: pd.Series = gdf.set_index("_dissolve_idx").geometry
|
|
799
836
|
all_geoms: pd.Series = gdf.geometry
|
|
800
837
|
|
|
801
|
-
# more_than_one = get_num_geometries(all_geoms.values) > 1
|
|
802
|
-
# all_geoms.loc[more_than_one] = all_geoms.loc[more_than_one].apply(
|
|
803
|
-
# _unary_union_for_notna
|
|
804
|
-
# )
|
|
805
|
-
|
|
806
|
-
# more_than_one = get_num_geometries(to_be_eliminated.values) > 1
|
|
807
|
-
# to_be_eliminated.loc[more_than_one, "geometry"] = to_be_eliminated.loc[
|
|
808
|
-
# more_than_one, "geometry"
|
|
809
|
-
# ].apply(_unary_union_for_notna)
|
|
810
|
-
|
|
811
838
|
# create DataFrame of intersection pairs
|
|
812
839
|
tree = STRtree(all_geoms.values)
|
|
813
840
|
left, right = tree.query(
|
|
@@ -819,8 +846,6 @@ def _eliminate(
|
|
|
819
846
|
dict(enumerate(to_be_eliminated.index))
|
|
820
847
|
)
|
|
821
848
|
|
|
822
|
-
# pairs = pairs.loc[lambda x: x["right"] != x["_dissolve_idx"]]
|
|
823
|
-
|
|
824
849
|
soon_erased = to_be_eliminated.iloc[pairs.index]
|
|
825
850
|
intersecting = all_geoms.iloc[pairs["right"]]
|
|
826
851
|
|
|
@@ -829,61 +854,31 @@ def _eliminate(
|
|
|
829
854
|
intersecting = intersecting[shoud_not_erase]
|
|
830
855
|
|
|
831
856
|
missing = to_be_eliminated.loc[
|
|
832
|
-
|
|
833
|
-
# |
|
|
834
|
-
(~to_be_eliminated["_row_idx"].isin(soon_erased["_row_idx"])),
|
|
835
|
-
# | (~to_be_eliminated["_row_idx"].isin(soon_erased.index)),
|
|
836
|
-
"geometry",
|
|
857
|
+
(~to_be_eliminated["_row_idx"].isin(soon_erased["_row_idx"])), "geometry"
|
|
837
858
|
]
|
|
838
859
|
|
|
839
860
|
# allign and aggregate by dissolve index to not get duplicates in difference
|
|
840
861
|
intersecting.index = soon_erased.index
|
|
841
862
|
|
|
842
|
-
soon_erased =
|
|
843
|
-
intersecting =
|
|
863
|
+
soon_erased = union_runner.run(soon_erased, level=0, grid_size=grid_size)
|
|
864
|
+
intersecting = union_runner.run(intersecting, level=0, grid_size=grid_size)
|
|
844
865
|
|
|
845
866
|
assert soon_erased.index.equals(soon_erased.index)
|
|
846
867
|
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
# )
|
|
850
|
-
# intersecting = intersecting.groupby(level=0).agg(
|
|
851
|
-
# lambda x: unary_union(x, grid_size=grid_size)
|
|
852
|
-
# )
|
|
853
|
-
|
|
854
|
-
# explore_locals(center=_DEBUG_CONFIG["center"])
|
|
855
|
-
|
|
856
|
-
soon_erased.loc[:] = _try_difference(
|
|
868
|
+
soon_erased.loc[:] = overlay_runner.run(
|
|
869
|
+
difference,
|
|
857
870
|
soon_erased.to_numpy(),
|
|
858
871
|
intersecting.to_numpy(),
|
|
859
872
|
grid_size=grid_size,
|
|
860
|
-
n_jobs=n_jobs,
|
|
861
873
|
geom_type="polygon",
|
|
862
874
|
)
|
|
863
875
|
|
|
864
|
-
missing =
|
|
876
|
+
missing = union_runner.run(missing, level=0, grid_size=grid_size)
|
|
865
877
|
|
|
866
878
|
missing = make_all_singlepart(missing).loc[lambda x: x.area > 0]
|
|
867
879
|
|
|
868
880
|
soon_erased = make_all_singlepart(soon_erased).loc[lambda x: x.area > 0]
|
|
869
881
|
|
|
870
|
-
if 0:
|
|
871
|
-
tree = STRtree(soon_erased.values)
|
|
872
|
-
left, right = tree.query(missing.values, predicate="intersects")
|
|
873
|
-
explore_locals(
|
|
874
|
-
missing2=to_gdf(missing.to_numpy()[left], 25833),
|
|
875
|
-
soon_erased2=to_gdf(soon_erased.to_numpy()[right], 25833),
|
|
876
|
-
center=_DEBUG_CONFIG["center"],
|
|
877
|
-
)
|
|
878
|
-
missing = pd.Series(
|
|
879
|
-
difference(
|
|
880
|
-
missing.to_numpy()[left],
|
|
881
|
-
soon_erased.to_numpy()[right],
|
|
882
|
-
grid_size=grid_size,
|
|
883
|
-
),
|
|
884
|
-
index=left,
|
|
885
|
-
).loc[lambda x: (x.notna()) & (~is_empty(x))]
|
|
886
|
-
|
|
887
882
|
soon_eliminated = pd.concat([eliminators, soon_erased, missing])
|
|
888
883
|
more_than_one = get_num_geometries(soon_eliminated.values) > 1
|
|
889
884
|
|
|
@@ -891,29 +886,13 @@ def _eliminate(
|
|
|
891
886
|
_unary_union_for_notna
|
|
892
887
|
)
|
|
893
888
|
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
soon_eliminated,
|
|
898
|
-
level=0,
|
|
899
|
-
grid_size=grid_size,
|
|
900
|
-
n_jobs=n_jobs,
|
|
901
|
-
),
|
|
902
|
-
index=eliminated.index,
|
|
903
|
-
)
|
|
904
|
-
else:
|
|
905
|
-
eliminated["geometry"] = _grouped_unary_union(soon_eliminated, level=0)
|
|
906
|
-
# eliminated["geometry"] = soon_eliminated.groupby(level=0).agg(
|
|
907
|
-
# lambda x: make_valid(unary_union(x))
|
|
908
|
-
# )
|
|
909
|
-
|
|
889
|
+
eliminated["geometry"] = union_runner.run(
|
|
890
|
+
soon_eliminated, level=0, grid_size=grid_size
|
|
891
|
+
)
|
|
910
892
|
else:
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
)
|
|
915
|
-
else:
|
|
916
|
-
eliminated["geometry"] = _grouped_unary_union(many_hits, by="_dissolve_idx")
|
|
893
|
+
eliminated["geometry"] = union_runner.run(
|
|
894
|
+
many_hits, by="_dissolve_idx", grid_size=grid_size, n_jobs=n_jobs
|
|
895
|
+
)
|
|
917
896
|
|
|
918
897
|
# setting crs on the GeometryArrays to avoid warning in concat
|
|
919
898
|
not_to_dissolve.geometry.values.crs = crs
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import joblib
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from geopandas import GeoDataFrame
|
|
12
|
+
from geopandas import GeoSeries
|
|
13
|
+
from shapely import STRtree
|
|
14
|
+
from shapely import get_parts
|
|
15
|
+
from shapely import make_valid
|
|
16
|
+
from shapely import union_all
|
|
17
|
+
from shapely.errors import GEOSException
|
|
18
|
+
|
|
19
|
+
from .utils import _unary_union_for_notna
|
|
20
|
+
from .utils import make_valid_and_keep_geom_type
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
|
|
24
|
+
class AbstractRunner(ABC):
|
|
25
|
+
"""Blueprint for 'runner' classes.
|
|
26
|
+
|
|
27
|
+
Subclasses must implement a 'run' method.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
n_jobs: Number of workers.
|
|
31
|
+
backend: Backend for the workers.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
n_jobs: int
|
|
35
|
+
backend: str | None = None
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def run(self, *args, **kwargs) -> Any:
|
|
39
|
+
"""Abstract run method."""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class UnionRunner(AbstractRunner):
    """Union geometries group by group with pandas.groupby.

    Subclasses must implement a 'run' method that takes the arguments
    'df' (GeoDataFrame or GeoSeries), 'by' (optional column to group by), 'grid_size'
    (forwarded to the union function) and **kwargs passed to pandas.DataFrame.groupby.

    Args:
        n_jobs: Number of workers.
        backend: Backend for the workers.
    """

    n_jobs: int
    backend: str | None = None

    def run(
        self,
        df: GeoDataFrame | GeoSeries | pd.DataFrame | pd.Series,
        by: str | list[str] | None = None,
        grid_size: float | int | None = None,
        **kwargs,
    ) -> GeoSeries | GeoDataFrame:
        """Union the geometries of each group, in parallel if n_jobs > 1.

        Args:
            df: Frame or series holding the geometries.
            by: What to group by. If None and no 'level' is given in kwargs,
                all rows are treated as one single group.
            grid_size: Forwarded as 'grid_size' to the union function.
            **kwargs: Keyword arguments passed to groupby. 'as_index' is
                consumed here and decides the shape of the return value.
                A stray 'n_jobs' keyword is ignored, since the number of
                workers is taken from the runner instance.

        Returns:
            A GeoSeries of unioned geometries indexed by group. If 'as_index'
            is False, the result of calling reset_index on that GeoSeries.
        """
        # groupby does not accept 'n_jobs'; drop it so callers that pass it
        # along with the other overlay arguments do not trigger a TypeError.
        kwargs.pop("n_jobs", None)

        # assume geometry column is 'geometry' if input is pandas.Series or pandas.DataFrame
        try:
            geom_col = df.geometry.name
        except AttributeError:
            try:
                geom_col = df.name
                if geom_col is None:
                    geom_col = "geometry"
            except AttributeError:
                geom_col = "geometry"

        # plain pandas objects carry no crs
        try:
            crs = df.crs
        except AttributeError:
            crs = None

        unary_union_for_grid_size = functools.partial(
            _unary_union_for_notna, grid_size=grid_size
        )

        as_index = kwargs.pop("as_index", True)
        if by is None and "level" not in kwargs:
            # constant key: everything ends up in one group
            by = np.zeros(len(df), dtype="int64")

        try:
            # DataFrame
            groupby_obj = df.groupby(by, **kwargs)[geom_col]
        except KeyError:
            # Series
            groupby_obj = df.groupby(by, **kwargs)

        if self.n_jobs is None or self.n_jobs == 1:
            results = groupby_obj.agg(unary_union_for_grid_size)
            index = results.index
        else:
            backend = self.backend or "loky"
            with joblib.Parallel(n_jobs=self.n_jobs, backend=backend) as parallel:
                results = parallel(
                    joblib.delayed(unary_union_for_grid_size)(geoms)
                    for _, geoms in groupby_obj
                )
            # size() is only used to get the group keys matching the results
            index = groupby_obj.size().index

        agged = GeoSeries(results, index=index, name=geom_col, crs=crs)
        if not as_index:
            return agged.reset_index()
        return agged
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _strtree_query(arr1, arr2, **kwargs):
    """Query a spatial index built from arr2 with the geometries in arr1.

    Keyword arguments are handed on to the tree's 'query' method.
    """
    return STRtree(arr2).query(arr1, **kwargs)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class RTreeQueryRunner(AbstractRunner):
    """Runner for spatial index queries with shapely.STRtree.

    Subclasses that customise the query must provide a 'run' method that takes a
    numpy.ndarray of geometries as 0th and 1st argument and **kwargs passed to the
    tree's query method, chiefly 'predicate' and 'distance'. The method should
    return the spatial index pairs of the left and right input arrays.

    Args:
        n_jobs: Number of workers.
        backend: Backend for the workers.
    """

    n_jobs: int
    backend: str = "loky"

    def run(
        self, arr1: np.ndarray, arr2: np.ndarray, **kwargs
    ) -> tuple[np.ndarray, np.ndarray]:
        """Query a tree built from arr2 with arr1 and return the index pairs of hits."""
        # NOTE: the query is done in one go; 'n_jobs' and 'backend' are not
        # used by this implementation.
        spatial_index = STRtree(arr2)
        return spatial_index.query(arr1, **kwargs)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@dataclass
class OverlayRunner(AbstractRunner):
    """Run a vectorized shapely overlay operation on two equal-length numpy arrays.

    Subclasses must implement a 'run' method that takes an overlay function (shapely.intersection, shapely.difference etc.)
    as 0th argument and two numpy.ndarrays of same length as 1st and 2nd argument.
    The 'run' method should also take the argument 'grid_size' to be passed to the overlay function
    and the argument 'geom_type' which is used to keep only relevant geometries (polygon, line or point)
    in cases of GEOSExceptions caused by geometry type mismatch.
    Defaults to an instance of OverlayRunner, which is run sequentially (no n_jobs)
    because the vectorized shapely functions are usually faster than any attempt to parallelize.
    """

    n_jobs: None = None
    backend: None = None

    @staticmethod
    def run(
        func: Callable,
        arr1: np.ndarray,
        arr2: np.ndarray,
        grid_size: int | float | None,
        geom_type: str | None,
    ) -> np.ndarray:
        """Run the overlay operation (func) with fallback.

        First tries to run func on the arrays as they are. If a GEOSException
        is raised, both arrays are passed through make_valid_and_keep_geom_type
        with the given geom_type, only the rows whose index is present on both
        sides are kept, and func is run once more.

        Args:
            func: Overlay function called as func(arr1, arr2, grid_size=grid_size).
            arr1: Left geometries.
            arr2: Right geometries.
            grid_size: Passed as keyword argument to func.
            geom_type: Passed to make_valid_and_keep_geom_type in the fallback.

        Returns:
            Whatever func returns.
        """
        try:
            return func(arr1, arr2, grid_size=grid_size)
        except GEOSException:
            # assumes make_valid_and_keep_geom_type returns index-labelled
            # pandas objects (they are used with .index/.loc/.to_numpy below)
            geoms1 = make_valid_and_keep_geom_type(arr1, geom_type=geom_type)
            geoms2 = make_valid_and_keep_geom_type(arr2, geom_type=geom_type)

            # Both masks are computed while the index is still available.
            # Converting one side to numpy first would lose the index that
            # the other side is filtered against.
            keep1 = geoms1.index.isin(geoms2.index)
            keep2 = geoms2.index.isin(geoms1.index)

            return func(
                geoms1.loc[keep1].to_numpy(),
                geoms2.loc[keep2].to_numpy(),
                grid_size=grid_size,
            )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@dataclass
class GridSizeOverlayRunner(OverlayRunner):
    """Run a shapely overlay operation rowwise for different grid_sizes until success.

    Args:
        n_jobs: Number of workers.
        backend: Backend for the workers. Not read by 'run', which always
            uses the "threading" backend.
        grid_sizes: Grid sizes to try in turn when the overlay fails. Required.
    """

    n_jobs: int
    backend: str | None
    grid_sizes: list[float] | None = None

    def __post_init__(self) -> None:
        """Check that grid_sizes is passed."""
        if self.grid_sizes is None:
            raise ValueError(
                f"must set 'grid_sizes' in the {self.__class__.__name__} initialiser."
            )

    def run(
        self,
        func: Callable,
        arr1: np.ndarray,
        arr2: np.ndarray,
        grid_size: int | float | None = None,
        geom_type: str | None = None,
    ) -> np.ndarray:
        """Run the overlay operation rowwise with fallback.

        The overlay operation (func) is looped for each row in arr1 and arr2
        as 0th and 1st argument to 'func' and 'grid_size' as keyword argument. If a GEOSException is thrown,
        geometries are made valid and GeometryCollections are forced to either
        (Multi)Point, (Multi)Polygon or (Multi)LineString, depending on the value in "geom_type".
        Then, if another GEOSException is thrown, the overlay operation is looped for the grid_sizes given
        in the instance's 'grid_sizes' attribute.

        Args:
            func: Overlay function taking two geometries and 'grid_size'.
            arr1: Left geometries.
            arr2: Right geometries, must have the same length as arr1.
            grid_size: Grid size for the first attempt.
            geom_type: Geometry type to keep in the fallback. May be None.

        Returns:
            The rowwise results in input order, as returned by joblib.Parallel.
        """
        # geom_type is optional, so only lowercase it when it is given
        kwargs = dict(
            grid_size=grid_size,
            geom_type=geom_type.lower() if geom_type is not None else None,
            grid_sizes=self.grid_sizes,
        )
        with joblib.Parallel(self.n_jobs, backend="threading") as parallel:
            return parallel(
                joblib.delayed(_run_overlay_rowwise)(func, g1, g2, **kwargs)
                for g1, g2 in zip(arr1, arr2, strict=True)
            )
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _run_overlay_rowwise(func, geom1, geom2, grid_size, geom_type, grid_sizes):
    """Run an overlay function on one pair of geometries with fallbacks.

    The attempts are made in this order, each one only if the previous
    raised a GEOSException:

    1. func(geom1, geom2, grid_size=grid_size) on the geometries as given.
    2. func without grid_size, after the geometries are made valid, split
       into parts, filtered on 'geom_type' and unioned again.
    3. func on the cleaned geometries with each value in 'grid_sizes'.

    Args:
        func: Overlay function taking two geometries and 'grid_size'.
        geom1: Left geometry.
        geom2: Right geometry.
        grid_size: Grid size for the first attempt.
        geom_type: Geometry type to keep in the cleanup step, e.g. "polygon".
            Matched case-insensitively against each part's geom_type.
            If None, all parts are kept.
        grid_sizes: Grid sizes to try in the last step.

    Returns:
        The result of the first call to func that succeeds. None if all
        attempts fail and 'grid_sizes' is empty.

    Raises:
        GEOSException: If the attempt with the last grid size also fails.
    """
    try:
        return func(geom1, geom2, grid_size=grid_size)
    except GEOSException:
        pass

    # shapely reports geom_type capitalised ("Polygon", "MultiLineString"),
    # so both sides are lowercased before the substring check.
    wanted = geom_type.lower() if geom_type is not None else None

    def _is_wanted(geom) -> bool:
        if not pd.notna(geom):
            return False
        return wanted is None or wanted in geom.geom_type.lower()

    parts1 = get_parts(make_valid(geom1))
    parts2 = get_parts(make_valid(geom2))
    geom1 = union_all([g for g in parts1 if _is_wanted(g)])
    geom2 = union_all([g for g in parts2 if _is_wanted(g)])

    try:
        return func(geom1, geom2)
    except GEOSException:
        pass

    last = len(grid_sizes) - 1
    for i, grid_size in enumerate(grid_sizes):
        try:
            return func(geom1, geom2, grid_size=grid_size)
        except GEOSException:
            # only give up when there are no more grid sizes to try
            if i == last:
                raise
|