ssb-sgis 1.1.17-py3-none-any.whl → 1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +4 -0
- sgis/conf.py +56 -4
- sgis/geopandas_tools/buffer_dissolve_explode.py +24 -47
- sgis/geopandas_tools/conversion.py +18 -25
- sgis/geopandas_tools/duplicates.py +44 -60
- sgis/geopandas_tools/general.py +8 -84
- sgis/geopandas_tools/overlay.py +177 -256
- sgis/geopandas_tools/polygon_operations.py +67 -88
- sgis/geopandas_tools/runners.py +277 -0
- sgis/geopandas_tools/sfilter.py +40 -24
- sgis/geopandas_tools/utils.py +37 -0
- sgis/helpers.py +1 -1
- sgis/io/dapla_functions.py +5 -7
- sgis/maps/map.py +3 -1
- sgis/parallel/parallel.py +32 -24
- sgis/raster/image_collection.py +184 -162
- sgis/raster/indices.py +0 -1
- {ssb_sgis-1.1.17.dist-info → ssb_sgis-1.2.0.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.17.dist-info → ssb_sgis-1.2.0.dist-info}/RECORD +21 -19
- {ssb_sgis-1.1.17.dist-info → ssb_sgis-1.2.0.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.17.dist-info → ssb_sgis-1.2.0.dist-info}/WHEEL +0 -0
sgis/geopandas_tools/sfilter.py
CHANGED

```diff
@@ -4,11 +4,10 @@ import numpy as np
 import pandas as pd
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
-from geopandas import __version__ as geopandas_version
 from shapely import Geometry
-from shapely import STRtree
 
 from .conversion import to_gdf
+from .runners import RTreeQueryRunner
 
 gdf_type_error_message = "'gdf' should be of type GeoDataFrame or GeoSeries."
 
@@ -18,6 +17,8 @@ def sfilter(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame:
     """Filter a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -33,6 +34,9 @@ def sfilter(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows matching the
@@ -80,7 +84,9 @@ def sfilter(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return gdf.iloc[indices]
 
@@ -90,6 +96,8 @@ def sfilter_split(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> tuple[GeoDataFrame, GeoDataFrame]:
     """Split a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -101,6 +109,9 @@ def sfilter_split(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A tuple of GeoDataFrames, one with the rows that match the spatial predicate
@@ -151,7 +162,9 @@ def sfilter_split(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return (
         gdf.iloc[indices],
@@ -164,6 +177,8 @@ def sfilter_inverse(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame | GeoSeries:
     """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
 
@@ -174,6 +189,9 @@ def sfilter_inverse(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows that do not match the
@@ -215,11 +233,10 @@ def sfilter_inverse(
     """
     if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
         raise TypeError(gdf_type_error_message)
-
     other = _sfilter_checks(other, crs=gdf.crs)
-
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
-
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
     return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
 
 
@@ -252,6 +269,8 @@ def _get_sfilter_indices(
     right: GeoDataFrame | GeoSeries | Geometry,
     predicate: str,
     distance: int | float | None,
+    n_jobs: int,
+    rtree_runner: RTreeQueryRunner | None,
 ) -> np.ndarray:
     """Compute geometric comparisons and get matching indices.
 
@@ -264,6 +283,9 @@ def _get_sfilter_indices(
     right : GeoDataFrame
     predicate : string
         Binary predicate to query.
+    n_jobs: Number of workers.
+    rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+        See the 'runners' module for example implementations.
 
     Returns:
     -------
@@ -273,6 +295,9 @@ def _get_sfilter_indices(
     """
     original_predicate = predicate
 
+    if rtree_runner is None:
+        rtree_runner = RTreeQueryRunner(n_jobs)
+
     with warnings.catch_warnings():
         # We don't need to show our own warning here
         # TODO remove this once the deprecation has been enforced
@@ -285,25 +310,16 @@ def _get_sfilter_indices(
         # contains is a faster predicate
         # see discussion at https://github.com/geopandas/geopandas/pull/1421
         predicate = "contains"
-        tree, kwargs = _get_spatial_tree(left)
-        input_geoms = right.geometry.values
+        arr1 = right.geometry.values
+        arr2 = left.geometry.values
     else:
         # all other predicates are symmetric
         # keep them the same
-        tree, kwargs = _get_spatial_tree(right)
-        input_geoms = left.geometry.values
+        arr1 = left.geometry.values
+        arr2 = right.geometry.values
 
-    l_idx, r_idx = tree.query(
-        input_geoms, predicate=predicate, distance=distance, **kwargs
-    )
+    left, right = rtree_runner.run(arr1, arr2, predicate=predicate, distance=distance)
 
     if original_predicate == "within":
-        return np.sort(np.unique(r_idx))
-
-    return np.sort(np.unique(l_idx))
-
-
-def _get_spatial_tree(df):
-    if int(geopandas_version[0]) >= 1:
-        return df.sindex, {"sort": False}
-    return STRtree(df.geometry.values), {}
+        return np.sort(np.unique(right))
+    return np.sort(np.unique(left))
```
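The new keywords are additive and default to the old single-threaded behaviour. A minimal sketch of how they might be called (the point data is invented; the `RTreeQueryRunner(n_jobs)` constructor and the `runners` module path follow the import and instantiation shown in the diff):

```python
# Sketch of the new sfilter keywords, assuming sgis >= 1.2.0.
# The geometries here are invented; only the keyword names come from the diff.
import sgis as sg
from shapely.geometry import Point

points = sg.to_gdf([Point(0, 0), Point(5, 5)], crs=25833)
mask = sg.to_gdf(Point(0, 0), crs=25833).buffer(1)

# run the rtree queries with 4 workers (a default runner is
# constructed internally as RTreeQueryRunner(n_jobs))
inside = sg.sfilter(points, mask, n_jobs=4)

# or inject a runner explicitly, e.g. to log or tweak the queries
from sgis.geopandas_tools.runners import RTreeQueryRunner

inside, outside = sg.sfilter_split(points, mask, rtree_runner=RTreeQueryRunner(4))
```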
sgis/geopandas_tools/utils.py
ADDED

```diff
@@ -0,0 +1,37 @@
+import numpy as np
+import pandas as pd
+from geopandas import GeoSeries
+from shapely import make_valid
+from shapely import union_all
+
+from .geometry_types import to_single_geom_type
+
+
+def _unary_union_for_notna(geoms, **kwargs):
+    try:
+        return make_valid(union_all(geoms, **kwargs))
+    except TypeError:
+        return make_valid(union_all([geom for geom in geoms.dropna().values], **kwargs))
+
+
+def make_valid_and_keep_geom_type(geoms: np.ndarray, geom_type: str) -> GeoSeries:
+    """Make GeometryCollections into (Multi)Polygons, (Multi)LineStrings or (Multi)Points.
+
+    Because GeometryCollections might appear after dissolving (union_all).
+    And this makes shapely difference/intersection fail.
+
+    Args:
+        geoms: Array of geometries.
+        geom_type: geometry type to be kept.
+    """
+    geoms = GeoSeries(geoms)
+    geoms.index = range(len(geoms))
+    geoms.loc[:] = make_valid(geoms.to_numpy())
+    geoms_with_correct_type = geoms.explode(index_parts=False).pipe(
+        to_single_geom_type, geom_type
+    )
+    only_one = geoms_with_correct_type.groupby(level=0).transform("size") == 1
+    one_hit = geoms_with_correct_type[only_one]
+    many_hits = geoms_with_correct_type[~only_one].groupby(level=0).agg(union_all)
+    geoms_with_wrong_type = geoms.loc[~geoms.index.isin(geoms_with_correct_type.index)]
+    return pd.concat([one_hit, many_hits, geoms_with_wrong_type]).sort_index()
```
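`make_valid_and_keep_geom_type` explodes each geometry, keeps only the parts matching `geom_type`, and re-unions rows that exploded into several matching parts. A small sketch of the effect (the input data is invented; the import path follows the new file's location):

```python
# Sketch: stripping non-polygon parts out of a GeometryCollection
# with the helper added above. The geometries are invented.
import numpy as np
from shapely.geometry import GeometryCollection, LineString, Point, Polygon

from sgis.geopandas_tools.utils import make_valid_and_keep_geom_type

square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
collection = GeometryCollection(
    [square, LineString([(0, 0), (1, 1)]), Point(2, 2)]
)

cleaned = make_valid_and_keep_geom_type(
    np.array([collection, square]), geom_type="polygon"
)
# row 0 keeps only the polygon part of the collection; row 1 is unchanged
print(cleaned.geom_type.tolist())  # ['Polygon', 'Polygon']
```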
sgis/helpers.py
CHANGED

```diff
@@ -198,7 +198,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
 
 
 def return_two_vals(
-    vals: tuple[str, str] | list[str] | str | int | float
+    vals: tuple[str, str] | list[str] | str | int | float,
 ) -> tuple[str | int | float, str | int | float]:
     """Return a two-length tuple from a str/int/float or list/tuple of length 1 or 2.
 
```
sgis/io/dapla_functions.py
CHANGED

```diff
@@ -779,14 +779,12 @@ def _read_partitioned_parquet(
     if all(isinstance(x, DataFrame) for x in results):
         return pd.concat(results)
     else:
-        geo_metadata = _get_geo_metadata(next(iter(child_paths)))
-        return _arrow_to_geopandas(
-            pyarrow.concat_tables(
-                results,
-                promote_options="permissive",
-            ),
-            geo_metadata,
+        results = pyarrow.concat_tables(
+            results,
+            promote_options="permissive",
         )
+        geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
+        return _arrow_to_geopandas(results, geo_metadata)
 
     # add columns to empty DataFrame
     first_path = next(iter(child_paths + [path]))
```
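The rewrite concatenates the partition tables up front and reads the geo metadata in a separate step instead of nesting both inside the `_arrow_to_geopandas` call. For reference, a standalone sketch of the pyarrow call it relies on (the tables are invented; `promote_options` requires a reasonably recent pyarrow):

```python
# Sketch of pyarrow.concat_tables with permissive schema promotion.
# The tables are invented; only the call shape mirrors the diff.
import pyarrow

t1 = pyarrow.table({"a": [1, 2], "b": ["x", "y"]})
t2 = pyarrow.table({"a": [3, 4]})  # no column "b"

# "permissive" unifies the schemas and null-fills missing columns
# instead of raising on the mismatch
combined = pyarrow.concat_tables([t1, t2], promote_options="permissive")
print(combined.column("b"))  # ["x", "y", null, null]
```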
sgis/maps/map.py
CHANGED

```diff
@@ -307,7 +307,9 @@ class Map:
         notna = array[array.notna()]
         isna = array[array.isna()]
 
-        unique_multiplied = (notna * self._multiplier).astype(np.int64)
+        unique_multiplied = (notna.astype(np.float64) * self._multiplier).astype(
+            np.int64
+        )
 
         return pd.concat([unique_multiplied, isna]).sort_index()
 
```
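The change coerces the values to float64 before multiplying, so the final int64 cast is well defined regardless of the input dtype. A minimal sketch of the new cast chain (the series and multiplier are invented):

```python
# Sketch of the new cast chain: values are coerced to float64 before
# the multiply, then truncated to int64, even if the series arrives
# with a non-float (e.g. object) dtype.
import numpy as np
import pandas as pd

notna = pd.Series([0.5, 2.25, 8.0], dtype=object)  # invented data
multiplier = 1_000

unique_multiplied = (notna.astype(np.float64) * multiplier).astype(np.int64)
print(unique_multiplied.tolist())  # [500, 2250, 8000]
```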
sgis/parallel/parallel.py
CHANGED

```diff
@@ -75,13 +75,15 @@ def parallel_overlay(
     Returns:
         A GeoDataFrame containing the result of the overlay operation.
     """
+    if how != "intersection":
+        raise ValueError("parallel_overlay only supports how='intersection'.")
     return pd.concat(
         chunkwise(
             _clean_overlay_with_print,
             df1,
             kwargs={
                 "df2": df2,
                 "to_print": to_print,
                 "how": how,
             }
             | kwargs,
@@ -672,7 +674,7 @@ class Parallel:
     def chunkwise(
         self,
         func: Callable,
-        iterable: Collection[Any],
+        *iterables: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
        max_rows_per_chunk: int | None = None,
@@ -682,8 +684,8 @@ class Parallel:
         Args:
             func: Function to run chunkwise. It should take
                 (a chunk of) the iterable as first argument.
-            iterable: Iterable to split in chunks and passed
-                as first argument to 'func'.
+            iterables: Iterable(s) to split in chunks and passed
+                as first argument(s) to 'func'. Iterables must have same length.
             args: Positional arguments in 'func' after the DataFrame.
             kwargs: Additional keyword arguments in 'func'.
             max_rows_per_chunk: Alternatively decide number of chunks
@@ -691,7 +693,7 @@ class Parallel:
         """
         return chunkwise(
             func,
-            iterable,
+            *iterables,
             args=args,
             kwargs=kwargs,
             processes=self.processes,
@@ -1067,7 +1069,7 @@ def _fix_missing_muni_numbers(
 
 def chunkwise(
     func: Callable,
-    iterable: Collection[Any],
+    *iterables: Collection[Iterable[Any]],
     args: tuple | None = None,
     kwargs: dict | None = None,
     processes: int = 1,
@@ -1082,7 +1084,7 @@ def chunkwise(
     Args:
         func: The function to apply to each chunk. This function must accept a DataFrame as
            its first argument and return a DataFrame.
-        iterable: The iterable to be chunked and processed.
+        iterables: Iterable(s) to be chunked and processed. Must have same length.
        args: Additional positional arguments to pass to 'func'.
        kwargs: Keyword arguments to pass to 'func'.
        processes: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
@@ -1096,30 +1098,36 @@ def chunkwise(
     args = args or ()
     kwargs = kwargs or {}
 
+    if len({len(x) for x in iterables}) not in [0, 1]:
+        raise ValueError(
+            f"iterables must have same length. Got {', '.join([len(x) for x in iterables])}"
+        )
+
     if max_rows_per_chunk is None:
         n_chunks: int = processes
     else:
-        n_chunks: int = len(iterable) // max_rows_per_chunk
+        n_chunks: int = len(next(iter(iterables))) // max_rows_per_chunk
     if n_chunks <= 1:
-        return [func(iterable, *args, **kwargs)]
+        return [func(*iterables, *args, **kwargs)]
 
-    chunks = np.array_split(np.arange(len(iterable)), n_chunks)
+    chunks = np.array_split(np.arange(len(next(iter(iterables)))), n_chunks)
 
-    chunked_iterables = [
-        iterable.iloc[chunk]
-        for chunk in chunks
-    ]
-    ...
-    return Parallel(processes, backend=backend).map(
+    def get_chunk(iterable, chunk):
+        if hasattr(iterable, "iloc"):
+            return iterable.iloc[chunk]
+        elif is_array_like(iterable):
+            return iterable[chunk]
+        else:
+            to_type: type = iterable.__class__
+            return to_type([x for i, x in enumerate(iterable) if i in chunk])
+
+    iterables_chunked: list[list[Iterable[Any]]] = [
+        [get_chunk(iterable, chunk) for iterable in iterables] for chunk in chunks
+    ]
+
+    return Parallel(processes, backend=backend).starmap(
         func,
-        chunked_iterables,
+        iterables_chunked,
         args=args,
         kwargs=kwargs,
     )
```