ssb-sgis 1.1.17__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,10 @@ import numpy as np
 import pandas as pd
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
-from geopandas import __version__ as geopandas_version
 from shapely import Geometry
-from shapely import STRtree
 
 from .conversion import to_gdf
+from .runners import RTreeQueryRunner
 
 gdf_type_error_message = "'gdf' should be of type GeoDataFrame or GeoSeries."
 
@@ -18,6 +17,8 @@ def sfilter(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame:
     """Filter a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -33,6 +34,9 @@ def sfilter(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows matching the
@@ -80,7 +84,9 @@ def sfilter(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return gdf.iloc[indices]
 
@@ -90,6 +96,8 @@ def sfilter_split(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> tuple[GeoDataFrame, GeoDataFrame]:
     """Split a GeoDataFrame or GeoSeries by spatial predicate.
 
@@ -101,6 +109,9 @@ def sfilter_split(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A tuple of GeoDataFrames, one with the rows that match the spatial predicate
@@ -151,7 +162,9 @@ def sfilter_split(
 
     other = _sfilter_checks(other, crs=gdf.crs)
 
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
 
     return (
         gdf.iloc[indices],
@@ -164,6 +177,8 @@ def sfilter_inverse(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
+    n_jobs: int = 1,
+    rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame | GeoSeries:
     """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
 
@@ -174,6 +189,9 @@ def sfilter_inverse(
         other: The geometry object to filter 'gdf' by.
         predicate: Spatial predicate to use. Defaults to 'intersects'.
         distance: Max distance to allow if predicate=="dwithin".
+        n_jobs: Number of workers.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
 
     Returns:
         A copy of 'gdf' with only the rows that do not match the
@@ -215,11 +233,10 @@ def sfilter_inverse(
     """
     if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
         raise TypeError(gdf_type_error_message)
-
     other = _sfilter_checks(other, crs=gdf.crs)
-
-    indices = _get_sfilter_indices(gdf, other, predicate, distance)
-
+    indices = _get_sfilter_indices(
+        gdf, other, predicate, distance, n_jobs, rtree_runner
+    )
     return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
 
 
@@ -252,6 +269,8 @@ def _get_sfilter_indices(
     right: GeoDataFrame | GeoSeries | Geometry,
     predicate: str,
     distance: int | float | None,
+    n_jobs: int,
+    rtree_runner: RTreeQueryRunner | None,
 ) -> np.ndarray:
     """Compute geometric comparisons and get matching indices.
 
@@ -264,6 +283,9 @@ def _get_sfilter_indices(
     right : GeoDataFrame
     predicate : string
         Binary predicate to query.
+    n_jobs: Number of workers.
+    rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+        See the 'runners' module for example implementations.
 
     Returns:
     -------
@@ -273,6 +295,9 @@ def _get_sfilter_indices(
     """
     original_predicate = predicate
 
+    if rtree_runner is None:
+        rtree_runner = RTreeQueryRunner(n_jobs)
+
     with warnings.catch_warnings():
         # We don't need to show our own warning here
         # TODO remove this once the deprecation has been enforced
@@ -285,25 +310,16 @@ def _get_sfilter_indices(
             # contains is a faster predicate
             # see discussion at https://github.com/geopandas/geopandas/pull/1421
             predicate = "contains"
-            sindex, kwargs = _get_spatial_tree(left)
-            input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
+            arr1 = right.geometry.values
+            arr2 = left.geometry.values
         else:
             # all other predicates are symmetric
             # keep them the same
-            sindex, kwargs = _get_spatial_tree(right)
-            input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
+            arr1 = left.geometry.values
+            arr2 = right.geometry.values
 
-    l_idx, r_idx = sindex.query(
-        input_geoms, predicate=predicate, distance=distance, **kwargs
-    )
+    left, right = rtree_runner.run(arr1, arr2, predicate=predicate, distance=distance)
 
     if original_predicate == "within":
-        return np.sort(np.unique(r_idx))
-
-    return np.sort(np.unique(l_idx))
-
-
-def _get_spatial_tree(df):
-    if int(geopandas_version[0]) >= 1:
-        return df.sindex, {"sort": False}
-    return STRtree(df.geometry.values), {}
+        return np.sort(np.unique(right))
+    return np.sort(np.unique(left))
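A hypothetical usage sketch of the new keyword arguments (not part of the diff). The import paths, the top-level sfilter export and the RTreeQueryRunner constructor/run signatures are inferred from the hunks above and may differ from the released API:

# Hypothetical sketch: the new 'n_jobs' and 'rtree_runner' arguments to sfilter.
import geopandas as gpd
from shapely.geometry import Point

from sgis import sfilter  # assumed top-level export
from sgis.runners import RTreeQueryRunner  # assumed import path

points = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(5, 5)], crs=25833)
mask = gpd.GeoDataFrame(geometry=[Point(0, 0).buffer(1)], crs=25833)

# Default behaviour: _get_sfilter_indices builds RTreeQueryRunner(n_jobs) itself.
hits = sfilter(points, mask, predicate="intersects")

# Spread the rtree queries over several workers.
hits = sfilter(points, mask, n_jobs=4)


class LoggingRunner(RTreeQueryRunner):
    """Hooks into the query step; 'run' mirrors the call site in _get_sfilter_indices."""

    def run(self, arr1, arr2, **kwargs):
        print(f"rtree query: {len(arr1)} x {len(arr2)} geometries")
        return super().run(arr1, arr2, **kwargs)


# n_jobs passed positionally, as in _get_sfilter_indices.
hits = sfilter(points, mask, rtree_runner=LoggingRunner(1))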
@@ -0,0 +1,37 @@
+import numpy as np
+import pandas as pd
+from geopandas import GeoSeries
+from shapely import make_valid
+from shapely import union_all
+
+from .geometry_types import to_single_geom_type
+
+
+def _unary_union_for_notna(geoms, **kwargs):
+    try:
+        return make_valid(union_all(geoms, **kwargs))
+    except TypeError:
+        return make_valid(union_all([geom for geom in geoms.dropna().values], **kwargs))
+
+
+def make_valid_and_keep_geom_type(geoms: np.ndarray, geom_type: str) -> GeoSeries:
+    """Make GeometryCollections into (Multi)Polygons, (Multi)LineStrings or (Multi)Points.
+
+    Because GeometryCollections might appear after dissolving (union_all),
+    and this makes shapely difference/intersection fail.
+
+    Args:
+        geoms: Array of geometries.
+        geom_type: Geometry type to be kept.
+    """
+    geoms = GeoSeries(geoms)
+    geoms.index = range(len(geoms))
+    geoms.loc[:] = make_valid(geoms.to_numpy())
+    geoms_with_correct_type = geoms.explode(index_parts=False).pipe(
+        to_single_geom_type, geom_type
+    )
+    only_one = geoms_with_correct_type.groupby(level=0).transform("size") == 1
+    one_hit = geoms_with_correct_type[only_one]
+    many_hits = geoms_with_correct_type[~only_one].groupby(level=0).agg(union_all)
+    geoms_with_wrong_type = geoms.loc[~geoms.index.isin(geoms_with_correct_type.index)]
+    return pd.concat([one_hit, many_hits, geoms_with_wrong_type]).sort_index()
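To make the intent of the new helper concrete, here is a hypothetical illustration (not part of the diff). The new module's filename is not shown above, so the import of make_valid_and_keep_geom_type is omitted; shapely 2.x and the "polygon" spelling accepted by to_single_geom_type are assumptions:

# Hypothetical illustration: after union_all, row 0 ends up as a GeometryCollection
# mixing a polygon and a line. make_valid_and_keep_geom_type keeps only the polygon
# parts of such rows, while rows without any polygon part are returned unchanged.
import numpy as np
from shapely import GeometryCollection, LineString, Point, Polygon

geoms = np.array(
    [
        GeometryCollection(
            [Polygon([(0, 0), (1, 0), (1, 1)]), LineString([(0, 0), (2, 2)])]
        ),
        Point(0, 0),
    ],
    dtype=object,
)

cleaned = make_valid_and_keep_geom_type(geoms, geom_type="polygon")
# cleaned.iloc[0] is the Polygon part only; cleaned.iloc[1] is still the Point.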
sgis/helpers.py CHANGED
@@ -198,7 +198,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
 
 
 def return_two_vals(
-    vals: tuple[str, str] | list[str] | str | int | float
+    vals: tuple[str, str] | list[str] | str | int | float,
 ) -> tuple[str | int | float, str | int | float]:
     """Return a two-length tuple from a str/int/float or list/tuple of length 1 or 2.
 
@@ -779,14 +779,12 @@ def _read_partitioned_parquet(
     if all(isinstance(x, DataFrame) for x in results):
         return pd.concat(results)
     else:
-        geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
-        return _arrow_to_geopandas(
-            pyarrow.concat_tables(
-                results,
-                promote_options="permissive",
-            ),
-            geo_metadata,
+        results = pyarrow.concat_tables(
+            results,
+            promote_options="permissive",
         )
+        geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
+        return _arrow_to_geopandas(results, geo_metadata)
 
     # add columns to empty DataFrame
     first_path = next(iter(child_paths + [path]))
sgis/maps/map.py CHANGED
@@ -307,7 +307,9 @@ class Map:
         notna = array[array.notna()]
         isna = array[array.isna()]
 
-        unique_multiplied = (notna * self._multiplier).astype(np.int64)
+        unique_multiplied = (notna.astype(np.float64) * self._multiplier).astype(
+            np.int64
+        )
 
         return pd.concat([unique_multiplied, isna]).sort_index()
 
sgis/parallel/parallel.py CHANGED
@@ -75,13 +75,15 @@ def parallel_overlay(
     Returns:
         A GeoDataFrame containing the result of the overlay operation.
     """
+    if how != "intersection":
+        raise ValueError("parallel_overlay only supports how='intersection'.")
     return pd.concat(
         chunkwise(
             _clean_overlay_with_print,
             df1,
             kwargs={
                 "df2": df2,
-                # "to_print": to_print,
+                "to_print": to_print,
                 "how": how,
             }
             | kwargs,
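A brief hypothetical note on the new guard (df1 and df2 are placeholder GeoDataFrames, not from the diff):

# parallel_overlay now fails fast instead of silently mishandling other overlay types.
intersected = parallel_overlay(df1, df2, how="intersection")  # supported
unioned = parallel_overlay(df1, df2, how="union")             # raises ValueError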
@@ -672,7 +674,7 @@ class Parallel:
     def chunkwise(
         self,
         func: Callable,
-        iterable: Collection[Iterable[Any]],
+        *iterables: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
         max_rows_per_chunk: int | None = None,
@@ -682,8 +684,8 @@ class Parallel:
         Args:
             func: Function to run chunkwise. It should take
                 (a chunk of) the iterable as first argument.
-            iterable: Iterable to split in chunks and passed
-                as first argument to 'func'.
+            iterables: Iterable(s) to split into chunks and pass
+                as first argument(s) to 'func'. The iterables must have the same length.
             args: Positional arguments in 'func' after the DataFrame.
             kwargs: Additional keyword arguments in 'func'.
             max_rows_per_chunk: Alternatively decide number of chunks
@@ -691,7 +693,7 @@ class Parallel:
         """
         return chunkwise(
             func,
-            iterable,
+            *iterables,
             args=args,
             kwargs=kwargs,
             processes=self.processes,
@@ -1067,7 +1069,7 @@ def _fix_missing_muni_numbers(
 
 def chunkwise(
     func: Callable,
-    iterable: Collection[Iterable[Any]],
+    *iterables: Collection[Iterable[Any]],
     args: tuple | None = None,
     kwargs: dict | None = None,
     processes: int = 1,
@@ -1082,7 +1084,7 @@ def chunkwise(
     Args:
         func: The function to apply to each chunk. This function must accept a DataFrame as
             its first argument and return a DataFrame.
-        iterable: Iterable to be chunked and processed.
+        iterables: Iterable(s) to be chunked and processed. Must all have the same length.
         args: Additional positional arguments to pass to 'func'.
         kwargs: Keyword arguments to pass to 'func'.
         processes: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
@@ -1096,30 +1098,36 @@ def chunkwise(
     args = args or ()
     kwargs = kwargs or {}
 
+    if len({len(x) for x in iterables}) not in [0, 1]:
+        raise ValueError(
+            f"iterables must have same length. Got {', '.join(str(len(x)) for x in iterables)}"
+        )
+
     if max_rows_per_chunk is None:
         n_chunks: int = processes
     else:
-        n_chunks: int = len(iterable) // max_rows_per_chunk
-
+        n_chunks: int = len(next(iter(iterables))) // max_rows_per_chunk
     if n_chunks <= 1:
-        return [func(iterable, *args, **kwargs)]
+        return [func(*iterables, *args, **kwargs)]
 
-    chunks = np.array_split(np.arange(len(iterable)), n_chunks)
+    chunks = np.array_split(np.arange(len(next(iter(iterables)))), n_chunks)
 
-    if hasattr(iterable, "iloc"):
-        iterable_chunked: list[pd.DataFrame | pd.Series] = [
-            iterable.iloc[chunk] for chunk in chunks
-        ]
-    elif is_array_like(iterable):
-        iterable_chunked: list[np.ndarray] = [iterable[chunk] for chunk in chunks]
-    else:
-        to_type: type = iterable.__class__
-        iterable_chunked: list[Iterable] = [
-            to_type(chunk) for chunk in np.array_split(list(iterable), n_chunks)
-        ]
-    return Parallel(processes, backend=backend).map(
+    def get_chunk(iterable, chunk):
+        if hasattr(iterable, "iloc"):
+            return iterable.iloc[chunk]
+        elif is_array_like(iterable):
+            return iterable[chunk]
+        else:
+            to_type: type = iterable.__class__
+            return to_type([x for i, x in enumerate(iterable) if i in chunk])
+
+    iterables_chunked: list[list[Iterable[Any]]] = [
+        [get_chunk(iterable, chunk) for iterable in iterables] for chunk in chunks
+    ]
+
+    return Parallel(processes, backend=backend).starmap(
         func,
-        iterable_chunked,
+        iterables_chunked,
         args=args,
         kwargs=kwargs,
     )
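A hypothetical sketch (not part of the diff) of what the new *iterables signature allows. The import path and the example function are assumptions; chunkwise's docstring above documents the real contract:

# Hypothetical sketch: chunkwise now splits several equal-length iterables and
# passes one chunk of each as leading positional arguments to 'func' (via starmap).
import pandas as pd

from sgis.parallel.parallel import chunkwise  # assumed import path


def add_columns(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Called once per chunk pair; rows line up because both frames are
    # split with the same positional chunks.
    out = left.copy()
    out["b"] = right["b"].to_numpy()
    return out


df1 = pd.DataFrame({"a": range(100)})
df2 = pd.DataFrame({"b": range(100)})

results = chunkwise(add_columns, df1, df2, processes=4)
combined = pd.concat(results)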