ssb-sgis 1.1.16__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,34 +9,29 @@ version of the solution from GH 2792.
9
9
  """
10
10
 
11
11
  import functools
12
- from collections.abc import Callable
13
12
 
14
13
  import geopandas as gpd
15
- import joblib
16
14
  import numpy as np
17
15
  import pandas as pd
18
16
  from geopandas import GeoDataFrame
19
- from geopandas import GeoSeries
20
17
  from pandas import DataFrame
21
18
  from shapely import Geometry
22
- from shapely import STRtree
23
19
  from shapely import box
24
20
  from shapely import difference
25
21
  from shapely import intersection
22
+ from shapely import is_empty
26
23
  from shapely import make_valid
27
- from shapely import unary_union
28
- from shapely.errors import GEOSException
29
-
30
- try:
31
- import dask.array as da
32
- except ImportError:
33
- pass
24
+ from shapely import union_all
34
25
 
26
+ from ..conf import config
35
27
  from .general import _determine_geom_type_args
36
28
  from .general import clean_geoms
37
29
  from .geometry_types import get_geom_type
38
30
  from .geometry_types import make_all_singlepart
39
31
  from .geometry_types import to_single_geom_type
32
+ from .runners import OverlayRunner
33
+ from .runners import RTreeQueryRunner
34
+ from .runners import UnionRunner
40
35
 
41
36
  DEFAULT_GRID_SIZE = None
42
37
  DEFAULT_LSUFFIX = "_1"
@@ -51,9 +46,12 @@ def clean_overlay(
51
46
  geom_type: str | None = None,
52
47
  predicate: str | None = "intersects",
53
48
  grid_size: float | None = None,
54
- n_jobs: int = 1,
55
49
  lsuffix: str = DEFAULT_LSUFFIX,
56
50
  rsuffix: str = DEFAULT_RSUFFIX,
51
+ n_jobs: int = 1,
52
+ rtree_runner: RTreeQueryRunner | None = None,
53
+ union_runner: UnionRunner | None = None,
54
+ overlay_runner: OverlayRunner | None = None,
57
55
  ) -> GeoDataFrame:
58
56
  """Fixes and explodes geometries before doing a shapely overlay, then cleans up.
59
57
 
@@ -74,10 +72,16 @@ def clean_overlay(
74
72
  "point".
75
73
  grid_size: Precision grid size to round the geometries. Will use the highest
76
74
  precision of the inputs by default.
77
- n_jobs: number of threads.
78
75
  predicate: Spatial predicate in the spatial tree.
79
76
  lsuffix: Suffix of columns in df1 that are also in df2.
80
77
  rsuffix: Suffix of columns in df2 that are also in df1.
78
+ n_jobs: number of jobs. Defaults to 1.
79
+ union_runner: Optionally debug/manipulate the spatial union operations.
80
+ See the 'runners' module for example implementations.
81
+ rtree_runner: Optionally debug/manipulate the spatial indexing operations.
82
+ See the 'runners' module for example implementations.
83
+ overlay_runner: Optionally debug/manipulate the spatial overlay operations.
84
+ See the 'runners' module for example implementations.
81
85
 
82
86
  Returns:
83
87
  GeoDataFrame with overlayed and fixed geometries and columns from both
@@ -104,6 +108,13 @@ def clean_overlay(
104
108
  if df1.crs != df2.crs:
105
109
  raise ValueError(f"'crs' mismatch. Got {df1.crs} and {df2.crs}")
106
110
 
111
+ if rtree_runner is None:
112
+ rtree_runner = config.get_instance("rtree_runner", n_jobs)
113
+ if union_runner is None:
114
+ union_runner = config.get_instance("union_runner", n_jobs)
115
+ if overlay_runner is None:
116
+ overlay_runner = config.get_instance("overlay_runner", n_jobs)
117
+
107
118
  crs = df1.crs
108
119
 
109
120
  # original_geom_type = geom_type
@@ -119,6 +130,11 @@ def clean_overlay(
119
130
  df1.geometry.geom_type.value_counts(),
120
131
  )
121
132
 
133
+ if geom_type == "polygon" or get_geom_type(df1) == "polygon":
134
+ df1.geometry = df1.buffer(0)
135
+ if geom_type == "polygon" or get_geom_type(df2) == "polygon":
136
+ df2.geometry = df2.buffer(0)
137
+
122
138
  df1 = clean_geoms(df1)
123
139
  df2 = clean_geoms(df2)
124
140
 
@@ -131,21 +147,28 @@ def clean_overlay(
131
147
  if geom_type and get_geom_type(df1) == get_geom_type(df2):
132
148
  df2 = to_single_geom_type(df2, geom_type)
133
149
 
134
- assert df1.is_valid.all(), df1.is_valid.value_counts()
135
- assert df2.is_valid.all(), df2.is_valid.value_counts()
136
- assert df1.geometry.notna().all()
137
- assert df2.geometry.notna().all()
150
+ assert df1.is_valid.all(), [
151
+ geom.wkt for geom in df1[lambda x: x.is_valid == False].geometry
152
+ ]
153
+ assert df2.is_valid.all(), [
154
+ geom.wkt for geom in df2[lambda x: x.is_valid == False].geometry
155
+ ]
156
+ assert df1.geometry.notna().all(), df1[lambda x: x.isna()]
157
+ assert df2.geometry.notna().all(), df2[lambda x: x.isna()]
138
158
 
139
159
  box1 = box(*df1.total_bounds)
140
160
  box2 = box(*df2.total_bounds)
141
161
 
142
- if not len(df1) or not len(df1) or not box1.intersects(box2):
162
+ if not grid_size and (
163
+ (not len(df1) or not len(df2))
164
+ or (not box1.intersects(box2) and how == "intersection")
165
+ ):
143
166
  return _no_intersections_return(df1, df2, how, lsuffix, rsuffix)
144
167
 
145
- if df1._geometry_column_name != "geometry":
168
+ if df1.geometry.name != "geometry":
146
169
  df1 = df1.rename_geometry("geometry")
147
170
 
148
- if df2._geometry_column_name != "geometry":
171
+ if df2.geometry.name != "geometry":
149
172
  df2 = df2.rename_geometry("geometry")
150
173
 
151
174
  # to pandas because GeoDataFrame constructor is expensive
@@ -162,8 +185,10 @@ def clean_overlay(
162
185
  lsuffix=lsuffix,
163
186
  rsuffix=rsuffix,
164
187
  geom_type=geom_type,
165
- n_jobs=n_jobs,
166
188
  predicate=predicate,
189
+ rtree_runner=rtree_runner,
190
+ overlay_runner=overlay_runner,
191
+ union_runner=union_runner,
167
192
  ),
168
193
  geometry="geometry",
169
194
  crs=crs,
@@ -179,9 +204,9 @@ def clean_overlay(
179
204
 
180
205
 
181
206
  def _join_and_get_no_rows(df1, df2, lsuffix, rsuffix):
182
- geom_col = df1._geometry_column_name
207
+ geom_col = df1.geometry.name
183
208
  df1_cols = df1.columns.difference({geom_col})
184
- df2_cols = df2.columns.difference({df2._geometry_column_name})
209
+ df2_cols = df2.columns.difference({df2.geometry.name})
185
210
  cols_with_suffix = [
186
211
  f"{col}{lsuffix}" if col in df2_cols else col for col in df1_cols
187
212
  ] + [f"{col}{rsuffix}" if col in df1_cols else col for col in df2_cols]
@@ -206,7 +231,7 @@ def _no_intersections_return(
206
231
  if how == "identity":
207
232
  # add suffixes and return df1
208
233
  df_template = _join_and_get_no_rows(df1, df2, lsuffix, rsuffix)
209
- df2_cols = df2.columns.difference({df2._geometry_column_name})
234
+ df2_cols = df2.columns.difference({df2.geometry.name})
210
235
  df1.columns = [f"{col}{lsuffix}" if col in df2_cols else col for col in df1]
211
236
  return pd.concat([df_template, df1], ignore_index=True)
212
237
 
@@ -228,33 +253,41 @@ def _shapely_pd_overlay(
228
253
  df1: DataFrame,
229
254
  df2: DataFrame,
230
255
  how: str,
231
- grid_size: float = DEFAULT_GRID_SIZE,
232
- predicate: str = "intersects",
233
- lsuffix: str = DEFAULT_LSUFFIX,
234
- rsuffix: str = DEFAULT_RSUFFIX,
235
- geom_type: str | None = None,
236
- n_jobs: int = 1,
256
+ grid_size: float,
257
+ predicate: str,
258
+ lsuffix: str,
259
+ rsuffix: str,
260
+ geom_type: str | None,
261
+ rtree_runner: RTreeQueryRunner,
262
+ overlay_runner: OverlayRunner,
263
+ union_runner: UnionRunner,
237
264
  ) -> DataFrame:
238
- if not grid_size and not len(df1) or not len(df2):
239
- return _no_intersections_return(df1, df2, how, lsuffix, rsuffix)
240
-
241
- tree = STRtree(df2.geometry.values)
242
- left, right = tree.query(df1.geometry.values, predicate=predicate)
243
-
265
+ left, right = rtree_runner.run(
266
+ df1.geometry.values, df2.geometry.values, predicate=predicate
267
+ )
244
268
  pairs = _get_intersects_pairs(df1, df2, left, right, rsuffix)
245
- assert pairs.geometry.notna().all(), pairs.geometry
246
- assert pairs.geom_right.notna().all(), pairs.geom_right
269
+ assert pairs["geometry"].notna().all(), pairs.geometry[lambda x: x.isna()]
270
+ assert pairs["geom_right"].notna().all(), pairs.geom_right[lambda x: x.isna()]
247
271
 
248
272
  if how == "intersection":
249
273
  overlayed = [
250
274
  _intersection(
251
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
275
+ pairs,
276
+ grid_size=grid_size,
277
+ geom_type=geom_type,
278
+ overlay_runner=overlay_runner,
252
279
  )
253
280
  ]
254
281
 
255
282
  elif how == "difference":
256
283
  overlayed = _difference(
257
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
284
+ pairs,
285
+ df1,
286
+ left,
287
+ grid_size=grid_size,
288
+ geom_type=geom_type,
289
+ overlay_runner=overlay_runner,
290
+ union_runner=union_runner,
258
291
  )
259
292
 
260
293
  elif how == "symmetric_difference":
@@ -267,12 +300,19 @@ def _shapely_pd_overlay(
267
300
  grid_size=grid_size,
268
301
  rsuffix=rsuffix,
269
302
  geom_type=geom_type,
270
- n_jobs=n_jobs,
303
+ overlay_runner=overlay_runner,
304
+ union_runner=union_runner,
271
305
  )
272
306
 
273
307
  elif how == "identity":
274
308
  overlayed = _identity(
275
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
309
+ pairs,
310
+ df1,
311
+ left,
312
+ grid_size=grid_size,
313
+ geom_type=geom_type,
314
+ overlay_runner=overlay_runner,
315
+ union_runner=union_runner,
276
316
  )
277
317
 
278
318
  elif how == "union":
@@ -285,7 +325,8 @@ def _shapely_pd_overlay(
285
325
  grid_size=grid_size,
286
326
  rsuffix=rsuffix,
287
327
  geom_type=geom_type,
288
- n_jobs=n_jobs,
328
+ overlay_runner=overlay_runner,
329
+ union_runner=union_runner,
289
330
  )
290
331
 
291
332
  elif how == "update":
@@ -295,8 +336,9 @@ def _shapely_pd_overlay(
295
336
  df2,
296
337
  left=left,
297
338
  grid_size=grid_size,
298
- n_jobs=n_jobs,
299
339
  geom_type=geom_type,
340
+ overlay_runner=overlay_runner,
341
+ union_runner=union_runner,
300
342
  )
301
343
 
302
344
  assert isinstance(overlayed, list)
@@ -314,8 +356,9 @@ def _shapely_pd_overlay(
314
356
  overlayed = _add_suffix_left(overlayed, df1, df2, lsuffix)
315
357
 
316
358
  overlayed["geometry"] = make_valid(overlayed["geometry"])
317
- # None and empty are falsy
318
- overlayed = overlayed.loc[lambda x: x["geometry"].notna()]
359
+ overlayed = overlayed.loc[
360
+ lambda x: (x["geometry"].notna().values) & (~is_empty(x["geometry"].values))
361
+ ]
319
362
 
320
363
  return overlayed
321
364
 
@@ -327,115 +370,38 @@ def _update(
327
370
  left: np.ndarray,
328
371
  grid_size: float | None | int,
329
372
  geom_type: str | None,
330
- n_jobs: int,
373
+ overlay_runner: OverlayRunner,
374
+ union_runner: UnionRunner,
331
375
  ) -> GeoDataFrame:
332
376
  overlayed = _difference(
333
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
377
+ pairs,
378
+ df1,
379
+ left,
380
+ grid_size=grid_size,
381
+ geom_type=geom_type,
382
+ overlay_runner=overlay_runner,
383
+ union_runner=union_runner,
334
384
  )
335
385
 
336
386
  return overlayed + [df2]
337
387
 
338
388
 
339
- def _run_overlay_dask(
340
- arr1: np.ndarray,
341
- arr2: np.ndarray,
342
- func: Callable,
343
- n_jobs: int,
344
- grid_size: float | int | None,
345
- ) -> np.ndarray:
346
- if len(arr1) // n_jobs <= 1:
347
- try:
348
- return func(arr1, arr2, grid_size=grid_size)
349
- except TypeError as e:
350
- raise TypeError(e, {type(x) for x in arr1}, {type(x) for x in arr2}) from e
351
- arr1 = da.from_array(arr1, chunks=len(arr1) // n_jobs)
352
- arr2 = da.from_array(arr2, chunks=len(arr2) // n_jobs)
353
- res = arr1.map_blocks(func, arr2, grid_size=grid_size, dtype=float)
354
- return res.compute(scheduler="threads", optimize_graph=False, num_workers=n_jobs)
355
-
356
-
357
- def _run_overlay_joblib_threading(
358
- arr1: np.ndarray,
359
- arr2: np.ndarray,
360
- func: Callable,
361
- n_jobs: int,
362
- grid_size: int | float | None,
363
- ) -> list[Geometry]:
364
- if len(arr1) // n_jobs <= 1:
365
- try:
366
- return func(arr1, arr2, grid_size=grid_size)
367
- except TypeError as e:
368
- raise TypeError(e, {type(x) for x in arr1}, {type(x) for x in arr2}) from e
369
- with joblib.Parallel(n_jobs=n_jobs, backend="threading") as parallel:
370
- return parallel(
371
- joblib.delayed(func)(g1, g2, grid_size=grid_size)
372
- for g1, g2 in zip(arr1, arr2, strict=True)
373
- )
374
-
375
-
376
389
  def _intersection(
377
390
  pairs: pd.DataFrame,
378
391
  grid_size: None | float | int,
379
392
  geom_type: str | None,
380
- n_jobs: int = 1,
393
+ overlay_runner: OverlayRunner,
381
394
  ) -> GeoDataFrame:
382
395
  if not len(pairs):
383
396
  return pairs.drop(columns="geom_right")
384
-
385
397
  intersections = pairs.copy()
386
-
387
- arr1 = intersections["geometry"].to_numpy()
388
- arr2 = intersections["geom_right"].to_numpy()
389
-
390
- if n_jobs > 1 and len(arr1) / n_jobs > 10:
391
- try:
392
- res = _run_overlay_joblib_threading(
393
- arr1,
394
- arr2,
395
- func=intersection,
396
- n_jobs=n_jobs,
397
- grid_size=grid_size,
398
- )
399
- except GEOSException:
400
- arr1 = make_valid_and_keep_geom_type(
401
- arr1, geom_type=geom_type, n_jobs=n_jobs
402
- )
403
- arr2 = make_valid_and_keep_geom_type(
404
- arr2, geom_type=geom_type, n_jobs=n_jobs
405
- )
406
- arr1 = arr1.loc[lambda x: x.index.isin(arr2.index)]
407
- arr2 = arr2.loc[lambda x: x.index.isin(arr1.index)]
408
-
409
- res = _run_overlay_joblib_threading(
410
- arr1.to_numpy(),
411
- arr2.to_numpy(),
412
- func=intersection,
413
- n_jobs=n_jobs,
414
- grid_size=grid_size,
415
- )
416
- intersections["geometry"] = res
417
- return intersections.drop(columns="geom_right")
418
-
419
- try:
420
- intersections["geometry"] = intersection(
421
- intersections["geometry"].to_numpy(),
422
- intersections["geom_right"].to_numpy(),
423
- grid_size=grid_size,
424
- )
425
- except GEOSException:
426
- left = make_valid_and_keep_geom_type(
427
- intersections["geometry"].to_numpy(), geom_type, n_jobs=n_jobs
428
- )
429
- right = make_valid_and_keep_geom_type(
430
- intersections["geom_right"].to_numpy(), geom_type, n_jobs=n_jobs
431
- )
432
- left = left.loc[lambda x: x.index.isin(right.index)]
433
- right = right.loc[lambda x: x.index.isin(left.index)]
434
-
435
- intersections["geometry"] = intersection(
436
- left.to_numpy(), right.to_numpy(), grid_size=grid_size
437
- )
438
-
398
+ intersections["geometry"] = overlay_runner.run(
399
+ intersection,
400
+ intersections["geometry"].to_numpy(),
401
+ intersections["geom_right"].to_numpy(),
402
+ grid_size=grid_size,
403
+ geom_type=geom_type,
404
+ )
439
405
  return intersections.drop(columns="geom_right")
440
406
 
441
407
 
@@ -448,12 +414,16 @@ def _union(
448
414
  grid_size: int | float | None,
449
415
  rsuffix: str,
450
416
  geom_type: str | None,
451
- n_jobs: int = 1,
417
+ overlay_runner: OverlayRunner,
418
+ union_runner: UnionRunner,
452
419
  ) -> list[GeoDataFrame]:
453
420
  merged = []
454
421
  if len(left):
455
422
  intersections = _intersection(
456
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
423
+ pairs,
424
+ grid_size=grid_size,
425
+ geom_type=geom_type,
426
+ overlay_runner=overlay_runner,
457
427
  )
458
428
  merged.append(intersections)
459
429
  symmdiff = _symmetric_difference(
@@ -465,7 +435,8 @@ def _union(
465
435
  grid_size=grid_size,
466
436
  rsuffix=rsuffix,
467
437
  geom_type=geom_type,
468
- n_jobs=n_jobs,
438
+ overlay_runner=overlay_runner,
439
+ union_runner=union_runner,
469
440
  )
470
441
  merged += symmdiff
471
442
  return merged
@@ -477,15 +448,27 @@ def _identity(
477
448
  left: np.ndarray,
478
449
  grid_size: int | float | None,
479
450
  geom_type: str | None,
480
- n_jobs: int = 1,
451
+ overlay_runner: OverlayRunner,
452
+ union_runner: UnionRunner,
481
453
  ) -> list[GeoDataFrame]:
482
454
  merged = []
483
455
  if len(left):
484
456
  intersections = _intersection(
485
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
457
+ pairs,
458
+ grid_size=grid_size,
459
+ geom_type=geom_type,
460
+ overlay_runner=overlay_runner,
486
461
  )
487
462
  merged.append(intersections)
488
- diff = _difference(pairs, df1, left, grid_size=grid_size, n_jobs=n_jobs)
463
+ diff = _difference(
464
+ pairs,
465
+ df1,
466
+ left,
467
+ geom_type=geom_type,
468
+ grid_size=grid_size,
469
+ overlay_runner=overlay_runner,
470
+ union_runner=union_runner,
471
+ )
489
472
  merged += diff
490
473
  return merged
491
474
 
@@ -499,12 +482,19 @@ def _symmetric_difference(
499
482
  grid_size: int | float | None,
500
483
  rsuffix: str,
501
484
  geom_type: str | None,
502
- n_jobs: int = 1,
485
+ overlay_runner: OverlayRunner,
486
+ union_runner: UnionRunner,
503
487
  ) -> list[GeoDataFrame]:
504
488
  merged = []
505
489
 
506
490
  difference_left = _difference(
507
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
491
+ pairs,
492
+ df1,
493
+ left,
494
+ grid_size=grid_size,
495
+ geom_type=geom_type,
496
+ overlay_runner=overlay_runner,
497
+ union_runner=union_runner,
508
498
  )
509
499
  merged += difference_left
510
500
 
@@ -516,7 +506,8 @@ def _symmetric_difference(
516
506
  grid_size=grid_size,
517
507
  rsuffix=rsuffix,
518
508
  geom_type=geom_type,
519
- n_jobs=n_jobs,
509
+ overlay_runner=overlay_runner,
510
+ union_runner=union_runner,
520
511
  )
521
512
  merged.append(clip_right)
522
513
 
@@ -530,9 +521,10 @@ def _difference(
530
521
  pairs: pd.DataFrame,
531
522
  df1: pd.DataFrame,
532
523
  left: np.ndarray,
533
- grid_size: int | float | None = None,
534
- geom_type: str | None = None,
535
- n_jobs: int = 1,
524
+ grid_size: int | float | None,
525
+ geom_type: str | None,
526
+ overlay_runner: OverlayRunner,
527
+ union_runner: UnionRunner,
536
528
  ) -> list[GeoDataFrame]:
537
529
  merged = []
538
530
  if len(left):
@@ -541,7 +533,8 @@ def _difference(
541
533
  df1=df1,
542
534
  grid_size=grid_size,
543
535
  geom_type=geom_type,
544
- n_jobs=n_jobs,
536
+ overlay_runner=overlay_runner,
537
+ union_runner=union_runner,
545
538
  )
546
539
  merged.append(clip_left)
547
540
  diff_left = _add_indices_from_left(df1, left)
@@ -609,7 +602,8 @@ def _shapely_diffclip_left(
609
602
  df1: pd.DataFrame,
610
603
  grid_size: int | float | None,
611
604
  geom_type: str | None,
612
- n_jobs: int,
605
+ overlay_runner: OverlayRunner,
606
+ union_runner: UnionRunner,
613
607
  ) -> pd.DataFrame:
614
608
  """Aggregate areas in right by unique values from left, then erases those from left."""
615
609
  keep_cols = list(df1.columns.difference({"_overlay_index_right"})) + ["geom_right"]
@@ -666,12 +660,14 @@ def _shapely_diffclip_left(
666
660
  }
667
661
  )
668
662
 
669
- agged = pd.Series(
670
- {
671
- i: agg_geoms_partial(geoms)
672
- for i, geoms in agger.groupby(level=0)["geom_right"]
673
- }
674
- )
663
+ agged = union_runner.run(agger["geom_right"], level=0)
664
+ # agged = pd.Series(
665
+
666
+ # {
667
+ # i: agg_geoms_partial(geoms)
668
+ # for i, geoms in agger.groupby(level=0)["geom_right"]
669
+ # }
670
+ # )
675
671
  many_hits_agged["geom_right"] = inverse_index_mapper.map(agged)
676
672
  many_hits_agged = many_hits_agged.drop(columns=["_right_indices"])
677
673
 
@@ -679,15 +675,19 @@ def _shapely_diffclip_left(
679
675
  except IndexError:
680
676
  clip_left = pairs.loc[:, list(keep_cols)]
681
677
 
682
- assert clip_left["geometry"].notna().all()
683
- assert clip_left["geom_right"].notna().all()
678
+ assert clip_left["geometry"].notna().all(), clip_left["geometry"][
679
+ lambda x: x.isna()
680
+ ]
681
+ assert clip_left["geom_right"].notna().all(), clip_left["geom_right"][
682
+ lambda x: x.isna()
683
+ ]
684
684
 
685
- clip_left["geometry"] = _try_difference(
685
+ clip_left["geometry"] = overlay_runner.run(
686
+ difference,
686
687
  clip_left["geometry"].to_numpy(),
687
688
  clip_left["geom_right"].to_numpy(),
688
689
  grid_size=grid_size,
689
690
  geom_type=geom_type,
690
- n_jobs=n_jobs,
691
691
  )
692
692
 
693
693
  return clip_left.drop(columns="geom_right")
@@ -700,7 +700,8 @@ def _shapely_diffclip_right(
700
700
  grid_size: int | float | None,
701
701
  rsuffix: str,
702
702
  geom_type: str | None,
703
- n_jobs: int,
703
+ overlay_runner: OverlayRunner,
704
+ union_runner: UnionRunner,
704
705
  ) -> pd.DataFrame:
705
706
  agg_geoms_partial = functools.partial(_agg_geoms, grid_size=grid_size)
706
707
 
@@ -711,16 +712,22 @@ def _shapely_diffclip_right(
711
712
  one_hit = pairs[only_one].set_index("_overlay_index_right")[
712
713
  ["geom_left", "geometry"]
713
714
  ]
714
- many_hits = (
715
- pairs[~only_one]
716
- .groupby("_overlay_index_right")
717
- .agg(
718
- {
719
- "geom_left": agg_geoms_partial,
720
- "geometry": "first",
721
- }
722
- )
715
+ many_hits_ungrouped = pairs[~only_one].set_index("_overlay_index_right")
716
+ many_hits = pd.DataFrame(index=many_hits_ungrouped.index.unique())
717
+ many_hits["geometry"] = many_hits_ungrouped.groupby(level=0)["geometry"].first()
718
+ many_hits["geom_left"] = union_runner.run(
719
+ many_hits_ungrouped["geom_left"], level=0
723
720
  )
721
+ # many_hits = (
722
+ # pairs[~only_one]
723
+ # .groupby("_overlay_index_right")
724
+ # .agg(
725
+ # {
726
+ # "geom_left": agg_geoms_partial,
727
+ # "geometry": "first",
728
+ # }
729
+ # )
730
+ # )
724
731
  clip_right = (
725
732
  pd.concat([one_hit, many_hits])
726
733
  .join(df2.drop(columns=["geometry"]))
@@ -739,10 +746,15 @@ def _shapely_diffclip_right(
739
746
  }
740
747
  )
741
748
 
742
- assert clip_right["geometry"].notna().all()
743
- assert clip_right["geom_left"].notna().all()
749
+ assert clip_right["geometry"].notna().all(), clip_right["geometry"][
750
+ lambda x: x.isna()
751
+ ]
752
+ assert clip_right["geom_left"].notna().all(), clip_right["geom_left"][
753
+ lambda x: x.isna()
754
+ ]
744
755
 
745
- clip_right["geometry"] = _try_difference(
756
+ clip_right["geometry"] = overlay_runner.run(
757
+ difference,
746
758
  clip_right["geometry"].to_numpy(),
747
759
  clip_right["geom_left"].to_numpy(),
748
760
  grid_size=grid_size,
@@ -752,87 +764,5 @@ def _shapely_diffclip_right(
752
764
  return clip_right.drop(columns="geom_left")
753
765
 
754
766
 
755
- def _try_difference(
756
- left: np.ndarray,
757
- right: np.ndarray,
758
- grid_size: int | float | None,
759
- geom_type: str | None,
760
- n_jobs: int = 1,
761
- ) -> np.ndarray:
762
- """Try difference overlay, then make_valid and retry."""
763
- if n_jobs > 1 and len(left) / n_jobs > 10:
764
- try:
765
- return _run_overlay_joblib_threading(
766
- left,
767
- right,
768
- func=difference,
769
- n_jobs=n_jobs,
770
- grid_size=grid_size,
771
- )
772
- except GEOSException:
773
- left = make_valid_and_keep_geom_type(
774
- left, geom_type=geom_type, n_jobs=n_jobs
775
- )
776
- right = make_valid_and_keep_geom_type(
777
- right, geom_type=geom_type, n_jobs=n_jobs
778
- )
779
- left = left.loc[lambda x: x.index.isin(right.index)]
780
- right = right.loc[lambda x: x.index.isin(left.index)]
781
-
782
- return _run_overlay_joblib_threading(
783
- left.to_numpy(),
784
- right.to_numpy(),
785
- func=difference,
786
- n_jobs=n_jobs,
787
- grid_size=grid_size,
788
- )
789
-
790
- try:
791
- return difference(
792
- left,
793
- right,
794
- grid_size=grid_size,
795
- )
796
- except GEOSException:
797
- left = make_valid_and_keep_geom_type(left, geom_type, n_jobs=n_jobs)
798
- right = make_valid_and_keep_geom_type(right, geom_type, n_jobs=n_jobs)
799
- left = left.loc[lambda x: x.index.isin(right.index)]
800
- right = right.loc[lambda x: x.index.isin(left.index)]
801
- try:
802
- return difference(
803
- left.to_numpy(),
804
- right.to_numpy(),
805
- grid_size=grid_size,
806
- )
807
- except GEOSException as e:
808
- raise e.__class__(e, f"{grid_size=}", f"{left=}", f"{right=}") from e
809
-
810
-
811
- def make_valid_and_keep_geom_type(
812
- geoms: np.ndarray, geom_type: str, n_jobs: int
813
- ) -> GeoSeries:
814
- """Make GeometryCollections into (Multi)Polygons, (Multi)LineStrings or (Multi)Points.
815
-
816
- Because GeometryCollections might appear after dissolving (unary_union).
817
- And this makes shapely difference/intersection fail.
818
-
819
- Args:
820
- geoms: Array of geometries.
821
- geom_type: geometry type to be kept.
822
- n_jobs: Number of treads.
823
- """
824
- geoms = GeoSeries(geoms)
825
- geoms.index = range(len(geoms))
826
- geoms.loc[:] = make_valid(geoms.to_numpy())
827
- geoms_with_correct_type = geoms.explode(index_parts=False).pipe(
828
- to_single_geom_type, geom_type
829
- )
830
- only_one = geoms_with_correct_type.groupby(level=0).transform("size") == 1
831
- one_hit = geoms_with_correct_type[only_one]
832
- many_hits = geoms_with_correct_type[~only_one].groupby(level=0).agg(unary_union)
833
- geoms_with_wrong_type = geoms.loc[~geoms.index.isin(geoms_with_correct_type.index)]
834
- return pd.concat([one_hit, many_hits, geoms_with_wrong_type]).sort_index()
835
-
836
-
837
767
  def _agg_geoms(g: np.ndarray, grid_size: int | float | None = None) -> Geometry:
838
- return make_valid(unary_union(g, grid_size=grid_size))
768
+ return make_valid(union_all(g, grid_size=grid_size))