ssb-sgis 1.1.17__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,34 +9,30 @@ version of the solution from GH 2792.
  """
 
  import functools
- from collections.abc import Callable
 
  import geopandas as gpd
- import joblib
  import numpy as np
  import pandas as pd
  from geopandas import GeoDataFrame
- from geopandas import GeoSeries
  from pandas import DataFrame
  from shapely import Geometry
- from shapely import STRtree
  from shapely import box
  from shapely import difference
  from shapely import intersection
+ from shapely import is_empty
  from shapely import make_valid
- from shapely import unary_union
- from shapely.errors import GEOSException
-
- try:
- import dask.array as da
- except ImportError:
- pass
+ from shapely import union_all
 
+ from ..conf import _get_instance
+ from ..conf import config
  from .general import _determine_geom_type_args
  from .general import clean_geoms
  from .geometry_types import get_geom_type
  from .geometry_types import make_all_singlepart
  from .geometry_types import to_single_geom_type
+ from .runners import OverlayRunner
+ from .runners import RTreeQueryRunner
+ from .runners import UnionRunner
 
  DEFAULT_GRID_SIZE = None
  DEFAULT_LSUFFIX = "_1"
@@ -51,9 +47,12 @@ def clean_overlay(
  geom_type: str | None = None,
  predicate: str | None = "intersects",
  grid_size: float | None = None,
- n_jobs: int = 1,
  lsuffix: str = DEFAULT_LSUFFIX,
  rsuffix: str = DEFAULT_RSUFFIX,
+ n_jobs: int = 1,
+ rtree_runner: RTreeQueryRunner | None = None,
+ union_runner: UnionRunner | None = None,
+ overlay_runner: OverlayRunner | None = None,
  ) -> GeoDataFrame:
  """Fixes and explodes geometries before doing a shapely overlay, then cleans up.
 
@@ -74,10 +73,16 @@ def clean_overlay(
  "point".
  grid_size: Precision grid size to round the geometries. Will use the highest
  precision of the inputs by default.
- n_jobs: number of threads.
  predicate: Spatial predicate in the spatial tree.
  lsuffix: Suffix of columns in df1 that are also in df2.
  rsuffix: Suffix of columns in df2 that are also in df1.
+ n_jobs: number of jobs. Defaults to 1.
+ union_runner: Optionally debug/manipulate the spatial union operations.
+ See the 'runners' module for example implementations.
+ rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+ See the 'runners' module for example implementations.
+ overlay_runner: Optionally debug/manipulate the spatial overlay operations.
+ See the 'runners' module for example implementations.
 
  Returns:
  GeoDataFrame with overlayed and fixed geometries and columns from both
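The three new runner keywords documented above are optional hooks around the index query, union and overlay steps. A minimal usage sketch, assuming the package is importable as sgis and that the runner classes accept an n_jobs keyword; only the class names and the clean_overlay keywords appear in this diff, so the import path of the runners module and the constructor calls are assumptions:

from geopandas import GeoDataFrame
from shapely.geometry import box

import sgis as sg
from sgis.geopandas_tools.runners import (  # hypothetical module path
    OverlayRunner,
    RTreeQueryRunner,
    UnionRunner,
)

df1 = GeoDataFrame({"a": [1]}, geometry=[box(0, 0, 2, 2)], crs=25833)
df2 = GeoDataFrame({"b": [1]}, geometry=[box(1, 1, 3, 3)], crs=25833)

# Default behaviour: runners are resolved from the package config.
res = sg.clean_overlay(df1, df2, how="intersection")

# Passing runners explicitly, e.g. to control parallelism or add logging.
# The constructor arguments below are assumptions for illustration.
res = sg.clean_overlay(
    df1,
    df2,
    how="intersection",
    n_jobs=2,
    rtree_runner=RTreeQueryRunner(n_jobs=2),
    union_runner=UnionRunner(n_jobs=2),
    overlay_runner=OverlayRunner(),
)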
@@ -104,6 +109,13 @@ def clean_overlay(
  if df1.crs != df2.crs:
  raise ValueError(f"'crs' mismatch. Got {df1.crs} and {df2.crs}")
 
+ if rtree_runner is None:
+ rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
+ if union_runner is None:
+ union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
+ if overlay_runner is None:
+ overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
+
  crs = df1.crs
 
  # original_geom_type = geom_type
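The conf module itself is not part of this diff, so the exact behaviour of _get_instance and config is not shown. A plausible reading of the calls above is a small registry lookup that instantiates a configured class on demand; the sketch below is an assumption meant only to illustrate that pattern, not the package's actual implementation:

from typing import Any


class DummyRunner:
    """Stand-in for RTreeQueryRunner / UnionRunner / OverlayRunner."""

    def __init__(self, n_jobs: int = 1) -> None:
        self.n_jobs = n_jobs


def _get_instance(config: dict[str, Any], key: str, **kwargs: Any) -> Any:
    """Return config[key] if it is already an instance, else instantiate it with kwargs."""
    value = config[key]
    return value(**kwargs) if isinstance(value, type) else value


config = {"rtree_runner": DummyRunner}
runner = _get_instance(config, "rtree_runner", n_jobs=4)
assert isinstance(runner, DummyRunner) and runner.n_jobs == 4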
@@ -148,13 +160,16 @@ def clean_overlay(
  box1 = box(*df1.total_bounds)
  box2 = box(*df2.total_bounds)
 
- if not len(df1) or not len(df1) or not box1.intersects(box2):
+ if not grid_size and (
+ (not len(df1) or not len(df2))
+ or (not box1.intersects(box2) and how == "intersection")
+ ):
  return _no_intersections_return(df1, df2, how, lsuffix, rsuffix)
 
- if df1._geometry_column_name != "geometry":
+ if df1.geometry.name != "geometry":
  df1 = df1.rename_geometry("geometry")
 
- if df2._geometry_column_name != "geometry":
+ if df2.geometry.name != "geometry":
  df2 = df2.rename_geometry("geometry")
 
  # to pandas because GeoDataFrame constructor is expensive
@@ -171,8 +186,10 @@ def clean_overlay(
  lsuffix=lsuffix,
  rsuffix=rsuffix,
  geom_type=geom_type,
- n_jobs=n_jobs,
  predicate=predicate,
+ rtree_runner=rtree_runner,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  ),
  geometry="geometry",
  crs=crs,
@@ -188,9 +205,9 @@ def clean_overlay(
 
 
  def _join_and_get_no_rows(df1, df2, lsuffix, rsuffix):
- geom_col = df1._geometry_column_name
+ geom_col = df1.geometry.name
  df1_cols = df1.columns.difference({geom_col})
- df2_cols = df2.columns.difference({df2._geometry_column_name})
+ df2_cols = df2.columns.difference({df2.geometry.name})
  cols_with_suffix = [
  f"{col}{lsuffix}" if col in df2_cols else col for col in df1_cols
  ] + [f"{col}{rsuffix}" if col in df1_cols else col for col in df2_cols]
@@ -215,7 +232,7 @@ def _no_intersections_return(
  if how == "identity":
  # add suffixes and return df1
  df_template = _join_and_get_no_rows(df1, df2, lsuffix, rsuffix)
- df2_cols = df2.columns.difference({df2._geometry_column_name})
+ df2_cols = df2.columns.difference({df2.geometry.name})
  df1.columns = [f"{col}{lsuffix}" if col in df2_cols else col for col in df1]
  return pd.concat([df_template, df1], ignore_index=True)
 
@@ -237,33 +254,41 @@ def _shapely_pd_overlay(
  df1: DataFrame,
  df2: DataFrame,
  how: str,
- grid_size: float = DEFAULT_GRID_SIZE,
- predicate: str = "intersects",
- lsuffix: str = DEFAULT_LSUFFIX,
- rsuffix: str = DEFAULT_RSUFFIX,
- geom_type: str | None = None,
- n_jobs: int = 1,
+ grid_size: float,
+ predicate: str,
+ lsuffix: str,
+ rsuffix: str,
+ geom_type: str | None,
+ rtree_runner: RTreeQueryRunner,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> DataFrame:
- if not grid_size and not len(df1) or not len(df2):
- return _no_intersections_return(df1, df2, how, lsuffix, rsuffix)
-
- tree = STRtree(df2.geometry.values)
- left, right = tree.query(df1.geometry.values, predicate=predicate)
-
+ left, right = rtree_runner.run(
+ df1.geometry.values, df2.geometry.values, predicate=predicate
+ )
  pairs = _get_intersects_pairs(df1, df2, left, right, rsuffix)
- assert pairs.geometry.notna().all(), pairs.geometry
- assert pairs.geom_right.notna().all(), pairs.geom_right
+ assert pairs["geometry"].notna().all(), pairs.geometry[lambda x: x.isna()]
+ assert pairs["geom_right"].notna().all(), pairs.geom_right[lambda x: x.isna()]
 
  if how == "intersection":
  overlayed = [
  _intersection(
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
  )
  ]
 
  elif how == "difference":
  overlayed = _difference(
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ df1,
+ left,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  elif how == "symmetric_difference":
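In this hunk, rtree_runner.run replaces the inline STRtree query shown on the removed lines. The stand-in below reproduces that behaviour with the same call signature; it is a sketch of what such a runner could look like, not the implementation in the (unshown) runners module:

import numpy as np
from shapely import STRtree


class NaiveRTreeQueryRunner:
    """Return (left, right) index arrays of pairs matching the predicate."""

    def __init__(self, n_jobs: int = 1) -> None:
        self.n_jobs = n_jobs  # unused here; a real runner might parallelize the query

    def run(
        self, left_geoms: np.ndarray, right_geoms: np.ndarray, predicate: str = "intersects"
    ) -> tuple[np.ndarray, np.ndarray]:
        # Same logic as the removed lines: build the tree on the right-hand
        # geometries and bulk-query it with the left-hand geometries.
        tree = STRtree(right_geoms)
        left, right = tree.query(left_geoms, predicate=predicate)
        return left, right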
@@ -276,12 +301,19 @@ def _shapely_pd_overlay(
  grid_size=grid_size,
  rsuffix=rsuffix,
  geom_type=geom_type,
- n_jobs=n_jobs,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  elif how == "identity":
  overlayed = _identity(
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ df1,
+ left,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  elif how == "union":
@@ -294,7 +326,8 @@ def _shapely_pd_overlay(
  grid_size=grid_size,
  rsuffix=rsuffix,
  geom_type=geom_type,
- n_jobs=n_jobs,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  elif how == "update":
@@ -304,8 +337,9 @@ def _shapely_pd_overlay(
  df2,
  left=left,
  grid_size=grid_size,
- n_jobs=n_jobs,
  geom_type=geom_type,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  assert isinstance(overlayed, list)
@@ -323,8 +357,9 @@ def _shapely_pd_overlay(
  overlayed = _add_suffix_left(overlayed, df1, df2, lsuffix)
 
  overlayed["geometry"] = make_valid(overlayed["geometry"])
- # None and empty are falsy
- overlayed = overlayed.loc[lambda x: x["geometry"].notna()]
+ overlayed = overlayed.loc[
+ lambda x: (x["geometry"].notna().values) & (~is_empty(x["geometry"].values))
+ ]
 
  return overlayed
 
@@ -336,115 +371,38 @@ def _update(
  left: np.ndarray,
  grid_size: float | None | int,
  geom_type: str | None,
- n_jobs: int,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> GeoDataFrame:
  overlayed = _difference(
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ df1,
+ left,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
 
  return overlayed + [df2]
 
 
- def _run_overlay_dask(
- arr1: np.ndarray,
- arr2: np.ndarray,
- func: Callable,
- n_jobs: int,
- grid_size: float | int | None,
- ) -> np.ndarray:
- if len(arr1) // n_jobs <= 1:
- try:
- return func(arr1, arr2, grid_size=grid_size)
- except TypeError as e:
- raise TypeError(e, {type(x) for x in arr1}, {type(x) for x in arr2}) from e
- arr1 = da.from_array(arr1, chunks=len(arr1) // n_jobs)
- arr2 = da.from_array(arr2, chunks=len(arr2) // n_jobs)
- res = arr1.map_blocks(func, arr2, grid_size=grid_size, dtype=float)
- return res.compute(scheduler="threads", optimize_graph=False, num_workers=n_jobs)
-
-
- def _run_overlay_joblib_threading(
- arr1: np.ndarray,
- arr2: np.ndarray,
- func: Callable,
- n_jobs: int,
- grid_size: int | float | None,
- ) -> list[Geometry]:
- if len(arr1) // n_jobs <= 1:
- try:
- return func(arr1, arr2, grid_size=grid_size)
- except TypeError as e:
- raise TypeError(e, {type(x) for x in arr1}, {type(x) for x in arr2}) from e
- with joblib.Parallel(n_jobs=n_jobs, backend="threading") as parallel:
- return parallel(
- joblib.delayed(func)(g1, g2, grid_size=grid_size)
- for g1, g2 in zip(arr1, arr2, strict=True)
- )
-
-
  def _intersection(
  pairs: pd.DataFrame,
  grid_size: None | float | int,
  geom_type: str | None,
- n_jobs: int = 1,
+ overlay_runner: OverlayRunner,
  ) -> GeoDataFrame:
  if not len(pairs):
  return pairs.drop(columns="geom_right")
-
  intersections = pairs.copy()
-
- arr1 = intersections["geometry"].to_numpy()
- arr2 = intersections["geom_right"].to_numpy()
-
- if n_jobs > 1 and len(arr1) / n_jobs > 10:
- try:
- res = _run_overlay_joblib_threading(
- arr1,
- arr2,
- func=intersection,
- n_jobs=n_jobs,
- grid_size=grid_size,
- )
- except GEOSException:
- arr1 = make_valid_and_keep_geom_type(
- arr1, geom_type=geom_type, n_jobs=n_jobs
- )
- arr2 = make_valid_and_keep_geom_type(
- arr2, geom_type=geom_type, n_jobs=n_jobs
- )
- arr1 = arr1.loc[lambda x: x.index.isin(arr2.index)]
- arr2 = arr2.loc[lambda x: x.index.isin(arr1.index)]
-
- res = _run_overlay_joblib_threading(
- arr1.to_numpy(),
- arr2.to_numpy(),
- func=intersection,
- n_jobs=n_jobs,
- grid_size=grid_size,
- )
- intersections["geometry"] = res
- return intersections.drop(columns="geom_right")
-
- try:
- intersections["geometry"] = intersection(
- intersections["geometry"].to_numpy(),
- intersections["geom_right"].to_numpy(),
- grid_size=grid_size,
- )
- except GEOSException:
- left = make_valid_and_keep_geom_type(
- intersections["geometry"].to_numpy(), geom_type, n_jobs=n_jobs
- )
- right = make_valid_and_keep_geom_type(
- intersections["geom_right"].to_numpy(), geom_type, n_jobs=n_jobs
- )
- left = left.loc[lambda x: x.index.isin(right.index)]
- right = right.loc[lambda x: x.index.isin(left.index)]
-
- intersections["geometry"] = intersection(
- left.to_numpy(), right.to_numpy(), grid_size=grid_size
- )
-
+ intersections["geometry"] = overlay_runner.run(
+ intersection,
+ intersections["geometry"].to_numpy(),
+ intersections["geom_right"].to_numpy(),
+ grid_size=grid_size,
+ geom_type=geom_type,
+ )
  return intersections.drop(columns="geom_right")
 
 
@@ -457,12 +415,16 @@ def _union(
  grid_size: int | float | None,
  rsuffix: str,
  geom_type: str | None,
- n_jobs: int = 1,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> list[GeoDataFrame]:
  merged = []
  if len(left):
  intersections = _intersection(
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
  )
  merged.append(intersections)
  symmdiff = _symmetric_difference(
@@ -474,7 +436,8 @@ def _union(
  grid_size=grid_size,
  rsuffix=rsuffix,
  geom_type=geom_type,
- n_jobs=n_jobs,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
  merged += symmdiff
  return merged
@@ -486,15 +449,27 @@ def _identity(
  left: np.ndarray,
  grid_size: int | float | None,
  geom_type: str | None,
- n_jobs: int = 1,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> list[GeoDataFrame]:
  merged = []
  if len(left):
  intersections = _intersection(
- pairs, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
  )
  merged.append(intersections)
- diff = _difference(pairs, df1, left, grid_size=grid_size, n_jobs=n_jobs)
+ diff = _difference(
+ pairs,
+ df1,
+ left,
+ geom_type=geom_type,
+ grid_size=grid_size,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
+ )
  merged += diff
  return merged
 
@@ -508,12 +483,19 @@ def _symmetric_difference(
  grid_size: int | float | None,
  rsuffix: str,
  geom_type: str | None,
- n_jobs: int = 1,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> list[GeoDataFrame]:
  merged = []
 
  difference_left = _difference(
- pairs, df1, left, grid_size=grid_size, geom_type=geom_type, n_jobs=n_jobs
+ pairs,
+ df1,
+ left,
+ grid_size=grid_size,
+ geom_type=geom_type,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
  merged += difference_left
 
@@ -525,7 +507,8 @@ def _symmetric_difference(
  grid_size=grid_size,
  rsuffix=rsuffix,
  geom_type=geom_type,
- n_jobs=n_jobs,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
  merged.append(clip_right)
 
@@ -539,9 +522,10 @@ def _difference(
  pairs: pd.DataFrame,
  df1: pd.DataFrame,
  left: np.ndarray,
- grid_size: int | float | None = None,
- geom_type: str | None = None,
- n_jobs: int = 1,
+ grid_size: int | float | None,
+ geom_type: str | None,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> list[GeoDataFrame]:
  merged = []
  if len(left):
@@ -550,7 +534,8 @@ def _difference(
  df1=df1,
  grid_size=grid_size,
  geom_type=geom_type,
- n_jobs=n_jobs,
+ overlay_runner=overlay_runner,
+ union_runner=union_runner,
  )
  merged.append(clip_left)
  diff_left = _add_indices_from_left(df1, left)
@@ -618,7 +603,8 @@ def _shapely_diffclip_left(
  df1: pd.DataFrame,
  grid_size: int | float | None,
  geom_type: str | None,
- n_jobs: int,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> pd.DataFrame:
  """Aggregate areas in right by unique values from left, then erases those from left."""
  keep_cols = list(df1.columns.difference({"_overlay_index_right"})) + ["geom_right"]
@@ -675,12 +661,14 @@ def _shapely_diffclip_left(
  }
  )
 
- agged = pd.Series(
- {
- i: agg_geoms_partial(geoms)
- for i, geoms in agger.groupby(level=0)["geom_right"]
- }
- )
+ agged = union_runner.run(agger["geom_right"], level=0)
+ # agged = pd.Series(
+
+ # {
+ # i: agg_geoms_partial(geoms)
+ # for i, geoms in agger.groupby(level=0)["geom_right"]
+ # }
+ # )
  many_hits_agged["geom_right"] = inverse_index_mapper.map(agged)
  many_hits_agged = many_hits_agged.drop(columns=["_right_indices"])
 
@@ -688,15 +676,19 @@ def _shapely_diffclip_left(
  except IndexError:
  clip_left = pairs.loc[:, list(keep_cols)]
 
- assert clip_left["geometry"].notna().all()
- assert clip_left["geom_right"].notna().all()
+ assert clip_left["geometry"].notna().all(), clip_left["geometry"][
+ lambda x: x.isna()
+ ]
+ assert clip_left["geom_right"].notna().all(), clip_left["geom_right"][
+ lambda x: x.isna()
+ ]
 
- clip_left["geometry"] = _try_difference(
+ clip_left["geometry"] = overlay_runner.run(
+ difference,
  clip_left["geometry"].to_numpy(),
  clip_left["geom_right"].to_numpy(),
  grid_size=grid_size,
  geom_type=geom_type,
- n_jobs=n_jobs,
  )
 
  return clip_left.drop(columns="geom_right")
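Both here and in _intersection above, overlay_runner.run receives the shapely set operation (difference or intersection), the two geometry arrays, grid_size and geom_type. A minimal single-threaded stand-in with that signature could look like the sketch below; the real OverlayRunner in the runners module is not shown in this diff, so the fallback here only mirrors the rough idea of the removed _try_difference helper, without its geom_type filtering:

import numpy as np
from shapely import make_valid
from shapely.errors import GEOSException


class NaiveOverlayRunner:
    """Apply a vectorized shapely set operation, retrying after make_valid."""

    def __init__(self, n_jobs: int = 1) -> None:
        self.n_jobs = n_jobs  # unused in this single-threaded sketch

    def run(self, func, arr1: np.ndarray, arr2: np.ndarray, grid_size=None, geom_type=None):
        try:
            return func(arr1, arr2, grid_size=grid_size)
        except GEOSException:
            # Simplified fallback: make the inputs valid and retry once.
            return func(make_valid(arr1), make_valid(arr2), grid_size=grid_size)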
@@ -709,7 +701,8 @@ def _shapely_diffclip_right(
  grid_size: int | float | None,
  rsuffix: str,
  geom_type: str | None,
- n_jobs: int,
+ overlay_runner: OverlayRunner,
+ union_runner: UnionRunner,
  ) -> pd.DataFrame:
  agg_geoms_partial = functools.partial(_agg_geoms, grid_size=grid_size)
 
@@ -720,16 +713,22 @@ def _shapely_diffclip_right(
  one_hit = pairs[only_one].set_index("_overlay_index_right")[
  ["geom_left", "geometry"]
  ]
- many_hits = (
- pairs[~only_one]
- .groupby("_overlay_index_right")
- .agg(
- {
- "geom_left": agg_geoms_partial,
- "geometry": "first",
- }
- )
+ many_hits_ungrouped = pairs[~only_one].set_index("_overlay_index_right")
+ many_hits = pd.DataFrame(index=many_hits_ungrouped.index.unique())
+ many_hits["geometry"] = many_hits_ungrouped.groupby(level=0)["geometry"].first()
+ many_hits["geom_left"] = union_runner.run(
+ many_hits_ungrouped["geom_left"], level=0
  )
+ # many_hits = (
+ # pairs[~only_one]
+ # .groupby("_overlay_index_right")
+ # .agg(
+ # {
+ # "geom_left": agg_geoms_partial,
+ # "geometry": "first",
+ # }
+ # )
+ # )
  clip_right = (
  pd.concat([one_hit, many_hits])
  .join(df2.drop(columns=["geometry"]))
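In both places where it is called, union_runner.run takes a pandas Series of geometries plus a groupby level and returns one unioned geometry per group, which is what the commented-out groupby/agg code did. A sketch of such a runner, assuming that interface (the actual UnionRunner is not part of this diff):

import pandas as pd
from shapely import make_valid, union_all


class NaiveUnionRunner:
    """Union geometries within each group of the given index level."""

    def __init__(self, n_jobs: int = 1) -> None:
        self.n_jobs = n_jobs  # a real runner might split the groups across workers

    def run(self, ser: pd.Series, level: int = 0, grid_size=None) -> pd.Series:
        return ser.groupby(level=level).apply(
            lambda geoms: make_valid(union_all(geoms.values, grid_size=grid_size))
        )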
@@ -748,10 +747,15 @@ def _shapely_diffclip_right(
  }
  )
 
- assert clip_right["geometry"].notna().all()
- assert clip_right["geom_left"].notna().all()
+ assert clip_right["geometry"].notna().all(), clip_right["geometry"][
+ lambda x: x.isna()
+ ]
+ assert clip_right["geom_left"].notna().all(), clip_right["geom_left"][
+ lambda x: x.isna()
+ ]
 
- clip_right["geometry"] = _try_difference(
+ clip_right["geometry"] = overlay_runner.run(
+ difference,
  clip_right["geometry"].to_numpy(),
  clip_right["geom_left"].to_numpy(),
  grid_size=grid_size,
@@ -761,87 +765,5 @@ def _shapely_diffclip_right(
  return clip_right.drop(columns="geom_left")
 
 
- def _try_difference(
- left: np.ndarray,
- right: np.ndarray,
- grid_size: int | float | None,
- geom_type: str | None,
- n_jobs: int = 1,
- ) -> np.ndarray:
- """Try difference overlay, then make_valid and retry."""
- if n_jobs > 1 and len(left) / n_jobs > 10:
- try:
- return _run_overlay_joblib_threading(
- left,
- right,
- func=difference,
- n_jobs=n_jobs,
- grid_size=grid_size,
- )
- except GEOSException:
- left = make_valid_and_keep_geom_type(
- left, geom_type=geom_type, n_jobs=n_jobs
- )
- right = make_valid_and_keep_geom_type(
- right, geom_type=geom_type, n_jobs=n_jobs
- )
- left = left.loc[lambda x: x.index.isin(right.index)]
- right = right.loc[lambda x: x.index.isin(left.index)]
-
- return _run_overlay_joblib_threading(
- left.to_numpy(),
- right.to_numpy(),
- func=difference,
- n_jobs=n_jobs,
- grid_size=grid_size,
- )
-
- try:
- return difference(
- left,
- right,
- grid_size=grid_size,
- )
- except GEOSException:
- left = make_valid_and_keep_geom_type(left, geom_type, n_jobs=n_jobs)
- right = make_valid_and_keep_geom_type(right, geom_type, n_jobs=n_jobs)
- left = left.loc[lambda x: x.index.isin(right.index)]
- right = right.loc[lambda x: x.index.isin(left.index)]
- try:
- return difference(
- left.to_numpy(),
- right.to_numpy(),
- grid_size=grid_size,
- )
- except GEOSException as e:
- raise e.__class__(e, f"{grid_size=}", f"{left=}", f"{right=}") from e
-
-
- def make_valid_and_keep_geom_type(
- geoms: np.ndarray, geom_type: str, n_jobs: int
- ) -> GeoSeries:
- """Make GeometryCollections into (Multi)Polygons, (Multi)LineStrings or (Multi)Points.
-
- Because GeometryCollections might appear after dissolving (unary_union).
- And this makes shapely difference/intersection fail.
-
- Args:
- geoms: Array of geometries.
- geom_type: geometry type to be kept.
- n_jobs: Number of treads.
- """
- geoms = GeoSeries(geoms)
- geoms.index = range(len(geoms))
- geoms.loc[:] = make_valid(geoms.to_numpy())
- geoms_with_correct_type = geoms.explode(index_parts=False).pipe(
- to_single_geom_type, geom_type
- )
- only_one = geoms_with_correct_type.groupby(level=0).transform("size") == 1
- one_hit = geoms_with_correct_type[only_one]
- many_hits = geoms_with_correct_type[~only_one].groupby(level=0).agg(unary_union)
- geoms_with_wrong_type = geoms.loc[~geoms.index.isin(geoms_with_correct_type.index)]
- return pd.concat([one_hit, many_hits, geoms_with_wrong_type]).sort_index()
-
-
  def _agg_geoms(g: np.ndarray, grid_size: int | float | None = None) -> Geometry:
- return make_valid(unary_union(g, grid_size=grid_size))
+ return make_valid(union_all(g, grid_size=grid_size))
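The last hunk swaps unary_union for union_all, the vectorized n-ary union that shapely 2 exposes at the top level; both accept a grid_size keyword. A quick illustrative check of the replacement call (not package code):

from shapely import box, make_valid, union_all

geoms = [box(0, 0, 1, 1), box(0.5, 0, 1.5, 1)]
merged = make_valid(union_all(geoms, grid_size=None))
print(merged.area)  # 1.5: the two overlapping unit squares merge into one polygon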