ssb-sgis 0.3.13__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sgis/__init__.py CHANGED
@@ -22,18 +22,18 @@ from .geopandas_tools.buffer_dissolve_explode import (
22
22
  buffdiss,
23
23
  buffdissexp,
24
24
  buffdissexp_by_cluster,
25
+ diss,
26
+ diss_by_cluster,
25
27
  dissexp,
26
28
  dissexp_by_cluster,
27
29
  )
28
30
  from .geopandas_tools.centerlines import get_rough_centerlines
29
31
  from .geopandas_tools.cleaning import (
30
32
  coverage_clean,
33
+ remove_interior_slivers,
31
34
  remove_spikes,
32
- snap_polygons,
33
- snap_to_mask,
34
35
  split_and_eliminate_by_longest,
35
36
  split_by_neighbors,
36
- split_spiky_polygons,
37
37
  )
38
38
  from .geopandas_tools.conversion import (
39
39
  coordinate_array,
@@ -76,6 +76,7 @@ from .geopandas_tools.neighbors import (
76
76
  get_neighbor_dfs,
77
77
  get_neighbor_indices,
78
78
  k_nearest_neighbors,
79
+ sjoin_within_distance,
79
80
  )
80
81
  from .geopandas_tools.overlay import clean_overlay
81
82
  from .geopandas_tools.point_operations import snap_all, snap_within_distance
@@ -87,6 +88,7 @@ from .geopandas_tools.polygon_operations import (
87
88
  eliminate_by_largest,
88
89
  eliminate_by_longest,
89
90
  eliminate_by_smallest,
91
+ get_cluster_mapper,
90
92
  get_gaps,
91
93
  get_holes,
92
94
  get_polygon_clusters,
@@ -130,7 +132,7 @@ from .networkanalysis.nodes import (
130
132
  make_node_ids,
131
133
  )
132
134
  from .networkanalysis.traveling_salesman import traveling_salesman_problem
133
- from .parallel.parallel import Parallel
135
+ from .parallel.parallel import Parallel, parallel_overlay
134
136
  from .raster.cube import DataCube
135
137
 
136
138
 
@@ -59,19 +59,15 @@ class Gridlooper:
59
59
 
60
60
  Instantiate a gridlooper.
61
61
 
62
- >>> looper = sg.Gridlooper(gridsize=200, mask=buffered, parallelizer=sg.Parallel(1, backend="multiprocessing"))
62
+ >>> looper = sg.Gridlooper(gridsize=200, mask=buffered, concat=True, parallelizer=sg.Parallel(1, backend="multiprocessing"))
63
63
 
64
64
  Run the function clean_overlay in a gridloop.
65
65
 
66
- >>> resultslist = looper.run(
66
+ >>> results = looper.run(
67
67
  ... sg.clean_overlay,
68
68
  ... points,
69
69
  ... buffered,
70
70
  ... )
71
- >>> type(resultslist)
72
- list
73
-
74
- >>> results = pd.concat(resultslist, ignore_index=True)
75
71
  >>> results
76
72
  idx_1 idx_2 geometry
77
73
  0 220 220 POINT (254575.200 6661631.500)
@@ -14,17 +14,13 @@ for the following:
14
14
  - The buff function returns a GeoDataFrame, the geopandas method returns a GeoSeries.
15
15
  """
16
16
 
17
- import joblib
17
+ from typing import Callable
18
+
18
19
  import numpy as np
20
+ import pandas as pd
19
21
  from geopandas import GeoDataFrame, GeoSeries
20
- from shapely import Geometry, make_valid, unary_union
21
-
22
- from .general import (
23
- _push_geom_col,
24
- merge_geometries,
25
- parallel_unary_union,
26
- parallel_unary_union_geoseries,
27
- )
22
+
23
+ from .general import merge_geometries, parallel_unary_union
28
24
  from .geometry_types import make_all_singlepart
29
25
  from .polygon_operations import get_cluster_mapper, get_grouped_centroids
30
26
 
@@ -172,17 +168,16 @@ def buffdiss(
172
168
 
173
169
 
174
170
  def _dissolve(gdf, aggfunc="first", grid_size=None, n_jobs=1, **dissolve_kwargs):
175
- geom_col = gdf._geometry_column_name
176
- # if grid_size is None:
177
- # dissolved = gdf.dissolve(aggfunc=aggfunc, **dissolve_kwargs)
178
171
 
179
- # dissolved[geom_col] = dissolved.make_valid()
180
- # return dissolved
172
+ if not len(gdf):
173
+ return gdf
181
174
 
182
175
  geom_col = gdf._geometry_column_name
183
176
 
184
177
  by = dissolve_kwargs.pop("by", None)
185
178
 
179
+ by_was_none = not bool(by)
180
+
186
181
  if by is None and dissolve_kwargs.get("level") is None:
187
182
  by = np.zeros(len(gdf), dtype="int64")
188
183
  other_cols = list(gdf.columns.difference({geom_col}))
@@ -191,32 +186,50 @@ def _dissolve(gdf, aggfunc="first", grid_size=None, n_jobs=1, **dissolve_kwargs)
191
186
  by = [by]
192
187
  other_cols = list(gdf.columns.difference({geom_col} | set(by or {})))
193
188
 
194
- dissolved = gdf.groupby(by, **dissolve_kwargs)[other_cols].agg(aggfunc)
189
+ try:
190
+ is_one_hit = gdf.groupby(by, **dissolve_kwargs).transform("size") == 1
191
+ except IndexError:
192
+ # if no rows when dropna=True
193
+ original_by = [x for x in by]
194
+ query = gdf[by.pop(0)].notna()
195
+ for col in gdf[by]:
196
+ query &= gdf[col].notna()
197
+ gdf = gdf.loc[query]
198
+ assert not len(gdf), gdf
199
+ if not by_was_none and dissolve_kwargs.get("as_index", True):
200
+ try:
201
+ gdf = gdf.set_index(original_by)
202
+ except Exception as e:
203
+ print(gdf)
204
+ print(original_by)
205
+ raise e
206
+ return gdf
207
+
208
+ if not by_was_none and dissolve_kwargs.get("as_index", True):
209
+ one_hit = gdf[is_one_hit].set_index(by)
210
+ else:
211
+ one_hit = gdf[is_one_hit]
212
+ many_hits = gdf[~is_one_hit]
213
+
214
+ if not len(many_hits):
215
+ return GeoDataFrame(one_hit, geometry=geom_col, crs=gdf.crs)
216
+
217
+ dissolved = many_hits.groupby(by, **dissolve_kwargs)[other_cols].agg(aggfunc)
218
+
219
+ # dissolved = gdf.groupby(by, **dissolve_kwargs)[other_cols].agg(aggfunc)
195
220
 
196
221
  if n_jobs > 1:
197
- dissolved[geom_col] = parallel_unary_union(
198
- gdf, n_jobs=n_jobs, by=by, grid_size=grid_size, **dissolve_kwargs
199
- )
200
222
  try:
223
+ agged = parallel_unary_union(
224
+ many_hits, n_jobs=n_jobs, by=by, grid_size=grid_size, **dissolve_kwargs
225
+ )
226
+ dissolved[geom_col] = agged
201
227
  return GeoDataFrame(dissolved, geometry=geom_col, crs=gdf.crs)
202
228
  except Exception as e:
203
- print(e, dissolved[geom_col])
229
+ print(e, dissolved, agged, many_hits)
204
230
  raise e
205
- # import dask_geopandas
206
-
207
- # if not isinstance(by, str):
208
- # gdf["_by"] = 1
209
- # ddf = dask_geopandas.from_geopandas(gdf, npartitions=n_jobs, by=by)
210
-
211
- with joblib.Parallel(n_jobs=n_jobs, backend="threading") as parallel:
212
- delayed_operations = []
213
- for _, geoms in gdf.groupby(by, **dissolve_kwargs)[geom_col]:
214
- delayed_operations.append(joblib.delayed(merge_geometries)(geoms))
215
231
 
216
- dissolved[geom_col] = parallel(delayed_operations)
217
- return GeoDataFrame(dissolved, geometry=geom_col, crs=gdf.crs)
218
-
219
- geoms_agged = gdf.groupby(by, **dissolve_kwargs)[geom_col].agg(
232
+ geoms_agged = many_hits.groupby(by, **dissolve_kwargs)[geom_col].agg(
220
233
  lambda x: merge_geometries(x, grid_size=grid_size)
221
234
  )
222
235
 
@@ -228,22 +241,23 @@ def _dissolve(gdf, aggfunc="first", grid_size=None, n_jobs=1, **dissolve_kwargs)
228
241
 
229
242
  dissolved[geom_col] = geoms_agged
230
243
 
231
- return GeoDataFrame(dissolved, geometry=geom_col, crs=gdf.crs)
244
+ return GeoDataFrame(
245
+ pd.concat([dissolved, one_hit]).sort_index(), geometry=geom_col, crs=gdf.crs
246
+ )
232
247
 
233
248
 
234
- def dissexp(
249
+ def diss(
235
250
  gdf: GeoDataFrame,
236
251
  by=None,
237
252
  aggfunc="first",
238
253
  as_index: bool = True,
239
- index_parts: bool = False,
240
254
  grid_size: float | int | None = None,
241
255
  n_jobs: int = 1,
242
256
  **dissolve_kwargs,
243
257
  ):
244
- """Dissolves overlapping geometries.
258
+ """Dissolves geometries.
245
259
 
246
- It takes a GeoDataFrame and dissolves, fixes and explodes geometries.
260
+ It takes a GeoDataFrame and dissolves and fixes geometries.
247
261
 
248
262
  Args:
249
263
  gdf: the GeoDataFrame that will be dissolved and exploded.
@@ -251,12 +265,10 @@ def dissexp(
251
265
  aggfunc: How to aggregate the non-geometry columns not in "by".
252
266
  as_index: Whether the 'by' columns should be returned as index. Defaults to
253
267
  True to be consistent with geopandas.
254
- index_parts: If False (default), the index after dissolve is respected. If
255
- True, an integer index level is added during explode.
256
268
  **dissolve_kwargs: additional keyword arguments passed to geopandas' dissolve.
257
269
 
258
270
  Returns:
259
- A GeoDataFrame where overlapping geometries are dissolved.
271
+ A GeoDataFrame with dissolved geometries.
260
272
  """
261
273
  if not len(gdf):
262
274
  if as_index:
@@ -267,6 +279,44 @@ def dissexp(
267
279
  else:
268
280
  return gdf
269
281
 
282
+ return _dissolve(
283
+ gdf,
284
+ by=by,
285
+ aggfunc=aggfunc,
286
+ grid_size=grid_size,
287
+ n_jobs=n_jobs,
288
+ as_index=as_index,
289
+ **dissolve_kwargs,
290
+ )
291
+
292
+
293
+ def dissexp(
294
+ gdf: GeoDataFrame,
295
+ by=None,
296
+ aggfunc="first",
297
+ as_index: bool = True,
298
+ index_parts: bool = False,
299
+ grid_size: float | int | None = None,
300
+ n_jobs: int = 1,
301
+ **dissolve_kwargs,
302
+ ):
303
+ """Dissolves overlapping geometries.
304
+
305
+ It takes a GeoDataFrame and dissolves, fixes and explodes geometries.
306
+
307
+ Args:
308
+ gdf: the GeoDataFrame that will be dissolved and exploded.
309
+ by: Columns to dissolve by.
310
+ aggfunc: How to aggregate the non-geometry columns not in "by".
311
+ as_index: Whether the 'by' columns should be returned as index. Defaults to
312
+ True to be consistent with geopandas.
313
+ index_parts: If False (default), the index after dissolve is respected. If
314
+ True, an integer index level is added during explode.
315
+ **dissolve_kwargs: additional keyword arguments passed to geopandas' dissolve.
316
+
317
+ Returns:
318
+ A GeoDataFrame where overlapping geometries are dissolved.
319
+ """
270
320
  dissolve_kwargs = dissolve_kwargs | {
271
321
  "by": by,
272
322
  "as_index": as_index,
@@ -274,7 +324,7 @@ def dissexp(
274
324
 
275
325
  dissolve_kwargs, ignore_index = _decide_ignore_index(dissolve_kwargs)
276
326
 
277
- dissolved = _dissolve(
327
+ dissolved = diss(
278
328
  gdf, aggfunc=aggfunc, grid_size=grid_size, n_jobs=n_jobs, **dissolve_kwargs
279
329
  )
280
330
 
@@ -296,6 +346,60 @@ def dissexp_by_cluster(
296
346
  This might be many times faster than a regular dissexp, if there are many
297
347
  non-overlapping geometries.
298
348
 
349
+ Args:
350
+ gdf: the GeoDataFrame that will be dissolved and exploded.
351
+ **dissolve_kwargs: Keyword arguments passed to geopandas' dissolve.
352
+
353
+ Returns:
354
+ A GeoDataFrame where overlapping geometries are dissolved.
355
+ """
356
+ return _run_func_by_cluster(
357
+ dissexp, gdf, predicate=predicate, n_jobs=n_jobs, **dissolve_kwargs
358
+ )
359
+
360
+
361
+ def diss_by_cluster(
362
+ gdf: GeoDataFrame, predicate=None, n_jobs: int = 1, **dissolve_kwargs
363
+ ) -> GeoDataFrame:
364
+ """Dissolves overlapping geometries through clustering with sjoin and networkx.
365
+
366
+ Works exactly like diss, but, before dissolving, the geometries are divided
367
+ into clusters based on overlap (uses the function sgis.get_polygon_clusters).
368
+ The geometries are then dissolved based on this column (and optionally other
369
+ columns).
370
+
371
+ This might be many times faster than a regular diss, if there are many
372
+ non-overlapping geometries.
373
+
374
+ Args:
375
+ gdf: the GeoDataFrame that will be dissolved.
376
+ **dissolve_kwargs: Keyword arguments passed to geopandas' dissolve.
377
+
378
+ Returns:
379
+ A GeoDataFrame where overlapping geometries are dissolved.
380
+ """
381
+ return _run_func_by_cluster(
382
+ diss, gdf, predicate=predicate, n_jobs=n_jobs, **dissolve_kwargs
383
+ )
384
+
385
+
386
+ def _run_func_by_cluster(
387
+ func: Callable,
388
+ gdf: GeoDataFrame,
389
+ predicate=None,
390
+ n_jobs: int = 1,
391
+ **dissolve_kwargs,
392
+ ) -> GeoDataFrame:
393
+ """Dissolves overlapping geometries through clustering with sjoin and networkx.
394
+
395
+ Works exactly like dissexp, but, before dissolving, the geometries are divided
396
+ into clusters based on overlap (uses the function sgis.get_polygon_clusters).
397
+ The geometries are then dissolved based on this column (and optionally other
398
+ columns).
399
+
400
+ This might be many times faster than a regular dissexp, if there are many
401
+ non-overlapping geometries.
402
+
299
403
  Args:
300
404
  gdf: the GeoDataFrame that will be dissolved and exploded.
301
405
  **dissolve_kwargs: Keyword arguments passed to geopandas' dissolve.
@@ -312,7 +416,7 @@ def dissexp_by_cluster(
312
416
  by = list(by)
313
417
 
314
418
  if not len(gdf):
315
- return dissexp(gdf, by=by, **dissolve_kwargs)
419
+ return func(gdf, by=by, **dissolve_kwargs)
316
420
 
317
421
  def get_group_clusters(group: GeoDataFrame):
318
422
  """Adds cluster column. Applied to each group because much faster."""
@@ -328,11 +432,11 @@ def dissexp_by_cluster(
328
432
  make_all_singlepart(gdf)
329
433
  .groupby(by, group_keys=True, dropna=False, as_index=False)
330
434
  .apply(get_group_clusters)
331
- .pipe(dissexp, by=["_cluster"] + by, n_jobs=n_jobs, **dissolve_kwargs)
435
+ .pipe(func, by=["_cluster"] + by, n_jobs=n_jobs, **dissolve_kwargs)
332
436
  )
333
437
  else:
334
438
  dissolved = get_group_clusters(make_all_singlepart(gdf)).pipe(
335
- dissexp, by="_cluster", n_jobs=n_jobs, **dissolve_kwargs
439
+ func, by="_cluster", n_jobs=n_jobs, **dissolve_kwargs
336
440
  )
337
441
 
338
442
  if not by: