ssb-sgis 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sgis/__init__.py +97 -115
  2. sgis/exceptions.py +3 -1
  3. sgis/geopandas_tools/__init__.py +1 -0
  4. sgis/geopandas_tools/bounds.py +75 -38
  5. sgis/geopandas_tools/buffer_dissolve_explode.py +38 -34
  6. sgis/geopandas_tools/centerlines.py +53 -44
  7. sgis/geopandas_tools/cleaning.py +87 -104
  8. sgis/geopandas_tools/conversion.py +149 -101
  9. sgis/geopandas_tools/duplicates.py +31 -17
  10. sgis/geopandas_tools/general.py +76 -48
  11. sgis/geopandas_tools/geometry_types.py +21 -7
  12. sgis/geopandas_tools/neighbors.py +20 -8
  13. sgis/geopandas_tools/overlay.py +136 -53
  14. sgis/geopandas_tools/point_operations.py +9 -8
  15. sgis/geopandas_tools/polygon_operations.py +48 -56
  16. sgis/geopandas_tools/polygons_as_rings.py +121 -78
  17. sgis/geopandas_tools/sfilter.py +14 -14
  18. sgis/helpers.py +114 -56
  19. sgis/io/dapla_functions.py +32 -23
  20. sgis/io/opener.py +13 -6
  21. sgis/io/read_parquet.py +1 -1
  22. sgis/maps/examine.py +39 -26
  23. sgis/maps/explore.py +112 -66
  24. sgis/maps/httpserver.py +12 -12
  25. sgis/maps/legend.py +124 -65
  26. sgis/maps/map.py +66 -41
  27. sgis/maps/maps.py +31 -29
  28. sgis/maps/thematicmap.py +46 -33
  29. sgis/maps/tilesources.py +3 -8
  30. sgis/networkanalysis/_get_route.py +5 -4
  31. sgis/networkanalysis/_od_cost_matrix.py +44 -1
  32. sgis/networkanalysis/_points.py +10 -4
  33. sgis/networkanalysis/_service_area.py +5 -2
  34. sgis/networkanalysis/closing_network_holes.py +20 -62
  35. sgis/networkanalysis/cutting_lines.py +55 -43
  36. sgis/networkanalysis/directednetwork.py +15 -7
  37. sgis/networkanalysis/finding_isolated_networks.py +4 -3
  38. sgis/networkanalysis/network.py +15 -13
  39. sgis/networkanalysis/networkanalysis.py +72 -54
  40. sgis/networkanalysis/networkanalysisrules.py +20 -16
  41. sgis/networkanalysis/nodes.py +2 -3
  42. sgis/networkanalysis/traveling_salesman.py +5 -2
  43. sgis/parallel/parallel.py +337 -127
  44. sgis/raster/__init__.py +6 -0
  45. sgis/raster/base.py +9 -3
  46. sgis/raster/cube.py +280 -208
  47. sgis/raster/cubebase.py +15 -29
  48. sgis/raster/indices.py +3 -7
  49. sgis/raster/methods_as_functions.py +0 -124
  50. sgis/raster/raster.py +313 -127
  51. sgis/raster/torchgeo.py +58 -37
  52. sgis/raster/zonal.py +38 -13
  53. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/LICENSE +1 -1
  54. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/METADATA +87 -16
  55. ssb_sgis-1.0.2.dist-info/RECORD +61 -0
  56. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/WHEEL +1 -1
  57. sgis/raster/bands.py +0 -48
  58. sgis/raster/gradient.py +0 -78
  59. ssb_sgis-1.0.1.dist-info/RECORD +0 -63
sgis/parallel/parallel.py CHANGED
@@ -3,11 +3,12 @@ import inspect
 import itertools
 import multiprocessing
 import warnings
-from collections.abc import Callable, Collection, Iterable
+from collections.abc import Callable
+from collections.abc import Collection
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any

-
 try:
     import dapla as dp
 except ImportError:
@@ -16,18 +17,19 @@ except ImportError:
 import joblib
 import numpy as np
 import pandas as pd
-from geopandas import GeoDataFrame, GeoSeries
+from geopandas import GeoDataFrame
 from pandas import DataFrame
-from shapely.geometry import MultiPolygon, Polygon
+from pandas import Series

-from ..geopandas_tools.general import clean_clip, clean_geoms
 from ..geopandas_tools.neighbors import get_neighbor_indices
 from ..geopandas_tools.overlay import clean_overlay
-from ..helpers import LocalFunctionError, dict_zip, dict_zip_union, in_jupyter
-
+from ..helpers import LocalFunctionError
+from ..helpers import dict_zip_union
+from ..helpers import in_jupyter

 try:
-    from ..io.dapla_functions import exists, read_geopandas, write_geopandas
+    from ..io.dapla_functions import read_geopandas
+    from ..io.dapla_functions import write_geopandas

     # from ..io.write_municipality_data import write_municipality_data
 except ImportError:
@@ -35,16 +37,13 @@ except ImportError:


 try:
-    from dapla import read_pandas, write_pandas
+    from dapla import read_pandas
+    from dapla import write_pandas
+    from dapla.gcs import GCSFileSystem
 except ImportError:
-    pass

-
-def turn_args_into_kwargs(func: Callable, args: tuple, index_start: int):
-    if not isinstance(args, tuple):
-        raise TypeError("args should be a tuple (it should not be unpacked with *)")
-    argnames = inspect.getfullargspec(func).args[index_start:]
-    return {name: value for value, name in zip(args, argnames, strict=False)}
+    class GCSFileSystem:
+        """Placeholder."""


 class Parallel:
@@ -87,10 +86,23 @@ class Parallel:
         backend: str = "multiprocessing",
         context: str = "spawn",
         maxtasksperchild: int = 10,
+        chunksize: int = 1,
         **kwargs,
-    ):
+    ) -> None:
+        """Initialize a Parallel instance with specified settings for parallel execution.
+
+        Args:
+            processes: Number of parallel processes. Set to 1 to run without parallelization.
+            backend: The backend to use for parallel execution. Defaults to 'multiprocessing'.
+            context: The context setting for multiprocessing. Defaults to 'spawn'.
+            maxtasksperchild: The maximum number of tasks a worker process can complete
+                before it is replaced. Defaults to 10.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
+            **kwargs: Additional keyword arguments passed to the underlying parallel execution backend.
+        """
         self.processes = int(processes)
         self.maxtasksperchild = maxtasksperchild
+        self.chunksize = chunksize
         self.backend = backend
         self.context = context
         self.kwargs = kwargs
@@ -103,7 +115,6 @@ class Parallel:
         iterable: Collection,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel with items of an iterable as 0th arguemnt.

@@ -111,7 +122,7 @@ class Parallel:
             func: Function to be run.
             iterable: An iterable where each item will be passed to func as
                 0th positional argument.
-            Args: Positional arguments passed to 'func' starting from the 1st argument.
+            args: Positional arguments passed to 'func' starting from the 1st argument.
                 The 0th argument will be reserved for the values of 'iterable'.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -120,7 +131,7 @@ class Parallel:
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
+        Examples:
         --------
         Multiply each list element by 2.

@@ -159,14 +170,13 @@ class Parallel:
         ...     print(results)
         [2, 4, 6]
         """
-
         if args:
             # start at index 1, meaning the 0th argument (the iterable) is still available
-            args_as_kwargs = turn_args_into_kwargs(func, args, index_start=1)
+            args_as_kwargs = _turn_args_into_kwargs(func, args, index_start=1)
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

@@ -188,7 +198,9 @@ class Parallel:
             processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
         ) as pool:
             try:
-                return pool.map(func_with_kwargs, iterable, chunksize=chunksize)
+                return pool.map(
+                    func_with_kwargs, iterable, chunksize=self.chunksize
+                )
             except Exception as e:
                 pool.terminate()
                 raise e
@@ -204,7 +216,6 @@ class Parallel:
         iterable: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel where items of the iterable are unpacked.

@@ -215,7 +226,7 @@ class Parallel:
             func: Function to be run.
             iterable: An iterable of iterables, where each item will be
                 unpacked as positional argument to the function.
-            Args: Positional arguments passed to 'func' starting at argument position
+            args: Positional arguments passed to 'func' starting at argument position
                 n + 1, where n is the length of the iterables inside the iterable.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -224,7 +235,7 @@ class Parallel:
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
+        Examples:
         --------
         Multiply each list element by 2.

@@ -262,13 +273,13 @@ class Parallel:
         if args:
             # starting the count at the length of the iterables inside the iterables
             iterable = list(iterable)
-            args_as_kwargs = turn_args_into_kwargs(
+            args_as_kwargs = _turn_args_into_kwargs(
                 func, args, index_start=len(iterable[0])
             )
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

@@ -290,7 +301,9 @@ class Parallel:
             processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
         ) as pool:
             try:
-                return pool.starmap(func_with_kwargs, iterable, chunksize=chunksize)
+                return pool.starmap(
+                    func_with_kwargs, iterable, chunksize=self.chunksize
+                )
             except Exception as e:
                 pool.terminate()
                 raise e
@@ -320,10 +333,10 @@ class Parallel:
         Returns:
             A DataFrame, or a list of DataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-
-        res = self.map(dp.read_pandas, files, kwargs=kwargs)
+        if strict:
+            res = self.map(read_pandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_pandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -342,14 +355,19 @@ class Parallel:
             concat: Whether to concat the results to a GeoDataFrame.
             ignore_index: Defaults to True.
             strict: If True (default), all files must exist.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
             **kwargs: Keyword arguments passed to sgis.read_geopandas.

         Returns:
             A GeoDataFrame, or a list of GeoDataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-        res = self.map(read_geopandas, files, kwargs=kwargs)
+        if "file_system" not in kwargs:
+            kwargs["file_system"] = dp.FileClient.get_gcs_file_system()
+
+        if strict:
+            res = self.map(read_geopandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_geopandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -367,10 +385,14 @@ class Parallel:
         clip: bool = True,
         max_rows_per_chunk: int = 150_000,
         processes_in_clip: int = 1,
-    ):
+        verbose: bool = True,
+    ) -> None:
         """Split multiple datasets into municipalities and write as separate files.

         The files will be named as the municipality number.
+        Each dataset in 'in_data' is intersected with 'municipalities'
+        in parallel. The intersections themselves can also be run in parallel
+        with the 'processes_in_clip' argument.

         Args:
             in_data: Dictionary with dataset names as keys and file paths or
@@ -397,7 +419,12 @@ class Parallel:
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-            clip: If True (default), the data will be clipped.
+            clip: If True (default), the data will be clipped. If False, the data will
+                be spatial joined.
+            max_rows_per_chunk: Number of rows per data chunk for processing.
+            processes_in_clip: Number of parallel processes for data clipping.
+            verbose: Whether to print during execution.
+
         """
         shared_kwds = {
             "municipalities": municipalities,
@@ -409,6 +436,7 @@ class Parallel:
             "max_rows_per_chunk": max_rows_per_chunk,
             "processes_in_clip": processes_in_clip,
             "strict": strict,
+            "verbose": verbose,
         }

         if isinstance(out_data, (str, Path)):
@@ -417,10 +445,12 @@ class Parallel:
             out_data = {key: out_data for key in in_data}
         if funcdict is None:
             funcdict = {}
-        zip_func = dict_zip if strict else dict_zip_union
+        fs = dp.FileClient.get_gcs_file_system()

-        for _, data, folder, postfunc in zip_func(in_data, out_data, funcdict):
-            if data is None:
+        for _, data, folder, postfunc in dict_zip_union(in_data, out_data, funcdict):
+            if data is None or (
+                not strict and isinstance(data, (str | Path)) and not fs.exists(data)
+            ):
                 continue

             kwds = shared_kwds | {
@@ -439,15 +469,33 @@ class Parallel:
         df: GeoDataFrame,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        max_rows_per_chunk: int = 150_000,
-        n_chunks: int = None,
-        concat: bool = False,
+        n_chunks: int | None = None,
+        max_rows_per_chunk: int | None = None,
+        concat: bool = True,
     ) -> GeoDataFrame:
-        if len(df) < max_rows_per_chunk:
-            return func(df, *args, **kwargs)
+        """Run a function in parallel on chunks of a (Geo)DataFrame.

-        if n_chunks is None:
-            n_chunks = len(df) // max_rows_per_chunk
+        Args:
+            func: Function to run chunkwise. It should take
+                a (Geo)DataFrame as first argument.
+            df: (Geo)DataFrame to split in n_chunks and passed
+                as first argument to 'func'.
+            args: Positional arguments in 'func' after the DataFrame.
+            kwargs: Additional keyword arguments in 'func'.
+            n_chunks: Optionally set number of chunks to split
+                'df' into. Defaults to the 'processes' attribute
+                of the Parallel instance.
+            max_rows_per_chunk: Alternatively decide number of chunks
+                by a maximum number of rows per chunk.
+            concat: Whether to use pd.concat on the results.
+                Defaults to True.
+        """
+        if max_rows_per_chunk is None and n_chunks is None:
+            n_chunks: int = self.processes
+        elif n_chunks is None:
+            n_chunks: int = len(df) // max_rows_per_chunk
+        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
+            return func(df, *args, **kwargs)

         chunks = np.array_split(np.arange(len(df)), n_chunks)

@@ -464,7 +512,7 @@ class Parallel:
         else:
             return out

-    def validate_execution(self, func):
+    def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.

         Raising Exception to avoid confusion.
@@ -478,8 +526,8 @@ class Parallel:
             raise LocalFunctionError(func)

     @staticmethod
-    def _validate_kwargs(kwargs) -> dict:
-        """Make sure kwargs is a dict (not ** unpacked or None)"""
+    def _validate_kwargs(kwargs: dict) -> dict:
+        """Make sure kwargs is a dict (not ** unpacked or None)."""
         if kwargs is None:
             kwargs = {}
         elif not isinstance(kwargs, dict):
@@ -487,7 +535,7 @@ class Parallel:
         return kwargs

     def _execute(self) -> list[Any]:
-        [self.validate_execution(func) for func in self.funcs]
+        [self._validate_execution(func) for func in self.funcs]

         if self.processes == 1:
             return [func() for func in self.funcs]
@@ -513,7 +561,8 @@ class Parallel:
             results = [pool.apply_async(func) for func in self.funcs]
             return [result.get() for result in results]

-    def __repr__(self):
+    def __repr__(self) -> str:
+        """String representation."""
         return (
             f"{self.__class__.__name__}(processes={self.processes}, "
             f"backend='{self.backend}', context='{self.context}')"
@@ -523,7 +572,7 @@ class Parallel:
 def write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     with_neighbors: bool = False,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
@@ -533,13 +582,39 @@ def write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
+    """Splits and writes data into municipality-specific files.
+
+    Args:
+        data: Path to the data file or a GeoDataFrame.
+        out_folder: Path to the output directory where the municipality data
+            is written.
+        municipalities: Either a sequence of municipality numbers or a GeoDataFrame
+            of municipality polygons and municipality numbers in the column 'muni_number_col'.
+            Defaults to None.
+        with_neighbors: If True, include data from neighboring municipalities
+            for each municipality.
+        muni_number_col: Column name for municipality codes in 'municipalities'.
+        file_type: Format of the output file.
+        func: Function to process data before writing.
+        write_empty: If True, write empty files for municipalities without data.
+        clip: If True, clip the data to municipality boundaries. If False
+            the data is spatial joined.
+        max_rows_per_chunk: Maximum number of rows in each processed chunk.
+        processes_in_clip: Number of processes to use for clipping.
+        strict: If True (default) and the data has a municipality column,
+            all municipality numbers in 'data' must be present in 'municipalities'.
+        verbose: Whether to print during execution.
+
+    Returns:
+        None. The function writes files directly.
+    """
     write_func = (
         _write_neighbor_municipality_data
         if with_neighbors
         else _write_municipality_data
     )
-
     return write_func(
         data=data,
         out_folder=out_folder,
@@ -552,27 +627,34 @@ def write_municipality_data(
         max_rows_per_chunk=max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        verbose=verbose,
     )


-def _validate_data(data: str | list[str]) -> str:
-    if isinstance(data, (str, Path)):
-        return data
+def _validate_data(
+    data: str | list[str] | DataFrame | GeoDataFrame,
+) -> DataFrame | GeoDataFrame:
     if hasattr(data, "__iter__") and len(data) == 1:
-        return data[0]
-    elif not isinstance(data, GeoDataFrame):
-        raise TypeError("'data' Must be a file path or a GeoDataFrame. Got", type(data))
+        data = data[0]
+    if isinstance(data, (str, Path)):
+        try:
+            return read_geopandas(str(data))
+        except ValueError as e:
+            try:
+                return read_pandas(str(data))
+            except ValueError as e2:
+                raise e.__class__(e, data) from e2
     return data


-def _get_out_path(out_folder, muni, file_type):
+def _get_out_path(out_folder: str | Path, muni: str, file_type: str) -> str:
     return str(Path(out_folder) / f"{muni}.{file_type.strip('.')}")


 def _write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
     func: Callable | None = None,
@@ -581,21 +663,15 @@ def _write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
-
-    if isinstance(data, (str, Path)):
-        try:
-            gdf = read_geopandas(str(data))
-        except ValueError as e:
-            try:
-                gdf = read_pandas(str(data))
-            except ValueError:
-                raise e.__class__(e, data)
-    elif isinstance(data, DataFrame):
-        gdf = data
+    if verbose:
+        to_print = out_folder
+        print(to_print)
     else:
-        raise TypeError(type(data))
+        to_print = None
+
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -608,22 +684,29 @@ def _write_municipality_data(
         max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

-    for muni in municipalities[muni_number_col]:
-        print(muni)
-        out = _get_out_path(out_folder, muni, file_type)
-
-        gdf_muni = gdf.loc[gdf[muni_number_col] == muni]
-
-        if not len(gdf_muni):
-            if write_empty:
-                gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
-                gdf_muni["geometry"] = None
-                write_pandas(gdf_muni, out)
-            continue
-
-        write_geopandas(gdf_muni, out)
+    if municipalities is None:
+        muni_numbers = gdf[muni_number_col]
+    elif not isinstance(municipalities, DataFrame):
+        muni_numbers = municipalities
+    else:
+        muni_numbers = municipalities[muni_number_col]
+
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni,
+        muni_numbers,
+        kwargs=dict(
+            gdf=gdf,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )


 def _write_neighbor_municipality_data(
@@ -638,11 +721,15 @@ def _write_neighbor_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
+    if verbose:
+        to_print = out_folder
+        print("out_folder:", to_print)
+    else:
+        to_print = None

-    if isinstance(data, (str, Path)):
-        gdf = read_geopandas(str(data))
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -655,6 +742,7 @@ def _write_neighbor_municipality_data(
         max_rows_per_chunk,
         processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

     if municipalities.index.name != muni_number_col:
@@ -664,43 +752,97 @@ def _write_neighbor_municipality_data(
         municipalities, municipalities, max_distance=1
     )

-    for muni in municipalities.index:
-        out = _get_out_path(out_folder, muni, file_type)
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni_with_neighbors,
+        municipalities.index,
+        kwargs=dict(
+            gdf=gdf,
+            neighbor_munis=neighbor_munis,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )
+
+
+def _write_one_muni(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    gdf_muni = gdf.loc[gdf[muni_number_col] == muni_number]

-        muni_and_neighbors = neighbor_munis.loc[[muni]]
-        gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]
+    if not len(gdf_muni):
+        if write_empty:
+            gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
+            gdf_muni["geometry"] = None
+            write_pandas(gdf_muni, out)
+        return

-        if not len(gdf_neighbor):
-            if write_empty:
-                gdf_neighbor["geometry"] = gdf_neighbor["geometry"].astype(str)
-                write_pandas(gdf_neighbor, out)
-            continue
+    write_geopandas(gdf_muni, out)

-        write_geopandas(gdf_neighbor, out)
+
+def _write_one_muni_with_neighbors(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    neighbor_munis: Series,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    muni_and_neighbors: Series = neighbor_munis.loc[[muni_number]]
+    gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]
+
+    if not len(gdf_neighbor):
+        if write_empty:
+            gdf_neighbor = gdf_neighbor.drop(columns="geometry", errors="ignore")
+            gdf_neighbor["geometry"] = None
+            write_pandas(gdf_neighbor, out)
+        return
+
+    write_geopandas(gdf_neighbor, out)


 def _fix_missing_muni_numbers(
-    gdf,
-    municipalities,
-    muni_number_col,
-    clip,
-    max_rows_per_chunk,
-    processes_in_clip,
-    strict,
-):
+    gdf: GeoDataFrame,
+    municipalities: GeoDataFrame,
+    muni_number_col: str,
+    clip: bool,
+    max_rows_per_chunk: int,
+    processes_in_clip: int,
+    strict: bool,
+    to_print: str,
+) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
             return gdf
         if diffs := set(gdf[muni_number_col].values).difference(
             set(municipalities[muni_number_col].values)
         ):
-            message = (
-                f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore"
-            )
+            message = f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore."
             if strict:
                 raise ValueError(message)
             else:
-                warnings.warn(message)
+                warnings.warn(message, stacklevel=1)
         return gdf

     if municipalities is None:
@@ -717,7 +859,10 @@ def _fix_missing_muni_numbers(
             "GeoDataFrame to clip the geometries by."
         )

-    municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    try:
+        municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    except Exception as e:
+        raise e.__class__(e, to_print) from e

     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
@@ -732,6 +877,7 @@ def _fix_missing_muni_numbers(
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
             processes=processes_in_clip,
             max_rows_per_chunk=max_rows_per_chunk,
+            to_print=to_print,
         )

         return pd.concat([notna, notna_anymore], ignore_index=True)
@@ -744,25 +890,42 @@ def _fix_missing_muni_numbers(
         municipalities[[muni_number_col, municipalities._geometry_column_name]],
         processes=processes_in_clip,
         max_rows_per_chunk=max_rows_per_chunk,
+        to_print=to_print,
     )


 def parallel_overlay(
     df1: GeoDataFrame,
     df2: GeoDataFrame,
-    # muni_number_col: str,
     processes: int,
     max_rows_per_chunk: int,
     backend: str = "loky",
+    to_print: str | None = None,
     **kwargs,
 ) -> GeoDataFrame:
-    # df2 = df2[[muni_number_col, df2._geometry_column_name]]
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.

+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
     if len(df1) < max_rows_per_chunk:
         return clean_overlay(df1, df2, **kwargs)

-    # df2 = df2.dissolve(by=muni_number_col, as_index=False)
-
     n_chunks = len(df1) // max_rows_per_chunk
     chunks = np.array_split(np.arange(len(df1)), n_chunks)

@@ -778,26 +941,50 @@ def parallel_overlay(
     out = Parallel(processes, backend=backend).map(
         _clean_intersection,
         df1_chunked,
-        args=(df2,),
+        args=(df2, to_print) if to_print else (df2,),
     )
     return pd.concat(out, ignore_index=True)


-def _clean_intersection(df1, df2):
-    print(len(df1))
+def _clean_intersection(
+    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str | None = None
+) -> GeoDataFrame:
+    print(to_print, "- intersection chunk len:", len(df1))
     return clean_overlay(df1, df2, how="intersection")


 def chunkwise(
     func: Callable,
-    df: GeoDataFrame,
+    df: GeoDataFrame | pd.DataFrame,
     max_rows_per_chunk: int = 150_000,
-    n_chunks: int = None,
+    n_chunks: int | None = None,
     args: tuple | None = None,
     kwargs: dict | None = None,
     n_jobs: int = 1,
     backend: str = "loky",
-) -> GeoDataFrame:
+) -> GeoDataFrame | pd.DataFrame:
+    """Run a function in parallel on chunks of a DataFrame.
+
+    This method is used to process large (Geo)DataFrames in manageable pieces,
+    optionally in parallel.
+
+    Args:
+        func: The function to apply to each chunk. This function must accept a DataFrame as
+            its first argument and return a DataFrame.
+        df: The DataFrame to be chunked and processed.
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
+        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
+            calculated based on 'max_rows_per_chunk'.
+        args: Additional positional arguments to pass to 'func'.
+        kwargs: Keyword arguments to pass to 'func'.
+        n_jobs: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
+
+    Returns:
+        GeoDataFrame: A GeoDataFrame resulting from concatenating the results of applying 'func'
+            to each chunk of the original GeoDataFrame.
+
+    """
     if len(df) < max_rows_per_chunk:
         return func(df, *args, **kwargs)

@@ -815,3 +1002,26 @@ def chunkwise(
             kwargs=kwargs,
         )
     return pd.concat(out, ignore_index=True)
+
+
+def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict:
+    if not isinstance(args, tuple):
+        raise TypeError("args should be a tuple (it should not be unpacked with *)")
+    argnames = inspect.getfullargspec(func).args[index_start:]
+    return {name: value for value, name in zip(args, argnames, strict=False)}
+
+
+def _try_to_read_geopandas(path: str, **kwargs) -> GeoDataFrame | DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_geopandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
+
+
+def _try_to_read_pandas(path: str, **kwargs) -> DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_pandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
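
For context, a minimal usage sketch of the Parallel API as changed in this diff, based only on the signatures shown above. The worker function x2 and the commented file paths are hypothetical, and sg.Parallel assumes the class is exported at the package level.

    import sgis as sg


    def x2(x):
        # Worker functions must be importable from a module, not defined in an
        # interactive session (see _validate_execution in the diff above).
        return x * 2


    if __name__ == "__main__":
        # In 1.0.2, chunksize is set on the instance instead of being passed
        # to map()/starmap() as in 1.0.1.
        p = sg.Parallel(processes=3, chunksize=1)
        print(p.map(x2, [1, 2, 3]))  # [2, 4, 6]

        # With strict=False, read_pandas/read_geopandas now let each worker
        # return None for missing files instead of pre-filtering with exists().
        # (hypothetical paths)
        # dfs = p.read_pandas(["file1.parquet", "file2.parquet"], strict=False, concat=False)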