ssb-sgis 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sgis/__init__.py +107 -121
  2. sgis/exceptions.py +5 -3
  3. sgis/geopandas_tools/__init__.py +1 -0
  4. sgis/geopandas_tools/bounds.py +86 -47
  5. sgis/geopandas_tools/buffer_dissolve_explode.py +62 -39
  6. sgis/geopandas_tools/centerlines.py +53 -44
  7. sgis/geopandas_tools/cleaning.py +87 -104
  8. sgis/geopandas_tools/conversion.py +164 -107
  9. sgis/geopandas_tools/duplicates.py +33 -19
  10. sgis/geopandas_tools/general.py +84 -52
  11. sgis/geopandas_tools/geometry_types.py +24 -10
  12. sgis/geopandas_tools/neighbors.py +23 -11
  13. sgis/geopandas_tools/overlay.py +136 -53
  14. sgis/geopandas_tools/point_operations.py +11 -10
  15. sgis/geopandas_tools/polygon_operations.py +53 -61
  16. sgis/geopandas_tools/polygons_as_rings.py +121 -78
  17. sgis/geopandas_tools/sfilter.py +17 -17
  18. sgis/helpers.py +116 -58
  19. sgis/io/dapla_functions.py +32 -23
  20. sgis/io/opener.py +13 -6
  21. sgis/io/read_parquet.py +2 -2
  22. sgis/maps/examine.py +55 -28
  23. sgis/maps/explore.py +471 -112
  24. sgis/maps/httpserver.py +12 -12
  25. sgis/maps/legend.py +285 -134
  26. sgis/maps/map.py +248 -129
  27. sgis/maps/maps.py +123 -119
  28. sgis/maps/thematicmap.py +260 -94
  29. sgis/maps/tilesources.py +3 -8
  30. sgis/networkanalysis/_get_route.py +5 -4
  31. sgis/networkanalysis/_od_cost_matrix.py +44 -1
  32. sgis/networkanalysis/_points.py +10 -4
  33. sgis/networkanalysis/_service_area.py +5 -2
  34. sgis/networkanalysis/closing_network_holes.py +22 -64
  35. sgis/networkanalysis/cutting_lines.py +58 -46
  36. sgis/networkanalysis/directednetwork.py +16 -8
  37. sgis/networkanalysis/finding_isolated_networks.py +6 -5
  38. sgis/networkanalysis/network.py +15 -13
  39. sgis/networkanalysis/networkanalysis.py +79 -61
  40. sgis/networkanalysis/networkanalysisrules.py +21 -17
  41. sgis/networkanalysis/nodes.py +2 -3
  42. sgis/networkanalysis/traveling_salesman.py +6 -3
  43. sgis/parallel/parallel.py +372 -142
  44. sgis/raster/base.py +9 -3
  45. sgis/raster/cube.py +331 -213
  46. sgis/raster/cubebase.py +15 -29
  47. sgis/raster/image_collection.py +2560 -0
  48. sgis/raster/indices.py +17 -12
  49. sgis/raster/raster.py +356 -275
  50. sgis/raster/sentinel_config.py +104 -0
  51. sgis/raster/zonal.py +38 -14
  52. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/LICENSE +1 -1
  53. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/METADATA +87 -16
  54. ssb_sgis-1.0.3.dist-info/RECORD +61 -0
  55. {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.3.dist-info}/WHEEL +1 -1
  56. sgis/raster/bands.py +0 -48
  57. sgis/raster/gradient.py +0 -78
  58. sgis/raster/methods_as_functions.py +0 -124
  59. sgis/raster/torchgeo.py +0 -150
  60. ssb_sgis-1.0.1.dist-info/RECORD +0 -63
sgis/parallel/parallel.py CHANGED
@@ -2,12 +2,14 @@ import functools
 import inspect
 import itertools
 import multiprocessing
+import pickle
 import warnings
-from collections.abc import Callable, Collection, Iterable
+from collections.abc import Callable
+from collections.abc import Collection
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any

-
 try:
     import dapla as dp
 except ImportError:
@@ -16,18 +18,19 @@ except ImportError:
 import joblib
 import numpy as np
 import pandas as pd
-from geopandas import GeoDataFrame, GeoSeries
+from geopandas import GeoDataFrame
 from pandas import DataFrame
-from shapely.geometry import MultiPolygon, Polygon
+from pandas import Series

-from ..geopandas_tools.general import clean_clip, clean_geoms
 from ..geopandas_tools.neighbors import get_neighbor_indices
 from ..geopandas_tools.overlay import clean_overlay
-from ..helpers import LocalFunctionError, dict_zip, dict_zip_union, in_jupyter
-
+from ..helpers import LocalFunctionError
+from ..helpers import dict_zip_union
+from ..helpers import in_jupyter

 try:
-    from ..io.dapla_functions import exists, read_geopandas, write_geopandas
+    from ..io.dapla_functions import read_geopandas
+    from ..io.dapla_functions import write_geopandas

     # from ..io.write_municipality_data import write_municipality_data
 except ImportError:
@@ -35,16 +38,13 @@ except ImportError:


 try:
-    from dapla import read_pandas, write_pandas
+    from dapla import read_pandas
+    from dapla import write_pandas
+    from dapla.gcs import GCSFileSystem
 except ImportError:
-    pass
-

-def turn_args_into_kwargs(func: Callable, args: tuple, index_start: int):
-    if not isinstance(args, tuple):
-        raise TypeError("args should be a tuple (it should not be unpacked with *)")
-    argnames = inspect.getfullargspec(func).args[index_start:]
-    return {name: value for value, name in zip(args, argnames, strict=False)}
+    class GCSFileSystem:
+        """Placeholder."""


 class Parallel:
@@ -87,10 +87,23 @@ class Parallel:
         backend: str = "multiprocessing",
         context: str = "spawn",
         maxtasksperchild: int = 10,
+        chunksize: int = 1,
         **kwargs,
-    ):
+    ) -> None:
+        """Initialize a Parallel instance with specified settings for parallel execution.
+
+        Args:
+            processes: Number of parallel processes. Set to 1 to run without parallelization.
+            backend: The backend to use for parallel execution. Defaults to 'multiprocessing'.
+            context: The context setting for multiprocessing. Defaults to 'spawn'.
+            maxtasksperchild: The maximum number of tasks a worker process can complete
+                before it is replaced. Defaults to 10.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
+            **kwargs: Additional keyword arguments passed to the underlying parallel execution backend.
+        """
         self.processes = int(processes)
         self.maxtasksperchild = maxtasksperchild
+        self.chunksize = chunksize
         self.backend = backend
         self.context = context
         self.kwargs = kwargs
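A minimal usage sketch of the changed constructor, where chunksize is now configured on the instance instead of per map()/starmap() call (the function and data below are illustrative, and Parallel is assumed to be importable from the package root):

    import sgis as sg

    def double(x):
        return x * 2

    # chunksize is now an attribute of the instance, reused by map() and starmap()
    p = sg.Parallel(processes=3, backend="multiprocessing", context="spawn", chunksize=1)
    results = p.map(double, [1, 2, 3])
    print(results)  # [2, 4, 6]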
@@ -103,7 +116,6 @@ class Parallel:
         iterable: Collection,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel with items of an iterable as 0th arguemnt.

@@ -111,7 +123,7 @@
             func: Function to be run.
             iterable: An iterable where each item will be passed to func as
                 0th positional argument.
-            Args: Positional arguments passed to 'func' starting from the 1st argument.
+            args: Positional arguments passed to 'func' starting from the 1st argument.
                 The 0th argument will be reserved for the values of 'iterable'.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -120,8 +132,8 @@
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
-        --------
+        Examples:
+        ---------
         Multiply each list element by 2.

         >>> iterable = [1, 2, 3]
@@ -159,21 +171,20 @@
         ... print(results)
         [2, 4, 6]
         """
-
         if args:
             # start at index 1, meaning the 0th argument (the iterable) is still available
-            args_as_kwargs = turn_args_into_kwargs(func, args, index_start=1)
+            args_as_kwargs = _turn_args_into_kwargs(func, args, index_start=1)
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

         func_with_kwargs = functools.partial(func, **kwargs)

         if self.processes == 1:
-            return list(map(func_with_kwargs, iterable))
+            return [func_with_kwargs(item) for item in iterable]

         iterable = list(iterable)

@@ -182,21 +193,42 @@

         if not processes:
             return []
+        elif processes == 1:
+            return [func_with_kwargs(item) for item in iterable]

-        if self.backend == "multiprocessing":
-            with multiprocessing.get_context(self.context).Pool(
-                processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
-            ) as pool:
-                try:
-                    return pool.map(func_with_kwargs, iterable, chunksize=chunksize)
-                except Exception as e:
-                    pool.terminate()
-                    raise e
+        try:
+            if self.backend == "multiprocessing":
+                with multiprocessing.get_context(self.context).Pool(
+                    processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
+                ) as pool:
+                    try:
+                        return pool.map(
+                            func_with_kwargs, iterable, chunksize=self.chunksize
+                        )
+                    except Exception as e:
+                        pool.terminate()
+                        raise e

-        with joblib.Parallel(
-            n_jobs=processes, backend=self.backend, **self.kwargs
-        ) as parallel:
-            return parallel(joblib.delayed(func)(item, **kwargs) for item in iterable)
+            with joblib.Parallel(
+                n_jobs=processes, backend=self.backend, **self.kwargs
+            ) as parallel:
+                return parallel(
+                    joblib.delayed(func)(item, **kwargs) for item in iterable
+                )
+        except pickle.PickleError as e:
+            unpicklable = []
+            for k, v in locals().items():
+                try:
+                    pickle.dumps(v)
+                except pickle.PickleError:
+                    unpicklable.append(k)
+                except TypeError:
+                    pass
+            if unpicklable:
+                raise pickle.PickleError(
+                    f"Cannot unpickle objects: {unpicklable}"
+                ) from e
+            raise e

     def starmap(
         self,
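The new outer try/except exists because the 'spawn' multiprocessing context pickles the function and its arguments before sending them to worker processes; when that fails, the added handler lists which local objects could not be pickled. A small sketch of the constraint (names are illustrative):

    import sgis as sg

    def add_one(x):  # a module-level function pickles fine
        return x + 1

    p = sg.Parallel(processes=2)
    p.map(add_one, [1, 2, 3])  # -> [2, 3, 4]

    # A lambda or locally defined closure typically cannot be pickled under the
    # 'spawn' context; this is the kind of failure the new PickleError handler
    # is meant to explain instead of only re-raising.
    # p.map(lambda x: x + 1, [1, 2, 3])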
@@ -204,7 +236,6 @@
         iterable: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel where items of the iterable are unpacked.

@@ -215,7 +246,7 @@
             func: Function to be run.
             iterable: An iterable of iterables, where each item will be
                 unpacked as positional argument to the function.
-            Args: Positional arguments passed to 'func' starting at argument position
+            args: Positional arguments passed to 'func' starting at argument position
                 n + 1, where n is the length of the iterables inside the iterable.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -224,8 +255,8 @@
             A list of the return values of the function, one for each item in
             'iterable'.

-        Examples
-        --------
+        Examples:
+        ---------
         Multiply each list element by 2.

         >>> iterable = [(1, 2), (2, 3), (3, 4)]
@@ -262,13 +293,13 @@
         if args:
             # starting the count at the length of the iterables inside the iterables
             iterable = list(iterable)
-            args_as_kwargs = turn_args_into_kwargs(
+            args_as_kwargs = _turn_args_into_kwargs(
                 func, args, index_start=len(iterable[0])
             )
         else:
             args_as_kwargs = {}

-        self.validate_execution(func)
+        self._validate_execution(func)

         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs

@@ -290,7 +321,9 @@
                 processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
             ) as pool:
                 try:
-                    return pool.starmap(func_with_kwargs, iterable, chunksize=chunksize)
+                    return pool.starmap(
+                        func_with_kwargs, iterable, chunksize=self.chunksize
+                    )
                 except Exception as e:
                     pool.terminate()
                     raise e
@@ -320,10 +353,10 @@
         Returns:
             A DataFrame, or a list of DataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-
-        res = self.map(dp.read_pandas, files, kwargs=kwargs)
+        if strict:
+            res = self.map(read_pandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_pandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -342,14 +375,19 @@
             concat: Whether to concat the results to a GeoDataFrame.
             ignore_index: Defaults to True.
             strict: If True (default), all files must exist.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
             **kwargs: Keyword arguments passed to sgis.read_geopandas.

         Returns:
             A GeoDataFrame, or a list of GeoDataFrames if concat is False.
         """
-        if not strict:
-            files = [file for file in files if exists(file)]
-        res = self.map(read_geopandas, files, kwargs=kwargs)
+        if "file_system" not in kwargs:
+            kwargs["file_system"] = dp.FileClient.get_gcs_file_system()
+
+        if strict:
+            res = self.map(read_geopandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_geopandas, files, kwargs=kwargs)

         return pd.concat(res, ignore_index=ignore_index) if concat else res

@@ -367,10 +405,14 @@
         clip: bool = True,
         max_rows_per_chunk: int = 150_000,
         processes_in_clip: int = 1,
-    ):
+        verbose: bool = True,
+    ) -> None:
         """Split multiple datasets into municipalities and write as separate files.

         The files will be named as the municipality number.
+        Each dataset in 'in_data' is intersected with 'municipalities'
+        in parallel. The intersections themselves can also be run in parallel
+        with the 'processes_in_clip' argument.

         Args:
             in_data: Dictionary with dataset names as keys and file paths or
@@ -397,7 +439,12 @@
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-            clip: If True (default), the data will be clipped.
+            clip: If True (default), the data will be clipped. If False, the data will
+                be spatial joined.
+            max_rows_per_chunk: Number of rows per data chunk for processing.
+            processes_in_clip: Number of parallel processes for data clipping.
+            verbose: Whether to print during execution.
+
         """
         shared_kwds = {
             "municipalities": municipalities,
@@ -409,6 +456,7 @@
             "max_rows_per_chunk": max_rows_per_chunk,
             "processes_in_clip": processes_in_clip,
             "strict": strict,
+            "verbose": verbose,
         }

         if isinstance(out_data, (str, Path)):
@@ -417,10 +465,12 @@
         if funcdict is None:
             funcdict = {}

-        zip_func = dict_zip if strict else dict_zip_union
+        fs = dp.FileClient.get_gcs_file_system()

-        for _, data, folder, postfunc in zip_func(in_data, out_data, funcdict):
-            if data is None:
+        for _, data, folder, postfunc in dict_zip_union(in_data, out_data, funcdict):
+            if data is None or (
+                not strict and isinstance(data, (str | Path)) and not fs.exists(data)
+            ):
                 continue

             kwds = shared_kwds | {
@@ -439,15 +489,33 @@
         df: GeoDataFrame,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        max_rows_per_chunk: int = 150_000,
-        n_chunks: int = None,
-        concat: bool = False,
+        n_chunks: int | None = None,
+        max_rows_per_chunk: int | None = None,
+        concat: bool = True,
     ) -> GeoDataFrame:
-        if len(df) < max_rows_per_chunk:
-            return func(df, *args, **kwargs)
+        """Run a function in parallel on chunks of a (Geo)DataFrame.

-        if n_chunks is None:
-            n_chunks = len(df) // max_rows_per_chunk
+        Args:
+            func: Function to run chunkwise. It should take
+                a (Geo)DataFrame as first argument.
+            df: (Geo)DataFrame to split in n_chunks and passed
+                as first argument to 'func'.
+            args: Positional arguments in 'func' after the DataFrame.
+            kwargs: Additional keyword arguments in 'func'.
+            n_chunks: Optionally set number of chunks to split
+                'df' into. Defaults to the 'processes' attribute
+                of the Parallel instance.
+            max_rows_per_chunk: Alternatively decide number of chunks
+                by a maximum number of rows per chunk.
+            concat: Whether to use pd.concat on the results.
+                Defaults to True.
+        """
+        if max_rows_per_chunk is None and n_chunks is None:
+            n_chunks: int = self.processes
+        elif n_chunks is None:
+            n_chunks: int = len(df) // max_rows_per_chunk
+        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
+            return func(df, *args, **kwargs)

         chunks = np.array_split(np.arange(len(df)), n_chunks)

@@ -464,7 +532,7 @@
         else:
             return out

-    def validate_execution(self, func):
+    def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.

         Raising Exception to avoid confusion.
@@ -478,8 +546,8 @@
             raise LocalFunctionError(func)

     @staticmethod
-    def _validate_kwargs(kwargs) -> dict:
-        """Make sure kwargs is a dict (not ** unpacked or None)"""
+    def _validate_kwargs(kwargs: dict) -> dict:
+        """Make sure kwargs is a dict (not ** unpacked or None)."""
         if kwargs is None:
             kwargs = {}
         elif not isinstance(kwargs, dict):
@@ -487,7 +555,7 @@
         return kwargs

     def _execute(self) -> list[Any]:
-        [self.validate_execution(func) for func in self.funcs]
+        [self._validate_execution(func) for func in self.funcs]

         if self.processes == 1:
             return [func() for func in self.funcs]
@@ -513,7 +581,8 @@
             results = [pool.apply_async(func) for func in self.funcs]
             return [result.get() for result in results]

-    def __repr__(self):
+    def __repr__(self) -> str:
+        """String representation."""
         return (
             f"{self.__class__.__name__}(processes={self.processes}, "
             f"backend='{self.backend}', context='{self.context}')"
@@ -523,7 +592,7 @@
 def write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     with_neighbors: bool = False,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
@@ -533,13 +602,39 @@ def write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
+    """Splits and writes data into municipality-specific files.
+
+    Args:
+        data: Path to the data file or a GeoDataFrame.
+        out_folder: Path to the output directory where the municipality data
+            is written.
+        municipalities: Either a sequence of municipality numbers or a GeoDataFrame
+            of municipality polygons and municipality numbers in the column 'muni_number_col'.
+            Defaults to None.
+        with_neighbors: If True, include data from neighboring municipalities
+            for each municipality.
+        muni_number_col: Column name for municipality codes in 'municipalities'.
+        file_type: Format of the output file.
+        func: Function to process data before writing.
+        write_empty: If True, write empty files for municipalities without data.
+        clip: If True, clip the data to municipality boundaries. If False
+            the data is spatial joined.
+        max_rows_per_chunk: Maximum number of rows in each processed chunk.
+        processes_in_clip: Number of processes to use for clipping.
+        strict: If True (default) and the data has a municipality column,
+            all municipality numbers in 'data' must be present in 'municipalities'.
+        verbose: Whether to print during execution.
+
+    Returns:
+        None. The function writes files directly.
+    """
     write_func = (
         _write_neighbor_municipality_data
         if with_neighbors
         else _write_municipality_data
     )
-
     return write_func(
         data=data,
         out_folder=out_folder,
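A hedged call sketch for the expanded signature (the paths and the municipality GeoDataFrame are placeholders):

    from sgis.parallel.parallel import write_municipality_data

    write_municipality_data(
        data="gs://bucket/input/roads.parquet",         # hypothetical path
        out_folder="gs://bucket/output/roads_by_muni",  # hypothetical path
        municipalities=municipality_polygons,           # GeoDataFrame with a "KOMMUNENR" column
        muni_number_col="KOMMUNENR",
        file_type="parquet",
        clip=True,
        verbose=True,
    )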
@@ -552,27 +647,34 @@
         max_rows_per_chunk=max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        verbose=verbose,
     )


-def _validate_data(data: str | list[str]) -> str:
-    if isinstance(data, (str, Path)):
-        return data
+def _validate_data(
+    data: str | list[str] | DataFrame | GeoDataFrame,
+) -> DataFrame | GeoDataFrame:
     if hasattr(data, "__iter__") and len(data) == 1:
-        return data[0]
-    elif not isinstance(data, GeoDataFrame):
-        raise TypeError("'data' Must be a file path or a GeoDataFrame. Got", type(data))
+        data = data[0]
+    if isinstance(data, (str, Path)):
+        try:
+            return read_geopandas(str(data))
+        except ValueError as e:
+            try:
+                return read_pandas(str(data))
+            except ValueError as e2:
+                raise e.__class__(e, data) from e2
     return data


-def _get_out_path(out_folder, muni, file_type):
+def _get_out_path(out_folder: str | Path, muni: str, file_type: str) -> str:
     return str(Path(out_folder) / f"{muni}.{file_type.strip('.')}")


 def _write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
     func: Callable | None = None,
@@ -581,21 +683,15 @@ def _write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
-
-    if isinstance(data, (str, Path)):
-        try:
-            gdf = read_geopandas(str(data))
-        except ValueError as e:
-            try:
-                gdf = read_pandas(str(data))
-            except ValueError:
-                raise e.__class__(e, data)
-    elif isinstance(data, DataFrame):
-        gdf = data
+    if verbose:
+        to_print = out_folder
+        print(to_print)
     else:
-        raise TypeError(type(data))
+        to_print = None
+
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -608,22 +704,29 @@
         max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

-    for muni in municipalities[muni_number_col]:
-        print(muni)
-        out = _get_out_path(out_folder, muni, file_type)
-
-        gdf_muni = gdf.loc[gdf[muni_number_col] == muni]
-
-        if not len(gdf_muni):
-            if write_empty:
-                gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
-                gdf_muni["geometry"] = None
-                write_pandas(gdf_muni, out)
-            continue
-
-        write_geopandas(gdf_muni, out)
+    if municipalities is None:
+        muni_numbers = gdf[muni_number_col]
+    elif not isinstance(municipalities, DataFrame):
+        muni_numbers = municipalities
+    else:
+        muni_numbers = municipalities[muni_number_col]
+
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni,
+        muni_numbers,
+        kwargs=dict(
+            gdf=gdf,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )


 def _write_neighbor_municipality_data(
@@ -638,11 +741,15 @@ def _write_neighbor_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-    data = _validate_data(data)
+    if verbose:
+        to_print = out_folder
+        print("out_folder:", to_print)
+    else:
+        to_print = None

-    if isinstance(data, (str, Path)):
-        gdf = read_geopandas(str(data))
+    gdf = _validate_data(data)

     if func is not None:
         gdf = func(gdf)
@@ -655,6 +762,7 @@
         max_rows_per_chunk,
         processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )

     if municipalities.index.name != muni_number_col:
@@ -664,43 +772,97 @@
         municipalities, municipalities, max_distance=1
     )

-    for muni in municipalities.index:
-        out = _get_out_path(out_folder, muni, file_type)
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni_with_neighbors,
+        municipalities.index,
+        kwargs=dict(
+            gdf=gdf,
+            neighbor_munis=neighbor_munis,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )

-        muni_and_neighbors = neighbor_munis.loc[[muni]]
-        gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]

-        if not len(gdf_neighbor):
-            if write_empty:
-                gdf_neighbor["geometry"] = gdf_neighbor["geometry"].astype(str)
-                write_pandas(gdf_neighbor, out)
-            continue
+def _write_one_muni(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    gdf_muni = gdf.loc[gdf[muni_number_col] == muni_number]
+
+    if not len(gdf_muni):
+        if write_empty:
+            gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
+            gdf_muni["geometry"] = None
+            write_pandas(gdf_muni, out)
+        return

-        write_geopandas(gdf_neighbor, out)
+    write_geopandas(gdf_muni, out)
+
+
+def _write_one_muni_with_neighbors(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    neighbor_munis: Series,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    muni_and_neighbors: Series = neighbor_munis.loc[[muni_number]]
+    gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]
+
+    if not len(gdf_neighbor):
+        if write_empty:
+            gdf_neighbor = gdf_neighbor.drop(columns="geometry", errors="ignore")
+            gdf_neighbor["geometry"] = None
+            write_pandas(gdf_neighbor, out)
+        return
+
+    write_geopandas(gdf_neighbor, out)


 def _fix_missing_muni_numbers(
-    gdf,
-    municipalities,
-    muni_number_col,
-    clip,
-    max_rows_per_chunk,
-    processes_in_clip,
-    strict,
-):
+    gdf: GeoDataFrame,
+    municipalities: GeoDataFrame,
+    muni_number_col: str,
+    clip: bool,
+    max_rows_per_chunk: int,
+    processes_in_clip: int,
+    strict: bool,
+    to_print: str,
+) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
             return gdf
         if diffs := set(gdf[muni_number_col].values).difference(
             set(municipalities[muni_number_col].values)
         ):
-            message = (
-                f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore"
-            )
+            message = f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore."
             if strict:
                 raise ValueError(message)
             else:
-                warnings.warn(message)
+                warnings.warn(message, stacklevel=1)
             return gdf

     if municipalities is None:
@@ -717,7 +879,10 @@ def _fix_missing_muni_numbers(
             "GeoDataFrame to clip the geometries by."
         )

-    municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    try:
+        municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    except Exception as e:
+        raise e.__class__(e, to_print) from e

     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
@@ -732,6 +897,7 @@
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
             processes=processes_in_clip,
             max_rows_per_chunk=max_rows_per_chunk,
+            to_print=to_print,
         )

         return pd.concat([notna, notna_anymore], ignore_index=True)
@@ -744,25 +910,42 @@
         municipalities[[muni_number_col, municipalities._geometry_column_name]],
         processes=processes_in_clip,
         max_rows_per_chunk=max_rows_per_chunk,
+        to_print=to_print,
     )


 def parallel_overlay(
     df1: GeoDataFrame,
     df2: GeoDataFrame,
-    # muni_number_col: str,
     processes: int,
     max_rows_per_chunk: int,
     backend: str = "loky",
+    to_print: str | None = None,
     **kwargs,
 ) -> GeoDataFrame:
-    # df2 = df2[[muni_number_col, df2._geometry_column_name]]
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.

+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
     if len(df1) < max_rows_per_chunk:
         return clean_overlay(df1, df2, **kwargs)

-    # df2 = df2.dissolve(by=muni_number_col, as_index=False)
-
     n_chunks = len(df1) // max_rows_per_chunk
     chunks = np.array_split(np.arange(len(df1)), n_chunks)

@@ -778,26 +961,50 @@
     out = Parallel(processes, backend=backend).map(
         _clean_intersection,
         df1_chunked,
-        args=(df2,),
+        args=(df2, to_print) if to_print else (df2,),
     )
     return pd.concat(out, ignore_index=True)


-def _clean_intersection(df1, df2):
-    print(len(df1))
+def _clean_intersection(
+    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str = ""
+) -> GeoDataFrame:
+    print(to_print, "- intersection chunk len:", len(df1))
     return clean_overlay(df1, df2, how="intersection")


 def chunkwise(
     func: Callable,
-    df: GeoDataFrame,
+    df: GeoDataFrame | pd.DataFrame,
     max_rows_per_chunk: int = 150_000,
-    n_chunks: int = None,
+    n_chunks: int | None = None,
     args: tuple | None = None,
     kwargs: dict | None = None,
     n_jobs: int = 1,
     backend: str = "loky",
-) -> GeoDataFrame:
+) -> GeoDataFrame | pd.DataFrame:
+    """Run a function in parallel on chunks of a DataFrame.
+
+    This method is used to process large (Geo)DataFrames in manageable pieces,
+    optionally in parallel.
+
+    Args:
+        func: The function to apply to each chunk. This function must accept a DataFrame as
+            its first argument and return a DataFrame.
+        df: The DataFrame to be chunked and processed.
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
+        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
+            calculated based on 'max_rows_per_chunk'.
+        args: Additional positional arguments to pass to 'func'.
+        kwargs: Keyword arguments to pass to 'func'.
+        n_jobs: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
+
+    Returns:
+        GeoDataFrame: A GeoDataFrame resulting from concatenating the results of applying 'func'
+            to each chunk of the original GeoDataFrame.
+
+    """
     if len(df) < max_rows_per_chunk:
         return func(df, *args, **kwargs)

@@ -815,3 +1022,26 @@
         kwargs=kwargs,
     )
     return pd.concat(out, ignore_index=True)
+
+
+def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict:
+    if not isinstance(args, tuple):
+        raise TypeError("args should be a tuple (it should not be unpacked with *)")
+    argnames = inspect.getfullargspec(func).args[index_start:]
+    return {name: value for value, name in zip(args, argnames, strict=False)}
+
+
+def _try_to_read_geopandas(path: str, **kwargs) -> GeoDataFrame | DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_geopandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
+
+
+def _try_to_read_pandas(path: str, **kwargs) -> DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_pandas(path, **kwargs)
+    except FileNotFoundError:
+        return None