ssb-sgis 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +97 -115
- sgis/exceptions.py +3 -1
- sgis/geopandas_tools/__init__.py +1 -0
- sgis/geopandas_tools/bounds.py +75 -38
- sgis/geopandas_tools/buffer_dissolve_explode.py +38 -34
- sgis/geopandas_tools/centerlines.py +53 -44
- sgis/geopandas_tools/cleaning.py +87 -104
- sgis/geopandas_tools/conversion.py +149 -101
- sgis/geopandas_tools/duplicates.py +31 -17
- sgis/geopandas_tools/general.py +76 -48
- sgis/geopandas_tools/geometry_types.py +21 -7
- sgis/geopandas_tools/neighbors.py +20 -8
- sgis/geopandas_tools/overlay.py +136 -53
- sgis/geopandas_tools/point_operations.py +9 -8
- sgis/geopandas_tools/polygon_operations.py +48 -56
- sgis/geopandas_tools/polygons_as_rings.py +121 -78
- sgis/geopandas_tools/sfilter.py +14 -14
- sgis/helpers.py +114 -56
- sgis/io/dapla_functions.py +32 -23
- sgis/io/opener.py +13 -6
- sgis/io/read_parquet.py +1 -1
- sgis/maps/examine.py +39 -26
- sgis/maps/explore.py +112 -66
- sgis/maps/httpserver.py +12 -12
- sgis/maps/legend.py +124 -65
- sgis/maps/map.py +66 -41
- sgis/maps/maps.py +31 -29
- sgis/maps/thematicmap.py +46 -33
- sgis/maps/tilesources.py +3 -8
- sgis/networkanalysis/_get_route.py +5 -4
- sgis/networkanalysis/_od_cost_matrix.py +44 -1
- sgis/networkanalysis/_points.py +10 -4
- sgis/networkanalysis/_service_area.py +5 -2
- sgis/networkanalysis/closing_network_holes.py +20 -62
- sgis/networkanalysis/cutting_lines.py +55 -43
- sgis/networkanalysis/directednetwork.py +15 -7
- sgis/networkanalysis/finding_isolated_networks.py +4 -3
- sgis/networkanalysis/network.py +15 -13
- sgis/networkanalysis/networkanalysis.py +72 -54
- sgis/networkanalysis/networkanalysisrules.py +20 -16
- sgis/networkanalysis/nodes.py +2 -3
- sgis/networkanalysis/traveling_salesman.py +5 -2
- sgis/parallel/parallel.py +337 -127
- sgis/raster/__init__.py +6 -0
- sgis/raster/base.py +9 -3
- sgis/raster/cube.py +280 -208
- sgis/raster/cubebase.py +15 -29
- sgis/raster/indices.py +3 -7
- sgis/raster/methods_as_functions.py +0 -124
- sgis/raster/raster.py +313 -127
- sgis/raster/torchgeo.py +58 -37
- sgis/raster/zonal.py +38 -13
- {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/LICENSE +1 -1
- {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/METADATA +87 -16
- ssb_sgis-1.0.2.dist-info/RECORD +61 -0
- {ssb_sgis-1.0.1.dist-info → ssb_sgis-1.0.2.dist-info}/WHEEL +1 -1
- sgis/raster/bands.py +0 -48
- sgis/raster/gradient.py +0 -78
- ssb_sgis-1.0.1.dist-info/RECORD +0 -63
sgis/parallel/parallel.py
CHANGED
@@ -3,11 +3,12 @@ import inspect
 import itertools
 import multiprocessing
 import warnings
-from collections.abc import Callable
+from collections.abc import Callable
+from collections.abc import Collection
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
 
-
 try:
     import dapla as dp
 except ImportError:
@@ -16,18 +17,19 @@ except ImportError:
 import joblib
 import numpy as np
 import pandas as pd
-from geopandas import GeoDataFrame
+from geopandas import GeoDataFrame
 from pandas import DataFrame
-from
+from pandas import Series
 
-from ..geopandas_tools.general import clean_clip, clean_geoms
 from ..geopandas_tools.neighbors import get_neighbor_indices
 from ..geopandas_tools.overlay import clean_overlay
-from ..helpers import LocalFunctionError
-
+from ..helpers import LocalFunctionError
+from ..helpers import dict_zip_union
+from ..helpers import in_jupyter
 
 try:
-    from ..io.dapla_functions import
+    from ..io.dapla_functions import read_geopandas
+    from ..io.dapla_functions import write_geopandas
 
     # from ..io.write_municipality_data import write_municipality_data
 except ImportError:
@@ -35,16 +37,13 @@ except ImportError:
 
 
 try:
-    from dapla import read_pandas
+    from dapla import read_pandas
+    from dapla import write_pandas
+    from dapla.gcs import GCSFileSystem
 except ImportError:
-    pass
 
-
-
-    if not isinstance(args, tuple):
-        raise TypeError("args should be a tuple (it should not be unpacked with *)")
-    argnames = inspect.getfullargspec(func).args[index_start:]
-    return {name: value for value, name in zip(args, argnames, strict=False)}
+    class GCSFileSystem:
+        """Placeholder."""
 
 
 class Parallel:
@@ -87,10 +86,23 @@ class Parallel:
         backend: str = "multiprocessing",
         context: str = "spawn",
         maxtasksperchild: int = 10,
+        chunksize: int = 1,
         **kwargs,
-    ):
+    ) -> None:
+        """Initialize a Parallel instance with specified settings for parallel execution.
+
+        Args:
+            processes: Number of parallel processes. Set to 1 to run without parallelization.
+            backend: The backend to use for parallel execution. Defaults to 'multiprocessing'.
+            context: The context setting for multiprocessing. Defaults to 'spawn'.
+            maxtasksperchild: The maximum number of tasks a worker process can complete
+                before it is replaced. Defaults to 10.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
+            **kwargs: Additional keyword arguments passed to the underlying parallel execution backend.
+        """
         self.processes = int(processes)
         self.maxtasksperchild = maxtasksperchild
+        self.chunksize = chunksize
         self.backend = backend
         self.context = context
         self.kwargs = kwargs
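1.0.2 moves `chunksize` from the individual `map`/`starmap` calls onto the instance. A minimal usage sketch, assuming `Parallel` is exported at the package root (the `double` helper is illustrative, not from the package):

```python
import sgis as sg


def double(x: int) -> int:
    # Defined at module level so the spawn context can pickle it.
    return x * 2


if __name__ == "__main__":
    # chunksize is now fixed per instance instead of per call.
    p = sg.Parallel(processes=3, backend="multiprocessing", chunksize=2)
    print(p.map(double, [1, 2, 3]))  # [2, 4, 6]
```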
@@ -103,7 +115,6 @@ class Parallel:
         iterable: Collection,
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel with items of an iterable as 0th arguemnt.
 
@@ -111,7 +122,7 @@
             func: Function to be run.
             iterable: An iterable where each item will be passed to func as
                 0th positional argument.
-
+            args: Positional arguments passed to 'func' starting from the 1st argument.
                 The 0th argument will be reserved for the values of 'iterable'.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -120,7 +131,7 @@
             A list of the return values of the function, one for each item in
             'iterable'.
 
-        Examples
+        Examples:
         --------
         Multiply each list element by 2.
 
@@ -159,14 +170,13 @@
         ... print(results)
         [2, 4, 6]
         """
-
         if args:
             # start at index 1, meaning the 0th argument (the iterable) is still available
-            args_as_kwargs =
+            args_as_kwargs = _turn_args_into_kwargs(func, args, index_start=1)
        else:
             args_as_kwargs = {}
 
-        self.
+        self._validate_execution(func)
 
         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs
 
@@ -188,7 +198,9 @@
             processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
         ) as pool:
             try:
-                return pool.map(
+                return pool.map(
+                    func_with_kwargs, iterable, chunksize=self.chunksize
+                )
             except Exception as e:
                 pool.terminate()
                 raise e
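The restored call shows where the instance-level `chunksize` lands: it is forwarded to `multiprocessing.Pool.map`. How `args` reaches the workers is easiest to see in a sketch (the `multiply` function is illustrative):

```python
import sgis as sg


def multiply(x: int, factor: int) -> int:
    return x * factor


if __name__ == "__main__":
    # args fills positions from the 1st parameter on; position 0 is
    # reserved for the items of the iterable. Internally the tuple is
    # matched to parameter names, here {'factor': 10}.
    results = sg.Parallel(processes=2).map(multiply, [1, 2, 3], args=(10,))
    print(results)  # [10, 20, 30]
```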
@@ -204,7 +216,6 @@
         iterable: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
-        chunksize: int = 1,
     ) -> list[Any]:
         """Run functions in parallel where items of the iterable are unpacked.
 
@@ -215,7 +226,7 @@
             func: Function to be run.
             iterable: An iterable of iterables, where each item will be
                 unpacked as positional argument to the function.
-
+            args: Positional arguments passed to 'func' starting at argument position
                 n + 1, where n is the length of the iterables inside the iterable.
             kwargs: Keyword arguments passed to 'func'. Must be passed as a dict,
                 not unpacked into separate keyword arguments.
@@ -224,7 +235,7 @@
             A list of the return values of the function, one for each item in
             'iterable'.
 
-        Examples
+        Examples:
         --------
         Multiply each list element by 2.
 
@@ -262,13 +273,13 @@
         if args:
             # starting the count at the length of the iterables inside the iterables
             iterable = list(iterable)
-            args_as_kwargs =
+            args_as_kwargs = _turn_args_into_kwargs(
                 func, args, index_start=len(iterable[0])
             )
         else:
             args_as_kwargs = {}
 
-        self.
+        self._validate_execution(func)
 
         kwargs = self._validate_kwargs(kwargs) | args_as_kwargs
 
@@ -290,7 +301,9 @@
             processes, maxtasksperchild=self.maxtasksperchild, **self.kwargs
         ) as pool:
             try:
-                return pool.starmap(
+                return pool.starmap(
+                    func_with_kwargs, iterable, chunksize=self.chunksize
+                )
             except Exception as e:
                 pool.terminate()
                 raise e
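`starmap` mirrors `map` but unpacks each inner iterable; a sketch under the same assumptions as above (the `add` function is illustrative):

```python
import sgis as sg


def add(x: int, y: int) -> int:
    return x + y


if __name__ == "__main__":
    # Each inner tuple is unpacked into positional arguments,
    # so add(1, 2), add(3, 4), add(5, 6) run in parallel.
    results = sg.Parallel(processes=2).starmap(add, [(1, 2), (3, 4), (5, 6)])
    print(results)  # [3, 7, 11]
```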
@@ -320,10 +333,10 @@
         Returns:
             A DataFrame, or a list of DataFrames if concat is False.
         """
-        if
-
-
-
+        if strict:
+            res = self.map(read_pandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_pandas, files, kwargs=kwargs)
 
         return pd.concat(res, ignore_index=ignore_index) if concat else res
 
@@ -342,14 +355,19 @@
             concat: Whether to concat the results to a GeoDataFrame.
             ignore_index: Defaults to True.
             strict: If True (default), all files must exist.
+            chunksize: The size of the chunks of the iterable to distribute to workers.
             **kwargs: Keyword arguments passed to sgis.read_geopandas.
 
         Returns:
             A GeoDataFrame, or a list of GeoDataFrames if concat is False.
         """
-        if not
-
-
+        if "file_system" not in kwargs:
+            kwargs["file_system"] = dp.FileClient.get_gcs_file_system()
+
+        if strict:
+            res = self.map(read_geopandas, files, kwargs=kwargs)
+        else:
+            res = self.map(_try_to_read_geopandas, files, kwargs=kwargs)
 
         return pd.concat(res, ignore_index=ignore_index) if concat else res
 
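Both readers now share the same strict/lenient branch: `strict=True` maps the plain reader (missing files raise), while `strict=False` maps a `_try_to_read_*` wrapper that returns None for missing files, which `pd.concat` then drops. A standalone sketch of the pattern with plain pandas and hypothetical local paths:

```python
import pandas as pd


def try_to_read(path: str) -> pd.DataFrame | None:
    # One read attempt instead of an exists-check plus a read:
    # cheaper against remote storage.
    try:
        return pd.read_parquet(path)
    except FileNotFoundError:
        return None


paths = ["a.parquet", "b.parquet", "missing.parquet"]  # hypothetical
frames = [try_to_read(p) for p in paths]
# pd.concat silently drops None entries, mirroring the strict=False path.
df = pd.concat(frames, ignore_index=True)
```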
@@ -367,10 +385,14 @@
         clip: bool = True,
         max_rows_per_chunk: int = 150_000,
         processes_in_clip: int = 1,
-
+        verbose: bool = True,
+    ) -> None:
         """Split multiple datasets into municipalities and write as separate files.
 
         The files will be named as the municipality number.
+        Each dataset in 'in_data' is intersected with 'municipalities'
+        in parallel. The intersections themselves can also be run in parallel
+        with the 'processes_in_clip' argument.
 
         Args:
             in_data: Dictionary with dataset names as keys and file paths or
@@ -397,7 +419,12 @@
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-            clip: If True (default), the data will be clipped.
+            clip: If True (default), the data will be clipped. If False, the data will
+                be spatial joined.
+            max_rows_per_chunk: Number of rows per data chunk for processing.
+            processes_in_clip: Number of parallel processes for data clipping.
+            verbose: Whether to print during execution.
+
         """
         shared_kwds = {
             "municipalities": municipalities,
@@ -409,6 +436,7 @@
             "max_rows_per_chunk": max_rows_per_chunk,
             "processes_in_clip": processes_in_clip,
             "strict": strict,
+            "verbose": verbose,
         }
 
         if isinstance(out_data, (str, Path)):
@@ -417,10 +445,12 @@
         if funcdict is None:
             funcdict = {}
 
-
+        fs = dp.FileClient.get_gcs_file_system()
 
-        for _, data, folder, postfunc in
-            if data is None
+        for _, data, folder, postfunc in dict_zip_union(in_data, out_data, funcdict):
+            if data is None or (
+                not strict and isinstance(data, (str | Path)) and not fs.exists(data)
+            ):
                 continue
 
             kwds = shared_kwds | {
@@ -439,15 +469,33 @@
         df: GeoDataFrame,
         args: tuple | None = None,
         kwargs: dict | None = None,
-
-
-        concat: bool =
+        n_chunks: int | None = None,
+        max_rows_per_chunk: int | None = None,
+        concat: bool = True,
     ) -> GeoDataFrame:
-
-        return func(df, *args, **kwargs)
+        """Run a function in parallel on chunks of a (Geo)DataFrame.
 
-
-
+        Args:
+            func: Function to run chunkwise. It should take
+                a (Geo)DataFrame as first argument.
+            df: (Geo)DataFrame to split in n_chunks and passed
+                as first argument to 'func'.
+            args: Positional arguments in 'func' after the DataFrame.
+            kwargs: Additional keyword arguments in 'func'.
+            n_chunks: Optionally set number of chunks to split
+                'df' into. Defaults to the 'processes' attribute
+                of the Parallel instance.
+            max_rows_per_chunk: Alternatively decide number of chunks
+                by a maximum number of rows per chunk.
+            concat: Whether to use pd.concat on the results.
+                Defaults to True.
+        """
+        if max_rows_per_chunk is None and n_chunks is None:
+            n_chunks: int = self.processes
+        elif n_chunks is None:
+            n_chunks: int = len(df) // max_rows_per_chunk
+        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
+            return func(df, *args, **kwargs)
 
         chunks = np.array_split(np.arange(len(df)), n_chunks)
 
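The added branch decides the chunk count before `np.array_split` does the slicing. The decision table, replayed standalone (values are illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"value": range(10)})
processes = 4
n_chunks = None
max_rows_per_chunk = None

# Mirrors the added lines: default to one chunk per process,
# otherwise derive the count from the row budget.
if max_rows_per_chunk is None and n_chunks is None:
    n_chunks = processes
elif n_chunks is None:
    n_chunks = len(df) // max_rows_per_chunk

chunks = np.array_split(np.arange(len(df)), n_chunks)
parts = [df.iloc[idx] for idx in chunks]
print([len(p) for p in parts])  # [3, 3, 2, 2]
```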
@@ -464,7 +512,7 @@
         else:
             return out
 
-    def
+    def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.
 
         Raising Exception to avoid confusion.
@@ -478,8 +526,8 @@
             raise LocalFunctionError(func)
 
     @staticmethod
-    def _validate_kwargs(kwargs) -> dict:
-        """Make sure kwargs is a dict (not ** unpacked or None)"""
+    def _validate_kwargs(kwargs: dict) -> dict:
+        """Make sure kwargs is a dict (not ** unpacked or None)."""
         if kwargs is None:
             kwargs = {}
         elif not isinstance(kwargs, dict):
@@ -487,7 +535,7 @@
         return kwargs
 
     def _execute(self) -> list[Any]:
-        [self.
+        [self._validate_execution(func) for func in self.funcs]
 
         if self.processes == 1:
             return [func() for func in self.funcs]
@@ -513,7 +561,8 @@
         results = [pool.apply_async(func) for func in self.funcs]
         return [result.get() for result in results]
 
-    def __repr__(self):
+    def __repr__(self) -> str:
+        """String representation."""
         return (
             f"{self.__class__.__name__}(processes={self.processes}, "
             f"backend='{self.backend}', context='{self.context}')"
@@ -523,7 +572,7 @@
 def write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     with_neighbors: bool = False,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
@@ -533,13 +582,39 @@ def write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
+    """Splits and writes data into municipality-specific files.
+
+    Args:
+        data: Path to the data file or a GeoDataFrame.
+        out_folder: Path to the output directory where the municipality data
+            is written.
+        municipalities: Either a sequence of municipality numbers or a GeoDataFrame
+            of municipality polygons and municipality numbers in the column 'muni_number_col'.
+            Defaults to None.
+        with_neighbors: If True, include data from neighboring municipalities
+            for each municipality.
+        muni_number_col: Column name for municipality codes in 'municipalities'.
+        file_type: Format of the output file.
+        func: Function to process data before writing.
+        write_empty: If True, write empty files for municipalities without data.
+        clip: If True, clip the data to municipality boundaries. If False
+            the data is spatial joined.
+        max_rows_per_chunk: Maximum number of rows in each processed chunk.
+        processes_in_clip: Number of processes to use for clipping.
+        strict: If True (default) and the data has a municipality column,
+            all municipality numbers in 'data' must be present in 'municipalities'.
+        verbose: Whether to print during execution.
+
+    Returns:
+        None. The function writes files directly.
+    """
     write_func = (
         _write_neighbor_municipality_data
         if with_neighbors
         else _write_municipality_data
     )
-
     return write_func(
         data=data,
         out_folder=out_folder,
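With `municipalities` now optional, a call can skip the polygon layer entirely when the data already carries a filled municipality-number column; the split then follows that column alone. A hedged sketch (paths are hypothetical; imported from the module directly, since a top-level export is not shown in this diff):

```python
from sgis.parallel.parallel import write_municipality_data

# The data's own KOMMUNENR column drives the split: one file per
# municipality number is written under out_folder.
write_municipality_data(
    data="gs://bucket/roads.parquet",        # hypothetical path
    out_folder="gs://bucket/roads_by_muni",  # hypothetical path
    municipalities=None,                     # allowed as of this signature
    file_type="parquet",
    verbose=True,
)
```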
@@ -552,27 +627,34 @@ def write_municipality_data(
         max_rows_per_chunk=max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        verbose=verbose,
     )
 
 
-def _validate_data(
-
-
+def _validate_data(
+    data: str | list[str] | DataFrame | GeoDataFrame,
+) -> DataFrame | GeoDataFrame:
     if hasattr(data, "__iter__") and len(data) == 1:
-
-
-
+        data = data[0]
+    if isinstance(data, (str, Path)):
+        try:
+            return read_geopandas(str(data))
+        except ValueError as e:
+            try:
+                return read_pandas(str(data))
+            except ValueError as e2:
+                raise e.__class__(e, data) from e2
     return data
 
 
-def _get_out_path(out_folder, muni, file_type):
+def _get_out_path(out_folder: str | Path, muni: str, file_type: str) -> str:
     return str(Path(out_folder) / f"{muni}.{file_type.strip('.')}")
 
 
 def _write_municipality_data(
     data: str | GeoDataFrame | DataFrame,
     out_folder: str,
-    municipalities: GeoDataFrame,
+    municipalities: GeoDataFrame | list[str] | None = None,
     muni_number_col: str = "KOMMUNENR",
     file_type: str = "parquet",
     func: Callable | None = None,
@@ -581,21 +663,15 @@ def _write_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-
-
-
-        try:
-            gdf = read_geopandas(str(data))
-        except ValueError as e:
-            try:
-                gdf = read_pandas(str(data))
-            except ValueError:
-                raise e.__class__(e, data)
-    elif isinstance(data, DataFrame):
-        gdf = data
+    if verbose:
+        to_print = out_folder
+        print(to_print)
     else:
-
+        to_print = None
+
+    gdf = _validate_data(data)
 
     if func is not None:
         gdf = func(gdf)
@@ -608,22 +684,29 @@ def _write_municipality_data(
         max_rows_per_chunk,
         processes_in_clip=processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if municipalities is None:
+        muni_numbers = gdf[muni_number_col]
+    elif not isinstance(municipalities, DataFrame):
+        muni_numbers = municipalities
+    else:
+        muni_numbers = municipalities[muni_number_col]
+
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni,
+        muni_numbers,
+        kwargs=dict(
+            gdf=gdf,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )
 
 
 def _write_neighbor_municipality_data(
@@ -638,11 +721,15 @@ def _write_neighbor_municipality_data(
     max_rows_per_chunk: int = 150_000,
     processes_in_clip: int = 1,
     strict: bool = True,
+    verbose: bool = True,
 ) -> None:
-
+    if verbose:
+        to_print = out_folder
+        print("out_folder:", to_print)
+    else:
+        to_print = None
 
-
-    gdf = read_geopandas(str(data))
+    gdf = _validate_data(data)
 
     if func is not None:
         gdf = func(gdf)
@@ -655,6 +742,7 @@ def _write_neighbor_municipality_data(
         max_rows_per_chunk,
         processes_in_clip,
         strict=strict,
+        to_print=to_print,
     )
 
     if municipalities.index.name != muni_number_col:
@@ -664,43 +752,97 @@
         municipalities, municipalities, max_distance=1
     )
 
-    for
-
+    # hardcode this to threading for efficiency in io bound task
+    Parallel(processes_in_clip, backend="threading").map(
+        _write_one_muni_with_neighbors,
+        municipalities.index,
+        kwargs=dict(
+            gdf=gdf,
+            neighbor_munis=neighbor_munis,
+            out_folder=out_folder,
+            muni_number_col=muni_number_col,
+            file_type=file_type,
+            write_empty=write_empty,
+            to_print=to_print,
+        ),
+    )
+
+
+def _write_one_muni(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    gdf_muni = gdf.loc[gdf[muni_number_col] == muni_number]
 
-
-
+    if not len(gdf_muni):
+        if write_empty:
+            gdf_muni = gdf_muni.drop(columns="geometry", errors="ignore")
+            gdf_muni["geometry"] = None
+            write_pandas(gdf_muni, out)
+        return
 
-
-    if write_empty:
-        gdf_neighbor["geometry"] = gdf_neighbor["geometry"].astype(str)
-        write_pandas(gdf_neighbor, out)
-        continue
+    write_geopandas(gdf_muni, out)
 
-
+
+def _write_one_muni_with_neighbors(
+    muni_number: Any,
+    gdf: GeoDataFrame | DataFrame,
+    neighbor_munis: Series,
+    out_folder: str | Path,
+    muni_number_col: str,
+    file_type: str,
+    write_empty: bool,
+    to_print: str | None = None,
+) -> None:
+    out = _get_out_path(out_folder, muni_number, file_type)
+
+    if to_print:
+        print("writing:", out)
+
+    muni_and_neighbors: Series = neighbor_munis.loc[[muni_number]]
+    gdf_neighbor = gdf.loc[gdf[muni_number_col].isin(muni_and_neighbors)]
+
+    if not len(gdf_neighbor):
+        if write_empty:
+            gdf_neighbor = gdf_neighbor.drop(columns="geometry", errors="ignore")
+            gdf_neighbor["geometry"] = None
+            write_pandas(gdf_neighbor, out)
+        return
+
+    write_geopandas(gdf_neighbor, out)
 
 
 def _fix_missing_muni_numbers(
-    gdf,
-    municipalities,
-    muni_number_col,
-    clip,
-    max_rows_per_chunk,
-    processes_in_clip,
-    strict,
-
+    gdf: GeoDataFrame,
+    municipalities: GeoDataFrame,
+    muni_number_col: str,
+    clip: bool,
+    max_rows_per_chunk: int,
+    processes_in_clip: int,
+    strict: bool,
+    to_print: str,
+) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
             return gdf
         if diffs := set(gdf[muni_number_col].values).difference(
             set(municipalities[muni_number_col].values)
         ):
-            message =
-                f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore"
-            )
+            message = f"Different municipality numbers: {diffs}. Set 'strict=False' to ignore."
             if strict:
                 raise ValueError(message)
             else:
-                warnings.warn(message)
+                warnings.warn(message, stacklevel=1)
             return gdf
 
     if municipalities is None:
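The comment in the added code explains the backend choice: the per-municipality writers mostly wait on storage, so threads beat processes. The same pattern with the standard library, for illustration:

```python
from concurrent.futures import ThreadPoolExecutor


def write_one(muni_number: str) -> None:
    """Placeholder for filtering and writing one municipality's rows."""
    ...


muni_numbers = ["0301", "1103", "5001"]  # illustrative
# I/O-bound work releases the GIL while waiting, so a thread pool
# gives the parallelism without the cost of spawning processes.
with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(write_one, muni_numbers))
```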
@@ -717,7 +859,10 @@ def _fix_missing_muni_numbers(
         "GeoDataFrame to clip the geometries by."
     )
 
-
+    try:
+        municipalities = municipalities[[muni_number_col, "geometry"]].to_crs(gdf.crs)
+    except Exception as e:
+        raise e.__class__(e, to_print) from e
 
     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
@@ -732,6 +877,7 @@ def _fix_missing_muni_numbers(
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
             processes=processes_in_clip,
             max_rows_per_chunk=max_rows_per_chunk,
+            to_print=to_print,
         )
 
     return pd.concat([notna, notna_anymore], ignore_index=True)
@@ -744,25 +890,42 @@ def _fix_missing_muni_numbers(
         municipalities[[muni_number_col, municipalities._geometry_column_name]],
         processes=processes_in_clip,
         max_rows_per_chunk=max_rows_per_chunk,
+        to_print=to_print,
     )
 
 
 def parallel_overlay(
     df1: GeoDataFrame,
     df2: GeoDataFrame,
-    # muni_number_col: str,
     processes: int,
     max_rows_per_chunk: int,
     backend: str = "loky",
+    to_print: str | None = None,
     **kwargs,
 ) -> GeoDataFrame:
-
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
 
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
     if len(df1) < max_rows_per_chunk:
         return clean_overlay(df1, df2, **kwargs)
 
-    # df2 = df2.dissolve(by=muni_number_col, as_index=False)
-
     n_chunks = len(df1) // max_rows_per_chunk
     chunks = np.array_split(np.arange(len(df1)), n_chunks)
 
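As the new docstring notes, only `df1` is chunked; every worker receives a full copy of `df2`, which is why the function pays off when `df2` is small and simple. A hedged call sketch with tiny illustrative layers (real use targets frames larger than `max_rows_per_chunk`, below which the call short-circuits to a single `clean_overlay`):

```python
import geopandas as gpd
from shapely.geometry import box

from sgis.parallel.parallel import parallel_overlay

# Two small illustrative layers in a shared CRS (EPSG:25833).
df1 = gpd.GeoDataFrame(
    {"a": [1, 2]}, geometry=[box(0, 0, 2, 2), box(1, 1, 3, 3)], crs=25833
)
df2 = gpd.GeoDataFrame({"b": [1]}, geometry=[box(0, 0, 10, 10)], crs=25833)

# df1 (the large frame in real use) is split into chunks of at most
# max_rows_per_chunk rows; each chunk is overlaid on its own worker.
result = parallel_overlay(df1, df2, processes=2, max_rows_per_chunk=150_000)
```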
@@ -778,26 +941,50 @@ def parallel_overlay(
     out = Parallel(processes, backend=backend).map(
         _clean_intersection,
         df1_chunked,
-        args=(df2,),
+        args=(df2, to_print) if to_print else (df2,),
     )
     return pd.concat(out, ignore_index=True)
 
 
-def _clean_intersection(
-
+def _clean_intersection(
+    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str | None = None
+) -> GeoDataFrame:
+    print(to_print, "- intersection chunk len:", len(df1))
     return clean_overlay(df1, df2, how="intersection")
 
 
 def chunkwise(
     func: Callable,
-    df: GeoDataFrame,
+    df: GeoDataFrame | pd.DataFrame,
     max_rows_per_chunk: int = 150_000,
-    n_chunks: int = None,
+    n_chunks: int | None = None,
     args: tuple | None = None,
     kwargs: dict | None = None,
     n_jobs: int = 1,
     backend: str = "loky",
-) -> GeoDataFrame:
+) -> GeoDataFrame | pd.DataFrame:
+    """Run a function in parallel on chunks of a DataFrame.
+
+    This method is used to process large (Geo)DataFrames in manageable pieces,
+    optionally in parallel.
+
+    Args:
+        func: The function to apply to each chunk. This function must accept a DataFrame as
+            its first argument and return a DataFrame.
+        df: The DataFrame to be chunked and processed.
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
+        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
+            calculated based on 'max_rows_per_chunk'.
+        args: Additional positional arguments to pass to 'func'.
+        kwargs: Keyword arguments to pass to 'func'.
+        n_jobs: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
+
+    Returns:
+        GeoDataFrame: A GeoDataFrame resulting from concatenating the results of applying 'func'
+            to each chunk of the original GeoDataFrame.
+
+    """
     if len(df) < max_rows_per_chunk:
         return func(df, *args, **kwargs)
 
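The module-level `chunkwise` differs from the `Parallel` method above in that it parallelizes with joblib (`n_jobs`, `backend='loky'`) rather than an instance's pool. A hedged usage sketch (the `add_double` helper is illustrative):

```python
import pandas as pd

from sgis.parallel.parallel import chunkwise


def add_double(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(double=df["value"] * 2)


df = pd.DataFrame({"value": range(400_000)})
# 400_000 rows / 150_000 max rows per chunk -> the frame is split,
# each chunk processed by joblib with two workers, then concatenated.
out = chunkwise(add_double, df, max_rows_per_chunk=150_000, n_jobs=2)
```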
@@ -815,3 +1002,26 @@ def chunkwise(
         kwargs=kwargs,
     )
     return pd.concat(out, ignore_index=True)
+
+
+def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict:
+    if not isinstance(args, tuple):
+        raise TypeError("args should be a tuple (it should not be unpacked with *)")
+    argnames = inspect.getfullargspec(func).args[index_start:]
+    return {name: value for value, name in zip(args, argnames, strict=False)}
+
+
+def _try_to_read_geopandas(path: str, **kwargs) -> GeoDataFrame | DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_geopandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
+
+
+def _try_to_read_pandas(path: str, **kwargs) -> DataFrame | None:
+    """Read with try/except because it's faster than checking exists first."""
+    try:
+        return read_pandas(path, **kwargs)
+    except FileNotFoundError:
+        return None
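The relocated helper is small enough to replay in isolation; it explains why `map(func, iterable, args=(10,))` above arrives as `factor=10` on the worker side (the `multiply` function is illustrative):

```python
import inspect


def _turn_args_into_kwargs(func, args: tuple, index_start: int) -> dict:
    if not isinstance(args, tuple):
        raise TypeError("args should be a tuple (it should not be unpacked with *)")
    argnames = inspect.getfullargspec(func).args[index_start:]
    return {name: value for value, name in zip(args, argnames, strict=False)}


def multiply(x, factor, offset=0):
    return x * factor + offset


# index_start=1 skips position 0 ('x'), which map() reserves for the
# iterable's items; the tuple is matched to the remaining names.
print(_turn_args_into_kwargs(multiply, (10, 1), index_start=1))
# {'factor': 10, 'offset': 1}
```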