ssb-sgis 1.0.4-py3-none-any.whl → 1.0.6-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- sgis/__init__.py +5 -5
- sgis/debug_config.py +1 -0
- sgis/geopandas_tools/buffer_dissolve_explode.py +3 -40
- sgis/geopandas_tools/conversion.py +37 -9
- sgis/geopandas_tools/general.py +330 -106
- sgis/geopandas_tools/geometry_types.py +38 -33
- sgis/geopandas_tools/overlay.py +5 -1
- sgis/io/dapla_functions.py +33 -17
- sgis/maps/explore.py +16 -5
- sgis/maps/map.py +3 -0
- sgis/maps/maps.py +0 -1
- sgis/networkanalysis/closing_network_holes.py +100 -22
- sgis/networkanalysis/cutting_lines.py +4 -147
- sgis/networkanalysis/finding_isolated_networks.py +6 -0
- sgis/networkanalysis/nodes.py +4 -110
- sgis/parallel/parallel.py +267 -182
- sgis/raster/image_collection.py +789 -836
- sgis/raster/indices.py +0 -90
- sgis/raster/regex.py +146 -0
- sgis/raster/sentinel_config.py +9 -0
- {ssb_sgis-1.0.4.dist-info → ssb_sgis-1.0.6.dist-info}/METADATA +1 -1
- {ssb_sgis-1.0.4.dist-info → ssb_sgis-1.0.6.dist-info}/RECORD +24 -26
- sgis/raster/cube.py +0 -1274
- sgis/raster/cubebase.py +0 -25
- sgis/raster/raster.py +0 -1475
- {ssb_sgis-1.0.4.dist-info → ssb_sgis-1.0.6.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.0.4.dist-info → ssb_sgis-1.0.6.dist-info}/WHEEL +0 -0
sgis/parallel/parallel.py  CHANGED

@@ -10,6 +10,8 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
 
+from pandas.api.types import is_array_like
+
 try:
     import dapla as dp
 except ImportError:
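
The new is_array_like import backs the type dispatch in the rewritten chunkwise at the end of this diff. As a quick illustration (not from the diff): it accepts anything list-like that also carries a dtype, so numpy arrays and pandas Series qualify while plain lists do not.

    import numpy as np
    from pandas.api.types import is_array_like

    is_array_like(np.array([1, 2]))  # True: iterable and has a dtype
    is_array_like([1, 2])            # False: no dtype attribute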
@@ -43,6 +45,199 @@ except ImportError:
     pass
 
 
+def parallel_overlay(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    how: str = "intersection",
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _clean_overlay_with_print,
+            df1,
+            kwargs={
+                "df2": df2,
+                # "to_print": to_print,
+                "how": how,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
+def parallel_overlay_rowwise(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial clip on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _clip_rowwise,
+            df1,
+            kwargs={
+                "df2": df2,
+                "to_print": to_print,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
+def parallel_sjoin(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial clip on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _sjoin_within_first,
+            df1,
+            kwargs={
+                "df2": df2,
+                "to_print": to_print,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
+def _sjoin_within_first(
+    df1, df2, to_print: str | None = None, predicate: str = "intersects", **kwargs
+):
+    if to_print:
+        print(to_print, "- sjoin chunk len:", len(df1))
+
+    df2 = df2.reset_index(drop=True)
+    df2["_from_df2"] = 1
+    df1["_range_idx"] = range(len(df1))
+    joined = df1.sjoin(df2, predicate="within", how="left")
+    within = joined.loc[joined["_from_df2"].notna()].drop(
+        columns=["_from_df2", "_range_idx", "index_right"], errors="raise"
+    )
+    not_within = df1.loc[
+        df1["_range_idx"].isin(joined.loc[joined["_from_df2"].isna(), "_range_idx"])
+    ]
+
+    return pd.concat(
+        [
+            within,
+            not_within.sjoin(df2, predicate=predicate, **kwargs),
+        ],
+        ignore_index=True,
+    )
+
+
+def _clip_rowwise(df1, df2, to_print: str | None = None):
+    geom_col = df2.geometry.name
+
+    def clip_by_one_row(i):
+        this: pd.Series = df2.iloc[i]
+        clipped = df1.clip(this[geom_col])
+        without_geom_col = this.drop(geom_col)
+        clipped.loc[:, without_geom_col.index] = without_geom_col.values
+        if to_print:
+            print(i, to_print, len(clipped))
+        return clipped
+
+    return pd.concat([clip_by_one_row(i) for i in range(len(df2))], ignore_index=True)
+
+
+def _clean_overlay_with_print(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    how: str = "intersection",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    if to_print:
+        print(to_print, f"- {how} chunk len:", len(df1))
+    return clean_overlay(df1, df2, how=how, **kwargs)
+
+
 class Parallel:
     """Run functions in parallell.
 
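
The three new top-level helpers all delegate to chunkwise and concatenate the chunk results. A minimal usage sketch for parallel_overlay follows; the file paths and process count are hypothetical, and the import assumes the function is exposed at package level (otherwise import it from sgis.parallel.parallel):

    import geopandas as gpd
    import sgis as sg

    df1 = gpd.read_parquet("large_dataset.parquet")   # hypothetical path
    df2 = gpd.read_parquet("municipalities.parquet")  # few, simple geometries

    # df1 is split into chunks of at most 100_000 rows; each chunk is
    # intersected with df2 in its own process, then the results are concatenated.
    result = sg.parallel_overlay(
        df1,
        df2,
        processes=4,
        how="intersection",
        max_rows_per_chunk=100_000,
    )

A note on the design of _sjoin_within_first: it first joins with predicate="within", keeps the rows that already matched, and only runs the requested predicate on the rows that did not land fully inside a df2 geometry.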
@@ -398,9 +593,7 @@ class Parallel:
         muni_number_col: str = "KOMMUNENR",
         strict: bool = False,
         write_empty: bool = False,
-
-        max_rows_per_chunk: int = 150_000,
-        processes_in_clip: int = 1,
+        id_assign_func: Callable | functools.partial = clean_overlay,
         verbose: bool = True,
     ) -> None:
         """Split multiple datasets into municipalities and write as separate files.
@@ -410,6 +603,7 @@ class Parallel:
         in parallel. The intersections themselves can also be run in parallel
         with the 'processes_in_clip' argument.
 
+
         Args:
             in_data: Dictionary with dataset names as keys and file paths or
                 (Geo)DataFrames as values. Note that the files will be read
@@ -435,10 +629,8 @@ class Parallel:
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-
-
-            max_rows_per_chunk: Number of rows per data chunk for processing.
-            processes_in_clip: Number of parallel processes for data clipping.
+            id_assign_func: Function to assign ids (e.g. municipality number) to
+                the dataframe for missing values.
             verbose: Whether to print during execution.
 
         """
@@ -448,11 +640,9 @@ class Parallel:
             "muni_number_col": muni_number_col,
             "write_empty": write_empty,
             "with_neighbors": with_neighbors,
-            "clip": clip,
-            "max_rows_per_chunk": max_rows_per_chunk,
-            "processes_in_clip": processes_in_clip,
             "strict": strict,
             "verbose": verbose,
+            "id_assign_func": id_assign_func,
         }
 
         if isinstance(out_data, (str, Path)):
@@ -482,51 +672,32 @@ class Parallel:
     def chunkwise(
         self,
         func: Callable,
-
+        iterable: Collection[Iterable[Any]],
         args: tuple | None = None,
         kwargs: dict | None = None,
-        n_chunks: int | None = None,
         max_rows_per_chunk: int | None = None,
-
-    ) -> GeoDataFrame:
+    ) -> Collection[Iterable[Any]]:
         """Run a function in parallel on chunks of a (Geo)DataFrame.
 
         Args:
             func: Function to run chunkwise. It should take
-                a
-
+                (a chunk of) the iterable as first argument.
+            iterable: Iterable to split in chunks and passed
                 as first argument to 'func'.
             args: Positional arguments in 'func' after the DataFrame.
             kwargs: Additional keyword arguments in 'func'.
-            n_chunks: Optionally set number of chunks to split
-                'df' into. Defaults to the 'processes' attribute
-                of the Parallel instance.
             max_rows_per_chunk: Alternatively decide number of chunks
                 by a maximum number of rows per chunk.
-            concat: Whether to use pd.concat on the results.
-                Defaults to True.
         """
-
-            n_chunks: int = self.processes
-        elif n_chunks is None:
-            n_chunks: int = len(df) // max_rows_per_chunk
-        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
-            return func(df, *args, **kwargs)
-
-        chunks = np.array_split(np.arange(len(df)), n_chunks)
-
-        df_chunked: list[GeoDataFrame] = [df.iloc[chunk] for chunk in chunks]
-
-        out = self.map(
+        return chunkwise(
             func,
-
+            iterable,
             args=args,
             kwargs=kwargs,
+            processes=self.processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=self.backend,
         )
-        if concat:
-            return pd.concat(out, ignore_index=True)
-        else:
-            return out
 
     def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.
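
Parallel.chunkwise is now a thin wrapper around the module-level chunkwise shown in the last hunk, forwarding the instance's processes and backend. It no longer concatenates: it returns one result per chunk, so concatenation is the caller's job. A sketch under that assumption (buffer_each and the input data are made up):

    import geopandas as gpd
    import pandas as pd
    import sgis as sg

    def buffer_each(df: gpd.GeoDataFrame, distance: float) -> gpd.GeoDataFrame:
        # Runs on one chunk of the GeoDataFrame at a time.
        return df.assign(geometry=df.buffer(distance))

    gdf = gpd.read_parquet("points.parquet")  # hypothetical input
    p = sg.Parallel(4, backend="loky")
    chunks = p.chunkwise(buffer_each, gdf, kwargs={"distance": 10})
    result = pd.concat(chunks, ignore_index=True)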
@@ -594,9 +765,7 @@ def write_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    id_assign_func: Callable = clean_overlay,
     strict: bool = True,
     verbose: bool = True,
 ) -> None:
@@ -621,6 +790,8 @@ def write_municipality_data(
         processes_in_clip: Number of processes to use for clipping.
         strict: If True (default) and the data has a municipality column,
             all municipality numbers in 'data' must be present in 'municipalities'.
+        id_assign_func: Function to assign ids (e.g. municipality number) to
+            the dataframe for missing values.
         verbose: Whether to print during execution.
 
     Returns:
@@ -639,10 +810,8 @@ def write_municipality_data(
         file_type=file_type,
         func=func,
         write_empty=write_empty,
-        clip=clip,
-        max_rows_per_chunk=max_rows_per_chunk,
-        processes_in_clip=processes_in_clip,
         strict=strict,
+        id_assign_func=id_assign_func,
         verbose=verbose,
     )
 
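
The clip, max_rows_per_chunk and processes_in_clip knobs are replaced by a single id_assign_func hook, defaulting to clean_overlay. Any callable with the same shape can be plugged in: it receives the rows that lack an id plus a frame holding only the id column and the geometry, and returns the rows with ids attached. A sketch of a cheaper sjoin-based alternative (not part of the package):

    from geopandas import GeoDataFrame

    def sjoin_assign(gdf: GeoDataFrame, munis: GeoDataFrame) -> GeoDataFrame:
        # Same call shape as clean_overlay(gdf, munis): attach the id column by
        # spatial join instead of overlay. Faster, but rows touching more than
        # one municipality are duplicated rather than split.
        return gdf.sjoin(munis, how="inner").drop(columns="index_right")

This could then be passed as id_assign_func=sjoin_assign to write_municipality_data.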
@@ -675,11 +844,10 @@ def _write_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    processes: int = 1,
     strict: bool = True,
     verbose: bool = True,
+    id_assign_func: Callable = clean_overlay,
 ) -> None:
     if verbose:
         to_print = out_folder
@@ -696,22 +864,21 @@ def _write_municipality_data(
         gdf,
         municipalities,
         muni_number_col,
-        clip,
-        max_rows_per_chunk,
-        processes_in_clip=processes_in_clip,
         strict=strict,
-
+        id_assign_func=id_assign_func,
     )
 
     if municipalities is None:
-        muni_numbers = gdf[muni_number_col]
+        muni_numbers = gdf[muni_number_col].unique()
     elif not isinstance(municipalities, DataFrame):
-        muni_numbers = municipalities
+        muni_numbers = set(municipalities)
     else:
-        muni_numbers = municipalities[muni_number_col]
+        muni_numbers = municipalities[muni_number_col].unique()
+
+    muni_numbers = list(sorted(muni_numbers))
 
     # hardcode this to threading for efficiency in io bound task
-    Parallel(
+    Parallel(processes, backend="threading").map(
         _write_one_muni,
         muni_numbers,
         kwargs=dict(
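
The municipality numbers are now deduplicated (.unique() on columns, set() on plain iterables) and sorted before the threaded write loop, so each municipality is written exactly once and in deterministic order. For the DataFrame case this amounts to:

    muni_numbers = list(sorted(municipalities["KOMMUNENR"].unique()))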
@@ -733,11 +900,10 @@ def _write_neighbor_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    processes: int = 1,
     strict: bool = True,
     verbose: bool = True,
+    id_assign_func: Callable = clean_overlay,
 ) -> None:
     if verbose:
         to_print = out_folder
@@ -754,11 +920,8 @@ def _write_neighbor_municipality_data(
         gdf,
         municipalities,
         muni_number_col,
-        clip,
-        max_rows_per_chunk,
-        processes_in_clip,
         strict=strict,
-
+        id_assign_func=id_assign_func,
     )
 
     if municipalities.index.name != muni_number_col:
@@ -769,7 +932,7 @@ def _write_neighbor_municipality_data(
     )
 
     # hardcode this to threading for efficiency in io bound task
-    Parallel(
+    Parallel(processes, backend="threading").map(
         _write_one_muni_with_neighbors,
         municipalities.index,
         kwargs=dict(
@@ -850,11 +1013,8 @@ def _fix_missing_muni_numbers(
     gdf: GeoDataFrame,
     municipalities: GeoDataFrame,
     muni_number_col: str,
-    clip: bool,
-    max_rows_per_chunk: int,
-    processes_in_clip: int,
     strict: bool,
-
+    id_assign_func: Callable,
 ) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
@@ -883,123 +1043,37 @@ def _fix_missing_muni_numbers(
             "GeoDataFrame to clip the geometries by."
         )
 
-
-
-
-        ].to_crs(gdf.crs)
-    except Exception as e:
-        raise e.__class__(e, to_print) from e
+    municipalities = municipalities[
+        [muni_number_col, municipalities.geometry.name]
+    ].to_crs(gdf.crs)
 
     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
 
         isna = gdf[gdf[muni_number_col].isna()].drop(muni_number_col, axis=1)
 
-
-
-        else:
-            notna_anymore = parallel_overlay(
-                isna,
-                municipalities[[muni_number_col, municipalities._geometry_column_name]],
-                processes=processes_in_clip,
-                max_rows_per_chunk=max_rows_per_chunk,
-                to_print=to_print,
-            )
-
-        return pd.concat([notna, notna_anymore], ignore_index=True)
-
-    if not clip:
-        return gdf.sjoin(municipalities).drop(columns="index_right")
-    else:
-        return parallel_overlay(
-            gdf,
+        notna_anymore = id_assign_func(
+            isna,
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
-            processes=processes_in_clip,
-            max_rows_per_chunk=max_rows_per_chunk,
-            to_print=to_print,
         )
 
+        return pd.concat([notna, notna_anymore], ignore_index=True)
 
-
-
-
-    processes: int,
-    max_rows_per_chunk: int,
-    backend: str = "loky",
-    to_print: str | None = None,
-    **kwargs,
-) -> GeoDataFrame:
-    """Perform spatial overlay operations on two GeoDataFrames in parallel.
-
-    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
-    overlay operation with the second GeoDataFrame, and then concatenates the results.
-
-    Note that this function is most useful if df2 has few and simple geometries.
-
-    Args:
-        df1: The first GeoDataFrame for the overlay operation.
-        df2: The second GeoDataFrame for the overlay operation.
-        how: Type of overlay operation ('intersection', 'union', etc.).
-        processes: Number of parallel processes to use.
-        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
-        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
-        to_print: Optional text to print to see progression.
-        **kwargs: Additional keyword arguments to pass to the overlay function.
-
-    Returns:
-        A GeoDataFrame containing the result of the overlay operation.
-    """
-    if len(df1) < max_rows_per_chunk:
-        return clean_overlay(df1, df2, **kwargs)
-
-    n_chunks = len(df1) // max_rows_per_chunk
-    chunks = np.array_split(np.arange(len(df1)), n_chunks)
-
-    try:
-        x_mapper = dict(enumerate(df1.centroid))
-        sorted_xs = dict(reversed(sorted(x_mapper.items(), key=lambda item: item[1])))
-        df1 = df1.iloc[list(sorted_xs)]
-    except TypeError:
-        pass
-
-    df1_chunked: list[GeoDataFrame] = [df1.iloc[chunk] for chunk in chunks]
-
-    out = Parallel(processes, backend=backend).map(
-        _clean_intersection,
-        df1_chunked,
-        args=(df2, to_print) if to_print else (df2,),
-    )
-    return pd.concat(out, ignore_index=True)
-
-
-def _clean_intersection(
-    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str | None = None
-) -> GeoDataFrame:
-    print(to_print, "- intersection chunk len:", len(df1))
-    cols_to_keep = df1.columns.union(df2.columns.difference({df2.geometry.name}))
-    df1["_range_idx"] = range(len(df1))
-    joined = df1.sjoin(df2, predicate="within", how="left")
-    within = joined.loc[joined["_range_idx"].notna(), cols_to_keep]
-    not_within = joined.loc[joined["_range_idx"].isna(), df1.columns]
-    return pd.concat(
-        [
-            within,
-            clean_overlay(not_within, df2, how="intersection"),
-        ],
-        ignore_index=True,
+    return id_assign_func(
+        gdf,
+        municipalities[[muni_number_col, municipalities._geometry_column_name]],
     )
 
 
 def chunkwise(
     func: Callable,
-
-    max_rows_per_chunk: int = 150_000,
-    n_chunks: int | None = None,
+    iterable: Collection[Iterable[Any]],
     args: tuple | None = None,
     kwargs: dict | None = None,
-
+    processes: int = 1,
+    max_rows_per_chunk: int | None = None,
     backend: str = "loky",
-) ->
+) -> Collection[Iterable[Any]]:
     """Run a function in parallel on chunks of a DataFrame.
 
     This method is used to process large (Geo)DataFrames in manageable pieces,
@@ -1008,37 +1082,48 @@ def chunkwise(
     Args:
         func: The function to apply to each chunk. This function must accept a DataFrame as
            its first argument and return a DataFrame.
-
-        max_rows_per_chunk: The maximum number of rows each chunk should contain.
-        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
-            calculated based on 'max_rows_per_chunk'.
+        iterable: Iterable to be chunked and processed.
         args: Additional positional arguments to pass to 'func'.
        kwargs: Keyword arguments to pass to 'func'.
-
+        processes: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
         backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
 
     Returns:
-
-        to each chunk of the original GeoDataFrame.
+        Iterable of iterable.
 
     """
-
-
+    args = args or ()
+    kwargs = kwargs or {}
 
-    if
-        n_chunks =
+    if max_rows_per_chunk is None:
+        n_chunks: int = processes
+    else:
+        n_chunks: int = len(iterable) // max_rows_per_chunk
+
+    if n_chunks <= 1:
+        return [func(iterable, *args, **kwargs)]
 
-    chunks = np.array_split(np.arange(len(
+    chunks = np.array_split(np.arange(len(iterable)), n_chunks)
 
-
+    if hasattr(iterable, "iloc"):
+        iterable_chunked: list[pd.DataFrame | pd.Series] = [
+            iterable.iloc[chunk] for chunk in chunks
+        ]
+    elif is_array_like(iterable):
+        iterable_chunked: list[np.ndarray] = [iterable[chunk] for chunk in chunks]
+    else:
+        to_type: type = iterable.__class__
+        iterable_chunked: list[Iterable] = [
+            to_type(chunk) for chunk in np.array_split(list(iterable), n_chunks)
+        ]
 
-
+    return Parallel(processes, backend=backend).map(
         func,
-
+        iterable_chunked,
         args=args,
         kwargs=kwargs,
     )
-    return pd.concat(out, ignore_index=True)
 
 
 def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict: