ssb-sgis 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
sgis/parallel/parallel.py CHANGED
@@ -10,6 +10,8 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
 
+from pandas.api.types import is_array_like
+
 try:
     import dapla as dp
 except ImportError:
@@ -43,6 +45,199 @@ except ImportError:
     pass
 
 
+def parallel_overlay(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    how: str = "intersection",
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial overlay operations on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _clean_overlay_with_print,
+            df1,
+            kwargs={
+                "df2": df2,
+                # "to_print": to_print,
+                "how": how,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
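For orientation, a minimal usage sketch of the new parallel_overlay follows. It is not part of the diff; the file paths are hypothetical, and the call simply mirrors the signature and docstring above.

# Usage sketch (not part of the diff); the file paths are hypothetical.
import geopandas as gpd
from sgis.parallel.parallel import parallel_overlay

buildings = gpd.read_file("buildings.gpkg")            # large GeoDataFrame, gets chunked
municipalities = gpd.read_file("municipalities.gpkg")  # few, simple geometries

result = parallel_overlay(
    buildings,
    municipalities,
    processes=4,
    how="intersection",
    max_rows_per_chunk=100_000,  # optional cap on chunk size to limit memory use
)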
+def parallel_overlay_rowwise(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial clip on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _clip_rowwise,
+            df1,
+            kwargs={
+                "df2": df2,
+                "to_print": to_print,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
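The rowwise variant sends each chunk of df1 through _clip_rowwise, which clips the chunk against one df2 row at a time and copies that row's non-geometry columns onto the clipped rows. A hedged usage sketch, not part of the diff, with hypothetical file paths:

# Usage sketch (not part of the diff); the file paths are hypothetical.
import geopandas as gpd
from sgis.parallel.parallel import parallel_overlay_rowwise

roads = gpd.read_file("roads.gpkg")                     # large GeoDataFrame, gets chunked
municipalities = gpd.read_file("municipalities.gpkg")   # clipped against one row at a time

clipped = parallel_overlay_rowwise(roads, municipalities, processes=4)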
+def parallel_sjoin(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    processes: int,
+    max_rows_per_chunk: int | None = None,
+    backend: str = "loky",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    """Perform spatial clip on two GeoDataFrames in parallel.
+
+    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
+    overlay operation with the second GeoDataFrame, and then concatenates the results.
+
+    Note that this function is most useful if df2 has few and simple geometries.
+
+    Args:
+        df1: The first GeoDataFrame for the overlay operation.
+        df2: The second GeoDataFrame for the overlay operation.
+        how: Type of overlay operation ('intersection', 'union', etc.).
+        processes: Number of parallel processes to use.
+        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
+        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
+        to_print: Optional text to print to see progression.
+        **kwargs: Additional keyword arguments to pass to the overlay function.
+
+    Returns:
+        A GeoDataFrame containing the result of the overlay operation.
+    """
+    return pd.concat(
+        chunkwise(
+            _sjoin_within_first,
+            df1,
+            kwargs={
+                "df2": df2,
+                "to_print": to_print,
+            }
+            | kwargs,
+            processes=processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=backend,
+        ),
+        ignore_index=True,
+    )
+
+
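Each chunk is handled by _sjoin_within_first (shown next in the diff): all rows are first joined cheaply with predicate="within", and only rows without a within-match are joined again with the requested predicate. A hedged usage sketch, not part of the diff, with hypothetical file paths:

# Usage sketch (not part of the diff); the file paths are hypothetical.
import geopandas as gpd
from sgis.parallel.parallel import parallel_sjoin

addresses = gpd.read_file("addresses.gpkg")             # large GeoDataFrame, gets chunked
municipalities = gpd.read_file("municipalities.gpkg")   # attributes to join on

joined = parallel_sjoin(addresses, municipalities, processes=4, predicate="intersects")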
+def _sjoin_within_first(
+    df1, df2, to_print: str | None = None, predicate: str = "intersects", **kwargs
+):
+    if to_print:
+        print(to_print, "- sjoin chunk len:", len(df1))
+
+    df2 = df2.reset_index(drop=True)
+    df2["_from_df2"] = 1
+    df1["_range_idx"] = range(len(df1))
+    joined = df1.sjoin(df2, predicate="within", how="left")
+    within = joined.loc[joined["_from_df2"].notna()].drop(
+        columns=["_from_df2", "_range_idx", "index_right"], errors="raise"
+    )
+    not_within = df1.loc[
+        df1["_range_idx"].isin(joined.loc[joined["_from_df2"].isna(), "_range_idx"])
+    ]
+
+    return pd.concat(
+        [
+            within,
+            not_within.sjoin(df2, predicate=predicate, **kwargs),
+        ],
+        ignore_index=True,
+    )
+
+
+def _clip_rowwise(df1, df2, to_print: str | None = None):
+    geom_col = df2.geometry.name
+
+    def clip_by_one_row(i):
+        this: pd.Series = df2.iloc[i]
+        clipped = df1.clip(this[geom_col])
+        without_geom_col = this.drop(geom_col)
+        clipped.loc[:, without_geom_col.index] = without_geom_col.values
+        if to_print:
+            print(i, to_print, len(clipped))
+        return clipped
+
+    return pd.concat([clip_by_one_row(i) for i in range(len(df2))], ignore_index=True)
+
+
+def _clean_overlay_with_print(
+    df1: GeoDataFrame,
+    df2: GeoDataFrame,
+    how: str = "intersection",
+    to_print: str | None = None,
+    **kwargs,
+) -> GeoDataFrame:
+    if to_print:
+        print(to_print, f"- {how} chunk len:", len(df1))
+    return clean_overlay(df1, df2, how=how, **kwargs)
+
+
 class Parallel:
     """Run functions in parallell.
 
@@ -398,9 +593,7 @@ class Parallel:
         muni_number_col: str = "KOMMUNENR",
         strict: bool = False,
         write_empty: bool = False,
-        clip: bool = True,
-        max_rows_per_chunk: int = 150_000,
-        processes_in_clip: int = 1,
+        id_assign_func: Callable | functools.partial = clean_overlay,
         verbose: bool = True,
     ) -> None:
         """Split multiple datasets into municipalities and write as separate files.
@@ -410,6 +603,7 @@
         in parallel. The intersections themselves can also be run in parallel
         with the 'processes_in_clip' argument.
 
+
         Args:
             in_data: Dictionary with dataset names as keys and file paths or
                 (Geo)DataFrames as values. Note that the files will be read
@@ -435,10 +629,8 @@
                 not have to have the same length as 'in_data'.
             write_empty: If False (default), municipalities with no data will be skipped.
                 If True, an empty parquet file will be written.
-            clip: If True (default), the data will be clipped. If False, the data will
-                be spatial joined.
-            max_rows_per_chunk: Number of rows per data chunk for processing.
-            processes_in_clip: Number of parallel processes for data clipping.
+            id_assign_func: Function to assign ids (e.g. municipality number) to
+                the dataframe for missing values.
             verbose: Whether to print during execution.
 
         """
@@ -448,11 +640,9 @@
             "muni_number_col": muni_number_col,
             "write_empty": write_empty,
             "with_neighbors": with_neighbors,
-            "clip": clip,
-            "max_rows_per_chunk": max_rows_per_chunk,
-            "processes_in_clip": processes_in_clip,
             "strict": strict,
             "verbose": verbose,
+            "id_assign_func": id_assign_func,
         }
 
         if isinstance(out_data, (str, Path)):
@@ -482,51 +672,32 @@
     def chunkwise(
         self,
         func: Callable,
-        df: GeoDataFrame,
+        iterable: Collection[Iterable[Any]],
        args: tuple | None = None,
        kwargs: dict | None = None,
-        n_chunks: int | None = None,
        max_rows_per_chunk: int | None = None,
-        concat: bool = True,
-    ) -> GeoDataFrame:
+    ) -> Collection[Iterable[Any]]:
         """Run a function in parallel on chunks of a (Geo)DataFrame.
 
         Args:
             func: Function to run chunkwise. It should take
-                a (Geo)DataFrame as first argument.
-            df: (Geo)DataFrame to split in n_chunks and passed
+                (a chunk of) the iterable as first argument.
+            iterable: Iterable to split in chunks and passed
                 as first argument to 'func'.
             args: Positional arguments in 'func' after the DataFrame.
             kwargs: Additional keyword arguments in 'func'.
-            n_chunks: Optionally set number of chunks to split
-                'df' into. Defaults to the 'processes' attribute
-                of the Parallel instance.
             max_rows_per_chunk: Alternatively decide number of chunks
                 by a maximum number of rows per chunk.
-            concat: Whether to use pd.concat on the results.
-                Defaults to True.
         """
-        if max_rows_per_chunk is None and n_chunks is None:
-            n_chunks: int = self.processes
-        elif n_chunks is None:
-            n_chunks: int = len(df) // max_rows_per_chunk
-        elif max_rows_per_chunk is not None and len(df) < max_rows_per_chunk:
-            return func(df, *args, **kwargs)
-
-        chunks = np.array_split(np.arange(len(df)), n_chunks)
-
-        df_chunked: list[GeoDataFrame] = [df.iloc[chunk] for chunk in chunks]
-
-        out = self.map(
+        return chunkwise(
             func,
-            df_chunked,
+            iterable,
             args=args,
             kwargs=kwargs,
+            processes=self.processes,
+            max_rows_per_chunk=max_rows_per_chunk,
+            backend=self.backend,
         )
-        if concat:
-            return pd.concat(out, ignore_index=True)
-        else:
-            return out
 
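Because the method now delegates to the module-level chunkwise and returns the per-chunk results, callers concatenate themselves. A hedged usage sketch, not part of the diff; the worker function and file path are hypothetical:

# Usage sketch (not part of the diff); fix_geometries and the file path are hypothetical.
import geopandas as gpd
import pandas as pd
from sgis.parallel.parallel import Parallel

def fix_geometries(chunk):
    chunk = chunk.copy()
    chunk.geometry = chunk.geometry.buffer(0)
    return chunk

gdf = gpd.read_file("buildings.gpkg")
results = Parallel(4, backend="loky").chunkwise(fix_geometries, gdf, max_rows_per_chunk=50_000)
out = pd.concat(results, ignore_index=True)  # concatenation is no longer done by chunkwise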
     def _validate_execution(self, func: Callable) -> None:
         """Multiprocessing doesn't work with local variables in interactive interpreter.
@@ -594,9 +765,7 @@ def write_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-    clip: bool = True,
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    id_assign_func: Callable = clean_overlay,
     strict: bool = True,
     verbose: bool = True,
 ) -> None:
@@ -621,6 +790,8 @@
         processes_in_clip: Number of processes to use for clipping.
         strict: If True (default) and the data has a municipality column,
             all municipality numbers in 'data' must be present in 'municipalities'.
+        id_assign_func: Function to assign ids (e.g. municipality number) to
+            the dataframe for missing values.
         verbose: Whether to print during execution.
 
     Returns:
@@ -639,10 +810,8 @@ def write_municipality_data(
         file_type=file_type,
         func=func,
         write_empty=write_empty,
-        clip=clip,
-        max_rows_per_chunk=max_rows_per_chunk,
-        processes_in_clip=processes_in_clip,
         strict=strict,
+        id_assign_func=id_assign_func,
         verbose=verbose,
     )
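The removed clip, max_rows_per_chunk and processes_in_clip arguments are folded into a single injectable id_assign_func, which defaults to clean_overlay. As a hedged sketch, a caller could mimic the old clip=False behaviour with a hypothetical sjoin-based assigner like the one below and pass it through the id_assign_func argument:

# Sketch only, not part of the diff. The assigner receives the rows that are missing
# a municipality number plus the municipality polygons, and must return a
# (Geo)DataFrame with the municipality number column filled in.
def assign_muni_by_sjoin(missing, municipalities):
    return missing.sjoin(municipalities).drop(columns="index_right")

# e.g. write_municipality_data(..., id_assign_func=assign_muni_by_sjoin)
# or:   id_assign_func=functools.partial(parallel_overlay, processes=4)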
648
817
 
@@ -675,11 +844,10 @@ def _write_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-    clip: bool = True,
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    processes: int = 1,
     strict: bool = True,
     verbose: bool = True,
+    id_assign_func: Callable = clean_overlay,
 ) -> None:
     if verbose:
         to_print = out_folder
@@ -696,22 +864,21 @@
         gdf,
         municipalities,
         muni_number_col,
-        clip,
-        max_rows_per_chunk,
-        processes_in_clip=processes_in_clip,
         strict=strict,
-        to_print=to_print,
+        id_assign_func=id_assign_func,
     )
 
     if municipalities is None:
-        muni_numbers = gdf[muni_number_col]
+        muni_numbers = gdf[muni_number_col].unique()
     elif not isinstance(municipalities, DataFrame):
-        muni_numbers = municipalities
+        muni_numbers = set(municipalities)
     else:
-        muni_numbers = municipalities[muni_number_col]
+        muni_numbers = municipalities[muni_number_col].unique()
+
+    muni_numbers = list(sorted(muni_numbers))
 
     # hardcode this to threading for efficiency in io bound task
-    Parallel(processes_in_clip, backend="threading").map(
+    Parallel(processes, backend="threading").map(
         _write_one_muni,
         muni_numbers,
         kwargs=dict(
@@ -733,11 +900,10 @@ def _write_neighbor_municipality_data(
     file_type: str = "parquet",
     func: Callable | None = None,
     write_empty: bool = False,
-    clip: bool = True,
-    max_rows_per_chunk: int = 150_000,
-    processes_in_clip: int = 1,
+    processes: int = 1,
     strict: bool = True,
     verbose: bool = True,
+    id_assign_func: Callable = clean_overlay,
 ) -> None:
     if verbose:
         to_print = out_folder
@@ -754,11 +920,8 @@
         gdf,
         municipalities,
         muni_number_col,
-        clip,
-        max_rows_per_chunk,
-        processes_in_clip,
         strict=strict,
-        to_print=to_print,
+        id_assign_func=id_assign_func,
     )
 
     if municipalities.index.name != muni_number_col:
@@ -769,7 +932,7 @@
         )
 
     # hardcode this to threading for efficiency in io bound task
-    Parallel(processes_in_clip, backend="threading").map(
+    Parallel(processes, backend="threading").map(
         _write_one_muni_with_neighbors,
         municipalities.index,
         kwargs=dict(
@@ -850,11 +1013,8 @@
     gdf: GeoDataFrame,
     municipalities: GeoDataFrame,
     muni_number_col: str,
-    clip: bool,
-    max_rows_per_chunk: int,
-    processes_in_clip: int,
     strict: bool,
-    to_print: str,
+    id_assign_func: Callable,
 ) -> GeoDataFrame:
     if muni_number_col in gdf and gdf[muni_number_col].notna().all():
         if municipalities is None:
@@ -883,123 +1043,37 @@ def _fix_missing_muni_numbers(
             "GeoDataFrame to clip the geometries by."
         )
 
-    try:
-        municipalities = municipalities[
-            [muni_number_col, municipalities.geometry.name]
-        ].to_crs(gdf.crs)
-    except Exception as e:
-        raise e.__class__(e, to_print) from e
+    municipalities = municipalities[
+        [muni_number_col, municipalities.geometry.name]
+    ].to_crs(gdf.crs)
 
     if muni_number_col in gdf and gdf[muni_number_col].isna().any():
         notna = gdf[gdf[muni_number_col].notna()]
 
         isna = gdf[gdf[muni_number_col].isna()].drop(muni_number_col, axis=1)
 
-        if not clip:
-            notna_anymore = isna.sjoin(municipalities).drop(columns="index_right")
-        else:
-            notna_anymore = parallel_overlay(
-                isna,
-                municipalities[[muni_number_col, municipalities._geometry_column_name]],
-                processes=processes_in_clip,
-                max_rows_per_chunk=max_rows_per_chunk,
-                to_print=to_print,
-            )
-
-        return pd.concat([notna, notna_anymore], ignore_index=True)
-
-    if not clip:
-        return gdf.sjoin(municipalities).drop(columns="index_right")
-    else:
-        return parallel_overlay(
-            gdf,
+        notna_anymore = id_assign_func(
+            isna,
             municipalities[[muni_number_col, municipalities._geometry_column_name]],
-            processes=processes_in_clip,
-            max_rows_per_chunk=max_rows_per_chunk,
-            to_print=to_print,
         )
 
+        return pd.concat([notna, notna_anymore], ignore_index=True)
 
-def parallel_overlay(
-    df1: GeoDataFrame,
-    df2: GeoDataFrame,
-    processes: int,
-    max_rows_per_chunk: int,
-    backend: str = "loky",
-    to_print: str | None = None,
-    **kwargs,
-) -> GeoDataFrame:
-    """Perform spatial overlay operations on two GeoDataFrames in parallel.
-
-    This function splits the first GeoDataFrame into chunks, processes each chunk in parallel using the specified
-    overlay operation with the second GeoDataFrame, and then concatenates the results.
-
-    Note that this function is most useful if df2 has few and simple geometries.
-
-    Args:
-        df1: The first GeoDataFrame for the overlay operation.
-        df2: The second GeoDataFrame for the overlay operation.
-        how: Type of overlay operation ('intersection', 'union', etc.).
-        processes: Number of parallel processes to use.
-        max_rows_per_chunk: Maximum number of rows per chunk for processing. This helps manage memory usage.
-        backend: The parallelization backend to use ('loky', 'multiprocessing', 'threading').
-        to_print: Optional text to print to see progression.
-        **kwargs: Additional keyword arguments to pass to the overlay function.
-
-    Returns:
-        A GeoDataFrame containing the result of the overlay operation.
-    """
-    if len(df1) < max_rows_per_chunk:
-        return clean_overlay(df1, df2, **kwargs)
-
-    n_chunks = len(df1) // max_rows_per_chunk
-    chunks = np.array_split(np.arange(len(df1)), n_chunks)
-
-    try:
-        x_mapper = dict(enumerate(df1.centroid))
-        sorted_xs = dict(reversed(sorted(x_mapper.items(), key=lambda item: item[1])))
-        df1 = df1.iloc[list(sorted_xs)]
-    except TypeError:
-        pass
-
-    df1_chunked: list[GeoDataFrame] = [df1.iloc[chunk] for chunk in chunks]
-
-    out = Parallel(processes, backend=backend).map(
-        _clean_intersection,
-        df1_chunked,
-        args=(df2, to_print) if to_print else (df2,),
-    )
-    return pd.concat(out, ignore_index=True)
-
-
-def _clean_intersection(
-    df1: GeoDataFrame, df2: GeoDataFrame, to_print: str | None = None
-) -> GeoDataFrame:
-    print(to_print, "- intersection chunk len:", len(df1))
-    cols_to_keep = df1.columns.union(df2.columns.difference({df2.geometry.name}))
-    df1["_range_idx"] = range(len(df1))
-    joined = df1.sjoin(df2, predicate="within", how="left")
-    within = joined.loc[joined["_range_idx"].notna(), cols_to_keep]
-    not_within = joined.loc[joined["_range_idx"].isna(), df1.columns]
-    return pd.concat(
-        [
-            within,
-            clean_overlay(not_within, df2, how="intersection"),
-        ],
-        ignore_index=True,
+    return id_assign_func(
+        gdf,
+        municipalities[[muni_number_col, municipalities._geometry_column_name]],
     )
 
 
 def chunkwise(
     func: Callable,
-    df: GeoDataFrame | pd.DataFrame,
-    max_rows_per_chunk: int = 150_000,
-    n_chunks: int | None = None,
+    iterable: Collection[Iterable[Any]],
     args: tuple | None = None,
     kwargs: dict | None = None,
-    n_jobs: int = 1,
+    processes: int = 1,
+    max_rows_per_chunk: int | None = None,
     backend: str = "loky",
-) -> GeoDataFrame | pd.DataFrame:
+) -> Collection[Iterable[Any]]:
     """Run a function in parallel on chunks of a DataFrame.
 
     This method is used to process large (Geo)DataFrames in manageable pieces,
@@ -1008,37 +1082,48 @@ def chunkwise(
     Args:
         func: The function to apply to each chunk. This function must accept a DataFrame as
             its first argument and return a DataFrame.
-        df: The DataFrame to be chunked and processed.
-        max_rows_per_chunk: The maximum number of rows each chunk should contain.
-        n_chunks: The exact number of chunks to divide the dataframe into. If None, it will be
-            calculated based on 'max_rows_per_chunk'.
+        iterable: Iterable to be chunked and processed.
         args: Additional positional arguments to pass to 'func'.
         kwargs: Keyword arguments to pass to 'func'.
-        n_jobs: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        processes: The number of parallel jobs to run. Defaults to 1 (no parallel execution).
+        max_rows_per_chunk: The maximum number of rows each chunk should contain.
         backend: The backend to use for parallel execution (e.g., 'loky', 'multiprocessing').
 
     Returns:
-        GeoDataFrame: A GeoDataFrame resulting from concatenating the results of applying 'func'
-            to each chunk of the original GeoDataFrame.
+        Iterable of iterable.
 
     """
-    if len(df) < max_rows_per_chunk:
-        return func(df, *args, **kwargs)
+    args = args or ()
+    kwargs = kwargs or {}
 
-    if n_chunks is None:
-        n_chunks = len(df) // max_rows_per_chunk
+    if max_rows_per_chunk is None:
+        n_chunks: int = processes
+    else:
+        n_chunks: int = len(iterable) // max_rows_per_chunk
+
+    if n_chunks <= 1:
+        return [func(iterable, *args, **kwargs)]
 
-    chunks = np.array_split(np.arange(len(df)), n_chunks)
+    chunks = np.array_split(np.arange(len(iterable)), n_chunks)
 
-    df_chunked: list[GeoDataFrame] = [df.iloc[chunk] for chunk in chunks]
+    if hasattr(iterable, "iloc"):
+        iterable_chunked: list[pd.DataFrame | pd.Series] = [
+            iterable.iloc[chunk] for chunk in chunks
+        ]
+    elif is_array_like(iterable):
+        iterable_chunked: list[np.ndarray] = [iterable[chunk] for chunk in chunks]
+    else:
+        to_type: type = iterable.__class__
+        iterable_chunked: list[Iterable] = [
+            to_type(chunk) for chunk in np.array_split(list(iterable), n_chunks)
+        ]
 
-    out = Parallel(n_jobs, backend=backend).map(
+    return Parallel(processes, backend=backend).map(
         func,
-        df_chunked,
+        iterable_chunked,
         args=args,
         kwargs=kwargs,
     )
-    return pd.concat(out, ignore_index=True)
 
 
 def _turn_args_into_kwargs(func: Callable, args: tuple, index_start: int) -> dict:
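The module-level chunkwise now accepts any sized iterable, not only (Geo)DataFrames: pandas objects are split with .iloc, array-likes by indexing, and anything else is round-tripped through numpy and rebuilt with its own type. A hedged usage sketch with a plain list, not part of the diff; the worker function is hypothetical:

# Usage sketch (not part of the diff); buffer_all is hypothetical.
from shapely.geometry import Point
from sgis.parallel.parallel import chunkwise

def buffer_all(geometries):
    return [geom.buffer(10) for geom in geometries]

points = [Point(x, 0) for x in range(1_000)]

# Without max_rows_per_chunk, the list is split into 4 chunks (one per process),
# and chunkwise returns the list of per-chunk results.
chunked = chunkwise(buffer_all, points, processes=4)
buffered = [geom for chunk in chunked for geom in chunk]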
sgis/raster/base.py CHANGED
@@ -189,30 +189,6 @@ def _gdf_to_geojson_with_col(gdf: GeoDataFrame, values: np.ndarray) -> list[dict
     ]
 
 
-def _shapely_to_raster(
-    geometry: Geometry,
-    res: int | float,
-    fill: int = 0,
-    all_touched: bool = False,
-    merge_alg: Callable = MergeAlg.replace,
-    default_value: int = 1,
-    dtype: Any | None = None,
-) -> np.array:
-    shape = _get_shape_from_bounds(geometry.bounds, res=res, indexes=1)
-    transform = _get_transform_from_bounds(geometry.bounds, shape)
-
-    return features.rasterize(
-        [(geometry, default_value)],
-        out_shape=shape,
-        transform=transform,
-        fill=fill,
-        all_touched=all_touched,
-        merge_alg=merge_alg,
-        default_value=default_value,
-        dtype=dtype,
-    )
-
-
 @contextmanager
 def memfile_from_array(array: np.ndarray, **profile) -> rasterio.MemoryFile:
     """Yield a memory file from a numpy array."""
@@ -228,33 +204,3 @@ def get_index_mapper(df: pd.DataFrame) -> tuple[dict[int, int], str]:
     idx_mapper = dict(enumerate(df.index))
     idx_name = df.index.name
     return idx_mapper, idx_name
-
-
-NESSECARY_META = [
-    "path",
-    "type",
-    "bounds",
-    "crs",
-]
-
-PROFILE_ATTRS = [
-    "driver",
-    "dtype",
-    "nodata",
-    "crs",
-    "height",
-    "width",
-    "blockysize",
-    "blockxsize",
-    "tiled",
-    "compress",
-    "interleave",
-    "count", # TODO: this should be based on band_index / array depth, so will have no effect
-    "indexes", # TODO
-]
-
-ALLOWED_KEYS = (
-    NESSECARY_META
-    + PROFILE_ATTRS
-    + ["array", "res", "transform", "name", "date", "regex"]
-)