ssb-sgis 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. sgis/__init__.py +10 -3
  2. sgis/debug_config.py +24 -0
  3. sgis/geopandas_tools/bounds.py +16 -21
  4. sgis/geopandas_tools/buffer_dissolve_explode.py +112 -30
  5. sgis/geopandas_tools/centerlines.py +4 -91
  6. sgis/geopandas_tools/cleaning.py +1576 -583
  7. sgis/geopandas_tools/conversion.py +24 -14
  8. sgis/geopandas_tools/duplicates.py +27 -6
  9. sgis/geopandas_tools/general.py +259 -100
  10. sgis/geopandas_tools/geometry_types.py +1 -1
  11. sgis/geopandas_tools/neighbors.py +16 -12
  12. sgis/geopandas_tools/overlay.py +7 -3
  13. sgis/geopandas_tools/point_operations.py +3 -3
  14. sgis/geopandas_tools/polygon_operations.py +505 -100
  15. sgis/geopandas_tools/polygons_as_rings.py +40 -8
  16. sgis/geopandas_tools/sfilter.py +26 -9
  17. sgis/io/dapla_functions.py +238 -19
  18. sgis/maps/examine.py +11 -10
  19. sgis/maps/explore.py +227 -155
  20. sgis/maps/legend.py +13 -4
  21. sgis/maps/map.py +22 -13
  22. sgis/maps/maps.py +100 -29
  23. sgis/maps/thematicmap.py +25 -18
  24. sgis/networkanalysis/_service_area.py +6 -1
  25. sgis/networkanalysis/cutting_lines.py +12 -5
  26. sgis/networkanalysis/finding_isolated_networks.py +13 -6
  27. sgis/networkanalysis/networkanalysis.py +10 -12
  28. sgis/parallel/parallel.py +27 -10
  29. sgis/raster/base.py +208 -0
  30. sgis/raster/cube.py +3 -3
  31. sgis/raster/image_collection.py +1421 -724
  32. sgis/raster/indices.py +10 -7
  33. sgis/raster/raster.py +7 -7
  34. sgis/raster/sentinel_config.py +33 -17
  35. {ssb_sgis-1.0.3.dist-info → ssb_sgis-1.0.5.dist-info}/METADATA +6 -7
  36. ssb_sgis-1.0.5.dist-info/RECORD +62 -0
  37. ssb_sgis-1.0.3.dist-info/RECORD +0 -61
  38. {ssb_sgis-1.0.3.dist-info → ssb_sgis-1.0.5.dist-info}/LICENSE +0 -0
  39. {ssb_sgis-1.0.3.dist-info → ssb_sgis-1.0.5.dist-info}/WHEEL +0 -0
@@ -8,6 +8,7 @@ from geopandas import GeoSeries
8
8
  from geopandas.array import GeometryArray
9
9
  from numpy.typing import NDArray
10
10
  from pyproj import CRS
11
+ from shapely import difference
11
12
  from shapely import get_coordinates
12
13
  from shapely import get_exterior_ring
13
14
  from shapely import get_interior_ring
@@ -320,14 +321,14 @@ class PolygonsAsRings:
320
321
 
321
322
  exterior = self.rings.loc[self.is_exterior].sort_index()
322
323
  assert exterior.shape == (len(self.gdf),)
324
+ nonempty_exteriors = exterior.loc[lambda x: x.notna()]
325
+ empty_exteriors = exterior.loc[lambda x: x.isna()]
323
326
 
324
327
  nonempty_interiors = self.rings.loc[self.is_interior]
325
328
 
326
329
  if not len(nonempty_interiors):
327
- try:
328
- return make_valid(polygons(exterior.values))
329
- except Exception:
330
- return _geoms_to_linearrings_fallback(exterior).values
330
+ nonempty_exteriors.loc[:] = make_valid(polygons(nonempty_exteriors.values))
331
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
331
332
 
332
333
  empty_interiors = pd.Series(
333
334
  [None for _ in range(len(self.gdf) * self.max_rings)],
@@ -343,10 +344,41 @@ class PolygonsAsRings:
343
344
  )
344
345
  assert interiors.shape == (len(self.gdf), self.max_rings), interiors.shape
345
346
 
346
- try:
347
- return make_valid(polygons(exterior.values, interiors.values))
348
- except Exception:
349
- return _geoms_to_linearrings_fallback(exterior, interiors).values
347
+ interiors = interiors.loc[
348
+ interiors.index.get_level_values(1).isin(
349
+ nonempty_exteriors.index.get_level_values(1)
350
+ )
351
+ ]
352
+ assert interiors.index.get_level_values(1).equals(
353
+ nonempty_exteriors.index.get_level_values(1)
354
+ )
355
+
356
+ # nan gives TypeError in shapely.polygons. None does not.
357
+ for i, _ in enumerate(interiors.columns):
358
+ interiors.loc[interiors.iloc[:, i].isna(), i] = None
359
+ nonempty_exteriors.loc[nonempty_exteriors.isna()] = None
360
+
361
+ # construct polygons with holes
362
+ polys = make_valid(
363
+ polygons(
364
+ nonempty_exteriors.values,
365
+ interiors.values,
366
+ )
367
+ )
368
+
369
+ # interiors might have moved (e.g. snapped) so that they are not within the exterior
370
+ # these interiors will not be holes, so we need to erase them manually
371
+ interiors_as_polys = make_valid(polygons(interiors.values))
372
+ # merge interior polygons into 1d array
373
+ interiors_as_polys = np.array(
374
+ [
375
+ make_valid(unary_union(interiors_as_polys[i, :]))
376
+ for i in range(interiors_as_polys.shape[0])
377
+ ]
378
+ )
379
+ # erase rowwise
380
+ nonempty_exteriors.loc[:] = make_valid(difference(polys, interiors_as_polys))
381
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
350
382
 
351
383
 
352
384
  def get_linearring_series(geoms: GeoDataFrame | GeoSeries) -> pd.Series:
@@ -4,7 +4,9 @@ import numpy as np
4
4
  import pandas as pd
5
5
  from geopandas import GeoDataFrame
6
6
  from geopandas import GeoSeries
7
+ from geopandas import __version__ as geopandas_version
7
8
  from shapely import Geometry
9
+ from shapely import STRtree
8
10
 
9
11
  from .conversion import to_gdf
10
12
 
@@ -15,6 +17,7 @@ def sfilter(
15
17
  gdf: GeoDataFrame | GeoSeries,
16
18
  other: GeoDataFrame | GeoSeries | Geometry,
17
19
  predicate: str = "intersects",
20
+ distance: int | float | None = None,
18
21
  ) -> GeoDataFrame:
19
22
  """Filter a GeoDataFrame or GeoSeries by spatial predicate.
20
23
 
@@ -29,6 +32,7 @@ def sfilter(
29
32
  gdf: The GeoDataFrame.
30
33
  other: The geometry object to filter 'gdf' by.
31
34
  predicate: Spatial predicate to use. Defaults to 'intersects'.
35
+ distance: Max distance to allow if predicate=="dwithin".
32
36
 
33
37
  Returns:
34
38
  A copy of 'gdf' with only the rows matching the
@@ -66,7 +70,7 @@ def sfilter(
66
70
  Also equivalent to using the intersects method, which
67
71
  is often a lot slower since df2 must be dissolved:
68
72
 
69
- >>> df1.loc[df1.intersects(df2.unary_union)]
73
+ >>> df1.loc[df1.intersects(df2.union_all())]
70
74
  geometry
71
75
  0 POINT (0.00000 0.00000)
72
76
 
@@ -76,7 +80,7 @@ def sfilter(
76
80
 
77
81
  other = _sfilter_checks(other, crs=gdf.crs)
78
82
 
79
- indices = _get_sfilter_indices(gdf, other, predicate)
83
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
80
84
 
81
85
  return gdf.iloc[indices]
82
86
 
@@ -85,6 +89,7 @@ def sfilter_split(
85
89
  gdf: GeoDataFrame | GeoSeries,
86
90
  other: GeoDataFrame | GeoSeries | Geometry,
87
91
  predicate: str = "intersects",
92
+ distance: int | float | None = None,
88
93
  ) -> tuple[GeoDataFrame, GeoDataFrame]:
89
94
  """Split a GeoDataFrame or GeoSeries by spatial predicate.
90
95
 
@@ -95,6 +100,7 @@ def sfilter_split(
95
100
  gdf: The GeoDataFrame.
96
101
  other: The geometry object to filter 'gdf' by.
97
102
  predicate: Spatial predicate to use. Defaults to 'intersects'.
103
+ distance: Max distance to allow if predicate=="dwithin".
98
104
 
99
105
  Returns:
100
106
  A tuple of GeoDataFrames, one with the rows that match the spatial predicate
@@ -135,7 +141,7 @@ def sfilter_split(
135
141
  Also equivalent to using the intersects method, which
136
142
  is often slower since df2 must be dissolved:
137
143
 
138
- >>> filt = df1.intersects(df2.unary_union)
144
+ >>> filt = df1.intersects(df2.union_all())
139
145
  >>> intersecting = df1.loc[filt]
140
146
  >>> not_intersecting = df1.loc[~filt]
141
147
 
@@ -145,7 +151,7 @@ def sfilter_split(
145
151
 
146
152
  other = _sfilter_checks(other, crs=gdf.crs)
147
153
 
148
- indices = _get_sfilter_indices(gdf, other, predicate)
154
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
149
155
 
150
156
  return (
151
157
  gdf.iloc[indices],
@@ -157,6 +163,7 @@ def sfilter_inverse(
157
163
  gdf: GeoDataFrame | GeoSeries,
158
164
  other: GeoDataFrame | GeoSeries | Geometry,
159
165
  predicate: str = "intersects",
166
+ distance: int | float | None = None,
160
167
  ) -> GeoDataFrame | GeoSeries:
161
168
  """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
162
169
 
@@ -166,6 +173,7 @@ def sfilter_inverse(
166
173
  gdf: The GeoDataFrame or GeoSeries.
167
174
  other: The geometry object to filter 'gdf' by.
168
175
  predicate: Spatial predicate to use. Defaults to 'intersects'.
176
+ distance: Max distance to allow if predicate=="dwithin".
169
177
 
170
178
  Returns:
171
179
  A copy of 'gdf' with only the rows that do not match the
@@ -202,7 +210,7 @@ def sfilter_inverse(
202
210
  Also equivalent to using the intersects method, which
203
211
  is often slower since df2 must be dissolved:
204
212
 
205
- >>> not_intersecting = df1.loc[~df1.intersects(df2.unary_union)]
213
+ >>> not_intersecting = df1.loc[~df1.intersects(df2.union_all())]
206
214
 
207
215
  """
208
216
  if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
@@ -210,7 +218,7 @@ def sfilter_inverse(
210
218
 
211
219
  other = _sfilter_checks(other, crs=gdf.crs)
212
220
 
213
- indices = _get_sfilter_indices(gdf, other, predicate)
221
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
214
222
 
215
223
  return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
216
224
 
@@ -243,6 +251,7 @@ def _get_sfilter_indices(
243
251
  left: GeoDataFrame | GeoSeries,
244
252
  right: GeoDataFrame | GeoSeries | Geometry,
245
253
  predicate: str,
254
+ distance: int | float | None,
246
255
  ) -> np.ndarray:
247
256
  """Compute geometric comparisons and get matching indices.
248
257
 
@@ -276,17 +285,25 @@ def _get_sfilter_indices(
276
285
  # contains is a faster predicate
277
286
  # see discussion at https://github.com/geopandas/geopandas/pull/1421
278
287
  predicate = "contains"
279
- sindex = left.sindex
288
+ sindex, kwargs = _get_spatial_tree(left)
280
289
  input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
281
290
  else:
282
291
  # all other predicates are symmetric
283
292
  # keep them the same
284
- sindex = right.sindex
293
+ sindex, kwargs = _get_spatial_tree(right)
285
294
  input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
286
295
 
287
- l_idx, r_idx = sindex.query(input_geoms, predicate=predicate, sort=False)
296
+ l_idx, r_idx = sindex.query(
297
+ input_geoms, predicate=predicate, distance=distance, **kwargs
298
+ )
288
299
 
289
300
  if original_predicate == "within":
290
301
  return np.sort(np.unique(r_idx))
291
302
 
292
303
  return np.sort(np.unique(l_idx))
304
+
305
+
306
+ def _get_spatial_tree(df):
307
+ if int(geopandas_version[0]) >= 1:
308
+ return df.sindex, {"sort": False}
309
+ return STRtree(df.geometry.values), {}
@@ -1,24 +1,37 @@
1
1
  """Functions for reading and writing GeoDataFrames in Statistics Norway's GCS Dapla."""
2
2
 
3
+ import json
4
+ import multiprocessing
5
+ import os
6
+ from collections.abc import Iterable
3
7
  from pathlib import Path
4
8
 
5
9
  import dapla as dp
6
10
  import geopandas as gpd
7
11
  import joblib
8
12
  import pandas as pd
13
+ import pyarrow
14
+ import shapely
9
15
  from geopandas import GeoDataFrame
16
+ from geopandas import GeoSeries
10
17
  from geopandas.io.arrow import _geopandas_to_arrow
11
18
  from pandas import DataFrame
12
- from pyarrow import parquet
19
+ from pyarrow import ArrowInvalid
20
+
21
+ from ..geopandas_tools.sfilter import sfilter
22
+
23
+ PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
13
24
 
14
25
 
15
26
  def read_geopandas(
16
- gcs_path: str | Path | list[str | Path],
27
+ gcs_path: str | Path | list[str | Path] | tuple[str | Path] | GeoSeries,
17
28
  pandas_fallback: bool = False,
18
29
  file_system: dp.gcs.GCSFileSystem | None = None,
30
+ mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
31
+ threads: int | None = None,
19
32
  **kwargs,
20
33
  ) -> GeoDataFrame | DataFrame:
21
- """Reads geoparquet or other geodata from a file on GCS.
34
+ """Reads geoparquet or other geodata from one or more files on GCS.
22
35
 
23
36
  If the file has 0 rows, the contents will be returned as a pandas.DataFrame,
24
37
  since geopandas does not read and write empty tables.
@@ -33,6 +46,11 @@ def read_geopandas(
33
46
  not be read with geopandas and the number of rows is more than 0. If True,
34
47
  the file will be read with pandas if geopandas fails.
35
48
  file_system: Optional file system.
49
+ mask: Optional geometry mask to keep only intersecting geometries.
50
+ If 'gcs_path' is an iterable of multiple paths, only the files
51
+ with a bbox that intersects the mask are read, then filtered by location.
52
+ threads: Number of threads to use if reading multiple files. Defaults to
53
+ the number of files to read or the number of available threads (if lower).
36
54
  **kwargs: Additional keyword arguments passed to geopandas' read_parquet
37
55
  or read_file, depending on the file type.
38
56
 
@@ -42,14 +60,52 @@ def read_geopandas(
42
60
  if file_system is None:
43
61
  file_system = dp.FileClient.get_gcs_file_system()
44
62
 
45
- if isinstance(gcs_path, (list, tuple)):
63
+ if not isinstance(gcs_path, (str | Path | os.PathLike)):
46
64
  kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
65
+
66
+ if mask is not None:
67
+ if not isinstance(gcs_path, GeoSeries):
68
+ bounds_series: GeoSeries = get_bounds_series(
69
+ gcs_path,
70
+ file_system,
71
+ threads=threads,
72
+ pandas_fallback=pandas_fallback,
73
+ )
74
+ else:
75
+ bounds_series = gcs_path
76
+ new_bounds_series = sfilter(bounds_series, mask)
77
+ if not len(new_bounds_series):
78
+ if isinstance(kwargs.get("columns"), Iterable):
79
+ cols = {col: [] for col in kwargs["columns"]}
80
+ else:
81
+ cols = {}
82
+ for path in bounds_series.index:
83
+ try:
84
+ cols |= {col: [] for col in _get_columns(path, file_system)}
85
+ except ArrowInvalid as e:
86
+ if file_system.isfile(path):
87
+ raise ArrowInvalid(e, path) from e
88
+
89
+ return GeoDataFrame(cols | {"geometry": []})
90
+ paths = list(new_bounds_series.index)
91
+ else:
92
+ if isinstance(gcs_path, GeoSeries):
93
+ paths = list(gcs_path.index)
94
+ else:
95
+ paths = list(gcs_path)
96
+
97
+ if threads is None:
98
+ threads = min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
99
+
47
100
  # recursive read with threads
48
- with joblib.Parallel(n_jobs=len(gcs_path), backend="threading") as parallel:
101
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
49
102
  dfs: list[GeoDataFrame] = parallel(
50
- joblib.delayed(read_geopandas)(x, **kwargs) for x in gcs_path
103
+ joblib.delayed(read_geopandas)(x, **kwargs) for x in paths
51
104
  )
52
- return pd.concat(dfs)
105
+ df = pd.concat(dfs)
106
+ if mask is not None:
107
+ return sfilter(df, mask)
108
+ return df
53
109
 
54
110
  if not isinstance(gcs_path, str):
55
111
  try:
@@ -60,20 +116,26 @@ def read_geopandas(
60
116
  if "parquet" in gcs_path or "prqt" in gcs_path:
61
117
  with file_system.open(gcs_path, mode="rb") as file:
62
118
  try:
63
- return gpd.read_parquet(file, **kwargs)
119
+ df = gpd.read_parquet(file, **kwargs)
64
120
  except ValueError as e:
65
121
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
66
- raise e
122
+ raise e.__class__(
123
+ f"{e.__class__.__name__}: {e} for {gcs_path}."
124
+ ) from e
67
125
  df = dp.read_pandas(gcs_path, **kwargs)
68
126
 
69
127
  if pandas_fallback or not len(df):
70
128
  return df
71
129
  else:
72
- raise e
130
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
131
+ raise e.__class__(
132
+ f"{e.__class__.__name__}: {e} for {df}." + more_txt
133
+ ) from e
134
+
73
135
  else:
74
136
  with file_system.open(gcs_path, mode="rb") as file:
75
137
  try:
76
- return gpd.read_file(file, **kwargs)
138
+ df = gpd.read_file(file, **kwargs)
77
139
  except ValueError as e:
78
140
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
79
141
  raise e
@@ -82,7 +144,144 @@ def read_geopandas(
82
144
  if pandas_fallback or not len(df):
83
145
  return df
84
146
  else:
85
- raise e
147
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
148
+ raise e.__class__(
149
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
150
+ ) from e
151
+
152
+ if mask is not None:
153
+ return sfilter(df, mask)
154
+ return df
155
+
156
+
157
+ def _get_bounds_parquet(
158
+ path: str | Path, file_system: dp.gcs.GCSFileSystem, pandas_fallback: bool = False
159
+ ) -> tuple[list[float], dict] | tuple[None, None]:
160
+ with file_system.open(path) as f:
161
+ try:
162
+ num_rows = pyarrow.parquet.read_metadata(f).num_rows
163
+ except ArrowInvalid as e:
164
+ if not file_system.isfile(f):
165
+ return None, None
166
+ raise ArrowInvalid(e, path) from e
167
+ if not num_rows:
168
+ return None, None
169
+ meta = pyarrow.parquet.read_schema(f).metadata
170
+ try:
171
+ meta = json.loads(meta[b"geo"])["columns"]["geometry"]
172
+ except KeyError as e:
173
+ if pandas_fallback:
174
+ return None, None
175
+ raise KeyError(
176
+ f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
177
+ # f"{num_rows=}",
178
+ # meta,
179
+ ) from e
180
+ return meta["bbox"], meta["crs"]
181
+
182
+
183
+ def _get_columns(path: str | Path, file_system: dp.gcs.GCSFileSystem) -> pd.Index:
184
+ with file_system.open(path) as f:
185
+ schema = pyarrow.parquet.read_schema(f)
186
+ index_cols = _get_index_cols(schema)
187
+ return pd.Index(schema.names).difference(index_cols)
188
+
189
+
190
+ def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
191
+ cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
192
+ return [x for x in cols if not isinstance(x, dict)]
193
+
194
+
195
+ def get_bounds_series(
196
+ paths: list[str | Path] | tuple[str | Path],
197
+ file_system: dp.gcs.GCSFileSystem | None = None,
198
+ threads: int | None = None,
199
+ pandas_fallback: bool = False,
200
+ ) -> GeoSeries:
201
+ """Get a GeoSeries with file paths as indexes and the file's bounds as values.
202
+
203
+ The returned GeoSeries can be used as the first argument of 'read_geopandas'
204
+ along with the 'mask' keyword.
205
+
206
+ Args:
207
+ paths: Iterable of file paths in gcs.
208
+ file_system: Optional instance of dp.gcs.GCSFileSystem.
209
+ If None, an instance is created within the function.
210
+ Note that this is slower in long loops.
211
+ threads: Number of threads to use if reading multiple files. Defaults to
212
+ the number of files to read or the number of available threads (if lower).
213
+ pandas_fallback: If False (default), an exception is raised if the file has
214
+ no geo metadata. If True, the geometry value is set to None for this file.
215
+
216
+ Returns:
217
+ A geopandas.GeoSeries with file paths as indexes and bounds as values.
218
+
219
+ Examples:
220
+ ---------
221
+ >>> import sgis as sg
222
+ >>> import dapla as dp
223
+ >>> file_system = dp.FileClient.get_gcs_file_system()
224
+ >>> all_paths = file_system.ls("...")
225
+
226
+ Get the bounds of all your file paths, indexed by path.
227
+
228
+ >>> bounds_series = sg.get_bounds_series(all_paths, file_system)
229
+ >>> bounds_series
230
+ .../0301.parquet POLYGON ((273514.334 6638380.233, 273514.334 6...
231
+ .../1101.parquet POLYGON ((6464.463 6503547.192, 6464.463 65299...
232
+ .../1103.parquet POLYGON ((-6282.301 6564097.347, -6282.301 660...
233
+ .../1106.parquet POLYGON ((-46359.891 6622984.385, -46359.891 6...
234
+ .../1108.parquet POLYGON ((30490.798 6551661.467, 30490.798 658...
235
+ ...
236
+ .../5628.parquet POLYGON ((1019391.867 7809550.777, 1019391.867...
237
+ .../5630.parquet POLYGON ((1017907.145 7893398.317, 1017907.145...
238
+ .../5632.parquet POLYGON ((1075687.587 7887714.263, 1075687.587...
239
+ .../5634.parquet POLYGON ((1103447.451 7874551.663, 1103447.451...
240
+ .../5636.parquet POLYGON ((1024129.618 7838961.91, 1024129.618 ...
241
+ Length: 357, dtype: geometry
242
+
243
+ Make a grid around the total bounds of the files,
244
+ and read geometries intersecting with the mask in a loop.
245
+
246
+ >>> grid = sg.make_grid(bounds_series, 10_000)
247
+ >>> for mask in grid.geometry:
248
+ ... df = sg.read_geopandas(
249
+ ... bounds_series,
250
+ ... mask=mask,
251
+ ... file_system=file_system,
252
+ ... )
253
+
254
+ """
255
+ if file_system is None:
256
+ file_system = dp.FileClient.get_gcs_file_system()
257
+
258
+ if threads is None:
259
+ threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
260
+
261
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
262
+ bounds: list[tuple[list[float], dict]] = parallel(
263
+ joblib.delayed(_get_bounds_parquet)(
264
+ path, file_system=file_system, pandas_fallback=pandas_fallback
265
+ )
266
+ for path in paths
267
+ )
268
+ crss = {json.dumps(x[1]) for x in bounds}
269
+ crss = {
270
+ crs
271
+ for crs in crss
272
+ if not any(str(crs).lower() == txt for txt in ["none", "null"])
273
+ }
274
+ if not crss:
275
+ crs = None
276
+ elif len(crss) == 1:
277
+ crs = next(iter(crss))
278
+ else:
279
+ raise ValueError(f"crs mismatch: {crss}")
280
+ return GeoSeries(
281
+ [shapely.box(*bbox[0]) if bbox[0] is not None else None for bbox in bounds],
282
+ index=paths,
283
+ crs=crs,
284
+ )
86
285
 
87
286
 
88
287
  def write_geopandas(
@@ -91,6 +290,7 @@ def write_geopandas(
91
290
  overwrite: bool = True,
92
291
  pandas_fallback: bool = False,
93
292
  file_system: dp.gcs.GCSFileSystem | None = None,
293
+ write_covering_bbox: bool = False,
94
294
  **kwargs,
95
295
  ) -> None:
96
296
  """Writes a GeoDataFrame to the specified format.
@@ -106,6 +306,13 @@ def write_geopandas(
106
306
  not be written with geopandas and the number of rows is more than 0. If True,
107
307
  the file will be written without geo-metadata if >0 rows.
108
308
  file_system: Optional file system.
309
+ write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
310
+ Writing a bbox column can be computationally expensive, but allows you to specify
311
+ a bbox in :func:`read_parquet` for filtered reading.
312
+ Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
313
+ considered as experimental. While writing the column is backwards compatible, using it
314
+ for filtering may not be supported by all readers.
315
+
109
316
  **kwargs: Additional keyword arguments passed to parquet.write_table
110
317
  (for parquet) or geopandas' to_file method (if not parquet).
111
318
  """
@@ -118,25 +325,37 @@ def write_geopandas(
118
325
  if not overwrite and exists(gcs_path):
119
326
  raise ValueError("File already exists.")
120
327
 
121
- if file_system is None:
122
- file_system = dp.FileClient.get_gcs_file_system()
123
-
124
328
  if not isinstance(df, GeoDataFrame):
125
329
  raise ValueError("DataFrame must be GeoDataFrame.")
126
330
 
331
+ if file_system is None:
332
+ file_system = dp.FileClient.get_gcs_file_system()
333
+
127
334
  if not len(df):
128
335
  if pandas_fallback:
129
- df.geometry = df.geometry.astype(str)
130
336
  df = pd.DataFrame(df)
131
- dp.write_pandas(df, gcs_path, **kwargs)
337
+ df.geometry = df.geometry.astype(str)
338
+ df.geometry = None
339
+ try:
340
+ dp.write_pandas(df, gcs_path, **kwargs)
341
+ except Exception as e:
342
+ more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
343
+ raise e.__class__(
344
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
345
+ ) from e
132
346
  return
133
347
 
134
348
  file_system = dp.FileClient.get_gcs_file_system()
135
349
 
136
350
  if ".parquet" in gcs_path or "prqt" in gcs_path:
137
351
  with file_system.open(gcs_path, mode="wb") as buffer:
138
- table = _geopandas_to_arrow(df, index=df.index, schema_version=None)
139
- parquet.write_table(table, buffer, compression="snappy", **kwargs)
352
+ table = _geopandas_to_arrow(
353
+ df,
354
+ index=df.index,
355
+ schema_version=None,
356
+ write_covering_bbox=write_covering_bbox,
357
+ )
358
+ pyarrow.parquet.write_table(table, buffer, compression="snappy", **kwargs)
140
359
  return
141
360
 
142
361
  layer = kwargs.pop("layer", None)
sgis/maps/examine.py CHANGED
@@ -90,13 +90,13 @@ class Examine:
90
90
  **kwargs: Additional keyword arguments passed to sgis.clipmap.
91
91
 
92
92
  """
93
- gdfs, column, kwargs = Map._separate_args(gdfs, column, kwargs)
94
-
95
93
  if mask_gdf is None:
96
94
  self.mask_gdf = gdfs[0]
97
95
  else:
98
96
  self.mask_gdf = mask_gdf
99
97
 
98
+ gdfs, column, kwargs = Map._separate_args(gdfs, column, kwargs)
99
+
100
100
  m = Explore(*gdfs, column=column, **kwargs)
101
101
 
102
102
  # m = Map(*gdfs, column=column, **kwargs)
@@ -159,14 +159,15 @@ class Examine:
159
159
  print("All rows are shown.")
160
160
  return
161
161
 
162
- print(f"i == {self.i} (of {len(self.mask_gdf)})")
163
- clipmap(
162
+ print(f"i == {self.i} (max. {len(self.mask_gdf)- 1})")
163
+ self.explorer = clipmap(
164
164
  self.column,
165
165
  *list(self.rasters.values()),
166
166
  **self._gdfs,
167
167
  mask=self.mask_gdf.iloc[[self.i]].buffer(self.size),
168
168
  **self.kwargs,
169
169
  )
170
+
170
171
  self.i += 1
171
172
 
172
173
  def sample(self, **kwargs) -> None:
@@ -182,7 +183,7 @@ class Examine:
182
183
  i = np.random.randint(0, len(self.mask_gdf))
183
184
 
184
185
  print(f"Showing index {i}")
185
- clipmap(
186
+ self.explorer = clipmap(
186
187
  self.column,
187
188
  *list(self.rasters.values()),
188
189
  **self._gdfs,
@@ -202,7 +203,7 @@ class Examine:
202
203
  self.i = i
203
204
 
204
205
  print(f"{self.i + 1} of {len(self.mask_gdf)}")
205
- clipmap(
206
+ self.explorer = clipmap(
206
207
  self.column,
207
208
  *list(self.rasters.values()),
208
209
  **self._gdfs,
@@ -216,7 +217,7 @@ class Examine:
216
217
  kwargs = self._fix_kwargs(kwargs)
217
218
  self.kwargs = self.kwargs | kwargs
218
219
 
219
- explore(
220
+ self.explorer = explore(
220
221
  *list(self.rasters.values()),
221
222
  **self._gdfs,
222
223
  column=self.column,
@@ -229,7 +230,7 @@ class Examine:
229
230
  kwargs = self._fix_kwargs(kwargs)
230
231
  self.kwargs = self.kwargs | kwargs
231
232
 
232
- clipmap(
233
+ self.explorer = clipmap(
233
234
  *list(self.rasters.values()),
234
235
  **self._gdfs,
235
236
  column=self.column,
@@ -242,7 +243,7 @@ class Examine:
242
243
  kwargs = self._fix_kwargs(kwargs)
243
244
  self.kwargs = self.kwargs | kwargs
244
245
 
245
- samplemap(
246
+ self.explorer = samplemap(
246
247
  *list(self.rasters.values()),
247
248
  **self._gdfs,
248
249
  column=self.column,
@@ -252,7 +253,7 @@ class Examine:
252
253
  @property
253
254
  def mask(self) -> gpd.GeoDataFrame:
254
255
  """Returns a GeoDataFrame of the last shown mask geometry."""
255
- return self.mask_gdf.iloc[[self.i]]
256
+ return self.mask_gdf.iloc[[self.i - 1]]
256
257
 
257
258
  @property
258
259
  def gdfs(self) -> dict[str, gpd.GeoDataFrame]: