ssb-sgis 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. sgis/__init__.py +20 -9
  2. sgis/debug_config.py +24 -0
  3. sgis/exceptions.py +2 -2
  4. sgis/geopandas_tools/bounds.py +33 -36
  5. sgis/geopandas_tools/buffer_dissolve_explode.py +136 -35
  6. sgis/geopandas_tools/centerlines.py +4 -91
  7. sgis/geopandas_tools/cleaning.py +1576 -583
  8. sgis/geopandas_tools/conversion.py +38 -19
  9. sgis/geopandas_tools/duplicates.py +29 -8
  10. sgis/geopandas_tools/general.py +263 -100
  11. sgis/geopandas_tools/geometry_types.py +4 -4
  12. sgis/geopandas_tools/neighbors.py +19 -15
  13. sgis/geopandas_tools/overlay.py +2 -2
  14. sgis/geopandas_tools/point_operations.py +5 -5
  15. sgis/geopandas_tools/polygon_operations.py +510 -105
  16. sgis/geopandas_tools/polygons_as_rings.py +40 -8
  17. sgis/geopandas_tools/sfilter.py +29 -12
  18. sgis/helpers.py +3 -3
  19. sgis/io/dapla_functions.py +238 -19
  20. sgis/io/read_parquet.py +1 -1
  21. sgis/maps/examine.py +27 -12
  22. sgis/maps/explore.py +450 -65
  23. sgis/maps/legend.py +177 -76
  24. sgis/maps/map.py +206 -103
  25. sgis/maps/maps.py +178 -105
  26. sgis/maps/thematicmap.py +243 -83
  27. sgis/networkanalysis/_service_area.py +6 -1
  28. sgis/networkanalysis/closing_network_holes.py +2 -2
  29. sgis/networkanalysis/cutting_lines.py +15 -8
  30. sgis/networkanalysis/directednetwork.py +1 -1
  31. sgis/networkanalysis/finding_isolated_networks.py +15 -8
  32. sgis/networkanalysis/networkanalysis.py +17 -19
  33. sgis/networkanalysis/networkanalysisrules.py +1 -1
  34. sgis/networkanalysis/traveling_salesman.py +1 -1
  35. sgis/parallel/parallel.py +64 -27
  36. sgis/raster/__init__.py +0 -6
  37. sgis/raster/base.py +208 -0
  38. sgis/raster/cube.py +54 -8
  39. sgis/raster/image_collection.py +3257 -0
  40. sgis/raster/indices.py +17 -5
  41. sgis/raster/raster.py +138 -243
  42. sgis/raster/sentinel_config.py +120 -0
  43. sgis/raster/zonal.py +0 -1
  44. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/METADATA +6 -7
  45. ssb_sgis-1.0.4.dist-info/RECORD +62 -0
  46. sgis/raster/methods_as_functions.py +0 -0
  47. sgis/raster/torchgeo.py +0 -171
  48. ssb_sgis-1.0.2.dist-info/RECORD +0 -61
  49. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/LICENSE +0 -0
  50. {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/WHEEL +0 -0
@@ -8,6 +8,7 @@ from geopandas import GeoSeries
8
8
  from geopandas.array import GeometryArray
9
9
  from numpy.typing import NDArray
10
10
  from pyproj import CRS
11
+ from shapely import difference
11
12
  from shapely import get_coordinates
12
13
  from shapely import get_exterior_ring
13
14
  from shapely import get_interior_ring
@@ -320,14 +321,14 @@ class PolygonsAsRings:
320
321
 
321
322
  exterior = self.rings.loc[self.is_exterior].sort_index()
322
323
  assert exterior.shape == (len(self.gdf),)
324
+ nonempty_exteriors = exterior.loc[lambda x: x.notna()]
325
+ empty_exteriors = exterior.loc[lambda x: x.isna()]
323
326
 
324
327
  nonempty_interiors = self.rings.loc[self.is_interior]
325
328
 
326
329
  if not len(nonempty_interiors):
327
- try:
328
- return make_valid(polygons(exterior.values))
329
- except Exception:
330
- return _geoms_to_linearrings_fallback(exterior).values
330
+ nonempty_exteriors.loc[:] = make_valid(polygons(nonempty_exteriors.values))
331
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
331
332
 
332
333
  empty_interiors = pd.Series(
333
334
  [None for _ in range(len(self.gdf) * self.max_rings)],
@@ -343,10 +344,41 @@ class PolygonsAsRings:
343
344
  )
344
345
  assert interiors.shape == (len(self.gdf), self.max_rings), interiors.shape
345
346
 
346
- try:
347
- return make_valid(polygons(exterior.values, interiors.values))
348
- except Exception:
349
- return _geoms_to_linearrings_fallback(exterior, interiors).values
347
+ interiors = interiors.loc[
348
+ interiors.index.get_level_values(1).isin(
349
+ nonempty_exteriors.index.get_level_values(1)
350
+ )
351
+ ]
352
+ assert interiors.index.get_level_values(1).equals(
353
+ nonempty_exteriors.index.get_level_values(1)
354
+ )
355
+
356
+ # nan gives TypeError in shapely.polygons. None does not.
357
+ for i, _ in enumerate(interiors.columns):
358
+ interiors.loc[interiors.iloc[:, i].isna(), i] = None
359
+ nonempty_exteriors.loc[nonempty_exteriors.isna()] = None
360
+
361
+ # construct polygons with holes
362
+ polys = make_valid(
363
+ polygons(
364
+ nonempty_exteriors.values,
365
+ interiors.values,
366
+ )
367
+ )
368
+
369
+ # interiors might have moved (e.g. snapped) so that they are not within the exterior
370
+ # these interiors will not be holes, so we need to erase them manually
371
+ interiors_as_polys = make_valid(polygons(interiors.values))
372
+ # merge interior polygons into 1d array
373
+ interiors_as_polys = np.array(
374
+ [
375
+ make_valid(unary_union(interiors_as_polys[i, :]))
376
+ for i in range(interiors_as_polys.shape[0])
377
+ ]
378
+ )
379
+ # erase rowwise
380
+ nonempty_exteriors.loc[:] = make_valid(difference(polys, interiors_as_polys))
381
+ return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
350
382
 
351
383
 
352
384
  def get_linearring_series(geoms: GeoDataFrame | GeoSeries) -> pd.Series:
@@ -4,7 +4,9 @@ import numpy as np
4
4
  import pandas as pd
5
5
  from geopandas import GeoDataFrame
6
6
  from geopandas import GeoSeries
7
+ from geopandas import __version__ as geopandas_version
7
8
  from shapely import Geometry
9
+ from shapely import STRtree
8
10
 
9
11
  from .conversion import to_gdf
10
12
 
@@ -15,6 +17,7 @@ def sfilter(
15
17
  gdf: GeoDataFrame | GeoSeries,
16
18
  other: GeoDataFrame | GeoSeries | Geometry,
17
19
  predicate: str = "intersects",
20
+ distance: int | float | None = None,
18
21
  ) -> GeoDataFrame:
19
22
  """Filter a GeoDataFrame or GeoSeries by spatial predicate.
20
23
 
@@ -29,13 +32,14 @@ def sfilter(
29
32
  gdf: The GeoDataFrame.
30
33
  other: The geometry object to filter 'gdf' by.
31
34
  predicate: Spatial predicate to use. Defaults to 'intersects'.
35
+ distance: Max distance to allow if predicate=="dwithin".
32
36
 
33
37
  Returns:
34
38
  A copy of 'gdf' with only the rows matching the
35
39
  spatial predicate with 'other'.
36
40
 
37
41
  Examples:
38
- --------
42
+ ---------
39
43
  >>> import sgis as sg
40
44
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
41
45
  >>> df1
@@ -66,7 +70,7 @@ def sfilter(
66
70
  Also equivalent to using the intersects method, which
67
71
  is often a lot slower since df2 must be dissolved:
68
72
 
69
- >>> df1.loc[df1.intersects(df2.unary_union)]
73
+ >>> df1.loc[df1.intersects(df2.union_all())]
70
74
  geometry
71
75
  0 POINT (0.00000 0.00000)
72
76
 
@@ -76,7 +80,7 @@ def sfilter(
76
80
 
77
81
  other = _sfilter_checks(other, crs=gdf.crs)
78
82
 
79
- indices = _get_sfilter_indices(gdf, other, predicate)
83
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
80
84
 
81
85
  return gdf.iloc[indices]
82
86
 
@@ -85,6 +89,7 @@ def sfilter_split(
85
89
  gdf: GeoDataFrame | GeoSeries,
86
90
  other: GeoDataFrame | GeoSeries | Geometry,
87
91
  predicate: str = "intersects",
92
+ distance: int | float | None = None,
88
93
  ) -> tuple[GeoDataFrame, GeoDataFrame]:
89
94
  """Split a GeoDataFrame or GeoSeries by spatial predicate.
90
95
 
@@ -95,13 +100,14 @@ def sfilter_split(
95
100
  gdf: The GeoDataFrame.
96
101
  other: The geometry object to filter 'gdf' by.
97
102
  predicate: Spatial predicate to use. Defaults to 'intersects'.
103
+ distance: Max distance to allow if predicate=="dwithin".
98
104
 
99
105
  Returns:
100
106
  A tuple of GeoDataFrames, one with the rows that match the spatial predicate
101
107
  and one with the rows that do not.
102
108
 
103
109
  Examples:
104
- --------
110
+ ---------
105
111
  >>> import sgis as sg
106
112
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
107
113
  >>> df1
@@ -135,7 +141,7 @@ def sfilter_split(
135
141
  Also equivalent to using the intersects method, which
136
142
  is often slower since df2 must be dissolved:
137
143
 
138
- >>> filt = df1.intersects(df2.unary_union)
144
+ >>> filt = df1.intersects(df2.union_all())
139
145
  >>> intersecting = df1.loc[filt]
140
146
  >>> not_intersecting = df1.loc[~filt]
141
147
 
@@ -145,7 +151,7 @@ def sfilter_split(
145
151
 
146
152
  other = _sfilter_checks(other, crs=gdf.crs)
147
153
 
148
- indices = _get_sfilter_indices(gdf, other, predicate)
154
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
149
155
 
150
156
  return (
151
157
  gdf.iloc[indices],
@@ -157,6 +163,7 @@ def sfilter_inverse(
157
163
  gdf: GeoDataFrame | GeoSeries,
158
164
  other: GeoDataFrame | GeoSeries | Geometry,
159
165
  predicate: str = "intersects",
166
+ distance: int | float | None = None,
160
167
  ) -> GeoDataFrame | GeoSeries:
161
168
  """Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
162
169
 
@@ -166,13 +173,14 @@ def sfilter_inverse(
166
173
  gdf: The GeoDataFrame or GeoSeries.
167
174
  other: The geometry object to filter 'gdf' by.
168
175
  predicate: Spatial predicate to use. Defaults to 'intersects'.
176
+ distance: Max distance to allow if predicate=="dwithin".
169
177
 
170
178
  Returns:
171
179
  A copy of 'gdf' with only the rows that do not match the
172
180
  spatial predicate with 'other'.
173
181
 
174
182
  Examples:
175
- --------
183
+ ---------
176
184
  >>> import sgis as sg
177
185
  >>> df1 = sg.to_gdf([(0, 0), (0, 1)])
178
186
  >>> df1
@@ -202,7 +210,7 @@ def sfilter_inverse(
202
210
  Also equivelent to using the intersects method, which
203
211
  is often slower since df2 must be dissolved:
204
212
 
205
- >>> not_intersecting = df1.loc[~df1.intersects(df2.unary_union)]
213
+ >>> not_intersecting = df1.loc[~df1.intersects(df2.union_all())]
206
214
 
207
215
  """
208
216
  if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
@@ -210,7 +218,7 @@ def sfilter_inverse(
210
218
 
211
219
  other = _sfilter_checks(other, crs=gdf.crs)
212
220
 
213
- indices = _get_sfilter_indices(gdf, other, predicate)
221
+ indices = _get_sfilter_indices(gdf, other, predicate, distance)
214
222
 
215
223
  return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
216
224
 
@@ -243,6 +251,7 @@ def _get_sfilter_indices(
243
251
  left: GeoDataFrame | GeoSeries,
244
252
  right: GeoDataFrame | GeoSeries | Geometry,
245
253
  predicate: str,
254
+ distance: int | float | None,
246
255
  ) -> np.ndarray:
247
256
  """Compute geometric comparisons and get matching indices.
248
257
 
@@ -276,17 +285,25 @@ def _get_sfilter_indices(
276
285
  # contains is a faster predicate
277
286
  # see discussion at https://github.com/geopandas/geopandas/pull/1421
278
287
  predicate = "contains"
279
- sindex = left.sindex
288
+ sindex, kwargs = _get_spatial_tree(left)
280
289
  input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
281
290
  else:
282
291
  # all other predicates are symmetric
283
292
  # keep them the same
284
- sindex = right.sindex
293
+ sindex, kwargs = _get_spatial_tree(right)
285
294
  input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
286
295
 
287
- l_idx, r_idx = sindex.query(input_geoms, predicate=predicate, sort=False)
296
+ l_idx, r_idx = sindex.query(
297
+ input_geoms, predicate=predicate, distance=distance, **kwargs
298
+ )
288
299
 
289
300
  if original_predicate == "within":
290
301
  return np.sort(np.unique(r_idx))
291
302
 
292
303
  return np.sort(np.unique(l_idx))
304
+
305
+
306
+ def _get_spatial_tree(df):
307
+ if int(geopandas_version[0]) >= 1:
308
+ return df.sindex, {"sort": False}
309
+ return STRtree(df.geometry.values), {}
sgis/helpers.py CHANGED
@@ -141,7 +141,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
141
141
  A list of file paths.
142
142
  """
143
143
  if not recursive:
144
- return [path for path in glob.glob(str(Path(root)) + "/*")]
144
+ return [path for path in glob.glob(str(Path(root)) + "/**")]
145
145
  paths = []
146
146
  for root_dir, _, files in os.walk(root):
147
147
  for file in files:
@@ -205,7 +205,7 @@ def unit_is_degrees(gdf: GeoDataFrame) -> bool:
205
205
 
206
206
  def get_object_name(
207
207
  var: object, start: int = 2, stop: int = 7, ignore_self: bool = True
208
- ) -> str | None:
208
+ ) -> str:
209
209
  frame = inspect.currentframe() # frame can be FrameType or None
210
210
  if frame:
211
211
  try:
@@ -230,7 +230,7 @@ def get_object_name(
230
230
  finally:
231
231
  if frame:
232
232
  del frame # Explicitly delete frame reference to assist with garbage collection
233
- return None
233
+ raise ValueError(f"Couldn't find name for {var}")
234
234
 
235
235
 
236
236
  def make_namedict(gdfs: tuple[GeoDataFrame]) -> dict[int, str]:
@@ -1,24 +1,37 @@
1
1
  """Functions for reading and writing GeoDataFrames in Statistics Norway's GCS Dapla."""
2
2
 
3
+ import json
4
+ import multiprocessing
5
+ import os
6
+ from collections.abc import Iterable
3
7
  from pathlib import Path
4
8
 
5
9
  import dapla as dp
6
10
  import geopandas as gpd
7
11
  import joblib
8
12
  import pandas as pd
13
+ import pyarrow
14
+ import shapely
9
15
  from geopandas import GeoDataFrame
16
+ from geopandas import GeoSeries
10
17
  from geopandas.io.arrow import _geopandas_to_arrow
11
18
  from pandas import DataFrame
12
- from pyarrow import parquet
19
+ from pyarrow import ArrowInvalid
20
+
21
+ from ..geopandas_tools.sfilter import sfilter
22
+
23
+ PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
13
24
 
14
25
 
15
26
  def read_geopandas(
16
- gcs_path: str | Path | list[str | Path],
27
+ gcs_path: str | Path | list[str | Path] | tuple[str | Path] | GeoSeries,
17
28
  pandas_fallback: bool = False,
18
29
  file_system: dp.gcs.GCSFileSystem | None = None,
30
+ mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
31
+ threads: int | None = None,
19
32
  **kwargs,
20
33
  ) -> GeoDataFrame | DataFrame:
21
- """Reads geoparquet or other geodata from a file on GCS.
34
+ """Reads geoparquet or other geodata from one or more files on GCS.
22
35
 
23
36
  If the file has 0 rows, the contents will be returned as a pandas.DataFrame,
24
37
  since geopandas does not read and write empty tables.
@@ -33,6 +46,11 @@ def read_geopandas(
33
46
  not be read with geopandas and the number of rows is more than 0. If True,
34
47
  the file will be read with pandas if geopandas fails.
35
48
  file_system: Optional file system.
49
+ mask: Optional geometry mask to keep only intersecting geometries.
50
+ If 'gcs_path' is an iterable of multiple paths, only the files
51
+ with a bbox that intersects the mask are read, then filtered by location.
52
+ threads: Number of threads to use if reading multiple files. Defaults to
53
+ the number of files to read or the number of available threads (if lower).
36
54
  **kwargs: Additional keyword arguments passed to geopandas' read_parquet
37
55
  or read_file, depending on the file type.
38
56
 
@@ -42,14 +60,52 @@ def read_geopandas(
42
60
  if file_system is None:
43
61
  file_system = dp.FileClient.get_gcs_file_system()
44
62
 
45
- if isinstance(gcs_path, (list, tuple)):
63
+ if not isinstance(gcs_path, (str | Path | os.PathLike)):
46
64
  kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
65
+
66
+ if mask is not None:
67
+ if not isinstance(gcs_path, GeoSeries):
68
+ bounds_series: GeoSeries = get_bounds_series(
69
+ gcs_path,
70
+ file_system,
71
+ threads=threads,
72
+ pandas_fallback=pandas_fallback,
73
+ )
74
+ else:
75
+ bounds_series = gcs_path
76
+ new_bounds_series = sfilter(bounds_series, mask)
77
+ if not len(new_bounds_series):
78
+ if isinstance(kwargs.get("columns"), Iterable):
79
+ cols = {col: [] for col in kwargs["columns"]}
80
+ else:
81
+ cols = {}
82
+ for path in bounds_series.index:
83
+ try:
84
+ cols |= {col: [] for col in _get_columns(path, file_system)}
85
+ except ArrowInvalid as e:
86
+ if file_system.isfile(path):
87
+ raise ArrowInvalid(e, path) from e
88
+
89
+ return GeoDataFrame(cols | {"geometry": []})
90
+ paths = list(new_bounds_series.index)
91
+ else:
92
+ if isinstance(gcs_path, GeoSeries):
93
+ paths = list(gcs_path.index)
94
+ else:
95
+ paths = list(gcs_path)
96
+
97
+ if threads is None:
98
+ threads = min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
99
+
47
100
  # recursive read with threads
48
- with joblib.Parallel(n_jobs=len(gcs_path), backend="threading") as parallel:
101
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
49
102
  dfs: list[GeoDataFrame] = parallel(
50
- joblib.delayed(read_geopandas)(x, **kwargs) for x in gcs_path
103
+ joblib.delayed(read_geopandas)(x, **kwargs) for x in paths
51
104
  )
52
- return pd.concat(dfs)
105
+ df = pd.concat(dfs)
106
+ if mask is not None:
107
+ return sfilter(df, mask)
108
+ return df
53
109
 
54
110
  if not isinstance(gcs_path, str):
55
111
  try:
@@ -60,20 +116,26 @@ def read_geopandas(
60
116
  if "parquet" in gcs_path or "prqt" in gcs_path:
61
117
  with file_system.open(gcs_path, mode="rb") as file:
62
118
  try:
63
- return gpd.read_parquet(file, **kwargs)
119
+ df = gpd.read_parquet(file, **kwargs)
64
120
  except ValueError as e:
65
121
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
66
- raise e
122
+ raise e.__class__(
123
+ f"{e.__class__.__name__}: {e} for {gcs_path}."
124
+ ) from e
67
125
  df = dp.read_pandas(gcs_path, **kwargs)
68
126
 
69
127
  if pandas_fallback or not len(df):
70
128
  return df
71
129
  else:
72
- raise e
130
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
131
+ raise e.__class__(
132
+ f"{e.__class__.__name__}: {e} for {df}." + more_txt
133
+ ) from e
134
+
73
135
  else:
74
136
  with file_system.open(gcs_path, mode="rb") as file:
75
137
  try:
76
- return gpd.read_file(file, **kwargs)
138
+ df = gpd.read_file(file, **kwargs)
77
139
  except ValueError as e:
78
140
  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
79
141
  raise e
@@ -82,7 +144,144 @@ def read_geopandas(
82
144
  if pandas_fallback or not len(df):
83
145
  return df
84
146
  else:
85
- raise e
147
+ more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
148
+ raise e.__class__(
149
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
150
+ ) from e
151
+
152
+ if mask is not None:
153
+ return sfilter(df, mask)
154
+ return df
155
+
156
+
157
+ def _get_bounds_parquet(
158
+ path: str | Path, file_system: dp.gcs.GCSFileSystem, pandas_fallback: bool = False
159
+ ) -> tuple[list[float], dict] | tuple[None, None]:
160
+ with file_system.open(path) as f:
161
+ try:
162
+ num_rows = pyarrow.parquet.read_metadata(f).num_rows
163
+ except ArrowInvalid as e:
164
+ if not file_system.isfile(f):
165
+ return None, None
166
+ raise ArrowInvalid(e, path) from e
167
+ if not num_rows:
168
+ return None, None
169
+ meta = pyarrow.parquet.read_schema(f).metadata
170
+ try:
171
+ meta = json.loads(meta[b"geo"])["columns"]["geometry"]
172
+ except KeyError as e:
173
+ if pandas_fallback:
174
+ return None, None
175
+ raise KeyError(
176
+ f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
177
+ # f"{num_rows=}",
178
+ # meta,
179
+ ) from e
180
+ return meta["bbox"], meta["crs"]
181
+
182
+
183
+ def _get_columns(path: str | Path, file_system: dp.gcs.GCSFileSystem) -> pd.Index:
184
+ with file_system.open(path) as f:
185
+ schema = pyarrow.parquet.read_schema(f)
186
+ index_cols = _get_index_cols(schema)
187
+ return pd.Index(schema.names).difference(index_cols)
188
+
189
+
190
+ def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
191
+ cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
192
+ return [x for x in cols if not isinstance(x, dict)]
193
+
194
+
195
+ def get_bounds_series(
196
+ paths: list[str | Path] | tuple[str | Path],
197
+ file_system: dp.gcs.GCSFileSystem | None = None,
198
+ threads: int | None = None,
199
+ pandas_fallback: bool = False,
200
+ ) -> GeoSeries:
201
+ """Get a GeoSeries with file paths as indexes and the file's bounds as values.
202
+
203
+ The returned GeoSeries can be used as the first argument of 'read_geopandas'
204
+ along with the 'mask' keyword.
205
+
206
+ Args:
207
+ paths: Iterable of file paths in gcs.
208
+ file_system: Optional instance of dp.gcs.GCSFileSystem.
209
+ If None, an instance is created within the function.
210
+ Note that this is slower in long loops.
211
+ threads: Number of threads to use if reading multiple files. Defaults to
212
+ the number of files to read or the number of available threads (if lower).
213
+ pandas_fallback: If False (default), an exception is raised if the file has
214
+ no geo metadata. If True, the geometry value is set to None for this file.
215
+
216
+ Returns:
217
+ A geopandas.GeoSeries with file paths as indexes and bounds as values.
218
+
219
+ Examples:
220
+ ---------
221
+ >>> import sgis as sg
222
+ >>> import dapla as dp
223
+ >>> file_system = dp.FileClient.get_gcs_file_system()
224
+ >>> all_paths = file_system.ls("...")
225
+
226
+ Get the bounds of all your file paths, indexed by path.
227
+
228
+ >>> bounds_series = sg.get_bounds_series(all_paths, file_system)
229
+ >>> bounds_series
230
+ .../0301.parquet POLYGON ((273514.334 6638380.233, 273514.334 6...
231
+ .../1101.parquet POLYGON ((6464.463 6503547.192, 6464.463 65299...
232
+ .../1103.parquet POLYGON ((-6282.301 6564097.347, -6282.301 660...
233
+ .../1106.parquet POLYGON ((-46359.891 6622984.385, -46359.891 6...
234
+ .../1108.parquet POLYGON ((30490.798 6551661.467, 30490.798 658...
235
+ ...
236
+ .../5628.parquet POLYGON ((1019391.867 7809550.777, 1019391.867...
237
+ .../5630.parquet POLYGON ((1017907.145 7893398.317, 1017907.145...
238
+ .../5632.parquet POLYGON ((1075687.587 7887714.263, 1075687.587...
239
+ .../5634.parquet POLYGON ((1103447.451 7874551.663, 1103447.451...
240
+ .../5636.parquet POLYGON ((1024129.618 7838961.91, 1024129.618 ...
241
+ Length: 357, dtype: geometry
242
+
243
+ Make a grid around the total bounds of the files,
244
+ and read geometries intersecting with the mask in a loop.
245
+
246
+ >>> grid = sg.make_grid(bounds_series, 10_000)
247
+ >>> for mask in grid.geometry:
248
+ ... df = sg.read_geopandas(
249
+ ... bounds_series,
250
+ ... mask=mask,
251
+ ... file_system=file_system,
252
+ ... )
253
+
254
+ """
255
+ if file_system is None:
256
+ file_system = dp.FileClient.get_gcs_file_system()
257
+
258
+ if threads is None:
259
+ threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
260
+
261
+ with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
262
+ bounds: list[tuple[list[float], dict]] = parallel(
263
+ joblib.delayed(_get_bounds_parquet)(
264
+ path, file_system=file_system, pandas_fallback=pandas_fallback
265
+ )
266
+ for path in paths
267
+ )
268
+ crss = {json.dumps(x[1]) for x in bounds}
269
+ crss = {
270
+ crs
271
+ for crs in crss
272
+ if not any(str(crs).lower() == txt for txt in ["none", "null"])
273
+ }
274
+ if not crss:
275
+ crs = None
276
+ elif len(crss) == 1:
277
+ crs = next(iter(crss))
278
+ else:
279
+ raise ValueError(f"crs mismatch: {crss}")
280
+ return GeoSeries(
281
+ [shapely.box(*bbox[0]) if bbox[0] is not None else None for bbox in bounds],
282
+ index=paths,
283
+ crs=crs,
284
+ )
86
285
 
87
286
 
88
287
  def write_geopandas(
@@ -91,6 +290,7 @@ def write_geopandas(
91
290
  overwrite: bool = True,
92
291
  pandas_fallback: bool = False,
93
292
  file_system: dp.gcs.GCSFileSystem | None = None,
293
+ write_covering_bbox: bool = False,
94
294
  **kwargs,
95
295
  ) -> None:
96
296
  """Writes a GeoDataFrame to the specified format.
@@ -106,6 +306,13 @@ def write_geopandas(
106
306
  not be written with geopandas and the number of rows is more than 0. If True,
107
307
  the file will be written without geo-metadata if >0 rows.
108
308
  file_system: Optional file system.
309
+ write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
310
+ Writing a bbox column can be computationally expensive, but allows you to specify
311
+ a bbox in :func:`read_parquet` for filtered reading.
312
+ Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
313
+ considered as experimental. While writing the column is backwards compatible, using it
314
+ for filtering may not be supported by all readers.
315
+
109
316
  **kwargs: Additional keyword arguments passed to parquet.write_table
110
317
  (for parquet) or geopandas' to_file method (if not parquet).
111
318
  """
@@ -118,25 +325,37 @@ def write_geopandas(
118
325
  if not overwrite and exists(gcs_path):
119
326
  raise ValueError("File already exists.")
120
327
 
121
- if file_system is None:
122
- file_system = dp.FileClient.get_gcs_file_system()
123
-
124
328
  if not isinstance(df, GeoDataFrame):
125
329
  raise ValueError("DataFrame must be GeoDataFrame.")
126
330
 
331
+ if file_system is None:
332
+ file_system = dp.FileClient.get_gcs_file_system()
333
+
127
334
  if not len(df):
128
335
  if pandas_fallback:
129
- df.geometry = df.geometry.astype(str)
130
336
  df = pd.DataFrame(df)
131
- dp.write_pandas(df, gcs_path, **kwargs)
337
+ df.geometry = df.geometry.astype(str)
338
+ df.geometry = None
339
+ try:
340
+ dp.write_pandas(df, gcs_path, **kwargs)
341
+ except Exception as e:
342
+ more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
343
+ raise e.__class__(
344
+ f"{e.__class__.__name__}: {e} for {df}. " + more_txt
345
+ ) from e
132
346
  return
133
347
 
134
348
  file_system = dp.FileClient.get_gcs_file_system()
135
349
 
136
350
  if ".parquet" in gcs_path or "prqt" in gcs_path:
137
351
  with file_system.open(gcs_path, mode="wb") as buffer:
138
- table = _geopandas_to_arrow(df, index=df.index, schema_version=None)
139
- parquet.write_table(table, buffer, compression="snappy", **kwargs)
352
+ table = _geopandas_to_arrow(
353
+ df,
354
+ index=df.index,
355
+ schema_version=None,
356
+ write_covering_bbox=write_covering_bbox,
357
+ )
358
+ pyarrow.parquet.write_table(table, buffer, compression="snappy", **kwargs)
140
359
  return
141
360
 
142
361
  layer = kwargs.pop("layer", None)
sgis/io/read_parquet.py CHANGED
@@ -15,7 +15,7 @@ def read_parquet_url(url: str) -> GeoDataFrame:
15
15
  A GeoDataFrame.
16
16
 
17
17
  Examples:
18
- --------
18
+ ---------
19
19
  >>> from sgis import read_parquet_url
20
20
  >>> url = "https://media.githubusercontent.com/media/statisticsnorway/ssb-sgis/main/tests/testdata/points_oslo.parquet"
21
21
  >>> points = read_parquet_url(url)