ssb-sgis 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +20 -9
- sgis/debug_config.py +24 -0
- sgis/exceptions.py +2 -2
- sgis/geopandas_tools/bounds.py +33 -36
- sgis/geopandas_tools/buffer_dissolve_explode.py +136 -35
- sgis/geopandas_tools/centerlines.py +4 -91
- sgis/geopandas_tools/cleaning.py +1576 -583
- sgis/geopandas_tools/conversion.py +38 -19
- sgis/geopandas_tools/duplicates.py +29 -8
- sgis/geopandas_tools/general.py +263 -100
- sgis/geopandas_tools/geometry_types.py +4 -4
- sgis/geopandas_tools/neighbors.py +19 -15
- sgis/geopandas_tools/overlay.py +2 -2
- sgis/geopandas_tools/point_operations.py +5 -5
- sgis/geopandas_tools/polygon_operations.py +510 -105
- sgis/geopandas_tools/polygons_as_rings.py +40 -8
- sgis/geopandas_tools/sfilter.py +29 -12
- sgis/helpers.py +3 -3
- sgis/io/dapla_functions.py +238 -19
- sgis/io/read_parquet.py +1 -1
- sgis/maps/examine.py +27 -12
- sgis/maps/explore.py +450 -65
- sgis/maps/legend.py +177 -76
- sgis/maps/map.py +206 -103
- sgis/maps/maps.py +178 -105
- sgis/maps/thematicmap.py +243 -83
- sgis/networkanalysis/_service_area.py +6 -1
- sgis/networkanalysis/closing_network_holes.py +2 -2
- sgis/networkanalysis/cutting_lines.py +15 -8
- sgis/networkanalysis/directednetwork.py +1 -1
- sgis/networkanalysis/finding_isolated_networks.py +15 -8
- sgis/networkanalysis/networkanalysis.py +17 -19
- sgis/networkanalysis/networkanalysisrules.py +1 -1
- sgis/networkanalysis/traveling_salesman.py +1 -1
- sgis/parallel/parallel.py +64 -27
- sgis/raster/__init__.py +0 -6
- sgis/raster/base.py +208 -0
- sgis/raster/cube.py +54 -8
- sgis/raster/image_collection.py +3257 -0
- sgis/raster/indices.py +17 -5
- sgis/raster/raster.py +138 -243
- sgis/raster/sentinel_config.py +120 -0
- sgis/raster/zonal.py +0 -1
- {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/METADATA +6 -7
- ssb_sgis-1.0.4.dist-info/RECORD +62 -0
- sgis/raster/methods_as_functions.py +0 -0
- sgis/raster/torchgeo.py +0 -171
- ssb_sgis-1.0.2.dist-info/RECORD +0 -61
- {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.0.2.dist-info → ssb_sgis-1.0.4.dist-info}/WHEEL +0 -0
|
@@ -8,6 +8,7 @@ from geopandas import GeoSeries
|
|
|
8
8
|
from geopandas.array import GeometryArray
|
|
9
9
|
from numpy.typing import NDArray
|
|
10
10
|
from pyproj import CRS
|
|
11
|
+
from shapely import difference
|
|
11
12
|
from shapely import get_coordinates
|
|
12
13
|
from shapely import get_exterior_ring
|
|
13
14
|
from shapely import get_interior_ring
|
|
@@ -320,14 +321,14 @@ class PolygonsAsRings:
|
|
|
320
321
|
|
|
321
322
|
exterior = self.rings.loc[self.is_exterior].sort_index()
|
|
322
323
|
assert exterior.shape == (len(self.gdf),)
|
|
324
|
+
nonempty_exteriors = exterior.loc[lambda x: x.notna()]
|
|
325
|
+
empty_exteriors = exterior.loc[lambda x: x.isna()]
|
|
323
326
|
|
|
324
327
|
nonempty_interiors = self.rings.loc[self.is_interior]
|
|
325
328
|
|
|
326
329
|
if not len(nonempty_interiors):
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
except Exception:
|
|
330
|
-
return _geoms_to_linearrings_fallback(exterior).values
|
|
330
|
+
nonempty_exteriors.loc[:] = make_valid(polygons(nonempty_exteriors.values))
|
|
331
|
+
return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
|
|
331
332
|
|
|
332
333
|
empty_interiors = pd.Series(
|
|
333
334
|
[None for _ in range(len(self.gdf) * self.max_rings)],
|
|
@@ -343,10 +344,41 @@ class PolygonsAsRings:
|
|
|
343
344
|
)
|
|
344
345
|
assert interiors.shape == (len(self.gdf), self.max_rings), interiors.shape
|
|
345
346
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
347
|
+
interiors = interiors.loc[
|
|
348
|
+
interiors.index.get_level_values(1).isin(
|
|
349
|
+
nonempty_exteriors.index.get_level_values(1)
|
|
350
|
+
)
|
|
351
|
+
]
|
|
352
|
+
assert interiors.index.get_level_values(1).equals(
|
|
353
|
+
nonempty_exteriors.index.get_level_values(1)
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
# nan gives TypeError in shapely.polygons. None does not.
|
|
357
|
+
for i, _ in enumerate(interiors.columns):
|
|
358
|
+
interiors.loc[interiors.iloc[:, i].isna(), i] = None
|
|
359
|
+
nonempty_exteriors.loc[nonempty_exteriors.isna()] = None
|
|
360
|
+
|
|
361
|
+
# construct polygons with holes
|
|
362
|
+
polys = make_valid(
|
|
363
|
+
polygons(
|
|
364
|
+
nonempty_exteriors.values,
|
|
365
|
+
interiors.values,
|
|
366
|
+
)
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
# interiors might have moved (e.g. snapped) so that they are not within the exterior
|
|
370
|
+
# these interiors will not be holes, so we need to erase them manually
|
|
371
|
+
interiors_as_polys = make_valid(polygons(interiors.values))
|
|
372
|
+
# merge interior polygons into 1d array
|
|
373
|
+
interiors_as_polys = np.array(
|
|
374
|
+
[
|
|
375
|
+
make_valid(unary_union(interiors_as_polys[i, :]))
|
|
376
|
+
for i in range(interiors_as_polys.shape[0])
|
|
377
|
+
]
|
|
378
|
+
)
|
|
379
|
+
# erase rowwise
|
|
380
|
+
nonempty_exteriors.loc[:] = make_valid(difference(polys, interiors_as_polys))
|
|
381
|
+
return pd.concat([empty_exteriors, nonempty_exteriors]).sort_index().values
|
|
350
382
|
|
|
351
383
|
|
|
352
384
|
def get_linearring_series(geoms: GeoDataFrame | GeoSeries) -> pd.Series:
|
sgis/geopandas_tools/sfilter.py
CHANGED
|
@@ -4,7 +4,9 @@ import numpy as np
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from geopandas import GeoDataFrame
|
|
6
6
|
from geopandas import GeoSeries
|
|
7
|
+
from geopandas import __version__ as geopandas_version
|
|
7
8
|
from shapely import Geometry
|
|
9
|
+
from shapely import STRtree
|
|
8
10
|
|
|
9
11
|
from .conversion import to_gdf
|
|
10
12
|
|
|
@@ -15,6 +17,7 @@ def sfilter(
|
|
|
15
17
|
gdf: GeoDataFrame | GeoSeries,
|
|
16
18
|
other: GeoDataFrame | GeoSeries | Geometry,
|
|
17
19
|
predicate: str = "intersects",
|
|
20
|
+
distance: int | float | None = None,
|
|
18
21
|
) -> GeoDataFrame:
|
|
19
22
|
"""Filter a GeoDataFrame or GeoSeries by spatial predicate.
|
|
20
23
|
|
|
@@ -29,13 +32,14 @@ def sfilter(
|
|
|
29
32
|
gdf: The GeoDataFrame.
|
|
30
33
|
other: The geometry object to filter 'gdf' by.
|
|
31
34
|
predicate: Spatial predicate to use. Defaults to 'intersects'.
|
|
35
|
+
distance: Max distance to allow if predicate=="dwithin".
|
|
32
36
|
|
|
33
37
|
Returns:
|
|
34
38
|
A copy of 'gdf' with only the rows matching the
|
|
35
39
|
spatial predicate with 'other'.
|
|
36
40
|
|
|
37
41
|
Examples:
|
|
38
|
-
|
|
42
|
+
---------
|
|
39
43
|
>>> import sgis as sg
|
|
40
44
|
>>> df1 = sg.to_gdf([(0, 0), (0, 1)])
|
|
41
45
|
>>> df1
|
|
@@ -66,7 +70,7 @@ def sfilter(
|
|
|
66
70
|
Also equivalent to using the intersects method, which
|
|
67
71
|
is often a lot slower since df2 must be dissolved:
|
|
68
72
|
|
|
69
|
-
>>> df1.loc[df1.intersects(df2.
|
|
73
|
+
>>> df1.loc[df1.intersects(df2.union_all())]
|
|
70
74
|
geometry
|
|
71
75
|
0 POINT (0.00000 0.00000)
|
|
72
76
|
|
|
@@ -76,7 +80,7 @@ def sfilter(
|
|
|
76
80
|
|
|
77
81
|
other = _sfilter_checks(other, crs=gdf.crs)
|
|
78
82
|
|
|
79
|
-
indices = _get_sfilter_indices(gdf, other, predicate)
|
|
83
|
+
indices = _get_sfilter_indices(gdf, other, predicate, distance)
|
|
80
84
|
|
|
81
85
|
return gdf.iloc[indices]
|
|
82
86
|
|
|
@@ -85,6 +89,7 @@ def sfilter_split(
|
|
|
85
89
|
gdf: GeoDataFrame | GeoSeries,
|
|
86
90
|
other: GeoDataFrame | GeoSeries | Geometry,
|
|
87
91
|
predicate: str = "intersects",
|
|
92
|
+
distance: int | float | None = None,
|
|
88
93
|
) -> tuple[GeoDataFrame, GeoDataFrame]:
|
|
89
94
|
"""Split a GeoDataFrame or GeoSeries by spatial predicate.
|
|
90
95
|
|
|
@@ -95,13 +100,14 @@ def sfilter_split(
|
|
|
95
100
|
gdf: The GeoDataFrame.
|
|
96
101
|
other: The geometry object to filter 'gdf' by.
|
|
97
102
|
predicate: Spatial predicate to use. Defaults to 'intersects'.
|
|
103
|
+
distance: Max distance to allow if predicate=="dwithin".
|
|
98
104
|
|
|
99
105
|
Returns:
|
|
100
106
|
A tuple of GeoDataFrames, one with the rows that match the spatial predicate
|
|
101
107
|
and one with the rows that do not.
|
|
102
108
|
|
|
103
109
|
Examples:
|
|
104
|
-
|
|
110
|
+
---------
|
|
105
111
|
>>> import sgis as sg
|
|
106
112
|
>>> df1 = sg.to_gdf([(0, 0), (0, 1)])
|
|
107
113
|
>>> df1
|
|
@@ -135,7 +141,7 @@ def sfilter_split(
|
|
|
135
141
|
Also equivalent to using the intersects method, which
|
|
136
142
|
is often slower since df2 must be dissolved:
|
|
137
143
|
|
|
138
|
-
>>> filt = df1.intersects(df2.
|
|
144
|
+
>>> filt = df1.intersects(df2.union_all())
|
|
139
145
|
>>> intersecting = df1.loc[filt]
|
|
140
146
|
>>> not_intersecting = df1.loc[~filt]
|
|
141
147
|
|
|
@@ -145,7 +151,7 @@ def sfilter_split(
|
|
|
145
151
|
|
|
146
152
|
other = _sfilter_checks(other, crs=gdf.crs)
|
|
147
153
|
|
|
148
|
-
indices = _get_sfilter_indices(gdf, other, predicate)
|
|
154
|
+
indices = _get_sfilter_indices(gdf, other, predicate, distance)
|
|
149
155
|
|
|
150
156
|
return (
|
|
151
157
|
gdf.iloc[indices],
|
|
@@ -157,6 +163,7 @@ def sfilter_inverse(
|
|
|
157
163
|
gdf: GeoDataFrame | GeoSeries,
|
|
158
164
|
other: GeoDataFrame | GeoSeries | Geometry,
|
|
159
165
|
predicate: str = "intersects",
|
|
166
|
+
distance: int | float | None = None,
|
|
160
167
|
) -> GeoDataFrame | GeoSeries:
|
|
161
168
|
"""Filter a GeoDataFrame or GeoSeries by inverse spatial predicate.
|
|
162
169
|
|
|
@@ -166,13 +173,14 @@ def sfilter_inverse(
|
|
|
166
173
|
gdf: The GeoDataFrame or GeoSeries.
|
|
167
174
|
other: The geometry object to filter 'gdf' by.
|
|
168
175
|
predicate: Spatial predicate to use. Defaults to 'intersects'.
|
|
176
|
+
distance: Max distance to allow if predicate=="dwithin".
|
|
169
177
|
|
|
170
178
|
Returns:
|
|
171
179
|
A copy of 'gdf' with only the rows that do not match the
|
|
172
180
|
spatial predicate with 'other'.
|
|
173
181
|
|
|
174
182
|
Examples:
|
|
175
|
-
|
|
183
|
+
---------
|
|
176
184
|
>>> import sgis as sg
|
|
177
185
|
>>> df1 = sg.to_gdf([(0, 0), (0, 1)])
|
|
178
186
|
>>> df1
|
|
@@ -202,7 +210,7 @@ def sfilter_inverse(
|
|
|
202
210
|
Also equivalent to using the intersects method, which
|
|
203
211
|
is often slower since df2 must be dissolved:
|
|
204
212
|
|
|
205
|
-
>>> not_intersecting = df1.loc[~df1.intersects(df2.
|
|
213
|
+
>>> not_intersecting = df1.loc[~df1.intersects(df2.union_all())]
|
|
206
214
|
|
|
207
215
|
"""
|
|
208
216
|
if not isinstance(gdf, (GeoDataFrame | GeoSeries)):
|
|
@@ -210,7 +218,7 @@ def sfilter_inverse(
|
|
|
210
218
|
|
|
211
219
|
other = _sfilter_checks(other, crs=gdf.crs)
|
|
212
220
|
|
|
213
|
-
indices = _get_sfilter_indices(gdf, other, predicate)
|
|
221
|
+
indices = _get_sfilter_indices(gdf, other, predicate, distance)
|
|
214
222
|
|
|
215
223
|
return gdf.iloc[pd.Index(range(len(gdf))).difference(pd.Index(indices))]
|
|
216
224
|
|
|
@@ -243,6 +251,7 @@ def _get_sfilter_indices(
|
|
|
243
251
|
left: GeoDataFrame | GeoSeries,
|
|
244
252
|
right: GeoDataFrame | GeoSeries | Geometry,
|
|
245
253
|
predicate: str,
|
|
254
|
+
distance: int | float | None,
|
|
246
255
|
) -> np.ndarray:
|
|
247
256
|
"""Compute geometric comparisons and get matching indices.
|
|
248
257
|
|
|
@@ -276,17 +285,25 @@ def _get_sfilter_indices(
|
|
|
276
285
|
# contains is a faster predicate
|
|
277
286
|
# see discussion at https://github.com/geopandas/geopandas/pull/1421
|
|
278
287
|
predicate = "contains"
|
|
279
|
-
sindex = left
|
|
288
|
+
sindex, kwargs = _get_spatial_tree(left)
|
|
280
289
|
input_geoms = right.geometry if isinstance(right, GeoDataFrame) else right
|
|
281
290
|
else:
|
|
282
291
|
# all other predicates are symmetric
|
|
283
292
|
# keep them the same
|
|
284
|
-
sindex = right
|
|
293
|
+
sindex, kwargs = _get_spatial_tree(right)
|
|
285
294
|
input_geoms = left.geometry if isinstance(left, GeoDataFrame) else left
|
|
286
295
|
|
|
287
|
-
l_idx, r_idx = sindex.query(
|
|
296
|
+
l_idx, r_idx = sindex.query(
|
|
297
|
+
input_geoms, predicate=predicate, distance=distance, **kwargs
|
|
298
|
+
)
|
|
288
299
|
|
|
289
300
|
if original_predicate == "within":
|
|
290
301
|
return np.sort(np.unique(r_idx))
|
|
291
302
|
|
|
292
303
|
return np.sort(np.unique(l_idx))
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _get_spatial_tree(df):
|
|
307
|
+
if int(geopandas_version[0]) >= 1:
|
|
308
|
+
return df.sindex, {"sort": False}
|
|
309
|
+
return STRtree(df.geometry.values), {}
|
sgis/helpers.py
CHANGED
|
@@ -141,7 +141,7 @@ def get_all_files(root: str, recursive: bool = True) -> list[str]:
|
|
|
141
141
|
A list of file paths.
|
|
142
142
|
"""
|
|
143
143
|
if not recursive:
|
|
144
|
-
return [path for path in glob.glob(str(Path(root)) + "
|
|
144
|
+
return [path for path in glob.glob(str(Path(root)) + "/**")]
|
|
145
145
|
paths = []
|
|
146
146
|
for root_dir, _, files in os.walk(root):
|
|
147
147
|
for file in files:
|
|
@@ -205,7 +205,7 @@ def unit_is_degrees(gdf: GeoDataFrame) -> bool:
|
|
|
205
205
|
|
|
206
206
|
def get_object_name(
|
|
207
207
|
var: object, start: int = 2, stop: int = 7, ignore_self: bool = True
|
|
208
|
-
) -> str
|
|
208
|
+
) -> str:
|
|
209
209
|
frame = inspect.currentframe() # frame can be FrameType or None
|
|
210
210
|
if frame:
|
|
211
211
|
try:
|
|
@@ -230,7 +230,7 @@ def get_object_name(
|
|
|
230
230
|
finally:
|
|
231
231
|
if frame:
|
|
232
232
|
del frame # Explicitly delete frame reference to assist with garbage collection
|
|
233
|
-
|
|
233
|
+
raise ValueError(f"Couldn't find name for {var}")
|
|
234
234
|
|
|
235
235
|
|
|
236
236
|
def make_namedict(gdfs: tuple[GeoDataFrame]) -> dict[int, str]:
|
sgis/io/dapla_functions.py
CHANGED
|
@@ -1,24 +1,37 @@
|
|
|
1
1
|
"""Functions for reading and writing GeoDataFrames in Statistics Norway's GCS Dapla."""
|
|
2
2
|
|
|
3
|
+
import json
|
|
4
|
+
import multiprocessing
|
|
5
|
+
import os
|
|
6
|
+
from collections.abc import Iterable
|
|
3
7
|
from pathlib import Path
|
|
4
8
|
|
|
5
9
|
import dapla as dp
|
|
6
10
|
import geopandas as gpd
|
|
7
11
|
import joblib
|
|
8
12
|
import pandas as pd
|
|
13
|
+
import pyarrow
|
|
14
|
+
import shapely
|
|
9
15
|
from geopandas import GeoDataFrame
|
|
16
|
+
from geopandas import GeoSeries
|
|
10
17
|
from geopandas.io.arrow import _geopandas_to_arrow
|
|
11
18
|
from pandas import DataFrame
|
|
12
|
-
from pyarrow import
|
|
19
|
+
from pyarrow import ArrowInvalid
|
|
20
|
+
|
|
21
|
+
from ..geopandas_tools.sfilter import sfilter
|
|
22
|
+
|
|
23
|
+
PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
def read_geopandas(
|
|
16
|
-
gcs_path: str | Path | list[str | Path],
|
|
27
|
+
gcs_path: str | Path | list[str | Path] | tuple[str | Path] | GeoSeries,
|
|
17
28
|
pandas_fallback: bool = False,
|
|
18
29
|
file_system: dp.gcs.GCSFileSystem | None = None,
|
|
30
|
+
mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
|
|
31
|
+
threads: int | None = None,
|
|
19
32
|
**kwargs,
|
|
20
33
|
) -> GeoDataFrame | DataFrame:
|
|
21
|
-
"""Reads geoparquet or other geodata from
|
|
34
|
+
"""Reads geoparquet or other geodata from one or more files on GCS.
|
|
22
35
|
|
|
23
36
|
If the file has 0 rows, the contents will be returned as a pandas.DataFrame,
|
|
24
37
|
since geopandas does not read and write empty tables.
|
|
@@ -33,6 +46,11 @@ def read_geopandas(
|
|
|
33
46
|
not be read with geopandas and the number of rows is more than 0. If True,
|
|
34
47
|
the file will be read with pandas if geopandas fails.
|
|
35
48
|
file_system: Optional file system.
|
|
49
|
+
mask: Optional geometry mask to keep only intersecting geometries.
|
|
50
|
+
If 'gcs_path' is an iterable of multiple paths, only the files
|
|
51
|
+
with a bbox that intersects the mask are read, then filtered by location.
|
|
52
|
+
threads: Number of threads to use if reading multiple files. Defaults to
|
|
53
|
+
the number of files to read or the number of available threads (if lower).
|
|
36
54
|
**kwargs: Additional keyword arguments passed to geopandas' read_parquet
|
|
37
55
|
or read_file, depending on the file type.
|
|
38
56
|
|
|
@@ -42,14 +60,52 @@ def read_geopandas(
|
|
|
42
60
|
if file_system is None:
|
|
43
61
|
file_system = dp.FileClient.get_gcs_file_system()
|
|
44
62
|
|
|
45
|
-
if isinstance(gcs_path, (
|
|
63
|
+
if not isinstance(gcs_path, (str | Path | os.PathLike)):
|
|
46
64
|
kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
|
|
65
|
+
|
|
66
|
+
if mask is not None:
|
|
67
|
+
if not isinstance(gcs_path, GeoSeries):
|
|
68
|
+
bounds_series: GeoSeries = get_bounds_series(
|
|
69
|
+
gcs_path,
|
|
70
|
+
file_system,
|
|
71
|
+
threads=threads,
|
|
72
|
+
pandas_fallback=pandas_fallback,
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
bounds_series = gcs_path
|
|
76
|
+
new_bounds_series = sfilter(bounds_series, mask)
|
|
77
|
+
if not len(new_bounds_series):
|
|
78
|
+
if isinstance(kwargs.get("columns"), Iterable):
|
|
79
|
+
cols = {col: [] for col in kwargs["columns"]}
|
|
80
|
+
else:
|
|
81
|
+
cols = {}
|
|
82
|
+
for path in bounds_series.index:
|
|
83
|
+
try:
|
|
84
|
+
cols |= {col: [] for col in _get_columns(path, file_system)}
|
|
85
|
+
except ArrowInvalid as e:
|
|
86
|
+
if file_system.isfile(path):
|
|
87
|
+
raise ArrowInvalid(e, path) from e
|
|
88
|
+
|
|
89
|
+
return GeoDataFrame(cols | {"geometry": []})
|
|
90
|
+
paths = list(new_bounds_series.index)
|
|
91
|
+
else:
|
|
92
|
+
if isinstance(gcs_path, GeoSeries):
|
|
93
|
+
paths = list(gcs_path.index)
|
|
94
|
+
else:
|
|
95
|
+
paths = list(gcs_path)
|
|
96
|
+
|
|
97
|
+
if threads is None:
|
|
98
|
+
threads = min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
|
|
99
|
+
|
|
47
100
|
# recursive read with threads
|
|
48
|
-
with joblib.Parallel(n_jobs=
|
|
101
|
+
with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
|
|
49
102
|
dfs: list[GeoDataFrame] = parallel(
|
|
50
|
-
joblib.delayed(read_geopandas)(x, **kwargs) for x in
|
|
103
|
+
joblib.delayed(read_geopandas)(x, **kwargs) for x in paths
|
|
51
104
|
)
|
|
52
|
-
|
|
105
|
+
df = pd.concat(dfs)
|
|
106
|
+
if mask is not None:
|
|
107
|
+
return sfilter(df, mask)
|
|
108
|
+
return df
|
|
53
109
|
|
|
54
110
|
if not isinstance(gcs_path, str):
|
|
55
111
|
try:
|
|
@@ -60,20 +116,26 @@ def read_geopandas(
|
|
|
60
116
|
if "parquet" in gcs_path or "prqt" in gcs_path:
|
|
61
117
|
with file_system.open(gcs_path, mode="rb") as file:
|
|
62
118
|
try:
|
|
63
|
-
|
|
119
|
+
df = gpd.read_parquet(file, **kwargs)
|
|
64
120
|
except ValueError as e:
|
|
65
121
|
if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
|
|
66
|
-
raise e
|
|
122
|
+
raise e.__class__(
|
|
123
|
+
f"{e.__class__.__name__}: {e} for {gcs_path}."
|
|
124
|
+
) from e
|
|
67
125
|
df = dp.read_pandas(gcs_path, **kwargs)
|
|
68
126
|
|
|
69
127
|
if pandas_fallback or not len(df):
|
|
70
128
|
return df
|
|
71
129
|
else:
|
|
72
|
-
|
|
130
|
+
more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
|
|
131
|
+
raise e.__class__(
|
|
132
|
+
f"{e.__class__.__name__}: {e} for {df}." + more_txt
|
|
133
|
+
) from e
|
|
134
|
+
|
|
73
135
|
else:
|
|
74
136
|
with file_system.open(gcs_path, mode="rb") as file:
|
|
75
137
|
try:
|
|
76
|
-
|
|
138
|
+
df = gpd.read_file(file, **kwargs)
|
|
77
139
|
except ValueError as e:
|
|
78
140
|
if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
|
|
79
141
|
raise e
|
|
@@ -82,7 +144,144 @@ def read_geopandas(
|
|
|
82
144
|
if pandas_fallback or not len(df):
|
|
83
145
|
return df
|
|
84
146
|
else:
|
|
85
|
-
|
|
147
|
+
more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
|
|
148
|
+
raise e.__class__(
|
|
149
|
+
f"{e.__class__.__name__}: {e} for {df}. " + more_txt
|
|
150
|
+
) from e
|
|
151
|
+
|
|
152
|
+
if mask is not None:
|
|
153
|
+
return sfilter(df, mask)
|
|
154
|
+
return df
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _get_bounds_parquet(
|
|
158
|
+
path: str | Path, file_system: dp.gcs.GCSFileSystem, pandas_fallback: bool = False
|
|
159
|
+
) -> tuple[list[float], dict] | tuple[None, None]:
|
|
160
|
+
with file_system.open(path) as f:
|
|
161
|
+
try:
|
|
162
|
+
num_rows = pyarrow.parquet.read_metadata(f).num_rows
|
|
163
|
+
except ArrowInvalid as e:
|
|
164
|
+
if not file_system.isfile(f):
|
|
165
|
+
return None, None
|
|
166
|
+
raise ArrowInvalid(e, path) from e
|
|
167
|
+
if not num_rows:
|
|
168
|
+
return None, None
|
|
169
|
+
meta = pyarrow.parquet.read_schema(f).metadata
|
|
170
|
+
try:
|
|
171
|
+
meta = json.loads(meta[b"geo"])["columns"]["geometry"]
|
|
172
|
+
except KeyError as e:
|
|
173
|
+
if pandas_fallback:
|
|
174
|
+
return None, None
|
|
175
|
+
raise KeyError(
|
|
176
|
+
f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
|
|
177
|
+
# f"{num_rows=}",
|
|
178
|
+
# meta,
|
|
179
|
+
) from e
|
|
180
|
+
return meta["bbox"], meta["crs"]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _get_columns(path: str | Path, file_system: dp.gcs.GCSFileSystem) -> pd.Index:
|
|
184
|
+
with file_system.open(path) as f:
|
|
185
|
+
schema = pyarrow.parquet.read_schema(f)
|
|
186
|
+
index_cols = _get_index_cols(schema)
|
|
187
|
+
return pd.Index(schema.names).difference(index_cols)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
|
|
191
|
+
cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
|
|
192
|
+
return [x for x in cols if not isinstance(x, dict)]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def get_bounds_series(
|
|
196
|
+
paths: list[str | Path] | tuple[str | Path],
|
|
197
|
+
file_system: dp.gcs.GCSFileSystem | None = None,
|
|
198
|
+
threads: int | None = None,
|
|
199
|
+
pandas_fallback: bool = False,
|
|
200
|
+
) -> GeoSeries:
|
|
201
|
+
"""Get a GeoSeries with file paths as indexes and the file's bounds as values.
|
|
202
|
+
|
|
203
|
+
The returned GeoSeries can be used as the first argument of 'read_geopandas'
|
|
204
|
+
along with the 'mask' keyword.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
paths: Iterable of file paths in gcs.
|
|
208
|
+
file_system: Optional instance of dp.gcs.GCSFileSystem.
|
|
209
|
+
If None, an instance is created within the function.
|
|
210
|
+
Note that this is slower in long loops.
|
|
211
|
+
threads: Number of threads to use if reading multiple files. Defaults to
|
|
212
|
+
the number of files to read or the number of available threads (if lower).
|
|
213
|
+
pandas_fallback: If False (default), an exception is raised if the file has
|
|
214
|
+
no geo metadata. If True, the geometry value is set to None for this file.
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
A geopandas.GeoSeries with file paths as indexes and bounds as values.
|
|
218
|
+
|
|
219
|
+
Examples:
|
|
220
|
+
---------
|
|
221
|
+
>>> import sgis as sg
|
|
222
|
+
>>> import dapla as dp
|
|
223
|
+
>>> file_system = dp.FileClient.get_gcs_file_system()
|
|
224
|
+
>>> all_paths = file_system.ls("...")
|
|
225
|
+
|
|
226
|
+
Get the bounds of all your file paths, indexed by path.
|
|
227
|
+
|
|
228
|
+
>>> bounds_series = sg.get_bounds_series(all_paths, file_system)
|
|
229
|
+
>>> bounds_series
|
|
230
|
+
.../0301.parquet POLYGON ((273514.334 6638380.233, 273514.334 6...
|
|
231
|
+
.../1101.parquet POLYGON ((6464.463 6503547.192, 6464.463 65299...
|
|
232
|
+
.../1103.parquet POLYGON ((-6282.301 6564097.347, -6282.301 660...
|
|
233
|
+
.../1106.parquet POLYGON ((-46359.891 6622984.385, -46359.891 6...
|
|
234
|
+
.../1108.parquet POLYGON ((30490.798 6551661.467, 30490.798 658...
|
|
235
|
+
...
|
|
236
|
+
.../5628.parquet POLYGON ((1019391.867 7809550.777, 1019391.867...
|
|
237
|
+
.../5630.parquet POLYGON ((1017907.145 7893398.317, 1017907.145...
|
|
238
|
+
.../5632.parquet POLYGON ((1075687.587 7887714.263, 1075687.587...
|
|
239
|
+
.../5634.parquet POLYGON ((1103447.451 7874551.663, 1103447.451...
|
|
240
|
+
.../5636.parquet POLYGON ((1024129.618 7838961.91, 1024129.618 ...
|
|
241
|
+
Length: 357, dtype: geometry
|
|
242
|
+
|
|
243
|
+
Make a grid around the total bounds of the files,
|
|
244
|
+
and read geometries intersecting with the mask in a loop.
|
|
245
|
+
|
|
246
|
+
>>> grid = sg.make_grid(bounds_series, 10_000)
|
|
247
|
+
>>> for mask in grid.geometry:
|
|
248
|
+
... df = sg.read_geopandas(
|
|
249
|
+
... bounds_series,
|
|
250
|
+
... mask=mask,
|
|
251
|
+
... file_system=file_system,
|
|
252
|
+
... )
|
|
253
|
+
|
|
254
|
+
"""
|
|
255
|
+
if file_system is None:
|
|
256
|
+
file_system = dp.FileClient.get_gcs_file_system()
|
|
257
|
+
|
|
258
|
+
if threads is None:
|
|
259
|
+
threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
|
|
260
|
+
|
|
261
|
+
with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
|
|
262
|
+
bounds: list[tuple[list[float], dict]] = parallel(
|
|
263
|
+
joblib.delayed(_get_bounds_parquet)(
|
|
264
|
+
path, file_system=file_system, pandas_fallback=pandas_fallback
|
|
265
|
+
)
|
|
266
|
+
for path in paths
|
|
267
|
+
)
|
|
268
|
+
crss = {json.dumps(x[1]) for x in bounds}
|
|
269
|
+
crss = {
|
|
270
|
+
crs
|
|
271
|
+
for crs in crss
|
|
272
|
+
if not any(str(crs).lower() == txt for txt in ["none", "null"])
|
|
273
|
+
}
|
|
274
|
+
if not crss:
|
|
275
|
+
crs = None
|
|
276
|
+
elif len(crss) == 1:
|
|
277
|
+
crs = next(iter(crss))
|
|
278
|
+
else:
|
|
279
|
+
raise ValueError(f"crs mismatch: {crss}")
|
|
280
|
+
return GeoSeries(
|
|
281
|
+
[shapely.box(*bbox[0]) if bbox[0] is not None else None for bbox in bounds],
|
|
282
|
+
index=paths,
|
|
283
|
+
crs=crs,
|
|
284
|
+
)
|
|
86
285
|
|
|
87
286
|
|
|
88
287
|
def write_geopandas(
|
|
@@ -91,6 +290,7 @@ def write_geopandas(
|
|
|
91
290
|
overwrite: bool = True,
|
|
92
291
|
pandas_fallback: bool = False,
|
|
93
292
|
file_system: dp.gcs.GCSFileSystem | None = None,
|
|
293
|
+
write_covering_bbox: bool = False,
|
|
94
294
|
**kwargs,
|
|
95
295
|
) -> None:
|
|
96
296
|
"""Writes a GeoDataFrame to the specified format.
|
|
@@ -106,6 +306,13 @@ def write_geopandas(
|
|
|
106
306
|
not be written with geopandas and the number of rows is more than 0. If True,
|
|
107
307
|
the file will be written without geo-metadata if >0 rows.
|
|
108
308
|
file_system: Optional file system.
|
|
309
|
+
write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
|
|
310
|
+
Writing a bbox column can be computationally expensive, but allows you to specify
|
|
311
|
+
a bbox in :func:`read_parquet` for filtered reading.
|
|
312
|
+
Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
|
|
313
|
+
considered as experimental. While writing the column is backwards compatible, using it
|
|
314
|
+
for filtering may not be supported by all readers.
|
|
315
|
+
|
|
109
316
|
**kwargs: Additional keyword arguments passed to parquet.write_table
|
|
110
317
|
(for parquet) or geopandas' to_file method (if not parquet).
|
|
111
318
|
"""
|
|
@@ -118,25 +325,37 @@ def write_geopandas(
|
|
|
118
325
|
if not overwrite and exists(gcs_path):
|
|
119
326
|
raise ValueError("File already exists.")
|
|
120
327
|
|
|
121
|
-
if file_system is None:
|
|
122
|
-
file_system = dp.FileClient.get_gcs_file_system()
|
|
123
|
-
|
|
124
328
|
if not isinstance(df, GeoDataFrame):
|
|
125
329
|
raise ValueError("DataFrame must be GeoDataFrame.")
|
|
126
330
|
|
|
331
|
+
if file_system is None:
|
|
332
|
+
file_system = dp.FileClient.get_gcs_file_system()
|
|
333
|
+
|
|
127
334
|
if not len(df):
|
|
128
335
|
if pandas_fallback:
|
|
129
|
-
df.geometry = df.geometry.astype(str)
|
|
130
336
|
df = pd.DataFrame(df)
|
|
131
|
-
|
|
337
|
+
df.geometry = df.geometry.astype(str)
|
|
338
|
+
df.geometry = None
|
|
339
|
+
try:
|
|
340
|
+
dp.write_pandas(df, gcs_path, **kwargs)
|
|
341
|
+
except Exception as e:
|
|
342
|
+
more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
|
|
343
|
+
raise e.__class__(
|
|
344
|
+
f"{e.__class__.__name__}: {e} for {df}. " + more_txt
|
|
345
|
+
) from e
|
|
132
346
|
return
|
|
133
347
|
|
|
134
348
|
file_system = dp.FileClient.get_gcs_file_system()
|
|
135
349
|
|
|
136
350
|
if ".parquet" in gcs_path or "prqt" in gcs_path:
|
|
137
351
|
with file_system.open(gcs_path, mode="wb") as buffer:
|
|
138
|
-
table = _geopandas_to_arrow(
|
|
139
|
-
|
|
352
|
+
table = _geopandas_to_arrow(
|
|
353
|
+
df,
|
|
354
|
+
index=df.index,
|
|
355
|
+
schema_version=None,
|
|
356
|
+
write_covering_bbox=write_covering_bbox,
|
|
357
|
+
)
|
|
358
|
+
pyarrow.parquet.write_table(table, buffer, compression="snappy", **kwargs)
|
|
140
359
|
return
|
|
141
360
|
|
|
142
361
|
layer = kwargs.pop("layer", None)
|
sgis/io/read_parquet.py
CHANGED
|
@@ -15,7 +15,7 @@ def read_parquet_url(url: str) -> GeoDataFrame:
|
|
|
15
15
|
A GeoDataFrame.
|
|
16
16
|
|
|
17
17
|
Examples:
|
|
18
|
-
|
|
18
|
+
---------
|
|
19
19
|
>>> from sgis import read_parquet_url
|
|
20
20
|
>>> url = "https://media.githubusercontent.com/media/statisticsnorway/ssb-sgis/main/tests/testdata/points_oslo.parquet"
|
|
21
21
|
>>> points = read_parquet_url(url)
|