ssb-sgis 1.1.2-py3-none-any.whl → 1.1.4-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- sgis/geopandas_tools/bounds.py +4 -4
- sgis/helpers.py +11 -2
- sgis/io/dapla_functions.py +267 -226
- sgis/maps/wms.py +7 -1
- {ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/RECORD +8 -8
- {ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/WHEEL +0 -0
sgis/geopandas_tools/bounds.py
CHANGED
@@ -507,7 +507,7 @@ def make_ssb_grid(
         to make sure all data is covered by the grid.

     Returns:
-        GeoDataFrame with grid geometries and a column '
+        GeoDataFrame with grid geometries and a column 'ssb_rute_id'.

     Raises:
         ValueError: If the GeoDataFrame does not have 25833 as crs.
@@ -568,12 +568,12 @@ def make_ssb_grid(
     grid["nordc"] = (
         (np.floor((grid.geometry.centroid.y) / gridsize) * gridsize).apply(int)
     ).apply(str)
-    grid["
-    return grid[["
+    grid["ssb_rute_id"] = grid["ostc"] + grid["nordc"]
+    return grid[["ssb_rute_id", "geometry"]]


 def add_grid_id(
-    gdf: GeoDataFrame, gridsize: int, out_column: str = "
+    gdf: GeoDataFrame, gridsize: int, out_column: str = "ssb_rute_id"
 ) -> GeoDataFrame:
     """Adds an SSB grid ID column to a GeoDataFrame of points.

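Both hunks rename the grid ID column to 'ssb_rute_id'. A minimal usage sketch, assuming make_ssb_grid is re-exported at the package root as sgis normally does (the coordinate is made up, in EPSG:25833 as the Raises clause requires):

    import geopandas as gpd
    import sgis
    from shapely.geometry import Point

    points = gpd.GeoDataFrame(geometry=[Point(261_011, 6_649_913)], crs=25833)
    grid = sgis.make_ssb_grid(points, gridsize=1000)
    print(list(grid.columns))  # expected: ['ssb_rute_id', 'geometry']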
sgis/helpers.py
CHANGED
@@ -28,12 +28,21 @@ def _get_file_system(
     file_system: None | AbstractFileSystem, kwargs: dict
 ) -> AbstractFileSystem:
     if (
-        file_system is not None and "filesystem" in kwargs or "file_system" in kwargs
+        file_system is not None and ("filesystem" in kwargs or "file_system" in kwargs)
     ) or ("filesystem" in kwargs and "file_system" in kwargs):
         raise ValueError("Cannot pass both filesystem and file_system.")
     file_system2 = kwargs.pop("file_system", None)
     file_system3 = kwargs.pop("filesystem", None)
-    return
+    return (
+        file_system
+        or file_system2
+        or file_system3
+        or (
+            config["file_system"]()
+            if callable(config["file_system"])
+            else config["file_system"]
+        )
+    )


 def get_numpy_func(text: str, error_message: str | None = None) -> Callable:
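The first change fixes an operator-precedence bug: `and` binds tighter than `or`, so the old condition raised a ValueError whenever "file_system" appeared in kwargs alone. A minimal sketch of the difference (the boolean names are illustrative, not from the source):

    A = False  # the file_system argument is None
    B = False  # "filesystem" in kwargs
    C = True   # "file_system" in kwargs

    old = A and B or C    # parsed as (A and B) or C -> True: spurious error
    new = A and (B or C)  # -> False: a single spelling no longer raises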
sgis/io/dapla_functions.py
CHANGED
@@ -14,6 +14,7 @@ from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from pathlib import Path
+from typing import Any

 import geopandas as gpd
 import joblib
@@ -25,6 +26,7 @@ import pyarrow.parquet as pq
 import shapely
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
+from geopandas.io.arrow import _arrow_to_geopandas
 from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid
@@ -49,7 +51,7 @@ def read_geopandas(
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
     mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
-
+    use_threads: bool = True,
     filters: pyarrow.dataset.Expression | None = None,
     **kwargs,
 ) -> GeoDataFrame | DataFrame:
@@ -68,11 +70,11 @@ def read_geopandas(
             not be read with geopandas and the number of rows is more than 0. If True,
             the file will be read with pandas if geopandas fails.
         file_system: Optional file system.
-        mask:
-
-
-
-
+        mask: If gcs_path is a partitioned parquet file or an interable of paths.
+            Only files with a bbox intersecting mask will be read.
+            Note that the data is not filtered on a row level. You should either
+            use clip or sfilter to filter the data after reading.
+        use_threads: Defaults to True.
         filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
             structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
            More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
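A hedged usage sketch of the documented mask behaviour, assuming read_geopandas and sfilter are re-exported at the package root (the bucket paths and bounding box are made up):

    import sgis
    from shapely.geometry import box

    area_of_interest = box(255_000, 6_640_000, 265_000, 6_660_000)  # EPSG:25833
    paths = ["gs://bucket/data/part-0.parquet", "gs://bucket/data/part-1.parquet"]

    df = sgis.read_geopandas(paths, mask=area_of_interest)
    # mask only prunes whole files by bbox, so filter rows afterwards:
    df = sgis.sfilter(df, area_of_interest)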
@@ -85,72 +87,38 @@ def read_geopandas(
     file_system = _get_file_system(file_system, kwargs)

     if not isinstance(gcs_path, (str | Path | os.PathLike)):
-
-
-
-
-
-
-
-
-            )
-        else:
-            bounds_series = gcs_path
-        new_bounds_series = sfilter(bounds_series, mask)
-        if not len(new_bounds_series):
-            if isinstance(kwargs.get("columns"), Iterable):
-                cols = {col: [] for col in kwargs["columns"]}
-            else:
-                cols = {}
-            for path in bounds_series.index:
-                try:
-                    cols |= {col: [] for col in _get_columns(path, file_system)}
-                except ArrowInvalid as e:
-                    if file_system.isfile(path):
-                        raise ArrowInvalid(e, path) from e
-
-            return GeoDataFrame(cols | {"geometry": []})
-        paths = list(new_bounds_series.index)
-    else:
-        if isinstance(gcs_path, GeoSeries):
-            paths = list(gcs_path.index)
-        else:
-            paths = list(gcs_path)
-
-    if threads is None:
-        threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
+        return _read_geopandas_from_iterable(
+            gcs_path,
+            mask=mask,
+            file_system=file_system,
+            use_threads=use_threads,
+            pandas_fallback=pandas_fallback,
+            **kwargs,
+        )

-
-
-
-
-
-
+    if (
+        isinstance(filters, Iterable)
+        and len(filters) == 1
+        and ("=" in next(iter(filters)) or "==" in next(iter(filters)))
+    ):
+        # try to read only files in the relevant partition, because glob is slow without GCSFileSystem
+        try:
+            expression = "".join(next(iter(filters))).replace("==", "=")
+            glob_func = _get_glob(file_system)
+            paths = glob_func(str(Path(gcs_path) / expression))
+            if paths:
+                return _read_geopandas_from_iterable(
+                    paths,
+                    mask=mask,
                     file_system=file_system,
+                    use_threads=use_threads,
                     pandas_fallback=pandas_fallback,
-                    mask=mask,
-                    threads=threads,
                     **kwargs,
                 )
-
-
-
-        if dfs:
-            df = pd.concat(dfs, ignore_index=True)
-            try:
-                df = GeoDataFrame(df)
-            except Exception as e:
-                if not pandas_fallback:
-                    print(e)
-                    raise e
-        else:
-            df = GeoDataFrame(cols | {"geometry": []})
-
-        if mask is not None:
-            return sfilter(df, mask)
-        return df
+        except FileNotFoundError:
+            pass

-    child_paths =
+    child_paths = get_child_paths(gcs_path, file_system)
     if child_paths:
         return gpd.GeoDataFrame(
             _read_partitioned_parquet(
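The new single-equality shortcut above turns a filter into a hive-style directory name and globs only that partition. A sketch of the string manipulation it performs:

    filters = [("year", "==", "2024")]
    expression = "".join(next(iter(filters))).replace("==", "=")
    print(expression)  # "year=2024" -> glob "<gcs_path>/year=2024"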
@@ -158,65 +126,93 @@ def read_geopandas(
                 read_func=_read_geopandas,
                 file_system=file_system,
                 mask=mask,
-                pandas_fallback=pandas_fallback,
                 filters=filters,
                 child_paths=child_paths,
                 **kwargs,
             )
         )

-    if "parquet"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raise e.__class__(
-                f"{e.__class__.__name__}: {e} for {df}." + more_txt
-            ) from e
-    except Exception as e:
-        raise e.__class__(f"{e.__class__.__name__}: {e} for {gcs_path}.") from e
+    if gcs_path.endswith(".parquet"):
+        file_format: str = "parquet"
+        read_func = gpd.read_parquet
+    else:
+        file_format: str = Path(gcs_path).suffix.lstrip(".")
+        read_func = gpd.read_file
+
+    with file_system.open(gcs_path, mode="rb") as file:
+        df = _read_geopandas(
+            file,
+            read_func=read_func,
+            file_format=file_format,
+            filters=filters,
+            **kwargs,
+        )
+
+    return df
+

+def _read_geopandas_from_iterable(
+    paths, mask, file_system, use_threads, pandas_fallback, **kwargs
+):
+    cols = {}
+    if mask is None and isinstance(paths, GeoSeries):
+        # bounds GeoSeries indexed with file paths
+        paths = list(paths.index)
+    elif mask is None:
+        paths = list(paths)
     else:
-
-
-
-
-
-
-
-
-
-
-
-
+        if not isinstance(paths, GeoSeries):
+            bounds_series: GeoSeries = get_bounds_series(
+                paths,
+                file_system,
+                use_threads=use_threads,
+                pandas_fallback=pandas_fallback,
+            )
+        else:
+            bounds_series = paths
+        new_bounds_series = sfilter(bounds_series, mask)
+        if not len(new_bounds_series):
+            if isinstance(kwargs.get("columns"), Iterable):
+                cols = {col: [] for col in kwargs["columns"]}
+            else:
+                cols = {}
+                for path in bounds_series.index:
+                    try:
+                        cols |= {col: [] for col in _get_columns(path, file_system)}
+                    except ArrowInvalid as e:
+                        if file_system.isfile(path):
+                            raise ArrowInvalid(e, path) from e
+            return GeoDataFrame(cols | {"geometry": []})
+        paths = list(new_bounds_series.index)
+
+    # recursive read with threads
+    threads = (
+        min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+    )
+    with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
+        dfs: list[GeoDataFrame] = parallel(
+            joblib.delayed(read_geopandas)(
+                x,
+                file_system=file_system,
+                pandas_fallback=pandas_fallback,
+                mask=mask,
+                use_threads=use_threads,
+                **kwargs,
+            )
+            for x in paths
+        )
+
+    if dfs:
+        df = pd.concat(dfs, ignore_index=True)
+        try:
+            df = GeoDataFrame(df)
+        except Exception as e:
+            if not pandas_fallback:
+                print(e)
+                raise e
+    else:
+        df = GeoDataFrame(cols | {"geometry": []})

-    if pandas_fallback or not len(df):
-        return df
-    else:
-        more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
-        raise e.__class__(
-            f"{e.__class__.__name__}: {e} for {df}. " + more_txt
-        ) from e
-    except Exception as e:
-        raise e.__class__(
-            f"{e.__class__.__name__}: {e} for {gcs_path}." + more_txt
-        ) from e
-
-    if mask is not None:
-        return sfilter(df, mask)
     return df

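The thread count for the recursive read is one thread per file, capped at the CPU count; the `or 1` guards against `min(...)` evaluating to 0 for an empty path list, and `use_threads=False` forces a single thread:

    import multiprocessing

    use_threads = True
    paths = []  # no files
    threads = (
        min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
    )
    print(threads)  # 1, because min(0, cpus) == 0 and "or 1" kicks in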
@@ -230,15 +226,25 @@ def _get_bounds_parquet(
 def _get_bounds_parquet_from_open_file(
     file, file_system
 ) -> tuple[list[float], dict] | tuple[None, None]:
-    geo_metadata =
+    geo_metadata = _get_geo_metadata_primary_column(file, file_system)
+
     if not geo_metadata:
         return None, None
     return geo_metadata["bbox"], geo_metadata["crs"]


 def _get_geo_metadata(file, file_system) -> dict:
-
-
+    try:
+        meta = pq.read_schema(file).metadata
+    except FileNotFoundError:
+        with file_system.open(file, "rb") as f:
+            meta = pq.read_schema(f).metadata
+
+    return json.loads(meta[b"geo"])
+
+
+def _get_geo_metadata_primary_column(file, file_system) -> dict:
+    geo_metadata = _get_geo_metadata(file, file_system)
     try:
         primary_column = geo_metadata["primary_column"]
     except KeyError as e:
@@ -252,6 +258,7 @@ def _get_geo_metadata(file, file_system) -> dict:
         if not file_system.isfile(file):
             return {}
         raise ArrowInvalid(e, file) from e
+    # allow for 0 lengthed tables not to have geo metadata
     if not num_rows:
         return {}
     return {}
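_get_geo_metadata reads the GeoParquet "geo" entry straight from the parquet schema metadata. A standalone sketch of the same lookup, following the GeoParquet spec's nested layout (the file name is hypothetical):

    import json
    import pyarrow.parquet as pq

    meta = pq.read_schema("file.parquet").metadata  # raw bytes key/value pairs
    geo = json.loads(meta[b"geo"])
    primary = geo["primary_column"]
    bbox = geo["columns"][primary]["bbox"]  # per the GeoParquet spec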
@@ -272,7 +279,7 @@ def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
 def get_bounds_series(
     paths: list[str | Path] | tuple[str | Path],
     file_system: GCSFileSystem | None = None,
-
+    use_threads: bool = True,
     pandas_fallback: bool = False,
 ) -> GeoSeries:
     """Get a GeoSeries with file paths as indexes and the file's bounds as values.
@@ -285,8 +292,7 @@ def get_bounds_series(
         file_system: Optional instance of GCSFileSystem.
            If None, an instance is created within the function.
            Note that this is slower in long loops.
-
-            the number of files to read or the number of available threads (if lower).
+        use_threads: Default True.
         pandas_fallback: If False (default), an exception is raised if the file has
            no geo metadata. If True, the geometry value is set to None for this file.

@@ -330,8 +336,9 @@ def get_bounds_series(
     """
     file_system = _get_file_system(file_system, {})

-
-
+    threads = (
+        min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+    )

     with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
         bounds: list[tuple[list[float], dict]] = parallel(
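A hedged sketch of combining get_bounds_series with a spatial filter to pick which files to read, mirroring what read_geopandas now does internally for iterables (the paths and bbox are made up; assumes the functions are re-exported at the package root):

    import sgis
    from shapely.geometry import box

    area_of_interest = box(255_000, 6_640_000, 265_000, 6_660_000)
    bounds = sgis.get_bounds_series(["gs://bucket/a.parquet", "gs://bucket/b.parquet"])
    # index = file path, value = the file's bounds as a polygon
    relevant = sgis.sfilter(bounds, area_of_interest)
    df = sgis.read_geopandas(list(relevant.index))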
@@ -396,9 +403,11 @@ def write_geopandas(
         raise ValueError("File already exists.")

     if not isinstance(df, GeoDataFrame):
-        raise ValueError(
+        raise ValueError(
+            f"DataFrame must be GeoDataFrame. Got {type(df)} for {gcs_path}."
+        )

-    if not len(df) and
+    if not len(df) and get_child_paths(gcs_path, file_system):
         # no need to write empty df
         return
     elif not len(df):
@@ -473,6 +482,23 @@ def _to_geopandas(df, path, **kwargs) -> None:
     pq.write_table(table, path, compression="snappy", **kwargs)


+def _pyarrow_schema_from_geopandas(df: GeoDataFrame) -> pyarrow.Schema:
+    geom_name = df.geometry.name
+    pandas_columns = [col for col in df if col != geom_name]
+    schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+    index_columns = _get_index_cols(schema)
+    return pyarrow.schema(
+        [
+            (
+                (schema.field(col).name, schema.field(col).type)
+                if col != geom_name
+                else (geom_name, pyarrow.binary())
+            )
+            for col in [*df.columns, *index_columns]
+        ]
+    )
+
+
 def _remove_file(path, file_system) -> None:
     try:
         file_system.rm_file(str(path))
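The extracted helper keeps every pandas field's inferred arrow type but forces the geometry field to plain binary, since geometries are serialized to WKB before writing. A toy sketch of the resulting schema shape (not the package's code path):

    import pyarrow

    schema = pyarrow.schema([("name", pyarrow.string()), ("geometry", pyarrow.binary())])
    print(schema.field("geometry").type)  # binary: geometry stored as WKB bytes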
@@ -494,38 +520,22 @@ def _write_partitioned_geoparquet(
     file_system=None,
     write_func: Callable = _to_geopandas,
     existing_data_behavior: str = "error",
+    basename_template: str | None = None,
     **kwargs,
 ):
-    if isinstance(partition_cols, str):
-        partition_cols = [partition_cols]
-
     file_system = _get_file_system(file_system, kwargs)

-
-
+    if basename_template is None:
+        basename_template = uuid.uuid4().hex + "-{i}.parquet"
+
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]

     for col in partition_cols:
         if df[col].isna().all() and not kwargs.get("schema"):
             raise ValueError("Must specify 'schema' when all rows are NA.")

-
-        glob_func = functools.partial(file_system.glob, detail=False)
-    except AttributeError:
-        glob_func = functools.partial(glob.glob, recursive=True)
-
-    args: list[tuple[Path, DataFrame]] = []
-    dirs: list[Path] = set()
-    for group, rows in df.groupby(partition_cols, dropna=False):
-        name = (
-            "/".join(
-                f"{col}={value if not pd.isna(value) else NULL_VALUE}"
-                for col, value in zip(partition_cols, group, strict=True)
-            )
-            + f"/{unique_id}.parquet"
-        )
-
-        dirs.add((path / name).parent)
-        args.append((path / name, rows))
+    glob_func = _get_glob(file_system)

     if file_system.exists(path) and file_system.isfile(path):
         _remove_file(path, file_system)
@@ -533,49 +543,56 @@ def _write_partitioned_geoparquet(
     if kwargs.get("schema"):
         schema = kwargs.pop("schema")
     elif isinstance(df, GeoDataFrame):
-
-        pandas_columns = [col for col in df if col != geom_name]
-        schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
-        index_columns = _get_index_cols(schema)
-        schema = pyarrow.schema(
-            [
-                (
-                    (schema.field(col).name, schema.field(col).type)
-                    if col != geom_name
-                    else (geom_name, pyarrow.binary())
-                )
-                for col in [*df.columns, *index_columns]
-                # for col in df.columns
-            ]
-        )
+        schema = _pyarrow_schema_from_geopandas(df)
     else:
         schema = pyarrow.Schema.from_pandas(df, preserve_index=True)

-    def
-
-        return {
-
-
-
-
-
-
-
-
-
-
-
-
+    def as_partition_part(col: str, value: Any) -> str:
+        value = value if not pd.isna(value) else NULL_VALUE
+        return f"{col}={value}"
+
+    paths: list[Path] = []
+    dfs: list[DataFrame] = []
+    for group, rows in df.groupby(partition_cols, dropna=False):
+        partition_parts = "/".join(
+            as_partition_part(col, value)
+            for col, value in zip(partition_cols, group, strict=True)
+        )
+        paths.append(Path(path) / partition_parts)
+        dfs.append(rows)
+
+    def threaded_write(rows: DataFrame, path: str) -> None:
+        this_basename = basename_template.replace("-{i}", "0")
+        for i, sibling_path in enumerate(sorted(glob_func(str(Path(path) / "**")))):
+            if paths_are_equal(sibling_path, path):
+                continue
+            if existing_data_behavior == "delete_matching":
+                _remove_file(sibling_path, file_system)
+            elif existing_data_behavior == "error":
+                raise pyarrow.ArrowInvalid(
+                    f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                )
+            else:
+                this_basename = basename_template.replace("-{i}", str(i + 1))
+
+        out_path = str(Path(path) / this_basename)
         try:
-            with file_system.open(
+            with file_system.open(out_path, mode="wb") as file:
                 write_func(rows, file, schema=schema, **kwargs)
         except FileNotFoundError:
-            file_system.makedirs(str(
-            with file_system.open(
+            file_system.makedirs(str(path), exist_ok=True)
+            with file_system.open(out_path, mode="wb") as file:
                 write_func(rows, file, schema=schema, **kwargs)

     with ThreadPoolExecutor() as executor:
-
+        executor.map(threaded_write, dfs, paths)
+
+
+def _get_glob(file_system) -> functools.partial:
+    try:
+        return functools.partial(file_system.glob)
+    except AttributeError:
+        return functools.partial(glob.glob, recursive=True)


 def _filters_to_expression(filters) -> list[ds.Expression]:
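as_partition_part builds the hive-style col=value directory names, substituting NULL_VALUE for missing values. An illustrative standalone version (NULL_VALUE is defined elsewhere in the module; the sentinel string below is an assumption):

    import pandas as pd

    NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"  # assumed sentinel; may differ

    def as_partition_part(col, value):
        value = value if not pd.isna(value) else NULL_VALUE
        return f"{col}={value}"

    print(as_partition_part("year", 2024))  # year=2024
    print(as_partition_part("year", None))  # year=__HIVE_DEFAULT_PARTITION__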
@@ -612,6 +629,8 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
     >>> expression_match_path(path, expression)
     False
     """
+    # keep only the parts in between the two .parquet parts
+    path = str(path).split(".parquet")[1]
     if NULL_VALUE in path:
         return True
     # build a one lengthed pyarrow.Table of the partitioning in the file path
@@ -627,39 +646,60 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
     try:
         table = table.filter(expression)
     except pyarrow.ArrowInvalid as e:
-        if "No match for FieldRef"
-
-
-
+        if "No match for FieldRef" in str(e):
+            # if a non-partition col is used in 'filters',
+            # we cannot determine if the expression match without reading the file
+            return True
+        raise e
     return bool(len(table))


-def _read_geopandas(
+def _read_geopandas(
+    file,
+    read_func: Callable = gpd.read_parquet,
+    file_format: str = "parquet",
+    **kwargs,
+):
     try:
-        return
-    except
-        if not
-            raise e
-        df = pd.read_parquet(file, **kwargs)
-        if len(df):
+        return read_func(file, **kwargs)
+    except ValueError as e:
+        if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
             raise e
-
+        df = getattr(pd, f"read_{file_format}")(file, **kwargs)
+        if not len(df):
+            return GeoDataFrame(df)
+        raise e.__class__(f"{e.__class__.__name__}: {e} for {df}. ") from e
+    except Exception as e:
+        raise e.__class__(f"{e.__class__.__name__}: {e} for {file}.") from e


 def _read_pandas(gcs_path: str, **kwargs):
     file_system = _get_file_system(None, kwargs)

-
-
-
-
-
-
-
-
-
-
+    if not isinstance(gcs_path, (str | Path | os.PathLike)):
+        # recursive read with threads
+        threads = (
+            min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
+            if kwargs.get("use_threads")
+            else 1
+        )
+        with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
+            return pd.concat(
+                parallel(
+                    joblib.delayed(_read_pandas)(x, file_system=file_system, **kwargs)
+                    for x in gcs_path
+                )
             )
+
+    child_paths = get_child_paths(gcs_path, file_system)
+    if child_paths:
+        return _read_partitioned_parquet(
+            gcs_path,
+            read_func=pd.read_parquet,
+            file_system=file_system,
+            mask=None,
+            child_paths=child_paths,
+            **kwargs,
         )

     with file_system.open(gcs_path, "rb") as file:
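A hedged sketch of calling expression_match_path with a pyarrow dataset expression against a hive-partitioned child path (the path is made up; per the hunk above, everything up to the first '.parquet' is stripped before matching):

    import pyarrow.dataset as ds

    expression = ds.field("year") == "2024"
    path = "gs://bucket/data.parquet/year=2024/abc123-0.parquet"
    expression_match_path(expression, path)  # expected True for this partition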
@@ -678,10 +718,7 @@ def _read_partitioned_parquet(
     file_system = _get_file_system(file_system, kwargs)

     if child_paths is None:
-
-            glob_func = functools.partial(file_system.glob)
-        except AttributeError:
-            glob_func = functools.partial(glob.glob, recursive=True)
+        glob_func = _get_glob(file_system)
         child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))

     filters = _filters_to_expression(filters)
@@ -690,13 +727,13 @@ def _read_partitioned_parquet(
         bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
         return shapely.box(*bbox).intersects(to_shapely(mask))

-    def read(path) ->
+    def read(path: str) -> pyarrow.Table | None:
         with file_system.open(path, "rb") as file:
             if mask is not None and not intersects(file, mask):
                 return

+            # 'get' instead of 'pop' because dict is mutable
             schema = kwargs.get("schema", pq.read_schema(file))
-            # copy kwargs because mutable
             new_kwargs = {
                 key: value for key, value in kwargs.items() if key != "schema"
             }
@@ -705,8 +742,8 @@ def _read_partitioned_parquet(

     with ThreadPoolExecutor() as executor:
         results = [
-
-            for
+            df
+            for df in (
                 executor.map(
                     read,
                     (
@@ -716,30 +753,34 @@ def _read_partitioned_parquet(
                     ),
                 )
             )
-            if
+            if df is not None
         ]
+
     if results:
-        if
-
-
+        if all(isinstance(x, DataFrame) for x in results):
+            results = pd.concat(results)
+        else:
+            geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
+            results = _arrow_to_geopandas(
+                pyarrow.concat_tables(
+                    results,
+                    promote_options="permissive",
+                ),
+                geo_metadata,
+            )
+        return results

     # add columns to empty DataFrame
     first_path = next(iter(child_paths + [path]))
-    return pd.DataFrame(
-        columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
-    )
+    return pd.DataFrame(columns=_get_columns(first_path, file_system))


 def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
     return Path(path1).parts == Path(path2).parts


-def
-
-    glob_func = functools.partial(file_system.glob, detail=False)
-    except AttributeError:
-        glob_func = functools.partial(glob.glob, recursive=True)
-
+def get_child_paths(path, file_system) -> list[str]:
+    glob_func = _get_glob(file_system)
     return [
         x
         for x in glob_func(str(Path(path) / "**/*.parquet"))
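When the per-file results are arrow tables, they are merged with permissive promotion so partitions whose schemas drifted (for example a column missing from one file) still concatenate, with the gaps filled as nulls. A standalone sketch (promote_options requires a reasonably recent pyarrow):

    import pyarrow

    t1 = pyarrow.table({"a": [1]})
    t2 = pyarrow.table({"a": [2], "b": ["x"]})
    merged = pyarrow.concat_tables([t1, t2], promote_options="permissive")
    print(merged.column("b").to_pylist())  # [None, 'x']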
sgis/maps/wms.py
CHANGED
@@ -15,6 +15,8 @@ from ..geopandas_tools.conversion import to_shapely

 JSON_PATH = Path(__file__).parent / "norge_i_bilder.json"

+JSON_YEARS = [str(year) for year in range(1999, 2025)]
+
 DEFAULT_YEARS: tuple[str] = tuple(
     str(year)
     for year in range(
@@ -168,7 +170,7 @@ class NorgeIBilderWms(WmsLoader):

         self.years = [str(int(year)) for year in self.years]

-        if all(year in
+        if all(year in JSON_YEARS for year in self.years):
             try:
                 with open(JSON_PATH, encoding="utf-8") as file:
                     self.tiles = json.load(file)
@@ -185,3 +187,7 @@ class NorgeIBilderWms(WmsLoader):
             ]
         else:
             self.tiles = None
+
+    def __repr__(self) -> str:
+        """Print representation."""
+        return f"{self.__class__.__name__}({len(self.tiles or [])})"
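The new __repr__ shows the class name and the number of loaded tiles, degrading to 0 when self.tiles is None. A hedged illustration (the constructor arguments and the tile count are made up):

    wms = NorgeIBilderWms(years=[2020, 2021])
    print(wms)  # e.g. "NorgeIBilderWms(42)", or "NorgeIBilderWms(0)" if tiles is None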
{ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/RECORD
CHANGED
@@ -3,7 +3,7 @@ sgis/conf.py,sha256=5GYeg00-qXtV_roskBaihZ2s5Ut3oF-cxjICo9UFJU0,2134
 sgis/debug_config.py,sha256=Tfr19kU46hSkkspsIJcrUWvlhaL4U3-f8xEPkujSCAQ,593
 sgis/exceptions.py,sha256=WNaEBPNNx0rmz-YDzlFX4vIE7ocJQruUTqS2RNAu2zU,660
 sgis/geopandas_tools/__init__.py,sha256=bo8lFMcltOz7TtWAi52_ekR2gd3mjfBfKeMDV5zuqFY,28
-sgis/geopandas_tools/bounds.py,sha256=
+sgis/geopandas_tools/bounds.py,sha256=YJyF0gp78hFAjLLZmDquRKCBAtbt7QouG3snTcJeNQs,23822
 sgis/geopandas_tools/buffer_dissolve_explode.py,sha256=t9GJqRMDsHEU74RIlqeMr4QBgbTK0hYlXL4af1RKIks,19955
 sgis/geopandas_tools/centerlines.py,sha256=Q65Sx01SeAlulBEd9oaZkB2maBBNdLcJwAbTILg4SPU,11848
 sgis/geopandas_tools/cleaning.py,sha256=_V3KrJBaL8hZk1Iv6HBfTMTe7GCgcRbWfelkWOxqaIg,24116
@@ -18,10 +18,10 @@ sgis/geopandas_tools/point_operations.py,sha256=JM4hvfIVxZaZdGNlGzcCurrKzkgC_b9h
 sgis/geopandas_tools/polygon_operations.py,sha256=FJ-dXCxLHRsmp0oXsmBOFRprFFwmhrxqOPZkW2WWWQM,50088
 sgis/geopandas_tools/polygons_as_rings.py,sha256=BX_GZS6F9I4NbEpiOlNBd7zywJjdfdJVi_MkeONBuiM,14941
 sgis/geopandas_tools/sfilter.py,sha256=SLcMYprQwnY5DNo0R7TGXk4m6u26H8o4PRn-RPhmeZY,9345
-sgis/helpers.py,sha256=
+sgis/helpers.py,sha256=_h7ke9hJrRNhHW-ZX3gA95fOrX2s1ADKBMxc94p2F4Q,9627
 sgis/io/__init__.py,sha256=uyBr20YDqB2bQttrd5q1JuGOvX32A-MSvS7Wmw5f5qg,177
 sgis/io/_is_dapla.py,sha256=wmfkSe98IrLhUg3dtXZusV6OVC8VlY1kbc5EQDf3P-Q,358
-sgis/io/dapla_functions.py,sha256=
+sgis/io/dapla_functions.py,sha256=nH0aIC7LsyP9SQ_y8iz1ALbqoqq6WK_LqZ9dg3Yvq20,29851
 sgis/io/opener.py,sha256=HWO3G1NB6bpXKM94JadCD513vjat1o1TFjWGWzyVasg,898
 sgis/io/read_parquet.py,sha256=FvZYv1rLkUlrSaUY6QW6E1yntmntTeQuZ9ZRgCDO4IM,3776
 sgis/maps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,7 +34,7 @@ sgis/maps/maps.py,sha256=gxu0rgcVygjudRtM1dVRmsUMilMUIg3vG-UgvASM91E,23072
 sgis/maps/norge_i_bilder.json,sha256=W_mFfte3DxugWbEudZ5fadZ2JeFYb0hyab2Quf4oJME,481311
 sgis/maps/thematicmap.py,sha256=w6q4_gIr8BubQgsPJkc6WXk-tmplDLGcKyjphhFp7ng,21873
 sgis/maps/tilesources.py,sha256=F4mFHxPwkiPJdVKzNkScTX6xbJAMIUtlTq4mQ83oguw,1746
-sgis/maps/wms.py,sha256=
+sgis/maps/wms.py,sha256=Sely3Pt-cym9kRlrK0JTjtMe21kTxWY6ucD2dAoWHI0,6442
 sgis/networkanalysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sgis/networkanalysis/_get_route.py,sha256=9I3t9pnccUPr4mozy3TJCOpGCCf3UOIojmsbifubZbA,6368
 sgis/networkanalysis/_od_cost_matrix.py,sha256=zkyPX7ObT996ahaFJ2oI0D0SqQWbWyfy_qLtXwValPg,3434
@@ -58,7 +58,7 @@ sgis/raster/indices.py,sha256=-J1HYmnT240iozvgagvyis6K0_GHZHRuUrPOgyoeIrY,223
 sgis/raster/regex.py,sha256=kYhVpRYzoXutx1dSYmqMoselWXww7MMEsTPmLZwHjbM,3759
 sgis/raster/sentinel_config.py,sha256=nySDqn2R8M6W8jguoBeSAK_zzbAsqmaI59i32446FwY,1268
 sgis/raster/zonal.py,sha256=D4Gyptw-yOLTCO41peIuYbY-DANsJCG19xXDlf1QAz4,2299
-ssb_sgis-1.1.
-ssb_sgis-1.1.
-ssb_sgis-1.1.
-ssb_sgis-1.1.
+ssb_sgis-1.1.4.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
+ssb_sgis-1.1.4.dist-info/METADATA,sha256=lPqCX4slMY8NR7PPVDLRLYdkFOstiSMvUN6NTWXX_8M,11740
+ssb_sgis-1.1.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ssb_sgis-1.1.4.dist-info/RECORD,,
{ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/LICENSE
File without changes
{ssb_sgis-1.1.2.dist-info → ssb_sgis-1.1.4.dist-info}/WHEEL
File without changes