ssb-sgis 1.1.0.tar.gz → 1.1.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/PKG-INFO +1 -1
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/pyproject.toml +1 -1
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/__init__.py +1 -5
- ssb_sgis-1.1.1/src/sgis/conf.py +16 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/io/dapla_functions.py +223 -54
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/LICENSE +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/README.md +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/debug_config.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/exceptions.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/__init__.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/bounds.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/buffer_dissolve_explode.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/centerlines.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/cleaning.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/conversion.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/duplicates.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/general.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/geocoding.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/geometry_types.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/neighbors.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/overlay.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/point_operations.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/polygon_operations.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/polygons_as_rings.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/geopandas_tools/sfilter.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/helpers.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/io/_is_dapla.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/io/opener.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/io/read_parquet.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/__init__.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/examine.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/explore.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/httpserver.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/legend.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/map.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/maps.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/norge_i_bilder.json +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/thematicmap.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/tilesources.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/maps/wms.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/__init__.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/_get_route.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/_od_cost_matrix.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/_points.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/_service_area.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/closing_network_holes.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/cutting_lines.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/directednetwork.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/finding_isolated_networks.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/network.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/networkanalysis.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/networkanalysisrules.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/nodes.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/networkanalysis/traveling_salesman.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/parallel/parallel.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/py.typed +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/__init__.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/base.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/image_collection.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/indices.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/regex.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/sentinel_config.py +0 -0
- {ssb_sgis-1.1.0 → ssb_sgis-1.1.1}/src/sgis/raster/zonal.py +0 -0
src/sgis/__init__.py
@@ -1,10 +1,6 @@
-config = {
-    "n_jobs": 1,
-}
-
-
 import sgis.raster.indices as indices
 
+from .conf import config
 from .geopandas_tools.bounds import Gridlooper
 from .geopandas_tools.bounds import bounds_to_points
 from .geopandas_tools.bounds import bounds_to_polygon
src/sgis/conf.py (new file)
@@ -0,0 +1,16 @@
+try:
+    from gcsfs import GCSFileSystem
+except ImportError:
+
+    class GCSFileSystem:
+        """Placeholder."""
+
+        def __init__(self, *args, **kwargs) -> None:
+            """Placeholder."""
+            raise ImportError("gcsfs")
+
+
+config = {
+    "n_jobs": 1,
+    "file_system": GCSFileSystem,
+}
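The new conf module makes the default file system pluggable: the IO helpers below now instantiate config["file_system"]() whenever no file_system argument is passed, instead of hard-coding dapla. A minimal sketch of what this enables (not part of the diff; LocalFileSystem is just an arbitrary fsspec-compatible stand-in for testing):

    import sgis
    from fsspec.implementations.local import LocalFileSystem

    # Default is gcsfs.GCSFileSystem, or the ImportError-raising
    # placeholder above when gcsfs is not installed.
    print(sgis.config["file_system"])

    # Swap in any fsspec-compatible file system, e.g. for local tests.
    sgis.config["file_system"] = LocalFileSystem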
src/sgis/io/dapla_functions.py
@@ -2,13 +2,17 @@
 
 from __future__ import annotations
 
+import functools
+import glob
 import json
 import multiprocessing
 import os
+import shutil
+import uuid
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
-import dapla as dp
 import geopandas as gpd
 import joblib
 import pandas as pd
@@ -22,10 +26,12 @@ from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid
 
+from ..geopandas_tools.conversion import to_shapely
 from ..geopandas_tools.general import get_common_crs
 from ..geopandas_tools.sfilter import sfilter
 
 PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
+from ..conf import config
 
 
 def read_geopandas(
@@ -63,7 +69,7 @@ def read_geopandas(
         A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
     """
     if file_system is None:
-        file_system = dp.FileClient.get_gcs_file_system()
+        file_system = config["file_system"]()
 
     if not isinstance(gcs_path, (str | Path | os.PathLike)):
         kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
@@ -130,6 +136,18 @@ def read_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
+    if has_partitions(gcs_path, file_system):
+        filters = kwargs.pop("filters", None)
+        return _read_partitioned_parquet(
+            gcs_path,
+            file_system=file_system,
+            mask=mask,
+            pandas_fallback=pandas_fallback,
+            threads=threads,
+            filters=filters,
+            **kwargs,
+        )
+
     if "parquet" in gcs_path or "prqt" in gcs_path:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
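With this dispatch, read_geopandas reads a partitioned dataset back as a single GeoDataFrame and can prune partitions via the optional filters argument. A hypothetical call (bucket, path and column invented; the filter shape is inferred from the normalization in _read_partitioned_parquet further down, which matches hive-style "col=value" path parts):

    import sgis as sg

    gdf = sg.read_geopandas(
        "gs://my-bucket/roads.parquet",
        filters=[("year", "in", "(2023, 2024)")],
    )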
@@ -179,31 +197,42 @@ def read_geopandas(
 def _get_bounds_parquet(
     path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
-    with file_system.open(path) as
+    with file_system.open(path, "rb") as file:
+        return _get_bounds_parquet_from_open_file(file, file_system)
+
+
+def _get_bounds_parquet_from_open_file(
+    file, file_system
+) -> tuple[list[float], dict] | tuple[None, None]:
+    geo_metadata = _get_geo_metadata(file, file_system)
+    if not geo_metadata:
+        return None, None
+    return geo_metadata["bbox"], geo_metadata["crs"]
+
+
+def _get_geo_metadata(file, file_system) -> dict:
+    meta = pq.read_schema(file).metadata
+    geo_metadata = json.loads(meta[b"geo"])
+    try:
+        primary_column = geo_metadata["primary_column"]
+    except KeyError as e:
+        raise KeyError(e, geo_metadata) from e
+    try:
+        return geo_metadata["columns"][primary_column]
+    except KeyError as e:
         try:
-            num_rows = pq.read_metadata(
+            num_rows = pq.read_metadata(file).num_rows
         except ArrowInvalid as e:
-            if not file_system.isfile(
-                return
-            raise ArrowInvalid(e,
+            if not file_system.isfile(file):
+                return {}
+            raise ArrowInvalid(e, file) from e
         if not num_rows:
-            return
-
-        try:
-            meta = json.loads(meta[b"geo"])["columns"]["geometry"]
-        except KeyError as e:
-            if pandas_fallback:
-                return None, None
-            raise KeyError(
-                f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
-                # f"{num_rows=}",
-                # meta,
-            ) from e
-    return meta["bbox"], meta["crs"]
+            return {}
+        return {}
 
 
 def _get_columns(path: str | Path, file_system: GCSFileSystem) -> pd.Index:
-    with file_system.open(path) as f:
+    with file_system.open(path, "rb") as f:
         schema = pq.read_schema(f)
         index_cols = _get_index_cols(schema)
         return pd.Index(schema.names).difference(index_cols)
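For reference, the "geo" schema-metadata key parsed here follows the GeoParquet specification: a JSON object naming a primary geometry column, with per-column entries carrying "bbox" and "crs". Roughly (values invented for illustration):

    geo = {
        "version": "1.0.0",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                "encoding": "WKB",
                "bbox": [10.0, 59.0, 11.0, 60.0],  # minx, miny, maxx, maxy
                "crs": {},  # PROJJSON object, abbreviated here
            }
        },
    }

_get_geo_metadata returns the entry for the primary column, so _get_bounds_parquet_from_open_file can take its "bbox" and "crs" directly.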
@@ -242,8 +271,7 @@ def get_bounds_series(
     ---------
     >>> import sgis as sg
     >>> import dapla as dp
-    >>>
-    >>> all_paths = file_system.ls("...")
+    >>> all_paths = GCSFileSystem().ls("...")
 
     Get the bounds of all your file paths, indexed by path.
 
@@ -275,7 +303,7 @@ def get_bounds_series(
 
     """
     if file_system is None:
-        file_system = dp.FileClient.get_gcs_file_system()
+        file_system = config["file_system"]()
 
     if threads is None:
         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -308,7 +336,7 @@ def write_geopandas(
     overwrite: bool = True,
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
-    write_covering_bbox: bool = False,
+    partition_cols=None,
     **kwargs,
 ) -> None:
     """Writes a GeoDataFrame to the speficied format.
@@ -324,13 +352,7 @@ def write_geopandas(
             not be written with geopandas and the number of rows is more than 0. If True,
             the file will be written without geo-metadata if >0 rows.
         file_system: Optional file sustem.
-        write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
-            Writing a bbox column can be computationally expensive, but allows you to specify
-            a bbox in : func:read_parquet for filtered reading.
-            Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
-            considered as experimental. While writing the column is backwards compatible, using it
-            for filtering may not be supported by all readers.
-
+        partition_cols: Column(s) to partition by. Only for parquet files.
         **kwargs: Additional keyword arguments passed to parquet.write_table
             (for parquet) or geopandas' to_file method (if not parquet).
     """
@@ -340,22 +362,25 @@ def write_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
-    if
+    if file_system is None:
+        file_system = config["file_system"]()
+
+    if not overwrite and file_system.exists(gcs_path):
         raise ValueError("File already exists.")
 
     if not isinstance(df, GeoDataFrame):
         raise ValueError("DataFrame must be GeoDataFrame.")
 
-    if
-
-
-    if not len(df):
+    if not len(df) and has_partitions(gcs_path, file_system):
+        return
+    elif not len(df):
         if pandas_fallback:
             df = pd.DataFrame(df)
             df.geometry = df.geometry.astype(str)
             df.geometry = None
         try:
-
+            with file_system.open(gcs_path, "wb") as file:
+                df.to_parquet(gcs_path, **kwargs)
         except Exception as e:
             more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
             raise e.__class__(
@@ -363,17 +388,22 @@ def write_geopandas(
             ) from e
         return
 
-    file_system = dp.FileClient.get_gcs_file_system()
-
     if ".parquet" in gcs_path or "prqt" in gcs_path:
-
+        if partition_cols is not None:
+            return _write_partitioned_geoparquet(
+                df,
+                gcs_path,
+                partition_cols,
+                file_system,
+                **kwargs,
+            )
+        with file_system.open(gcs_path, mode="wb") as file:
             table = _geopandas_to_arrow(
                 df,
                 index=df.index,
                 schema_version=None,
-                write_covering_bbox=write_covering_bbox,
             )
-            pq.write_table(table,
+            pq.write_table(table, file, compression="snappy", **kwargs)
         return
 
     layer = kwargs.pop("layer", None)
@@ -393,17 +423,156 @@ def write_geopandas(
         df.to_file(file, driver=driver, layer=layer)
 
 
-def
-
+def _remove_file(path, file_system) -> None:
+    try:
+        file_system.rm_file(path)
+    except (AttributeError, TypeError, PermissionError):
+        try:
+            shutil.rmtree(path)
+        except NotADirectoryError:
+            try:
+                os.remove(path)
+            except PermissionError:
+                pass
 
-    Args:
-        path (str): The path to the file or directory.
 
-
-
-
-
+def _write_partitioned_geoparquet(df, path, partition_cols, file_system, **kwargs):
+    path = Path(path)
+    unique_id = uuid.uuid4()
+
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    args: list[tuple[Path, DataFrame]] = []
+    dirs: list[Path] = set()
+    for group, rows in df.groupby(partition_cols):
+        name = (
+            "/".join(
+                f"{col}={value}"
+                for col, value in zip(partition_cols, group, strict=True)
+            )
+            + f"/{unique_id}.parquet"
+        )
+
+        dirs.add((path / name).parent)
+        args.append((path / name, rows))
+
+    if file_system.exists(path) and not has_partitions(path, file_system):
+        _remove_file(path, file_system)
+
+    for dir_ in dirs:
+        try:
+            os.makedirs(dir_, exist_ok=True)
+        except (OSError, FileNotFoundError, FileExistsError) as e:
+            print(e)
+            pass
+
+    def threaded_write(path_rows):
+        new_path, rows = path_rows
+        for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
+            if not paths_are_equal(sibling_path, Path(new_path).parent):
+                _remove_file(sibling_path, file_system)
+        with file_system.open(new_path, mode="wb") as file:
+            table = _geopandas_to_arrow(
+                rows,
+                index=df.index,
+                schema_version=None,
+            )
+            pq.write_table(table, file, compression="snappy", **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        list(executor.map(threaded_write, args))
+
+
+def _read_partitioned_parquet(
+    path, filters, file_system, mask, pandas_fallback, threads, **kwargs
+):
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    filters = filters or []
+    new_filters = []
+    for filt in filters:
+        if "in" in filt:
+            values = [
+                x.strip("(")
+                .strip(")")
+                .strip("[")
+                .strip("]")
+                .strip("{")
+                .strip("}")
+                .strip(" ")
+                for x in filt[-1].split(",")
+            ]
+            filt = [filt[0] + "=" + x for x in values]
+        else:
+            filt = ["".join(filt)]
+        new_filters.append(filt)
+
+    def intersects(file, mask) -> bool:
+        bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+        return shapely.box(*bbox).intersects(to_shapely(mask))
+
+    def read(path) -> GeoDataFrame | None:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask):
+                return
+
+            schema = kwargs.pop("schema", pq.read_schema(file))
+
+            return gpd.read_parquet(file, schema=schema, **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        results = [
+            x
+            for x in (
+                executor.map(
+                    read,
+                    (
+                        path
+                        for path in glob_func(str(Path(path) / "**/*.parquet"))
+                        if all(
+                            any(subfilt in Path(path).parts for subfilt in filt)
+                            for filt in new_filters
+                        )
+                    ),
+                )
+            )
+            if x is not None
+        ]
+    if results:
+        if mask is not None:
+            return sfilter(pd.concat(results), mask)
+        return pd.concat(results)
+
+    # add columns to empty DataFrame
+    first_path = next(iter(glob_func(str(Path(path) / "**/*.parquet"))))
+    return gpd.GeoDataFrame(
+        columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
+    )
+
+
+def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
+    return Path(path1).parts == Path(path2).parts
+
+
+def has_partitions(path, file_system) -> bool:
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    return bool(
+        [
+            x
+            for x in glob_func(str(Path(path) / "**/*.parquet"))
+            if not paths_are_equal(x, path)
+        ]
+    )
 
 
 def check_files(
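Together these helpers implement hive-style partitioned GeoParquet: write_geopandas with partition_cols groups the frame and writes each group to "col=value/<uuid4>.parquet" under the target path, while has_partitions and _read_partitioned_parquet let read_geopandas treat that directory as one dataset. A hypothetical round trip (bucket, path and column invented for illustration):

    import sgis as sg

    # Writes e.g. gs://my-bucket/roads.parquet/year=2024/<uuid4>.parquet,
    # one file per distinct value of "year".
    sg.write_geopandas(gdf, "gs://my-bucket/roads.parquet", partition_cols=["year"])

    # Detects the partition files and reads them back as one GeoDataFrame.
    roads = sg.read_geopandas("gs://my-bucket/roads.parquet")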
@@ -419,7 +588,7 @@ def check_files(
         within_minutes: Optionally include only files that were updated in the
             last n minutes.
     """
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = config["file_system"]()
 
     # (recursive doesn't work, so doing recursive search below)
    info = file_system.ls(folder, detail=True, recursive=True)
@@ -474,7 +643,7 @@
 
 
 def _get_files_in_subfolders(folderinfo: list[dict]) -> list[tuple]:
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = config["file_system"]()
 
     fileinfo = []
 