ssb-sgis 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +2 -5
- sgis/conf.py +72 -0
- sgis/geopandas_tools/cleaning.py +583 -1577
- sgis/geopandas_tools/duplicates.py +17 -3
- sgis/helpers.py +22 -0
- sgis/io/__init__.py +6 -0
- sgis/io/dapla_functions.py +415 -74
- sgis/maps/explore.py +23 -5
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/RECORD +12 -10
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.0.dist-info → ssb_sgis-1.1.2.dist-info}/WHEEL +0 -0
sgis/io/dapla_functions.py
CHANGED
@@ -2,30 +2,46 @@
 
 from __future__ import annotations
 
+import functools
+import glob
 import json
 import multiprocessing
 import os
+import shutil
+import uuid
+from collections.abc import Callable
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
 from pathlib import Path
 
-import dapla as dp
 import geopandas as gpd
 import joblib
 import pandas as pd
 import pyarrow
+import pyarrow.dataset
+import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 import shapely
-from gcsfs import GCSFileSystem
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid
 
+from ..conf import config
+from ..geopandas_tools.conversion import to_shapely
 from ..geopandas_tools.general import get_common_crs
 from ..geopandas_tools.sfilter import sfilter
+from ..helpers import _get_file_system
+
+try:
+    from gcsfs import GCSFileSystem
+except ImportError:
+    pass
 
 PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
+NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
 
 
 def read_geopandas(
@@ -34,6 +50,7 @@ def read_geopandas(
     file_system: GCSFileSystem | None = None,
     mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
     threads: int | None = None,
+    filters: pyarrow.dataset.Expression | None = None,
     **kwargs,
 ) -> GeoDataFrame | DataFrame:
     """Reads geoparquet or other geodata from one or more files on GCS.
@@ -56,18 +73,18 @@ def read_geopandas(
             with a bbox that intersects the mask are read, then filtered by location.
         threads: Number of threads to use if reading multiple files. Defaults to
             the number of files to read or the number of available threads (if lower).
+        filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
+            structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
+            More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
         **kwargs: Additional keyword arguments passed to geopandas' read_parquet
             or read_file, depending on the file type.
 
     Returns:
         A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
     """
-
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = _get_file_system(file_system, kwargs)
 
     if not isinstance(gcs_path, (str | Path | os.PathLike)):
-        kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
-
         cols = {}
         if mask is not None:
             if not isinstance(gcs_path, GeoSeries):
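The new `filters` argument is forwarded to the underlying parquet readers, so it accepts the same forms as `pandas.read_parquet`/pyarrow. A minimal sketch of how a call might look; the bucket path and the column names (`year`, `kommune`) are purely illustrative and not taken from the diff:

    import pyarrow.compute as pc
    import sgis as sg

    # list-of-tuples form, as described in the docstring above
    df = sg.read_geopandas(
        "gs://bucket/data.parquet",  # hypothetical path
        filters=[("year", "==", 2024), ("kommune", "in", ["0301", "3005"])],
    )

    # or an explicit pyarrow expression
    df = sg.read_geopandas(
        "gs://bucket/data.parquet",  # hypothetical path
        filters=(pc.field("year") == 2024) & (pc.field("kommune").isin(["0301", "3005"])),
    )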
@@ -106,7 +123,16 @@ def read_geopandas(
         # recursive read with threads
         with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
             dfs: list[GeoDataFrame] = parallel(
-                joblib.delayed(read_geopandas)(
+                joblib.delayed(read_geopandas)(
+                    x,
+                    filters=filters,
+                    file_system=file_system,
+                    pandas_fallback=pandas_fallback,
+                    mask=mask,
+                    threads=threads,
+                    **kwargs,
+                )
+                for x in paths
             )
 
         if dfs:
@@ -124,22 +150,35 @@ def read_geopandas(
             return sfilter(df, mask)
         return df
 
-
-
-
-
-
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=_read_geopandas,
+                file_system=file_system,
+                mask=mask,
+                pandas_fallback=pandas_fallback,
+                filters=filters,
+                child_paths=child_paths,
+                **kwargs,
+            )
+        )
 
     if "parquet" in gcs_path or "prqt" in gcs_path:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_parquet(
+                df = gpd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e.__class__(
                         f"{e.__class__.__name__}: {e} for {gcs_path}."
                     ) from e
-                df = pd.read_parquet(
+                df = pd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             if pandas_fallback or not len(df):
                 return df
             else:
@@ -153,11 +192,16 @@ def read_geopandas(
     else:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_file(
+                df = gpd.read_file(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e
-
+                file_type: str = Path(gcs_path).suffix.strip(".")
+                df = getattr(pd, f"read_{file_type}")(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
 
             if pandas_fallback or not len(df):
                 return df
@@ -179,31 +223,42 @@ def read_geopandas(
 def _get_bounds_parquet(
     path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
-    with file_system.open(path) as
+    with file_system.open(path, "rb") as file:
+        return _get_bounds_parquet_from_open_file(file, file_system)
+
+
+def _get_bounds_parquet_from_open_file(
+    file, file_system
+) -> tuple[list[float], dict] | tuple[None, None]:
+    geo_metadata = _get_geo_metadata(file, file_system)
+    if not geo_metadata:
+        return None, None
+    return geo_metadata["bbox"], geo_metadata["crs"]
+
+
+def _get_geo_metadata(file, file_system) -> dict:
+    meta = pq.read_schema(file).metadata
+    geo_metadata = json.loads(meta[b"geo"])
+    try:
+        primary_column = geo_metadata["primary_column"]
+    except KeyError as e:
+        raise KeyError(e, geo_metadata) from e
+    try:
+        return geo_metadata["columns"][primary_column]
+    except KeyError as e:
         try:
-            num_rows = pq.read_metadata(
+            num_rows = pq.read_metadata(file).num_rows
         except ArrowInvalid as e:
-            if not file_system.isfile(
-                return
-            raise ArrowInvalid(e,
+            if not file_system.isfile(file):
+                return {}
+            raise ArrowInvalid(e, file) from e
         if not num_rows:
-            return
-
-    try:
-        meta = json.loads(meta[b"geo"])["columns"]["geometry"]
-    except KeyError as e:
-        if pandas_fallback:
-            return None, None
-        raise KeyError(
-            f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
-            # f"{num_rows=}",
-            # meta,
-        ) from e
-    return meta["bbox"], meta["crs"]
+            return {}
+        return {}
 
 
 def _get_columns(path: str | Path, file_system: GCSFileSystem) -> pd.Index:
-    with file_system.open(path) as f:
+    with file_system.open(path, "rb") as f:
         schema = pq.read_schema(f)
         index_cols = _get_index_cols(schema)
         return pd.Index(schema.names).difference(index_cols)
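The new bounds helpers above read only the parquet schema footer and its GeoParquet "geo" metadata, so no row data is downloaded. A minimal sketch of the same idea with plain pyarrow, assuming a local file written by geopandas (the file name is illustrative):

    import json

    import pyarrow.parquet as pq
    import shapely

    schema = pq.read_schema("data.parquet")  # hypothetical file; reads only the footer
    geo = json.loads(schema.metadata[b"geo"])
    primary = geo["primary_column"]           # usually "geometry"
    col_meta = geo["columns"][primary]
    bbox = col_meta["bbox"]                   # [minx, miny, maxx, maxy], if written
    crs = col_meta["crs"]                     # PROJJSON dict (or None)
    print(shapely.box(*bbox))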
@@ -242,8 +297,7 @@ def get_bounds_series(
     ---------
     >>> import sgis as sg
     >>> import dapla as dp
-    >>>
-    >>> all_paths = file_system.ls("...")
+    >>> all_paths = GCSFileSystem().ls("...")
 
     Get the bounds of all your file paths, indexed by path.
 
@@ -274,8 +328,7 @@ def get_bounds_series(
     ... )
 
     """
-
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = _get_file_system(file_system, {})
 
     if threads is None:
         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -308,7 +361,8 @@ def write_geopandas(
     overwrite: bool = True,
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
-
+    partition_cols=None,
+    existing_data_behavior: str = "error",
     **kwargs,
 ) -> None:
     """Writes a GeoDataFrame to the speficied format.
@@ -324,13 +378,9 @@ def write_geopandas(
             not be written with geopandas and the number of rows is more than 0. If True,
             the file will be written without geo-metadata if >0 rows.
         file_system: Optional file sustem.
-
-
-
-        Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
-        considered as experimental. While writing the column is backwards compatible, using it
-        for filtering may not be supported by all readers.
-
+        partition_cols: Column(s) to partition by. Only for parquet files.
+        existing_data_behavior : 'error' | 'overwrite_or_ignore' | 'delete_matching'.
+            Defaults to 'error'. More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html
         **kwargs: Additional keyword arguments passed to parquet.write_table
             (for parquet) or geopandas' to_file method (if not parquet).
     """
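A minimal sketch of a partitioned write with the new arguments, given some GeoDataFrame `gdf`; the dataset path and partition column are illustrative, not taken from the diff:

    import sgis as sg

    sg.write_geopandas(
        gdf,
        "gs://bucket/dataset.parquet",   # hypothetical directory-style dataset
        partition_cols=["kommune"],      # becomes kommune=<value>/<uuid>.parquet files
        existing_data_behavior="delete_matching",  # replace partitions that are rewritten
    )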
@@ -340,22 +390,25 @@ def write_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
-
+    file_system = _get_file_system(file_system, kwargs)
+
+    if not overwrite and file_system.exists(gcs_path):
         raise ValueError("File already exists.")
 
     if not isinstance(df, GeoDataFrame):
-        raise ValueError("DataFrame must be GeoDataFrame.")
+        raise ValueError(f"DataFrame must be GeoDataFrame. Got {type(df)}.")
 
-    if
-
-
-
+    if not len(df) and has_partitions(gcs_path, file_system):
+        # no need to write empty df
+        return
+    elif not len(df):
         if pandas_fallback:
             df = pd.DataFrame(df)
             df.geometry = df.geometry.astype(str)
             df.geometry = None
         try:
-
+            with file_system.open(gcs_path, "wb") as file:
+                df.to_parquet(file, **kwargs)
         except Exception as e:
             more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
             raise e.__class__(
@@ -363,17 +416,19 @@ def write_geopandas(
         ) from e
         return
 
-    file_system = dp.FileClient.get_gcs_file_system()
-
     if ".parquet" in gcs_path or "prqt" in gcs_path:
-
-
+        if partition_cols is not None:
+            return _write_partitioned_geoparquet(
                 df,
-
-
-
+                gcs_path,
+                partition_cols,
+                file_system,
+                existing_data_behavior=existing_data_behavior,
+                write_func=_to_geopandas,
+                **kwargs,
             )
-
+        with file_system.open(gcs_path, mode="wb") as file:
+            df.to_parquet(file, **kwargs)
         return
 
     layer = kwargs.pop("layer", None)
@@ -389,21 +444,307 @@ def write_geopandas(
     else:
         driver = None
 
-    with
-        df.to_file(
+    with BytesIO() as buffer:
+        df.to_file(buffer, driver=driver)
+        buffer.seek(0)  # Rewind the buffer to the beginning
 
+        # Upload buffer content to the desired storage
+        with file_system.open(gcs_path, "wb") as file:
+            file.write(buffer.read())
 
-def exists(path: str | Path) -> bool:
-    """Returns True if the path exists, and False if it doesn't.
 
-
-
+def _to_geopandas(df, path, **kwargs) -> None:
+    table = _geopandas_to_arrow(
+        df,
+        index=df.index,
+        schema_version=None,
+    )
 
-
-
+    if "schema" in kwargs:
+        schema = kwargs.pop("schema")
+
+        # make sure to get the actual metadata
+        schema = pyarrow.schema(
+            [(schema.field(col).name, schema.field(col).type) for col in schema.names],
+            metadata=table.schema.metadata,
+        )
+        table = table.select(schema.names).cast(schema)
+
+    pq.write_table(table, path, compression="snappy", **kwargs)
+
+
+def _remove_file(path, file_system) -> None:
+    try:
+        file_system.rm_file(str(path))
+    except (AttributeError, TypeError, PermissionError) as e:
+        print(path, type(e), e)
+        try:
+            shutil.rmtree(path)
+        except NotADirectoryError:
+            try:
+                os.remove(path)
+            except PermissionError:
+                pass
+
+
+def _write_partitioned_geoparquet(
+    df,
+    path,
+    partition_cols,
+    file_system=None,
+    write_func: Callable = _to_geopandas,
+    existing_data_behavior: str = "error",
+    **kwargs,
+):
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
+
+    file_system = _get_file_system(file_system, kwargs)
+
+    path = Path(path)
+    unique_id = uuid.uuid4()
+
+    for col in partition_cols:
+        if df[col].isna().all() and not kwargs.get("schema"):
+            raise ValueError("Must specify 'schema' when all rows are NA.")
+
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    args: list[tuple[Path, DataFrame]] = []
+    dirs: list[Path] = set()
+    for group, rows in df.groupby(partition_cols, dropna=False):
+        name = (
+            "/".join(
+                f"{col}={value if not pd.isna(value) else NULL_VALUE}"
+                for col, value in zip(partition_cols, group, strict=True)
+            )
+            + f"/{unique_id}.parquet"
+        )
+
+        dirs.add((path / name).parent)
+        args.append((path / name, rows))
+
+    if file_system.exists(path) and file_system.isfile(path):
+        _remove_file(path, file_system)
+
+    if kwargs.get("schema"):
+        schema = kwargs.pop("schema")
+    elif isinstance(df, GeoDataFrame):
+        geom_name = df.geometry.name
+        pandas_columns = [col for col in df if col != geom_name]
+        schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+        index_columns = _get_index_cols(schema)
+        schema = pyarrow.schema(
+            [
+                (
+                    (schema.field(col).name, schema.field(col).type)
+                    if col != geom_name
+                    else (geom_name, pyarrow.binary())
+                )
+                for col in [*df.columns, *index_columns]
+                # for col in df.columns
+            ]
+        )
+    else:
+        schema = pyarrow.Schema.from_pandas(df, preserve_index=True)
+
+    def get_siblings(path: str, paths: list[str]) -> list[str]:
+        parts = path.parts
+        return {x for x in paths if all(part in parts for part in x.parts)}
+
+    def threaded_write(path_rows):
+        new_path, rows = path_rows
+        # for sibling_path in get_siblings(new_path, child_paths):
+        for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
+            if not paths_are_equal(sibling_path, Path(new_path).parent):
+                if existing_data_behavior == "delete_matching":
+                    _remove_file(sibling_path, file_system)
+                elif existing_data_behavior == "error":
+                    raise pyarrow.ArrowInvalid(
+                        f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                    )
+        try:
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)
+        except FileNotFoundError:
+            file_system.makedirs(str(Path(new_path).parent), exist_ok=True)
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        list(executor.map(threaded_write, args))
+
+
+def _filters_to_expression(filters) -> list[ds.Expression]:
+    if filters is None:
+        return None
+    elif isinstance(filters, pyarrow.dataset.Expression):
+        return filters
+
+    for filt in filters:
+        if "in" in filt and isinstance(filt[-1], str):
+            raise ValueError(
+                "Using strings with 'in' is ambigous. Use a list of strings."
+            )
+    try:
+        return pq.core.filters_to_expression(filters)
+    except ValueError as e:
+        raise ValueError(f"{e}: {filters}") from e
+
+
+def expression_match_path(expression: ds.Expression, path: str) -> bool:
+    """Check if a file path match a pyarrow Expression.
+
+    Examples:
+    --------
+    >>> import pyarrow.compute as pc
+    >>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
+    >>> expression_match_path(path, expression)
+    True
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 5)
+    >>> expression_match_path(path, expression)
+    False
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("z") == 10)
+    >>> expression_match_path(path, expression)
+    False
     """
-
-
+    if NULL_VALUE in path:
+        return True
+    # build a one lengthed pyarrow.Table of the partitioning in the file path
+    values = []
+    names = []
+    for part in Path(path).parts:
+        if part.count("=") != 1:
+            continue
+        name, value = part.split("=")
+        values.append([value])
+        names.append(name)
+    table = pyarrow.Table.from_arrays(values, names=names)
+    try:
+        table = table.filter(expression)
+    except pyarrow.ArrowInvalid as e:
+        if "No match for FieldRef" not in str(e):
+            raise e
+        # cannot determine if the expression match without reading the file
+        return True
+    return bool(len(table))
+
+
+def _read_geopandas(file, pandas_fallback: bool, **kwargs):
+    try:
+        return gpd.read_parquet(file, **kwargs)
+    except Exception as e:
+        if not pandas_fallback:
+            raise e
+        df = pd.read_parquet(file, **kwargs)
+        if len(df):
+            raise e
+        return df
+
+
+def _read_pandas(gcs_path: str, **kwargs):
+    file_system = _get_file_system(None, kwargs)
+
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=pd.read_parquet,
+                file_system=file_system,
+                mask=None,
+                child_paths=child_paths,
+                **kwargs,
+            )
+        )
+
+    with file_system.open(gcs_path, "rb") as file:
+        return pd.read_parquet(file, **kwargs)
+
+
+def _read_partitioned_parquet(
+    path: str,
+    read_func: Callable,
+    filters=None,
+    file_system=None,
+    mask=None,
+    child_paths: list[str] | None = None,
+    **kwargs,
+):
+    file_system = _get_file_system(file_system, kwargs)
+
+    if child_paths is None:
+        try:
+            glob_func = functools.partial(file_system.glob)
+        except AttributeError:
+            glob_func = functools.partial(glob.glob, recursive=True)
+        child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))
+
+    filters = _filters_to_expression(filters)
+
+    def intersects(file, mask) -> bool:
+        bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+        return shapely.box(*bbox).intersects(to_shapely(mask))
+
+    def read(path) -> GeoDataFrame | None:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask):
+                return
+
+            schema = kwargs.get("schema", pq.read_schema(file))
+            # copy kwargs because mutable
+            new_kwargs = {
+                key: value for key, value in kwargs.items() if key != "schema"
+            }
+
+            return read_func(file, schema=schema, filters=filters, **new_kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        results = [
+            x
+            for x in (
+                executor.map(
+                    read,
+                    (
+                        path
+                        for path in child_paths
+                        if filters is None or expression_match_path(filters, path)
+                    ),
+                )
+            )
+            if x is not None
+        ]
+    if results:
+        if mask is not None:
+            return sfilter(pd.concat(results), mask)
+        return pd.concat(results)
+
+    # add columns to empty DataFrame
+    first_path = next(iter(child_paths + [path]))
+    return pd.DataFrame(
+        columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
+    )
+
+
+def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
+    return Path(path1).parts == Path(path2).parts
+
+
+def has_partitions(path, file_system) -> list[str]:
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    return [
+        x
+        for x in glob_func(str(Path(path) / "**/*.parquet"))
+        if not paths_are_equal(x, path)
+    ]
 
 
 def check_files(
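`_write_partitioned_geoparquet` lays the dataset out hive-style, one `col=value` directory per partition value (with `__HIVE_DEFAULT_PARTITION__` for missing values) and a UUID-named parquet file per group, and `_read_partitioned_parquet` uses `expression_match_path` to skip files whose path segments cannot satisfy the filter. A small sketch of that pruning step with made-up paths; note that values parsed from the path are strings, so the expression compares against strings:

    import pyarrow.compute as pc

    from sgis.io.dapla_functions import expression_match_path

    paths = [
        "dataset.parquet/year=2023/kommune=0301/a.parquet",
        "dataset.parquet/year=2024/kommune=0301/b.parquet",
        "dataset.parquet/year=2024/kommune=3005/c.parquet",
    ]
    expression = (pc.field("year") == "2024") & (pc.field("kommune") == "0301")

    # keeps only the second path; files that cannot match are never opened
    to_read = [p for p in paths if expression_match_path(expression, p)]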
@@ -419,7 +760,7 @@ def check_files(
         within_minutes: Optionally include only files that were updated in the
             last n minutes.
     """
-    file_system =
+    file_system = config["file_system"]()
 
     # (recursive doesn't work, so doing recursive search below)
     info = file_system.ls(folder, detail=True, recursive=True)
@@ -474,7 +815,7 @@ def check_files(
 
 
 def _get_files_in_subfolders(folderinfo: list[dict]) -> list[tuple]:
-    file_system =
+    file_system = config["file_system"]()
 
     fileinfo = []
 