ssb-sgis 1.1.1-py3-none-any.whl → 1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgis/__init__.py +1 -0
- sgis/conf.py +62 -6
- sgis/geopandas_tools/cleaning.py +583 -1577
- sgis/geopandas_tools/duplicates.py +17 -3
- sgis/helpers.py +22 -0
- sgis/io/__init__.py +6 -0
- sgis/io/dapla_functions.py +273 -101
- sgis/maps/explore.py +23 -5
- sgis/maps/wms.py +7 -1
- {ssb_sgis-1.1.1.dist-info → ssb_sgis-1.1.3.dist-info}/METADATA +1 -1
- {ssb_sgis-1.1.1.dist-info → ssb_sgis-1.1.3.dist-info}/RECORD +13 -12
- {ssb_sgis-1.1.1.dist-info → ssb_sgis-1.1.3.dist-info}/LICENSE +0 -0
- {ssb_sgis-1.1.1.dist-info → ssb_sgis-1.1.3.dist-info}/WHEEL +0 -0
sgis/io/dapla_functions.py
CHANGED
@@ -9,29 +9,39 @@ import multiprocessing
 import os
 import shutil
 import uuid
+from collections.abc import Callable
 from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
 from pathlib import Path

 import geopandas as gpd
 import joblib
 import pandas as pd
 import pyarrow
+import pyarrow.dataset
+import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 import shapely
-from gcsfs import GCSFileSystem
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid

+from ..conf import config
 from ..geopandas_tools.conversion import to_shapely
 from ..geopandas_tools.general import get_common_crs
 from ..geopandas_tools.sfilter import sfilter
+from ..helpers import _get_file_system
+
+try:
+    from gcsfs import GCSFileSystem
+except ImportError:
+    pass

 PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
-
+NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"


 def read_geopandas(
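Several hunks below swap the hard-coded `config["file_system"]()` call for `_get_file_system(file_system, kwargs)`. The helper itself lives in sgis/helpers.py (+22 lines, not shown in this diff), so the following is only a hedged sketch of what such a helper plausibly does, reusing the `config` object the hunk above imports; the real implementation may differ.

# Hypothetical sketch only; the real helper is in sgis/helpers.py and may differ.
from sgis.conf import config  # 'config' and its 'file_system' key appear in the hunks below


def _get_file_system(file_system, kwargs: dict):
    # Prefer an explicitly passed file system; pop any 'file_system' entry out
    # of kwargs so it is not forwarded to pandas/geopandas readers.
    file_system = file_system or kwargs.pop("file_system", None)
    if file_system is not None:
        return file_system
    # Fall back to the factory registered in the sgis config,
    # e.g. GCSFileSystem when gcsfs is installed.
    return config["file_system"]()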
@@ -40,6 +50,7 @@ def read_geopandas(
     file_system: GCSFileSystem | None = None,
     mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
     threads: int | None = None,
+    filters: pyarrow.dataset.Expression | None = None,
     **kwargs,
 ) -> GeoDataFrame | DataFrame:
     """Reads geoparquet or other geodata from one or more files on GCS.
@@ -62,18 +73,18 @@
             with a bbox that intersects the mask are read, then filtered by location.
         threads: Number of threads to use if reading multiple files. Defaults to
             the number of files to read or the number of available threads (if lower).
+        filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
+            structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
+            More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
         **kwargs: Additional keyword arguments passed to geopandas' read_parquet
             or read_file, depending on the file type.

     Returns:
         A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
     """
-
-    file_system = config["file_system"]()
+    file_system = _get_file_system(file_system, kwargs)

     if not isinstance(gcs_path, (str | Path | os.PathLike)):
-        kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
-
         cols = {}
         if mask is not None:
             if not isinstance(gcs_path, GeoSeries):
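For orientation, a hedged usage sketch of the new `filters` parameter described above; the bucket path and column names are invented:

import pyarrow.compute as pc
import sgis as sg

# List-of-tuples form, as in pandas.read_parquet
df = sg.read_geopandas(
    "gs://my-bucket/roads.parquet",  # hypothetical path
    filters=[("municipality", "in", ["0301", "4601"])],
)

# Equivalent pyarrow Expression form
df = sg.read_geopandas(
    "gs://my-bucket/roads.parquet",
    filters=pc.field("year") >= 2020,
)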
@@ -112,7 +123,16 @@
     # recursive read with threads
     with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
         dfs: list[GeoDataFrame] = parallel(
-            joblib.delayed(read_geopandas)(
+            joblib.delayed(read_geopandas)(
+                x,
+                filters=filters,
+                file_system=file_system,
+                pandas_fallback=pandas_fallback,
+                mask=mask,
+                threads=threads,
+                **kwargs,
+            )
+            for x in paths
         )

     if dfs:
@@ -130,34 +150,35 @@
         return sfilter(df, mask)
     return df

-
-
-
-
-
-
-
-
-
-
-
-
-
-            threads=threads,
-            filters=filters,
-            **kwargs,
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=_read_geopandas,
+                file_system=file_system,
+                mask=mask,
+                pandas_fallback=pandas_fallback,
+                filters=filters,
+                child_paths=child_paths,
+                **kwargs,
+            )
         )

     if "parquet" in gcs_path or "prqt" in gcs_path:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_parquet(
+                df = gpd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e.__class__(
                         f"{e.__class__.__name__}: {e} for {gcs_path}."
                     ) from e
-                df = pd.read_parquet(
+                df = pd.read_parquet(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
         if pandas_fallback or not len(df):
             return df
         else:
@@ -171,11 +192,16 @@
     else:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
-                df = gpd.read_file(
+                df = gpd.read_file(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )
             except ValueError as e:
                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                     raise e
-
+                file_type: str = Path(gcs_path).suffix.strip(".")
+                df = getattr(pd, f"read_{file_type}")(
+                    file, filters=filters, filesystem=file_system, **kwargs
+                )

     if pandas_fallback or not len(df):
         return df
@@ -302,8 +328,7 @@ def get_bounds_series(
         ... )

     """
-
-    file_system = config["file_system"]()
+    file_system = _get_file_system(file_system, {})

     if threads is None:
         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -337,6 +362,7 @@ def write_geopandas(
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
     partition_cols=None,
+    existing_data_behavior: str = "error",
     **kwargs,
 ) -> None:
     """Writes a GeoDataFrame to the speficied format.
@@ -353,6 +379,8 @@
             the file will be written without geo-metadata if >0 rows.
         file_system: Optional file sustem.
         partition_cols: Column(s) to partition by. Only for parquet files.
+        existing_data_behavior : 'error' | 'overwrite_or_ignore' | 'delete_matching'.
+            Defaults to 'error'. More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html
         **kwargs: Additional keyword arguments passed to parquet.write_table
             (for parquet) or geopandas' to_file method (if not parquet).
     """
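A hedged sketch of how the new parameter is meant to be used together with `partition_cols`; the path is invented and `gdf` stands in for any GeoDataFrame:

import sgis as sg

sg.write_geopandas(
    gdf,  # any GeoDataFrame with a 'year' column
    "gs://my-bucket/roads_partitioned.parquet",  # hypothetical path
    partition_cols=["year"],
    # Replace files in partitions that gdf touches, keep the rest;
    # the default 'error' refuses to write into a non-empty directory.
    existing_data_behavior="delete_matching",
)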
@@ -362,16 +390,16 @@
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e

-
-    file_system = config["file_system"]()
+    file_system = _get_file_system(file_system, kwargs)

     if not overwrite and file_system.exists(gcs_path):
         raise ValueError("File already exists.")

     if not isinstance(df, GeoDataFrame):
-        raise ValueError("DataFrame must be GeoDataFrame.")
+        raise ValueError(f"DataFrame must be GeoDataFrame. Got {type(df)}.")

     if not len(df) and has_partitions(gcs_path, file_system):
+        # no need to write empty df
         return
     elif not len(df):
         if pandas_fallback:
@@ -380,7 +408,7 @@
             df.geometry = None
         try:
             with file_system.open(gcs_path, "wb") as file:
-                df.to_parquet(
+                df.to_parquet(file, **kwargs)
         except Exception as e:
             more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
             raise e.__class__(
@@ -395,15 +423,12 @@
             gcs_path,
             partition_cols,
             file_system,
+            existing_data_behavior=existing_data_behavior,
+            write_func=_to_geopandas,
             **kwargs,
         )
     with file_system.open(gcs_path, mode="wb") as file:
-
-            df,
-            index=df.index,
-            schema_version=None,
-        )
-        pq.write_table(table, file, compression="snappy", **kwargs)
+        df.to_parquet(file, **kwargs)
     return

     layer = kwargs.pop("layer", None)
@@ -419,14 +444,40 @@
     else:
         driver = None

-    with
-    df.to_file(
+    with BytesIO() as buffer:
+        df.to_file(buffer, driver=driver)
+        buffer.seek(0)  # Rewind the buffer to the beginning
+
+        # Upload buffer content to the desired storage
+        with file_system.open(gcs_path, "wb") as file:
+            file.write(buffer.read())
+
+
+def _to_geopandas(df, path, **kwargs) -> None:
+    table = _geopandas_to_arrow(
+        df,
+        index=df.index,
+        schema_version=None,
+    )
+
+    if "schema" in kwargs:
+        schema = kwargs.pop("schema")
+
+        # make sure to get the actual metadata
+        schema = pyarrow.schema(
+            [(schema.field(col).name, schema.field(col).type) for col in schema.names],
+            metadata=table.schema.metadata,
+        )
+        table = table.select(schema.names).cast(schema)
+
+    pq.write_table(table, path, compression="snappy", **kwargs)


 def _remove_file(path, file_system) -> None:
     try:
-        file_system.rm_file(path)
-    except (AttributeError, TypeError, PermissionError):
+        file_system.rm_file(str(path))
+    except (AttributeError, TypeError, PermissionError) as e:
+        print(path, type(e), e)
     try:
         shutil.rmtree(path)
     except NotADirectoryError:
@@ -436,10 +487,27 @@ def _remove_file(path, file_system) -> None:
         pass


-def _write_partitioned_geoparquet(
+def _write_partitioned_geoparquet(
+    df,
+    path,
+    partition_cols,
+    file_system=None,
+    write_func: Callable = _to_geopandas,
+    existing_data_behavior: str = "error",
+    **kwargs,
+):
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
+
+    file_system = _get_file_system(file_system, kwargs)
+
     path = Path(path)
     unique_id = uuid.uuid4()

+    for col in partition_cols:
+        if df[col].isna().all() and not kwargs.get("schema"):
+            raise ValueError("Must specify 'schema' when all rows are NA.")
+
     try:
         glob_func = functools.partial(file_system.glob, detail=False)
     except AttributeError:
@@ -447,10 +515,10 @@ def _write_partitioned_geoparquet(df, path, partition_cols, file_system, **kwarg

     args: list[tuple[Path, DataFrame]] = []
     dirs: list[Path] = set()
-    for group, rows in df.groupby(partition_cols):
+    for group, rows in df.groupby(partition_cols, dropna=False):
         name = (
             "/".join(
-                f"{col}={value}"
+                f"{col}={value if not pd.isna(value) else NULL_VALUE}"
                 for col, value in zip(partition_cols, group, strict=True)
             )
             + f"/{unique_id}.parquet"
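The loop above builds hive-style partition directories by hand. A small self-contained illustration of the naming scheme, including the NULL_VALUE sentinel for missing partition keys (column names invented):

import uuid

import pandas as pd

NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
partition_cols = ["year", "region"]
group = (2024, float("nan"))  # one groupby key; region is missing
unique_id = uuid.uuid4()

name = (
    "/".join(
        f"{col}={value if not pd.isna(value) else NULL_VALUE}"
        for col, value in zip(partition_cols, group, strict=True)
    )
    + f"/{unique_id}.parquet"
)
print(name)  # year=2024/region=__HIVE_DEFAULT_PARTITION__/<uuid>.parquet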
@@ -459,59 +527,164 @@
         dirs.add((path / name).parent)
         args.append((path / name, rows))

-    if file_system.exists(path) and
+    if file_system.exists(path) and file_system.isfile(path):
         _remove_file(path, file_system)

-
-
-
-
-
-
+    if kwargs.get("schema"):
+        schema = kwargs.pop("schema")
+    elif isinstance(df, GeoDataFrame):
+        geom_name = df.geometry.name
+        pandas_columns = [col for col in df if col != geom_name]
+        schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+        index_columns = _get_index_cols(schema)
+        schema = pyarrow.schema(
+            [
+                (
+                    (schema.field(col).name, schema.field(col).type)
+                    if col != geom_name
+                    else (geom_name, pyarrow.binary())
+                )
+                for col in [*df.columns, *index_columns]
+                # for col in df.columns
+            ]
+        )
+    else:
+        schema = pyarrow.Schema.from_pandas(df, preserve_index=True)
+
+    def get_siblings(path: str, paths: list[str]) -> list[str]:
+        parts = path.parts
+        return {x for x in paths if all(part in parts for part in x.parts)}

     def threaded_write(path_rows):
         new_path, rows = path_rows
+        # for sibling_path in get_siblings(new_path, child_paths):
         for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
             if not paths_are_equal(sibling_path, Path(new_path).parent):
-
-
-
-
-
-
-
-
+                if existing_data_behavior == "delete_matching":
+                    _remove_file(sibling_path, file_system)
+                elif existing_data_behavior == "error":
+                    raise pyarrow.ArrowInvalid(
+                        f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                    )
+        try:
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)
+        except FileNotFoundError:
+            file_system.makedirs(str(Path(new_path).parent), exist_ok=True)
+            with file_system.open(new_path, mode="wb") as file:
+                write_func(rows, file, schema=schema, **kwargs)

     with ThreadPoolExecutor() as executor:
         list(executor.map(threaded_write, args))


+def _filters_to_expression(filters) -> list[ds.Expression]:
+    if filters is None:
+        return None
+    elif isinstance(filters, pyarrow.dataset.Expression):
+        return filters
+
+    for filt in filters:
+        if "in" in filt and isinstance(filt[-1], str):
+            raise ValueError(
+                "Using strings with 'in' is ambigous. Use a list of strings."
+            )
+    try:
+        return pq.core.filters_to_expression(filters)
+    except ValueError as e:
+        raise ValueError(f"{e}: {filters}") from e
+
+
+def expression_match_path(expression: ds.Expression, path: str) -> bool:
+    """Check if a file path match a pyarrow Expression.
+
+    Examples:
+    --------
+    >>> import pyarrow.compute as pc
+    >>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
+    >>> expression_match_path(path, expression)
+    True
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 5)
+    >>> expression_match_path(path, expression)
+    False
+    >>> expression = (pc.Field("x") == 1) & (pc.Field("z") == 10)
+    >>> expression_match_path(path, expression)
+    False
+    """
+    if NULL_VALUE in path:
+        return True
+    # build a one lengthed pyarrow.Table of the partitioning in the file path
+    values = []
+    names = []
+    for part in Path(path).parts:
+        if part.count("=") != 1:
+            continue
+        name, value = part.split("=")
+        values.append([value])
+        names.append(name)
+    table = pyarrow.Table.from_arrays(values, names=names)
+    try:
+        table = table.filter(expression)
+    except pyarrow.ArrowInvalid as e:
+        if "No match for FieldRef" not in str(e):
+            raise e
+        # cannot determine if the expression match without reading the file
+        return True
+    return bool(len(table))
+
+
+def _read_geopandas(file, pandas_fallback: bool, **kwargs):
+    try:
+        return gpd.read_parquet(file, **kwargs)
+    except Exception as e:
+        if not pandas_fallback:
+            raise e
+        df = pd.read_parquet(file, **kwargs)
+        if len(df):
+            raise e
+        return df
+
+
+def _read_pandas(gcs_path: str, **kwargs):
+    file_system = _get_file_system(None, kwargs)
+
+    child_paths = has_partitions(gcs_path, file_system)
+    if child_paths:
+        return gpd.GeoDataFrame(
+            _read_partitioned_parquet(
+                gcs_path,
+                read_func=pd.read_parquet,
+                file_system=file_system,
+                mask=None,
+                child_paths=child_paths,
+                **kwargs,
+            )
+        )
+
+    with file_system.open(gcs_path, "rb") as file:
+        return pd.read_parquet(file, **kwargs)
+
+
 def _read_partitioned_parquet(
-    path
+    path: str,
+    read_func: Callable,
+    filters=None,
+    file_system=None,
+    mask=None,
+    child_paths: list[str] | None = None,
+    **kwargs,
 ):
-
-    glob_func = functools.partial(file_system.glob, detail=False)
-    except AttributeError:
-        glob_func = functools.partial(glob.glob, recursive=True)
+    file_system = _get_file_system(file_system, kwargs)

-
-
-
-
-
-
-
-
-            .strip("]")
-            .strip("{")
-            .strip("}")
-            .strip(" ")
-            for x in filt[-1].split(",")
-        ]
-        filt = [filt[0] + "=" + x for x in values]
-    else:
-        filt = ["".join(filt)]
-    new_filters.append(filt)
+    if child_paths is None:
+        try:
+            glob_func = functools.partial(file_system.glob)
+        except AttributeError:
+            glob_func = functools.partial(glob.glob, recursive=True)
+        child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))
+
+    filters = _filters_to_expression(filters)

     def intersects(file, mask) -> bool:
         bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
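The pruning idea behind expression_match_path can be shown in isolation: parse the key=value directory parts into a one-row table and run the expression against it. A sketch under those assumptions (path and column names invented; the parsed values are strings, so the expression here compares against strings):

from pathlib import Path

import pyarrow
import pyarrow.dataset as ds

path = "data/file.parquet/year=2024/region=oslo/part-0.parquet"

names, values = [], []
for part in Path(path).parts:
    if part.count("=") == 1:
        name, value = part.split("=")
        names.append(name)
        values.append([value])

# One-row table built from the partition directories
table = pyarrow.Table.from_arrays(values, names=names)

expression = ds.field("region") == "oslo"
print(bool(len(table.filter(expression))))  # True -> the file must be read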
@@ -522,9 +695,13 @@ def _read_partitioned_parquet(
         if mask is not None and not intersects(file, mask):
             return

-
+        # get instead of pop, then copy kwargs (because mutable)
+        schema = kwargs.get("schema", pq.read_schema(file))
+        new_kwargs = {
+            key: value for key, value in kwargs.items() if key != "schema"
+        }

-        return
+        return read_func(file, schema=schema, filters=filters, **new_kwargs)

     with ThreadPoolExecutor() as executor:
         results = [
@@ -534,11 +711,8 @@
                 read,
                 (
                     path
-                    for path in
-                    if
-                    any(subfilt in Path(path).parts for subfilt in filt)
-                    for filt in new_filters
-                )
+                    for path in child_paths
+                    if filters is None or expression_match_path(filters, path)
                 ),
             )
         )
@@ -550,8 +724,8 @@
         return pd.concat(results)

     # add columns to empty DataFrame
-    first_path = next(iter(
-    return
+    first_path = next(iter(child_paths + [path]))
+    return pd.DataFrame(
         columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
     )
@@ -560,19 +734,17 @@ def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
     return Path(path1).parts == Path(path2).parts


-def has_partitions(path, file_system) ->
+def has_partitions(path, file_system) -> list[str]:
     try:
         glob_func = functools.partial(file_system.glob, detail=False)
     except AttributeError:
         glob_func = functools.partial(glob.glob, recursive=True)

-    return
-
-
-
-
-    ]
-    )
+    return [
+        x
+        for x in glob_func(str(Path(path) / "**/*.parquet"))
+        if not paths_are_equal(x, path)
+    ]


 def check_files(
sgis/maps/explore.py
CHANGED
@@ -4,7 +4,6 @@ This module holds the Explore class, which is the basis for the explore, samplem
 clipmap functions from the 'maps' module.
 """

-import os
 import random
 import re
 import warnings
@@ -44,6 +43,7 @@ from ..geopandas_tools.general import clean_geoms
 from ..geopandas_tools.general import make_all_singlepart
 from ..geopandas_tools.geometry_types import get_geom_type
 from ..geopandas_tools.geometry_types import to_single_geom_type
+from ..helpers import _get_file_system
 from .wms import WmsLoader

 try:
@@ -114,6 +114,20 @@ _MAP_KWARGS = [
 ]


+class HtmlViewer:
+    """To be passed to IPython.display.display to show as map in Jupyter."""
+
+    def __init__(self, path: str, file_system=None) -> None:
+        """Takes a file path."""
+        self.file_system = _get_file_system(file_system, {})
+        self.path = path
+
+    def _repr_html_(self) -> str:
+        """Method to be used by IPython.display.display."""
+        with self.file_system.open(self.path, "r") as file:
+            return file.read()
+
+
 class MeasureControlFix(plugins.MeasureControl):
     """Monkey-patch to fix a bug in the lenght measurement control.

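A hedged notebook-usage sketch of the new class (the html path is invented); `sg.HtmlViewer` matches the hint printed by the save/_explore hunks further down:

from IPython.display import display

import sgis as sg

# Render a map html previously written by Explore to e.g. a GCS bucket
display(sg.HtmlViewer("gs://my-bucket/maps/roads.html"))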
@@ -281,6 +295,7 @@ class Explore(Map):
         max_nodata_percentage: int = 100,
         display: bool = True,
         wms: WmsLoader | None = None,
+        file_system=None,
         **kwargs,
     ) -> None:
         """Initialiser.
@@ -311,6 +326,8 @@
                 image arrays.
             display: Whether to display the map interactively.
             wms: A WmsLoader instance for loading image tiles as layers. E.g. NorgeIBilderWms.
+            file_system: Any file system instance with an 'open' method. Used to write html file
+                to 'out_path'.
             **kwargs: Additional keyword arguments. Can also be geometry-like objects
                 where the key is the label.
         """
@@ -329,6 +346,7 @@
         self.display = display
         self.wms = [wms] if isinstance(wms, WmsLoader) else wms
         self.legend = None
+        self.file_system = _get_file_system(file_system, kwargs)

         self.browser = browser
         if not self.browser and "show_in_browser" in kwargs:
@@ -614,8 +632,9 @@

     def save(self, path: str) -> None:
         """Save the map to local disk as an html document."""
-        with open(path, "w") as f:
+        with self.file_system.open(path, "w") as f:
             f.write(self.map._repr_html_())
+            print(f"display(sg.HtmlViewer('{self.out_path}'))")

     def _explore(self, **kwargs) -> None:
         self.kwargs = self.kwargs | kwargs
@@ -629,10 +648,9 @@
         self._create_continous_map()

         if self.out_path:
-            with open(
-                os.getcwd() + "/" + self.out_path.strip(".html") + ".html", "w"
-            ) as f:
+            with self.file_system.open(self.out_path, "w") as f:
                 f.write(self.map._repr_html_())
+                print(f"display(sg.HtmlViewer('{self.out_path}'))")
         elif self.browser:
             run_html_server(self.map._repr_html_())
         elif not self.display: