ssb-sgis 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,30 +2,46 @@
 
  from __future__ import annotations
 
+ import functools
+ import glob
  import json
  import multiprocessing
  import os
+ import shutil
+ import uuid
+ from collections.abc import Callable
  from collections.abc import Iterable
+ from concurrent.futures import ThreadPoolExecutor
+ from io import BytesIO
  from pathlib import Path
 
- import dapla as dp
  import geopandas as gpd
  import joblib
  import pandas as pd
  import pyarrow
+ import pyarrow.dataset
+ import pyarrow.dataset as ds
  import pyarrow.parquet as pq
  import shapely
- from gcsfs import GCSFileSystem
  from geopandas import GeoDataFrame
  from geopandas import GeoSeries
  from geopandas.io.arrow import _geopandas_to_arrow
  from pandas import DataFrame
  from pyarrow import ArrowInvalid
 
+ from ..conf import config
+ from ..geopandas_tools.conversion import to_shapely
  from ..geopandas_tools.general import get_common_crs
  from ..geopandas_tools.sfilter import sfilter
+ from ..helpers import _get_file_system
+
+ try:
+     from gcsfs import GCSFileSystem
+ except ImportError:
+     pass
 
  PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
+ NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
 
 
  def read_geopandas(
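
The module no longer hard-imports dapla or gcsfs: gcsfs becomes an optional dependency, and the active file system is resolved through _get_file_system or the conf.config registry. A hedged sketch of that lookup, assuming the registry is importable as sgis.conf.config (which the relative import above suggests):

from sgis.conf import config

# The registered factory returns the file system object used for reads and writes,
# replacing the old dp.FileClient.get_gcs_file_system() calls.
file_system = config["file_system"]()
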
@@ -34,6 +50,7 @@ def read_geopandas(
      file_system: GCSFileSystem | None = None,
      mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
      threads: int | None = None,
+     filters: pyarrow.dataset.Expression | None = None,
      **kwargs,
  ) -> GeoDataFrame | DataFrame:
      """Reads geoparquet or other geodata from one or more files on GCS.
@@ -56,18 +73,18 @@ def read_geopandas(
              with a bbox that intersects the mask are read, then filtered by location.
          threads: Number of threads to use if reading multiple files. Defaults to
              the number of files to read or the number of available threads (if lower).
+         filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
+             structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
+             More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
          **kwargs: Additional keyword arguments passed to geopandas' read_parquet
              or read_file, depending on the file type.
 
      Returns:
          A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
      """
-     if file_system is None:
-         file_system = dp.FileClient.get_gcs_file_system()
+     file_system = _get_file_system(file_system, kwargs)
 
      if not isinstance(gcs_path, (str | Path | os.PathLike)):
-         kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
-
          cols = {}
          if mask is not None:
              if not isinstance(gcs_path, GeoSeries):
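
A hedged usage sketch of the new filters argument; the bucket path and column names below are illustrative, not from the package:

import pyarrow.dataset as ds
import sgis as sg

# List form, as accepted by pandas.read_parquet:
df = sg.read_geopandas(
    "gs://bucket/data.parquet",
    filters=[("year", ">=", 2020), ("municipality", "in", ["0301", "4601"])],
)

# Equivalent pyarrow Expression form:
expr = (ds.field("year") >= 2020) & ds.field("municipality").isin(["0301", "4601"])
df = sg.read_geopandas("gs://bucket/data.parquet", filters=expr)
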
@@ -106,7 +123,16 @@ def read_geopandas(
          # recursive read with threads
          with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
              dfs: list[GeoDataFrame] = parallel(
-                 joblib.delayed(read_geopandas)(x, **kwargs) for x in paths
+                 joblib.delayed(read_geopandas)(
+                     x,
+                     filters=filters,
+                     file_system=file_system,
+                     pandas_fallback=pandas_fallback,
+                     mask=mask,
+                     threads=threads,
+                     **kwargs,
+                 )
+                 for x in paths
              )
 
          if dfs:
@@ -124,22 +150,35 @@ def read_geopandas(
              return sfilter(df, mask)
          return df
 
-     if not isinstance(gcs_path, str):
-         try:
-             gcs_path = str(gcs_path)
-         except TypeError as e:
-             raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
+     child_paths = has_partitions(gcs_path, file_system)
+     if child_paths:
+         return gpd.GeoDataFrame(
+             _read_partitioned_parquet(
+                 gcs_path,
+                 read_func=_read_geopandas,
+                 file_system=file_system,
+                 mask=mask,
+                 pandas_fallback=pandas_fallback,
+                 filters=filters,
+                 child_paths=child_paths,
+                 **kwargs,
+             )
+         )
 
      if "parquet" in gcs_path or "prqt" in gcs_path:
          with file_system.open(gcs_path, mode="rb") as file:
              try:
-                 df = gpd.read_parquet(file, **kwargs)
+                 df = gpd.read_parquet(
+                     file, filters=filters, filesystem=file_system, **kwargs
+                 )
              except ValueError as e:
                  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                      raise e.__class__(
                          f"{e.__class__.__name__}: {e} for {gcs_path}."
                      ) from e
-                 df = pd.read_parquet(file, **kwargs)
+                 df = pd.read_parquet(
+                     file, filters=filters, filesystem=file_system, **kwargs
+                 )
                  if pandas_fallback or not len(df):
                      return df
                  else:
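
The new branch treats gcs_path as a partitioned dataset whenever has_partitions finds parquet files below it. The write side (further down in this diff) produces a hive-style layout along these lines, which is what gets detected here and read back concurrently (one uuid-named file per partition per write call):

mydata.parquet/
    year=2023/<uuid>.parquet
    year=2024/<uuid>.parquet
    year=__HIVE_DEFAULT_PARTITION__/<uuid>.parquet   (rows where year is NA)
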
@@ -153,11 +192,16 @@ def read_geopandas(
      else:
          with file_system.open(gcs_path, mode="rb") as file:
              try:
-                 df = gpd.read_file(file, **kwargs)
+                 df = gpd.read_file(
+                     file, filters=filters, filesystem=file_system, **kwargs
+                 )
              except ValueError as e:
                  if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
                      raise e
-                 df = pd.read_parquet(file, **kwargs)
+                 file_type: str = Path(gcs_path).suffix.strip(".")
+                 df = getattr(pd, f"read_{file_type}")(
+                     file, filters=filters, filesystem=file_system, **kwargs
+                 )
 
          if pandas_fallback or not len(df):
              return df
@@ -179,31 +223,42 @@ def read_geopandas(
  def _get_bounds_parquet(
      path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
  ) -> tuple[list[float], dict] | tuple[None, None]:
-     with file_system.open(path) as f:
+     with file_system.open(path, "rb") as file:
+         return _get_bounds_parquet_from_open_file(file, file_system)
+
+
+ def _get_bounds_parquet_from_open_file(
+     file, file_system
+ ) -> tuple[list[float], dict] | tuple[None, None]:
+     geo_metadata = _get_geo_metadata(file, file_system)
+     if not geo_metadata:
+         return None, None
+     return geo_metadata["bbox"], geo_metadata["crs"]
+
+
+ def _get_geo_metadata(file, file_system) -> dict:
+     meta = pq.read_schema(file).metadata
+     geo_metadata = json.loads(meta[b"geo"])
+     try:
+         primary_column = geo_metadata["primary_column"]
+     except KeyError as e:
+         raise KeyError(e, geo_metadata) from e
+     try:
+         return geo_metadata["columns"][primary_column]
+     except KeyError as e:
          try:
-             num_rows = pq.read_metadata(f).num_rows
+             num_rows = pq.read_metadata(file).num_rows
          except ArrowInvalid as e:
-             if not file_system.isfile(f):
-                 return None, None
-             raise ArrowInvalid(e, path) from e
+             if not file_system.isfile(file):
+                 return {}
+             raise ArrowInvalid(e, file) from e
          if not num_rows:
-             return None, None
-         meta = pq.read_schema(f).metadata
-         try:
-             meta = json.loads(meta[b"geo"])["columns"]["geometry"]
-         except KeyError as e:
-             if pandas_fallback:
-                 return None, None
-             raise KeyError(
-                 f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
-                 # f"{num_rows=}",
-                 # meta,
-             ) from e
-         return meta["bbox"], meta["crs"]
+             return {}
+     return {}
 
 
  def _get_columns(path: str | Path, file_system: GCSFileSystem) -> pd.Index:
-     with file_system.open(path) as f:
+     with file_system.open(path, "rb") as f:
          schema = pq.read_schema(f)
          index_cols = _get_index_cols(schema)
          return pd.Index(schema.names).difference(index_cols)
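
_get_geo_metadata reads the GeoParquet "geo" metadata that geopandas stores on the Arrow schema. A hedged, standalone sketch of the same lookup (the file name is illustrative; "bbox" is written by geopandas but optional in the GeoParquet spec):

import json

import pyarrow.parquet as pq

schema = pq.read_schema("points.parquet")
geo = json.loads(schema.metadata[b"geo"])
primary = geo["primary_column"]          # usually "geometry"
column_meta = geo["columns"][primary]
print(column_meta["bbox"], column_meta["crs"])
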
@@ -242,8 +297,7 @@ def get_bounds_series(
      ---------
      >>> import sgis as sg
      >>> import dapla as dp
-     >>> file_system = dp.FileClient.get_gcs_file_system()
-     >>> all_paths = file_system.ls("...")
+     >>> all_paths = GCSFileSystem().ls("...")
 
      Get the bounds of all your file paths, indexed by path.
 
@@ -274,8 +328,7 @@ def get_bounds_series(
      ... )
 
      """
-     if file_system is None:
-         file_system = dp.FileClient.get_gcs_file_system()
+     file_system = _get_file_system(file_system, {})
 
      if threads is None:
          threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -308,7 +361,8 @@ def write_geopandas(
      overwrite: bool = True,
      pandas_fallback: bool = False,
      file_system: GCSFileSystem | None = None,
-     write_covering_bbox: bool = False,
+     partition_cols=None,
+     existing_data_behavior: str = "error",
      **kwargs,
  ) -> None:
      """Writes a GeoDataFrame to the speficied format.
@@ -324,13 +378,9 @@ def write_geopandas(
              not be written with geopandas and the number of rows is more than 0. If True,
              the file will be written without geo-metadata if >0 rows.
          file_system: Optional file sustem.
-         write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
-             Writing a bbox column can be computationally expensive, but allows you to specify
-             a bbox in : func:read_parquet for filtered reading.
-             Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
-             considered as experimental. While writing the column is backwards compatible, using it
-             for filtering may not be supported by all readers.
-
+         partition_cols: Column(s) to partition by. Only for parquet files.
+         existing_data_behavior : 'error' | 'overwrite_or_ignore' | 'delete_matching'.
+             Defaults to 'error'. More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html
          **kwargs: Additional keyword arguments passed to parquet.write_table
              (for parquet) or geopandas' to_file method (if not parquet).
      """
@@ -340,22 +390,25 @@ def write_geopandas(
      except TypeError as e:
          raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
-     if not overwrite and exists(gcs_path):
+     file_system = _get_file_system(file_system, kwargs)
+
+     if not overwrite and file_system.exists(gcs_path):
          raise ValueError("File already exists.")
 
      if not isinstance(df, GeoDataFrame):
-         raise ValueError("DataFrame must be GeoDataFrame.")
+         raise ValueError(f"DataFrame must be GeoDataFrame. Got {type(df)}.")
 
-     if file_system is None:
-         file_system = dp.FileClient.get_gcs_file_system()
-
-     if not len(df):
+     if not len(df) and has_partitions(gcs_path, file_system):
+         # no need to write empty df
+         return
+     elif not len(df):
          if pandas_fallback:
              df = pd.DataFrame(df)
              df.geometry = df.geometry.astype(str)
              df.geometry = None
          try:
-             dp.write_pandas(df, gcs_path, **kwargs)
+             with file_system.open(gcs_path, "wb") as file:
+                 df.to_parquet(file, **kwargs)
          except Exception as e:
              more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
              raise e.__class__(
@@ -363,17 +416,19 @@ def write_geopandas(
          ) from e
          return
 
-     file_system = dp.FileClient.get_gcs_file_system()
-
      if ".parquet" in gcs_path or "prqt" in gcs_path:
-         with file_system.open(gcs_path, mode="wb") as buffer:
-             table = _geopandas_to_arrow(
+         if partition_cols is not None:
+             return _write_partitioned_geoparquet(
                  df,
-                 index=df.index,
-                 schema_version=None,
-                 write_covering_bbox=write_covering_bbox,
+                 gcs_path,
+                 partition_cols,
+                 file_system,
+                 existing_data_behavior=existing_data_behavior,
+                 write_func=_to_geopandas,
+                 **kwargs,
              )
-             pq.write_table(table, buffer, compression="snappy", **kwargs)
+         with file_system.open(gcs_path, mode="wb") as file:
+             df.to_parquet(file, **kwargs)
          return
 
      layer = kwargs.pop("layer", None)
@@ -389,21 +444,307 @@ def write_geopandas(
      else:
          driver = None
 
-     with file_system.open(gcs_path, "wb") as file:
-         df.to_file(file, driver=driver, layer=layer)
+     with BytesIO() as buffer:
+         df.to_file(buffer, driver=driver)
+         buffer.seek(0)  # Rewind the buffer to the beginning
 
+         # Upload buffer content to the desired storage
+         with file_system.open(gcs_path, "wb") as file:
+             file.write(buffer.read())
 
- def exists(path: str | Path) -> bool:
-     """Returns True if the path exists, and False if it doesn't.
 
-     Args:
-         path (str): The path to the file or directory.
+ def _to_geopandas(df, path, **kwargs) -> None:
+     table = _geopandas_to_arrow(
+         df,
+         index=df.index,
+         schema_version=None,
+     )
 
-     Returns:
-         True if the path exists, False if not.
+     if "schema" in kwargs:
+         schema = kwargs.pop("schema")
+
+         # make sure to get the actual metadata
+         schema = pyarrow.schema(
+             [(schema.field(col).name, schema.field(col).type) for col in schema.names],
+             metadata=table.schema.metadata,
+         )
+         table = table.select(schema.names).cast(schema)
+
+     pq.write_table(table, path, compression="snappy", **kwargs)
+
+
+ def _remove_file(path, file_system) -> None:
+     try:
+         file_system.rm_file(str(path))
+     except (AttributeError, TypeError, PermissionError) as e:
+         print(path, type(e), e)
+         try:
+             shutil.rmtree(path)
+         except NotADirectoryError:
+             try:
+                 os.remove(path)
+             except PermissionError:
+                 pass
+
+
+ def _write_partitioned_geoparquet(
+     df,
+     path,
+     partition_cols,
+     file_system=None,
+     write_func: Callable = _to_geopandas,
+     existing_data_behavior: str = "error",
+     **kwargs,
+ ):
+     if isinstance(partition_cols, str):
+         partition_cols = [partition_cols]
+
+     file_system = _get_file_system(file_system, kwargs)
+
+     path = Path(path)
+     unique_id = uuid.uuid4()
+
+     for col in partition_cols:
+         if df[col].isna().all() and not kwargs.get("schema"):
+             raise ValueError("Must specify 'schema' when all rows are NA.")
+
+     try:
+         glob_func = functools.partial(file_system.glob, detail=False)
+     except AttributeError:
+         glob_func = functools.partial(glob.glob, recursive=True)
+
+     args: list[tuple[Path, DataFrame]] = []
+     dirs: list[Path] = set()
+     for group, rows in df.groupby(partition_cols, dropna=False):
+         name = (
+             "/".join(
+                 f"{col}={value if not pd.isna(value) else NULL_VALUE}"
+                 for col, value in zip(partition_cols, group, strict=True)
+             )
+             + f"/{unique_id}.parquet"
+         )
+
+         dirs.add((path / name).parent)
+         args.append((path / name, rows))
+
+     if file_system.exists(path) and file_system.isfile(path):
+         _remove_file(path, file_system)
+
+     if kwargs.get("schema"):
+         schema = kwargs.pop("schema")
+     elif isinstance(df, GeoDataFrame):
+         geom_name = df.geometry.name
+         pandas_columns = [col for col in df if col != geom_name]
+         schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+         index_columns = _get_index_cols(schema)
+         schema = pyarrow.schema(
+             [
+                 (
+                     (schema.field(col).name, schema.field(col).type)
+                     if col != geom_name
+                     else (geom_name, pyarrow.binary())
+                 )
+                 for col in [*df.columns, *index_columns]
+                 # for col in df.columns
+             ]
+         )
+     else:
+         schema = pyarrow.Schema.from_pandas(df, preserve_index=True)
+
+     def get_siblings(path: str, paths: list[str]) -> list[str]:
+         parts = path.parts
+         return {x for x in paths if all(part in parts for part in x.parts)}
+
+     def threaded_write(path_rows):
+         new_path, rows = path_rows
+         # for sibling_path in get_siblings(new_path, child_paths):
+         for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
+             if not paths_are_equal(sibling_path, Path(new_path).parent):
+                 if existing_data_behavior == "delete_matching":
+                     _remove_file(sibling_path, file_system)
+                 elif existing_data_behavior == "error":
+                     raise pyarrow.ArrowInvalid(
+                         f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                     )
+         try:
+             with file_system.open(new_path, mode="wb") as file:
+                 write_func(rows, file, schema=schema, **kwargs)
+         except FileNotFoundError:
+             file_system.makedirs(str(Path(new_path).parent), exist_ok=True)
+             with file_system.open(new_path, mode="wb") as file:
+                 write_func(rows, file, schema=schema, **kwargs)
+
+     with ThreadPoolExecutor() as executor:
+         list(executor.map(threaded_write, args))
+
+
+ def _filters_to_expression(filters) -> list[ds.Expression]:
+     if filters is None:
+         return None
+     elif isinstance(filters, pyarrow.dataset.Expression):
+         return filters
+
+     for filt in filters:
+         if "in" in filt and isinstance(filt[-1], str):
+             raise ValueError(
+                 "Using strings with 'in' is ambigous. Use a list of strings."
+             )
+     try:
+         return pq.core.filters_to_expression(filters)
+     except ValueError as e:
+         raise ValueError(f"{e}: {filters}") from e
+
+
+ def expression_match_path(expression: ds.Expression, path: str) -> bool:
+     """Check if a file path match a pyarrow Expression.
+
+     Examples:
+     --------
+     >>> import pyarrow.compute as pc
+     >>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
+     >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
+     >>> expression_match_path(path, expression)
+     True
+     >>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 5)
+     >>> expression_match_path(path, expression)
+     False
+     >>> expression = (pc.Field("x") == 1) & (pc.Field("z") == 10)
+     >>> expression_match_path(path, expression)
+     False
      """
-     file_system = dp.FileClient.get_gcs_file_system()
-     return file_system.exists(path)
+     if NULL_VALUE in path:
+         return True
+     # build a one lengthed pyarrow.Table of the partitioning in the file path
+     values = []
+     names = []
+     for part in Path(path).parts:
+         if part.count("=") != 1:
+             continue
+         name, value = part.split("=")
+         values.append([value])
+         names.append(name)
+     table = pyarrow.Table.from_arrays(values, names=names)
+     try:
+         table = table.filter(expression)
+     except pyarrow.ArrowInvalid as e:
+         if "No match for FieldRef" not in str(e):
+             raise e
+         # cannot determine if the expression match without reading the file
+         return True
+     return bool(len(table))
+
+
+ def _read_geopandas(file, pandas_fallback: bool, **kwargs):
+     try:
+         return gpd.read_parquet(file, **kwargs)
+     except Exception as e:
+         if not pandas_fallback:
+             raise e
+         df = pd.read_parquet(file, **kwargs)
+         if len(df):
+             raise e
+         return df
+
+
+ def _read_pandas(gcs_path: str, **kwargs):
+     file_system = _get_file_system(None, kwargs)
+
+     child_paths = has_partitions(gcs_path, file_system)
+     if child_paths:
+         return gpd.GeoDataFrame(
+             _read_partitioned_parquet(
+                 gcs_path,
+                 read_func=pd.read_parquet,
+                 file_system=file_system,
+                 mask=None,
+                 child_paths=child_paths,
+                 **kwargs,
+             )
+         )
+
+     with file_system.open(gcs_path, "rb") as file:
+         return pd.read_parquet(file, **kwargs)
+
+
+ def _read_partitioned_parquet(
+     path: str,
+     read_func: Callable,
+     filters=None,
+     file_system=None,
+     mask=None,
+     child_paths: list[str] | None = None,
+     **kwargs,
+ ):
+     file_system = _get_file_system(file_system, kwargs)
+
+     if child_paths is None:
+         try:
+             glob_func = functools.partial(file_system.glob)
+         except AttributeError:
+             glob_func = functools.partial(glob.glob, recursive=True)
+         child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))
+
+     filters = _filters_to_expression(filters)
+
+     def intersects(file, mask) -> bool:
+         bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+         return shapely.box(*bbox).intersects(to_shapely(mask))
+
+     def read(path) -> GeoDataFrame | None:
+         with file_system.open(path, "rb") as file:
+             if mask is not None and not intersects(file, mask):
+                 return
+
+             schema = kwargs.get("schema", pq.read_schema(file))
+             # copy kwargs because mutable
+             new_kwargs = {
+                 key: value for key, value in kwargs.items() if key != "schema"
+             }
+
+             return read_func(file, schema=schema, filters=filters, **new_kwargs)
+
+     with ThreadPoolExecutor() as executor:
+         results = [
+             x
+             for x in (
+                 executor.map(
+                     read,
+                     (
+                         path
+                         for path in child_paths
+                         if filters is None or expression_match_path(filters, path)
+                     ),
+                 )
+             )
+             if x is not None
+         ]
+     if results:
+         if mask is not None:
+             return sfilter(pd.concat(results), mask)
+         return pd.concat(results)
+
+     # add columns to empty DataFrame
+     first_path = next(iter(child_paths + [path]))
+     return pd.DataFrame(
+         columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
+     )
+
+
+ def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
+     return Path(path1).parts == Path(path2).parts
+
+
+ def has_partitions(path, file_system) -> list[str]:
+     try:
+         glob_func = functools.partial(file_system.glob, detail=False)
+     except AttributeError:
+         glob_func = functools.partial(glob.glob, recursive=True)
+
+     return [
+         x
+         for x in glob_func(str(Path(path) / "**/*.parquet"))
+         if not paths_are_equal(x, path)
+     ]
 
 
  def check_files(
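
For reference, a hedged sketch of the two filter forms handled by _filters_to_expression and the expression they reduce to (column names are made up):

import pyarrow.compute as pc
import pyarrow.parquet as pq

list_filters = [("x", "==", 1), ("y", "in", [10, 20])]
expression = pq.core.filters_to_expression(list_filters)  # the helper the code above calls
# roughly the same as writing the expression by hand:
manual = (pc.field("x") == 1) & pc.field("y").isin([10, 20])
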
@@ -419,7 +760,7 @@ def check_files(
          within_minutes: Optionally include only files that were updated in the
              last n minutes.
      """
-     file_system = dp.FileClient.get_gcs_file_system()
+     file_system = config["file_system"]()
 
      # (recursive doesn't work, so doing recursive search below)
      info = file_system.ls(folder, detail=True, recursive=True)
@@ -474,7 +815,7 @@ def check_files(
 
 
  def _get_files_in_subfolders(folderinfo: list[dict]) -> list[tuple]:
-     file_system = dp.FileClient.get_gcs_file_system()
+     file_system = config["file_system"]()
 
      fileinfo = []