ssb-sgis 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sgis/__init__.py CHANGED
@@ -1,10 +1,6 @@
-config = {
-    "n_jobs": 1,
-}
-
-
 import sgis.raster.indices as indices
 
+from .conf import config
 from .geopandas_tools.bounds import Gridlooper
 from .geopandas_tools.bounds import bounds_to_points
 from .geopandas_tools.bounds import bounds_to_polygon
sgis/conf.py ADDED
@@ -0,0 +1,16 @@
+try:
+    from gcsfs import GCSFileSystem
+except ImportError:
+
+    class GCSFileSystem:
+        """Placeholder."""
+
+        def __init__(self, *args, **kwargs) -> None:
+            """Placeholder."""
+            raise ImportError("gcsfs")
+
+
+config = {
+    "n_jobs": 1,
+    "file_system": GCSFileSystem,
+}
sgis/io/dapla_functions.py CHANGED
@@ -2,13 +2,17 @@
 
 from __future__ import annotations
 
+import functools
+import glob
 import json
 import multiprocessing
 import os
+import shutil
+import uuid
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
-import dapla as dp
 import geopandas as gpd
 import joblib
 import pandas as pd
@@ -22,10 +26,12 @@ from geopandas.io.arrow import _geopandas_to_arrow
 from pandas import DataFrame
 from pyarrow import ArrowInvalid
 
+from ..geopandas_tools.conversion import to_shapely
 from ..geopandas_tools.general import get_common_crs
 from ..geopandas_tools.sfilter import sfilter
 
 PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
+from ..conf import config
 
 
 def read_geopandas(
@@ -63,7 +69,7 @@ def read_geopandas(
         A GeoDataFrame if it has rows. If zero rows, a pandas DataFrame is returned.
     """
     if file_system is None:
-        file_system = dp.FileClient.get_gcs_file_system()
+        file_system = config["file_system"]()
 
     if not isinstance(gcs_path, (str | Path | os.PathLike)):
         kwargs |= {"file_system": file_system, "pandas_fallback": pandas_fallback}
@@ -130,6 +136,18 @@ def read_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
+    if has_partitions(gcs_path, file_system):
+        filters = kwargs.pop("filters", None)
+        return _read_partitioned_parquet(
+            gcs_path,
+            file_system=file_system,
+            mask=mask,
+            pandas_fallback=pandas_fallback,
+            threads=threads,
+            filters=filters,
+            **kwargs,
+        )
+
     if "parquet" in gcs_path or "prqt" in gcs_path:
         with file_system.open(gcs_path, mode="rb") as file:
             try:
@@ -179,31 +197,42 @@ def read_geopandas(
 def _get_bounds_parquet(
     path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
-    with file_system.open(path) as f:
+    with file_system.open(path, "rb") as file:
+        return _get_bounds_parquet_from_open_file(file, file_system)
+
+
+def _get_bounds_parquet_from_open_file(
+    file, file_system
+) -> tuple[list[float], dict] | tuple[None, None]:
+    geo_metadata = _get_geo_metadata(file, file_system)
+    if not geo_metadata:
+        return None, None
+    return geo_metadata["bbox"], geo_metadata["crs"]
+
+
+def _get_geo_metadata(file, file_system) -> dict:
+    meta = pq.read_schema(file).metadata
+    geo_metadata = json.loads(meta[b"geo"])
+    try:
+        primary_column = geo_metadata["primary_column"]
+    except KeyError as e:
+        raise KeyError(e, geo_metadata) from e
+    try:
+        return geo_metadata["columns"][primary_column]
+    except KeyError as e:
         try:
-            num_rows = pq.read_metadata(f).num_rows
+            num_rows = pq.read_metadata(file).num_rows
         except ArrowInvalid as e:
-            if not file_system.isfile(f):
-                return None, None
-            raise ArrowInvalid(e, path) from e
+            if not file_system.isfile(file):
+                return {}
+            raise ArrowInvalid(e, file) from e
         if not num_rows:
-            return None, None
-        meta = pq.read_schema(f).metadata
-        try:
-            meta = json.loads(meta[b"geo"])["columns"]["geometry"]
-        except KeyError as e:
-            if pandas_fallback:
-                return None, None
-            raise KeyError(
-                f"{e.__class__.__name__}: {e} for {path}." + PANDAS_FALLBACK_INFO,
-                # f"{num_rows=}",
-                # meta,
-            ) from e
-        return meta["bbox"], meta["crs"]
+            return {}
+        return {}
 
 
 def _get_columns(path: str | Path, file_system: GCSFileSystem) -> pd.Index:
-    with file_system.open(path) as f:
+    with file_system.open(path, "rb") as f:
         schema = pq.read_schema(f)
         index_cols = _get_index_cols(schema)
         return pd.Index(schema.names).difference(index_cols)
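_get_geo_metadata now resolves the primary geometry column from the GeoParquet "geo" metadata instead of assuming a column named "geometry". For orientation, the parsed metadata has roughly this shape (abbreviated sketch following the GeoParquet specification, not copied from this package):

# Abbreviated sketch of json.loads(schema.metadata[b"geo"]):
geo_metadata = {
    "version": "1.0.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "bbox": [10.0, 59.0, 11.0, 60.0],  # xmin, ymin, xmax, ymax
            "crs": None,  # a PROJJSON mapping in real files; abbreviated here
        }
    },
}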
@@ -242,8 +271,7 @@ def get_bounds_series(
     ---------
     >>> import sgis as sg
     >>> import dapla as dp
-    >>> file_system = dp.FileClient.get_gcs_file_system()
-    >>> all_paths = file_system.ls("...")
+    >>> all_paths = GCSFileSystem().ls("...")
 
     Get the bounds of all your file paths, indexed by path.
 
@@ -275,7 +303,7 @@ def get_bounds_series(
 
     """
     if file_system is None:
-        file_system = dp.FileClient.get_gcs_file_system()
+        file_system = config["file_system"]()
 
     if threads is None:
         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
@@ -308,7 +336,7 @@ def write_geopandas(
     overwrite: bool = True,
     pandas_fallback: bool = False,
     file_system: GCSFileSystem | None = None,
-    write_covering_bbox: bool = False,
+    partition_cols=None,
     **kwargs,
 ) -> None:
     """Writes a GeoDataFrame to the speficied format.
@@ -324,13 +352,7 @@ def write_geopandas(
             not be written with geopandas and the number of rows is more than 0. If True,
             the file will be written without geo-metadata if >0 rows.
         file_system: Optional file sustem.
-        write_covering_bbox: Writes the bounding box column for each row entry with column name "bbox".
-            Writing a bbox column can be computationally expensive, but allows you to specify
-            a bbox in : func:read_parquet for filtered reading.
-            Note: this bbox column is part of the newer GeoParquet 1.1 specification and should be
-            considered as experimental. While writing the column is backwards compatible, using it
-            for filtering may not be supported by all readers.
-
+        partition_cols: Column(s) to partition by. Only for parquet files.
         **kwargs: Additional keyword arguments passed to parquet.write_table
             (for parquet) or geopandas' to_file method (if not parquet).
     """
@@ -340,22 +362,25 @@ def write_geopandas(
     except TypeError as e:
         raise TypeError(f"Unexpected type {type(gcs_path)}.") from e
 
-    if not overwrite and exists(gcs_path):
+    if file_system is None:
+        file_system = config["file_system"]()
+
+    if not overwrite and file_system.exists(gcs_path):
         raise ValueError("File already exists.")
 
     if not isinstance(df, GeoDataFrame):
         raise ValueError("DataFrame must be GeoDataFrame.")
 
-    if file_system is None:
-        file_system = dp.FileClient.get_gcs_file_system()
-
-    if not len(df):
+    if not len(df) and has_partitions(gcs_path, file_system):
+        return
+    elif not len(df):
        if pandas_fallback:
            df = pd.DataFrame(df)
            df.geometry = df.geometry.astype(str)
            df.geometry = None
        try:
-           dp.write_pandas(df, gcs_path, **kwargs)
+           with file_system.open(gcs_path, "wb") as file:
+               df.to_parquet(gcs_path, **kwargs)
        except Exception as e:
            more_txt = PANDAS_FALLBACK_INFO if not pandas_fallback else ""
            raise e.__class__(
@@ -363,17 +388,22 @@ def write_geopandas(
         ) from e
         return
 
-    file_system = dp.FileClient.get_gcs_file_system()
-
     if ".parquet" in gcs_path or "prqt" in gcs_path:
-        with file_system.open(gcs_path, mode="wb") as buffer:
+        if partition_cols is not None:
+            return _write_partitioned_geoparquet(
+                df,
+                gcs_path,
+                partition_cols,
+                file_system,
+                **kwargs,
+            )
+        with file_system.open(gcs_path, mode="wb") as file:
             table = _geopandas_to_arrow(
                 df,
                 index=df.index,
                 schema_version=None,
-                write_covering_bbox=write_covering_bbox,
             )
-            pq.write_table(table, buffer, compression="snappy", **kwargs)
+            pq.write_table(table, file, compression="snappy", **kwargs)
         return
 
     layer = kwargs.pop("layer", None)
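With the new partition_cols argument, parquet output is written hive-style (one <uuid4>.parquet file per partition directory), and read_geopandas routes such paths through _read_partitioned_parquet. A hedged usage sketch; the bucket path and column name are illustrative, and filter values are passed as strings because the matching below is string-based:

# Illustrative only: writes gs://bucket/data.parquet/year=2023/<uuid>.parquet etc.
from sgis.io.dapla_functions import read_geopandas, write_geopandas

# gdf: an existing GeoDataFrame with a "year" column
write_geopandas(gdf, "gs://bucket/data.parquet", partition_cols=["year"])
df_2023 = read_geopandas("gs://bucket/data.parquet", filters=[("year", "=", "2023")])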
@@ -393,17 +423,156 @@ def write_geopandas(
         df.to_file(file, driver=driver, layer=layer)
 
 
-def exists(path: str | Path) -> bool:
-    """Returns True if the path exists, and False if it doesn't.
+def _remove_file(path, file_system) -> None:
+    try:
+        file_system.rm_file(path)
+    except (AttributeError, TypeError, PermissionError):
+        try:
+            shutil.rmtree(path)
+        except NotADirectoryError:
+            try:
+                os.remove(path)
+            except PermissionError:
+                pass
 
-    Args:
-        path (str): The path to the file or directory.
 
-    Returns:
-        True if the path exists, False if not.
-    """
-    file_system = dp.FileClient.get_gcs_file_system()
-    return file_system.exists(path)
+def _write_partitioned_geoparquet(df, path, partition_cols, file_system, **kwargs):
+    path = Path(path)
+    unique_id = uuid.uuid4()
+
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    args: list[tuple[Path, DataFrame]] = []
+    dirs: list[Path] = set()
+    for group, rows in df.groupby(partition_cols):
+        name = (
+            "/".join(
+                f"{col}={value}"
+                for col, value in zip(partition_cols, group, strict=True)
+            )
+            + f"/{unique_id}.parquet"
+        )
+
+        dirs.add((path / name).parent)
+        args.append((path / name, rows))
+
+    if file_system.exists(path) and not has_partitions(path, file_system):
+        _remove_file(path, file_system)
+
+    for dir_ in dirs:
+        try:
+            os.makedirs(dir_, exist_ok=True)
+        except (OSError, FileNotFoundError, FileExistsError) as e:
+            print(e)
+            pass
+
+    def threaded_write(path_rows):
+        new_path, rows = path_rows
+        for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
+            if not paths_are_equal(sibling_path, Path(new_path).parent):
+                _remove_file(sibling_path, file_system)
+        with file_system.open(new_path, mode="wb") as file:
+            table = _geopandas_to_arrow(
+                rows,
+                index=df.index,
+                schema_version=None,
+            )
+            pq.write_table(table, file, compression="snappy", **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        list(executor.map(threaded_write, args))
+
+
+def _read_partitioned_parquet(
+    path, filters, file_system, mask, pandas_fallback, threads, **kwargs
+):
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    filters = filters or []
+    new_filters = []
+    for filt in filters:
+        if "in" in filt:
+            values = [
+                x.strip("(")
+                .strip(")")
+                .strip("[")
+                .strip("]")
+                .strip("{")
+                .strip("}")
+                .strip(" ")
+                for x in filt[-1].split(",")
+            ]
+            filt = [filt[0] + "=" + x for x in values]
+        else:
+            filt = ["".join(filt)]
+        new_filters.append(filt)
+
+    def intersects(file, mask) -> bool:
+        bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+        return shapely.box(*bbox).intersects(to_shapely(mask))
+
+    def read(path) -> GeoDataFrame | None:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask):
+                return
+
+            schema = kwargs.pop("schema", pq.read_schema(file))
+
+            return gpd.read_parquet(file, schema=schema, **kwargs)
+
+    with ThreadPoolExecutor() as executor:
+        results = [
+            x
+            for x in (
+                executor.map(
+                    read,
+                    (
+                        path
+                        for path in glob_func(str(Path(path) / "**/*.parquet"))
+                        if all(
+                            any(subfilt in Path(path).parts for subfilt in filt)
+                            for filt in new_filters
+                        )
+                    ),
+                )
+            )
+            if x is not None
+        ]
+    if results:
+        if mask is not None:
+            return sfilter(pd.concat(results), mask)
+        return pd.concat(results)
+
+    # add columns to empty DataFrame
+    first_path = next(iter(glob_func(str(Path(path) / "**/*.parquet"))))
+    return gpd.GeoDataFrame(
+        columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
+    )
+
+
+def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
+    return Path(path1).parts == Path(path2).parts
+
+
+def has_partitions(path, file_system) -> bool:
+    try:
+        glob_func = functools.partial(file_system.glob, detail=False)
+    except AttributeError:
+        glob_func = functools.partial(glob.glob, recursive=True)
+
+    return bool(
+        [
+            x
+            for x in glob_func(str(Path(path) / "**/*.parquet"))
+            if not paths_are_equal(x, path)
+        ]
+    )
 
 
 def check_files(
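Note that the filter handling in _read_partitioned_parquet is purely string-based: each filter tuple is flattened to "col=value" strings and matched against the hive-style parts of each file path, so values must be given as strings. A worked sketch of that matching, mirroring the code above (not a library API):

from pathlib import Path

parts = Path("data.parquet/year=2023/abc123.parquet").parts
# ("year", "=", "2023")          is rewritten to ["year=2023"]
# ("year", "in", "(2022, 2023)") is rewritten to ["year=2022", "year=2023"]
new_filters = [["year=2022", "year=2023"]]
keep = all(any(s in parts for s in filt) for filt in new_filters)  # True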
@@ -419,7 +588,7 @@ def check_files(
         within_minutes: Optionally include only files that were updated in the
             last n minutes.
     """
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = config["file_system"]()
 
     # (recursive doesn't work, so doing recursive search below)
    info = file_system.ls(folder, detail=True, recursive=True)
@@ -474,7 +643,7 @@ def check_files(
 
 
 def _get_files_in_subfolders(folderinfo: list[dict]) -> list[tuple]:
-    file_system = dp.FileClient.get_gcs_file_system()
+    file_system = config["file_system"]()
 
     fileinfo = []
 
ssb_sgis-1.1.0.dist-info/METADATA → ssb_sgis-1.1.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ssb-sgis
-Version: 1.1.0
+Version: 1.1.1
 Summary: GIS functions used at Statistics Norway.
 Home-page: https://github.com/statisticsnorway/ssb-sgis
 License: MIT
ssb_sgis-1.1.0.dist-info/RECORD → ssb_sgis-1.1.1.dist-info/RECORD RENAMED
@@ -1,4 +1,5 @@
-sgis/__init__.py,sha256=Bh-W4cB6-1uc-xRzUxqxECwwoennpdlikZI3gwXtZ7E,7389
+sgis/__init__.py,sha256=h6B-UD121eAYNpesXIPfEYISrSTN6mfZd7aXPfcMiqs,7382
+sgis/conf.py,sha256=-PraZWfHZerrMVKutPv7u-MezwAG7RlhGgmf5z-iPxA,304
 sgis/debug_config.py,sha256=Tfr19kU46hSkkspsIJcrUWvlhaL4U3-f8xEPkujSCAQ,593
 sgis/exceptions.py,sha256=WNaEBPNNx0rmz-YDzlFX4vIE7ocJQruUTqS2RNAu2zU,660
 sgis/geopandas_tools/__init__.py,sha256=bo8lFMcltOz7TtWAi52_ekR2gd3mjfBfKeMDV5zuqFY,28
@@ -19,7 +20,7 @@ sgis/geopandas_tools/polygons_as_rings.py,sha256=BX_GZS6F9I4NbEpiOlNBd7zywJjdfdJ
 sgis/geopandas_tools/sfilter.py,sha256=SLcMYprQwnY5DNo0R7TGXk4m6u26H8o4PRn-RPhmeZY,9345
 sgis/helpers.py,sha256=3NqPfVBKlZcZTiMJrsTAlDv5tNKDHrJr_8NimutVzQg,8797
 sgis/io/_is_dapla.py,sha256=wmfkSe98IrLhUg3dtXZusV6OVC8VlY1kbc5EQDf3P-Q,358
-sgis/io/dapla_functions.py,sha256=o1TlMyKhuOpXx6I_Pi2XPAPUcN5XRYEzTcZMfmfe09c,18205
+sgis/io/dapla_functions.py,sha256=rTdTrxUQLM0NtnuYKiVFXInagh6wCWWr4lUTbzk_V0Q,23130
 sgis/io/opener.py,sha256=HWO3G1NB6bpXKM94JadCD513vjat1o1TFjWGWzyVasg,898
 sgis/io/read_parquet.py,sha256=FvZYv1rLkUlrSaUY6QW6E1yntmntTeQuZ9ZRgCDO4IM,3776
 sgis/maps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,7 +57,7 @@ sgis/raster/indices.py,sha256=-J1HYmnT240iozvgagvyis6K0_GHZHRuUrPOgyoeIrY,223
 sgis/raster/regex.py,sha256=kYhVpRYzoXutx1dSYmqMoselWXww7MMEsTPmLZwHjbM,3759
 sgis/raster/sentinel_config.py,sha256=nySDqn2R8M6W8jguoBeSAK_zzbAsqmaI59i32446FwY,1268
 sgis/raster/zonal.py,sha256=D4Gyptw-yOLTCO41peIuYbY-DANsJCG19xXDlf1QAz4,2299
-ssb_sgis-1.1.0.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
-ssb_sgis-1.1.0.dist-info/METADATA,sha256=eu1eJIwc822GIYWGPYj9tgJlM7NPfJ2pRX9YuRp9J8s,11740
-ssb_sgis-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-ssb_sgis-1.1.0.dist-info/RECORD,,
+ssb_sgis-1.1.1.dist-info/LICENSE,sha256=np3IfD5m0ZUofn_kVzDZqliozuiO6wrktw3LRPjyEiI,1073
+ssb_sgis-1.1.1.dist-info/METADATA,sha256=dwdCCyDOy441bczyc2JwdB9eZo_BtmTqL0Dqq05IZeQ,11740
+ssb_sgis-1.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ssb_sgis-1.1.1.dist-info/RECORD,,