ssb-sgis 1.1.3.tar.gz → 1.1.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/PKG-INFO +1 -1
  2. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/pyproject.toml +1 -1
  3. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/bounds.py +4 -4
  4. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/helpers.py +11 -2
  5. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/io/dapla_functions.py +267 -226
  6. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/LICENSE +0 -0
  7. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/README.md +0 -0
  8. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/__init__.py +0 -0
  9. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/conf.py +0 -0
  10. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/debug_config.py +0 -0
  11. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/exceptions.py +0 -0
  12. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/__init__.py +0 -0
  13. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/buffer_dissolve_explode.py +0 -0
  14. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/centerlines.py +0 -0
  15. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/cleaning.py +0 -0
  16. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/conversion.py +0 -0
  17. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/duplicates.py +0 -0
  18. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/general.py +0 -0
  19. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/geocoding.py +0 -0
  20. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/geometry_types.py +0 -0
  21. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/neighbors.py +0 -0
  22. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/overlay.py +0 -0
  23. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/point_operations.py +0 -0
  24. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/polygon_operations.py +0 -0
  25. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/polygons_as_rings.py +0 -0
  26. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/geopandas_tools/sfilter.py +0 -0
  27. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/io/__init__.py +0 -0
  28. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/io/_is_dapla.py +0 -0
  29. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/io/opener.py +0 -0
  30. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/io/read_parquet.py +0 -0
  31. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/__init__.py +0 -0
  32. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/examine.py +0 -0
  33. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/explore.py +0 -0
  34. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/httpserver.py +0 -0
  35. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/legend.py +0 -0
  36. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/map.py +0 -0
  37. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/maps.py +0 -0
  38. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/norge_i_bilder.json +0 -0
  39. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/thematicmap.py +0 -0
  40. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/tilesources.py +0 -0
  41. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/maps/wms.py +0 -0
  42. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/__init__.py +0 -0
  43. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/_get_route.py +0 -0
  44. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/_od_cost_matrix.py +0 -0
  45. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/_points.py +0 -0
  46. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/_service_area.py +0 -0
  47. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/closing_network_holes.py +0 -0
  48. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/cutting_lines.py +0 -0
  49. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/directednetwork.py +0 -0
  50. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/finding_isolated_networks.py +0 -0
  51. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/network.py +0 -0
  52. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/networkanalysis.py +0 -0
  53. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/networkanalysisrules.py +0 -0
  54. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/nodes.py +0 -0
  55. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/networkanalysis/traveling_salesman.py +0 -0
  56. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/parallel/parallel.py +0 -0
  57. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/py.typed +0 -0
  58. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/__init__.py +0 -0
  59. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/base.py +0 -0
  60. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/image_collection.py +0 -0
  61. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/indices.py +0 -0
  62. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/regex.py +0 -0
  63. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/sentinel_config.py +0 -0
  64. {ssb_sgis-1.1.3 → ssb_sgis-1.1.4}/src/sgis/raster/zonal.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ssb-sgis
- Version: 1.1.3
+ Version: 1.1.4
  Summary: GIS functions used at Statistics Norway.
  Home-page: https://github.com/statisticsnorway/ssb-sgis
  License: MIT
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "ssb-sgis"
- version = "1.1.3"
+ version = "1.1.4"
  description = "GIS functions used at Statistics Norway."
  authors = ["Morten Letnes <morten.letnes@ssb.no>"]
  license = "MIT"
@@ -507,7 +507,7 @@ def make_ssb_grid(
          to make sure all data is covered by the grid.

      Returns:
-         GeoDataFrame with grid geometries and a column 'SSBID'.
+         GeoDataFrame with grid geometries and a column 'ssb_rute_id'.

      Raises:
          ValueError: If the GeoDataFrame does not have 25833 as crs.
@@ -568,12 +568,12 @@ def make_ssb_grid(
      grid["nordc"] = (
          (np.floor((grid.geometry.centroid.y) / gridsize) * gridsize).apply(int)
      ).apply(str)
-     grid["SSBID"] = grid["ostc"] + grid["nordc"]
-     return grid[["SSBID", "geometry"]]
+     grid["ssb_rute_id"] = grid["ostc"] + grid["nordc"]
+     return grid[["ssb_rute_id", "geometry"]]


  def add_grid_id(
-     gdf: GeoDataFrame, gridsize: int, out_column: str = "SSBID"
+     gdf: GeoDataFrame, gridsize: int, out_column: str = "ssb_rute_id"
  ) -> GeoDataFrame:
      """Adds an SSB grid ID column to a GeoDataFrame of points.

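
Note: the hunks above rename the grid ID column from 'SSBID' to 'ssb_rute_id' in both make_ssb_grid and add_grid_id, a breaking change for callers that select the old column. A minimal sketch of the effect, assuming make_ssb_grid is exposed at the sgis top level and that the sample points and gridsize value are illustrative:

    import geopandas as gpd
    import sgis as sg
    from shapely.geometry import Point

    # Hypothetical input: points in EPSG:25833, the CRS make_ssb_grid requires.
    points = gpd.GeoDataFrame(
        geometry=[Point(262_000, 6_650_000), Point(263_000, 6_651_000)], crs=25833
    )

    grid = sg.make_ssb_grid(points, gridsize=1000)
    # As of 1.1.4 the ID column is 'ssb_rute_id', not 'SSBID':
    assert list(grid.columns) == ["ssb_rute_id", "geometry"]
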
@@ -28,12 +28,21 @@ def _get_file_system(
      file_system: None | AbstractFileSystem, kwargs: dict
  ) -> AbstractFileSystem:
      if (
-         file_system is not None and "filesystem" in kwargs or "file_system" in kwargs
+         file_system is not None and ("filesystem" in kwargs or "file_system" in kwargs)
      ) or ("filesystem" in kwargs and "file_system" in kwargs):
          raise ValueError("Cannot pass both filesystem and file_system.")
      file_system2 = kwargs.pop("file_system", None)
      file_system3 = kwargs.pop("filesystem", None)
-     return file_system or file_system2 or file_system3 or config["file_system"]()
+     return (
+         file_system
+         or file_system2
+         or file_system3
+         or (
+             config["file_system"]()
+             if callable(config["file_system"])
+             else config["file_system"]
+         )
+     )


  def get_numpy_func(text: str, error_message: str | None = None) -> Callable:
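
The new fallback in _get_file_system means config["file_system"] may hold either a filesystem factory (callable) or a ready-made instance. A standalone sketch of the resolution logic, using fsspec's LocalFileSystem as a stand-in for the configured value:

    from fsspec.implementations.local import LocalFileSystem

    # Both forms now resolve: a class/factory that builds the filesystem...
    config = {"file_system": LocalFileSystem}
    # ...or an already-constructed instance:
    # config = {"file_system": LocalFileSystem()}

    entry = config["file_system"]
    file_system = entry() if callable(entry) else entry  # mirrors the new branch
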
@@ -14,6 +14,7 @@ from collections.abc import Iterable
  from concurrent.futures import ThreadPoolExecutor
  from io import BytesIO
  from pathlib import Path
+ from typing import Any

  import geopandas as gpd
  import joblib
@@ -25,6 +26,7 @@ import pyarrow.parquet as pq
  import shapely
  from geopandas import GeoDataFrame
  from geopandas import GeoSeries
+ from geopandas.io.arrow import _arrow_to_geopandas
  from geopandas.io.arrow import _geopandas_to_arrow
  from pandas import DataFrame
  from pyarrow import ArrowInvalid
@@ -49,7 +51,7 @@ def read_geopandas(
      pandas_fallback: bool = False,
      file_system: GCSFileSystem | None = None,
      mask: GeoSeries | GeoDataFrame | shapely.Geometry | tuple | None = None,
-     threads: int | None = None,
+     use_threads: bool = True,
      filters: pyarrow.dataset.Expression | None = None,
      **kwargs,
  ) -> GeoDataFrame | DataFrame:
@@ -68,11 +70,11 @@ def read_geopandas(
              not be read with geopandas and the number of rows is more than 0. If True,
              the file will be read with pandas if geopandas fails.
          file_system: Optional file system.
-         mask: Optional geometry mask to keep only intersecting geometries.
-             If 'gcs_path' is an iterable of multiple paths, only the files
-             with a bbox that intersects the mask are read, then filtered by location.
-         threads: Number of threads to use if reading multiple files. Defaults to
-             the number of files to read or the number of available threads (if lower).
+         mask: If gcs_path is a partitioned parquet file or an interable of paths.
+             Only files with a bbox intersecting mask will be read.
+             Note that the data is not filtered on a row level. You should either
+             use clip or sfilter to filter the data after reading.
+         use_threads: Defaults to True.
          filters: To filter out data. Either a pyarrow.dataset.Expression, or a list in the
              structure [[(column, op, val), …],…] where op is [==, =, >, >=, <, <=, !=, in, not in].
              More details here: https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html
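
The reworded mask docs make the contract explicit: mask pre-filters which files are read (by bbox), not which rows survive. A short sketch of the intended pattern, with hypothetical paths:

    import shapely
    import sgis as sg

    area_of_interest = shapely.box(255_000, 6_640_000, 265_000, 6_660_000)
    paths = ["gs://bucket/data/a.parquet", "gs://bucket/data/b.parquet"]

    # Only files whose bbox intersects the mask are read...
    df = sg.read_geopandas(paths, mask=area_of_interest, use_threads=True)
    # ...so row-level filtering must now be done explicitly:
    df = sg.sfilter(df, area_of_interest)
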
@@ -85,72 +87,38 @@ def read_geopandas(
      file_system = _get_file_system(file_system, kwargs)

      if not isinstance(gcs_path, (str | Path | os.PathLike)):
-         cols = {}
-         if mask is not None:
-             if not isinstance(gcs_path, GeoSeries):
-                 bounds_series: GeoSeries = get_bounds_series(
-                     gcs_path,
-                     file_system,
-                     threads=threads,
-                     pandas_fallback=pandas_fallback,
-                 )
-             else:
-                 bounds_series = gcs_path
-             new_bounds_series = sfilter(bounds_series, mask)
-             if not len(new_bounds_series):
-                 if isinstance(kwargs.get("columns"), Iterable):
-                     cols = {col: [] for col in kwargs["columns"]}
-                 else:
-                     cols = {}
-                     for path in bounds_series.index:
-                         try:
-                             cols |= {col: [] for col in _get_columns(path, file_system)}
-                         except ArrowInvalid as e:
-                             if file_system.isfile(path):
-                                 raise ArrowInvalid(e, path) from e
-
-                 return GeoDataFrame(cols | {"geometry": []})
-             paths = list(new_bounds_series.index)
-         else:
-             if isinstance(gcs_path, GeoSeries):
-                 paths = list(gcs_path.index)
-             else:
-                 paths = list(gcs_path)
-
-         if threads is None:
-             threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
+         return _read_geopandas_from_iterable(
+             gcs_path,
+             mask=mask,
+             file_system=file_system,
+             use_threads=use_threads,
+             pandas_fallback=pandas_fallback,
+             **kwargs,
+         )

-         # recursive read with threads
-         with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
-             dfs: list[GeoDataFrame] = parallel(
-                 joblib.delayed(read_geopandas)(
-                     x,
-                     filters=filters,
+     if (
+         isinstance(filters, Iterable)
+         and len(filters) == 1
+         and ("=" in next(iter(filters)) or "==" in next(iter(filters)))
+     ):
+         # try to read only files in the relevant partition, because glob is slow without GCSFileSystem
+         try:
+             expression = "".join(next(iter(filters))).replace("==", "=")
+             glob_func = _get_glob(file_system)
+             paths = glob_func(str(Path(gcs_path) / expression))
+             if paths:
+                 return _read_geopandas_from_iterable(
+                     paths,
+                     mask=mask,
                      file_system=file_system,
+                     use_threads=use_threads,
                      pandas_fallback=pandas_fallback,
-                     mask=mask,
-                     threads=threads,
                      **kwargs,
                  )
-                 for x in paths
-             )
-
-         if dfs:
-             df = pd.concat(dfs, ignore_index=True)
-             try:
-                 df = GeoDataFrame(df)
-             except Exception as e:
-                 if not pandas_fallback:
-                     print(e)
-                     raise e
-         else:
-             df = GeoDataFrame(cols | {"geometry": []})
-
-         if mask is not None:
-             return sfilter(df, mask)
-         return df
+         except FileNotFoundError:
+             pass

-     child_paths = has_partitions(gcs_path, file_system)
+     child_paths = get_child_paths(gcs_path, file_system)
      if child_paths:
          return gpd.GeoDataFrame(
              _read_partitioned_parquet(
@@ -158,65 +126,93 @@ def read_geopandas(
                  read_func=_read_geopandas,
                  file_system=file_system,
                  mask=mask,
-                 pandas_fallback=pandas_fallback,
                  filters=filters,
                  child_paths=child_paths,
                  **kwargs,
              )
          )

-     if "parquet" in gcs_path or "prqt" in gcs_path:
-         with file_system.open(gcs_path, mode="rb") as file:
-             try:
-                 df = gpd.read_parquet(
-                     file, filters=filters, filesystem=file_system, **kwargs
-                 )
-             except ValueError as e:
-                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
-                     raise e.__class__(
-                         f"{e.__class__.__name__}: {e} for {gcs_path}."
-                     ) from e
-                 df = pd.read_parquet(
-                     file, filters=filters, filesystem=file_system, **kwargs
-                 )
-                 if pandas_fallback or not len(df):
-                     return df
-                 else:
-                     more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
-                     raise e.__class__(
-                         f"{e.__class__.__name__}: {e} for {df}." + more_txt
-                     ) from e
-             except Exception as e:
-                 raise e.__class__(f"{e.__class__.__name__}: {e} for {gcs_path}.") from e
+     if gcs_path.endswith(".parquet"):
+         file_format: str = "parquet"
+         read_func = gpd.read_parquet
+     else:
+         file_format: str = Path(gcs_path).suffix.lstrip(".")
+         read_func = gpd.read_file
+
+     with file_system.open(gcs_path, mode="rb") as file:
+         df = _read_geopandas(
+             file,
+             read_func=read_func,
+             file_format=file_format,
+             filters=filters,
+             **kwargs,
+         )
+
+     return df
+

+ def _read_geopandas_from_iterable(
+     paths, mask, file_system, use_threads, pandas_fallback, **kwargs
+ ):
+     cols = {}
+     if mask is None and isinstance(paths, GeoSeries):
+         # bounds GeoSeries indexed with file paths
+         paths = list(paths.index)
+     elif mask is None:
+         paths = list(paths)
      else:
-         with file_system.open(gcs_path, mode="rb") as file:
-             try:
-                 df = gpd.read_file(
-                     file, filters=filters, filesystem=file_system, **kwargs
-                 )
-             except ValueError as e:
-                 if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
-                     raise e
-                 file_type: str = Path(gcs_path).suffix.strip(".")
-                 df = getattr(pd, f"read_{file_type}")(
-                     file, filters=filters, filesystem=file_system, **kwargs
-                 )
+         if not isinstance(paths, GeoSeries):
+             bounds_series: GeoSeries = get_bounds_series(
+                 paths,
+                 file_system,
+                 use_threads=use_threads,
+                 pandas_fallback=pandas_fallback,
+             )
+         else:
+             bounds_series = paths
+         new_bounds_series = sfilter(bounds_series, mask)
+         if not len(new_bounds_series):
+             if isinstance(kwargs.get("columns"), Iterable):
+                 cols = {col: [] for col in kwargs["columns"]}
+             else:
+                 cols = {}
+                 for path in bounds_series.index:
+                     try:
+                         cols |= {col: [] for col in _get_columns(path, file_system)}
+                     except ArrowInvalid as e:
+                         if file_system.isfile(path):
+                             raise ArrowInvalid(e, path) from e
+             return GeoDataFrame(cols | {"geometry": []})
+         paths = list(new_bounds_series.index)
+
+     # recursive read with threads
+     threads = (
+         min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+     )
+     with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
+         dfs: list[GeoDataFrame] = parallel(
+             joblib.delayed(read_geopandas)(
+                 x,
+                 file_system=file_system,
+                 pandas_fallback=pandas_fallback,
+                 mask=mask,
+                 use_threads=use_threads,
+                 **kwargs,
+             )
+             for x in paths
+         )
+
+     if dfs:
+         df = pd.concat(dfs, ignore_index=True)
+         try:
+             df = GeoDataFrame(df)
+         except Exception as e:
+             if not pandas_fallback:
+                 print(e)
+                 raise e
+     else:
+         df = GeoDataFrame(cols | {"geometry": []})

-                 if pandas_fallback or not len(df):
-                     return df
-                 else:
-                     more_txt = PANDAS_FALLBACK_INFO if not len(df) else ""
-                     raise e.__class__(
-                         f"{e.__class__.__name__}: {e} for {df}. " + more_txt
-                     ) from e
-             except Exception as e:
-                 raise e.__class__(
-                     f"{e.__class__.__name__}: {e} for {gcs_path}." + more_txt
-                 ) from e
-
-     if mask is not None:
-         return sfilter(df, mask)
      return df

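
The new fast path above turns a single equality filter into a glob over one partition directory. A sketch of a call that would hit it, against a hypothetical dataset partitioned on 'year':

    import sgis as sg

    # Layout: gs://bucket/data.parquet/year=2024/<basename>.parquet
    # One ('col', '==', value) tuple is joined to 'year=2024' and globbed
    # directly, so the other partitions are never listed or opened.
    df = sg.read_geopandas(
        "gs://bucket/data.parquet",
        filters=[("year", "==", "2024")],
    )
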
@@ -230,15 +226,25 @@ def _get_bounds_parquet(
  def _get_bounds_parquet_from_open_file(
      file, file_system
  ) -> tuple[list[float], dict] | tuple[None, None]:
-     geo_metadata = _get_geo_metadata(file, file_system)
+     geo_metadata = _get_geo_metadata_primary_column(file, file_system)
+
      if not geo_metadata:
          return None, None
      return geo_metadata["bbox"], geo_metadata["crs"]


  def _get_geo_metadata(file, file_system) -> dict:
-     meta = pq.read_schema(file).metadata
-     geo_metadata = json.loads(meta[b"geo"])
+     try:
+         meta = pq.read_schema(file).metadata
+     except FileNotFoundError:
+         with file_system.open(file, "rb") as f:
+             meta = pq.read_schema(f).metadata
+
+     return json.loads(meta[b"geo"])
+
+
+ def _get_geo_metadata_primary_column(file, file_system) -> dict:
+     geo_metadata = _get_geo_metadata(file, file_system)
      try:
          primary_column = geo_metadata["primary_column"]
      except KeyError as e:
@@ -252,6 +258,7 @@ def _get_geo_metadata(file, file_system) -> dict:
          if not file_system.isfile(file):
              return {}
          raise ArrowInvalid(e, file) from e
+     # allow for 0 lengthed tables not to have geo metadata
      if not num_rows:
          return {}
      return {}
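
For context on what _get_geo_metadata parses: GeoParquet files store their geometry metadata as JSON under the b"geo" key of the Arrow schema metadata. A sketch of reading it directly, with a hypothetical local file; the per-column bbox/crs layout follows the GeoParquet spec:

    import json

    import pyarrow.parquet as pq

    meta = pq.read_schema("data.parquet").metadata
    geo = json.loads(meta[b"geo"])
    primary = geo["primary_column"]
    print(geo["columns"][primary]["bbox"], geo["columns"][primary]["crs"])
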
@@ -272,7 +279,7 @@ def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
  def get_bounds_series(
      paths: list[str | Path] | tuple[str | Path],
      file_system: GCSFileSystem | None = None,
-     threads: int | None = None,
+     use_threads: bool = True,
      pandas_fallback: bool = False,
  ) -> GeoSeries:
      """Get a GeoSeries with file paths as indexes and the file's bounds as values.
@@ -285,8 +292,7 @@ def get_bounds_series(
          file_system: Optional instance of GCSFileSystem.
              If None, an instance is created within the function.
              Note that this is slower in long loops.
-         threads: Number of threads to use if reading multiple files. Defaults to
-             the number of files to read or the number of available threads (if lower).
+         use_threads: Default True.
          pandas_fallback: If False (default), an exception is raised if the file has
              no geo metadata. If True, the geometry value is set to None for this file.

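
get_bounds_series is designed to round-trip with read_geopandas: the returned GeoSeries (bounds as values, file paths as index) can be spatially filtered and passed back as the paths argument. A sketch with hypothetical paths and an arbitrary area of interest:

    import shapely
    import sgis as sg

    area_of_interest = shapely.box(255_000, 6_640_000, 265_000, 6_660_000)
    paths = ["gs://bucket/data/a.parquet", "gs://bucket/data/b.parquet"]

    bounds = sg.get_bounds_series(paths, use_threads=True)
    relevant = sg.sfilter(bounds, area_of_interest)
    df = sg.read_geopandas(relevant, mask=area_of_interest)
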
@@ -330,8 +336,9 @@ def get_bounds_series(
      """
      file_system = _get_file_system(file_system, {})

-     if threads is None:
-         threads = min(len(paths), int(multiprocessing.cpu_count())) or 1
+     threads = (
+         min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+     )

      with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
          bounds: list[tuple[list[float], dict]] = parallel(
@@ -396,9 +403,11 @@ def write_geopandas(
          raise ValueError("File already exists.")

      if not isinstance(df, GeoDataFrame):
-         raise ValueError(f"DataFrame must be GeoDataFrame. Got {type(df)}.")
+         raise ValueError(
+             f"DataFrame must be GeoDataFrame. Got {type(df)} for {gcs_path}."
+         )

-     if not len(df) and has_partitions(gcs_path, file_system):
+     if not len(df) and get_child_paths(gcs_path, file_system):
          # no need to write empty df
          return
      elif not len(df):
@@ -473,6 +482,23 @@ def _to_geopandas(df, path, **kwargs) -> None:
      pq.write_table(table, path, compression="snappy", **kwargs)


+ def _pyarrow_schema_from_geopandas(df: GeoDataFrame) -> pyarrow.Schema:
+     geom_name = df.geometry.name
+     pandas_columns = [col for col in df if col != geom_name]
+     schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
+     index_columns = _get_index_cols(schema)
+     return pyarrow.schema(
+         [
+             (
+                 (schema.field(col).name, schema.field(col).type)
+                 if col != geom_name
+                 else (geom_name, pyarrow.binary())
+             )
+             for col in [*df.columns, *index_columns]
+         ]
+     )
+
+
  def _remove_file(path, file_system) -> None:
      try:
          file_system.rm_file(str(path))
@@ -494,38 +520,22 @@ def _write_partitioned_geoparquet(
      file_system=None,
      write_func: Callable = _to_geopandas,
      existing_data_behavior: str = "error",
+     basename_template: str | None = None,
      **kwargs,
  ):
-     if isinstance(partition_cols, str):
-         partition_cols = [partition_cols]
-
      file_system = _get_file_system(file_system, kwargs)

-     path = Path(path)
-     unique_id = uuid.uuid4()
+     if basename_template is None:
+         basename_template = uuid.uuid4().hex + "-{i}.parquet"
+
+     if isinstance(partition_cols, str):
+         partition_cols = [partition_cols]

      for col in partition_cols:
          if df[col].isna().all() and not kwargs.get("schema"):
              raise ValueError("Must specify 'schema' when all rows are NA.")

-     try:
-         glob_func = functools.partial(file_system.glob, detail=False)
-     except AttributeError:
-         glob_func = functools.partial(glob.glob, recursive=True)
-
-     args: list[tuple[Path, DataFrame]] = []
-     dirs: list[Path] = set()
-     for group, rows in df.groupby(partition_cols, dropna=False):
-         name = (
-             "/".join(
-                 f"{col}={value if not pd.isna(value) else NULL_VALUE}"
-                 for col, value in zip(partition_cols, group, strict=True)
-             )
-             + f"/{unique_id}.parquet"
-         )
-
-         dirs.add((path / name).parent)
-         args.append((path / name, rows))
+     glob_func = _get_glob(file_system)

      if file_system.exists(path) and file_system.isfile(path):
          _remove_file(path, file_system)
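
The new basename_template parameter follows pyarrow's convention of a "{i}" counter slot. How the default template from the hunk above expands:

    import uuid

    basename_template = uuid.uuid4().hex + "-{i}.parquet"  # the 1.1.4 default

    # The first file written into a partition directory replaces "-{i}"
    # with "0" (the dash is consumed along with the placeholder):
    first_file = basename_template.replace("-{i}", "0")  # "<hex>0.parquet"

    # With partition_cols=["year"], files therefore land at e.g.:
    #   <path>/year=2024/<hex>0.parquet
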
@@ -533,49 +543,56 @@ def _write_partitioned_geoparquet(
      if kwargs.get("schema"):
          schema = kwargs.pop("schema")
      elif isinstance(df, GeoDataFrame):
-         geom_name = df.geometry.name
-         pandas_columns = [col for col in df if col != geom_name]
-         schema = pyarrow.Schema.from_pandas(df[pandas_columns], preserve_index=True)
-         index_columns = _get_index_cols(schema)
-         schema = pyarrow.schema(
-             [
-                 (
-                     (schema.field(col).name, schema.field(col).type)
-                     if col != geom_name
-                     else (geom_name, pyarrow.binary())
-                 )
-                 for col in [*df.columns, *index_columns]
-                 # for col in df.columns
-             ]
-         )
+         schema = _pyarrow_schema_from_geopandas(df)
      else:
          schema = pyarrow.Schema.from_pandas(df, preserve_index=True)

-     def get_siblings(path: str, paths: list[str]) -> list[str]:
-         parts = path.parts
-         return {x for x in paths if all(part in parts for part in x.parts)}
-
-     def threaded_write(path_rows):
-         new_path, rows = path_rows
-         # for sibling_path in get_siblings(new_path, child_paths):
-         for sibling_path in glob_func(str(Path(new_path).with_name("**"))):
-             if not paths_are_equal(sibling_path, Path(new_path).parent):
-                 if existing_data_behavior == "delete_matching":
-                     _remove_file(sibling_path, file_system)
-                 elif existing_data_behavior == "error":
-                     raise pyarrow.ArrowInvalid(
-                         f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
-                     )
+     def as_partition_part(col: str, value: Any) -> str:
+         value = value if not pd.isna(value) else NULL_VALUE
+         return f"{col}={value}"
+
+     paths: list[Path] = []
+     dfs: list[DataFrame] = []
+     for group, rows in df.groupby(partition_cols, dropna=False):
+         partition_parts = "/".join(
+             as_partition_part(col, value)
+             for col, value in zip(partition_cols, group, strict=True)
+         )
+         paths.append(Path(path) / partition_parts)
+         dfs.append(rows)
+
+     def threaded_write(rows: DataFrame, path: str) -> None:
+         this_basename = basename_template.replace("-{i}", "0")
+         for i, sibling_path in enumerate(sorted(glob_func(str(Path(path) / "**")))):
+             if paths_are_equal(sibling_path, path):
+                 continue
+             if existing_data_behavior == "delete_matching":
+                 _remove_file(sibling_path, file_system)
+             elif existing_data_behavior == "error":
+                 raise pyarrow.ArrowInvalid(
+                     f"Could not write to {path} as the directory is not empty and existing_data_behavior is to error"
+                 )
+             else:
+                 this_basename = basename_template.replace("-{i}", str(i + 1))
+
+         out_path = str(Path(path) / this_basename)
          try:
-             with file_system.open(new_path, mode="wb") as file:
+             with file_system.open(out_path, mode="wb") as file:
                  write_func(rows, file, schema=schema, **kwargs)
          except FileNotFoundError:
-             file_system.makedirs(str(Path(new_path).parent), exist_ok=True)
-             with file_system.open(new_path, mode="wb") as file:
+             file_system.makedirs(str(path), exist_ok=True)
+             with file_system.open(out_path, mode="wb") as file:
                  write_func(rows, file, schema=schema, **kwargs)

      with ThreadPoolExecutor() as executor:
-         list(executor.map(threaded_write, args))
+         executor.map(threaded_write, dfs, paths)
+
+
+ def _get_glob(file_system) -> functools.partial:
+     try:
+         return functools.partial(file_system.glob)
+     except AttributeError:
+         return functools.partial(glob.glob, recursive=True)


  def _filters_to_expression(filters) -> list[ds.Expression]:
@@ -612,6 +629,8 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
      >>> expression_match_path(path, expression)
      False
      """
+     # keep only the parts in between the two .parquet parts
+     path = str(path).split(".parquet")[1]
      if NULL_VALUE in path:
          return True
      # build a one lengthed pyarrow.Table of the partitioning in the file path
@@ -627,39 +646,60 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
      try:
          table = table.filter(expression)
      except pyarrow.ArrowInvalid as e:
-         if "No match for FieldRef" not in str(e):
-             raise e
-         # cannot determine if the expression match without reading the file
-         return True
+         if "No match for FieldRef" in str(e):
+             # if a non-partition col is used in 'filters',
+             # we cannot determine if the expression match without reading the file
+             return True
+         raise e
      return bool(len(table))


- def _read_geopandas(file, pandas_fallback: bool, **kwargs):
+ def _read_geopandas(
+     file,
+     read_func: Callable = gpd.read_parquet,
+     file_format: str = "parquet",
+     **kwargs,
+ ):
      try:
-         return gpd.read_parquet(file, **kwargs)
-     except Exception as e:
-         if not pandas_fallback:
-             raise e
-         df = pd.read_parquet(file, **kwargs)
-         if len(df):
+         return read_func(file, **kwargs)
+     except ValueError as e:
+         if "Missing geo metadata" not in str(e) and "geometry" not in str(e):
              raise e
-         return df
+         df = getattr(pd, f"read_{file_format}")(file, **kwargs)
+         if not len(df):
+             return GeoDataFrame(df)
+         raise e.__class__(f"{e.__class__.__name__}: {e} for {df}. ") from e
+     except Exception as e:
+         raise e.__class__(f"{e.__class__.__name__}: {e} for {file}.") from e


  def _read_pandas(gcs_path: str, **kwargs):
      file_system = _get_file_system(None, kwargs)

-     child_paths = has_partitions(gcs_path, file_system)
-     if child_paths:
-         return gpd.GeoDataFrame(
-             _read_partitioned_parquet(
-                 gcs_path,
-                 read_func=pd.read_parquet,
-                 file_system=file_system,
-                 mask=None,
-                 child_paths=child_paths,
-                 **kwargs,
+     if not isinstance(gcs_path, (str | Path | os.PathLike)):
+         # recursive read with threads
+         threads = (
+             min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
+             if kwargs.get("use_threads")
+             else 1
+         )
+         with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
+             return pd.concat(
+                 parallel(
+                     joblib.delayed(_read_pandas)(x, file_system=file_system, **kwargs)
+                     for x in gcs_path
+                 )
              )
+
+     child_paths = get_child_paths(gcs_path, file_system)
+     if child_paths:
+         return _read_partitioned_parquet(
+             gcs_path,
+             read_func=pd.read_parquet,
+             file_system=file_system,
+             mask=None,
+             child_paths=child_paths,
+             **kwargs,
          )

      with file_system.open(gcs_path, "rb") as file:
@@ -678,10 +718,7 @@ def _read_partitioned_parquet(
      file_system = _get_file_system(file_system, kwargs)

      if child_paths is None:
-         try:
-             glob_func = functools.partial(file_system.glob)
-         except AttributeError:
-             glob_func = functools.partial(glob.glob, recursive=True)
+         glob_func = _get_glob(file_system)
          child_paths = list(glob_func(str(Path(path) / "**/*.parquet")))

      filters = _filters_to_expression(filters)
@@ -690,12 +727,12 @@ def _read_partitioned_parquet(
          bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
          return shapely.box(*bbox).intersects(to_shapely(mask))

-     def read(path) -> GeoDataFrame | None:
+     def read(path: str) -> pyarrow.Table | None:
          with file_system.open(path, "rb") as file:
              if mask is not None and not intersects(file, mask):
                  return

-             # get instead of pop, then copy kwargs (because mutable)
+             # 'get' instead of 'pop' because dict is mutable
              schema = kwargs.get("schema", pq.read_schema(file))
              new_kwargs = {
                  key: value for key, value in kwargs.items() if key != "schema"
@@ -705,8 +742,8 @@ def _read_partitioned_parquet(
      with ThreadPoolExecutor() as executor:
          results = [
-             x
-             for x in (
+             df
+             for df in (
                  executor.map(
                      read,
                      (
@@ -716,30 +753,34 @@ def _read_partitioned_parquet(
                      ),
                  )
              )
-             if x is not None
+             if df is not None
          ]
+
      if results:
-         if mask is not None:
-             return sfilter(pd.concat(results), mask)
-         return pd.concat(results)
+         if all(isinstance(x, DataFrame) for x in results):
+             results = pd.concat(results)
+         else:
+             geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
+             results = _arrow_to_geopandas(
+                 pyarrow.concat_tables(
+                     results,
+                     promote_options="permissive",
+                 ),
+                 geo_metadata,
+             )
+         return results

      # add columns to empty DataFrame
      first_path = next(iter(child_paths + [path]))
-     return pd.DataFrame(
-         columns=list(dict.fromkeys(_get_columns(first_path, file_system)))
-     )
+     return pd.DataFrame(columns=_get_columns(first_path, file_system))


  def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
      return Path(path1).parts == Path(path2).parts


- def has_partitions(path, file_system) -> list[str]:
-     try:
-         glob_func = functools.partial(file_system.glob, detail=False)
-     except AttributeError:
-         glob_func = functools.partial(glob.glob, recursive=True)
-
+ def get_child_paths(path, file_system) -> list[str]:
+     glob_func = _get_glob(file_system)
      return [
          x
          for x in glob_func(str(Path(path) / "**/*.parquet"))
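
The rename from has_partitions to get_child_paths matches what the function actually returns: the .parquet files below a path, which the readers and writers use to detect partitioned datasets. A sketch against a local filesystem, with a hypothetical directory:

    from fsspec.implementations.local import LocalFileSystem

    from sgis.io.dapla_functions import get_child_paths

    file_system = LocalFileSystem()
    child_paths = get_child_paths("data.parquet", file_system)
    if child_paths:
        # e.g. ['data.parquet/year=2024/<hex>0.parquet', ...]
        print(f"partitioned dataset with {len(child_paths)} files")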