ssb-sgis 1.2.0__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/PKG-INFO +1 -1
  2. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/pyproject.toml +1 -1
  3. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/__init__.py +1 -0
  4. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/conf.py +13 -47
  5. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/buffer_dissolve_explode.py +2 -1
  6. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/duplicates.py +4 -3
  7. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/general.py +61 -30
  8. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/neighbors.py +25 -4
  9. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/overlay.py +4 -3
  10. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/polygon_operations.py +3 -2
  11. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/runners.py +94 -45
  12. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/sfilter.py +4 -2
  13. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/dapla_functions.py +98 -106
  14. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/LICENSE +0 -0
  15. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/README.md +0 -0
  16. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/debug_config.py +0 -0
  17. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/exceptions.py +0 -0
  18. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/__init__.py +0 -0
  19. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/bounds.py +0 -0
  20. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/centerlines.py +0 -0
  21. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/cleaning.py +0 -0
  22. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/conversion.py +0 -0
  23. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/geocoding.py +0 -0
  24. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/geometry_types.py +0 -0
  25. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/point_operations.py +0 -0
  26. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/polygons_as_rings.py +0 -0
  27. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/geopandas_tools/utils.py +0 -0
  28. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/helpers.py +0 -0
  29. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/__init__.py +0 -0
  30. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/_is_dapla.py +0 -0
  31. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/opener.py +0 -0
  32. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/io/read_parquet.py +0 -0
  33. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/__init__.py +0 -0
  34. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/examine.py +0 -0
  35. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/explore.py +0 -0
  36. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/httpserver.py +0 -0
  37. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/legend.py +0 -0
  38. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/map.py +0 -0
  39. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/maps.py +0 -0
  40. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/norge_i_bilder.json +0 -0
  41. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/thematicmap.py +0 -0
  42. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/tilesources.py +0 -0
  43. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/maps/wms.py +0 -0
  44. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/__init__.py +0 -0
  45. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_get_route.py +0 -0
  46. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_od_cost_matrix.py +0 -0
  47. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_points.py +0 -0
  48. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/_service_area.py +0 -0
  49. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/closing_network_holes.py +0 -0
  50. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/cutting_lines.py +0 -0
  51. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/directednetwork.py +0 -0
  52. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/finding_isolated_networks.py +0 -0
  53. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/network.py +0 -0
  54. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/networkanalysis.py +0 -0
  55. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/networkanalysisrules.py +0 -0
  56. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/nodes.py +0 -0
  57. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/networkanalysis/traveling_salesman.py +0 -0
  58. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/parallel/__init__.py +0 -0
  59. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/parallel/parallel.py +0 -0
  60. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/py.typed +0 -0
  61. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/__init__.py +0 -0
  62. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/base.py +0 -0
  63. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/image_collection.py +0 -0
  64. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/indices.py +0 -0
  65. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/regex.py +0 -0
  66. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/sentinel_config.py +0 -0
  67. {ssb_sgis-1.2.0 → ssb_sgis-1.2.2}/src/sgis/raster/zonal.py +0 -0
--- ssb_sgis-1.2.0/PKG-INFO
+++ ssb_sgis-1.2.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ssb-sgis
-Version: 1.2.0
+Version: 1.2.2
 Summary: GIS functions used at Statistics Norway.
 Home-page: https://github.com/statisticsnorway/ssb-sgis
 License: MIT
--- ssb_sgis-1.2.0/pyproject.toml
+++ ssb_sgis-1.2.2/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-sgis"
-version = "1.2.0"
+version = "1.2.2"
 description = "GIS functions used at Statistics Norway."
 authors = ["Morten Letnes <morten.letnes@ssb.no>"]
 license = "MIT"
--- ssb_sgis-1.2.0/src/sgis/__init__.py
+++ ssb_sgis-1.2.2/src/sgis/__init__.py
@@ -42,6 +42,7 @@ from .geopandas_tools.general import make_lines_between_points
 from .geopandas_tools.general import points_in_bounds
 from .geopandas_tools.general import random_points
 from .geopandas_tools.general import random_points_in_polygons
+from .geopandas_tools.general import random_points_norway
 from .geopandas_tools.general import sort_large_first
 from .geopandas_tools.general import sort_long_first
 from .geopandas_tools.general import sort_short_first
--- ssb_sgis-1.2.0/src/sgis/conf.py
+++ ssb_sgis-1.2.2/src/sgis/conf.py
@@ -1,4 +1,3 @@
-from collections.abc import Iterable
 from typing import Any
 
 try:
@@ -74,51 +73,18 @@ from .geopandas_tools.runners import RTreeQueryRunner
 from .geopandas_tools.runners import UnionRunner
 
 
-class Config:
-    """Dictlike config with a 'get_instance' method."""
+def _get_instance(data: dict, key: str, **kwargs) -> Any:
+    """Get the dict value and call it if callable."""
+    x = data[key]
+    if callable(x):
+        return x(**kwargs)
+    return x
 
-    def __init__(self, data: dict) -> None:
-        """Initialise with dict."""
-        self.data = data
 
-    def get_instance(self, key: str, *args, **kwargs) -> Any:
-        """Get the dict value and call it if callable."""
-        x = self.data[key]
-        if callable(x):
-            return x(*args, **kwargs)
-        return x
-
-    def __getattr__(self, attr: str) -> Any:
-        """Get dict attribute."""
-        return getattr(self.data, attr)
-
-    def __getitem__(self, key: str) -> Any:
-        """Get dict value."""
-        return self.data[key]
-
-    def __setitem__(self, key: str, value) -> None:
-        """Set dict value."""
-        self.data[key] = value
-
-    def __iter__(self) -> Iterable[str]:
-        """Iterate over dict keys."""
-        return iter(self.data)
-
-    def __len__(self) -> int:
-        """Length of dict."""
-        return len(self.data)
-
-    def __str__(self) -> str:
-        """String representation of dict."""
-        return str(self.data)
-
-
-config = Config(
-    {
-        "n_jobs": 1,
-        "file_system": file_system,
-        "rtree_runner": RTreeQueryRunner,
-        "overlay_runner": OverlayRunner,
-        "union_runner": UnionRunner,
-    }
-)
+config = {
+    "n_jobs": 1,
+    "file_system": file_system,
+    "rtree_runner": RTreeQueryRunner,
+    "overlay_runner": OverlayRunner,
+    "union_runner": UnionRunner,
+}
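Note: the config is now a plain dict, and callables stored in it are instantiated on demand by the module-level `_get_instance` helper. A minimal usage sketch of the new pattern, using only names visible in this diff:

    from sgis.conf import _get_instance
    from sgis.conf import config
    from sgis.geopandas_tools.runners import RTreeQueryRunner

    config["n_jobs"] = 4  # plain dict mutation; the old Config wrapper is gone
    # "rtree_runner" stores a class (a callable), so _get_instance instantiates it:
    runner = _get_instance(config, "rtree_runner", n_jobs=config["n_jobs"])
    assert isinstance(runner, RTreeQueryRunner)
    # non-callable values are returned unchanged:
    assert _get_instance(config, "n_jobs") == 4

Also note that `_get_instance` drops the old `*args` support, which is why every call site in this diff changes from `config.get_instance("union_runner", n_jobs)` to the keyword form `_get_instance(config, "union_runner", n_jobs=n_jobs)`.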
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/buffer_dissolve_explode.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/buffer_dissolve_explode.py
@@ -23,6 +23,7 @@ from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from shapely import get_num_geometries
 
+from ..conf import _get_instance
 from ..conf import config
 from ..parallel.parallel import Parallel
 from .geometry_types import make_all_singlepart
@@ -197,7 +198,7 @@ def _dissolve(
         return gdf
 
     if union_runner is None:
-        union_runner = config.get_instance("union_runner", n_jobs)
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
 
     geom_col = gdf.geometry.name
     by = dissolve_kwargs.pop("by", None)
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/duplicates.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/duplicates.py
@@ -8,6 +8,7 @@ from shapely import STRtree
 from shapely import difference
 from shapely import simplify
 
+from ..conf import _get_instance
 from ..conf import config
 from .general import _determine_geom_type_args
 from .general import _push_geom_col
@@ -106,11 +107,11 @@ def update_geometries(
         return gdf
 
     if rtree_runner is None:
-        rtree_runner = config.get_instance("rtree_runner", n_jobs)
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
    if union_runner is None:
-        union_runner = config.get_instance("union_runner", n_jobs)
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
    if overlay_runner is None:
-        overlay_runner = config.get_instance("overlay_runner", n_jobs)
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     if geom_type == "polygon" or get_geom_type(gdf) == "polygon":
         gdf.geometry = gdf.buffer(0)
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/general.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/general.py
@@ -21,7 +21,6 @@ from shapely import get_coordinates
 from shapely import get_parts
 from shapely import linestrings
 from shapely import make_valid
-from shapely import points as shapely_points
 from shapely.geometry import LineString
 from shapely.geometry import MultiPoint
 from shapely.geometry import Point
@@ -34,6 +33,7 @@ from .geometry_types import get_geom_type
 from .geometry_types import make_all_singlepart
 from .geometry_types import to_single_geom_type
 from .neighbors import get_k_nearest_neighbors
+from .sfilter import sfilter
 from .sfilter import sfilter_split
 
 
@@ -416,7 +416,7 @@ def random_points(n: int, loc: float | int = 0.5) -> GeoDataFrame:
     """Creates a GeoDataFrame with n random points.
 
     Args:
-        n: Number of points/rows to create.
+        n: Number of points to create.
         loc: Mean ('centre') of the distribution.
 
     Returns:
@@ -459,47 +459,78 @@ def random_points(n: int, loc: float | int = 0.5) -> GeoDataFrame:
    9999 POINT (134.503 168.155)
    [10000 rows x 1 columns]
    """
-    if isinstance(n, (str, float)):
-        n = int(n)
-
     x = np.random.rand(n) * float(loc) * 2
     y = np.random.rand(n) * float(loc) * 2
+    return GeoDataFrame(shapely.points(x, y=y), columns=["geometry"])
 
-    return GeoDataFrame(
-        (Point(x, y) for x, y in zip(x, y, strict=True)), columns=["geometry"]
-    )
 
+def random_points_norway(size: int, *, seed: int | None = None) -> GeoDataFrame:
+    """Creates a GeoDataFrame with crs=25833 n random points aprox. within the borders of mainland Norway.
+
+    Args:
+        size: Number of points to create.
+        seed: Optional random seed.
 
-def random_points_in_polygons(gdf: GeoDataFrame, n: int, seed=None) -> GeoDataFrame:
+    Returns:
+        A GeoDataFrame of points with n rows.
+    """
+    return random_points_in_polygons(
+        [
+            shapely.wkt.loads(x)
+            for x in [
+                "POLYGON ((546192 7586393, 546191 7586393, 526598 7592425, 526597 7592425, 526596 7592425, 526595 7592426, 526594 7592426, 525831 7593004, 525830 7593005, 525327 7593495, 525326 7593496, 525326 7593497, 525325 7593498, 525325 7593499, 525324 7593500, 525192 7594183, 525192 7594184, 524157 7606517, 524157 7606518, 524157 7606519, 524157 7606520, 524157 7606521, 526235 7613535, 526236 7613536, 559423 7676952, 559424 7676953, 559511 7677088, 579978 7708379, 636963 7792940, 636963 7792941, 636964 7792942, 636965 7792943, 641013 7795664, 823514 7912323, 823515 7912323, 823516 7912323, 882519 7931958, 882520 7931959, 882521 7931959, 953896 7939985, 953897 7939985, 973544 7939988, 973545 7939988, 973546 7939988, 975510 7939467, 1051029 7913762, 1051030 7913762, 1055067 7912225, 1055068 7912224, 1056725 7911491, 1098379 7890321, 1098380 7890320, 1098381 7890320, 1099197 7889670, 1099198 7889669, 1099442 7889429, 1099443 7889429, 1099444 7889428, 1099444 7889427, 1099445 7889426, 1099445 7889425, 1099445 7889424, 1099446 7889423, 1114954 7799458, 1115106 7797736, 1115106 7797735, 1115106 7797734, 1115106 7797733, 1115106 7797732, 1115105 7797731, 1115105 7797730, 1114774 7797199, 1112876 7794451, 1057595 7720320, 1057112 7719702, 1057112 7719701, 1057111 7719701, 1057110 7719700, 1057109 7719699, 902599 7637176, 902598 7637176, 902597 7637175, 902596 7637175, 702394 7590633, 702393 7590633, 702392 7590633, 546193 7586393, 546192 7586393))",
+                "POLYGON ((60672 6448410, 60671 6448411, 57185 6448783, 39229 6451077, 39228 6451077, 39227 6451077, 27839 6454916, 27838 6454916, 27808 6454929, 27807 6454929, 8939 6465625, 8938 6465626, 7449 6466699, 7448 6466700, 6876 6467215, 6876 6467216, -31966 6512038, -31968 6512040, -32554 6512779, -32554 6512780, -40259 6524877, -42041 6527698, -42217 6528008, -42546 6528677, -42547 6528678, -77251 6614452, -77252 6614453, -77252 6614454, -77252 6614455, -77252 6614456, -77206 6615751, -77206 6615752, -65669 6811422, -65669 6811423, -65608 6812139, -65608 6812140, -65608 6812141, -50907 6879624, -50907 6879625, -50907 6879626, -50906 6879627, -50889 6879658, -50889 6879659, -16217 6934790, -16217 6934791, -16216 6934792, -2958 6949589, -2957 6949590, 55128 6995098, 144915 7064393, 144915 7064394, 144916 7064395, 144958 7064418, 144959 7064418, 144960 7064418, 144961 7064419, 144962 7064419, 144963 7064419, 150493 7064408, 150494 7064408, 150495 7064408, 150770 7064370, 150771 7064370, 150772 7064370, 188559 7048106, 188560 7048105, 188664 7048054, 188665 7048054, 188666 7048053, 357806 6914084, 357807 6914083, 357808 6914082, 357809 6914081, 357809 6914080, 357810 6914079, 357810 6914078, 359829 6906908, 386160 6804356, 386160 6804355, 386160 6804354, 386160 6804353, 386160 6804352, 386160 6804351, 368140 6699014, 368140 6699013, 363725 6675483, 363725 6675482, 361041 6665071, 361040 6665070, 361040 6665069, 308721 6537573, 308720 6537572, 307187 6534433, 307187 6534432, 307186 6534431, 307185 6534430, 307184 6534429, 307183 6534429, 307182 6534428, 303562 6532881, 300420 6531558, 99437 6459510, 99436 6459510, 67654 6449332, 65417 6448682, 65416 6448682, 65415 6448682, 60673 6448410, 60672 6448410))",
+                "POLYGON ((219870 6914350, 219869 6914350, 219868 6914351, 219867 6914351, 194827 6928565, 194826 6928566, 193100 6929790, 193099 6929790, 193098 6929791, 193098 6929792, 193097 6929793, 157353 7006877, 157353 7006878, 154402 7017846, 154402 7017847, 154392 7017923, 154392 7017924, 154392 7017925, 154392 7017926, 166616 7077346, 166617 7077347, 169164 7087256, 169165 7087257, 170277 7089848, 173146 7096147, 173147 7096148, 174684 7098179, 174685 7098180, 314514 7253805, 314515 7253805, 314515 7253806, 314516 7253806, 314517 7253807, 314518 7253807, 314519 7253808, 314520 7253808, 314521 7253808, 314522 7253808, 314523 7253808, 314524 7253808, 332374.8847495829 7250200.016409928, 327615 7280207, 327615 7280208, 327615 7280209, 327615 7280210, 328471 7285637, 364549 7480637, 364549 7480638, 367030 7488919, 367030 7488920, 367045 7488948, 367045 7488949, 367046 7488950, 419493 7560257, 472291 7626092, 506326 7665544, 506327 7665545, 506328 7665546, 541847 7692387, 541848 7692388, 541849 7692388, 541850 7692389, 541851 7692389, 541852 7692389, 545852 7692619, 546265 7692617, 546266 7692617, 546267 7692617, 546268 7692617, 546269 7692616, 546270 7692616, 546270 7692615, 546271 7692615, 546272 7692614, 623027 7613734, 623028 7613733, 623029 7613732, 627609 7605928, 627610 7605928, 627610 7605927, 627610 7605926, 627611 7605925, 627611 7605924, 630573 7568363, 630573 7568362, 630573 7568361, 630573 7568360, 630573 7568359, 628567 7562381, 621356 7542293, 621356 7542292, 468368 7221876.188770507, 468368 7221876, 459071 7119021, 459071 7119020, 459071 7119019, 459070 7119018, 459070 7119017, 454728 7109371, 451784 7102984, 449525 7098307, 357809 6914071, 357808 6914070, 357808 6914069, 357807 6914068, 357806 6914068, 357806 6914067, 357805 6914067, 357804 6914066, 353158 6912240, 353157 6912239, 353156 6912239, 351669 6911974, 351668 6911974, 351667 6911974, 219871 6914350, 219870 6914350))",
+            ]
+        ],
+        size=size,
+        crs=25833,
+        seed=seed,
+    ).sample(size)
+
+
+def random_points_in_polygons(
+    polygons: Geometry | GeoDataFrame | GeoSeries,
+    size: int,
+    *,
+    seed: int | None = None,
+    crs: Any = 25833,
+) -> GeoDataFrame:
     """Creates a GeoDataFrame with n random points within the geometries of 'gdf'.
 
     Args:
-        gdf: A GeoDataFrame.
-        n: Number of points/rows to create.
-        seed: Optional random seet.
+        polygons: A GeoDataFrame or GeoSeries of polygons. Or a single polygon.
+        size: Number of points to create.
+        seed: Optional random seed.
+        crs: Optional crs of the output GeoDataFrame if input is shapely.Geometry.
 
     Returns:
         A GeoDataFrame of points with n rows.
     """
-    all_points = []
-
+    if crs is None:
+        try:
+            crs = polygons.crs
+        except AttributeError:
+            pass
     rng = np.random.default_rng(seed)
-
-    for i, geom in enumerate(gdf.geometry):
-        minx, miny, maxx, maxy = geom.bounds
-
-        xs = rng.uniform(minx, maxx, size=n * 500)
-        ys = rng.uniform(miny, maxy, size=n * 500)
-
-        points = GeoSeries(shapely_points(xs, y=ys), index=[i] * len(xs))
-        all_points.append(points)
-
-    return (
-        pd.concat(all_points)
-        .loc[lambda x: x.intersects(gdf.geometry)]
-        .groupby(level=0)
-        .head(n)
-    )
+    polygons = to_gdf(polygons, crs=crs).geometry
+    bounds = polygons.bounds
+    minx = np.repeat(bounds["minx"].values, size)
+    maxx = np.repeat(bounds["maxx"].values, size)
+    miny = np.repeat(bounds["miny"].values, size)
+    maxy = np.repeat(bounds["maxy"].values, size)
+    index = np.repeat(np.arange(len(polygons)), size)
+    length = len(index)
+    out = []
+    while sum(len(df) for df in out) < size * len(polygons):
+        xs = rng.uniform(low=minx, high=maxx, size=length)
+        ys = rng.uniform(low=miny, high=maxy, size=length)
+        out.append(
+            GeoDataFrame(
+                shapely.points(xs, y=ys), index=index, columns=["geometry"], crs=crs
+            ).pipe(sfilter, polygons)
+        )
    return pd.concat(out).groupby(level=0).sample(size, replace=True).sort_index()
 
 
 def polygons_to_lines(
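Note: `random_points_in_polygons` changes signature here (from `(gdf, n, seed=None)` to `(polygons, size, *, seed=None, crs=25833)`), and the new `random_points_norway` builds on it with three hard-coded WKT polygons covering mainland Norway. A hedged usage sketch, where `municipalities` is a hypothetical polygon GeoDataFrame in EPSG:25833:

    import sgis as sg

    # 1000 points in EPSG:25833, roughly within mainland Norway
    points = sg.random_points_norway(1000, seed=42)

    # five points per polygon; the result is indexed by polygon position (0, 1, ...)
    per_polygon = sg.random_points_in_polygons(municipalities, 5, seed=42)

Because both parameters were renamed, existing callers that passed `gdf=...` or `n=...` as keywords will raise a TypeError under 1.2.2.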
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/neighbors.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/neighbors.py
@@ -17,8 +17,11 @@ from pandas import MultiIndex
 from pandas import Series
 from sklearn.neighbors import NearestNeighbors
 
+from ..conf import _get_instance
+from ..conf import config
 from .conversion import coordinate_array
 from .geometry_types import get_geom_type
+from .runners import RTreeQueryRunner
 
 
 def get_neighbor_indices(
@@ -26,6 +29,8 @@ def get_neighbor_indices(
     neighbors: GeoDataFrame | GeoSeries,
     max_distance: int = 0,
     predicate: str = "intersects",
+    rtree_runner: RTreeQueryRunner | None = None,
+    n_jobs: int | None = None,
 ) -> Series:
     """Creates a pandas Series with the index of 'gdf' and values of 'neighbors'.
 
@@ -41,6 +46,9 @@ def get_neighbor_indices(
         predicate: Spatial predicate to use in sjoin. Defaults to "intersects", meaning
             the geometry itself and geometries within will be considered neighbors if
             they are part of the 'neighbors' GeoDataFrame.
+        rtree_runner: Optionally debug/manipulate the spatial indexing operations.
+            See the 'runners' module for example implementations.
+        n_jobs: Number of workers.
 
     Returns:
         A pandas Series with values of the intersecting 'neighbors' indices.
@@ -103,6 +111,9 @@ def get_neighbor_indices(
     if gdf.crs != neighbors.crs:
         raise ValueError(f"'crs' mismatch. Got {gdf.crs} and {neighbors.crs}")
 
+    if rtree_runner is None:
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
+
     if isinstance(neighbors, GeoSeries):
         neighbors = neighbors.to_frame()
     else:
@@ -119,11 +130,21 @@ def get_neighbor_indices(
 
     if predicate == "nearest":
         max_distance = None if max_distance == 0 else max_distance
-        joined = gdf.sjoin_nearest(neighbors, how="inner", max_distance=max_distance)
+        left, right = rtree_runner.run(
+            gdf.geometry.values,
+            neighbors.geometry.values,
+            method="query_nearest",
+            max_distance=max_distance,
+        )
     else:
-        joined = gdf.sjoin(neighbors, how="inner", predicate=predicate)
-
-    return joined.rename(columns={"index_right": "neighbor_index"})["neighbor_index"]
+        left, right = rtree_runner.run(
+            gdf.geometry.values, neighbors.geometry.values, predicate=predicate
+        )
+    index_mapper1 = {i: x for i, x in enumerate(gdf.index)}
+    left = np.array([index_mapper1[i] for i in left])
+    index_mapper2 = {i: x for i, x in enumerate(neighbors.index)}
+    right = np.array([index_mapper2[i] for i in right])
+    return Series(right, index=left, name="neighbor_index")
 
 
 def get_neighbor_dfs(
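Note: `get_neighbor_indices` no longer goes through `sjoin`/`sjoin_nearest`; it queries the rtree runner directly and maps the positional hits back onto both indexes. A sketch of the new keyword arguments, where `buildings` and `roads` are hypothetical GeoDataFrames sharing a crs:

    from sgis.geopandas_tools.neighbors import get_neighbor_indices
    from sgis.geopandas_tools.runners import RTreeQueryRunner

    neighbor_idx = get_neighbor_indices(
        buildings,
        roads,
        predicate="intersects",
        rtree_runner=RTreeQueryRunner(n_jobs=4),  # else resolved from sgis.conf.config
    )
    # a Series named "neighbor_index": buildings index -> intersecting roads index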
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/overlay.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/overlay.py
@@ -23,6 +23,7 @@ from shapely import is_empty
 from shapely import make_valid
 from shapely import union_all
 
+from ..conf import _get_instance
 from ..conf import config
 from .general import _determine_geom_type_args
 from .general import clean_geoms
@@ -109,11 +110,11 @@ def clean_overlay(
         raise ValueError(f"'crs' mismatch. Got {df1.crs} and {df2.crs}")
 
     if rtree_runner is None:
-        rtree_runner = config.get_instance("rtree_runner", n_jobs)
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
    if union_runner is None:
-        union_runner = config.get_instance("union_runner", n_jobs)
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
    if overlay_runner is None:
-        overlay_runner = config.get_instance("overlay_runner", n_jobs)
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     crs = df1.crs
 
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/polygon_operations.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/polygon_operations.py
@@ -26,6 +26,7 @@ from shapely.errors import GEOSException
 from shapely.geometry import LinearRing
 from shapely.ops import SplitOp
 
+from ..conf import _get_instance
 from ..conf import config
 from ..debug_config import _DEBUG_CONFIG
 from ..debug_config import _try_debug_print
@@ -776,9 +777,9 @@ def _eliminate(
         return gdf
 
     if union_runner is None:
-        union_runner = config.get_instance("union_runner", n_jobs)
+        union_runner = _get_instance(config, "union_runner", n_jobs=n_jobs)
    if overlay_runner is None:
-        overlay_runner = config.get_instance("overlay_runner", n_jobs)
+        overlay_runner = _get_instance(config, "overlay_runner", n_jobs=n_jobs)
 
     gdf["_range_idx_elim"] = range(len(gdf))
 
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/runners.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/runners.py
@@ -10,6 +10,7 @@ import numpy as np
 import pandas as pd
 from geopandas import GeoDataFrame
 from geopandas import GeoSeries
+from shapely import Geometry
 from shapely import STRtree
 from shapely import get_parts
 from shapely import make_valid
@@ -38,6 +39,12 @@ class AbstractRunner(ABC):
     def run(self, *args, **kwargs) -> Any:
         """Abstract run method."""
 
+    def __str__(self) -> str:
+        """String representation."""
+        return (
+            f"{self.__class__.__name__}(n_jobs={self.n_jobs}, backend='{self.backend}')"
+        )
+
 
 @dataclass
 class UnionRunner(AbstractRunner):
@@ -68,10 +75,10 @@ class UnionRunner(AbstractRunner):
         """Run groupby on geometries in parallel (if n_jobs > 1)."""
         # assume geometry column is 'geometry' if input is pandas.Series og pandas.DataFrame
         try:
-            geom_col = df.geometry.name
+            geom_col: str = df.geometry.name
         except AttributeError:
             try:
-                geom_col = df.name
+                geom_col: str | None = df.name
                 if geom_col is None:
                     geom_col = "geometry"
             except AttributeError:
@@ -90,10 +97,10 @@ class UnionRunner(AbstractRunner):
             by = np.zeros(len(df), dtype="int64")
 
         try:
-            # DataFrame
+            # (Geo)DataFrame
             groupby_obj = df.groupby(by, **kwargs)[geom_col]
         except KeyError:
-            # Series
+            # (Geo)Series
             groupby_obj = df.groupby(by, **kwargs)
 
         if self.n_jobs is None or self.n_jobs == 1:
@@ -113,9 +120,24 @@ class UnionRunner(AbstractRunner):
         return agged
 
 
-def _strtree_query(arr1, arr2, **kwargs):
+def _strtree_query(
+    arr1: np.ndarray,
+    arr2: np.ndarray,
+    method: str,
+    indices1: np.ndarray | None = None,
+    indices2: np.ndarray | None = None,
+    **kwargs,
+):
     tree = STRtree(arr2)
-    return tree.query(arr1, **kwargs)
+    func = getattr(tree, method)
+    left, right = func(arr1, **kwargs)
+    if indices1 is not None:
+        index_mapper1 = {i: x for i, x in enumerate(indices1)}
+        left = np.array([index_mapper1[i] for i in left])
+    if indices2 is not None:
+        index_mapper2 = {i: x for i, x in enumerate(indices2)}
+        right = np.array([index_mapper2[i] for i in right])
+    return left, right
 
 
 @dataclass
@@ -138,39 +160,52 @@ class RTreeQueryRunner(AbstractRunner):
     backend: str = "loky"
 
     def run(
-        self, arr1: np.ndarray, arr2: np.ndarray, **kwargs
+        self, arr1: np.ndarray, arr2: np.ndarray, method: str = "query", **kwargs
     ) -> tuple[np.ndarray, np.ndarray]:
         """Run a spatial rtree query and return indices of hits from arr1 and arr2 in a tuple of two arrays."""
-        # if (
-        #     self.n_jobs > 1
-        #     and len(arr1) / self.n_jobs > 1000
-        #     # and len(arr1) / len(arr2) > 3
-        # ):
-        #     chunks = np.array_split(np.arange(len(arr1)), self.n_jobs)
-        #     assert sum(len(x) for x in chunks) == len(arr1)
-        #     with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
-        #         results = parallel(
-        #             joblib.delayed(_strtree_query)(arr1[chunk], arr2, **kwargs)
-        #             for chunk in chunks
-        #         )
-        #     left = np.concatenate([x[0] for x in results])
-        #     right = np.concatenate([x[1] for x in results])
-        #     return left, right
-        # elif (
-        #     self.n_jobs > 1
-        #     and len(arr2) / self.n_jobs > 1000
-        #     and len(arr2) / len(arr1) > 3
-        # ):
-        #     chunks = np.array_split(np.arange(len(arr2)), self.n_jobs)
-        #     with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
-        #         results = parallel(
-        #             joblib.delayed(_strtree_query)(arr1, arr2[chunk], **kwargs)
-        #             for chunk in chunks
-        #         )
-        #     left = np.concatenate([x[0] for x in results])
-        #     right = np.concatenate([x[1] for x in results])
-        #     return left, right
-        return _strtree_query(arr1, arr2, **kwargs)
+        if (
+            (self.n_jobs or 1) > 1
+            and len(arr1) / self.n_jobs > 10_000
+            and len(arr1) / len(arr2)
+        ):
+            chunks = np.array_split(np.arange(len(arr1)), self.n_jobs)
+            assert sum(len(x) for x in chunks) == len(arr1)
+            with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+                results = parallel(
+                    joblib.delayed(_strtree_query)(
+                        arr1[chunk],
+                        arr2,
+                        method=method,
+                        indices1=chunk,
+                        **kwargs,
+                    )
+                    for chunk in chunks
+                )
+            left = np.concatenate([x[0] for x in results])
+            right = np.concatenate([x[1] for x in results])
+            return left, right
+        elif (
+            (self.n_jobs or 1) > 1
+            and len(arr2) / self.n_jobs > 10_000
+            and len(arr2) / len(arr1)
+        ):
+            chunks = np.array_split(np.arange(len(arr2)), self.n_jobs)
+            with joblib.Parallel(self.n_jobs, backend=self.backend) as parallel:
+                results = parallel(
+                    joblib.delayed(_strtree_query)(
+                        arr1,
+                        arr2[chunk],
+                        method=method,
+                        indices2=chunk,
+                        **kwargs,
+                    )
+                    for chunk in chunks
+                )
+            left = np.concatenate([x[0] for x in results])
+            right = np.concatenate([x[1] for x in results])
+            return left, right
+
+        return _strtree_query(arr1, arr2, method=method, **kwargs)
 
 
 @dataclass
@@ -189,8 +224,8 @@ class OverlayRunner(AbstractRunner):
     n_jobs: None = None
     backend: None = None
 
-    @staticmethod
     def run(
+        self,
         func: Callable,
         arr1: np.ndarray,
         arr2: np.ndarray,
@@ -219,7 +254,7 @@ class GridSizeOverlayRunner(OverlayRunner):
 
     n_jobs: int
     backend: str | None
-    grid_sizes: list[float] | None = None
+    grid_sizes: list[float | int] | None = None
 
     def __post_init__(self) -> None:
         """Check that grid_sizes is passed."""
@@ -247,7 +282,9 @@ class GridSizeOverlayRunner(OverlayRunner):
 
         """
         kwargs = dict(
-            grid_size=grid_size, geom_type=geom_type.lower(), grid_sizes=self.grid_sizes
+            grid_size=grid_size,
+            geom_type=geom_type.lower() if geom_type is not None else None,
+            grid_sizes=self.grid_sizes,
         )
         with joblib.Parallel(self.n_jobs, backend="threading") as parallel:
             return parallel(
@@ -256,15 +293,27 @@ class GridSizeOverlayRunner(OverlayRunner):
             )
 
 
-def _run_overlay_rowwise(func, geom1, geom2, grid_size, geom_type, grid_sizes):
+def _fix_gemetry_fast(geom: Geometry, geom_type: str | None) -> Geometry:
+    geom = make_valid(geom)
+    if geom.geom_type == geom_type or geom_type is None:
+        return geom
+    return union_all([g for g in get_parts(geom) if geom_type in g.geom_type])
+
+
+def _run_overlay_rowwise(
+    func: Callable,
+    geom1: Geometry,
+    geom2: Geometry,
+    grid_size: float | int | None,
+    geom_type: str | None,
+    grid_sizes: list[float | int],
+) -> Geometry:
     try:
         return func(geom1, geom2, grid_size=grid_size)
     except GEOSException:
         pass
-    geom1 = get_parts(make_valid(geom1))
-    geom2 = get_parts(make_valid(geom2))
-    geom1 = union_all([g for g in geom1 if pd.notna(g) and geom_type in g.geom_type])
-    geom2 = union_all([g for g in geom2 if pd.notna(g) and geom_type in g.geom_type])
+    geom1 = _fix_gemetry_fast(geom1, geom_type)
+    geom2 = _fix_gemetry_fast(geom2, geom_type)
     try:
         return func(geom1, geom2)
     except GEOSException:
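Note: `RTreeQueryRunner.run` now activates what 1.2.0 left commented out: when one side has over 10,000 geometries per worker, that side is split into `n_jobs` chunks, an STRtree query runs per chunk in joblib workers, and chunk-local hit positions are remapped through `indices1`/`indices2`. A rough sketch of when the parallel path triggers (array sizes are illustrative):

    import numpy as np
    import shapely
    from sgis.geopandas_tools.runners import RTreeQueryRunner

    n = 200_000
    points = shapely.points(np.random.rand(n), np.random.rand(n))
    x = np.random.rand(50)
    y = np.random.rand(50)
    cells = shapely.box(x, y, x + 0.1, y + 0.1)

    runner = RTreeQueryRunner(n_jobs=4)  # len(points)/4 > 10_000 -> chunked joblib run
    left, right = runner.run(points, cells, predicate="intersects")
    # left/right are positional indices into 'points' and 'cells', as with STRtree.query

Also note that the size-ratio guard was loosened relative to the commented-out draft: `len(arr1) / len(arr2)` is now only evaluated for truthiness, where the old comment required a ratio above 3.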
--- ssb_sgis-1.2.0/src/sgis/geopandas_tools/sfilter.py
+++ ssb_sgis-1.2.2/src/sgis/geopandas_tools/sfilter.py
@@ -6,6 +6,8 @@ from geopandas import GeoDataFrame
 from geopandas import GeoSeries
 from shapely import Geometry
 
+from ..conf import _get_instance
+from ..conf import config
 from .conversion import to_gdf
 from .runners import RTreeQueryRunner
 
@@ -17,7 +19,7 @@ def sfilter(
     other: GeoDataFrame | GeoSeries | Geometry,
     predicate: str = "intersects",
     distance: int | float | None = None,
-    n_jobs: int = 1,
+    n_jobs: int | None = None,
     rtree_runner: RTreeQueryRunner | None = None,
 ) -> GeoDataFrame:
     """Filter a GeoDataFrame or GeoSeries by spatial predicate.
@@ -296,7 +298,7 @@ def _get_sfilter_indices(
     original_predicate = predicate
 
     if rtree_runner is None:
-        rtree_runner = RTreeQueryRunner(n_jobs)
+        rtree_runner = _get_instance(config, "rtree_runner", n_jobs=n_jobs)
 
     with warnings.catch_warnings():
         # We don't need to show our own warning here
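Note: `sfilter` now defaults `n_jobs` to None and resolves its runner from `sgis.conf.config` instead of constructing `RTreeQueryRunner(n_jobs)` directly, so a globally configured runner applies here too. A hedged sketch, where `points` and `mask` are hypothetical GeoDataFrames:

    from sgis.conf import config
    from sgis.geopandas_tools.runners import RTreeQueryRunner
    from sgis.geopandas_tools.sfilter import sfilter

    config["rtree_runner"] = RTreeQueryRunner  # the class is instantiated per call
    within_mask = sfilter(points, mask, predicate="within")  # uses the configured runner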
--- ssb_sgis-1.2.0/src/sgis/io/dapla_functions.py
+++ ssb_sgis-1.2.2/src/sgis/io/dapla_functions.py
@@ -110,7 +110,8 @@ def read_geopandas(
        try:
            expression = "".join(next(iter(filters))).replace("==", "=")
            glob_func = _get_glob_func(file_system)
-            paths = glob_func(str(Path(gcs_path) / expression))
+            suffix: str = Path(gcs_path).suffix
+            paths = glob_func(str(Path(gcs_path) / expression / f"*{suffix}"))
            if paths:
                return _read_geopandas_from_iterable(
                    paths,
@@ -128,11 +129,11 @@ def read_geopandas(
        return gpd.GeoDataFrame(
            _read_partitioned_parquet(
                gcs_path,
-                read_func=pq.read_table,
                file_system=file_system,
                mask=mask,
                filters=filters,
                child_paths=child_paths,
+                use_threads=use_threads,
                **kwargs,
            )
        )
@@ -145,7 +146,7 @@ def read_geopandas(
        read_func = gpd.read_file
 
    with file_system.open(gcs_path, mode="rb") as file:
-        return _read_geopandas(
+        return _read_geopandas_single_path(
            file,
            read_func=read_func,
            file_format=file_format,
@@ -163,18 +164,10 @@ def _read_geopandas_from_iterable(
        paths = list(paths.index)
    elif mask is None:
        paths = list(paths)
-    else:
-        if not isinstance(paths, GeoSeries):
-            bounds_series: GeoSeries = get_bounds_series(
-                paths,
-                file_system,
-                use_threads=use_threads,
-                pandas_fallback=pandas_fallback,
-            )
-        else:
-            bounds_series = paths
-        new_bounds_series = sfilter(bounds_series, mask)
-        if not len(new_bounds_series):
+    elif isinstance(paths, GeoSeries):
+        bounds_series = sfilter(paths, mask)
+        if not len(bounds_series):
+            # return GeoDataFrame with correct columns
            if isinstance(kwargs.get("columns"), Iterable):
                cols = {col: [] for col in kwargs["columns"]}
            else:
@@ -186,29 +179,14 @@ def _read_geopandas_from_iterable(
                if file_system.isfile(path):
                    raise ArrowInvalid(e, path) from e
            return GeoDataFrame(cols | {"geometry": []})
-        paths = list(new_bounds_series.index)
+        paths = list(bounds_series.index)
 
-    # recursive read with threads
-    threads = (
-        min(len(paths), int(multiprocessing.cpu_count())) or 1 if use_threads else 1
+    results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+        paths, file_system=file_system, mask=mask, use_threads=use_threads, **kwargs
    )
-    with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
-        dfs: list[GeoDataFrame] = parallel(
-            joblib.delayed(read_geopandas)(
-                x,
-                file_system=file_system,
-                pandas_fallback=pandas_fallback,
-                mask=mask,
-                use_threads=use_threads,
-                **kwargs,
-            )
-            for x in paths
-        )
-
-    if dfs:
-        df = pd.concat(dfs, ignore_index=True)
+    if results:
        try:
-            df = GeoDataFrame(df)
+            return _concat_pyarrow_to_geopandas(results, paths, file_system)
        except Exception as e:
            if not pandas_fallback:
                print(e)
@@ -219,6 +197,49 @@ def _read_geopandas_from_iterable(
        return df
 
 
+def _read_pyarrow_with_treads(
+    paths: list[str | Path | os.PathLike], file_system, use_threads, mask, **kwargs
+) -> list[pyarrow.Table]:
+    read_partial = functools.partial(
+        _read_pyarrow, mask=mask, file_system=file_system, **kwargs
+    )
+    if not use_threads:
+        return [x for x in map(read_partial, paths) if x is not None]
+    with ThreadPoolExecutor() as executor:
+        return [x for x in executor.map(read_partial, paths) if x is not None]
+
+
+def intersects(file, mask, file_system) -> bool:
+    bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
+    return shapely.box(*bbox).intersects(to_shapely(mask))
+
+
+def _read_pyarrow(path: str, file_system, mask=None, **kwargs) -> pyarrow.Table | None:
+    try:
+        with file_system.open(path, "rb") as file:
+            if mask is not None and not intersects(file, mask, file_system):
+                return
+
+            # 'get' instead of 'pop' because dict is mutable
+            schema = kwargs.get("schema", pq.read_schema(file))
+            new_kwargs = {
+                key: value for key, value in kwargs.items() if key != "schema"
+            }
+
+            return pq.read_table(file, schema=schema, **new_kwargs)
+    except ArrowInvalid as e:
+        glob_func = _get_glob_func(file_system)
+        if not len(
+            {
+                x
+                for x in glob_func(str(Path(path) / "**"))
+                if not paths_are_equal(path, x)
+            }
+        ):
+            raise e
+        # allow not being able to read empty directories that are hard to delete in gcs
+
+
 def _get_bounds_parquet(
    path: str | Path, file_system: GCSFileSystem, pandas_fallback: bool = False
 ) -> tuple[list[float], dict] | tuple[None, None]:
@@ -662,10 +683,10 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
    return bool(len(table))
 
 
-def _read_geopandas(
+def _read_geopandas_single_path(
    file,
-    read_func: Callable = gpd.read_parquet,
-    file_format: str = "parquet",
+    read_func: Callable,
+    file_format: str,
    **kwargs,
 ):
    try:
@@ -681,32 +702,29 @@ def _read_geopandas_single_path(
        raise e.__class__(f"{e.__class__.__name__}: {e} for {file}.") from e
 
 
-def _read_pandas(gcs_path: str, **kwargs):
+def _read_pandas(gcs_path: str, use_threads: bool = True, **kwargs):
    file_system = _get_file_system(None, kwargs)
 
    if not isinstance(gcs_path, (str | Path | os.PathLike)):
-        # recursive read with threads
-        threads = (
-            min(len(gcs_path), int(multiprocessing.cpu_count())) or 1
-            if kwargs.get("use_threads")
-            else 1
+        results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+            gcs_path,
+            file_system=file_system,
+            mask=None,
+            use_threads=use_threads,
+            **kwargs,
        )
-        with joblib.Parallel(n_jobs=threads, backend="threading") as parallel:
-            return pd.concat(
-                parallel(
-                    joblib.delayed(_read_pandas)(x, file_system=file_system, **kwargs)
-                    for x in gcs_path
-                )
-            )
+        results = pyarrow.concat_tables(results, promote_options="permissive")
+        return results.to_pandas()
 
    child_paths = get_child_paths(gcs_path, file_system)
    if child_paths:
        return _read_partitioned_parquet(
            gcs_path,
-            read_func=pd.read_parquet,
            file_system=file_system,
            mask=None,
            child_paths=child_paths,
+            use_threads=use_threads,
+            to_geopandas=False,
            **kwargs,
        )
 
@@ -716,11 +734,12 @@ def _read_pandas(gcs_path: str, **kwargs):
 
 
 def _read_partitioned_parquet(
    path: str,
-    read_func: Callable,
    filters=None,
    file_system=None,
    mask=None,
    child_paths: list[str] | None = None,
+    use_threads: bool = True,
+    to_geopandas: bool = True,
    **kwargs,
 ):
    file_system = _get_file_system(file_system, kwargs)
@@ -731,60 +750,22 @@ def _read_partitioned_parquet(
 
    filters = _filters_to_expression(filters)
 
-    def intersects(file, mask) -> bool:
-        bbox, _ = _get_bounds_parquet_from_open_file(file, file_system)
-        return shapely.box(*bbox).intersects(to_shapely(mask))
-
-    def read(child_path: str) -> pyarrow.Table | None:
-        try:
-            with file_system.open(child_path, "rb") as file:
-                if mask is not None and not intersects(file, mask):
-                    return
-
-                # 'get' instead of 'pop' because dict is mutable
-                schema = kwargs.get("schema", pq.read_schema(file))
-                new_kwargs = {
-                    key: value for key, value in kwargs.items() if key != "schema"
-                }
-
-                return read_func(file, schema=schema, filters=filters, **new_kwargs)
-        except ArrowInvalid as e:
-            if not len(
-                {
-                    x
-                    for x in glob_func(str(Path(child_path) / "**"))
-                    if not paths_are_equal(child_path, x)
-                }
-            ):
-                raise e
-            # allow not being able to read hard-to-delete empty directories
+    results: list[pyarrow.Table] = _read_pyarrow_with_treads(
+        (
+            path
+            for path in child_paths
+            if filters is None or expression_match_path(filters, path)
+        ),
+        file_system=file_system,
+        mask=mask,
+        use_threads=use_threads,
+        **kwargs,
+    )
 
-    with ThreadPoolExecutor() as executor:
-        results = [
-            df
-            for df in (
-                executor.map(
-                    read,
-                    (
-                        path
-                        for path in child_paths
-                        if filters is None or expression_match_path(filters, path)
-                    ),
-                )
-            )
-            if df is not None
-        ]
-
-    if results:
-        if all(isinstance(x, DataFrame) for x in results):
-            return pd.concat(results)
-        else:
-            results = pyarrow.concat_tables(
-                results,
-                promote_options="permissive",
-            )
-            geo_metadata = _get_geo_metadata(next(iter(child_paths)), file_system)
-            return _arrow_to_geopandas(results, geo_metadata)
+    if results and to_geopandas:
+        return _concat_pyarrow_to_geopandas(results, child_paths, file_system)
+    elif results:
+        return pyarrow.concat_tables(results, promote_options="permissive").to_pandas()
 
    # add columns to empty DataFrame
    first_path = next(iter(child_paths + [path]))
@@ -794,6 +775,17 @@ def _read_partitioned_parquet(
    return df
 
 
+def _concat_pyarrow_to_geopandas(
+    results: list[pyarrow.Table], paths: list[str], file_system: Any
+):
+    results = pyarrow.concat_tables(
+        results,
+        promote_options="permissive",
+    )
+    geo_metadata = _get_geo_metadata(next(iter(paths)), file_system)
+    return _arrow_to_geopandas(results, geo_metadata)
+
+
 def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
    return Path(path1).parts == Path(path2).parts
 
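Note: the threaded-read plumbing in dapla_functions.py collapses into one helper, `_read_pyarrow_with_treads` (sic), which maps `_read_pyarrow` over paths with a ThreadPoolExecutor and replaces the per-path `read_geopandas`/joblib recursion. A self-contained sketch of the same pattern, assuming an fsspec-style `file_system` and a hypothetical helper name `read_tables`:

    from concurrent.futures import ThreadPoolExecutor

    import pyarrow
    import pyarrow.parquet as pq


    def read_tables(paths, file_system, use_threads=True):
        # one pq.read_table per file; None results (e.g. mask misses) are dropped
        def read_one(path):
            with file_system.open(path, "rb") as file:
                return pq.read_table(file)

        if not use_threads:
            tables = list(map(read_one, paths))
        else:
            with ThreadPoolExecutor() as executor:
                tables = list(executor.map(read_one, paths))
        # one permissive concat at the end, as in the new _read_partitioned_parquet
        return pyarrow.concat_tables(
            [t for t in tables if t is not None], promote_options="permissive"
        )

The design win is that partitioned reads now stay in Arrow until a single `to_pandas()` or geopandas conversion at the end, instead of concatenating per-file (Geo)DataFrames.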