pyogrio-0.12.0-cp314-cp314t-macosx_12_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. pyogrio/.dylibs/libgdal.37.3.11.4.dylib +0 -0
  2. pyogrio/__init__.py +57 -0
  3. pyogrio/_compat.py +54 -0
  4. pyogrio/_env.py +59 -0
  5. pyogrio/_err.cpython-314t-darwin.so +0 -0
  6. pyogrio/_geometry.cpython-314t-darwin.so +0 -0
  7. pyogrio/_io.cpython-314t-darwin.so +0 -0
  8. pyogrio/_ogr.cpython-314t-darwin.so +0 -0
  9. pyogrio/_version.py +21 -0
  10. pyogrio/_vsi.cpython-314t-darwin.so +0 -0
  11. pyogrio/core.py +387 -0
  12. pyogrio/errors.py +25 -0
  13. pyogrio/gdal_data/GDAL-targets-release.cmake +19 -0
  14. pyogrio/gdal_data/GDAL-targets.cmake +106 -0
  15. pyogrio/gdal_data/GDALConfig.cmake +24 -0
  16. pyogrio/gdal_data/GDALConfigVersion.cmake +65 -0
  17. pyogrio/gdal_data/GDALLogoBW.svg +138 -0
  18. pyogrio/gdal_data/GDALLogoColor.svg +126 -0
  19. pyogrio/gdal_data/GDALLogoGS.svg +126 -0
  20. pyogrio/gdal_data/LICENSE.TXT +467 -0
  21. pyogrio/gdal_data/MM_m_idofic.csv +321 -0
  22. pyogrio/gdal_data/copyright +467 -0
  23. pyogrio/gdal_data/cubewerx_extra.wkt +48 -0
  24. pyogrio/gdal_data/default.rsc +0 -0
  25. pyogrio/gdal_data/ecw_cs.wkt +1453 -0
  26. pyogrio/gdal_data/eedaconf.json +23 -0
  27. pyogrio/gdal_data/epsg.wkt +1 -0
  28. pyogrio/gdal_data/esri_StatePlane_extra.wkt +631 -0
  29. pyogrio/gdal_data/gdal_algorithm.schema.json +220 -0
  30. pyogrio/gdal_data/gdalg.schema.json +36 -0
  31. pyogrio/gdal_data/gdalicon.png +0 -0
  32. pyogrio/gdal_data/gdalinfo_output.schema.json +390 -0
  33. pyogrio/gdal_data/gdalmdiminfo_output.schema.json +326 -0
  34. pyogrio/gdal_data/gdaltileindex.xsd +253 -0
  35. pyogrio/gdal_data/gdalvrt.xsd +927 -0
  36. pyogrio/gdal_data/gfs.xsd +246 -0
  37. pyogrio/gdal_data/gml_registry.xml +117 -0
  38. pyogrio/gdal_data/gml_registry.xsd +66 -0
  39. pyogrio/gdal_data/grib2_center.csv +251 -0
  40. pyogrio/gdal_data/grib2_process.csv +102 -0
  41. pyogrio/gdal_data/grib2_subcenter.csv +63 -0
  42. pyogrio/gdal_data/grib2_table_4_2_0_0.csv +261 -0
  43. pyogrio/gdal_data/grib2_table_4_2_0_1.csv +261 -0
  44. pyogrio/gdal_data/grib2_table_4_2_0_13.csv +261 -0
  45. pyogrio/gdal_data/grib2_table_4_2_0_14.csv +261 -0
  46. pyogrio/gdal_data/grib2_table_4_2_0_15.csv +261 -0
  47. pyogrio/gdal_data/grib2_table_4_2_0_16.csv +261 -0
  48. pyogrio/gdal_data/grib2_table_4_2_0_17.csv +11 -0
  49. pyogrio/gdal_data/grib2_table_4_2_0_18.csv +261 -0
  50. pyogrio/gdal_data/grib2_table_4_2_0_19.csv +261 -0
  51. pyogrio/gdal_data/grib2_table_4_2_0_190.csv +261 -0
  52. pyogrio/gdal_data/grib2_table_4_2_0_191.csv +261 -0
  53. pyogrio/gdal_data/grib2_table_4_2_0_2.csv +261 -0
  54. pyogrio/gdal_data/grib2_table_4_2_0_20.csv +261 -0
  55. pyogrio/gdal_data/grib2_table_4_2_0_21.csv +261 -0
  56. pyogrio/gdal_data/grib2_table_4_2_0_3.csv +261 -0
  57. pyogrio/gdal_data/grib2_table_4_2_0_4.csv +261 -0
  58. pyogrio/gdal_data/grib2_table_4_2_0_5.csv +261 -0
  59. pyogrio/gdal_data/grib2_table_4_2_0_6.csv +261 -0
  60. pyogrio/gdal_data/grib2_table_4_2_0_7.csv +261 -0
  61. pyogrio/gdal_data/grib2_table_4_2_10_0.csv +261 -0
  62. pyogrio/gdal_data/grib2_table_4_2_10_1.csv +261 -0
  63. pyogrio/gdal_data/grib2_table_4_2_10_191.csv +261 -0
  64. pyogrio/gdal_data/grib2_table_4_2_10_2.csv +261 -0
  65. pyogrio/gdal_data/grib2_table_4_2_10_3.csv +261 -0
  66. pyogrio/gdal_data/grib2_table_4_2_10_4.csv +261 -0
  67. pyogrio/gdal_data/grib2_table_4_2_1_0.csv +261 -0
  68. pyogrio/gdal_data/grib2_table_4_2_1_1.csv +261 -0
  69. pyogrio/gdal_data/grib2_table_4_2_1_2.csv +261 -0
  70. pyogrio/gdal_data/grib2_table_4_2_20_0.csv +261 -0
  71. pyogrio/gdal_data/grib2_table_4_2_20_1.csv +261 -0
  72. pyogrio/gdal_data/grib2_table_4_2_20_2.csv +261 -0
  73. pyogrio/gdal_data/grib2_table_4_2_2_0.csv +261 -0
  74. pyogrio/gdal_data/grib2_table_4_2_2_3.csv +261 -0
  75. pyogrio/gdal_data/grib2_table_4_2_2_4.csv +261 -0
  76. pyogrio/gdal_data/grib2_table_4_2_2_5.csv +261 -0
  77. pyogrio/gdal_data/grib2_table_4_2_2_6.csv +261 -0
  78. pyogrio/gdal_data/grib2_table_4_2_3_0.csv +261 -0
  79. pyogrio/gdal_data/grib2_table_4_2_3_1.csv +261 -0
  80. pyogrio/gdal_data/grib2_table_4_2_3_2.csv +28 -0
  81. pyogrio/gdal_data/grib2_table_4_2_3_3.csv +8 -0
  82. pyogrio/gdal_data/grib2_table_4_2_3_4.csv +14 -0
  83. pyogrio/gdal_data/grib2_table_4_2_3_5.csv +11 -0
  84. pyogrio/gdal_data/grib2_table_4_2_3_6.csv +11 -0
  85. pyogrio/gdal_data/grib2_table_4_2_4_0.csv +261 -0
  86. pyogrio/gdal_data/grib2_table_4_2_4_1.csv +261 -0
  87. pyogrio/gdal_data/grib2_table_4_2_4_10.csv +261 -0
  88. pyogrio/gdal_data/grib2_table_4_2_4_2.csv +261 -0
  89. pyogrio/gdal_data/grib2_table_4_2_4_3.csv +261 -0
  90. pyogrio/gdal_data/grib2_table_4_2_4_4.csv +261 -0
  91. pyogrio/gdal_data/grib2_table_4_2_4_5.csv +261 -0
  92. pyogrio/gdal_data/grib2_table_4_2_4_6.csv +261 -0
  93. pyogrio/gdal_data/grib2_table_4_2_4_7.csv +261 -0
  94. pyogrio/gdal_data/grib2_table_4_2_4_8.csv +261 -0
  95. pyogrio/gdal_data/grib2_table_4_2_4_9.csv +261 -0
  96. pyogrio/gdal_data/grib2_table_4_2_local_Canada.csv +5 -0
  97. pyogrio/gdal_data/grib2_table_4_2_local_HPC.csv +2 -0
  98. pyogrio/gdal_data/grib2_table_4_2_local_MRMS.csv +175 -0
  99. pyogrio/gdal_data/grib2_table_4_2_local_NCEP.csv +401 -0
  100. pyogrio/gdal_data/grib2_table_4_2_local_NDFD.csv +38 -0
  101. pyogrio/gdal_data/grib2_table_4_2_local_index.csv +7 -0
  102. pyogrio/gdal_data/grib2_table_4_5.csv +261 -0
  103. pyogrio/gdal_data/grib2_table_versions.csv +3 -0
  104. pyogrio/gdal_data/gt_datum.csv +229 -0
  105. pyogrio/gdal_data/gt_ellips.csv +24 -0
  106. pyogrio/gdal_data/header.dxf +1124 -0
  107. pyogrio/gdal_data/inspire_cp_BasicPropertyUnit.gfs +57 -0
  108. pyogrio/gdal_data/inspire_cp_CadastralBoundary.gfs +60 -0
  109. pyogrio/gdal_data/inspire_cp_CadastralParcel.gfs +81 -0
  110. pyogrio/gdal_data/inspire_cp_CadastralZoning.gfs +161 -0
  111. pyogrio/gdal_data/jpfgdgml_AdmArea.gfs +59 -0
  112. pyogrio/gdal_data/jpfgdgml_AdmBdry.gfs +49 -0
  113. pyogrio/gdal_data/jpfgdgml_AdmPt.gfs +59 -0
  114. pyogrio/gdal_data/jpfgdgml_BldA.gfs +54 -0
  115. pyogrio/gdal_data/jpfgdgml_BldL.gfs +54 -0
  116. pyogrio/gdal_data/jpfgdgml_Cntr.gfs +54 -0
  117. pyogrio/gdal_data/jpfgdgml_CommBdry.gfs +49 -0
  118. pyogrio/gdal_data/jpfgdgml_CommPt.gfs +59 -0
  119. pyogrio/gdal_data/jpfgdgml_Cstline.gfs +54 -0
  120. pyogrio/gdal_data/jpfgdgml_ElevPt.gfs +54 -0
  121. pyogrio/gdal_data/jpfgdgml_GCP.gfs +94 -0
  122. pyogrio/gdal_data/jpfgdgml_LeveeEdge.gfs +49 -0
  123. pyogrio/gdal_data/jpfgdgml_RailCL.gfs +54 -0
  124. pyogrio/gdal_data/jpfgdgml_RdASL.gfs +44 -0
  125. pyogrio/gdal_data/jpfgdgml_RdArea.gfs +54 -0
  126. pyogrio/gdal_data/jpfgdgml_RdCompt.gfs +59 -0
  127. pyogrio/gdal_data/jpfgdgml_RdEdg.gfs +59 -0
  128. pyogrio/gdal_data/jpfgdgml_RdMgtBdry.gfs +49 -0
  129. pyogrio/gdal_data/jpfgdgml_RdSgmtA.gfs +59 -0
  130. pyogrio/gdal_data/jpfgdgml_RvrMgtBdry.gfs +49 -0
  131. pyogrio/gdal_data/jpfgdgml_SBAPt.gfs +49 -0
  132. pyogrio/gdal_data/jpfgdgml_SBArea.gfs +54 -0
  133. pyogrio/gdal_data/jpfgdgml_SBBdry.gfs +44 -0
  134. pyogrio/gdal_data/jpfgdgml_WA.gfs +54 -0
  135. pyogrio/gdal_data/jpfgdgml_WL.gfs +54 -0
  136. pyogrio/gdal_data/jpfgdgml_WStrA.gfs +54 -0
  137. pyogrio/gdal_data/jpfgdgml_WStrL.gfs +54 -0
  138. pyogrio/gdal_data/leaflet_template.html +102 -0
  139. pyogrio/gdal_data/nitf_spec.xml +3288 -0
  140. pyogrio/gdal_data/nitf_spec.xsd +171 -0
  141. pyogrio/gdal_data/ogr_fields_override.schema.json +125 -0
  142. pyogrio/gdal_data/ogrinfo_output.schema.json +528 -0
  143. pyogrio/gdal_data/ogrvrt.xsd +528 -0
  144. pyogrio/gdal_data/osmconf.ini +134 -0
  145. pyogrio/gdal_data/ozi_datum.csv +131 -0
  146. pyogrio/gdal_data/ozi_ellips.csv +35 -0
  147. pyogrio/gdal_data/pci_datum.txt +530 -0
  148. pyogrio/gdal_data/pci_ellips.txt +129 -0
  149. pyogrio/gdal_data/pdfcomposition.xsd +703 -0
  150. pyogrio/gdal_data/pds4_template.xml +65 -0
  151. pyogrio/gdal_data/plscenesconf.json +1985 -0
  152. pyogrio/gdal_data/ruian_vf_ob_v1.gfs +1455 -0
  153. pyogrio/gdal_data/ruian_vf_st_uvoh_v1.gfs +86 -0
  154. pyogrio/gdal_data/ruian_vf_st_v1.gfs +1489 -0
  155. pyogrio/gdal_data/ruian_vf_v1.gfs +2126 -0
  156. pyogrio/gdal_data/s57agencies.csv +249 -0
  157. pyogrio/gdal_data/s57attributes.csv +484 -0
  158. pyogrio/gdal_data/s57expectedinput.csv +1008 -0
  159. pyogrio/gdal_data/s57objectclasses.csv +287 -0
  160. pyogrio/gdal_data/seed_2d.dgn +0 -0
  161. pyogrio/gdal_data/seed_3d.dgn +0 -0
  162. pyogrio/gdal_data/stateplane.csv +259 -0
  163. pyogrio/gdal_data/template_tiles.mapml +28 -0
  164. pyogrio/gdal_data/tms_LINZAntarticaMapTileGrid.json +190 -0
  165. pyogrio/gdal_data/tms_MapML_APSTILE.json +268 -0
  166. pyogrio/gdal_data/tms_MapML_CBMTILE.json +346 -0
  167. pyogrio/gdal_data/tms_NZTM2000.json +243 -0
  168. pyogrio/gdal_data/trailer.dxf +434 -0
  169. pyogrio/gdal_data/usage +4 -0
  170. pyogrio/gdal_data/vcpkg-cmake-wrapper.cmake +23 -0
  171. pyogrio/gdal_data/vcpkg.spdx.json +291 -0
  172. pyogrio/gdal_data/vcpkg_abi_info.txt +45 -0
  173. pyogrio/gdal_data/vdv452.xml +349 -0
  174. pyogrio/gdal_data/vdv452.xsd +45 -0
  175. pyogrio/gdal_data/vicar.json +164 -0
  176. pyogrio/geopandas.py +978 -0
  177. pyogrio/proj_data/CH +22 -0
  178. pyogrio/proj_data/GL27 +23 -0
  179. pyogrio/proj_data/ITRF2000 +24 -0
  180. pyogrio/proj_data/ITRF2008 +94 -0
  181. pyogrio/proj_data/ITRF2014 +55 -0
  182. pyogrio/proj_data/ITRF2020 +91 -0
  183. pyogrio/proj_data/copyright +34 -0
  184. pyogrio/proj_data/deformation_model.schema.json +582 -0
  185. pyogrio/proj_data/nad.lst +142 -0
  186. pyogrio/proj_data/nad27 +810 -0
  187. pyogrio/proj_data/nad83 +745 -0
  188. pyogrio/proj_data/other.extra +53 -0
  189. pyogrio/proj_data/proj-config-version.cmake +44 -0
  190. pyogrio/proj_data/proj-config.cmake +79 -0
  191. pyogrio/proj_data/proj-targets-release.cmake +19 -0
  192. pyogrio/proj_data/proj-targets.cmake +107 -0
  193. pyogrio/proj_data/proj.db +0 -0
  194. pyogrio/proj_data/proj.ini +59 -0
  195. pyogrio/proj_data/proj4-targets-release.cmake +19 -0
  196. pyogrio/proj_data/proj4-targets.cmake +107 -0
  197. pyogrio/proj_data/projjson.schema.json +1174 -0
  198. pyogrio/proj_data/triangulation.schema.json +214 -0
  199. pyogrio/proj_data/usage +9 -0
  200. pyogrio/proj_data/vcpkg.spdx.json +203 -0
  201. pyogrio/proj_data/vcpkg_abi_info.txt +28 -0
  202. pyogrio/proj_data/world +214 -0
  203. pyogrio/raw.py +897 -0
  204. pyogrio/tests/__init__.py +0 -0
  205. pyogrio/tests/conftest.py +588 -0
  206. pyogrio/tests/fixtures/README.md +108 -0
  207. pyogrio/tests/fixtures/curve.gpkg +0 -0
  208. pyogrio/tests/fixtures/curvepolygon.gpkg +0 -0
  209. pyogrio/tests/fixtures/line_zm.gpkg +0 -0
  210. pyogrio/tests/fixtures/list_field_values_file.parquet +0 -0
  211. pyogrio/tests/fixtures/list_nested_struct_file.parquet +0 -0
  212. pyogrio/tests/fixtures/multisurface.gpkg +0 -0
  213. pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.cpg +1 -0
  214. pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.dbf +0 -0
  215. pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.prj +1 -0
  216. pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp +0 -0
  217. pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shx +0 -0
  218. pyogrio/tests/fixtures/sample.osm.pbf +0 -0
  219. pyogrio/tests/fixtures/test_gpkg_nulls.gpkg +0 -0
  220. pyogrio/tests/test_arrow.py +1160 -0
  221. pyogrio/tests/test_core.py +702 -0
  222. pyogrio/tests/test_geopandas_io.py +3218 -0
  223. pyogrio/tests/test_path.py +374 -0
  224. pyogrio/tests/test_raw_io.py +1473 -0
  225. pyogrio/tests/test_util.py +56 -0
  226. pyogrio/util.py +258 -0
  227. pyogrio-0.12.0.dist-info/METADATA +125 -0
  228. pyogrio-0.12.0.dist-info/RECORD +231 -0
  229. pyogrio-0.12.0.dist-info/WHEEL +6 -0
  230. pyogrio-0.12.0.dist-info/licenses/LICENSE +21 -0
  231. pyogrio-0.12.0.dist-info/top_level.txt +1 -0
pyogrio/geopandas.py ADDED
@@ -0,0 +1,978 @@
+"""Functions for reading and writing GeoPandas dataframes."""
+
+import json
+import os
+import warnings
+from datetime import datetime
+
+import numpy as np
+
+from pyogrio._compat import (
+    HAS_GEOPANDAS,
+    HAS_PYARROW,
+    PANDAS_GE_15,
+    PANDAS_GE_20,
+    PANDAS_GE_22,
+    PANDAS_GE_30,
+    PYARROW_GE_19,
+    __gdal_version__,
+)
+from pyogrio.errors import DataSourceError
+from pyogrio.raw import (
+    DRIVERS_NO_MIXED_DIMENSIONS,
+    DRIVERS_NO_MIXED_SINGLE_MULTI,
+    _get_write_path_driver,
+    read,
+    read_arrow,
+    write,
+)
+
+
+def _stringify_path(path):
+    """Convert path-like to a string if possible, pass through other objects."""
+    if isinstance(path, str):
+        return path
+
+    # checking whether path implements the filesystem protocol
+    if hasattr(path, "__fspath__"):
+        return path.__fspath__()
+
+    # pass through other objects
+    return path
+
+
+def _try_parse_datetime(ser, datetime_as_string: bool, mixed_offsets_as_utc: bool):
+    import pandas as pd  # only called when pandas is known to be installed
+    from pandas.api.types import is_string_dtype
+
+    datetime_kwargs = {}
+    if datetime_as_string:
+        if not is_string_dtype(ser.dtype):
+            # Returning datetimes as strings via arrow is only available for
+            # GDAL >= 3.11, so convert to string here if needed.
+            res = ser.astype("str")
+            if not PANDAS_GE_30:
+                # astype("str") also stringifies missing values in pandas < 3
+                res[ser.isna()] = None
+            res = res.str.replace(" ", "T")
+            return res
+        if __gdal_version__ < (3, 7, 0):
+            # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
+            return ser.str.replace(" ", "T").str.replace("/", "-")
+        return ser
+
+    if PANDAS_GE_22:
+        datetime_kwargs["format"] = "ISO8601"
+    elif PANDAS_GE_20:
+        datetime_kwargs["format"] = "ISO8601"
+        datetime_kwargs["errors"] = "ignore"
+    else:
+        datetime_kwargs["yearfirst"] = True
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            ".*parsing datetimes with mixed time zones will raise.*",
+            FutureWarning,
+        )
+
+        warning = "Error parsing datetimes, original strings are returned: {message}"
+        try:
+            res = pd.to_datetime(ser, **datetime_kwargs)
+
+            # With pandas >= 2 and < 3, mixed time zones were returned as pandas
+            # Timestamps, so convert them to datetime objects.
+            if not mixed_offsets_as_utc and PANDAS_GE_20 and res.dtype == "object":
+                res = res.map(lambda x: x.to_pydatetime(), na_action="ignore")
+
+        except Exception as ex:
+            if isinstance(ex, ValueError) and "Mixed timezones detected" in str(ex):
+                # Parsing mixed time zones with to_datetime is not supported
+                # anymore in pandas >= 3.0, leading to a ValueError.
+                if mixed_offsets_as_utc:
+                    # Convert mixed time zone datetimes to UTC.
+                    try:
+                        res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
+                    except Exception as ex:
+                        warnings.warn(warning.format(message=str(ex)), stacklevel=3)
+                        return ser
+                else:
+                    # Using map seems to be the fastest way to convert the strings to
+                    # datetime objects.
+                    try:
+                        res = ser.map(datetime.fromisoformat, na_action="ignore")
+                    except Exception as ex:
+                        warnings.warn(warning.format(message=str(ex)), stacklevel=3)
+                        return ser
+
+            else:
+                # If the error is not related to mixed time zones, log it and return
+                # the original series.
+                warnings.warn(warning.format(message=str(ex)), stacklevel=3)
+                if __gdal_version__ < (3, 7, 0):
+                    # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
+                    return ser.str.replace(" ", "T").str.replace("/", "-")
+
+                return ser
+
+    # For pandas < 3.0, to_datetime converted mixed time zone data to datetime objects.
+    # With mixed_offsets_as_utc, they should be converted to UTC though...
+    if mixed_offsets_as_utc and res.dtype in ("object", "string"):
+        try:
+            res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
+        except Exception as ex:
+            warnings.warn(warning.format(message=str(ex)), stacklevel=3)
+
+    if res.dtype.kind == "M":  # any datetime64
+        # GDAL only supports ms precision, convert outputs to match.
+        # Pandas >= 2.0 supports datetime64[ms] directly; prior versions only
+        # support [ns], so round the values to [ms] precision instead.
+        if PANDAS_GE_20:
+            res = res.dt.as_unit("ms")
+        else:
+            res = res.dt.round(freq="ms")
+
+    return res
+
+
+def read_dataframe(
+    path_or_buffer,
+    /,
+    layer=None,
+    encoding=None,
+    columns=None,
+    read_geometry=True,
+    force_2d=False,
+    skip_features=0,
+    max_features=None,
+    where=None,
+    bbox=None,
+    mask=None,
+    fids=None,
+    sql=None,
+    sql_dialect=None,
+    fid_as_index=False,
+    use_arrow=None,
+    on_invalid="raise",
+    arrow_to_pandas_kwargs=None,
+    datetime_as_string=False,
+    mixed_offsets_as_utc=True,
+    **kwargs,
+):
+    """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
+
+    If the data source does not have a geometry column or ``read_geometry`` is False,
+    a DataFrame will be returned.
+
+    If you read data with datetime columns containing time zone information, check out
+    the notes below.
+
+    Requires ``geopandas`` >= 0.8.
+
+    Parameters
+    ----------
+    path_or_buffer : pathlib.Path or str, or bytes buffer
+        A dataset path or URI, raw buffer, or file-like object with a read method.
+    layer : int or str, optional (default: first layer)
+        If an integer is provided, it corresponds to the index of the layer
+        within the data source. If a string is provided, it must match the name
+        of the layer in the data source. Defaults to first layer in data source.
+    encoding : str, optional (default: None)
+        If present, will be used as the encoding for reading string values from
+        the data source. By default will automatically try to detect the native
+        encoding and decode to ``UTF-8``.
+    columns : list-like, optional (default: all columns)
+        List of column names to import from the data source. Column names must
+        exactly match the names in the data source, and will be returned in
+        the order they occur in the data source. To avoid reading any columns,
+        pass an empty list-like. If combined with the ``where`` parameter, must
+        include columns referenced in the ``where`` expression or the data may
+        not be correctly read; the data source may return empty results or
+        raise an exception (behavior varies by driver).
+    read_geometry : bool, optional (default: True)
+        If True, will read geometry into a GeoSeries. If False, a Pandas DataFrame
+        will be returned instead.
+    force_2d : bool, optional (default: False)
+        If the geometry has Z values, setting this to True will cause those to
+        be ignored and 2D geometries to be returned.
+    skip_features : int, optional (default: 0)
+        Number of features to skip from the beginning of the file before
+        returning features. If greater than the available number of features, an
+        empty DataFrame will be returned. Using this parameter may incur
+        significant overhead if the driver does not support the capability to
+        randomly seek to a specific feature, because it will need to iterate
+        over all prior features.
+    max_features : int, optional (default: None)
+        Number of features to read from the file.
+    where : str, optional (default: None)
+        Where clause to filter features in layer by attribute values. If the data
+        source natively supports SQL, its specific SQL dialect should be used (e.g.
+        SQLite and GeoPackage: `SQLITE`_, PostgreSQL). If it doesn't, the
+        `OGRSQL WHERE`_ syntax should be used. Note that it is not possible to
+        overrule the SQL dialect here; that is only possible with the ``sql`` parameter.
+        Examples: ``"ISO_A3 = 'CAN'"``, ``"POP_EST > 10000000 AND POP_EST < 100000000"``
+    bbox : tuple of (xmin, ymin, xmax, ymax), optional (default: None)
+        If present, will be used to filter records whose geometry intersects this
+        box. This must be in the same CRS as the dataset. If GEOS is present
+        and used by GDAL, only geometries that intersect this bbox will be
+        returned; if GEOS is not available or not used by GDAL, all geometries
+        with bounding boxes that intersect this bbox will be returned.
+        Cannot be combined with the ``mask`` keyword.
+    mask : Shapely geometry, optional (default: None)
+        If present, will be used to filter records whose geometry intersects
+        this geometry. This must be in the same CRS as the dataset. If GEOS is
+        present and used by GDAL, only geometries that intersect this geometry
+        will be returned; if GEOS is not available or not used by GDAL, all
+        geometries with bounding boxes that intersect the bounding box of this
+        geometry will be returned. Requires Shapely >= 2.0.
+        Cannot be combined with the ``bbox`` keyword.
+    fids : array-like, optional (default: None)
+        Array of integer feature id (FID) values to select. Cannot be combined
+        with other keywords to select a subset (``skip_features``,
+        ``max_features``, ``where``, ``bbox``, ``mask``, or ``sql``). Note that
+        the starting index is driver and file specific (e.g. typically 0 for
+        Shapefile and 1 for GeoPackage, but can still depend on the specific
+        file). The performance of reading a large number of features using FIDs
+        is also driver specific and depends on the value of ``use_arrow``. The order
+        of the rows returned is undefined. If you would like to sort based on FID, use
+        ``fid_as_index=True`` to have the index of the GeoDataFrame returned set to the
+        FIDs of the features read. If ``use_arrow=True``, the number of FIDs is limited
+        to 4997 for drivers with 'OGRSQL' as default SQL dialect. To read a larger
+        number of FIDs, set ``use_arrow=False``.
+    sql : str, optional (default: None)
+        The SQL statement to execute. Look at the ``sql_dialect`` parameter for more
+        information on the syntax to use for the query. When combined with other
+        keywords like ``columns``, ``skip_features``, ``max_features``,
+        ``where``, ``bbox``, or ``mask``, those are applied after the SQL query.
+        Be aware that this can have an impact on performance (e.g. filtering
+        with the ``bbox`` or ``mask`` keywords may not use spatial indexes).
+        Cannot be combined with the ``layer`` or ``fids`` keywords.
+    sql_dialect : str, optional (default: None)
+        The SQL dialect the SQL statement is written in. Possible values:
+
+        - **None**: if the data source natively supports SQL, its specific SQL dialect
+          will be used by default (e.g. SQLite and GeoPackage: `SQLITE`_, PostgreSQL).
+          If the data source doesn't natively support SQL, the `OGRSQL`_ dialect is
+          the default.
+        - '`OGRSQL`_': can be used on any data source. Performance can suffer
+          when used on data sources with native support for SQL.
+        - '`SQLITE`_': can be used on any data source. All spatialite_
+          functions can be used. Performance can suffer on data sources with
+          native support for SQL, except for GeoPackage and SQLite as this is
+          their native SQL dialect.
+
+    fid_as_index : bool, optional (default: False)
+        If True, will use the FIDs of the features that were read as the
+        index of the GeoDataFrame. May start at 0 or 1 depending on the driver.
+    use_arrow : bool, optional (default: False)
+        Whether to use Arrow as the transfer mechanism of the read data
+        from GDAL to Python (requires GDAL >= 3.6 and `pyarrow` to be
+        installed). When enabled, this provides a further speed-up.
+        Defaults to False, but this default can also be globally overridden
+        by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
+    on_invalid : str, optional (default: "raise")
+        The action to take when an invalid geometry is encountered. Possible
+        values:
+
+        - **raise**: an exception will be raised if a WKB input geometry is
+          invalid.
+        - **warn**: invalid WKB geometries will be returned as ``None`` and a
+          warning will be raised.
+        - **ignore**: invalid WKB geometries will be returned as ``None``
+          without a warning.
+        - **fix**: an effort is made to fix invalid input geometries (currently
+          just unclosed rings). If this is not possible, they are returned as
+          ``None`` without a warning. Requires GEOS >= 3.11 and shapely >= 2.1.
+
+    arrow_to_pandas_kwargs : dict, optional (default: None)
+        When ``use_arrow`` is True, these kwargs will be passed to the `to_pandas`_
+        call for the arrow to pandas conversion.
+    datetime_as_string : bool, optional (default: False)
+        If True, will return datetime columns as detected by GDAL as ISO8601
+        strings and ``mixed_offsets_as_utc`` will be ignored.
+    mixed_offsets_as_utc : bool, optional (default: True)
+        By default, datetime columns are read as the pandas datetime64 dtype.
+        This can represent the data as-is in the case that the column contains
+        only naive datetimes (without time zone information), only UTC datetimes,
+        or if all datetimes in the column have the same time zone offset. Note
+        that in time zones with daylight saving time, datetimes will have
+        different offsets throughout the year!
+
+        For columns that don't comply with the above, i.e. columns that contain
+        mixed offsets, the behavior depends on the value of this parameter:
+
+        - If ``True`` (default), such datetimes are converted to UTC. In the case
+          of a mixture of time zone aware and naive datetimes, the naive
+          datetimes are assumed to be in UTC already. Datetime columns returned
+          will always be pandas datetime64.
+        - If ``False``, such datetimes with mixed offsets are returned with
+          those offsets preserved. Because pandas datetime64 columns don't
+          support mixed time zone offsets, such columns are returned as object
+          columns with python datetime values with fixed offsets. If you want
+          to roundtrip datetimes without data loss, this is the recommended
+          option, but you lose the functionality of a datetime64 column.
+
+        If ``datetime_as_string`` is True, this option is ignored.
+
+    **kwargs
+        Additional driver-specific dataset open options passed to OGR. Invalid
+        options will trigger a warning.
+
+    Returns
+    -------
+    GeoDataFrame or DataFrame (if no geometry is present)
+
+    Notes
+    -----
+    When you have datetime columns with time zone information, it is important to
+    note that GDAL only represents time zones as UTC offsets, whilst pandas uses
+    IANA time zones (via `pytz` or `zoneinfo`). As a result, even if a column in a
+    DataFrame contains datetimes in a single time zone, this will often still result
+    in mixed time zone offsets being written for time zones where daylight saving
+    time is used (e.g. +01:00 and +02:00 offsets for time zone Europe/Brussels). When
+    roundtripping through GDAL, the information about the original time zone is
+    lost; only the offsets can be preserved. By default, `pyogrio.read_dataframe()`
+    will convert columns with mixed offsets to UTC to return a datetime64 column. If
+    you want to preserve the original offsets, you can use `datetime_as_string=True`
+    or `mixed_offsets_as_utc=False`.
+
+    .. _OGRSQL:
+
+        https://gdal.org/user/ogr_sql_dialect.html#ogr-sql-dialect
+
+    .. _OGRSQL WHERE:
+
+        https://gdal.org/user/ogr_sql_dialect.html#where
+
+    .. _SQLITE:
+
+        https://gdal.org/user/sql_sqlite_dialect.html#sql-sqlite-dialect
+
+    .. _spatialite:
+
+        https://www.gaia-gis.it/gaia-sins/spatialite-sql-latest.html
+
+    .. _to_pandas:
+
+        https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas
+
+    """
+    if not HAS_GEOPANDAS:
+        raise ImportError("geopandas is required to use pyogrio.read_dataframe()")
+
+    import geopandas as gp
+    import pandas as pd
+
+    import shapely  # if geopandas is present, shapely is expected to be present
+
+    path_or_buffer = _stringify_path(path_or_buffer)
+
+    if use_arrow is None:
+        use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0")))
+
+    read_func = read_arrow if use_arrow else read
+    gdal_force_2d = False if use_arrow else force_2d
+
+    # Always read datetimes as string values to preserve (mixed) time zone info
+    # correctly. If arrow is not used, it is needed because numpy does not
+    # directly support time zones + performance is also a lot better. If arrow
+    # is used, needed because datetime columns don't support mixed time zone
+    # offsets + e.g. for .fgb files time zone info isn't handled correctly even
+    # for unique time zone offsets if datetimes are not read as string.
+    result = read_func(
+        path_or_buffer,
+        layer=layer,
+        encoding=encoding,
+        columns=columns,
+        read_geometry=read_geometry,
+        force_2d=gdal_force_2d,
+        skip_features=skip_features,
+        max_features=max_features,
+        where=where,
+        bbox=bbox,
+        mask=mask,
+        fids=fids,
+        sql=sql,
+        sql_dialect=sql_dialect,
+        return_fids=fid_as_index,
+        datetime_as_string=True,
+        **kwargs,
+    )
+
+    if use_arrow:
+        import pyarrow as pa
+
+        meta, table = result
+
+        # split_blocks and self_destruct decrease memory usage, but have as side effect
+        # that accessing table afterwards causes crash, so del table to avoid.
+        kwargs = {"self_destruct": True}
+        if PANDAS_GE_30:
+            # starting with pyarrow 19.0, pyarrow handles this correctly itself,
+            # so only use types_mapper as a workaround for older versions
+            if not PYARROW_GE_19:
+                kwargs["types_mapper"] = {
+                    pa.string(): pd.StringDtype(na_value=np.nan),
+                    pa.large_string(): pd.StringDtype(na_value=np.nan),
+                    pa.json_(): pd.StringDtype(na_value=np.nan),
+                }.get
+            # TODO enable the below block when upstream issue to accept extension types
+            # is fixed
+            # else:
+            #     # for newer pyarrow, still include mapping for json
+            #     # GDAL 3.11 started to emit this extension type, but pyarrow does not
+            #     # yet support it properly in the conversion to pandas
+            #     kwargs["types_mapper"] = {
+            #         pa.json_(): pd.StringDtype(na_value=np.nan),
+            #     }.get
+        if arrow_to_pandas_kwargs is not None:
+            kwargs.update(arrow_to_pandas_kwargs)
+
+        try:
+            df = table.to_pandas(**kwargs)
+        except UnicodeDecodeError as ex:
+            # Arrow does not support reading data in a non-UTF-8 encoding
+            raise DataSourceError(
+                "The file being read is not encoded in UTF-8; please use_arrow=False"
+            ) from ex
+
+        del table
+
+        # convert datetime columns that were read as string to datetime
+        for dtype, column in zip(meta["dtypes"], meta["fields"]):
+            if dtype is not None and dtype.startswith("datetime"):
+                df[column] = _try_parse_datetime(
+                    df[column], datetime_as_string, mixed_offsets_as_utc
+                )
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+            if ogr_subtype == "OFSTJSON":
+                # When reading .parquet files with arrow, JSON fields are already
+                # parsed, so only parse if strings.
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )
+
+        if fid_as_index:
+            df = df.set_index(meta["fid_column"])
+            df.index.names = ["fid"]
+
+        geometry_name = meta["geometry_name"] or "wkb_geometry"
+        if not fid_as_index and len(df.columns) == 0:
+            # Index not requested, no geometry and no attribute columns: return empty
+            return pd.DataFrame()
+        elif geometry_name in df.columns:
+            wkb_values = df.pop(geometry_name)
+            if PANDAS_GE_15 and wkb_values.dtype != object:
+                if (
+                    HAS_PYARROW
+                    and isinstance(wkb_values.dtype, pd.ArrowDtype)
+                    and isinstance(wkb_values.dtype.pyarrow_dtype, pa.BaseExtensionType)
+                ):
+                    # handle BaseExtensionType(extension<geoarrow.wkb>)
+                    wkb_values = pa.array(wkb_values.array).to_numpy(
+                        zero_copy_only=False
+                    )
+                else:
+                    # for example ArrowDtype will otherwise give numpy array with pd.NA
+                    wkb_values = wkb_values.to_numpy(na_value=None)
+            df["geometry"] = shapely.from_wkb(wkb_values, on_invalid=on_invalid)
+            if force_2d:
+                df["geometry"] = shapely.force_2d(df["geometry"])
+            return gp.GeoDataFrame(df, geometry="geometry", crs=meta["crs"])
+        else:
+            return df
+
+    meta, index, geometry, field_data = result
+
+    columns = meta["fields"].tolist()
+    data = {columns[i]: field_data[i] for i in range(len(columns))}
+    if fid_as_index:
+        index = pd.Index(index, name="fid")
+    else:
+        index = None
+    df = pd.DataFrame(data, columns=columns, index=index)
+    for dtype, c in zip(meta["dtypes"], df.columns):
+        if dtype.startswith("datetime"):
+            df[c] = _try_parse_datetime(df[c], datetime_as_string, mixed_offsets_as_utc)
+    for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
+        if ogr_subtype == "OFSTJSON":
+            dtype = pd.api.types.infer_dtype(df[c])
+            if dtype == "string":
+                try:
+                    df[c] = df[c].map(json.loads, na_action="ignore")
+                except Exception:
+                    warnings.warn(
+                        f"Could not parse column '{c}' as JSON; leaving as string",
+                        stacklevel=2,
+                    )
+
+    if geometry is None or not read_geometry:
+        return df
+
+    geometry = shapely.from_wkb(geometry, on_invalid=on_invalid)
+
+    return gp.GeoDataFrame(df, geometry=geometry, crs=meta["crs"])
+
+
+def write_dataframe(
+    df,
+    path,
+    layer=None,
+    driver=None,
+    encoding=None,
+    geometry_type=None,
+    promote_to_multi=None,
+    nan_as_null=True,
+    append=False,
+    use_arrow=None,
+    dataset_metadata=None,
+    layer_metadata=None,
+    metadata=None,
+    dataset_options=None,
+    layer_options=None,
+    **kwargs,
+):
+    """Write GeoPandas GeoDataFrame to an OGR file format.
+
+    Parameters
+    ----------
+    df : GeoDataFrame or DataFrame
+        The data to write. For attribute columns of the "object" dtype,
+        all values will be converted to strings to be written to the
+        output file, except None and np.nan, which will be set to NULL
+        in the output file.
+    path : str or io.BytesIO
+        path to output file on writeable file system or an io.BytesIO object to
+        allow writing to memory. Will raise NotImplementedError if an open file
+        handle is passed; use BytesIO instead.
+        NOTE: support for writing to memory is limited to specific drivers.
+    layer : str, optional (default: None)
+        layer name to create. If writing to memory and no layer name is
+        provided, the layer name will be set to a UUID4 value.
+    driver : string, optional (default: None)
+        The OGR format driver used to write the vector file. By default attempts
+        to infer driver from path. Must be provided to write to memory.
+    encoding : str, optional (default: None)
+        If present, will be used as the encoding for writing string values to
+        the file. Use with caution; only certain drivers support encodings
+        other than UTF-8.
+    geometry_type : string, optional (default: None)
+        By default, the geometry type of the layer will be inferred from the
+        data, after applying the promote_to_multi logic. If the data only contains a
+        single geometry type (after applying the logic of promote_to_multi), this type
+        is used for the layer. If the data (still) contains mixed geometry types, the
+        output layer geometry type will be set to "Unknown".
+
+        This parameter does not modify the geometry, but it will try to force the layer
+        type of the output file to this value. Use this parameter with caution because
+        using a non-default layer geometry type may result in errors when writing the
+        file, may be ignored by the driver, or may result in invalid files. Possible
+        values are: "Unknown", "Point", "LineString", "Polygon", "MultiPoint",
+        "MultiLineString", "MultiPolygon" or "GeometryCollection".
+    promote_to_multi : bool, optional (default: None)
+        If True, will convert singular geometry types in the data to their
+        corresponding multi geometry type for writing. By default, will convert
+        mixed singular and multi geometry types to multi geometry types for drivers
+        that do not support mixed singular and multi geometry types. If False, geometry
+        types will not be promoted, which may result in errors or invalid files when
+        attempting to write mixed singular and multi geometry types to drivers that do
+        not support such combinations.
+    nan_as_null : bool, default True
+        For floating point columns (float32 / float64), whether NaN values are
+        written as "null" (missing value). Defaults to True because in pandas
+        NaNs are typically used as missing value. Note that when set to False,
+        behaviour is format specific: some formats don't support NaNs by
+        default (e.g. GeoJSON will skip this property) or might treat them as
+        null anyway (e.g. GeoPackage).
+    append : bool, optional (default: False)
+        If True and the data source specified by path already exists and the
+        driver supports appending to an existing data source, the data will be
+        appended to the existing records in the data source. Not supported for
+        writing to in-memory files.
+        NOTE: append support is limited to specific drivers and GDAL versions.
+    use_arrow : bool, optional (default: False)
+        Whether to use Arrow as the transfer mechanism of the data to write
+        from Python to GDAL (requires GDAL >= 3.8 and `pyarrow` to be
+        installed). When enabled, this provides a further speed-up.
+        Defaults to False, but this default can also be globally overridden
+        by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
+        Using Arrow does not support writing an object-dtype column with
+        mixed types.
+    dataset_metadata : dict, optional (default: None)
+        Metadata to be stored at the dataset level in the output file; limited
+        to drivers that support writing metadata, such as GPKG, and silently
+        ignored otherwise. Keys and values must be strings.
+    layer_metadata : dict, optional (default: None)
+        Metadata to be stored at the layer level in the output file; limited to
+        drivers that support writing metadata, such as GPKG, and silently
+        ignored otherwise. Keys and values must be strings.
+    metadata : dict, optional (default: None)
+        alias of layer_metadata
+    dataset_options : dict, optional
+        Dataset creation options (format specific) passed to OGR. Specify as
+        a key-value dictionary.
+    layer_options : dict, optional
+        Layer creation options (format specific) passed to OGR. Specify as
+        a key-value dictionary.
+    **kwargs
+        Additional driver-specific dataset or layer creation options passed
+        to OGR. pyogrio will attempt to automatically pass those keywords
+        either as dataset or as layer creation option based on the known
+        options for the specific driver. Alternatively, you can use the
+        explicit `dataset_options` or `layer_options` keywords to manually
+        do this (for example if an option exists as both dataset and layer
+        option).
+
+    Notes
+    -----
+    When you have datetime columns with time zone information, it is important to
+    note that GDAL only represents time zones as UTC offsets, whilst pandas uses
+    IANA time zones (via `pytz` or `zoneinfo`). As a result, even if a column in a
+    DataFrame contains datetimes in a single time zone, this will often still result
+    in mixed time zone offsets being written for time zones where daylight saving
+    time is used (e.g. +01:00 and +02:00 offsets for time zone Europe/Brussels).
+
+    Object dtype columns containing `datetime` or `pandas.Timestamp` objects will
+    also be written as datetime fields, preserving time zone information where possible.
+
+    """
+    # TODO: add examples to the docstring (e.g. OGR kwargs)
+
+    if not HAS_GEOPANDAS:
+        raise ImportError("geopandas is required to use pyogrio.write_dataframe()")
+
+    import pandas as pd
+    from geopandas.array import to_wkb
+
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("'df' must be a DataFrame or GeoDataFrame")
+
+    if use_arrow is None:
+        use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0")))
+    path, driver = _get_write_path_driver(path, driver, append=append)
+
+    geometry_columns = df.columns[df.dtypes == "geometry"]
+    if len(geometry_columns) > 1:
+        raise ValueError(
+            "'df' must have only one geometry column. "
+            "Multiple geometry columns are not supported for output using OGR."
+        )
+
+    if len(geometry_columns) > 0:
+        geometry_column = geometry_columns[0]
+        geometry = df[geometry_column]
+    else:
+        geometry_column = None
+        geometry = None
+
+    # Determine geometry_type and/or promote_to_multi
+    if geometry_column is not None:
+        geometry_types_all = geometry.geom_type
+
+    if geometry_column is not None and (
+        geometry_type is None or promote_to_multi is None
+    ):
+        tmp_geometry_type = "Unknown"
+        has_z = False
+
+        # If there is data, infer layer geometry type + promote_to_multi
+        if not df.empty:
+            # None/Empty geometries sometimes report as Z incorrectly, so ignore them
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", r"GeoSeries\.notna", UserWarning)
+                geometry_notna = geometry.notna()
+            has_z_arr = geometry[geometry_notna & (~geometry.is_empty)].has_z
+            has_z = has_z_arr.any()
+            all_z = has_z_arr.all()
+
+            if driver in DRIVERS_NO_MIXED_DIMENSIONS and has_z and not all_z:
+                raise DataSourceError(
+                    f"Mixed 2D and 3D coordinates are not supported by {driver}"
+                )
+
+            geometry_types = pd.Series(geometry_types_all.unique()).dropna().values
+            if len(geometry_types) == 1:
+                tmp_geometry_type = geometry_types[0]
+                if promote_to_multi and tmp_geometry_type in (
+                    "Point",
+                    "LineString",
+                    "Polygon",
+                ):
+                    tmp_geometry_type = f"Multi{tmp_geometry_type}"
+            elif len(geometry_types) == 2:
+                # Check if the types are corresponding multi + single types
+                if "Polygon" in geometry_types and "MultiPolygon" in geometry_types:
+                    multi_type = "MultiPolygon"
+                elif (
+                    "LineString" in geometry_types
+                    and "MultiLineString" in geometry_types
+                ):
+                    multi_type = "MultiLineString"
+                elif "Point" in geometry_types and "MultiPoint" in geometry_types:
+                    multi_type = "MultiPoint"
+                else:
+                    multi_type = None
+
+                # If they are corresponding multi + single types
+                if multi_type is not None:
+                    if (
+                        promote_to_multi is None
+                        and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
+                    ):
+                        promote_to_multi = True
+                    if promote_to_multi:
+                        tmp_geometry_type = multi_type
+
+        if geometry_type is None:
+            geometry_type = tmp_geometry_type
+            if has_z and geometry_type != "Unknown":
+                geometry_type = f"{geometry_type} Z"
+
+    crs = None
+    if geometry_column is not None and geometry.crs:
+        # TODO: this may need to be WKT1, due to issues
+        # if possible use EPSG codes instead
+        epsg = geometry.crs.to_epsg()
+        if epsg:
+            crs = f"EPSG:{epsg}"
+        else:
+            crs = geometry.crs.to_wkt("WKT1_GDAL")
+
+    if use_arrow:
+        import pandas as pd  # only called when pandas is known to be installed
+        import pyarrow as pa
+
+        from pyogrio.raw import write_arrow
+
+        if geometry_column is not None:
+            # Convert to multi type
+            if promote_to_multi:
+                import shapely
+
+                mask_points = geometry_types_all == "Point"
+                mask_linestrings = geometry_types_all == "LineString"
+                mask_polygons = geometry_types_all == "Polygon"
+
+                if mask_points.any():
+                    geometry[mask_points] = shapely.multipoints(
+                        np.atleast_2d(geometry[mask_points]), axis=0
+                    )
+
+                if mask_linestrings.any():
+                    geometry[mask_linestrings] = shapely.multilinestrings(
+                        np.atleast_2d(geometry[mask_linestrings]), axis=0
+                    )
+
+                if mask_polygons.any():
+                    geometry[mask_polygons] = shapely.multipolygons(
+                        np.atleast_2d(geometry[mask_polygons]), axis=0
+                    )
+
+            geometry = to_wkb(geometry.values)
+            df = df.copy(deep=False)
+            # convert to plain DataFrame to avoid warning from geopandas about
+            # writing non-geometries to the geometry column
+            df = pd.DataFrame(df, copy=False)
+            df[geometry_column] = geometry
+
+        # Arrow doesn't support datetime columns with mixed time zones, and GDAL only
+        # supports time zone offsets. Hence, to avoid data loss, convert columns that
+        # can contain datetime values with different offsets to strings.
+        # Also pass a list of these columns on to GDAL so it can still treat them as
+        # datetime columns when writing the dataset.
+        datetime_cols = []
+        for name, dtype in df.dtypes.items():
+            if dtype == "object":
+                # An object column with datetimes can contain multiple offsets.
+                if pd.api.types.infer_dtype(df[name]) == "datetime":
+                    df[name] = df[name].astype("string")
+                    datetime_cols.append(name)
+
+            elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
+                # A pd.datetime64 column with a time zone other than UTC can contain
+                # data with different offsets because of summer/winter time.
+                df[name] = df[name].astype("string")
+                datetime_cols.append(name)
+
+        table = pa.Table.from_pandas(df, preserve_index=False)
+
+        # Add metadata to datetime columns so GDAL knows they are datetimes.
+        table = _add_column_metadata(
+            table,
+            column_metadata={
+                col: {"GDAL:OGR:type": "DateTime"} for col in datetime_cols
+            },
+        )
+
+        # Null arrow columns are not supported by GDAL, so convert to string
+        for field_index, field in enumerate(table.schema):
+            if field.type == pa.null():
+                table = table.set_column(
+                    field_index,
+                    field.with_type(pa.string()),
+                    table[field_index].cast(pa.string()),
+                )
+
+        if geometry_column is not None:
+            # ensure that the geometry column is binary (for all-null geometries,
+            # this could be a wrong type)
+            geom_field = table.schema.field(geometry_column)
+            if not (
+                pa.types.is_binary(geom_field.type)
+                or pa.types.is_large_binary(geom_field.type)
+            ):
+                table = table.set_column(
+                    table.schema.get_field_index(geometry_column),
+                    geom_field.with_type(pa.binary()),
+                    table[geometry_column].cast(pa.binary()),
+                )
+
+        write_arrow(
+            table,
+            path,
+            layer=layer,
+            driver=driver,
+            geometry_name=geometry_column,
+            geometry_type=geometry_type,
+            crs=crs,
+            encoding=encoding,
+            append=append,
+            dataset_metadata=dataset_metadata,
+            layer_metadata=layer_metadata,
+            metadata=metadata,
+            dataset_options=dataset_options,
+            layer_options=layer_options,
+            **kwargs,
+        )
+        return
+
+    # If there is geometry data, prepare it to be written
+    if geometry_column is not None:
+        geometry = to_wkb(geometry.values)
+        fields = [c for c in df.columns if not c == geometry_column]
+    else:
+        fields = list(df.columns)
+
+    # Convert data to numpy arrays for writing
+    # TODO: may need to fill in pd.NA, etc
+    field_data = []
+    field_mask = []
+    # dict[str, np.array(int)] special case for dt-tz fields
+    gdal_tz_offsets = {}
+    for name in fields:
+        col = df[name]
+        values = None
+
+        if isinstance(col.dtype, pd.DatetimeTZDtype):
+            # Deal with datetimes with time zones by passing down time zone separately
+            # pass down naive datetime
+            naive = col.dt.tz_localize(None)
+            values = naive.values
+            # compute offset relative to UTC explicitly
+            tz_offset = naive - col.dt.tz_convert("UTC").dt.tz_localize(None)
+            # Convert to GDAL time zone offset representation.
+            # GMT is represented as 100 and offsets are represented by adding /
+            # subtracting 1 for every 15 minutes different from GMT.
+            # https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes
+            # Convert each row offset to a signed multiple of 15m and add to GMT value
+            gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
+            gdal_tz_offsets[name] = gdal_offset_representation.values
+
+        elif col.dtype == "object":
+            # Column of Timestamp/datetime objects, split in naive datetime and tz.
+            if pd.api.types.infer_dtype(df[name]) == "datetime":
+                tz_offset = col.map(lambda x: x.utcoffset(), na_action="ignore")
+                gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
+                gdal_tz_offsets[name] = gdal_offset_repr.values
+                naive = col.map(lambda x: x.replace(tzinfo=None), na_action="ignore")
+                values = naive.values
+
+        if values is None:
+            values = col.values
+
+        if isinstance(values, pd.api.extensions.ExtensionArray):
+            from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
+
+            if isinstance(values, IntegerArray | FloatingArray | BooleanArray):
+                field_data.append(values._data)
+                field_mask.append(values._mask)
+            else:
+                field_data.append(np.asarray(values))
+                field_mask.append(np.asarray(values.isna()))
+        else:
+            field_data.append(values)
+            field_mask.append(None)
+
+    write(
+        path,
+        layer=layer,
+        driver=driver,
+        geometry=geometry,
+        field_data=field_data,
+        field_mask=field_mask,
+        fields=fields,
+        crs=crs,
+        geometry_type=geometry_type,
+        encoding=encoding,
+        promote_to_multi=promote_to_multi,
+        nan_as_null=nan_as_null,
+        append=append,
+        dataset_metadata=dataset_metadata,
+        layer_metadata=layer_metadata,
+        metadata=metadata,
+        dataset_options=dataset_options,
+        layer_options=layer_options,
+        gdal_tz_offsets=gdal_tz_offsets,
+        **kwargs,
+    )
+
+
+def _add_column_metadata(table, column_metadata: dict = {}):
+    """Add or update column-level metadata to an arrow table.
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+        The table to add the column metadata to.
+    column_metadata : dict
+        A dictionary with column metadata in the form
+            {
+                "column_1": {"some": "data"},
+                "column_2": {"more": "stuff"},
+            }
+
+    Returns
+    -------
+    pyarrow.Table: table with the updated column metadata.
+    """
+    import pyarrow as pa
+
+    if not column_metadata:
+        return table
+
+    # Create updated column fields with new metadata
+    fields = []
+    for col in table.schema.names:
+        if col in column_metadata:
+            # Add/update column metadata
+            metadata = table.field(col).metadata or {}
+            for key, value in column_metadata[col].items():
+                metadata[key] = value
+            # Update field with updated metadata
+            fields.append(table.field(col).with_metadata(metadata))
+        else:
+            fields.append(table.field(col))
+
+    # Create new schema with the updated field metadata
+    schema = pa.schema(fields, metadata=table.schema.metadata)
+
+    # Build new table with updated schema (shouldn't copy data)
+    table = table.cast(schema)
+
+    return table
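
A few illustrative sketches for the module above follow; none of this is part of the wheel contents. First, the three datetime read modes documented in read_dataframe differ only in how mixed-offset columns come back. A minimal sketch, where example.gpkg and its mixed-offset datetime column are hypothetical:

    import pyogrio

    # Default: mixed offsets are converted to UTC -> datetime64 column.
    df = pyogrio.read_dataframe("example.gpkg")

    # Preserve offsets -> object column of python datetimes with fixed offsets.
    df = pyogrio.read_dataframe("example.gpkg", mixed_offsets_as_utc=False)

    # Lossless round-trip as ISO8601 strings; mixed_offsets_as_utc is ignored.
    df = pyogrio.read_dataframe("example.gpkg", datetime_as_string=True)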
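
The GDAL time zone encoding used in write_dataframe (GMT is 100, plus or minus 1 per 15 minutes of offset, per GDAL RFC 56) is easy to sanity-check in isolation. gdal_tz_code below is a throwaway helper mirroring the expression in the source, not a pyogrio API:

    import pandas as pd

    def gdal_tz_code(offset: pd.Timedelta) -> int:
        # Same arithmetic as write_dataframe: signed multiples of 15 minutes + 100.
        return offset // pd.Timedelta("15m") + 100

    assert gdal_tz_code(pd.Timedelta(0)) == 100        # UTC/GMT
    assert gdal_tz_code(pd.Timedelta("2h")) == 108     # +02:00, e.g. Brussels DST
    assert gdal_tz_code(pd.Timedelta("-5h")) == 80     # -05:00
    assert gdal_tz_code(pd.Timedelta("5h30m")) == 122  # +05:30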
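
A sketch of the promote_to_multi inference on the write path. It assumes GPKG is among the drivers in DRIVERS_NO_MIXED_SINGLE_MULTI (that constant is imported from pyogrio.raw, whose contents are not shown in this diff), and the file names are illustrative:

    import geopandas as gpd
    from shapely.geometry import MultiPoint, Point

    import pyogrio

    gdf = gpd.GeoDataFrame(
        {"name": ["a", "b"]},
        geometry=[Point(0, 0), MultiPoint([(1, 1), (2, 2)])],
        crs="EPSG:4326",
    )

    # Mixed Point/MultiPoint: if the driver disallows mixing single and multi
    # types, promote_to_multi defaults to True and the layer becomes MultiPoint.
    pyogrio.write_dataframe(gdf, "points.gpkg")

    # Opting out may raise or produce an invalid file on such drivers.
    pyogrio.write_dataframe(gdf, "points.gpkg", promote_to_multi=False)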
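
Finally, the private _add_column_metadata helper only rewrites field-level metadata in the schema; the table.cast call should not copy column data. A quick check of its behavior (pyarrow stores field metadata keys and values as bytes):

    import pyarrow as pa

    from pyogrio.geopandas import _add_column_metadata  # private helper shown above

    table = pa.table({"when": ["2024-07-01T10:00:00+02:00"]})
    table = _add_column_metadata(table, {"when": {"GDAL:OGR:type": "DateTime"}})

    # Field-level metadata round-trips as bytes.
    assert table.field("when").metadata[b"GDAL:OGR:type"] == b"DateTime"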