pyogrio 0.7.2__cp39-cp39-win_amd64.whl → 0.9.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyogrio might be problematic. Click here for more details.

Files changed (85)
  1. pyogrio/__init__.py +12 -7
  2. pyogrio/_compat.py +6 -1
  3. pyogrio/_err.c +855 -321
  4. pyogrio/_err.cp39-win_amd64.pyd +0 -0
  5. pyogrio/_err.pyx +7 -3
  6. pyogrio/_geometry.c +134 -75
  7. pyogrio/_geometry.cp39-win_amd64.pyd +0 -0
  8. pyogrio/_io.c +28462 -22609
  9. pyogrio/_io.cp39-win_amd64.pyd +0 -0
  10. pyogrio/_io.pyx +904 -242
  11. pyogrio/_ogr.c +1317 -1640
  12. pyogrio/_ogr.cp39-win_amd64.pyd +0 -0
  13. pyogrio/_ogr.pxd +69 -13
  14. pyogrio/_ogr.pyx +8 -24
  15. pyogrio/_version.py +3 -3
  16. pyogrio/_vsi.c +6815 -0
  17. pyogrio/_vsi.cp39-win_amd64.pyd +0 -0
  18. pyogrio/_vsi.pxd +4 -0
  19. pyogrio/_vsi.pyx +140 -0
  20. pyogrio/core.py +43 -44
  21. pyogrio/gdal_data/GDAL-targets-release.cmake +1 -1
  22. pyogrio/gdal_data/GDAL-targets.cmake +10 -6
  23. pyogrio/gdal_data/GDALConfigVersion.cmake +3 -3
  24. pyogrio/gdal_data/gdalinfo_output.schema.json +2 -0
  25. pyogrio/gdal_data/gdalvrt.xsd +163 -0
  26. pyogrio/gdal_data/ogrinfo_output.schema.json +12 -1
  27. pyogrio/gdal_data/vcpkg.spdx.json +23 -23
  28. pyogrio/gdal_data/vcpkg_abi_info.txt +29 -28
  29. pyogrio/geopandas.py +140 -34
  30. pyogrio/proj_data/ITRF2008 +2 -2
  31. pyogrio/proj_data/proj-config-version.cmake +2 -2
  32. pyogrio/proj_data/proj-config.cmake +2 -1
  33. pyogrio/proj_data/proj-targets-release.cmake +0 -1
  34. pyogrio/proj_data/proj-targets.cmake +10 -6
  35. pyogrio/proj_data/proj.db +0 -0
  36. pyogrio/proj_data/proj4-targets-release.cmake +0 -1
  37. pyogrio/proj_data/proj4-targets.cmake +10 -6
  38. pyogrio/proj_data/vcpkg.spdx.json +21 -43
  39. pyogrio/proj_data/vcpkg_abi_info.txt +16 -17
  40. pyogrio/raw.py +438 -116
  41. pyogrio/tests/conftest.py +75 -6
  42. pyogrio/tests/fixtures/poly_not_enough_points.shp.zip +0 -0
  43. pyogrio/tests/test_arrow.py +841 -7
  44. pyogrio/tests/test_core.py +99 -7
  45. pyogrio/tests/test_geopandas_io.py +827 -121
  46. pyogrio/tests/test_path.py +23 -3
  47. pyogrio/tests/test_raw_io.py +276 -50
  48. pyogrio/util.py +39 -19
  49. pyogrio-0.9.0.dist-info/DELVEWHEEL +2 -0
  50. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/METADATA +2 -2
  51. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/RECORD +73 -68
  52. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/WHEEL +1 -1
  53. pyogrio.libs/.load-order-pyogrio-0.9.0 +18 -0
  54. pyogrio.libs/Lerc-5e4d8cbeeabca06f95e2270792304dc3.dll +0 -0
  55. pyogrio.libs/{gdal-c3b1d8f66682071d0cd26d86e4182013.dll → gdal-b434963605a006e01c486c0df6dea4e0.dll} +0 -0
  56. pyogrio.libs/geos-f0622d0794b81c937a851b2e6fa9b712.dll +0 -0
  57. pyogrio.libs/geos_c-0e16bf70612fc3301d077b9d863a3fdb.dll +0 -0
  58. pyogrio.libs/{geotiff-e43cdab688866b59f8800cfcde836d16.dll → geotiff-772e7c705fb15ddf91b432adb4eb1f6c.dll} +0 -0
  59. pyogrio.libs/iconv-2-8fcc23ddc6f096c45871011b6e008b44.dll +0 -0
  60. pyogrio.libs/{jpeg62-567ab743ac805dfb57fe3867ba5788a4.dll → jpeg62-2f9b7af22d78338e8f0be0058503dc35.dll} +0 -0
  61. pyogrio.libs/json-c-e52a077545e4057de42beb4948289b41.dll +0 -0
  62. pyogrio.libs/libcurl-bc81cd8afe15b10c0821b181b6af8bd0.dll +0 -0
  63. pyogrio.libs/libexpat-fbe03ca8917dfda776562d4338b289b8.dll +0 -0
  64. pyogrio.libs/{liblzma-de7f4770d4e3715acd031ca93883f10c.dll → liblzma-6b36f24d54d3dd45f274a2aebef81085.dll} +0 -0
  65. pyogrio.libs/libpng16-13928571ad910705eae8d7dd8eef8b11.dll +0 -0
  66. pyogrio.libs/{msvcp140-83b6a1a2fa8b1735a358b2fe13cabe4e.dll → msvcp140-46db46e967c8db2cb7a20fc75872a57e.dll} +0 -0
  67. pyogrio.libs/proj-8a30239ef2dfc3b9dd2bb48e8abb330f.dll +0 -0
  68. pyogrio.libs/{qhull_r-99ae8a526357acc44b162cb4df2c3bb6.dll → qhull_r-c45abde5d0c92faf723cc2942138af77.dll} +0 -0
  69. pyogrio.libs/sqlite3-df30c3cf230727e23c43c40126a530f7.dll +0 -0
  70. pyogrio.libs/{tiff-7c2d4b204ec2db46c81f6a597895c2f7.dll → tiff-43630f30487a9015213475ae86ed3fa3.dll} +0 -0
  71. pyogrio.libs/{zlib1-824de9299616f0908aeeb9441a084848.dll → zlib1-e1272810861a13dd8d6cff3beac47f17.dll} +0 -0
  72. pyogrio/tests/win32.py +0 -86
  73. pyogrio-0.7.2.dist-info/DELVEWHEEL +0 -2
  74. pyogrio.libs/.load-order-pyogrio-0.7.2 +0 -17
  75. pyogrio.libs/Lerc-d5afc4101deffe7de21241ccd4d562f6.dll +0 -0
  76. pyogrio.libs/geos-1c764a1384537a0ad2995e83d23e8642.dll +0 -0
  77. pyogrio.libs/geos_c-0d7dfdcee49efa8df585e2fb993157aa.dll +0 -0
  78. pyogrio.libs/json-c-36c91e30c4410d41c22b2010c31183e3.dll +0 -0
  79. pyogrio.libs/libcurl-ebcc8c18195071a90e59f818902e10c6.dll +0 -0
  80. pyogrio.libs/libexpat-345379c9c11632130d8c383cbacde1a6.dll +0 -0
  81. pyogrio.libs/libpng16-2c30e6846653c47ef2ff9d7dec3338ba.dll +0 -0
  82. pyogrio.libs/proj-98758c96a6cb682b5cec7e8dc5e29a50.dll +0 -0
  83. pyogrio.libs/sqlite3-327ed7b38bfd91fb4a17544960e055e9.dll +0 -0
  84. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/LICENSE +0 -0
  85. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/top_level.txt +0 -0
pyogrio/_io.pyx CHANGED
@@ -10,24 +10,27 @@ import locale
10
10
  import logging
11
11
  import math
12
12
  import os
13
+ import sys
13
14
  import warnings
14
15
 
15
16
  from libc.stdint cimport uint8_t, uintptr_t
16
17
  from libc.stdlib cimport malloc, free
17
18
  from libc.string cimport strlen
18
19
  from libc.math cimport isnan
20
+ from cpython.pycapsule cimport PyCapsule_GetPointer
19
21
 
20
22
  cimport cython
23
+ from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
24
+
21
25
  import numpy as np
22
- cimport numpy as np
23
26
 
24
27
  from pyogrio._ogr cimport *
25
28
  from pyogrio._err cimport *
29
+ from pyogrio._vsi cimport *
26
30
  from pyogrio._err import CPLE_BaseError, CPLE_NotSupportedError, NullPointerError
27
31
  from pyogrio._geometry cimport get_geometry_type, get_geometry_type_code
28
32
  from pyogrio.errors import CRSError, DataSourceError, DataLayerError, GeometryError, FieldError, FeatureError
29
33
 
30
-
31
34
  log = logging.getLogger(__name__)
32
35
 
33
36
 
@@ -135,7 +138,52 @@ cdef char** dict_to_options(object values):
135
138
  return options
136
139
 
137
140
 
141
+ cdef const char* override_threadlocal_config_option(str key, str value):
142
+ """Set the CPLSetThreadLocalConfigOption for key=value
143
+
144
+ Parameters
145
+ ----------
146
+ key : str
147
+ value : str
148
+
149
+ Returns
150
+ -------
151
+ const char*
152
+ value previously set for key, so that it can be later restored. Caller
153
+ is responsible for freeing this via CPLFree() if not NULL.
154
+ """
155
+
156
+ key_b = key.encode("UTF-8")
157
+ cdef const char* key_c = key_b
158
+
159
+ value_b = value.encode("UTF-8")
160
+ cdef const char* value_c = value_b
161
+
162
+
163
+ cdef const char *prev_value = CPLGetThreadLocalConfigOption(key_c, NULL)
164
+ if prev_value != NULL:
165
+ # strings returned from config options may be replaced via
166
+ # CPLSetConfigOption() below; GDAL instructs us to save a copy
167
+ # in a new string
168
+ prev_value = CPLStrdup(prev_value)
169
+
170
+ CPLSetThreadLocalConfigOption(key_c, value_c)
171
+
172
+ return prev_value
173
+
174
+
138
175
  cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL:
176
+ """Open an existing OGR data source
177
+
178
+ Parameters
179
+ ----------
180
+ path_c : char *
181
+ input path, including an in-memory path (/vsimem/...)
182
+ mode : int
183
+ set to 1 to allow updating data source
184
+ options : char **, optional
185
+ dataset open options
186
+ """
139
187
  cdef void* ogr_dataset = NULL
140
188
 
141
189
  # Force linear approximations in all cases
@@ -163,7 +211,7 @@ cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL:
163
211
  ) from None
164
212
 
165
213
  except CPLE_BaseError as exc:
166
- if str(exc).endswith("not recognized as a supported file format."):
214
+ if str(exc).endswith("a supported file format."):
167
215
  raise DataSourceError(
168
216
  f"{str(exc)} It might help to specify the correct driver explicitly by "
169
217
  "prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'."
@@ -171,6 +219,25 @@ cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL:
171
219
  raise DataSourceError(str(exc)) from None
172
220
 
173
221
 
222
+ cdef ogr_close(GDALDatasetH ogr_dataset):
223
+ """Close the dataset and raise exception if that fails.
224
+ NOTE: some drivers only raise errors on write when calling GDALClose()
225
+ """
226
+ if ogr_dataset != NULL:
227
+ IF CTE_GDAL_VERSION >= (3, 7, 0):
228
+ if GDALClose(ogr_dataset) != CE_None:
229
+ return exc_check()
230
+
231
+ return
232
+
233
+ ELSE:
234
+ GDALClose(ogr_dataset)
235
+
236
+ # GDAL will set an error if there was an error writing the data source
237
+ # on close
238
+ return exc_check()
239
+
240
+
174
241
  cdef OGRLayerH get_ogr_layer(GDALDatasetH ogr_dataset, layer) except NULL:
175
242
  """Open OGR layer by index or name.
176
243
 
@@ -462,9 +529,11 @@ cdef get_metadata(GDALMajorObjectH obj):
462
529
 
463
530
 
464
531
  cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
465
- """Attempt to detect the encoding of the layer.
466
- If it supports UTF-8, use that.
467
- If it is a shapefile, it must otherwise be ISO-8859-1.
532
+ """Attempt to detect the encoding to use to read/write string values.
533
+
534
+ If the layer/dataset supports reading/writing data in UTF-8, returns UTF-8.
535
+ If UTF-8 is not supported and ESRI Shapefile, returns ISO-8859-1
536
+ Otherwise the system locale preferred encoding is returned.
468
537
 
469
538
  Parameters
470
539
  ----------
@@ -477,18 +546,53 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
477
546
  """
478
547
 
479
548
  if OGR_L_TestCapability(ogr_layer, OLCStringsAsUTF8):
480
- return 'UTF-8'
549
+ # OGR_L_TestCapability returns True for OLCStringsAsUTF8 if GDAL hides encoding
550
+ # complexities for this layer/driver type. In this case all string attribute
551
+ # values have to be supplied in UTF-8 and values will be returned in UTF-8.
552
+ # The encoding used to read/write under the hood depends on the driver used.
553
+ # For layers/drivers where False is returned, the string values are written and
554
+ # read without recoding. Hence, it is up to you to supply the data in the
555
+ # appropriate encoding. More info:
556
+ # https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields
557
+ # NOTE: for shapefiles, this always returns False for the layer returned
558
+ # when executing SQL, even when it supports UTF-8 (patched below);
559
+ # this may be fixed by https://github.com/OSGeo/gdal/pull/9649 (GDAL >=3.9.0?)
560
+ return "UTF-8"
481
561
 
482
562
  driver = get_driver(ogr_dataset)
483
- if driver == 'ESRI Shapefile':
484
- return 'ISO-8859-1'
563
+ if driver == "ESRI Shapefile":
564
+ # OGR_L_TestCapability returns True for OLCStringsAsUTF8 (above) for
565
+ # shapefiles when a .cpg file is present with a valid encoding, or GDAL
566
+ # auto-detects the encoding from the code page of the .dbf file, or
567
+ # SHAPE_ENCODING config option is set, or ENCODING layer creation option
568
+ # is specified (shapefiles only). Otherwise, we can only assume that
569
+ # shapefiles are in their default encoding of ISO-8859-1 (which may be
570
+ # incorrect and must be overridden by user-provided encoding)
571
+
572
+ # Always use the first layer to test capabilities until detection for
573
+ # SQL results from shapefiles are fixed (above)
574
+ # This block should only be used for unfixed versions of GDAL (<3.9.0?)
575
+ if OGR_L_TestCapability(GDALDatasetGetLayer(ogr_dataset, 0), OLCStringsAsUTF8):
576
+ return "UTF-8"
577
+
578
+ return "ISO-8859-1"
485
579
 
486
580
  if driver == "OSM":
487
581
  # always set OSM data to UTF-8
488
582
  # per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
489
583
  return "UTF-8"
490
584
 
491
- return None
585
+ if driver in ("XLSX", "ODS"):
586
+ # TestCapability for OLCStringsAsUTF8 for XLSX and ODS was False for new files
587
+ # being created for GDAL < 3.8.5. Once these versions of GDAL are no longer
588
+ # supported, this can be removed.
589
+ return "UTF-8"
590
+
591
+ if driver == "GeoJSONSeq":
592
+ # In old gdal versions, OLCStringsAsUTF8 wasn't advertised yet.
593
+ return "UTF-8"
594
+
595
+ return locale.getpreferredencoding()
492
596
 
493
597
 
494
598
  cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):
@@ -608,7 +712,7 @@ cdef apply_bbox_filter(OGRLayerH ogr_layer, bbox):
608
712
  Parameters
609
713
  ----------
610
714
  ogr_layer : pointer to open OGR layer
611
- bbox: list or tuple of xmin, ymin, xmax, ymax
715
+ bbox : list or tuple of xmin, ymin, xmax, ymax
612
716
 
613
717
  Raises
614
718
  ------
@@ -629,7 +733,7 @@ cdef apply_geometry_filter(OGRLayerH ogr_layer, wkb):
629
733
  Parameters
630
734
  ----------
631
735
  ogr_layer : pointer to open OGR layer
632
- wkb: WKB encoding of geometry
736
+ wkb : WKB encoding of geometry
633
737
  """
634
738
 
635
739
  cdef OGRGeometryH ogr_geometry = NULL
@@ -783,7 +887,7 @@ cdef process_fields(
783
887
  data[i] = bin_value[:ret_length]
784
888
 
785
889
  elif field_type == OFTDateTime or field_type == OFTDate:
786
-
890
+
787
891
  if datetime_as_string:
788
892
  # defer datetime parsing to user/ pandas layer
789
893
  # Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only
@@ -851,7 +955,7 @@ cdef get_features(
851
955
 
852
956
  field_data = [
853
957
  np.empty(shape=(num_features, ),
854
- dtype = ("object" if datetime_as_string and
958
+ dtype = ("object" if datetime_as_string and
855
959
  fields[field_index,3].startswith("datetime") else fields[field_index,3])
856
960
  ) for field_index in range(n_fields)
857
961
  ]
@@ -950,8 +1054,8 @@ cdef get_features_by_fid(
950
1054
  field_ogr_types = fields[:,1]
951
1055
  field_data = [
952
1056
  np.empty(shape=(count, ),
953
- dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
954
- else fields[field_index,3]))
1057
+ dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
1058
+ else fields[field_index,3]))
955
1059
  for field_index in range(n_fields)
956
1060
  ]
957
1061
 
@@ -1060,7 +1164,7 @@ cdef get_bounds(
1060
1164
 
1061
1165
 
1062
1166
  def ogr_read(
1063
- str path,
1167
+ object path_or_buffer,
1064
1168
  object dataset_kwargs,
1065
1169
  object layer=None,
1066
1170
  object encoding=None,
@@ -1080,6 +1184,7 @@ def ogr_read(
1080
1184
  ):
1081
1185
 
1082
1186
  cdef int err = 0
1187
+ cdef bint is_vsimem = isinstance(path_or_buffer, bytes)
1083
1188
  cdef const char *path_c = NULL
1084
1189
  cdef char **dataset_options = NULL
1085
1190
  cdef const char *where_c = NULL
@@ -1089,9 +1194,8 @@ def ogr_read(
1089
1194
  cdef OGRLayerH ogr_layer = NULL
1090
1195
  cdef int feature_count = 0
1091
1196
  cdef double xmin, ymin, xmax, ymax
1092
-
1093
- path_b = path.encode('utf-8')
1094
- path_c = path_b
1197
+ cdef const char *prev_shape_encoding = NULL
1198
+ cdef bint override_shape_encoding = False
1095
1199
 
1096
1200
  if fids is not None:
1097
1201
  if where is not None or bbox is not None or mask is not None or sql is not None or skip_features or max_features:
@@ -1120,13 +1224,23 @@ def ogr_read(
1120
1224
  raise ValueError("'max_features' must be >= 0")
1121
1225
 
1122
1226
  try:
1227
+ path = read_buffer_to_vsimem(path_or_buffer) if is_vsimem else path_or_buffer
1228
+
1229
+ if encoding:
1230
+ # for shapefiles, SHAPE_ENCODING must be set before opening the file
1231
+ # to prevent automatic decoding to UTF-8 by GDAL, so we save previous
1232
+ # SHAPE_ENCODING so that it can be restored later
1233
+ # (we do this for all data sources where encoding is set because
1234
+ # we don't know the driver until after it is opened, which is too late)
1235
+ override_shape_encoding = True
1236
+ prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding)
1237
+
1123
1238
  dataset_options = dict_to_options(dataset_kwargs)
1124
- ogr_dataset = ogr_open(path_c, 0, dataset_options)
1239
+ ogr_dataset = ogr_open(path.encode('UTF-8'), 0, dataset_options)
1125
1240
 
1126
1241
  if sql is None:
1127
- # layer defaults to index 0
1128
1242
  if layer is None:
1129
- layer = 0
1243
+ layer = get_default_layer(ogr_dataset)
1130
1244
  ogr_layer = get_ogr_layer(ogr_dataset, layer)
1131
1245
  else:
1132
1246
  ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect)
@@ -1135,23 +1249,31 @@ def ogr_read(
1135
1249
 
1136
1250
  # Encoding is derived from the user, from the dataset capabilities / type,
1137
1251
  # or from the system locale
1138
- encoding = (
1139
- encoding
1140
- or detect_encoding(ogr_dataset, ogr_layer)
1141
- or locale.getpreferredencoding()
1142
- )
1252
+ if encoding:
1253
+ if get_driver(ogr_dataset) == "ESRI Shapefile":
1254
+ # NOTE: SHAPE_ENCODING is a configuration option whereas ENCODING is the dataset open option
1255
+ if "ENCODING" in dataset_kwargs:
1256
+ raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source')
1257
+
1258
+ # Because SHAPE_ENCODING is set above, GDAL will automatically
1259
+ # decode shapefiles to UTF-8; ignore any encoding set by user
1260
+ encoding = "UTF-8"
1261
+
1262
+ else:
1263
+ encoding = detect_encoding(ogr_dataset, ogr_layer)
1143
1264
 
1144
1265
  fields = get_fields(ogr_layer, encoding)
1145
1266
 
1146
1267
  ignored_fields = []
1147
1268
  if columns is not None:
1269
+ # identify ignored fields first
1270
+ ignored_fields = list(set(fields[:,2]) - set(columns))
1271
+
1148
1272
  # Fields are matched exactly by name, duplicates are dropped.
1149
1273
  # Find index of each field into fields
1150
1274
  idx = np.intersect1d(fields[:,2], columns, return_indices=True)[1]
1151
1275
  fields = fields[idx, :]
1152
1276
 
1153
- ignored_fields = list(set(fields[:,2]) - set(columns))
1154
-
1155
1277
  if not read_geometry:
1156
1278
  ignored_fields.append("OGR_GEOMETRY")
1157
1279
 
@@ -1232,6 +1354,17 @@ def ogr_read(
1232
1354
  GDALClose(ogr_dataset)
1233
1355
  ogr_dataset = NULL
1234
1356
 
1357
+ # reset SHAPE_ENCODING config parameter if temporarily set above
1358
+ if override_shape_encoding:
1359
+ CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
1360
+
1361
+ if prev_shape_encoding != NULL:
1362
+ CPLFree(<void*>prev_shape_encoding)
1363
+ prev_shape_encoding = NULL
1364
+
1365
+ if is_vsimem:
1366
+ delete_vsimem_file(path)
1367
+
1235
1368
  return (
1236
1369
  meta,
1237
1370
  fid_data,
@@ -1239,9 +1372,38 @@ def ogr_read(
1239
1372
  field_data
1240
1373
  )
1241
1374
 
1375
+
1376
+ cdef void pycapsule_array_stream_deleter(object stream_capsule) noexcept:
1377
+ cdef ArrowArrayStream* stream = <ArrowArrayStream*>PyCapsule_GetPointer(
1378
+ stream_capsule, 'arrow_array_stream'
1379
+ )
1380
+ # Do not invoke the deleter on a used/moved capsule
1381
+ if stream.release != NULL:
1382
+ stream.release(stream)
1383
+
1384
+ free(stream)
1385
+
1386
+
1387
+ cdef object alloc_c_stream(ArrowArrayStream** c_stream):
1388
+ c_stream[0] = <ArrowArrayStream*> malloc(sizeof(ArrowArrayStream))
1389
+ # Ensure the capsule destructor doesn't call a random release pointer
1390
+ c_stream[0].release = NULL
1391
+ return PyCapsule_New(c_stream[0], 'arrow_array_stream', &pycapsule_array_stream_deleter)
1392
+
1393
+
1394
+ class _ArrowStream:
1395
+ def __init__(self, capsule):
1396
+ self._capsule = capsule
1397
+
1398
+ def __arrow_c_stream__(self, requested_schema=None):
1399
+ if requested_schema is not None:
1400
+ raise NotImplementedError("requested_schema is not supported")
1401
+ return self._capsule
1402
+
1403
+
1242
1404
  @contextlib.contextmanager
1243
1405
  def ogr_open_arrow(
1244
- str path,
1406
+ object path_or_buffer,
1245
1407
  dataset_kwargs,
1246
1408
  object layer=None,
1247
1409
  object encoding=None,
@@ -1257,31 +1419,38 @@ def ogr_open_arrow(
1257
1419
  str sql=None,
1258
1420
  str sql_dialect=None,
1259
1421
  int return_fids=False,
1260
- int batch_size=0):
1422
+ int batch_size=0,
1423
+ use_pyarrow=False,
1424
+ ):
1261
1425
 
1262
1426
  cdef int err = 0
1427
+ cdef bint is_vsimem = isinstance(path_or_buffer, bytes)
1263
1428
  cdef const char *path_c = NULL
1264
1429
  cdef char **dataset_options = NULL
1265
1430
  cdef const char *where_c = NULL
1266
1431
  cdef OGRDataSourceH ogr_dataset = NULL
1267
1432
  cdef OGRLayerH ogr_layer = NULL
1433
+ cdef void *ogr_driver = NULL
1268
1434
  cdef char **fields_c = NULL
1269
1435
  cdef const char *field_c = NULL
1270
1436
  cdef char **options = NULL
1271
- cdef ArrowArrayStream stream
1437
+ cdef const char *prev_shape_encoding = NULL
1438
+ cdef bint override_shape_encoding = False
1439
+ cdef ArrowArrayStream* stream
1272
1440
  cdef ArrowSchema schema
1273
1441
 
1274
1442
  IF CTE_GDAL_VERSION < (3, 6, 0):
1275
1443
  raise RuntimeError("Need GDAL>=3.6 for Arrow support")
1276
1444
 
1277
- path_b = path.encode('utf-8')
1278
- path_c = path_b
1279
-
1280
1445
  if force_2d:
1281
1446
  raise ValueError("forcing 2D is not supported for Arrow")
1282
1447
 
1283
1448
  if fids is not None:
1284
- raise ValueError("reading by FID is not supported for Arrow")
1449
+ if where is not None or bbox is not None or mask is not None or sql is not None or skip_features or max_features:
1450
+ raise ValueError(
1451
+ "cannot set both 'fids' and any of 'where', 'bbox', 'mask', "
1452
+ "'sql', 'skip_features', or 'max_features'"
1453
+ )
1285
1454
 
1286
1455
  IF CTE_GDAL_VERSION < (3, 8, 0):
1287
1456
  if skip_features:
@@ -1311,13 +1480,18 @@ def ogr_open_arrow(
1311
1480
 
1312
1481
  reader = None
1313
1482
  try:
1483
+ path = read_buffer_to_vsimem(path_or_buffer) if is_vsimem else path_or_buffer
1484
+
1485
+ if encoding:
1486
+ override_shape_encoding = True
1487
+ prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding)
1488
+
1314
1489
  dataset_options = dict_to_options(dataset_kwargs)
1315
- ogr_dataset = ogr_open(path_c, 0, dataset_options)
1490
+ ogr_dataset = ogr_open(path.encode('UTF-8'), 0, dataset_options)
1316
1491
 
1317
1492
  if sql is None:
1318
- # layer defaults to index 0
1319
1493
  if layer is None:
1320
- layer = 0
1494
+ layer = get_default_layer(ogr_dataset)
1321
1495
  ogr_layer = get_ogr_layer(ogr_dataset, layer)
1322
1496
  else:
1323
1497
  ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect)
@@ -1326,11 +1500,18 @@ def ogr_open_arrow(
1326
1500
 
1327
1501
  # Encoding is derived from the user, from the dataset capabilities / type,
1328
1502
  # or from the system locale
1329
- encoding = (
1330
- encoding
1331
- or detect_encoding(ogr_dataset, ogr_layer)
1332
- or locale.getpreferredencoding()
1333
- )
1503
+ if encoding:
1504
+ if get_driver(ogr_dataset) == "ESRI Shapefile":
1505
+ if "ENCODING" in dataset_kwargs:
1506
+ raise ValueError('cannot provide both encoding parameter and "ENCODING" option; use encoding parameter to specify correct encoding for data source')
1507
+
1508
+ encoding = "UTF-8"
1509
+
1510
+ elif encoding.replace('-','').upper() != 'UTF8':
1511
+ raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead")
1512
+
1513
+ else:
1514
+ encoding = detect_encoding(ogr_dataset, ogr_layer)
1334
1515
 
1335
1516
  fields = get_fields(ogr_layer, encoding, use_arrow=True)
1336
1517
 
@@ -1341,19 +1522,64 @@ def ogr_open_arrow(
1341
1522
  if not read_geometry:
1342
1523
  ignored_fields.append("OGR_GEOMETRY")
1343
1524
 
1525
+ # raise error if schema has bool values for FGB / GPKG and GDAL <3.8.3
1526
+ # due to https://github.com/OSGeo/gdal/issues/8998
1527
+ IF CTE_GDAL_VERSION < (3, 8, 3):
1528
+
1529
+ driver = get_driver(ogr_dataset)
1530
+ if driver in {'FlatGeobuf', 'GPKG'}:
1531
+ ignored = set(ignored_fields)
1532
+ for f in fields:
1533
+ if f[2] not in ignored and f[3] == 'bool':
1534
+ raise RuntimeError(
1535
+ "GDAL < 3.8.3 does not correctly read boolean data values using the "
1536
+ "Arrow API. Do not use read_arrow() / use_arrow=True for this dataset."
1537
+ )
1538
+
1344
1539
  geometry_type = get_geometry_type(ogr_layer)
1345
1540
 
1346
1541
  geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer))
1347
1542
 
1348
1543
  fid_column = get_string(OGR_L_GetFIDColumn(ogr_layer))
1544
+ fid_column_where = fid_column
1349
1545
  # OGR_L_GetFIDColumn returns the column name if it is a custom column,
1350
- # or "" if not. For arrow, the default column name is "OGC_FID".
1546
+ # or "" if not. For arrow, the default column name used to return the FID data
1547
+ # read is "OGC_FID". When accessing the underlying datasource like when using a
1548
+ # where clause, the default column name is "FID".
1351
1549
  if fid_column == "":
1352
1550
  fid_column = "OGC_FID"
1551
+ fid_column_where = "FID"
1552
+
1553
+ # Use fids list to create a where clause, as arrow doesn't support direct fid
1554
+ # filtering.
1555
+ if fids is not None:
1556
+ IF CTE_GDAL_VERSION < (3, 8, 0):
1557
+ driver = get_driver(ogr_dataset)
1558
+ if driver not in {"GPKG", "GeoJSON"}:
1559
+ warnings.warn(
1560
+ "Using 'fids' and 'use_arrow=True' with GDAL < 3.8 can be slow "
1561
+ "for some drivers. Upgrading GDAL or using 'use_arrow=False' "
1562
+ "can avoid this.",
1563
+ stacklevel=2,
1564
+ )
1565
+
1566
+ fids_str = ",".join([str(fid) for fid in fids])
1567
+ where = f"{fid_column_where} IN ({fids_str})"
1353
1568
 
1354
1569
  # Apply the attribute filter
1355
1570
  if where is not None and where != "":
1356
- apply_where_filter(ogr_layer, where)
1571
+ try:
1572
+ apply_where_filter(ogr_layer, where)
1573
+ except ValueError as ex:
1574
+ if fids is not None and str(ex).startswith("Invalid SQL query"):
1575
+ # If fids is not None, the where being applied is the one formatted
1576
+ # above.
1577
+ raise ValueError(
1578
+ f"error applying filter for {len(fids)} fids; max. number for "
1579
+ f"drivers with default SQL dialect 'OGRSQL' is 4997"
1580
+ ) from ex
1581
+
1582
+ raise
1357
1583
 
1358
1584
  # Apply the spatial filter
1359
1585
  if bbox is not None:
@@ -1381,22 +1607,34 @@ def ogr_open_arrow(
1381
1607
  str(batch_size).encode('UTF-8')
1382
1608
  )
1383
1609
 
1610
+ # Default to geoarrow metadata encoding
1611
+ IF CTE_GDAL_VERSION >= (3, 8, 0):
1612
+ options = CSLSetNameValue(
1613
+ options,
1614
+ "GEOMETRY_METADATA_ENCODING",
1615
+ "GEOARROW".encode('UTF-8')
1616
+ )
1617
+
1384
1618
  # make sure layer is read from beginning
1385
1619
  OGR_L_ResetReading(ogr_layer)
1386
1620
 
1387
- if not OGR_L_GetArrowStream(ogr_layer, &stream, options):
1388
- raise RuntimeError("Failed to open ArrowArrayStream from Layer")
1621
+ # allocate the stream struct and wrap in capsule to ensure clean-up on error
1622
+ capsule = alloc_c_stream(&stream)
1389
1623
 
1390
- stream_ptr = <uintptr_t> &stream
1624
+ if not OGR_L_GetArrowStream(ogr_layer, stream, options):
1625
+ raise RuntimeError("Failed to open ArrowArrayStream from Layer")
1391
1626
 
1392
1627
  if skip_features:
1393
1628
  # only supported for GDAL >= 3.8.0; have to do this after getting
1394
1629
  # the Arrow stream
1395
1630
  OGR_L_SetNextByIndex(ogr_layer, skip_features)
1396
1631
 
1397
- # stream has to be consumed before the Dataset is closed
1398
- import pyarrow as pa
1399
- reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr)
1632
+ if use_pyarrow:
1633
+ import pyarrow as pa
1634
+
1635
+ reader = pa.RecordBatchStreamReader._import_from_c(<uintptr_t> stream)
1636
+ else:
1637
+ reader = _ArrowStream(capsule)
1400
1638
 
1401
1639
  meta = {
1402
1640
  'crs': crs,
@@ -1407,13 +1645,16 @@ def ogr_open_arrow(
1407
1645
  'fid_column': fid_column,
1408
1646
  }
1409
1647
 
1648
+ # stream has to be consumed before the Dataset is closed
1410
1649
  yield meta, reader
1411
1650
 
1412
1651
  finally:
1413
- if reader is not None:
1652
+ if use_pyarrow and reader is not None:
1414
1653
  # Mark reader as closed to prevent reading batches
1415
1654
  reader.close()
1416
1655
 
1656
+ # `stream` will be freed through `capsule` destructor
1657
+
1417
1658
  CSLDestroy(options)
1418
1659
  if fields_c != NULL:
1419
1660
  CSLDestroy(fields_c)
@@ -1430,8 +1671,20 @@ def ogr_open_arrow(
1430
1671
  GDALClose(ogr_dataset)
1431
1672
  ogr_dataset = NULL
1432
1673
 
1674
+ # reset SHAPE_ENCODING config parameter if temporarily set above
1675
+ if override_shape_encoding:
1676
+ CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
1677
+
1678
+ if prev_shape_encoding != NULL:
1679
+ CPLFree(<void*>prev_shape_encoding)
1680
+ prev_shape_encoding = NULL
1681
+
1682
+ if is_vsimem:
1683
+ delete_vsimem_file(path)
1684
+
1685
+
1433
1686
  def ogr_read_bounds(
1434
- str path,
1687
+ object path_or_buffer,
1435
1688
  object layer=None,
1436
1689
  object encoding=None,
1437
1690
  int read_geometry=True,
@@ -1444,6 +1697,7 @@ def ogr_read_bounds(
1444
1697
  object mask=None):
1445
1698
 
1446
1699
  cdef int err = 0
1700
+ cdef bint is_vsimem = isinstance(path_or_buffer, bytes)
1447
1701
  cdef const char *path_c = NULL
1448
1702
  cdef const char *where_c = NULL
1449
1703
  cdef OGRDataSourceH ogr_dataset = NULL
@@ -1460,77 +1714,91 @@ def ogr_read_bounds(
1460
1714
  if max_features < 0:
1461
1715
  raise ValueError("'max_features' must be >= 0")
1462
1716
 
1463
- path_b = path.encode('utf-8')
1464
- path_c = path_b
1717
+ try:
1718
+ path = read_buffer_to_vsimem(path_or_buffer) if is_vsimem else path_or_buffer
1719
+ ogr_dataset = ogr_open(path.encode('UTF-8'), 0, NULL)
1465
1720
 
1466
- # layer defaults to index 0
1467
- if layer is None:
1468
- layer = 0
1721
+ if layer is None:
1722
+ layer = get_default_layer(ogr_dataset)
1469
1723
 
1470
- ogr_dataset = ogr_open(path_c, 0, NULL)
1471
- ogr_layer = get_ogr_layer(ogr_dataset, layer)
1724
+ ogr_layer = get_ogr_layer(ogr_dataset, layer)
1472
1725
 
1473
- # Apply the attribute filter
1474
- if where is not None and where != "":
1475
- apply_where_filter(ogr_layer, where)
1726
+ # Apply the attribute filter
1727
+ if where is not None and where != "":
1728
+ apply_where_filter(ogr_layer, where)
1729
+
1730
+ # Apply the spatial filter
1731
+ if bbox is not None:
1732
+ apply_bbox_filter(ogr_layer, bbox)
1733
+
1734
+ elif mask is not None:
1735
+ apply_geometry_filter(ogr_layer, mask)
1476
1736
 
1477
- # Apply the spatial filter
1478
- if bbox is not None:
1479
- apply_bbox_filter(ogr_layer, bbox)
1737
+ # Limit feature range to available range
1738
+ skip_features, num_features = validate_feature_range(ogr_layer, skip_features, max_features)
1480
1739
 
1481
- elif mask is not None:
1482
- apply_geometry_filter(ogr_layer, mask)
1740
+ bounds = get_bounds(ogr_layer, skip_features, num_features)
1483
1741
 
1484
- # Limit feature range to available range
1485
- skip_features, num_features = validate_feature_range(ogr_layer, skip_features, max_features)
1742
+ finally:
1743
+ if ogr_dataset != NULL:
1744
+ GDALClose(ogr_dataset)
1745
+ ogr_dataset = NULL
1486
1746
 
1487
- return get_bounds(ogr_layer, skip_features, num_features)
1747
+ if is_vsimem:
1748
+ delete_vsimem_file(path)
1749
+
1750
+ return bounds
1488
1751
 
1489
1752
 
1490
1753
  def ogr_read_info(
1491
- str path,
1754
+ object path_or_buffer,
1492
1755
  dataset_kwargs,
1493
1756
  object layer=None,
1494
1757
  object encoding=None,
1495
1758
  int force_feature_count=False,
1496
1759
  int force_total_bounds=False):
1497
1760
 
1761
+ cdef bint is_vsimem = isinstance(path_or_buffer, bytes)
1498
1762
  cdef const char *path_c = NULL
1499
1763
  cdef char **dataset_options = NULL
1500
1764
  cdef OGRDataSourceH ogr_dataset = NULL
1501
1765
  cdef OGRLayerH ogr_layer = NULL
1766
+ cdef const char *prev_shape_encoding = NULL
1767
+ cdef bint override_shape_encoding = False
1502
1768
 
1503
- path_b = path.encode('utf-8')
1504
- path_c = path_b
1769
+ try:
1770
+ path = read_buffer_to_vsimem(path_or_buffer) if is_vsimem else path_or_buffer
1505
1771
 
1506
- # layer defaults to index 0
1507
- if layer is None:
1508
- layer = 0
1772
+ if encoding:
1773
+ override_shape_encoding = True
1774
+ prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding)
1509
1775
 
1510
- try:
1511
1776
  dataset_options = dict_to_options(dataset_kwargs)
1512
- ogr_dataset = ogr_open(path_c, 0, dataset_options)
1777
+ ogr_dataset = ogr_open(path.encode('UTF-8'), 0, dataset_options)
1778
+
1779
+ if layer is None:
1780
+ layer = get_default_layer(ogr_dataset)
1513
1781
  ogr_layer = get_ogr_layer(ogr_dataset, layer)
1514
1782
 
1515
- # Encoding is derived from the user, from the dataset capabilities / type,
1516
- # or from the system locale
1517
- encoding = (
1518
- encoding
1519
- or detect_encoding(ogr_dataset, ogr_layer)
1520
- or locale.getpreferredencoding()
1521
- )
1783
+ if encoding and get_driver(ogr_dataset) == "ESRI Shapefile":
1784
+ encoding = "UTF-8"
1785
+ else:
1786
+ encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
1522
1787
 
1523
1788
  fields = get_fields(ogr_layer, encoding)
1524
1789
 
1525
1790
  meta = {
1526
- 'crs': get_crs(ogr_layer),
1527
- 'encoding': encoding,
1528
- 'fields': fields[:,2], # return only names
1529
- 'dtypes': fields[:,3],
1530
- 'geometry_type': get_geometry_type(ogr_layer),
1531
- 'features': get_feature_count(ogr_layer, force_feature_count),
1532
- 'total_bounds': get_total_bounds(ogr_layer, force_total_bounds),
1533
- 'driver': get_driver(ogr_dataset),
1791
+ "layer_name": get_string(OGR_L_GetName(ogr_layer)),
1792
+ "crs": get_crs(ogr_layer),
1793
+ "encoding": encoding,
1794
+ "fields": fields[:,2], # return only names
1795
+ "dtypes": fields[:,3],
1796
+ "fid_column": get_string(OGR_L_GetFIDColumn(ogr_layer)),
1797
+ "geometry_name": get_string(OGR_L_GetGeometryColumn(ogr_layer)),
1798
+ "geometry_type": get_geometry_type(ogr_layer),
1799
+ "features": get_feature_count(ogr_layer, force_feature_count),
1800
+ "total_bounds": get_total_bounds(ogr_layer, force_total_bounds),
1801
+ "driver": get_driver(ogr_dataset),
1534
1802
  "capabilities": {
1535
1803
  "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead) == 1,
1536
1804
  "fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex) == 1,
@@ -1538,8 +1806,8 @@ def ogr_read_info(
1538
1806
  "fast_feature_count": OGR_L_TestCapability(ogr_layer, OLCFastFeatureCount) == 1,
1539
1807
  "fast_total_bounds": OGR_L_TestCapability(ogr_layer, OLCFastGetExtent) == 1,
1540
1808
  },
1541
- 'layer_metadata': get_metadata(ogr_layer),
1542
- 'dataset_metadata': get_metadata(ogr_dataset),
1809
+ "layer_metadata": get_metadata(ogr_layer),
1810
+ "dataset_metadata": get_metadata(ogr_dataset),
1543
1811
  }
1544
1812
 
1545
1813
  finally:
@@ -1551,19 +1819,88 @@ def ogr_read_info(
1551
1819
  GDALClose(ogr_dataset)
1552
1820
  ogr_dataset = NULL
1553
1821
 
1822
+ # reset SHAPE_ENCODING config parameter if temporarily set above
1823
+ if override_shape_encoding:
1824
+ CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding)
1825
+
1826
+ if prev_shape_encoding != NULL:
1827
+ CPLFree(<void*>prev_shape_encoding)
1828
+
1829
+ if is_vsimem:
1830
+ delete_vsimem_file(path)
1831
+
1554
1832
  return meta
1555
1833
 
1556
1834
 
1557
- def ogr_list_layers(str path):
1835
+ def ogr_list_layers(object path_or_buffer):
1836
+ cdef bint is_vsimem = isinstance(path_or_buffer, bytes)
1558
1837
  cdef const char *path_c = NULL
1559
- cdef const char *ogr_name = NULL
1560
1838
  cdef OGRDataSourceH ogr_dataset = NULL
1561
- cdef OGRLayerH ogr_layer = NULL
1562
1839
 
1563
- path_b = path.encode('utf-8')
1564
- path_c = path_b
1840
+ try:
1841
+ path = read_buffer_to_vsimem(path_or_buffer) if is_vsimem else path_or_buffer
1842
+ ogr_dataset = ogr_open(path.encode('UTF-8'), 0, NULL)
1843
+ layers = get_layer_names(ogr_dataset)
1844
+
1845
+ finally:
1846
+ if ogr_dataset != NULL:
1847
+ GDALClose(ogr_dataset)
1848
+ ogr_dataset = NULL
1849
+
1850
+ if is_vsimem:
1851
+ delete_vsimem_file(path)
1852
+
1853
+ return layers
1854
+
1855
+
1856
+ cdef str get_default_layer(OGRDataSourceH ogr_dataset):
1857
+ """ Get the layer in the dataset that is read by default.
1858
+
1859
+ The caller is responsible for closing the dataset.
1860
+
1861
+ Parameters
1862
+ ----------
1863
+ ogr_dataset : pointer to open OGR dataset
1864
+
1865
+ Returns
1866
+ -------
1867
+ str
1868
+ the name of the default layer to be read.
1869
+
1870
+ """
1871
+ layers = get_layer_names(ogr_dataset)
1872
+ first_layer_name = layers[0][0]
1873
+
1874
+ if len(layers) > 1:
1875
+ dataset_name = os.path.basename(get_string(OGR_DS_GetName(ogr_dataset)))
1876
+
1877
+ other_layer_names = ', '.join([f"'{l}'" for l in layers[1:, 0]])
1878
+ warnings.warn(
1879
+ f"More than one layer found in '{dataset_name}': '{first_layer_name}' "
1880
+ f"(default), {other_layer_names}. Specify layer parameter to avoid this "
1881
+ "warning.",
1882
+ stacklevel=2,
1883
+ )
1565
1884
 
1566
- ogr_dataset = ogr_open(path_c, 0, NULL)
1885
+ return first_layer_name
1886
+
1887
+
1888
+ cdef get_layer_names(OGRDataSourceH ogr_dataset):
1889
+ """ Get the layers in the dataset.
1890
+
1891
+ The caller is responsible for closing the dataset.
1892
+
1893
+ Parameters
1894
+ ----------
1895
+ ogr_dataset : pointer to open OGR dataset
1896
+
1897
+ Returns
1898
+ -------
1899
+ ndarray(n, 2)
1900
+ array of [layer name, layer geometry type] pairs
1901
+
1902
+ """
1903
+ cdef OGRLayerH ogr_layer = NULL
1567
1904
 
1568
1905
  layer_count = GDALDatasetGetLayerCount(ogr_dataset)
1569
1906
 
@@ -1575,10 +1912,6 @@ def ogr_list_layers(str path):
1575
1912
  data_view[i, 0] = get_string(OGR_L_GetName(ogr_layer))
1576
1913
  data_view[i, 1] = get_geometry_type(ogr_layer)
1577
1914
 
1578
- if ogr_dataset != NULL:
1579
- GDALClose(ogr_dataset)
1580
- ogr_dataset = NULL
1581
-
1582
1915
  return data
1583
1916
 
1584
1917
 
@@ -1659,10 +1992,10 @@ cdef infer_field_types(list dtypes):
1659
1992
  field_types_view[i, 0] = OFTString
1660
1993
  # Convert to unicode string then take itemsize
1661
1994
  # TODO: better implementation of this
1662
- # width = values.astype(np.unicode_).dtype.itemsize // 4
1995
+ # width = values.astype(np.str_).dtype.itemsize // 4
1663
1996
  # DO WE NEED WIDTH HERE?
1664
1997
 
1665
- elif dtype.type is np.unicode_ or dtype.type is np.string_:
1998
+ elif dtype.type is np.str_ or dtype.type is np.bytes_:
1666
1999
  field_types_view[i, 0] = OFTString
1667
2000
  field_types_view[i, 2] = int(dtype.itemsize // 4)
1668
2001
 
@@ -1679,14 +2012,49 @@ cdef infer_field_types(list dtypes):
1679
2012
  return field_types
1680
2013
 
1681
2014
 
1682
- # TODO: set geometry and field data as memory views?
1683
- def ogr_write(
1684
- str path, str layer, str driver, geometry, fields, field_data, field_mask,
1685
- str crs, str geometry_type, str encoding, object dataset_kwargs,
1686
- object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
1687
- bint append=False, dataset_metadata=None, layer_metadata=None,
1688
- gdal_tz_offsets=None
2015
+ cdef create_ogr_dataset_layer(
2016
+ str path,
2017
+ bint is_vsi,
2018
+ str layer,
2019
+ str driver,
2020
+ str crs,
2021
+ str geometry_type,
2022
+ str encoding,
2023
+ object dataset_kwargs,
2024
+ object layer_kwargs,
2025
+ bint append,
2026
+ dataset_metadata,
2027
+ layer_metadata,
2028
+ OGRDataSourceH* ogr_dataset_out,
2029
+ OGRLayerH* ogr_layer_out,
1689
2030
  ):
2031
+ """
2032
+ Construct the OGRDataSource and OGRLayer objects based on input
2033
+ path and layer.
2034
+
2035
+ If the file already exists, will open the existing dataset and overwrite
2036
+ or append the layer (depending on `append`), otherwise will create a new
2037
+ dataset.
2038
+
2039
+ Fills in the `ogr_dataset_out` and `ogr_layer_out` pointers passed as
2040
+ parameter with initialized objects (or raise an error if it fails to do so).
2041
+ It is the responsibility of the caller to clean up those objects after use.
2042
+ Returns whether a new layer was created or not (when the layer was created,
2043
+ the caller still needs to set up the layer definition, i.e. create the
2044
+ fields).
2045
+
2046
+ Parameters
2047
+ ----------
2048
+ encoding : str
2049
+ Only used if `driver` is "ESRI Shapefile". If not None, it overrules the default
2050
+ shapefile encoding, which is "UTF-8" in pyogrio.
2051
+
2052
+ Returns
2053
+ -------
2054
+ bool :
2055
+ Whether a new layer was created, or False if we are appending to an
2056
+ existing layer.
2057
+ """
1690
2058
  cdef const char *path_c = NULL
1691
2059
  cdef const char *layer_c = NULL
1692
2060
  cdef const char *driver_c = NULL
@@ -1697,55 +2065,9 @@ def ogr_write(
1697
2065
  cdef const char *ogr_name = NULL
1698
2066
  cdef OGRDataSourceH ogr_dataset = NULL
1699
2067
  cdef OGRLayerH ogr_layer = NULL
1700
- cdef OGRFeatureH ogr_feature = NULL
1701
- cdef OGRGeometryH ogr_geometry = NULL
1702
- cdef OGRGeometryH ogr_geometry_multi = NULL
1703
- cdef OGRFeatureDefnH ogr_featuredef = NULL
1704
- cdef OGRFieldDefnH ogr_fielddef = NULL
1705
- cdef unsigned char *wkb_buffer = NULL
1706
2068
  cdef OGRSpatialReferenceH ogr_crs = NULL
1707
- cdef int layer_idx = -1
1708
- cdef int supports_transactions = 0
1709
2069
  cdef OGRwkbGeometryType geometry_code
1710
- cdef int err = 0
1711
- cdef int i = 0
1712
- cdef int num_records = -1
1713
- cdef int num_field_data = len(field_data) if field_data is not None else 0
1714
- cdef int num_fields = len(fields) if fields is not None else 0
1715
-
1716
- if num_fields != num_field_data:
1717
- raise ValueError("field_data array needs to be same length as fields array")
1718
-
1719
- if num_fields == 0 and geometry is None:
1720
- raise ValueError("You must provide at least a geometry column or a field")
1721
-
1722
- if num_fields > 0:
1723
- num_records = len(field_data[0])
1724
- for i in range(1, len(field_data)):
1725
- if len(field_data[i]) != num_records:
1726
- raise ValueError("field_data arrays must be same length")
1727
-
1728
- if geometry is None:
1729
- # If no geometry data, we ignore the geometry_type and don't create a geometry
1730
- # column
1731
- geometry_type = None
1732
- else:
1733
- if num_fields > 0:
1734
- if len(geometry) != num_records:
1735
- raise ValueError(
1736
- "field_data arrays must be same length as geometry array"
1737
- )
1738
- else:
1739
- num_records = len(geometry)
1740
-
1741
- if field_mask is not None:
1742
- if len(field_data) != len(field_mask):
1743
- raise ValueError("field_data and field_mask must be same length")
1744
- for i in range(0, len(field_mask)):
1745
- if field_mask[i] is not None and len(field_mask[i]) != num_records:
1746
- raise ValueError("field_mask arrays must be same length as geometry array")
1747
- else:
1748
- field_mask = [None] * num_fields
2070
+ cdef int layer_idx = -1
1749
2071
 
1750
2072
  path_b = path.encode('UTF-8')
1751
2073
  path_c = path_b
@@ -1753,22 +2075,22 @@ def ogr_write(
1753
2075
  driver_b = driver.encode('UTF-8')
1754
2076
  driver_c = driver_b
1755
2077
 
2078
+ # in-memory dataset is always created from scratch
2079
+ path_exists = os.path.exists(path) if not is_vsi else False
2080
+
1756
2081
  if not layer:
1757
2082
  layer = os.path.splitext(os.path.split(path)[1])[0]
1758
2083
 
1759
- if gdal_tz_offsets is None:
1760
- gdal_tz_offsets = {}
1761
-
1762
-
1763
2084
  # if shapefile, GeoJSON, or FlatGeobuf, always delete first
1764
2085
  # for other types, check if we can create layers
1765
2086
  # GPKG might be the only multi-layer writeable type. TODO: check this
1766
- if driver in ('ESRI Shapefile', 'GeoJSON', 'GeoJSONSeq', 'FlatGeobuf') and os.path.exists(path):
2087
+ if driver in ('ESRI Shapefile', 'GeoJSON', 'GeoJSONSeq', 'FlatGeobuf') and path_exists:
1767
2088
  if not append:
1768
2089
  os.unlink(path)
2090
+ path_exists = False
1769
2091
 
1770
2092
  layer_exists = False
1771
- if os.path.exists(path):
2093
+ if path_exists:
1772
2094
  try:
1773
2095
  ogr_dataset = ogr_open(path_c, 1, NULL)
1774
2096
 
@@ -1790,7 +2112,11 @@ def ogr_write(
1790
2112
  raise exc
1791
2113
 
1792
2114
  # otherwise create from scratch
1793
- os.unlink(path)
2115
+ if is_vsi:
2116
+ VSIUnlink(path_c)
2117
+ else:
2118
+ os.unlink(path)
2119
+
1794
2120
  ogr_dataset = NULL
1795
2121
 
1796
2122
  # either it didn't exist or could not open it in write mode
@@ -1808,25 +2134,20 @@ def ogr_write(
1808
2134
  if crs is not None:
1809
2135
  try:
1810
2136
  ogr_crs = create_crs(crs)
2137
+ # force geographic CRS to use lon, lat order and ignore axis order specified by CRS, in order
2138
+ # to correctly write KML and GeoJSON coordinates in correct order
2139
+ OSRSetAxisMappingStrategy(ogr_crs, OAMS_TRADITIONAL_GIS_ORDER)
2140
+
1811
2141
 
1812
2142
  except Exception as exc:
1813
- OGRReleaseDataSource(ogr_dataset)
1814
- ogr_dataset = NULL
1815
2143
  if dataset_options != NULL:
1816
2144
  CSLDestroy(dataset_options)
1817
2145
  dataset_options = NULL
1818
- raise exc
1819
2146
 
1820
- # Setup layer creation options
1821
- if not encoding:
1822
- encoding = locale.getpreferredencoding()
2147
+ GDALClose(ogr_dataset)
2148
+ ogr_dataset = NULL
1823
2149
 
1824
- if driver == 'ESRI Shapefile':
1825
- # Fiona only sets encoding for shapefiles; other drivers do not support
1826
- # encoding as an option.
1827
- encoding_b = encoding.upper().encode('UTF-8')
1828
- encoding_c = encoding_b
1829
- layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
2150
+ raise exc
1830
2151
 
1831
2152
  # Setup other layer creation options
1832
2153
  for k, v in layer_kwargs.items():
@@ -1834,6 +2155,21 @@ def ogr_write(
1834
2155
  v = v.encode('UTF-8')
1835
2156
  layer_options = CSLAddNameValue(layer_options, <const char *>k, <const char *>v)
1836
2157
 
2158
+ if driver == 'ESRI Shapefile':
2159
+ # ENCODING option must be set for shapefiles to properly write *.cpg
2160
+ # file containing the encoding; this is not a supported option for
2161
+ # other drivers. This is done after setting general options above
2162
+ # to override ENCODING if passed by the user as a layer option.
2163
+ if encoding and "ENCODING" in layer_kwargs:
2164
+ raise ValueError('cannot provide both encoding parameter and "ENCODING" layer creation option; use the encoding parameter')
2165
+
2166
+ # always write to UTF-8 if encoding is not set
2167
+ encoding = encoding or "UTF-8"
2168
+ encoding_b = encoding.upper().encode('UTF-8')
2169
+ encoding_c = encoding_b
2170
+ layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
2171
+
2172
+
1837
2173
  ### Get geometry type
1838
2174
  # TODO: this is brittle for 3D / ZM / M types
1839
2175
  # TODO: fail on M / ZM types
@@ -1856,7 +2192,7 @@ def ogr_write(
1856
2192
  set_metadata(ogr_layer, layer_metadata)
1857
2193
 
1858
2194
  except Exception as exc:
1859
- OGRReleaseDataSource(ogr_dataset)
2195
+ GDALClose(ogr_dataset)
1860
2196
  ogr_dataset = NULL
1861
2197
  raise DataLayerError(str(exc))
1862
2198
 
@@ -1873,60 +2209,152 @@ def ogr_write(
1873
2209
  CSLDestroy(layer_options)
1874
2210
  layer_options = NULL
1875
2211
 
1876
- ### Create the fields
1877
- field_types = None
2212
+ ogr_dataset_out[0] = ogr_dataset
2213
+ ogr_layer_out[0] = ogr_layer
2214
+
2215
+ return create_layer
2216
+
2217
+
2218
+ # TODO: set geometry and field data as memory views?
2219
+ def ogr_write(
2220
+ object path_or_fp,
2221
+ str layer,
2222
+ str driver,
2223
+ geometry,
2224
+ fields,
2225
+ field_data,
2226
+ field_mask,
2227
+ str crs,
2228
+ str geometry_type,
2229
+ str encoding,
2230
+ object dataset_kwargs,
2231
+ object layer_kwargs,
2232
+ bint promote_to_multi=False,
2233
+ bint nan_as_null=True,
2234
+ bint append=False,
2235
+ dataset_metadata=None,
2236
+ layer_metadata=None,
2237
+ gdal_tz_offsets=None
2238
+ ):
2239
+ cdef OGRDataSourceH ogr_dataset = NULL
2240
+ cdef OGRLayerH ogr_layer = NULL
2241
+ cdef OGRFeatureH ogr_feature = NULL
2242
+ cdef OGRGeometryH ogr_geometry = NULL
2243
+ cdef OGRGeometryH ogr_geometry_multi = NULL
2244
+ cdef OGRFeatureDefnH ogr_featuredef = NULL
2245
+ cdef OGRFieldDefnH ogr_fielddef = NULL
2246
+ cdef unsigned char *wkb_buffer = NULL
2247
+ cdef int supports_transactions = 0
2248
+ cdef int err = 0
2249
+ cdef int i = 0
2250
+ cdef int num_records = -1
2251
+ cdef int num_field_data = len(field_data) if field_data is not None else 0
2252
+ cdef int num_fields = len(fields) if fields is not None else 0
2253
+ cdef bint is_vsi = False
2254
+
2255
+ if num_fields != num_field_data:
2256
+ raise ValueError("field_data array needs to be same length as fields array")
2257
+
2258
+ if num_fields == 0 and geometry is None:
2259
+ raise ValueError("You must provide at least a geometry column or a field")
2260
+
1878
2261
  if num_fields > 0:
1879
- field_types = infer_field_types([field.dtype for field in field_data])
2262
+ num_records = len(field_data[0])
2263
+ for i in range(1, len(field_data)):
2264
+ if len(field_data[i]) != num_records:
2265
+ raise ValueError("field_data arrays must be same length")
1880
2266
 
1881
- ### Create the fields
1882
- if create_layer:
1883
- for i in range(num_fields):
1884
- field_type, field_subtype, width, precision = field_types[i]
2267
+ if geometry is None:
2268
+ # If no geometry data, we ignore the geometry_type and don't create a geometry
2269
+ # column
2270
+ geometry_type = None
2271
+ else:
2272
+ if num_fields > 0:
2273
+ if len(geometry) != num_records:
2274
+ raise ValueError(
2275
+ "field_data arrays must be same length as geometry array"
2276
+ )
2277
+ else:
2278
+ num_records = len(geometry)
1885
2279
 
1886
- name_b = fields[i].encode(encoding)
1887
- try:
1888
- ogr_fielddef = exc_wrap_pointer(OGR_Fld_Create(name_b, field_type))
2280
+ if field_mask is not None:
2281
+ if len(field_data) != len(field_mask):
2282
+ raise ValueError("field_data and field_mask must be same length")
2283
+ for i in range(0, len(field_mask)):
2284
+ if field_mask[i] is not None and len(field_mask[i]) != num_records:
2285
+ raise ValueError("field_mask arrays must be same length as geometry array")
2286
+ else:
2287
+ field_mask = [None] * num_fields
1889
2288
 
1890
- # subtypes, see: https://gdal.org/development/rfc/rfc50_ogr_field_subtype.html
1891
- if field_subtype != OFSTNone:
1892
- OGR_Fld_SetSubType(ogr_fielddef, field_subtype)
2289
+ if gdal_tz_offsets is None:
2290
+ gdal_tz_offsets = {}
1893
2291
 
1894
- if width:
1895
- OGR_Fld_SetWidth(ogr_fielddef, width)
2292
+ try:
2293
+ # Setup in-memory handler if needed
2294
+ path = get_ogr_vsimem_write_path(path_or_fp, driver)
2295
+ is_vsi = path.startswith('/vsimem/')
2296
+
2297
+ # Setup dataset and layer
2298
+ layer_created = create_ogr_dataset_layer(
2299
+ path, is_vsi, layer, driver, crs, geometry_type, encoding,
2300
+ dataset_kwargs, layer_kwargs, append,
2301
+ dataset_metadata, layer_metadata,
2302
+ &ogr_dataset, &ogr_layer,
2303
+ )
1896
2304
 
1897
- # TODO: set precision
2305
+ if driver == 'ESRI Shapefile':
2306
+ # force encoding for remaining operations to be in UTF-8 (even if user
2307
+ # provides an encoding) because GDAL will automatically convert those to
2308
+ # the target encoding because ENCODING is set as a layer creation option
2309
+ encoding = "UTF-8"
1898
2310
 
1899
- except:
1900
- if ogr_fielddef != NULL:
1901
- OGR_Fld_Destroy(ogr_fielddef)
1902
- ogr_fielddef = NULL
2311
+ else:
2312
+ # Now the dataset and layer have been created, we can properly determine the
2313
+ # encoding. It is derived from the user, from the dataset capabilities / type,
2314
+ # or from the system locale
2315
+ encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)
1903
2316
 
1904
- OGRReleaseDataSource(ogr_dataset)
1905
- ogr_dataset = NULL
1906
- raise FieldError(f"Error creating field '{fields[i]}' from field_data") from None
2317
+ ### Create the fields
2318
+ field_types = None
2319
+ if num_fields > 0:
2320
+ field_types = infer_field_types([field.dtype for field in field_data])
1907
2321
 
1908
- try:
1909
- exc_wrap_int(OGR_L_CreateField(ogr_layer, ogr_fielddef, 1))
2322
+ if layer_created:
2323
+ for i in range(num_fields):
2324
+ field_type, field_subtype, width, precision = field_types[i]
1910
2325
 
1911
- except:
1912
- OGRReleaseDataSource(ogr_dataset)
1913
- ogr_dataset = NULL
1914
- raise FieldError(f"Error adding field '{fields[i]}' to layer") from None
2326
+ name_b = fields[i].encode(encoding)
2327
+ try:
2328
+ ogr_fielddef = exc_wrap_pointer(OGR_Fld_Create(name_b, field_type))
1915
2329
 
1916
- finally:
1917
- if ogr_fielddef != NULL:
1918
- OGR_Fld_Destroy(ogr_fielddef)
2330
+ # subtypes, see: https://gdal.org/development/rfc/rfc50_ogr_field_subtype.html
2331
+ if field_subtype != OFSTNone:
2332
+ OGR_Fld_SetSubType(ogr_fielddef, field_subtype)
1919
2333
 
2334
+ if width:
2335
+ OGR_Fld_SetWidth(ogr_fielddef, width)
1920
2336
 
1921
- ### Create the features
1922
- ogr_featuredef = OGR_L_GetLayerDefn(ogr_layer)
2337
+ # TODO: set precision
1923
2338
 
1924
- supports_transactions = OGR_L_TestCapability(ogr_layer, OLCTransactions)
1925
- if supports_transactions:
1926
- start_transaction(ogr_dataset, 0)
2339
+ exc_wrap_int(OGR_L_CreateField(ogr_layer, ogr_fielddef, 1))
1927
2340
 
1928
- for i in range(num_records):
1929
- try:
2341
+ except:
2342
+ raise FieldError(f"Error adding field '{fields[i]}' to layer") from None
2343
+
2344
+ finally:
2345
+ if ogr_fielddef != NULL:
2346
+ OGR_Fld_Destroy(ogr_fielddef)
2347
+ ogr_fielddef = NULL
2348
+
2349
+
2350
+ ### Create the features
2351
+ ogr_featuredef = OGR_L_GetLayerDefn(ogr_layer)
2352
+
2353
+ supports_transactions = OGR_L_TestCapability(ogr_layer, OLCTransactions)
2354
+ if supports_transactions:
2355
+ start_transaction(ogr_dataset, 0)
2356
+
2357
+ for i in range(num_records):
1930
2358
  # create the feature
1931
2359
  ogr_feature = OGR_F_Create(ogr_featuredef)
1932
2360
  if ogr_feature == NULL:
@@ -1947,9 +2375,6 @@ def ogr_write(
1947
2375
  wkb_buffer = wkb
1948
2376
  err = OGR_G_ImportFromWkb(ogr_geometry, wkb_buffer, len(wkb))
1949
2377
  if err:
1950
- if ogr_geometry != NULL:
1951
- OGR_G_DestroyGeometry(ogr_geometry)
1952
- ogr_geometry = NULL
1953
2378
  raise GeometryError(f"Could not create geometry from WKB at index {i}") from None
1954
2379
 
1955
2380
  # Convert to multi type
@@ -1964,6 +2389,7 @@ def ogr_write(
1964
2389
  # Set the geometry on the feature
1965
2390
  # this assumes ownership of the geometry and its cleanup
1966
2391
  err = OGR_F_SetGeometryDirectly(ogr_feature, ogr_geometry)
2392
+ ogr_geometry = NULL # to prevent cleanup after this point
1967
2393
  if err:
1968
2394
  raise GeometryError(f"Could not set geometry for feature at index {i}") from None
1969
2395
 
@@ -1977,7 +2403,6 @@ def ogr_write(
1977
2403
  OGR_F_SetFieldNull(ogr_feature, field_idx)
1978
2404
 
1979
2405
  elif field_type == OFTString:
1980
- # TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
1981
2406
  if (
1982
2407
  field_value is None
1983
2408
  or (isinstance(field_value, float) and isnan(field_value))
@@ -1989,7 +2414,7 @@ def ogr_write(
1989
2414
  field_value = str(field_value)
1990
2415
 
1991
2416
  try:
1992
- value_b = field_value.encode("UTF-8")
2417
+ value_b = field_value.encode(encoding)
1993
2418
  OGR_F_SetFieldString(ogr_feature, field_idx, value_b)
1994
2419
 
1995
2420
  except AttributeError:
@@ -2056,25 +2481,262 @@ def ogr_write(
2056
2481
  # Add feature to the layer
2057
2482
  try:
2058
2483
  exc_wrap_int(OGR_L_CreateFeature(ogr_layer, ogr_feature))
2484
+
2059
2485
  except CPLE_BaseError as exc:
2060
2486
  raise FeatureError(f"Could not add feature to layer at index {i}: {exc}") from None
2061
2487
 
2062
- finally:
2063
- if ogr_feature != NULL:
2064
- OGR_F_Destroy(ogr_feature)
2065
- ogr_feature = NULL
2488
+ OGR_F_Destroy(ogr_feature)
2489
+ ogr_feature = NULL
2066
2490
 
2067
- if supports_transactions:
2068
- commit_transaction(ogr_dataset)
2069
2491
 
2070
- log.info(f"Created {num_records:,} records" )
2492
+ if supports_transactions:
2493
+ commit_transaction(ogr_dataset)
2071
2494
 
2072
- ### Final cleanup
2073
- if ogr_dataset != NULL:
2074
- GDALClose(ogr_dataset)
2495
+ log.info(f"Created {num_records:,} records" )
2496
+
2497
+ # close dataset to force driver to flush data
2498
+ exc = ogr_close(ogr_dataset)
2499
+ ogr_dataset = NULL
2500
+ if exc:
2501
+ raise DataSourceError(f"Failed to write features to dataset {path}; {exc}")
2502
+
2503
+ # copy in-memory file back to path_or_fp object
2504
+ if is_vsi:
2505
+ read_vsimem_to_buffer(path, path_or_fp)
2506
+
2507
+ finally:
2508
+ ### Final cleanup
2509
+ # make sure that all objects allocated above are released if exceptions
2510
+ # are raised, and the dataset is closed
2511
+ if ogr_fielddef != NULL:
2512
+ OGR_Fld_Destroy(ogr_fielddef)
2513
+ ogr_fielddef = NULL
2514
+
2515
+ if ogr_feature != NULL:
2516
+ OGR_F_Destroy(ogr_feature)
2517
+ ogr_feature = NULL
2518
+
2519
+ if ogr_geometry != NULL:
2520
+ OGR_G_DestroyGeometry(ogr_geometry)
2521
+ ogr_geometry = NULL
2522
+
2523
+ if ogr_dataset != NULL:
2524
+ ogr_close(ogr_dataset)
2525
+
2526
+ if is_vsi:
2527
+ delete_vsimem_file(path)
2528
+
2529
+
2530
def ogr_write_arrow(
    object path_or_fp,
    str layer,
    str driver,
    object arrow_obj,
    str crs,
    str geometry_type,
    str geometry_name,
    str encoding,
    object dataset_kwargs,
    object layer_kwargs,
    bint append=False,
    dataset_metadata=None,
    layer_metadata=None,
):
    """Write the record batches of an Arrow stream to an OGR layer.

    The dataset and layer are obtained through ``create_ogr_dataset_layer``.
    The stream exported by ``arrow_obj.__arrow_c_stream__()`` is then consumed
    batch by batch, and every batch is handed to ``OGR_L_WriteArrowBatch``.
    Fields are created from the Arrow schema only when
    ``create_ogr_dataset_layer`` reports that the layer was newly created.

    When the write path starts with ``/vsimem/``, the in-memory file is copied
    back into ``path_or_fp`` after the dataset has been closed, and the
    in-memory file is deleted during cleanup.

    Parameters
    ----------
    path_or_fp : object
        Write target; resolved to a GDAL path by ``get_ogr_vsimem_write_path``.
    arrow_obj : object
        Object exposing ``__arrow_c_stream__()``, which must return a capsule
        named ``"arrow_array_stream"``.
    geometry_name : str
        When truthy, passed to GDAL as the ``GEOMETRY_NAME`` option and used to
        skip field creation for the column of that name.
    encoding : str
        Must be empty/None or a spelling of UTF-8 unless ``driver`` is
        ``"ESRI Shapefile"``.
    layer, driver, crs, geometry_type, dataset_kwargs, layer_kwargs, append, \
dataset_metadata, layer_metadata
        Forwarded unchanged to ``create_ogr_dataset_layer``.

    Raises
    ------
    RuntimeError
        If built against GDAL < 3.8, or if the Arrow stream cannot be
        extracted, was already released, or fails to yield its schema or a
        batch.
    ValueError
        If a non-UTF-8 encoding is requested for a driver other than
        ``"ESRI Shapefile"``.
    DataLayerError
        If GDAL fails to write a batch.
    DataSourceError
        If closing the dataset reports an error.
    """
    IF CTE_GDAL_VERSION < (3, 8, 0):
        raise RuntimeError("Need GDAL>=3.8 for Arrow write support")

    cdef OGRDataSourceH ogr_dataset = NULL
    cdef OGRLayerH ogr_layer = NULL
    cdef char **options = NULL
    cdef bint is_vsi = False
    cdef ArrowArrayStream* stream = NULL
    cdef ArrowSchema schema
    cdef ArrowArray array

    # Mark both structs as "not holding anything" so that the cleanup in the
    # finally clause can tell whether they were ever populated.
    schema.release = NULL
    array.release = NULL

    try:
        path = get_ogr_vsimem_write_path(path_or_fp, driver)
        is_vsi = path.startswith('/vsimem/')

        layer_created = create_ogr_dataset_layer(
            path, is_vsi, layer, driver, crs, geometry_type, encoding,
            dataset_kwargs, layer_kwargs, append,
            dataset_metadata, layer_metadata,
            &ogr_dataset, &ogr_layer,
        )

        # only shapefile supports non-UTF encoding because ENCODING option is set
        # during dataset creation and GDAL auto-translates from UTF-8 values to that
        # encoding
        is_utf8 = not encoding or encoding.replace('-','').upper() == 'UTF8'
        if not is_utf8 and driver != 'ESRI Shapefile':
            raise ValueError("non-UTF-8 encoding is not supported for Arrow; use the non-Arrow interface instead")

        options = dict_to_options(
            {"GEOMETRY_NAME": geometry_name} if geometry_name else {}
        )

        # Keep a reference to the capsule for as long as ``stream`` is in use.
        capsule = arrow_obj.__arrow_c_stream__()
        stream = <ArrowArrayStream*>PyCapsule_GetPointer(
            capsule, "arrow_array_stream"
        )

        if stream == NULL:
            raise RuntimeError("Could not extract valid Arrow array stream.")

        if stream.release == NULL:
            raise RuntimeError("Arrow array stream was already released.")

        if stream.get_schema(stream, &schema) != 0:
            raise RuntimeError("Could not get Arrow schema from stream.")

        if layer_created:
            create_fields_from_arrow_schema(ogr_layer, &schema, options, geometry_name)

        while True:
            if stream.get_next(stream, &array) != 0:
                raise RuntimeError("Error while accessing batch from stream.")

            # We've reached the end of the stream
            if array.release == NULL:
                break

            if not OGR_L_WriteArrowBatch(ogr_layer, &schema, &array, options):
                exc = exc_check()
                if exc:
                    gdal_msg = f": {str(exc)}"
                else:
                    gdal_msg = "."
                raise DataLayerError(
                    f"Error while writing batch to OGR layer{gdal_msg}"
                )

            # Release the batch unless it has already been released.
            if array.release != NULL:
                array.release(&array)

        # close dataset to force driver to flush data
        exc = ogr_close(ogr_dataset)
        ogr_dataset = NULL
        if exc:
            raise DataSourceError(f"Failed to write features to dataset {path}; {exc}")

        # copy in-memory file back to path_or_fp object
        if is_vsi:
            read_vsimem_to_buffer(path, path_or_fp)

    finally:
        # Release everything that is still held, whether or not an exception
        # was raised above.
        if stream != NULL and stream.release != NULL:
            stream.release(stream)

        if schema.release != NULL:
            schema.release(&schema)

        if array.release != NULL:
            array.release(&array)

        if options != NULL:
            CSLDestroy(options)
            options = NULL

        if ogr_dataset != NULL:
            ogr_close(ogr_dataset)

        if is_vsi:
            delete_vsimem_file(path)
2647
+
2648
+
2649
cdef get_arrow_extension_metadata(const ArrowSchema* schema):
    """
    Parse the metadata of the ArrowSchema and extract extension type
    metadata (extension name and metadata).

    For the exact layout of the bytes, see
    https://arrow.apache.org/docs/dev/format/CDataInterface.html#c.ArrowSchema.metadata

    Returns
    -------
    tuple
        ``(extension_name, extension_metadata)``. Each item is the raw bytes
        value stored under the ``ARROW:extension:name`` /
        ``ARROW:extension:metadata`` key, or None when that key is absent or
        the schema carries no metadata at all.
    """
    cdef const char *metadata = schema.metadata

    extension_name = None
    extension_metadata = None

    if metadata == NULL:
        return extension_name, extension_metadata

    # the number of metadata key/value pairs is stored
    # as an int32 value in the first 4 bytes; decode it as signed, like the
    # lengths below, so that a corrupt negative count results in zero
    # iterations instead of a huge unsigned count that would walk past the
    # end of the buffer
    n = int.from_bytes(metadata[:4], byteorder=sys.byteorder, signed=True)
    pos = 4

    for _ in range(n):
        # for each metadata key/value pair, the first 4 bytes is the byte length
        # of the key as an int32, then follows the key (not null-terminated),
        # and then the same for the value length and bytes
        key_length = int.from_bytes(
            metadata[pos:pos+4], byteorder=sys.byteorder, signed=True
        )
        pos += 4
        key = metadata[pos:pos+key_length]
        pos += key_length
        value_length = int.from_bytes(
            metadata[pos:pos+4], byteorder=sys.byteorder, signed=True
        )
        pos += 4
        value = metadata[pos:pos+value_length]
        pos += value_length

        if key == b"ARROW:extension:name":
            extension_name = value
        elif key == b"ARROW:extension:metadata":
            extension_metadata = value

        # stop scanning as soon as both entries of interest have been found
        if extension_name is not None and extension_metadata is not None:
            break

    return extension_name, extension_metadata
2696
+
2697
+
2698
cdef is_arrow_geometry_field(const ArrowSchema* schema):
    """
    Tell whether an Arrow field is a WKB geometry column, based on the
    extension name found in its schema metadata.

    Returns True for the ``geoarrow.wkb`` and ``ogc.wkb`` extension types and
    False when there is no extension name or it is an unrelated one.

    Raises
    ------
    NotImplementedError
        If the extension name starts with ``geoarrow.`` but is not
        ``geoarrow.wkb``.
    """
    name, _ = get_arrow_extension_metadata(schema)

    if name is None:
        return False

    if name in (b"geoarrow.wkb", b"ogc.wkb"):
        return True

    if not name.startswith(b"geoarrow."):
        return False

    # raise an error for other geoarrow types
    raise NotImplementedError(
        f"Writing a geometry column of type {name.decode()} is not yet "
        "supported. Only WKB is currently supported ('geoarrow.wkb' or "
        "'ogc.wkb' types)."
    )
2713
+
2714
+
2715
cdef create_fields_from_arrow_schema(
    OGRLayerH destLayer, const ArrowSchema* schema, char** options, str geometry_name
):
    """Create output fields using CreateFieldFromArrowSchema()

    Every child of ``schema`` is treated as one column. A child is skipped
    when its name equals ``geometry_name`` or when it is recognised as a
    geometry field by ``is_arrow_geometry_field``; every other child is passed
    to ``OGR_L_CreateFieldFromArrowSchema`` together with ``options``.

    Raises
    ------
    RuntimeError
        If built against GDAL < 3.8, or if the schema has a NULL child.
    FieldError
        If GDAL fails to create a field from a child schema.
    """

    IF CTE_GDAL_VERSION < (3, 8, 0):
        raise RuntimeError("Need GDAL>=3.8 for Arrow write support")

    # The schema object is a struct type where each child is a column.
    cdef ArrowSchema* child
    for i in range(schema.n_children):
        child = schema.children[i]

        if child == NULL:
            raise RuntimeError("Received invalid Arrow schema (null child)")

        # Don't create property for geometry column
        if get_string(child.name) == geometry_name:
            continue
        if is_arrow_geometry_field(child):
            continue

        # Field created successfully: move on to the next column.
        if OGR_L_CreateFieldFromArrowSchema(destLayer, child, options):
            continue

        exc = exc_check()
        if exc:
            gdal_msg = f" ({str(exc)})"
        else:
            gdal_msg = ""
        raise FieldError(
            f"Error while creating field from Arrow for field {i} with name "
            f"'{get_string(child.name)}' and type {get_string(child.format)}"
            f"{gdal_msg}."
        )