pyogrio 0.7.2-cp39-cp39-manylinux_2_28_aarch64.whl → 0.9.0-cp39-cp39-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)
  1. pyogrio/__init__.py +4 -0
  2. pyogrio/_compat.py +6 -1
  3. pyogrio/_err.cpython-39-aarch64-linux-gnu.so +0 -0
  4. pyogrio/_err.pyx +7 -3
  5. pyogrio/_geometry.cpython-39-aarch64-linux-gnu.so +0 -0
  6. pyogrio/_io.cpython-39-aarch64-linux-gnu.so +0 -0
  7. pyogrio/_io.pyx +904 -242
  8. pyogrio/_ogr.cpython-39-aarch64-linux-gnu.so +0 -0
  9. pyogrio/_ogr.pxd +69 -13
  10. pyogrio/_ogr.pyx +8 -24
  11. pyogrio/_version.py +3 -3
  12. pyogrio/_vsi.cpython-39-aarch64-linux-gnu.so +0 -0
  13. pyogrio/_vsi.pxd +4 -0
  14. pyogrio/_vsi.pyx +140 -0
  15. pyogrio/core.py +43 -44
  16. pyogrio/gdal_data/GDAL-targets-release.cmake +3 -3
  17. pyogrio/gdal_data/GDAL-targets.cmake +10 -6
  18. pyogrio/gdal_data/GDALConfigVersion.cmake +3 -3
  19. pyogrio/gdal_data/gdalinfo_output.schema.json +2 -0
  20. pyogrio/gdal_data/gdalvrt.xsd +163 -0
  21. pyogrio/gdal_data/ogrinfo_output.schema.json +12 -1
  22. pyogrio/gdal_data/vcpkg.spdx.json +26 -26
  23. pyogrio/gdal_data/vcpkg_abi_info.txt +27 -26
  24. pyogrio/geopandas.py +140 -34
  25. pyogrio/proj_data/ITRF2008 +2 -2
  26. pyogrio/proj_data/proj-config-version.cmake +2 -2
  27. pyogrio/proj_data/proj-config.cmake +2 -1
  28. pyogrio/proj_data/proj-targets.cmake +13 -13
  29. pyogrio/proj_data/proj.db +0 -0
  30. pyogrio/proj_data/proj4-targets.cmake +13 -13
  31. pyogrio/proj_data/vcpkg.spdx.json +20 -42
  32. pyogrio/proj_data/vcpkg_abi_info.txt +14 -15
  33. pyogrio/raw.py +438 -116
  34. pyogrio/tests/conftest.py +75 -6
  35. pyogrio/tests/fixtures/poly_not_enough_points.shp.zip +0 -0
  36. pyogrio/tests/test_arrow.py +841 -7
  37. pyogrio/tests/test_core.py +99 -7
  38. pyogrio/tests/test_geopandas_io.py +827 -121
  39. pyogrio/tests/test_path.py +23 -3
  40. pyogrio/tests/test_raw_io.py +276 -50
  41. pyogrio/util.py +39 -19
  42. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/METADATA +2 -2
  43. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/RECORD +210 -207
  44. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/WHEEL +1 -1
  45. pyogrio.libs/{libgdal-cb554135.so.33.3.7.2 → libgdal-6ff0914e.so.34.3.8.5} +0 -0
  46. pyogrio/tests/win32.py +0 -86
  47. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/LICENSE +0 -0
  48. {pyogrio-0.7.2.dist-info → pyogrio-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
  import contextlib
  from datetime import datetime
- import os
+ from io import BytesIO
+ import locale
+
  import numpy as np
  import pytest

- from pyogrio import list_layers, read_info, __gdal_version__
+ from pyogrio import list_layers, list_drivers, read_info, __gdal_version__
  from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
  from pyogrio.geopandas import read_dataframe, write_dataframe, PANDAS_GE_20
  from pyogrio.raw import (
@@ -14,10 +16,11 @@ from pyogrio.raw import (
  from pyogrio.tests.conftest import (
  ALL_EXTS,
  DRIVERS,
- requires_arrow_api,
+ requires_pyarrow_api,
+ requires_arrow_write_api,
  requires_gdal_geos,
  )
- from pyogrio._compat import PANDAS_GE_15
+ from pyogrio._compat import PANDAS_GE_15, HAS_ARROW_WRITE_API

  try:
  import pandas as pd
@@ -45,13 +48,30 @@ pytest.importorskip("geopandas")
  scope="session",
  params=[
  False,
- pytest.param(True, marks=requires_arrow_api),
+ pytest.param(True, marks=requires_pyarrow_api),
  ],
  )
  def use_arrow(request):
  return request.param


+ @pytest.fixture(autouse=True)
+ def skip_if_no_arrow_write_api(request):
+ # automatically skip tests with use_arrow=True and that require Arrow write
+ # API (marked with `@pytest.mark.requires_arrow_write_api`) if it is not available
+ use_arrow = (
+ request.getfixturevalue("use_arrow")
+ if "use_arrow" in request.fixturenames
+ else False
+ )
+ if (
+ use_arrow
+ and not HAS_ARROW_WRITE_API
+ and request.node.get_closest_marker("requires_arrow_write_api")
+ ):
+ pytest.skip("GDAL>=3.8 required for Arrow write API")
+
+
  def spatialite_available(path):
  try:
  _ = read_dataframe(
@@ -62,6 +82,45 @@ def spatialite_available(path):
  return False


+ @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
+ def test_read_csv_encoding(tmp_path, encoding):
+ # Write csv test file. Depending on the os this will be written in a different
+ # encoding: for linux and macos this is utf-8, for windows it is cp1252.
+ csv_path = tmp_path / "test.csv"
+ with open(csv_path, "w", encoding=encoding) as csv:
+ csv.write("näme,city\n")
+ csv.write("Wilhelm Röntgen,Zürich\n")
+
+ # Read csv. The data should be read with the same default encoding as the csv file
+ # was written in, but should have been converted to utf-8 in the dataframe returned.
+ # Hence, the asserts below, with strings in utf-8, be OK.
+ df = read_dataframe(csv_path, encoding=encoding)
+
+ assert len(df) == 1
+ assert df.columns.tolist() == ["näme", "city"]
+ assert df.city.tolist() == ["Zürich"]
+ assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
+ @pytest.mark.skipif(
+ locale.getpreferredencoding().upper() == "UTF-8",
+ reason="test requires non-UTF-8 default platform",
+ )
+ def test_read_csv_platform_encoding(tmp_path):
+ """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
+ csv_path = tmp_path / "test.csv"
+ with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
+ csv.write("näme,city\n")
+ csv.write("Wilhelm Röntgen,Zürich\n")
+
+ df = read_dataframe(csv_path)
+
+ assert len(df) == 1
+ assert df.columns.tolist() == ["näme", "city"]
+ assert df.city.tolist() == ["Zürich"]
+ assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
  def test_read_dataframe(naturalearth_lowres_all_ext):
  df = read_dataframe(naturalearth_lowres_all_ext)

@@ -77,8 +136,8 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
  ]


- def test_read_dataframe_vsi(naturalearth_lowres_vsi):
- df = read_dataframe(naturalearth_lowres_vsi[1])
+ def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
+ df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
  assert len(df) == 177


@@ -154,6 +213,7 @@ def test_read_force_2d(test_fgdb_vsi, use_arrow):


  @pytest.mark.filterwarnings("ignore: Measured")
+ @pytest.mark.filterwarnings("ignore: More than one layer found in")
  def test_read_layer(test_fgdb_vsi, use_arrow):
  layers = list_layers(test_fgdb_vsi)
  kwargs = {"use_arrow": use_arrow, "read_geometry": False, "max_features": 1}
@@ -186,8 +246,13 @@ def test_read_datetime(test_fgdb_vsi, use_arrow):
  assert df.SURVEY_DAT.dtype.name == "datetime64[ns]"


- def test_read_datetime_tz(test_datetime_tz, tmp_path):
+ @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
+ @pytest.mark.requires_arrow_write_api
+ def test_read_datetime_tz(test_datetime_tz, tmp_path, use_arrow):
  df = read_dataframe(test_datetime_tz)
+ # Make the index non-consecutive to test this case as well. Added for issue
+ # https://github.com/geopandas/pyogrio/issues/324
+ df = df.set_index(np.array([0, 2]))
  raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]

  if PANDAS_GE_20:
@@ -195,15 +260,22 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path):
  else:
  expected = pd.to_datetime(raw_expected)
  expected = pd.Series(expected, name="datetime_col")
- assert_series_equal(df.datetime_col, expected)
+ assert_series_equal(df.datetime_col, expected, check_index=False)
  # test write and read round trips
  fpath = tmp_path / "test.gpkg"
- write_dataframe(df, fpath)
- df_read = read_dataframe(fpath)
+ write_dataframe(df, fpath, use_arrow=use_arrow)
+ df_read = read_dataframe(fpath, use_arrow=use_arrow)
+ if use_arrow:
+ # with Arrow, the datetimes are always read as UTC
+ expected = expected.dt.tz_convert("UTC")
  assert_series_equal(df_read.datetime_col, expected)


- def test_write_datetime_mixed_offset(tmp_path):
+ @pytest.mark.filterwarnings(
+ "ignore: Non-conformant content for record 1 in column dates"
+ )
+ @pytest.mark.requires_arrow_write_api
+ def test_write_datetime_mixed_offset(tmp_path, use_arrow):
  # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
  dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
  naive_col = pd.Series(pd.to_datetime(dates), name="dates")
@@ -217,14 +289,18 @@ def test_write_datetime_mixed_offset(tmp_path):
  crs="EPSG:4326",
  )
  fpath = tmp_path / "test.gpkg"
- write_dataframe(df, fpath)
- result = read_dataframe(fpath)
+ write_dataframe(df, fpath, use_arrow=use_arrow)
+ result = read_dataframe(fpath, use_arrow=use_arrow)
  # GDAL tz only encodes offsets, not timezones
  # check multiple offsets are read as utc datetime instead of string values
  assert_series_equal(result["dates"], utc_col)


- def test_read_write_datetime_tz_with_nulls(tmp_path):
+ @pytest.mark.filterwarnings(
+ "ignore: Non-conformant content for record 1 in column dates"
+ )
+ @pytest.mark.requires_arrow_write_api
+ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
  dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
  if PANDAS_GE_20:
  dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
@@ -235,13 +311,18 @@ def test_read_write_datetime_tz_with_nulls(tmp_path):
  crs="EPSG:4326",
  )
  fpath = tmp_path / "test.gpkg"
- write_dataframe(df, fpath)
- result = read_dataframe(fpath)
+ write_dataframe(df, fpath, use_arrow=use_arrow)
+ result = read_dataframe(fpath, use_arrow=use_arrow)
+ if use_arrow:
+ # with Arrow, the datetimes are always read as UTC
+ df["dates"] = df["dates"].dt.tz_convert("UTC")
  assert_geodataframe_equal(df, result)


  def test_read_null_values(test_fgdb_vsi, use_arrow):
- df = read_dataframe(test_fgdb_vsi, use_arrow=use_arrow, read_geometry=False)
+ df = read_dataframe(
+ test_fgdb_vsi, layer="basetable_2", use_arrow=use_arrow, read_geometry=False
+ )

  # make sure that Null values are preserved
  assert df.SEGMENT_NAME.isnull().max()
@@ -331,6 +412,21 @@ def test_read_where_invalid(request, naturalearth_lowres_all_ext, use_arrow):
  )


+ def test_read_where_ignored_field(naturalearth_lowres, use_arrow):
+ # column included in where is not also included in list of columns, which means
+ # GDAL will return no features
+ # NOTE: this behavior is inconsistent across drivers so only shapefiles are
+ # tested for this
+ df = read_dataframe(
+ naturalearth_lowres,
+ where=""" "iso_a3" = 'CAN' """,
+ columns=["name"],
+ use_arrow=use_arrow,
+ )
+
+ assert len(df) == 0
+
+
  @pytest.mark.parametrize("bbox", [(1,), (1, 2), (1, 2, 3)])
  def test_read_bbox_invalid(naturalearth_lowres_all_ext, bbox, use_arrow):
  with pytest.raises(ValueError, match="Invalid bbox"):
@@ -349,7 +445,7 @@ def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected):
  if (
  use_arrow
  and __gdal_version__ < (3, 8, 0)
- and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg"
+ and naturalearth_lowres_all_ext.suffix == ".gpkg"
  ):
  pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

@@ -438,7 +534,7 @@ def test_read_mask(
  if (
  use_arrow
  and __gdal_version__ < (3, 8, 0)
- and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg"
+ and naturalearth_lowres_all_ext.suffix == ".gpkg"
  ):
  pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

@@ -470,14 +566,45 @@ def test_read_mask_where(naturalearth_lowres_all_ext, use_arrow):
  assert np.array_equal(df.iso_a3, ["CAN"])


- def test_read_fids(naturalearth_lowres_all_ext):
+ @pytest.mark.parametrize("fids", [[1, 5, 10], np.array([1, 5, 10], dtype=np.int64)])
+ def test_read_fids(naturalearth_lowres_all_ext, fids, use_arrow):
  # ensure keyword is properly passed through
- fids = np.array([1, 10, 5], dtype=np.int64)
- df = read_dataframe(naturalearth_lowres_all_ext, fids=fids, fid_as_index=True)
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, fids=fids, fid_as_index=True, use_arrow=use_arrow
+ )
  assert len(df) == 3
  assert np.array_equal(fids, df.index.values)


+ @requires_pyarrow_api
+ def test_read_fids_arrow_max_exception(naturalearth_lowres):
+ # Maximum number at time of writing is 4997 for "OGRSQL". For e.g. for SQLite based
+ # formats like Geopackage, there is no limit.
+ nb_fids = 4998
+ fids = range(nb_fids)
+ with pytest.raises(ValueError, match=f"error applying filter for {nb_fids} fids"):
+ _ = read_dataframe(naturalearth_lowres, fids=fids, use_arrow=True)
+
+
+ @requires_pyarrow_api
+ @pytest.mark.skipif(
+ __gdal_version__ >= (3, 8, 0), reason="GDAL >= 3.8.0 does not need to warn"
+ )
+ def test_read_fids_arrow_warning_old_gdal(naturalearth_lowres_all_ext):
+ # A warning should be given for old GDAL versions, except for some file formats.
+ if naturalearth_lowres_all_ext.suffix not in [".gpkg", ".geojson"]:
+ handler = pytest.warns(
+ UserWarning,
+ match="Using 'fids' and 'use_arrow=True' with GDAL < 3.8 can be slow",
+ )
+ else:
+ handler = contextlib.nullcontext()
+
+ with handler:
+ df = read_dataframe(naturalearth_lowres_all_ext, fids=[22], use_arrow=True)
+ assert len(df) == 1
+
+
  def test_read_fids_force_2d(test_fgdb_vsi):
  with pytest.warns(
  UserWarning, match=r"Measured \(M\) geometry types are not supported"
@@ -573,13 +700,17 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
  # The geometry column cannot be specified when using the
  # default OGRSQL dialect but is returned nonetheless, so 4 columns.
  sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df.columns) == 4
  assert len(df) == 177

  # Should return single row
  sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df) == 1
  assert len(df.columns) == 6
  assert df.iloc[0].iso_a3 == "CAN"
@@ -587,7 +718,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
  sql = """SELECT *
  FROM naturalearth_lowres
  WHERE iso_a3 IN ('CAN', 'USA', 'MEX')"""
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df.columns) == 6
  assert len(df) == 3
  assert df.iso_a3.tolist() == ["CAN", "USA", "MEX"]
@@ -596,7 +729,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
  FROM naturalearth_lowres
  WHERE iso_a3 IN ('CAN', 'USA', 'MEX')
  ORDER BY name"""
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df.columns) == 6
  assert len(df) == 3
  assert df.iso_a3.tolist() == ["CAN", "MEX", "USA"]
@@ -605,7 +740,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
  sql = """SELECT *
  FROM naturalearth_lowres
  WHERE POP_EST >= 10000000 AND POP_EST < 100000000"""
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df) == 75
  assert len(df.columns) == 6
  assert df.pop_est.min() >= 10000000
@@ -613,25 +750,36 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):

  # Should match no items.
  sql = "SELECT * FROM naturalearth_lowres WHERE ISO_A3 = 'INVALID'"
- df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+ df = read_dataframe(
+ naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+ )
  assert len(df) == 0


- def test_read_sql_invalid(naturalearth_lowres_all_ext):
+ def test_read_sql_invalid(naturalearth_lowres_all_ext, use_arrow):
  if naturalearth_lowres_all_ext.suffix == ".gpkg":
  with pytest.raises(Exception, match="In ExecuteSQL().*"):
- read_dataframe(naturalearth_lowres_all_ext, sql="invalid")
+ read_dataframe(
+ naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
+ )
  else:
  with pytest.raises(Exception, match="SQL Expression Parsing Error"):
- read_dataframe(naturalearth_lowres_all_ext, sql="invalid")
+ read_dataframe(
+ naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
+ )

  with pytest.raises(
  ValueError, match="'sql' paramater cannot be combined with 'layer'"
  ):
- read_dataframe(naturalearth_lowres_all_ext, sql="whatever", layer="invalid")
+ read_dataframe(
+ naturalearth_lowres_all_ext,
+ sql="whatever",
+ layer="invalid",
+ use_arrow=use_arrow,
+ )


- def test_read_sql_columns_where(naturalearth_lowres_all_ext):
+ def test_read_sql_columns_where(naturalearth_lowres_all_ext, use_arrow):
  sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
  df = read_dataframe(
  naturalearth_lowres_all_ext,
@@ -639,13 +787,14 @@ def test_read_sql_columns_where(naturalearth_lowres_all_ext):
  sql_dialect="OGRSQL",
  columns=["iso_a3_renamed", "name"],
  where="iso_a3_renamed IN ('CAN', 'USA', 'MEX')",
+ use_arrow=use_arrow,
  )
  assert len(df.columns) == 3
  assert len(df) == 3
  assert df.iso_a3_renamed.tolist() == ["CAN", "USA", "MEX"]


- def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext):
+ def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext, use_arrow):
  sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
  df = read_dataframe(
  naturalearth_lowres_all_ext,
@@ -654,13 +803,14 @@ def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext):
  sql_dialect="OGRSQL",
  columns=["iso_a3_renamed", "name"],
  where="iso_a3_renamed IN ('CRI', 'PAN')",
  bbox=(-85, 8, -80, 10),
+ use_arrow=use_arrow,
  )
  assert len(df.columns) == 3
  assert len(df) == 2
  assert df.iso_a3_renamed.tolist() == ["PAN", "CRI"]


- def test_read_sql_skip_max(naturalearth_lowres_all_ext):
+ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow):
  sql = """SELECT *
  FROM naturalearth_lowres
  WHERE iso_a3 IN ('CAN', 'MEX', 'USA')
671
821
  skip_features=1,
672
822
  max_features=1,
673
823
  sql_dialect="OGRSQL",
824
+ use_arrow=use_arrow,
674
825
  )
675
826
  assert len(df.columns) == 6
676
827
  assert len(df) == 1
@@ -678,13 +829,21 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext):
678
829
 
679
830
  sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
680
831
  df = read_dataframe(
681
- naturalearth_lowres_all_ext, sql=sql, max_features=3, sql_dialect="OGRSQL"
832
+ naturalearth_lowres_all_ext,
833
+ sql=sql,
834
+ max_features=3,
835
+ sql_dialect="OGRSQL",
836
+ use_arrow=use_arrow,
682
837
  )
683
838
  assert len(df) == 1
684
839
 
685
840
  sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
686
841
  df = read_dataframe(
687
- naturalearth_lowres_all_ext, sql=sql, skip_features=1, sql_dialect="OGRSQL"
842
+ naturalearth_lowres_all_ext,
843
+ sql=sql,
844
+ sql_dialect="OGRSQL",
845
+ skip_features=1,
846
+ use_arrow=use_arrow,
688
847
  )
689
848
  assert len(df) == 0
690
849
 
@@ -695,10 +854,12 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext):
695
854
  [ext for ext in ALL_EXTS if ext != ".gpkg"],
696
855
  indirect=["naturalearth_lowres"],
697
856
  )
698
- def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
857
+ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres, use_arrow):
699
858
  # Should return singular item
700
859
  sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
701
- df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE")
860
+ df = read_dataframe(
861
+ naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
862
+ )
702
863
  assert len(df) == 1
703
864
  assert len(df.columns) == 6
704
865
  assert df.iloc[0].iso_a3 == "CAN"
@@ -708,7 +869,9 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
708
869
  sql = """SELECT ST_Buffer(geometry, 5) AS geometry, name, pop_est, iso_a3
709
870
  FROM naturalearth_lowres
710
871
  WHERE ISO_A3 = 'CAN'"""
711
- df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE")
872
+ df = read_dataframe(
873
+ naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
874
+ )
712
875
  assert len(df) == 1
713
876
  assert len(df.columns) == 4
714
877
  assert df.iloc[0].geometry.area > area_canada
@@ -718,12 +881,14 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
718
881
  @pytest.mark.parametrize(
719
882
  "naturalearth_lowres", [".gpkg"], indirect=["naturalearth_lowres"]
720
883
  )
721
- def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres):
884
+ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
722
885
  # "INDIRECT_SQL" prohibits GDAL from passing the SQL statement to sqlite.
723
886
  # Because the statement is processed within GDAL it is possible to use
724
887
  # spatialite functions even if sqlite isn't built with spatialite support.
725
888
  sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
726
- df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE")
889
+ df = read_dataframe(
890
+ naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
891
+ )
727
892
  assert len(df) == 1
728
893
  assert len(df.columns) == 6
729
894
  assert df.iloc[0].iso_a3 == "CAN"
@@ -733,29 +898,67 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres):
  sql = """SELECT ST_Buffer(geom, 5) AS geometry, name, pop_est, iso_a3
  FROM naturalearth_lowres
  WHERE ISO_A3 = 'CAN'"""
- df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE")
+ df = read_dataframe(
+ naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
+ )
  assert len(df) == 1
  assert len(df.columns) == 4
  assert df.iloc[0].geometry.area > area_canada


+ @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
+ def test_write_csv_encoding(tmp_path, encoding):
+ """Test if write_dataframe uses the default encoding correctly."""
+ # Write csv test file. Depending on the os this will be written in a different
+ # encoding: for linux and macos this is utf-8, for windows it is cp1252.
+ csv_path = tmp_path / "test.csv"
+
+ with open(csv_path, "w", encoding=encoding) as csv:
+ csv.write("näme,city\n")
+ csv.write("Wilhelm Röntgen,Zürich\n")
+
+ # Write csv test file with the same data using write_dataframe. It should use the
+ # same encoding as above.
+ df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
+ csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
+ write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+
+ # Check if the text files written both ways can be read again and give same result.
+ with open(csv_path, "r", encoding=encoding) as csv:
+ csv_str = csv.read()
+ with open(csv_pyogrio_path, "r", encoding=encoding) as csv_pyogrio:
+ csv_pyogrio_str = csv_pyogrio.read()
+ assert csv_str == csv_pyogrio_str
+
+ # Check if they files are binary identical, to be 100% sure they were written with
+ # the same encoding.
+ with open(csv_path, "rb") as csv:
+ csv_bytes = csv.read()
+ with open(csv_pyogrio_path, "rb") as csv_pyogrio:
+ csv_pyogrio_bytes = csv_pyogrio.read()
+ assert csv_bytes == csv_pyogrio_bytes
+
+
  @pytest.mark.parametrize("ext", ALL_EXTS)
- def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):
  input_gdf = read_dataframe(naturalearth_lowres)
  output_path = tmp_path / f"test{ext}"

  if ext == ".fgb":
  # For .fgb, spatial_index=False to avoid the rows being reordered
- write_dataframe(input_gdf, output_path, spatial_index=False)
+ write_dataframe(
+ input_gdf, output_path, use_arrow=use_arrow, spatial_index=False
+ )
  else:
- write_dataframe(input_gdf, output_path)
+ write_dataframe(input_gdf, output_path, use_arrow=use_arrow)

  assert output_path.exists()
  result_gdf = read_dataframe(output_path)

  geometry_types = result_gdf.geometry.type.unique()
  if DRIVERS[ext] in DRIVERS_NO_MIXED_SINGLE_MULTI:
- assert geometry_types == ["MultiPolygon"]
+ assert list(geometry_types) == ["MultiPolygon"]
  else:
  assert set(geometry_types) == set(["MultiPolygon", "Polygon"])

@@ -776,14 +979,21 @@ def test_write_dataframe(tmp_path, naturalearth_lowres, ext):


  @pytest.mark.filterwarnings("ignore:.*No SRS set on layer.*")
+ @pytest.mark.parametrize("write_geodf", [True, False])
  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS + [".xlsx"] if ext != ".fgb"])
- def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
- """Test writing a dataframe without a geometry column.
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_no_geom(
+ request, tmp_path, naturalearth_lowres, write_geodf, ext, use_arrow
+ ):
+ """Test writing a (geo)dataframe without a geometry column.

  FlatGeobuf (.fgb) doesn't seem to support this, and just writes an empty file.
  """
  # Prepare test data
  input_df = read_dataframe(naturalearth_lowres, read_geometry=False)
+ if write_geodf:
+ input_df = gp.GeoDataFrame(input_df)
+
  output_path = tmp_path / f"test{ext}"

  # A shapefile without geometry column results in only a .dbf file.
@@ -793,7 +1003,7 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
  # Determine driver
  driver = DRIVERS[ext] if ext != ".xlsx" else "XLSX"

- write_dataframe(input_df, output_path, driver=driver)
+ write_dataframe(input_df, output_path, use_arrow=use_arrow, driver=driver)

  assert output_path.exists()
  result_df = read_dataframe(output_path)
@@ -806,6 +1016,9 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
  if ext in [".gpkg", ".shp", ".xlsx"]:
  # These file types return a DataFrame when read.
  assert not isinstance(result_df, gp.GeoDataFrame)
+ if isinstance(input_df, gp.GeoDataFrame):
+ input_df = pd.DataFrame(input_df)
+
  pd.testing.assert_frame_equal(
  result_df, input_df, check_index_type=False, check_dtype=check_dtype
  )
@@ -822,12 +1035,27 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
  )


+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow):
+ # dataframe writing ignores the index
+ input_gdf = read_dataframe(naturalearth_lowres)
+ input_gdf = input_gdf.set_index("iso_a3")
+
+ output_path = tmp_path / "test.shp"
+ write_dataframe(input_gdf, output_path, use_arrow=use_arrow)
+
+ result_gdf = read_dataframe(output_path)
+ assert isinstance(result_gdf.index, pd.RangeIndex)
+ assert_geodataframe_equal(result_gdf, input_gdf.reset_index(drop=True))
+
+
  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
- def test_write_empty_dataframe(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_empty_dataframe(tmp_path, ext, use_arrow):
  expected = gp.GeoDataFrame(geometry=[], crs=4326)

  filename = tmp_path / f"test{ext}"
- write_dataframe(expected, filename)
+ write_dataframe(expected, filename, use_arrow=use_arrow)

  assert filename.exists()
  df = read_dataframe(filename)
@@ -835,83 +1063,119 @@ def test_write_empty_dataframe(tmp_path, ext):


  @pytest.mark.parametrize("ext", [".geojsonl", ".geojsons"])
- def test_write_read_empty_dataframe_unsupported(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_read_empty_dataframe_unsupported(tmp_path, ext, use_arrow):
  # Writing empty dataframe to .geojsons or .geojsonl results logically in a 0 byte
  # file, but gdal isn't able to read those again at the time of writing.
  # Issue logged here: https://github.com/geopandas/pyogrio/issues/94
  expected = gp.GeoDataFrame(geometry=[], crs=4326)

  filename = tmp_path / f"test{ext}"
- write_dataframe(expected, filename)
+ write_dataframe(expected, filename, use_arrow=use_arrow)

  assert filename.exists()
  with pytest.raises(
- Exception, match=".* not recognized as a supported file format."
+ Exception, match=".* not recognized as( being in)? a supported file format."
  ):
- _ = read_dataframe(filename)
+ _ = read_dataframe(filename, use_arrow=use_arrow)


- def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres, use_arrow):
  input_gdf = read_dataframe(naturalearth_lowres)
- output_path = tmp_path / "test.gpkg"
+ filename = tmp_path / "test.gpkg"

- write_dataframe(input_gdf, output_path, layer="first", promote_to_multi=True)
+ write_dataframe(
+ input_gdf,
+ filename,
+ layer="first",
+ promote_to_multi=True,
+ use_arrow=use_arrow,
+ )

- assert os.path.exists(output_path)
- assert np.array_equal(list_layers(output_path), [["first", "MultiPolygon"]])
+ assert filename.exists()
+ assert np.array_equal(list_layers(filename), [["first", "MultiPolygon"]])

- write_dataframe(input_gdf, output_path, layer="second", promote_to_multi=True)
+ write_dataframe(
+ input_gdf,
+ filename,
+ layer="second",
+ promote_to_multi=True,
+ use_arrow=use_arrow,
+ )
  assert np.array_equal(
- list_layers(output_path),
+ list_layers(filename),
  [["first", "MultiPolygon"], ["second", "MultiPolygon"]],
  )


  @pytest.mark.parametrize("ext", ALL_EXTS)
- def test_write_dataframe_append(tmp_path, naturalearth_lowres, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_append(request, tmp_path, naturalearth_lowres, ext, use_arrow):
  if ext == ".fgb" and __gdal_version__ <= (3, 5, 0):
  pytest.skip("Append to FlatGeobuf fails for GDAL <= 3.5.0")

  if ext in (".geojsonl", ".geojsons") and __gdal_version__ <= (3, 6, 0):
  pytest.skip("Append to GeoJSONSeq only available for GDAL >= 3.6.0")

+ if use_arrow and ext.startswith(".geojson"):
+ # Bug in GDAL when appending int64 to GeoJSON
+ # (https://github.com/OSGeo/gdal/issues/9792)
+ request.node.add_marker(
+ pytest.mark.xfail(reason="Bugs with append when writing Arrow to GeoJSON")
+ )
+
  input_gdf = read_dataframe(naturalearth_lowres)
- output_path = tmp_path / f"test{ext}"
+ filename = tmp_path / f"test{ext}"

- write_dataframe(input_gdf, output_path)
+ write_dataframe(input_gdf, filename, use_arrow=use_arrow)

- assert os.path.exists(output_path)
- assert len(read_dataframe(output_path)) == 177
+ filename.exists()
+ assert len(read_dataframe(filename)) == 177

- write_dataframe(input_gdf, output_path, append=True)
- assert len(read_dataframe(output_path)) == 354
+ write_dataframe(input_gdf, filename, use_arrow=use_arrow, append=True)
+ assert len(read_dataframe(filename)) == 354


  @pytest.mark.parametrize("spatial_index", [False, True])
- def test_write_dataframe_gdal_options(tmp_path, naturalearth_lowres, spatial_index):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options(
+ tmp_path, naturalearth_lowres, spatial_index, use_arrow
+ ):
  df = read_dataframe(naturalearth_lowres)

  outfilename1 = tmp_path / "test1.shp"
- write_dataframe(df, outfilename1, SPATIAL_INDEX="YES" if spatial_index else "NO")
+ write_dataframe(
+ df,
+ outfilename1,
+ use_arrow=use_arrow,
+ SPATIAL_INDEX="YES" if spatial_index else "NO",
+ )
  assert outfilename1.exists() is True
  index_filename1 = tmp_path / "test1.qix"
  assert index_filename1.exists() is spatial_index

  # using explicit layer_options instead
  outfilename2 = tmp_path / "test2.shp"
- write_dataframe(df, outfilename2, layer_options=dict(spatial_index=spatial_index))
+ write_dataframe(
+ df,
+ outfilename2,
+ use_arrow=use_arrow,
+ layer_options=dict(spatial_index=spatial_index),
+ )
  assert outfilename2.exists() is True
  index_filename2 = tmp_path / "test2.qix"
  assert index_filename2.exists() is spatial_index


- def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres, use_arrow):
  df = read_dataframe(naturalearth_lowres)

  # geojson has no spatial index, so passing keyword should raise
  outfilename = tmp_path / "test.geojson"
  with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"):
- write_dataframe(df, outfilename, spatial_index=True)
+ write_dataframe(df, outfilename, use_arrow=use_arrow, spatial_index=True)


  def _get_gpkg_table_names(path):
@@ -924,21 +1188,25 @@ def _get_gpkg_table_names(path):
  return [res[0] for res in result]


- def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres, use_arrow):
  df = read_dataframe(naturalearth_lowres)

  test_default_filename = tmp_path / "test_default.gpkg"
- write_dataframe(df, test_default_filename)
+ write_dataframe(df, test_default_filename, use_arrow=use_arrow)
  assert "gpkg_ogr_contents" in _get_gpkg_table_names(test_default_filename)

  test_no_contents_filename = tmp_path / "test_no_contents.gpkg"
- write_dataframe(df, test_default_filename, ADD_GPKG_OGR_CONTENTS="NO")
+ write_dataframe(
+ df, test_default_filename, use_arrow=use_arrow, ADD_GPKG_OGR_CONTENTS="NO"
+ )
  assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename)

  test_no_contents_filename2 = tmp_path / "test_no_contents2.gpkg"
  write_dataframe(
  df,
  test_no_contents_filename2,
+ use_arrow=use_arrow,
  dataset_options=dict(add_gpkg_ogr_contents=False),
  )
  assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename2)
@@ -955,6 +1223,7 @@ def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres):
  (".geojson", False, ["MultiPolygon", "Polygon"], "Unknown"),
  ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi(
  tmp_path,
  naturalearth_lowres,
@@ -962,11 +1231,14 @@ def test_write_dataframe_promote_to_multi(
  promote_to_multi,
  expected_geometry_types,
  expected_geometry_type,
+ use_arrow,
  ):
  input_gdf = read_dataframe(naturalearth_lowres)

  output_path = tmp_path / f"test_promote{ext}"
- write_dataframe(input_gdf, output_path, promote_to_multi=promote_to_multi)
+ write_dataframe(
+ input_gdf, output_path, use_arrow=use_arrow, promote_to_multi=promote_to_multi
+ )

  assert output_path.exists()
  output_gdf = read_dataframe(output_path)
@@ -999,6 +1271,7 @@ def test_write_dataframe_promote_to_multi(
  (".shp", True, "Unknown", ["MultiPolygon", "Polygon"], "Polygon"),
  ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi_layer_geom_type(
  tmp_path,
  naturalearth_lowres,
@@ -1007,6 +1280,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
  geometry_type,
  expected_geometry_types,
  expected_geometry_type,
+ use_arrow,
  ):
  input_gdf = read_dataframe(naturalearth_lowres)

@@ -1023,6 +1297,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
  write_dataframe(
  input_gdf,
  output_path,
+ use_arrow=use_arrow,
  promote_to_multi=promote_to_multi,
  geometry_type=geometry_type,
  )
@@ -1041,9 +1316,15 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
  (".fgb", False, "Polygon", "Mismatched geometry type"),
  (".fgb", None, "Point", "Mismatched geometry type"),
  (".fgb", None, "Polygon", "Mismatched geometry type"),
- (".shp", None, "Point", "Could not add feature to layer at index"),
+ (
+ ".shp",
+ None,
+ "Point",
+ "Could not add feature to layer at index|Error while writing batch to OGR layer",
+ ),
  ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi_layer_geom_type_invalid(
  tmp_path,
  naturalearth_lowres,
@@ -1051,31 +1332,37 @@ def test_write_dataframe_promote_to_multi_layer_geom_type_invalid(
  promote_to_multi,
  geometry_type,
  expected_raises_match,
+ use_arrow,
  ):
  input_gdf = read_dataframe(naturalearth_lowres)

  output_path = tmp_path / f"test{ext}"
- with pytest.raises(FeatureError, match=expected_raises_match):
+ with pytest.raises((FeatureError, DataLayerError), match=expected_raises_match):
  write_dataframe(
  input_gdf,
  output_path,
+ use_arrow=use_arrow,
  promote_to_multi=promote_to_multi,
  geometry_type=geometry_type,
  )


- def test_write_dataframe_layer_geom_type_invalid(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_layer_geom_type_invalid(
+ tmp_path, naturalearth_lowres, use_arrow
+ ):
  df = read_dataframe(naturalearth_lowres)

  filename = tmp_path / "test.geojson"
  with pytest.raises(
  GeometryError, match="Geometry type is not supported: NotSupported"
  ):
- write_dataframe(df, filename, geometry_type="NotSupported")
+ write_dataframe(df, filename, use_arrow=use_arrow, geometry_type="NotSupported")


  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".shp"])
- def test_write_dataframe_truly_mixed(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_truly_mixed(tmp_path, ext, use_arrow):
  geometry = [
  shapely.Point(0, 0),
  shapely.LineString([(0, 0), (1, 1)]),
@@ -1095,9 +1382,9 @@ def test_write_dataframe_truly_mixed(tmp_path, ext):

  if ext == ".fgb":
  # For .fgb, spatial_index=False to avoid the rows being reordered
- write_dataframe(df, filename, spatial_index=False)
+ write_dataframe(df, filename, use_arrow=use_arrow, spatial_index=False)
  else:
- write_dataframe(df, filename)
+ write_dataframe(df, filename, use_arrow=use_arrow)

  # Drivers that support mixed geometries will default to "Unknown" geometry type
  assert read_info(filename)["geometry_type"] == "Unknown"
@@ -1105,7 +1392,8 @@ def test_write_dataframe_truly_mixed(tmp_path, ext):
  assert_geodataframe_equal(result, df, check_geom_type=True)


- def test_write_dataframe_truly_mixed_invalid(tmp_path):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_truly_mixed_invalid(tmp_path, use_arrow):
  # Shapefile doesn't support generic "Geometry" / "Unknown" type
  # for mixed geometries

@@ -1123,9 +1411,12 @@ def test_write_dataframe_truly_mixed_invalid(tmp_path):
  msg = (
  "Could not add feature to layer at index 1: Attempt to "
  r"write non-point \(LINESTRING\) geometry to point shapefile."
+ # DataLayerError when using Arrow
+ "|Error while writing batch to OGR layer: Attempt to "
+ r"write non-point \(LINESTRING\) geometry to point shapefile."
  )
- with pytest.raises(FeatureError, match=msg):
- write_dataframe(df, tmp_path / "test.shp")
+ with pytest.raises((FeatureError, DataLayerError), match=msg):
+ write_dataframe(df, tmp_path / "test.shp", use_arrow=use_arrow)


  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".fgb"])
@@ -1138,11 +1429,12 @@ def test_write_dataframe_truly_mixed_invalid(tmp_path):
  [None, None],
  ],
  )
- def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext, use_arrow):
  filename = tmp_path / f"test{ext}"

  df = gp.GeoDataFrame({"col": [1.0, 2.0]}, geometry=geoms, crs="EPSG:4326")
- write_dataframe(df, filename)
+ write_dataframe(df, filename, use_arrow=use_arrow)
  result = read_dataframe(filename)
  assert_geodataframe_equal(result, df)

@@ -1150,16 +1442,19 @@ def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext):
  @pytest.mark.filterwarnings(
  "ignore: You will likely lose important projection information"
  )
- def test_custom_crs_io(tmpdir, naturalearth_lowres_all_ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_custom_crs_io(tmp_path, naturalearth_lowres_all_ext, use_arrow):
  df = read_dataframe(naturalearth_lowres_all_ext)
  # project Belgium to a custom Albers Equal Area projection
- expected = df.loc[df.name == "Belgium"].to_crs(
- "+proj=aea +lat_1=49.5 +lat_2=51.5 +lon_0=4.3"
+ expected = (
+ df.loc[df.name == "Belgium"]
+ .reset_index(drop=True)
+ .to_crs("+proj=aea +lat_1=49.5 +lat_2=51.5 +lon_0=4.3")
  )
- filename = os.path.join(str(tmpdir), "test.shp")
- write_dataframe(expected, filename)
+ filename = tmp_path / "test.shp"
+ write_dataframe(expected, filename, use_arrow=use_arrow)

- assert os.path.exists(filename)
+ assert filename.exists()

  df = read_dataframe(filename)

@@ -1171,6 +1466,7 @@ def test_custom_crs_io(tmpdir, naturalearth_lowres_all_ext):


  def test_write_read_mixed_column_values(tmp_path):
+ # use_arrow=True is tested separately below
  mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
  geoms = [shapely.Point(0, 0) for _ in mixed_values]
  test_gdf = gp.GeoDataFrame(
@@ -1187,7 +1483,21 @@ def test_write_read_mixed_column_values(tmp_path):
  assert output_gdf["mixed"][idx] == str(value)


- def test_write_read_null(tmp_path):
+ @requires_arrow_write_api
+ def test_write_read_mixed_column_values_arrow(tmp_path):
+ # Arrow cannot represent a column of mixed types
+ mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
+ geoms = [shapely.Point(0, 0) for _ in mixed_values]
+ test_gdf = gp.GeoDataFrame(
+ {"geometry": geoms, "mixed": mixed_values}, crs="epsg:31370"
+ )
+ output_path = tmp_path / "test_write_mixed_column.gpkg"
+ with pytest.raises(TypeError, match=".*Conversion failed for column"):
+ write_dataframe(test_gdf, output_path, use_arrow=True)
+
+
+ @pytest.mark.requires_arrow_write_api
+ def test_write_read_null(tmp_path, use_arrow):
  output_path = tmp_path / "test_write_nan.gpkg"
  geom = shapely.Point(0, 0)
  test_data = {
@@ -1196,7 +1506,7 @@ def test_write_read_null(tmp_path):
  "object_str": ["test", None, np.nan],
  }
  test_gdf = gp.GeoDataFrame(test_data, crs="epsg:31370")
- write_dataframe(test_gdf, output_path)
+ write_dataframe(test_gdf, output_path, use_arrow=use_arrow)
  result_gdf = read_dataframe(output_path)
  assert len(test_gdf) == len(result_gdf)
  assert result_gdf["float64"][0] == 1.0
@@ -1219,7 +1529,7 @@ def test_write_read_null(tmp_path):
  ["2.5D MultiLineString", "MultiLineString Z"],
  ),
  (
- "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))", # NOQA
+ "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))",
  ["2.5D MultiPolygon", "MultiPolygon Z"],
  ),
  (
@@ -1228,11 +1538,12 @@ def test_write_read_null(tmp_path):
  ),
  ],
  )
- def test_write_geometry_z_types(tmp_path, wkt, geom_types):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_geometry_z_types(tmp_path, wkt, geom_types, use_arrow):
  filename = tmp_path / "test.fgb"
  gdf = gp.GeoDataFrame(geometry=from_wkt([wkt]), crs="EPSG:4326")
  for geom_type in geom_types:
- write_dataframe(gdf, filename, geometry_type=geom_type)
+ write_dataframe(gdf, filename, use_arrow=use_arrow, geometry_type=geom_type)
  df = read_dataframe(filename)
  assert_geodataframe_equal(df, gdf)

@@ -1261,7 +1572,7 @@ def test_write_geometry_z_types(tmp_path, wkt, geom_types):
  "MultiPolygon Z",
  False,
  [
- "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))" # noqa: E501
+ "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))"
  ],
  ),
  (
@@ -1286,8 +1597,9 @@ def test_write_geometry_z_types(tmp_path, wkt, geom_types):
  ),
  ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_geometry_z_types_auto(
- tmp_path, ext, test_descr, exp_geometry_type, mixed_dimensions, wkt
+ tmp_path, ext, test_descr, exp_geometry_type, mixed_dimensions, wkt, use_arrow
  ):
  # Shapefile has some different behaviour that other file types
  if ext == ".shp":
@@ -1314,10 +1626,10 @@ def test_write_geometry_z_types_auto(
  DataSourceError,
  match=("Mixed 2D and 3D coordinates are not supported by"),
  ):
- write_dataframe(gdf, filename)
+ write_dataframe(gdf, filename, use_arrow=use_arrow)
  return
  else:
- write_dataframe(gdf, filename)
+ write_dataframe(gdf, filename, use_arrow=use_arrow)

  info = read_info(filename)
  assert info["geometry_type"] == exp_geometry_type
@@ -1329,11 +1641,48 @@ def test_write_geometry_z_types_auto(
1329
1641
  assert_geodataframe_equal(gdf, result_gdf)
1330
1642
 
1331
1643
 
1332
- def test_read_multisurface(data_dir):
1333
- df = read_dataframe(data_dir / "test_multisurface.gpkg")
1644
+ @pytest.mark.parametrize(
1645
+ "on_invalid, message",
1646
+ [
1647
+ (
1648
+ "warn",
1649
+ "Invalid WKB: geometry is returned as None. IllegalArgumentException: "
1650
+ "Invalid number of points in LinearRing found 2 - must be 0 or >=",
1651
+ ),
1652
+ ("raise", "Invalid number of points in LinearRing found 2 - must be 0 or >="),
1653
+ ("ignore", None),
1654
+ ],
1655
+ )
1656
+ def test_read_invalid_shp(data_dir, use_arrow, on_invalid, message):
1657
+ if on_invalid == "raise":
1658
+ handler = pytest.raises(shapely.errors.GEOSException, match=message)
1659
+ elif on_invalid == "warn":
1660
+ handler = pytest.warns(match=message)
1661
+ elif on_invalid == "ignore":
1662
+ handler = contextlib.nullcontext()
1663
+ else:
1664
+ raise ValueError(f"unknown value for on_invalid: {on_invalid}")
1665
+
1666
+ with handler:
1667
+ df = read_dataframe(
1668
+ data_dir / "poly_not_enough_points.shp.zip",
1669
+ use_arrow=use_arrow,
1670
+ on_invalid=on_invalid,
1671
+ )
1672
+ df.geometry.isnull().all()
1673
+
1674
+
1675
+ def test_read_multisurface(data_dir, use_arrow):
1676
+ if use_arrow:
1677
+ with pytest.raises(shapely.errors.GEOSException):
1678
+ # TODO(Arrow)
1679
+ # shapely fails parsing the WKB
1680
+ read_dataframe(data_dir / "test_multisurface.gpkg", use_arrow=True)
1681
+ else:
1682
+ df = read_dataframe(data_dir / "test_multisurface.gpkg")
1334
1683
 
1335
- # MultiSurface should be converted to MultiPolygon
1336
- assert df.geometry.type.tolist() == ["MultiPolygon"]
1684
+ # MultiSurface should be converted to MultiPolygon
1685
+ assert df.geometry.type.tolist() == ["MultiPolygon"]
1337
1686
 
1338
1687
 
1339
1688
  def test_read_dataset_kwargs(data_dir, use_arrow):
@@ -1372,7 +1721,8 @@ def test_read_invalid_dataset_kwargs(naturalearth_lowres, use_arrow):
1372
1721
  read_dataframe(naturalearth_lowres, use_arrow=use_arrow, INVALID="YES")
1373
1722
 
1374
1723
 
1375
- def test_write_nullable_dtypes(tmp_path):
1724
+ @pytest.mark.requires_arrow_write_api
1725
+ def test_write_nullable_dtypes(tmp_path, use_arrow):
1376
1726
  path = tmp_path / "test_nullable_dtypes.gpkg"
1377
1727
  test_data = {
1378
1728
  "col1": pd.Series([1, 2, 3], dtype="int64"),
@@ -1384,7 +1734,7 @@ def test_write_nullable_dtypes(tmp_path):
1384
1734
  input_gdf = gp.GeoDataFrame(
1385
1735
  test_data, geometry=[shapely.Point(0, 0)] * 3, crs="epsg:31370"
1386
1736
  )
1387
- write_dataframe(input_gdf, path)
1737
+ write_dataframe(input_gdf, path, use_arrow=use_arrow)
1388
1738
  output_gdf = read_dataframe(path)
1389
1739
  # We read it back as default (non-nullable) numpy dtypes, so we cast
1390
1740
  # to those for the expected result
@@ -1393,19 +1743,21 @@ def test_write_nullable_dtypes(tmp_path):
1393
1743
  expected["col3"] = expected["col3"].astype("float32")
1394
1744
  expected["col4"] = expected["col4"].astype("float64")
1395
1745
  expected["col5"] = expected["col5"].astype(object)
1746
+ expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
1396
1747
  assert_geodataframe_equal(output_gdf, expected)
1397
1748
 
1398
1749
 
1399
1750
  @pytest.mark.parametrize(
1400
1751
  "metadata_type", ["dataset_metadata", "layer_metadata", "metadata"]
1401
1752
  )
1402
- def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type):
1753
+ @pytest.mark.requires_arrow_write_api
1754
+ def test_metadata_io(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
1403
1755
  metadata = {"level": metadata_type}
1404
1756
 
1405
1757
  df = read_dataframe(naturalearth_lowres)
1406
1758
 
1407
- filename = os.path.join(str(tmpdir), "test.gpkg")
1408
- write_dataframe(df, filename, **{metadata_type: metadata})
1759
+ filename = tmp_path / "test.gpkg"
1760
+ write_dataframe(df, filename, use_arrow=use_arrow, **{metadata_type: metadata})
1409
1761
 
1410
1762
  metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type
1411
1763
 
@@ -1421,22 +1773,27 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type):
         {"key": 1},
     ],
 )
-def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata):
+@pytest.mark.requires_arrow_write_api
+def test_invalid_metadata(
+    tmp_path, naturalearth_lowres, metadata_type, metadata, use_arrow
+):
+    df = read_dataframe(naturalearth_lowres)
     with pytest.raises(ValueError, match="must be a string"):
-        filename = os.path.join(str(tmpdir), "test.gpkg")
         write_dataframe(
-            read_dataframe(naturalearth_lowres), filename, **{metadata_type: metadata}
+            df, tmp_path / "test.gpkg", use_arrow=use_arrow, **{metadata_type: metadata}
         )
 
 
 @pytest.mark.parametrize("metadata_type", ["dataset_metadata", "layer_metadata"])
-def test_metadata_unsupported(tmpdir, naturalearth_lowres, metadata_type):
+@pytest.mark.requires_arrow_write_api
+def test_metadata_unsupported(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
     """metadata is silently ignored"""
 
-    filename = os.path.join(str(tmpdir), "test.geojson")
+    filename = tmp_path / "test.geojson"
     write_dataframe(
         read_dataframe(naturalearth_lowres),
         filename,
+        use_arrow=use_arrow,
         **{metadata_type: {"key": "value"}},
     )
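The metadata keywords exercised above (`metadata`, `dataset_metadata`, `layer_metadata`) are written as GDAL metadata on drivers that support it, such as GPKG, and silently ignored elsewhere (GeoJSON). A minimal round-trip sketch, assuming the metadata written by `write_dataframe` is reported back by `read_info` under the matching `layer_metadata` / `dataset_metadata` keys (the output path and values are illustrative):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio import read_info
    from pyogrio.geopandas import write_dataframe

    gdf = gp.GeoDataFrame({"name": ["a"]}, geometry=[Point(0, 0)], crs="EPSG:4326")
    write_dataframe(gdf, "out.gpkg", layer_metadata={"source": "example"})

    # read_info is assumed to report the stored metadata for the layer
    info = read_info("out.gpkg")
    assert info["layer_metadata"] == {"source": "example"}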
 
@@ -1466,3 +1823,352 @@ def test_read_dataframe_arrow_dtypes(tmp_path):
     assert isinstance(result["col"].dtype, pd.ArrowDtype)
     result["col"] = result["col"].astype("float64")
     assert_geodataframe_equal(result, df)
+
+
+@requires_pyarrow_api
+@pytest.mark.skipif(
+    __gdal_version__ < (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
+)
+@pytest.mark.parametrize("ext", ALL_EXTS)
+def test_arrow_bool_roundtrip(tmp_path, ext):
+    filename = tmp_path / f"test{ext}"
+
+    kwargs = {}
+
+    if ext == ".fgb":
+        # For .fgb, spatial_index=False to avoid the rows being reordered
+        kwargs["spatial_index"] = False
+
+    df = gp.GeoDataFrame(
+        {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
+        crs="EPSG:4326",
+    )
+
+    write_dataframe(df, filename, **kwargs)
+    result = read_dataframe(filename, use_arrow=True)
+    # Shapefiles do not support bool columns; these are returned as int32
+    assert_geodataframe_equal(result, df, check_dtype=ext != ".shp")
+
+
+@requires_pyarrow_api
+@pytest.mark.skipif(
+    __gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
+)
+@pytest.mark.parametrize("ext", ALL_EXTS)
+def test_arrow_bool_exception(tmp_path, ext):
+    filename = tmp_path / f"test{ext}"
+
+    df = gp.GeoDataFrame(
+        {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
+        crs="EPSG:4326",
+    )
+
+    write_dataframe(df, filename)
+
+    if ext in {".fgb", ".gpkg"}:
+        # only raise exception for GPKG / FGB
+        with pytest.raises(
+            RuntimeError,
+            match="GDAL < 3.8.3 does not correctly read boolean data values using "
+            "the Arrow API",
+        ):
+            read_dataframe(filename, use_arrow=True)
+
+        # do not raise exception if no bool columns are read
+        read_dataframe(filename, use_arrow=True, columns=[])
+
+    else:
+        _ = read_dataframe(filename, use_arrow=True)
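The two tests above split on GDAL 3.8.3 because older GDAL returns incorrect boolean values through the Arrow stream for GPKG and FlatGeobuf. A minimal sketch of gating `use_arrow` on the runtime GDAL version, following the same tuple comparison used in the tests (the input file name is illustrative):

    from pyogrio import __gdal_version__
    from pyogrio.geopandas import read_dataframe

    # only request the Arrow read path when the bool-value bug is known to be fixed
    use_arrow = __gdal_version__ >= (3, 8, 3)
    df = read_dataframe("data.gpkg", use_arrow=use_arrow)  # "data.gpkg" is illustrative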
+
+
+@pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning")
+@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
+def test_write_memory(naturalearth_lowres, driver):
+    df = read_dataframe(naturalearth_lowres)
+
+    buffer = BytesIO()
+    write_dataframe(df, buffer, driver=driver, layer="test")
+
+    assert len(buffer.getbuffer()) > 0
+
+    actual = read_dataframe(buffer)
+    assert len(actual) == len(df)
+
+    is_json = driver == "GeoJSON"
+
+    assert_geodataframe_equal(
+        actual,
+        df,
+        check_less_precise=is_json,
+        check_index_type=False,
+        check_dtype=not is_json,
+    )
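`test_write_memory` exercises writing to an in-memory buffer, which GDAL backs with its /vsimem/ filesystem (hence the filter on the /vsimem warning). A minimal round-trip sketch based on the calls above; the driver must be given explicitly for in-memory output, and the subsequent tests show that append and pre-filled buffers are rejected:

    import geopandas as gp
    from io import BytesIO
    from shapely.geometry import Point
    from pyogrio.geopandas import read_dataframe, write_dataframe

    gdf = gp.GeoDataFrame({"name": ["a"]}, geometry=[Point(0, 0)], crs="EPSG:4326")

    buffer = BytesIO()
    # driver must be provided when writing to an in-memory file
    write_dataframe(gdf, buffer, driver="GPKG", layer="test")

    assert len(buffer.getbuffer()) > 0
    roundtripped = read_dataframe(buffer)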
+
+
+def test_write_memory_driver_required(naturalearth_lowres):
+    df = read_dataframe(naturalearth_lowres)
+
+    buffer = BytesIO()
+
+    with pytest.raises(
+        ValueError,
+        match="driver must be provided to write to in-memory file",
+    ):
+        write_dataframe(df.head(1), buffer, driver=None, layer="test")
+
+
+@pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"])
+def test_write_memory_unsupported_driver(naturalearth_lowres, driver):
+    if driver == "OpenFileGDB" and __gdal_version__ < (3, 6, 0):
+        pytest.skip("OpenFileGDB write support only available for GDAL >= 3.6.0")
+
+    df = read_dataframe(naturalearth_lowres)
+
+    buffer = BytesIO()
+
+    with pytest.raises(
+        ValueError, match=f"writing to in-memory file is not supported for {driver}"
+    ):
+        write_dataframe(df, buffer, driver=driver, layer="test")
+
+
+@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
+def test_write_memory_append_unsupported(naturalearth_lowres, driver):
+    df = read_dataframe(naturalearth_lowres)
+
+    buffer = BytesIO()
+
+    with pytest.raises(
+        NotImplementedError, match="append is not supported for in-memory files"
+    ):
+        write_dataframe(df.head(1), buffer, driver=driver, layer="test", append=True)
+
+
+def test_write_memory_existing_unsupported(naturalearth_lowres):
+    df = read_dataframe(naturalearth_lowres)
+
+    buffer = BytesIO(b"0000")
+    with pytest.raises(
+        NotImplementedError,
+        match="writing to existing in-memory object is not supported",
+    ):
+        write_dataframe(df.head(1), buffer, driver="GeoJSON", layer="test")
+
+
+@pytest.mark.parametrize("ext", ["gpkg", "geojson"])
+def test_non_utf8_encoding_io(tmp_path, ext, encoded_text):
+    """Verify that we can write non-UTF-8 data to the data source.
+
+    IMPORTANT: this may not be valid for the data source and will likely render
+    it unusable in other tools, but it should round-trip successfully unless we
+    disable writing using other encodings.
+
+    NOTE: the FlatGeobuf driver cannot handle non-UTF-8 data in GDAL >= 3.9
+
+    NOTE: pyarrow cannot handle non-UTF-8 characters in this way
+    """
+
+    encoding, text = encoded_text
+    output_path = tmp_path / f"test.{ext}"
+
+    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+    write_dataframe(df, output_path, encoding=encoding)
+
+    # cannot open these files without specifying encoding
+    with pytest.raises(UnicodeDecodeError):
+        read_dataframe(output_path)
+
+    # must provide encoding to read these properly
+    actual = read_dataframe(output_path, encoding=encoding)
+    assert actual.columns[0] == text
+    assert actual[text].values[0] == text
+
+
+@requires_pyarrow_api
+@pytest.mark.parametrize("ext", ["gpkg", "geojson"])
+def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text):
+    encoding, text = encoded_text
+    output_path = tmp_path / f"test.{ext}"
+
+    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+    write_dataframe(df, output_path, encoding=encoding)
+
+    # cannot open these files without specifying encoding
+    with pytest.raises(UnicodeDecodeError):
+        read_dataframe(output_path)
+
+    with pytest.raises(
+        ValueError, match="non-UTF-8 encoding is not supported for Arrow"
+    ):
+        read_dataframe(output_path, encoding=encoding, use_arrow=True)
+
+
+def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
+    encoding, text = encoded_text
+
+    output_path = tmp_path / "test.shp"
+
+    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+    write_dataframe(df, output_path, encoding=encoding)
+
+    # NOTE: GDAL automatically creates a cpg file with the encoding name, which
+    # means that if we read this without specifying the encoding it uses the
+    # correct one
+    actual = read_dataframe(output_path, use_arrow=use_arrow)
+    assert actual.columns[0] == text
+    assert actual[text].values[0] == text
+
+    # verify that if the cpg file is not present, the user-provided encoding must be used
+    output_path.with_suffix(".cpg").unlink()
+
+    # we will assume ISO-8859-1, which is wrong
+    miscoded = text.encode(encoding).decode("ISO-8859-1")
+
+    if use_arrow:
+        # pyarrow cannot decode the column name with the incorrect encoding
+        with pytest.raises(UnicodeDecodeError):
+            read_dataframe(output_path, use_arrow=True)
+    else:
+        bad = read_dataframe(output_path, use_arrow=False)
+        assert bad.columns[0] == miscoded
+        assert bad[miscoded].values[0] == miscoded
+
+    # if encoding is provided, that should yield correct text
+    actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow)
+    assert actual.columns[0] == text
+    assert actual[text].values[0] == text
+
+    # if the ENCODING open option is provided, that should yield correct text
+    actual = read_dataframe(output_path, use_arrow=use_arrow, ENCODING=encoding)
+    assert actual.columns[0] == text
+    assert actual[text].values[0] == text
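The shapefile test above relies on GDAL writing a .cpg sidecar that records the code page, so a plain read already decodes correctly; once the .cpg is removed, the encoding has to be supplied, either via the `encoding` parameter or the shapefile `ENCODING` open option. A minimal sketch, assuming a CP936-encoded shapefile (the path, column name, and values are illustrative):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio.geopandas import read_dataframe, write_dataframe

    gdf = gp.GeoDataFrame({"名字": ["中文"]}, geometry=[Point(0, 0)], crs="EPSG:4326")

    # writes test.shp plus a test.cpg sidecar recording the code page
    write_dataframe(gdf, "test.shp", encoding="CP936")

    correct = read_dataframe("test.shp")                       # uses the .cpg sidecar
    explicit = read_dataframe("test.shp", encoding="CP936")    # works without the sidecar
    via_option = read_dataframe("test.shp", ENCODING="CP936")  # shapefile driver open option

As the collision tests below show, the `encoding` parameter and the `ENCODING` option are mutually exclusive in a single call.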
+
+
+def test_encoding_read_option_collision_shapefile(naturalearth_lowres, use_arrow):
+    """Providing both encoding parameter and ENCODING open option (even if blank) is not allowed"""
+
+    with pytest.raises(
+        ValueError, match='cannot provide both encoding parameter and "ENCODING" option'
+    ):
+        read_dataframe(
+            naturalearth_lowres, encoding="CP936", ENCODING="", use_arrow=use_arrow
+        )
+
+
+def test_encoding_write_layer_option_collision_shapefile(tmp_path, encoded_text):
+    """Providing both encoding parameter and ENCODING layer creation option (even if blank) is not allowed"""
+    encoding, text = encoded_text
+
+    output_path = tmp_path / "test.shp"
+    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+
+    with pytest.raises(
+        ValueError,
+        match='cannot provide both encoding parameter and "ENCODING" layer creation option',
+    ):
+        write_dataframe(
+            df, output_path, encoding=encoding, layer_options={"ENCODING": ""}
+        )
+
+
+def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow):
+    encoding = "CP936"
+
+    output_path = tmp_path / "test.shp"
+
+    mandarin = "中文"
+    df = gp.GeoDataFrame(
+        {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326"
+    )
+    write_dataframe(df, output_path, encoding=encoding)
+
+    actual = read_dataframe(
+        output_path,
+        sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
+        use_arrow=use_arrow,
+    )
+    assert actual.columns[0] == mandarin
+    assert actual[mandarin].values[0] == mandarin
+
+    actual = read_dataframe(
+        output_path,
+        sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
+        encoding=encoding,
+        use_arrow=use_arrow,
+    )
+    assert actual.columns[0] == mandarin
+    assert actual[mandarin].values[0] == mandarin
+
+
+@pytest.mark.requires_arrow_write_api
+def test_write_kml_file_coordinate_order(tmp_path, use_arrow):
+    # confirm KML coordinates are written in lon, lat order even if the CRS axis
+    # order specifies otherwise
+    points = [Point(10, 20), Point(30, 40), Point(50, 60)]
+    gdf = gp.GeoDataFrame(geometry=points, crs="EPSG:4326")
+    output_path = tmp_path / "test.kml"
+    write_dataframe(
+        gdf, output_path, layer="tmp_layer", driver="KML", use_arrow=use_arrow
+    )
+
+    gdf_in = read_dataframe(output_path, use_arrow=use_arrow)
+
+    assert np.array_equal(gdf_in.geometry.values, points)
+
+    if "LIBKML" in list_drivers():
+        # test appending to the existing file only if LIBKML is available,
+        # as GDAL appears to fall back on the LIBKML driver when appending
+        points_append = [Point(70, 80), Point(90, 100), Point(110, 120)]
+        gdf_append = gp.GeoDataFrame(geometry=points_append, crs="EPSG:4326")
+
+        write_dataframe(
+            gdf_append,
+            output_path,
+            layer="tmp_layer",
+            driver="KML",
+            use_arrow=use_arrow,
+            append=True,
+        )
+        # force_2d is used so only the xy geometry is compared, as a z dimension
+        # is undesirably introduced when the kml file is overwritten
+        gdf_in_appended = read_dataframe(
+            output_path, use_arrow=use_arrow, force_2d=True
+        )
+
+        assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
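The append branch above only runs when the LIBKML driver is present, since appending to KML appears to fall back on LIBKML. A small sketch of that availability check, assuming `list_drivers()` returns a mapping of driver names to their read/write capabilities:

    from pyogrio import list_drivers

    drivers = list_drivers()  # e.g. {"GPKG": "rw", "KML": "rw", ...}
    kml_append_possible = "LIBKML" in drivers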
+
+
+@pytest.mark.requires_arrow_write_api
+def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
+    points = [Point(10, 20), Point(30, 40), Point(50, 60)]
+    gdf = gp.GeoDataFrame(geometry=points, crs="EPSG:4326")
+    output_path = tmp_path / "test.geojson"
+    write_dataframe(
+        gdf,
+        output_path,
+        layer="tmp_layer",
+        driver="GeoJSON",
+        RFC7946=True,
+        use_arrow=use_arrow,
+    )
+
+    gdf_in = read_dataframe(output_path, use_arrow=use_arrow)
+
+    assert np.array_equal(gdf_in.geometry.values, points)
+
+    # test appending to the existing file
+
+    points_append = [Point(70, 80), Point(90, 100), Point(110, 120)]
+    gdf_append = gp.GeoDataFrame(geometry=points_append, crs="EPSG:4326")
+
+    write_dataframe(
+        gdf_append,
+        output_path,
+        layer="tmp_layer",
+        driver="GeoJSON",
+        RFC7946=True,
+        use_arrow=use_arrow,
+        append=True,
+    )
+
+    gdf_in_appended = read_dataframe(output_path, use_arrow=use_arrow)
+    assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)
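`RFC7946=True` above is not a pyogrio parameter: extra keywords to `write_dataframe` are passed through to GDAL as driver-specific creation options, here asking the GeoJSON driver to normalize output per RFC 7946. A minimal sketch of the same pass-through (the output path and data are illustrative):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio.geopandas import read_dataframe, write_dataframe

    gdf = gp.GeoDataFrame(geometry=[Point(10, 20)], crs="EPSG:4326")

    # RFC7946 is a GeoJSON driver creation option, forwarded through **kwargs
    write_dataframe(gdf, "out.geojson", driver="GeoJSON", RFC7946=True)
    result = read_dataframe("out.geojson")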