pyogrio 0.7.1-cp38-cp38-manylinux_2_28_aarch64.whl → 0.8.0-cp38-cp38-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)
  1. pyogrio/__init__.py +4 -0
  2. pyogrio/_compat.py +7 -1
  3. pyogrio/_err.cpython-38-aarch64-linux-gnu.so +0 -0
  4. pyogrio/_err.pyx +7 -3
  5. pyogrio/_geometry.cpython-38-aarch64-linux-gnu.so +0 -0
  6. pyogrio/_io.cpython-38-aarch64-linux-gnu.so +0 -0
  7. pyogrio/_io.pyx +900 -242
  8. pyogrio/_ogr.cpython-38-aarch64-linux-gnu.so +0 -0
  9. pyogrio/_ogr.pxd +65 -12
  10. pyogrio/_ogr.pyx +8 -24
  11. pyogrio/_version.py +3 -3
  12. pyogrio/_vsi.cpython-38-aarch64-linux-gnu.so +0 -0
  13. pyogrio/_vsi.pxd +4 -0
  14. pyogrio/_vsi.pyx +140 -0
  15. pyogrio/core.py +43 -44
  16. pyogrio/gdal_data/GDAL-targets-release.cmake +3 -3
  17. pyogrio/gdal_data/GDAL-targets.cmake +10 -6
  18. pyogrio/gdal_data/GDALConfigVersion.cmake +3 -3
  19. pyogrio/gdal_data/gdalinfo_output.schema.json +2 -0
  20. pyogrio/gdal_data/gdalvrt.xsd +163 -0
  21. pyogrio/gdal_data/ogrinfo_output.schema.json +12 -1
  22. pyogrio/gdal_data/vcpkg.spdx.json +25 -25
  23. pyogrio/gdal_data/vcpkg_abi_info.txt +27 -26
  24. pyogrio/geopandas.py +131 -30
  25. pyogrio/proj_data/ITRF2008 +2 -2
  26. pyogrio/proj_data/proj-config-version.cmake +2 -2
  27. pyogrio/proj_data/proj-config.cmake +2 -1
  28. pyogrio/proj_data/proj-targets.cmake +13 -13
  29. pyogrio/proj_data/proj.db +0 -0
  30. pyogrio/proj_data/proj4-targets.cmake +13 -13
  31. pyogrio/proj_data/vcpkg.spdx.json +20 -42
  32. pyogrio/proj_data/vcpkg_abi_info.txt +14 -15
  33. pyogrio/raw.py +438 -116
  34. pyogrio/tests/conftest.py +75 -6
  35. pyogrio/tests/test_arrow.py +841 -7
  36. pyogrio/tests/test_core.py +99 -7
  37. pyogrio/tests/test_geopandas_io.py +744 -119
  38. pyogrio/tests/test_path.py +22 -3
  39. pyogrio/tests/test_raw_io.py +276 -50
  40. pyogrio/util.py +41 -19
  41. {pyogrio-0.7.1.dist-info → pyogrio-0.8.0.dist-info}/METADATA +3 -2
  42. {pyogrio-0.7.1.dist-info → pyogrio-0.8.0.dist-info}/RECORD +211 -209
  43. {pyogrio-0.7.1.dist-info → pyogrio-0.8.0.dist-info}/WHEEL +1 -1
  44. pyogrio.libs/{libgdal-d9f9f680.so.33.3.7.2 → libgdal-b2fb2022.so.34.3.8.5} +0 -0
  45. pyogrio/tests/win32.py +0 -86
  46. {pyogrio-0.7.1.dist-info → pyogrio-0.8.0.dist-info}/LICENSE +0 -0
  47. {pyogrio-0.7.1.dist-info → pyogrio-0.8.0.dist-info}/top_level.txt +0 -0
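
Most of the churn in pyogrio/tests/test_geopandas_io.py (shown below) exercises two additions in 0.8.0: use_arrow=True support in write_dataframe (the Arrow write API, which the tests skip unless GDAL >= 3.8 is available) and reading/writing in-memory files via BytesIO. A minimal sketch of the calls these tests exercise — the GeoDataFrame construction here is illustrative, not taken from the diff:

    from io import BytesIO

    import geopandas as gp
    import shapely
    from pyogrio import read_dataframe, write_dataframe

    # Illustrative input; any GeoDataFrame works.
    df = gp.GeoDataFrame({"col": [1]}, geometry=[shapely.Point(0, 0)], crs="EPSG:4326")

    # New in 0.8.0: write through the Arrow write API (requires GDAL >= 3.8).
    write_dataframe(df, "out.gpkg", use_arrow=True)

    # New in 0.8.0: write to an in-memory file; a driver must be provided explicitly.
    buffer = BytesIO()
    write_dataframe(df, buffer, driver="GPKG", layer="test")
    roundtripped = read_dataframe(buffer, use_arrow=True)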
pyogrio/tests/test_geopandas_io.py
@@ -1,6 +1,8 @@
  import contextlib
  from datetime import datetime
- import os
+ from io import BytesIO
+ import locale
+
  import numpy as np
  import pytest

@@ -14,9 +16,11 @@ from pyogrio.raw import (
  from pyogrio.tests.conftest import (
      ALL_EXTS,
      DRIVERS,
-     requires_arrow_api,
+     requires_pyarrow_api,
+     requires_arrow_write_api,
      requires_gdal_geos,
  )
+ from pyogrio._compat import PANDAS_GE_15, HAS_ARROW_WRITE_API

  try:
      import pandas as pd
@@ -44,13 +48,30 @@ pytest.importorskip("geopandas")
      scope="session",
      params=[
          False,
-         pytest.param(True, marks=requires_arrow_api),
+         pytest.param(True, marks=requires_pyarrow_api),
      ],
  )
  def use_arrow(request):
      return request.param


+ @pytest.fixture(autouse=True)
+ def skip_if_no_arrow_write_api(request):
+     # automatically skip tests that use use_arrow=True and require the Arrow write
+     # API (marked with `@pytest.mark.requires_arrow_write_api`) if it is not available
+     use_arrow = (
+         request.getfixturevalue("use_arrow")
+         if "use_arrow" in request.fixturenames
+         else False
+     )
+     if (
+         use_arrow
+         and not HAS_ARROW_WRITE_API
+         and request.node.get_closest_marker("requires_arrow_write_api")
+     ):
+         pytest.skip("GDAL>=3.8 required for Arrow write API")
+
+
  def spatialite_available(path):
      try:
          _ = read_dataframe(
@@ -61,6 +82,45 @@ def spatialite_available(path):
      return False


+ @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
+ def test_read_csv_encoding(tmp_path, encoding):
+     # Write csv test file. Depending on the OS this will be written in a different
+     # encoding: for Linux and macOS this is utf-8, for Windows it is cp1252.
+     csv_path = tmp_path / "test.csv"
+     with open(csv_path, "w", encoding=encoding) as csv:
+         csv.write("näme,city\n")
+         csv.write("Wilhelm Röntgen,Zürich\n")
+
+     # Read csv. The data should be read with the same default encoding as the csv file
+     # was written in, but should have been converted to utf-8 in the dataframe
+     # returned. Hence the asserts below, with strings in utf-8, should be OK.
+     df = read_dataframe(csv_path, encoding=encoding)
+
+     assert len(df) == 1
+     assert df.columns.tolist() == ["näme", "city"]
+     assert df.city.tolist() == ["Zürich"]
+     assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
+ @pytest.mark.skipif(
+     locale.getpreferredencoding().upper() == "UTF-8",
+     reason="test requires non-UTF-8 default platform",
+ )
+ def test_read_csv_platform_encoding(tmp_path):
+     """Verify that read defaults to platform encoding; only works on Windows (CP1252)."""
+     csv_path = tmp_path / "test.csv"
+     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
+         csv.write("näme,city\n")
+         csv.write("Wilhelm Röntgen,Zürich\n")
+
+     df = read_dataframe(csv_path)
+
+     assert len(df) == 1
+     assert df.columns.tolist() == ["näme", "city"]
+     assert df.city.tolist() == ["Zürich"]
+     assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
  def test_read_dataframe(naturalearth_lowres_all_ext):
      df = read_dataframe(naturalearth_lowres_all_ext)

@@ -76,8 +136,8 @@ def test_read_dataframe(naturalearth_lowres_all_ext):
      ]


- def test_read_dataframe_vsi(naturalearth_lowres_vsi):
-     df = read_dataframe(naturalearth_lowres_vsi[1])
+ def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
+     df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
      assert len(df) == 177


@@ -153,6 +213,7 @@ def test_read_force_2d(test_fgdb_vsi, use_arrow):


  @pytest.mark.filterwarnings("ignore: Measured")
+ @pytest.mark.filterwarnings("ignore: More than one layer found in")
  def test_read_layer(test_fgdb_vsi, use_arrow):
      layers = list_layers(test_fgdb_vsi)
      kwargs = {"use_arrow": use_arrow, "read_geometry": False, "max_features": 1}
@@ -185,8 +246,13 @@ def test_read_datetime(test_fgdb_vsi, use_arrow):
      assert df.SURVEY_DAT.dtype.name == "datetime64[ns]"


- def test_read_datetime_tz(test_datetime_tz, tmp_path):
+ @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
+ @pytest.mark.requires_arrow_write_api
+ def test_read_datetime_tz(test_datetime_tz, tmp_path, use_arrow):
      df = read_dataframe(test_datetime_tz)
+     # Make the index non-consecutive to test this case as well. Added for issue
+     # https://github.com/geopandas/pyogrio/issues/324
+     df = df.set_index(np.array([0, 2]))
      raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]

      if PANDAS_GE_20:
@@ -194,15 +260,22 @@ def test_read_datetime_tz(test_datetime_tz, tmp_path):
      else:
          expected = pd.to_datetime(raw_expected)
      expected = pd.Series(expected, name="datetime_col")
-     assert_series_equal(df.datetime_col, expected)
+     assert_series_equal(df.datetime_col, expected, check_index=False)
      # test write and read round trips
      fpath = tmp_path / "test.gpkg"
-     write_dataframe(df, fpath)
-     df_read = read_dataframe(fpath)
+     write_dataframe(df, fpath, use_arrow=use_arrow)
+     df_read = read_dataframe(fpath, use_arrow=use_arrow)
+     if use_arrow:
+         # with Arrow, the datetimes are always read as UTC
+         expected = expected.dt.tz_convert("UTC")
      assert_series_equal(df_read.datetime_col, expected)


- def test_write_datetime_mixed_offset(tmp_path):
+ @pytest.mark.filterwarnings(
+     "ignore: Non-conformant content for record 1 in column dates"
+ )
+ @pytest.mark.requires_arrow_write_api
+ def test_write_datetime_mixed_offset(tmp_path, use_arrow):
      # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
      dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
      naive_col = pd.Series(pd.to_datetime(dates), name="dates")
@@ -216,14 +289,18 @@ def test_write_datetime_mixed_offset(tmp_path):
          crs="EPSG:4326",
      )
      fpath = tmp_path / "test.gpkg"
-     write_dataframe(df, fpath)
-     result = read_dataframe(fpath)
+     write_dataframe(df, fpath, use_arrow=use_arrow)
+     result = read_dataframe(fpath, use_arrow=use_arrow)
      # GDAL tz only encodes offsets, not timezones
      # check multiple offsets are read as utc datetime instead of string values
      assert_series_equal(result["dates"], utc_col)


- def test_read_write_datetime_tz_with_nulls(tmp_path):
+ @pytest.mark.filterwarnings(
+     "ignore: Non-conformant content for record 1 in column dates"
+ )
+ @pytest.mark.requires_arrow_write_api
+ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
      dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
      if PANDAS_GE_20:
          dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
@@ -234,13 +311,18 @@ def test_read_write_datetime_tz_with_nulls(tmp_path):
          crs="EPSG:4326",
      )
      fpath = tmp_path / "test.gpkg"
-     write_dataframe(df, fpath)
-     result = read_dataframe(fpath)
+     write_dataframe(df, fpath, use_arrow=use_arrow)
+     result = read_dataframe(fpath, use_arrow=use_arrow)
+     if use_arrow:
+         # with Arrow, the datetimes are always read as UTC
+         df["dates"] = df["dates"].dt.tz_convert("UTC")
      assert_geodataframe_equal(df, result)


  def test_read_null_values(test_fgdb_vsi, use_arrow):
-     df = read_dataframe(test_fgdb_vsi, use_arrow=use_arrow, read_geometry=False)
+     df = read_dataframe(
+         test_fgdb_vsi, layer="basetable_2", use_arrow=use_arrow, read_geometry=False
+     )

      # make sure that Null values are preserved
      assert df.SEGMENT_NAME.isnull().max()
@@ -330,6 +412,21 @@ def test_read_where_invalid(request, naturalearth_lowres_all_ext, use_arrow):
      )


+ def test_read_where_ignored_field(naturalearth_lowres, use_arrow):
+     # column included in where is not also included in list of columns, which means
+     # GDAL will return no features
+     # NOTE: this behavior is inconsistent across drivers so only shapefiles are
+     # tested for this
+     df = read_dataframe(
+         naturalearth_lowres,
+         where=""" "iso_a3" = 'CAN' """,
+         columns=["name"],
+         use_arrow=use_arrow,
+     )
+
+     assert len(df) == 0
+
+
  @pytest.mark.parametrize("bbox", [(1,), (1, 2), (1, 2, 3)])
  def test_read_bbox_invalid(naturalearth_lowres_all_ext, bbox, use_arrow):
      with pytest.raises(ValueError, match="Invalid bbox"):
@@ -348,7 +445,7 @@ def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected):
      if (
          use_arrow
          and __gdal_version__ < (3, 8, 0)
-         and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg"
+         and naturalearth_lowres_all_ext.suffix == ".gpkg"
      ):
          pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

@@ -437,7 +534,7 @@ def test_read_mask(
      if (
          use_arrow
          and __gdal_version__ < (3, 8, 0)
-         and os.path.splitext(naturalearth_lowres_all_ext)[1] == ".gpkg"
+         and naturalearth_lowres_all_ext.suffix == ".gpkg"
      ):
          pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

@@ -469,14 +566,45 @@ def test_read_mask_where(naturalearth_lowres_all_ext, use_arrow):
      assert np.array_equal(df.iso_a3, ["CAN"])


- def test_read_fids(naturalearth_lowres_all_ext):
+ @pytest.mark.parametrize("fids", [[1, 5, 10], np.array([1, 5, 10], dtype=np.int64)])
+ def test_read_fids(naturalearth_lowres_all_ext, fids, use_arrow):
      # ensure keyword is properly passed through
-     fids = np.array([1, 10, 5], dtype=np.int64)
-     df = read_dataframe(naturalearth_lowres_all_ext, fids=fids, fid_as_index=True)
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, fids=fids, fid_as_index=True, use_arrow=use_arrow
+     )
      assert len(df) == 3
      assert np.array_equal(fids, df.index.values)


+ @requires_pyarrow_api
+ def test_read_fids_arrow_max_exception(naturalearth_lowres):
+     # Maximum number at time of writing is 4997 for "OGRSQL". For e.g. SQLite-based
+     # formats like GeoPackage, there is no limit.
+     nb_fids = 4998
+     fids = range(nb_fids)
+     with pytest.raises(ValueError, match=f"error applying filter for {nb_fids} fids"):
+         _ = read_dataframe(naturalearth_lowres, fids=fids, use_arrow=True)
+
+
+ @requires_pyarrow_api
+ @pytest.mark.skipif(
+     __gdal_version__ >= (3, 8, 0), reason="GDAL >= 3.8.0 does not need to warn"
+ )
+ def test_read_fids_arrow_warning_old_gdal(naturalearth_lowres_all_ext):
+     # A warning should be given for old GDAL versions, except for some file formats.
+     if naturalearth_lowres_all_ext.suffix not in [".gpkg", ".geojson"]:
+         handler = pytest.warns(
+             UserWarning,
+             match="Using 'fids' and 'use_arrow=True' with GDAL < 3.8 can be slow",
+         )
+     else:
+         handler = contextlib.nullcontext()
+
+     with handler:
+         df = read_dataframe(naturalearth_lowres_all_ext, fids=[22], use_arrow=True)
+     assert len(df) == 1
+
+
  def test_read_fids_force_2d(test_fgdb_vsi):
      with pytest.warns(
          UserWarning, match=r"Measured \(M\) geometry types are not supported"
@@ -572,13 +700,17 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
      # The geometry column cannot be specified when using the
      # default OGRSQL dialect but is returned nonetheless, so 4 columns.
      sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df.columns) == 4
      assert len(df) == 177

      # Should return single row
      sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df) == 1
      assert len(df.columns) == 6
      assert df.iloc[0].iso_a3 == "CAN"
@@ -586,7 +718,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
      sql = """SELECT *
               FROM naturalearth_lowres
               WHERE iso_a3 IN ('CAN', 'USA', 'MEX')"""
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df.columns) == 6
      assert len(df) == 3
      assert df.iso_a3.tolist() == ["CAN", "USA", "MEX"]
@@ -595,7 +729,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
               FROM naturalearth_lowres
               WHERE iso_a3 IN ('CAN', 'USA', 'MEX')
               ORDER BY name"""
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df.columns) == 6
      assert len(df) == 3
      assert df.iso_a3.tolist() == ["CAN", "MEX", "USA"]
@@ -604,7 +740,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
      sql = """SELECT *
               FROM naturalearth_lowres
               WHERE POP_EST >= 10000000 AND POP_EST < 100000000"""
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df) == 75
      assert len(df.columns) == 6
      assert df.pop_est.min() >= 10000000
@@ -612,25 +750,36 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow):

      # Should match no items.
      sql = "SELECT * FROM naturalearth_lowres WHERE ISO_A3 = 'INVALID'"
-     df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL")
+     df = read_dataframe(
+         naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
+     )
      assert len(df) == 0


- def test_read_sql_invalid(naturalearth_lowres_all_ext):
+ def test_read_sql_invalid(naturalearth_lowres_all_ext, use_arrow):
      if naturalearth_lowres_all_ext.suffix == ".gpkg":
          with pytest.raises(Exception, match="In ExecuteSQL().*"):
-             read_dataframe(naturalearth_lowres_all_ext, sql="invalid")
+             read_dataframe(
+                 naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
+             )
      else:
          with pytest.raises(Exception, match="SQL Expression Parsing Error"):
-             read_dataframe(naturalearth_lowres_all_ext, sql="invalid")
+             read_dataframe(
+                 naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
+             )

      with pytest.raises(
          ValueError, match="'sql' paramater cannot be combined with 'layer'"
      ):
-         read_dataframe(naturalearth_lowres_all_ext, sql="whatever", layer="invalid")
+         read_dataframe(
+             naturalearth_lowres_all_ext,
+             sql="whatever",
+             layer="invalid",
+             use_arrow=use_arrow,
+         )


- def test_read_sql_columns_where(naturalearth_lowres_all_ext):
+ def test_read_sql_columns_where(naturalearth_lowres_all_ext, use_arrow):
      sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
      df = read_dataframe(
          naturalearth_lowres_all_ext,
@@ -638,13 +787,14 @@ def test_read_sql_columns_where(naturalearth_lowres_all_ext):
          sql_dialect="OGRSQL",
          columns=["iso_a3_renamed", "name"],
          where="iso_a3_renamed IN ('CAN', 'USA', 'MEX')",
+         use_arrow=use_arrow,
      )
      assert len(df.columns) == 3
      assert len(df) == 3
      assert df.iso_a3_renamed.tolist() == ["CAN", "USA", "MEX"]


- def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext):
+ def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext, use_arrow):
      sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
      df = read_dataframe(
          naturalearth_lowres_all_ext,
@@ -653,13 +803,14 @@ def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext):
          sql_dialect="OGRSQL",
          columns=["iso_a3_renamed", "name"],
          where="iso_a3_renamed IN ('CRI', 'PAN')",
          bbox=(-85, 8, -80, 10),
+         use_arrow=use_arrow,
      )
      assert len(df.columns) == 3
      assert len(df) == 2
      assert df.iso_a3_renamed.tolist() == ["PAN", "CRI"]


- def test_read_sql_skip_max(naturalearth_lowres_all_ext):
+ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow):
      sql = """SELECT *
               FROM naturalearth_lowres
               WHERE iso_a3 IN ('CAN', 'MEX', 'USA')
@@ -670,6 +821,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext):
          skip_features=1,
          max_features=1,
          sql_dialect="OGRSQL",
+         use_arrow=use_arrow,
      )
      assert len(df.columns) == 6
      assert len(df) == 1
@@ -677,13 +829,21 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext):

      sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
      df = read_dataframe(
-         naturalearth_lowres_all_ext, sql=sql, max_features=3, sql_dialect="OGRSQL"
+         naturalearth_lowres_all_ext,
+         sql=sql,
+         max_features=3,
+         sql_dialect="OGRSQL",
+         use_arrow=use_arrow,
      )
      assert len(df) == 1

      sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
      df = read_dataframe(
-         naturalearth_lowres_all_ext, sql=sql, skip_features=1, sql_dialect="OGRSQL"
+         naturalearth_lowres_all_ext,
+         sql=sql,
+         sql_dialect="OGRSQL",
+         skip_features=1,
+         use_arrow=use_arrow,
      )
      assert len(df) == 0

@@ -694,10 +854,12 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext):
      [ext for ext in ALL_EXTS if ext != ".gpkg"],
      indirect=["naturalearth_lowres"],
  )
- def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
+ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres, use_arrow):
      # Should return singular item
      sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
-     df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE")
+     df = read_dataframe(
+         naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
+     )
      assert len(df) == 1
      assert len(df.columns) == 6
      assert df.iloc[0].iso_a3 == "CAN"
@@ -707,7 +869,9 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
      sql = """SELECT ST_Buffer(geometry, 5) AS geometry, name, pop_est, iso_a3
               FROM naturalearth_lowres
               WHERE ISO_A3 = 'CAN'"""
-     df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE")
+     df = read_dataframe(
+         naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
+     )
      assert len(df) == 1
      assert len(df.columns) == 4
      assert df.iloc[0].geometry.area > area_canada
@@ -717,12 +881,14 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres):
  @pytest.mark.parametrize(
      "naturalearth_lowres", [".gpkg"], indirect=["naturalearth_lowres"]
  )
- def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres):
+ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
      # "INDIRECT_SQL" prohibits GDAL from passing the SQL statement to sqlite.
      # Because the statement is processed within GDAL it is possible to use
      # spatialite functions even if sqlite isn't built with spatialite support.
      sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
-     df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE")
+     df = read_dataframe(
+         naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
+     )
      assert len(df) == 1
      assert len(df.columns) == 6
      assert df.iloc[0].iso_a3 == "CAN"
@@ -732,29 +898,67 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres):
      sql = """SELECT ST_Buffer(geom, 5) AS geometry, name, pop_est, iso_a3
               FROM naturalearth_lowres
               WHERE ISO_A3 = 'CAN'"""
-     df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE")
+     df = read_dataframe(
+         naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
+     )
      assert len(df) == 1
      assert len(df.columns) == 4
      assert df.iloc[0].geometry.area > area_canada


+ @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
+ def test_write_csv_encoding(tmp_path, encoding):
+     """Test if write_dataframe uses the default encoding correctly."""
+     # Write csv test file. Depending on the OS this will be written in a different
+     # encoding: for Linux and macOS this is utf-8, for Windows it is cp1252.
+     csv_path = tmp_path / "test.csv"
+
+     with open(csv_path, "w", encoding=encoding) as csv:
+         csv.write("näme,city\n")
+         csv.write("Wilhelm Röntgen,Zürich\n")
+
+     # Write csv test file with the same data using write_dataframe. It should use the
+     # same encoding as above.
+     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
+     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
+     write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+
+     # Check if the text files written both ways can be read again and give the same
+     # result.
+     with open(csv_path, "r", encoding=encoding) as csv:
+         csv_str = csv.read()
+     with open(csv_pyogrio_path, "r", encoding=encoding) as csv_pyogrio:
+         csv_pyogrio_str = csv_pyogrio.read()
+     assert csv_str == csv_pyogrio_str
+
+     # Check if the files are binary identical, to be 100% sure they were written with
+     # the same encoding.
+     with open(csv_path, "rb") as csv:
+         csv_bytes = csv.read()
+     with open(csv_pyogrio_path, "rb") as csv_pyogrio:
+         csv_pyogrio_bytes = csv_pyogrio.read()
+     assert csv_bytes == csv_pyogrio_bytes
+
+
  @pytest.mark.parametrize("ext", ALL_EXTS)
- def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):
      input_gdf = read_dataframe(naturalearth_lowres)
      output_path = tmp_path / f"test{ext}"

      if ext == ".fgb":
          # For .fgb, spatial_index=False to avoid the rows being reordered
-         write_dataframe(input_gdf, output_path, spatial_index=False)
+         write_dataframe(
+             input_gdf, output_path, use_arrow=use_arrow, spatial_index=False
+         )
      else:
-         write_dataframe(input_gdf, output_path)
+         write_dataframe(input_gdf, output_path, use_arrow=use_arrow)

      assert output_path.exists()
      result_gdf = read_dataframe(output_path)

      geometry_types = result_gdf.geometry.type.unique()
      if DRIVERS[ext] in DRIVERS_NO_MIXED_SINGLE_MULTI:
-         assert geometry_types == ["MultiPolygon"]
+         assert list(geometry_types) == ["MultiPolygon"]
      else:
          assert set(geometry_types) == set(["MultiPolygon", "Polygon"])

@@ -775,14 +979,21 @@ def test_write_dataframe(tmp_path, naturalearth_lowres, ext):


  @pytest.mark.filterwarnings("ignore:.*No SRS set on layer.*")
+ @pytest.mark.parametrize("write_geodf", [True, False])
  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS + [".xlsx"] if ext != ".fgb"])
- def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
-     """Test writing a dataframe without a geometry column.
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_no_geom(
+     request, tmp_path, naturalearth_lowres, write_geodf, ext, use_arrow
+ ):
+     """Test writing a (geo)dataframe without a geometry column.

      FlatGeobuf (.fgb) doesn't seem to support this, and just writes an empty file.
      """
      # Prepare test data
      input_df = read_dataframe(naturalearth_lowres, read_geometry=False)
+     if write_geodf:
+         input_df = gp.GeoDataFrame(input_df)
+
      output_path = tmp_path / f"test{ext}"

      # A shapefile without geometry column results in only a .dbf file.
@@ -792,7 +1003,7 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
      # Determine driver
      driver = DRIVERS[ext] if ext != ".xlsx" else "XLSX"

-     write_dataframe(input_df, output_path, driver=driver)
+     write_dataframe(input_df, output_path, use_arrow=use_arrow, driver=driver)

      assert output_path.exists()
      result_df = read_dataframe(output_path)
@@ -805,6 +1016,9 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
      if ext in [".gpkg", ".shp", ".xlsx"]:
          # These file types return a DataFrame when read.
          assert not isinstance(result_df, gp.GeoDataFrame)
+         if isinstance(input_df, gp.GeoDataFrame):
+             input_df = pd.DataFrame(input_df)
+
          pd.testing.assert_frame_equal(
              result_df, input_df, check_index_type=False, check_dtype=check_dtype
          )
@@ -821,12 +1035,27 @@ def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
      )


+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow):
+     # dataframe writing ignores the index
+     input_gdf = read_dataframe(naturalearth_lowres)
+     input_gdf = input_gdf.set_index("iso_a3")
+
+     output_path = tmp_path / "test.shp"
+     write_dataframe(input_gdf, output_path, use_arrow=use_arrow)
+
+     result_gdf = read_dataframe(output_path)
+     assert isinstance(result_gdf.index, pd.RangeIndex)
+     assert_geodataframe_equal(result_gdf, input_gdf.reset_index(drop=True))
+
+
  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
- def test_write_empty_dataframe(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_empty_dataframe(tmp_path, ext, use_arrow):
      expected = gp.GeoDataFrame(geometry=[], crs=4326)

      filename = tmp_path / f"test{ext}"
-     write_dataframe(expected, filename)
+     write_dataframe(expected, filename, use_arrow=use_arrow)

      assert filename.exists()
      df = read_dataframe(filename)
@@ -834,83 +1063,119 @@ def test_write_empty_dataframe(tmp_path, ext, use_arrow):


  @pytest.mark.parametrize("ext", [".geojsonl", ".geojsons"])
- def test_write_read_empty_dataframe_unsupported(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_read_empty_dataframe_unsupported(tmp_path, ext, use_arrow):
      # Writing an empty dataframe to .geojsons or .geojsonl logically results in a
      # 0-byte file, but gdal isn't able to read those again at the time of writing.
      # Issue logged here: https://github.com/geopandas/pyogrio/issues/94
      expected = gp.GeoDataFrame(geometry=[], crs=4326)

      filename = tmp_path / f"test{ext}"
-     write_dataframe(expected, filename)
+     write_dataframe(expected, filename, use_arrow=use_arrow)

      assert filename.exists()
      with pytest.raises(
-         Exception, match=".* not recognized as a supported file format."
+         Exception, match=".* not recognized as( being in)? a supported file format."
      ):
-         _ = read_dataframe(filename)
+         _ = read_dataframe(filename, use_arrow=use_arrow)


- def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres, use_arrow):
      input_gdf = read_dataframe(naturalearth_lowres)
-     output_path = tmp_path / "test.gpkg"
+     filename = tmp_path / "test.gpkg"

-     write_dataframe(input_gdf, output_path, layer="first", promote_to_multi=True)
+     write_dataframe(
+         input_gdf,
+         filename,
+         layer="first",
+         promote_to_multi=True,
+         use_arrow=use_arrow,
+     )

-     assert os.path.exists(output_path)
-     assert np.array_equal(list_layers(output_path), [["first", "MultiPolygon"]])
+     assert filename.exists()
+     assert np.array_equal(list_layers(filename), [["first", "MultiPolygon"]])

-     write_dataframe(input_gdf, output_path, layer="second", promote_to_multi=True)
+     write_dataframe(
+         input_gdf,
+         filename,
+         layer="second",
+         promote_to_multi=True,
+         use_arrow=use_arrow,
+     )
      assert np.array_equal(
-         list_layers(output_path),
+         list_layers(filename),
          [["first", "MultiPolygon"], ["second", "MultiPolygon"]],
      )


  @pytest.mark.parametrize("ext", ALL_EXTS)
- def test_write_dataframe_append(tmp_path, naturalearth_lowres, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_append(request, tmp_path, naturalearth_lowres, ext, use_arrow):
      if ext == ".fgb" and __gdal_version__ <= (3, 5, 0):
          pytest.skip("Append to FlatGeobuf fails for GDAL <= 3.5.0")

      if ext in (".geojsonl", ".geojsons") and __gdal_version__ <= (3, 6, 0):
          pytest.skip("Append to GeoJSONSeq only available for GDAL >= 3.6.0")

+     if use_arrow and ext.startswith(".geojson"):
+         # Bug in GDAL when appending int64 to GeoJSON
+         # (https://github.com/OSGeo/gdal/issues/9792)
+         request.node.add_marker(
+             pytest.mark.xfail(reason="Bugs with append when writing Arrow to GeoJSON")
+         )
+
      input_gdf = read_dataframe(naturalearth_lowres)
-     output_path = tmp_path / f"test{ext}"
+     filename = tmp_path / f"test{ext}"

-     write_dataframe(input_gdf, output_path)
+     write_dataframe(input_gdf, filename, use_arrow=use_arrow)

-     assert os.path.exists(output_path)
-     assert len(read_dataframe(output_path)) == 177
+     assert filename.exists()
+     assert len(read_dataframe(filename)) == 177

-     write_dataframe(input_gdf, output_path, append=True)
-     assert len(read_dataframe(output_path)) == 354
+     write_dataframe(input_gdf, filename, use_arrow=use_arrow, append=True)
+     assert len(read_dataframe(filename)) == 354


  @pytest.mark.parametrize("spatial_index", [False, True])
- def test_write_dataframe_gdal_options(tmp_path, naturalearth_lowres, spatial_index):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options(
+     tmp_path, naturalearth_lowres, spatial_index, use_arrow
+ ):
      df = read_dataframe(naturalearth_lowres)

      outfilename1 = tmp_path / "test1.shp"
-     write_dataframe(df, outfilename1, SPATIAL_INDEX="YES" if spatial_index else "NO")
+     write_dataframe(
+         df,
+         outfilename1,
+         use_arrow=use_arrow,
+         SPATIAL_INDEX="YES" if spatial_index else "NO",
+     )
      assert outfilename1.exists() is True
      index_filename1 = tmp_path / "test1.qix"
      assert index_filename1.exists() is spatial_index

      # using explicit layer_options instead
      outfilename2 = tmp_path / "test2.shp"
-     write_dataframe(df, outfilename2, layer_options=dict(spatial_index=spatial_index))
+     write_dataframe(
+         df,
+         outfilename2,
+         use_arrow=use_arrow,
+         layer_options=dict(spatial_index=spatial_index),
+     )
      assert outfilename2.exists() is True
      index_filename2 = tmp_path / "test2.qix"
      assert index_filename2.exists() is spatial_index


- def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres, use_arrow):
      df = read_dataframe(naturalearth_lowres)

      # geojson has no spatial index, so passing keyword should raise
      outfilename = tmp_path / "test.geojson"
      with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"):
-         write_dataframe(df, outfilename, spatial_index=True)
+         write_dataframe(df, outfilename, use_arrow=use_arrow, spatial_index=True)


  def _get_gpkg_table_names(path):
@@ -923,21 +1188,25 @@ def _get_gpkg_table_names(path):
      return [res[0] for res in result]


- def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres, use_arrow):
      df = read_dataframe(naturalearth_lowres)

      test_default_filename = tmp_path / "test_default.gpkg"
-     write_dataframe(df, test_default_filename)
+     write_dataframe(df, test_default_filename, use_arrow=use_arrow)
      assert "gpkg_ogr_contents" in _get_gpkg_table_names(test_default_filename)

      test_no_contents_filename = tmp_path / "test_no_contents.gpkg"
-     write_dataframe(df, test_default_filename, ADD_GPKG_OGR_CONTENTS="NO")
+     write_dataframe(
+         df, test_default_filename, use_arrow=use_arrow, ADD_GPKG_OGR_CONTENTS="NO"
+     )
      assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename)

      test_no_contents_filename2 = tmp_path / "test_no_contents2.gpkg"
      write_dataframe(
          df,
          test_no_contents_filename2,
+         use_arrow=use_arrow,
          dataset_options=dict(add_gpkg_ogr_contents=False),
      )
      assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename2)
@@ -954,6 +1223,7 @@ def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres):
          (".geojson", False, ["MultiPolygon", "Polygon"], "Unknown"),
      ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi(
      tmp_path,
      naturalearth_lowres,
@@ -961,11 +1231,14 @@ def test_write_dataframe_promote_to_multi(
      promote_to_multi,
      expected_geometry_types,
      expected_geometry_type,
+     use_arrow,
  ):
      input_gdf = read_dataframe(naturalearth_lowres)

      output_path = tmp_path / f"test_promote{ext}"
-     write_dataframe(input_gdf, output_path, promote_to_multi=promote_to_multi)
+     write_dataframe(
+         input_gdf, output_path, use_arrow=use_arrow, promote_to_multi=promote_to_multi
+     )

      assert output_path.exists()
      output_gdf = read_dataframe(output_path)
@@ -998,6 +1271,7 @@ def test_write_dataframe_promote_to_multi(
          (".shp", True, "Unknown", ["MultiPolygon", "Polygon"], "Polygon"),
      ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi_layer_geom_type(
      tmp_path,
      naturalearth_lowres,
@@ -1006,6 +1280,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
      geometry_type,
      expected_geometry_types,
      expected_geometry_type,
+     use_arrow,
  ):
      input_gdf = read_dataframe(naturalearth_lowres)

@@ -1022,6 +1297,7 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
      write_dataframe(
          input_gdf,
          output_path,
+         use_arrow=use_arrow,
          promote_to_multi=promote_to_multi,
          geometry_type=geometry_type,
      )
@@ -1040,9 +1316,15 @@ def test_write_dataframe_promote_to_multi_layer_geom_type(
          (".fgb", False, "Polygon", "Mismatched geometry type"),
          (".fgb", None, "Point", "Mismatched geometry type"),
          (".fgb", None, "Polygon", "Mismatched geometry type"),
-         (".shp", None, "Point", "Could not add feature to layer at index"),
+         (
+             ".shp",
+             None,
+             "Point",
+             "Could not add feature to layer at index|Error while writing batch to OGR layer",
+         ),
      ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_dataframe_promote_to_multi_layer_geom_type_invalid(
      tmp_path,
      naturalearth_lowres,
@@ -1050,31 +1332,37 @@ def test_write_dataframe_promote_to_multi_layer_geom_type_invalid(
      promote_to_multi,
      geometry_type,
      expected_raises_match,
+     use_arrow,
  ):
      input_gdf = read_dataframe(naturalearth_lowres)

      output_path = tmp_path / f"test{ext}"
-     with pytest.raises(FeatureError, match=expected_raises_match):
+     with pytest.raises((FeatureError, DataLayerError), match=expected_raises_match):
          write_dataframe(
              input_gdf,
              output_path,
+             use_arrow=use_arrow,
              promote_to_multi=promote_to_multi,
              geometry_type=geometry_type,
          )


- def test_write_dataframe_layer_geom_type_invalid(tmp_path, naturalearth_lowres):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_layer_geom_type_invalid(
+     tmp_path, naturalearth_lowres, use_arrow
+ ):
      df = read_dataframe(naturalearth_lowres)

      filename = tmp_path / "test.geojson"
      with pytest.raises(
          GeometryError, match="Geometry type is not supported: NotSupported"
      ):
-         write_dataframe(df, filename, geometry_type="NotSupported")
+         write_dataframe(df, filename, use_arrow=use_arrow, geometry_type="NotSupported")


  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".shp"])
- def test_write_dataframe_truly_mixed(tmp_path, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_truly_mixed(tmp_path, ext, use_arrow):
      geometry = [
          shapely.Point(0, 0),
          shapely.LineString([(0, 0), (1, 1)]),
@@ -1094,9 +1382,9 @@ def test_write_dataframe_truly_mixed(tmp_path, ext):

      if ext == ".fgb":
          # For .fgb, spatial_index=False to avoid the rows being reordered
-         write_dataframe(df, filename, spatial_index=False)
+         write_dataframe(df, filename, use_arrow=use_arrow, spatial_index=False)
      else:
-         write_dataframe(df, filename)
+         write_dataframe(df, filename, use_arrow=use_arrow)

      # Drivers that support mixed geometries will default to "Unknown" geometry type
      assert read_info(filename)["geometry_type"] == "Unknown"
@@ -1104,7 +1392,8 @@ def test_write_dataframe_truly_mixed(tmp_path, ext):
      assert_geodataframe_equal(result, df, check_geom_type=True)


- def test_write_dataframe_truly_mixed_invalid(tmp_path):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_truly_mixed_invalid(tmp_path, use_arrow):
      # Shapefile doesn't support generic "Geometry" / "Unknown" type
      # for mixed geometries

@@ -1122,9 +1411,12 @@ def test_write_dataframe_truly_mixed_invalid(tmp_path):
      msg = (
          "Could not add feature to layer at index 1: Attempt to "
          r"write non-point \(LINESTRING\) geometry to point shapefile."
+         # DataLayerError when using Arrow
+         "|Error while writing batch to OGR layer: Attempt to "
+         r"write non-point \(LINESTRING\) geometry to point shapefile."
      )
-     with pytest.raises(FeatureError, match=msg):
-         write_dataframe(df, tmp_path / "test.shp")
+     with pytest.raises((FeatureError, DataLayerError), match=msg):
+         write_dataframe(df, tmp_path / "test.shp", use_arrow=use_arrow)


  @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".fgb"])
@@ -1137,11 +1429,12 @@ def test_write_dataframe_truly_mixed_invalid(tmp_path):
          [None, None],
      ],
  )
- def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext, use_arrow):
      filename = tmp_path / f"test{ext}"

      df = gp.GeoDataFrame({"col": [1.0, 2.0]}, geometry=geoms, crs="EPSG:4326")
-     write_dataframe(df, filename)
+     write_dataframe(df, filename, use_arrow=use_arrow)
      result = read_dataframe(filename)
      assert_geodataframe_equal(result, df)

@@ -1149,16 +1442,19 @@ def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext):
  @pytest.mark.filterwarnings(
      "ignore: You will likely lose important projection information"
  )
- def test_custom_crs_io(tmpdir, naturalearth_lowres_all_ext):
+ @pytest.mark.requires_arrow_write_api
+ def test_custom_crs_io(tmp_path, naturalearth_lowres_all_ext, use_arrow):
      df = read_dataframe(naturalearth_lowres_all_ext)
      # project Belgium to a custom Albers Equal Area projection
-     expected = df.loc[df.name == "Belgium"].to_crs(
-         "+proj=aea +lat_1=49.5 +lat_2=51.5 +lon_0=4.3"
+     expected = (
+         df.loc[df.name == "Belgium"]
+         .reset_index(drop=True)
+         .to_crs("+proj=aea +lat_1=49.5 +lat_2=51.5 +lon_0=4.3")
      )
-     filename = os.path.join(str(tmpdir), "test.shp")
-     write_dataframe(expected, filename)
+     filename = tmp_path / "test.shp"
+     write_dataframe(expected, filename, use_arrow=use_arrow)

-     assert os.path.exists(filename)
+     assert filename.exists()

      df = read_dataframe(filename)

@@ -1170,6 +1466,7 @@ def test_custom_crs_io(tmpdir, naturalearth_lowres_all_ext):


  def test_write_read_mixed_column_values(tmp_path):
+     # use_arrow=True is tested separately below
      mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
      geoms = [shapely.Point(0, 0) for _ in mixed_values]
      test_gdf = gp.GeoDataFrame(
@@ -1186,7 +1483,21 @@ def test_write_read_mixed_column_values(tmp_path):
          assert output_gdf["mixed"][idx] == str(value)


- def test_write_read_null(tmp_path):
+ @requires_arrow_write_api
+ def test_write_read_mixed_column_values_arrow(tmp_path):
+     # Arrow cannot represent a column of mixed types
+     mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
+     geoms = [shapely.Point(0, 0) for _ in mixed_values]
+     test_gdf = gp.GeoDataFrame(
+         {"geometry": geoms, "mixed": mixed_values}, crs="epsg:31370"
+     )
+     output_path = tmp_path / "test_write_mixed_column.gpkg"
+     with pytest.raises(TypeError, match=".*Conversion failed for column"):
+         write_dataframe(test_gdf, output_path, use_arrow=True)
+
+
+ @pytest.mark.requires_arrow_write_api
+ def test_write_read_null(tmp_path, use_arrow):
      output_path = tmp_path / "test_write_nan.gpkg"
      geom = shapely.Point(0, 0)
      test_data = {
@@ -1195,7 +1506,7 @@ def test_write_read_null(tmp_path):
          "object_str": ["test", None, np.nan],
      }
      test_gdf = gp.GeoDataFrame(test_data, crs="epsg:31370")
-     write_dataframe(test_gdf, output_path)
+     write_dataframe(test_gdf, output_path, use_arrow=use_arrow)
      result_gdf = read_dataframe(output_path)
      assert len(test_gdf) == len(result_gdf)
      assert result_gdf["float64"][0] == 1.0
@@ -1218,7 +1529,7 @@ def test_write_read_null(tmp_path):
          ["2.5D MultiLineString", "MultiLineString Z"],
      ),
      (
-         "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))",  # NOQA
+         "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))",
          ["2.5D MultiPolygon", "MultiPolygon Z"],
      ),
      (
@@ -1227,11 +1538,12 @@ def test_write_read_null(tmp_path):
          ),
      ],
  )
- def test_write_geometry_z_types(tmp_path, wkt, geom_types):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_geometry_z_types(tmp_path, wkt, geom_types, use_arrow):
      filename = tmp_path / "test.fgb"
      gdf = gp.GeoDataFrame(geometry=from_wkt([wkt]), crs="EPSG:4326")
      for geom_type in geom_types:
-         write_dataframe(gdf, filename, geometry_type=geom_type)
+         write_dataframe(gdf, filename, use_arrow=use_arrow, geometry_type=geom_type)
          df = read_dataframe(filename)
          assert_geodataframe_equal(df, gdf)

@@ -1260,7 +1572,7 @@ def test_write_geometry_z_types(tmp_path, wkt, geom_types):
          "MultiPolygon Z",
          False,
          [
-             "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))"  # noqa: E501
+             "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))"
          ],
      ),
      (
@@ -1285,8 +1597,9 @@ def test_write_geometry_z_types(tmp_path, wkt, geom_types):
          ),
      ],
  )
+ @pytest.mark.requires_arrow_write_api
  def test_write_geometry_z_types_auto(
-     tmp_path, ext, test_descr, exp_geometry_type, mixed_dimensions, wkt
+     tmp_path, ext, test_descr, exp_geometry_type, mixed_dimensions, wkt, use_arrow
  ):
      # Shapefile has some different behaviour than other file types
      if ext == ".shp":
@@ -1313,10 +1626,10 @@ def test_write_geometry_z_types_auto(
              DataSourceError,
              match=("Mixed 2D and 3D coordinates are not supported by"),
          ):
-             write_dataframe(gdf, filename)
+             write_dataframe(gdf, filename, use_arrow=use_arrow)
          return
      else:
-         write_dataframe(gdf, filename)
+         write_dataframe(gdf, filename, use_arrow=use_arrow)

      info = read_info(filename)
      assert info["geometry_type"] == exp_geometry_type
@@ -1328,11 +1641,17 @@ def test_write_geometry_z_types_auto(
      assert_geodataframe_equal(gdf, result_gdf)


- def test_read_multisurface(data_dir):
-     df = read_dataframe(data_dir / "test_multisurface.gpkg")
+ def test_read_multisurface(data_dir, use_arrow):
+     if use_arrow:
+         with pytest.raises(shapely.errors.GEOSException):
+             # TODO(Arrow)
+             # shapely fails parsing the WKB
+             read_dataframe(data_dir / "test_multisurface.gpkg", use_arrow=True)
+     else:
+         df = read_dataframe(data_dir / "test_multisurface.gpkg")

-     # MultiSurface should be converted to MultiPolygon
-     assert df.geometry.type.tolist() == ["MultiPolygon"]
+         # MultiSurface should be converted to MultiPolygon
+         assert df.geometry.type.tolist() == ["MultiPolygon"]


  def test_read_dataset_kwargs(data_dir, use_arrow):
@@ -1371,7 +1690,8 @@ def test_read_invalid_dataset_kwargs(naturalearth_lowres, use_arrow):
      read_dataframe(naturalearth_lowres, use_arrow=use_arrow, INVALID="YES")


- def test_write_nullable_dtypes(tmp_path):
+ @pytest.mark.requires_arrow_write_api
+ def test_write_nullable_dtypes(tmp_path, use_arrow):
      path = tmp_path / "test_nullable_dtypes.gpkg"
      test_data = {
          "col1": pd.Series([1, 2, 3], dtype="int64"),
@@ -1383,7 +1703,7 @@ def test_write_nullable_dtypes(tmp_path):
      input_gdf = gp.GeoDataFrame(
          test_data, geometry=[shapely.Point(0, 0)] * 3, crs="epsg:31370"
      )
-     write_dataframe(input_gdf, path)
+     write_dataframe(input_gdf, path, use_arrow=use_arrow)
      output_gdf = read_dataframe(path)
      # We read it back as default (non-nullable) numpy dtypes, so we cast
      # to those for the expected result
@@ -1392,19 +1712,21 @@ def test_write_nullable_dtypes(tmp_path):
      expected["col3"] = expected["col3"].astype("float32")
      expected["col4"] = expected["col4"].astype("float64")
      expected["col5"] = expected["col5"].astype(object)
+     expected.loc[1, "col5"] = None  # pandas converts to pd.NA on line above
      assert_geodataframe_equal(output_gdf, expected)


  @pytest.mark.parametrize(
      "metadata_type", ["dataset_metadata", "layer_metadata", "metadata"]
  )
- def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type):
+ @pytest.mark.requires_arrow_write_api
+ def test_metadata_io(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
      metadata = {"level": metadata_type}

      df = read_dataframe(naturalearth_lowres)

-     filename = os.path.join(str(tmpdir), "test.gpkg")
-     write_dataframe(df, filename, **{metadata_type: metadata})
+     filename = tmp_path / "test.gpkg"
+     write_dataframe(df, filename, use_arrow=use_arrow, **{metadata_type: metadata})

      metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type

@@ -1420,25 +1742,328 @@ def test_metadata_io(tmpdir, naturalearth_lowres, metadata_type):
1420
1742
  {"key": 1},
1421
1743
  ],
1422
1744
  )
1423
- def test_invalid_metadata(tmpdir, naturalearth_lowres, metadata_type, metadata):
1745
+ @pytest.mark.requires_arrow_write_api
1746
+ def test_invalid_metadata(
1747
+ tmp_path, naturalearth_lowres, metadata_type, metadata, use_arrow
1748
+ ):
1749
+ df = read_dataframe(naturalearth_lowres)
1424
1750
  with pytest.raises(ValueError, match="must be a string"):
1425
- filename = os.path.join(str(tmpdir), "test.gpkg")
1426
1751
  write_dataframe(
1427
- read_dataframe(naturalearth_lowres), filename, **{metadata_type: metadata}
1752
+ df, tmp_path / "test.gpkg", use_arrow=use_arrow, **{metadata_type: metadata}
1428
1753
  )
1429
1754
 
1430
1755
 
1431
1756
  @pytest.mark.parametrize("metadata_type", ["dataset_metadata", "layer_metadata"])
1432
- def test_metadata_unsupported(tmpdir, naturalearth_lowres, metadata_type):
1757
+ @pytest.mark.requires_arrow_write_api
1758
+ def test_metadata_unsupported(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
1433
1759
  """metadata is silently ignored"""
1434
1760
 
1435
- filename = os.path.join(str(tmpdir), "test.geojson")
1761
+ filename = tmp_path / "test.geojson"
1436
1762
  write_dataframe(
1437
1763
  read_dataframe(naturalearth_lowres),
1438
1764
  filename,
1765
+ use_arrow=use_arrow,
1439
1766
  **{metadata_type: {"key": "value"}},
1440
1767
  )
1441
1768
 
1442
1769
  metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type
1443
1770
 
1444
1771
  assert read_info(filename)[metadata_key] is None
1772
+
1773
+
1774
+ @pytest.mark.skipif(not PANDAS_GE_15, reason="ArrowDtype requires pandas 1.5+")
1775
+ def test_read_dataframe_arrow_dtypes(tmp_path):
1776
+ # https://github.com/geopandas/pyogrio/issues/319 - ensure arrow binary
1777
+ # column can be converted with from_wkb in case of missing values
1778
+ pytest.importorskip("pyarrow")
1779
+ filename = tmp_path / "test.gpkg"
1780
+ df = gp.GeoDataFrame(
1781
+ {"col": [1.0, 2.0]}, geometry=[Point(1, 1), None], crs="EPSG:4326"
1782
+ )
1783
+ write_dataframe(df, filename)
1784
+
1785
+ result = read_dataframe(
1786
+ filename,
1787
+ use_arrow=True,
1788
+ arrow_to_pandas_kwargs={
1789
+ "types_mapper": lambda pa_dtype: pd.ArrowDtype(pa_dtype)
1790
+ },
1791
+ )
1792
+ assert isinstance(result["col"].dtype, pd.ArrowDtype)
1793
+ result["col"] = result["col"].astype("float64")
1794
+ assert_geodataframe_equal(result, df)
1795
+
1796
+
+ @requires_pyarrow_api
+ @pytest.mark.skipif(
+     __gdal_version__ < (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
+ )
+ @pytest.mark.parametrize("ext", ALL_EXTS)
+ def test_arrow_bool_roundtrip(tmp_path, ext):
+     filename = tmp_path / f"test{ext}"
+
+     kwargs = {}
+
+     if ext == ".fgb":
+         # for .fgb, use spatial_index=False to avoid the rows being reordered
+         kwargs["spatial_index"] = False
+
+     df = gp.GeoDataFrame(
+         {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
+         crs="EPSG:4326",
+     )
+
+     write_dataframe(df, filename, **kwargs)
+     result = read_dataframe(filename, use_arrow=True)
+     # Shapefiles do not support bool columns; these are returned as int32
+     assert_geodataframe_equal(result, df, check_dtype=ext != ".shp")
+
+
+ @requires_pyarrow_api
+ @pytest.mark.skipif(
+     __gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
+ )
+ @pytest.mark.parametrize("ext", ALL_EXTS)
+ def test_arrow_bool_exception(tmp_path, ext):
+     filename = tmp_path / f"test{ext}"
+
+     df = gp.GeoDataFrame(
+         {"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
+         crs="EPSG:4326",
+     )
+
+     write_dataframe(df, filename)
+
+     if ext in {".fgb", ".gpkg"}:
+         # only raises an exception for GPKG / FGB
+         with pytest.raises(
+             RuntimeError,
+             match="GDAL < 3.8.3 does not correctly read boolean data values using "
+             "the Arrow API",
+         ):
+             read_dataframe(filename, use_arrow=True)
+
+         # no exception is raised if no bool columns are read
+         read_dataframe(filename, use_arrow=True, columns=[])
+
+     else:
+         _ = read_dataframe(filename, use_arrow=True)
+
+
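Both bool tests gate on __gdal_version__, and callers on affected GDAL builds can apply the same guard. A minimal sketch, assuming the caller simply falls back to the non-Arrow read path (the helper name is illustrative, not pyogrio API):

    from pyogrio import __gdal_version__, read_dataframe

    def read_dataframe_arrow_safe(path, **kwargs):
        # GDAL < 3.8.3 misreads boolean values through the Arrow stream for
        # GPKG / FlatGeobuf, so only use the Arrow path on fixed versions
        use_arrow = __gdal_version__ >= (3, 8, 3)
        return read_dataframe(path, use_arrow=use_arrow, **kwargs)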
+ @pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning")
+ @pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
+ def test_write_memory(naturalearth_lowres, driver):
+     df = read_dataframe(naturalearth_lowres)
+
+     buffer = BytesIO()
+     write_dataframe(df, buffer, driver=driver, layer="test")
+
+     assert len(buffer.getbuffer()) > 0
+
+     actual = read_dataframe(buffer)
+     assert len(actual) == len(df)
+
+     is_json = driver == "GeoJSON"
+
+     assert_geodataframe_equal(
+         actual,
+         df,
+         check_less_precise=is_json,
+         check_index_type=False,
+         check_dtype=not is_json,
+     )
+
+
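As test_write_memory shows, an empty seekable buffer is a valid write target as long as the driver is named explicitly, since there is no filename extension to infer it from. A minimal usage sketch:

    from io import BytesIO

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio import read_dataframe, write_dataframe

    df = gp.GeoDataFrame({"geometry": [Point(0, 0)]}, crs="EPSG:4326")

    buffer = BytesIO()
    write_dataframe(df, buffer, driver="GPKG", layer="test")

    # the buffer now holds a complete GeoPackage and can be re-read directly,
    # or buffer.getvalue() can be shipped elsewhere as raw bytes
    roundtripped = read_dataframe(buffer)
    assert len(roundtripped) == len(df)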
+ def test_write_memory_driver_required(naturalearth_lowres):
+     df = read_dataframe(naturalearth_lowres)
+
+     buffer = BytesIO()
+
+     with pytest.raises(
+         ValueError,
+         match="driver must be provided to write to in-memory file",
+     ):
+         write_dataframe(df.head(1), buffer, driver=None, layer="test")
+
+
+ @pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"])
+ def test_write_memory_unsupported_driver(naturalearth_lowres, driver):
+     if driver == "OpenFileGDB" and __gdal_version__ < (3, 6, 0):
+         pytest.skip("OpenFileGDB write support only available for GDAL >= 3.6.0")
+
+     df = read_dataframe(naturalearth_lowres)
+
+     buffer = BytesIO()
+
+     with pytest.raises(
+         ValueError, match=f"writing to in-memory file is not supported for {driver}"
+     ):
+         write_dataframe(df, buffer, driver=driver, layer="test")
+
+
+ @pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
+ def test_write_memory_append_unsupported(naturalearth_lowres, driver):
+     df = read_dataframe(naturalearth_lowres)
+
+     buffer = BytesIO()
+
+     with pytest.raises(
+         NotImplementedError, match="append is not supported for in-memory files"
+     ):
+         write_dataframe(df.head(1), buffer, driver=driver, layer="test", append=True)
+
+
+ def test_write_memory_existing_unsupported(naturalearth_lowres):
+     df = read_dataframe(naturalearth_lowres)
+
+     buffer = BytesIO(b"0000")
+     with pytest.raises(
+         NotImplementedError,
+         match="writing to existing in-memory object is not supported",
+     ):
+         write_dataframe(df.head(1), buffer, driver="GeoJSON", layer="test")
+
+
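Taken together, the four in-memory failure tests pin down the current limits: a driver must be given, multi-file formats (ESRI Shapefile, OpenFileGDB) are rejected, append is unsupported, and the buffer must start empty. A pre-flight check mirroring those rules (the helper and driver set are illustrative, not pyogrio API):

    from io import BytesIO
    from typing import Optional

    # illustrative: drivers the tests above show are rejected for in-memory writes
    UNSUPPORTED_MEMORY_DRIVERS = {"ESRI Shapefile", "OpenFileGDB"}

    def check_memory_write(buffer: BytesIO, driver: Optional[str], append: bool) -> None:
        # mirrors the errors asserted by the tests above
        if driver is None:
            raise ValueError("driver must be provided to write to in-memory file")
        if driver in UNSUPPORTED_MEMORY_DRIVERS:
            raise ValueError(f"writing to in-memory file is not supported for {driver}")
        if append:
            raise NotImplementedError("append is not supported for in-memory files")
        if buffer.getbuffer().nbytes > 0:
            raise NotImplementedError(
                "writing to existing in-memory object is not supported"
            )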
+ @pytest.mark.parametrize("ext", ["gpkg", "geojson"])
+ def test_non_utf8_encoding_io(tmp_path, ext, encoded_text):
+     """Verify that we can write non-UTF-8 data to the data source.
+
+     IMPORTANT: this may not be valid for the data source and will likely render
+     it unusable in other tools, but it should roundtrip successfully unless we
+     disable writing using other encodings.
+
+     NOTE: the FlatGeobuf driver cannot handle non-UTF-8 data in GDAL >= 3.9
+
+     NOTE: pyarrow cannot handle non-UTF-8 characters in this way
+     """
+
+     encoding, text = encoded_text
+     output_path = tmp_path / f"test.{ext}"
+
+     df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+     write_dataframe(df, output_path, encoding=encoding)
+
+     # cannot open these files without specifying the encoding
+     with pytest.raises(UnicodeDecodeError):
+         read_dataframe(output_path)
+
+     # must provide the encoding to read these properly
+     actual = read_dataframe(output_path, encoding=encoding)
+     assert actual.columns[0] == text
+     assert actual[text].values[0] == text
+
+
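The pattern these tests exercise is symmetric: whatever legacy codepage was used on write must be supplied again on read, since GPKG and GeoJSON carry no encoding marker. A hedged sketch (codepage, text, and path are examples):

    import geopandas as gp
    from shapely.geometry import Point
    from pyogrio import read_dataframe, write_dataframe

    df = gp.GeoDataFrame({"市": ["東京"], "geometry": [Point(0, 0)]}, crs="EPSG:4326")

    # bytes on disk are CP932, although these formats nominally require UTF-8
    write_dataframe(df, "legacy.gpkg", encoding="CP932")

    # the same encoding must be passed on read; omitting it raises UnicodeDecodeError
    back = read_dataframe("legacy.gpkg", encoding="CP932")
    assert back["市"].values[0] == "東京"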
+ @requires_pyarrow_api
+ @pytest.mark.parametrize("ext", ["gpkg", "geojson"])
+ def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text):
+     encoding, text = encoded_text
+     output_path = tmp_path / f"test.{ext}"
+
+     df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+     write_dataframe(df, output_path, encoding=encoding)
+
+     # cannot open these files without specifying the encoding
+     with pytest.raises(UnicodeDecodeError):
+         read_dataframe(output_path)
+
+     with pytest.raises(
+         ValueError, match="non-UTF-8 encoding is not supported for Arrow"
+     ):
+         read_dataframe(output_path, encoding=encoding, use_arrow=True)
+
+
+ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
+     encoding, text = encoded_text
+
+     output_path = tmp_path / "test.shp"
+
+     df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+     write_dataframe(df, output_path, encoding=encoding)
+
+     # NOTE: GDAL automatically creates a .cpg file with the encoding name, which
+     # means that if we read this without specifying the encoding it uses the
+     # correct one
+     actual = read_dataframe(output_path, use_arrow=use_arrow)
+     assert actual.columns[0] == text
+     assert actual[text].values[0] == text
+
+     # verify that if the .cpg file is not present, the user-provided encoding
+     # must be used
+     output_path.with_suffix(".cpg").unlink()
+
+     # reading then falls back to assuming ISO-8859-1, which is wrong
+     miscoded = text.encode(encoding).decode("ISO-8859-1")
+
+     if use_arrow:
+         # pyarrow cannot decode the column name with the incorrect encoding
+         with pytest.raises(UnicodeDecodeError):
+             read_dataframe(output_path, use_arrow=True)
+     else:
+         bad = read_dataframe(output_path, use_arrow=False)
+         assert bad.columns[0] == miscoded
+         assert bad[miscoded].values[0] == miscoded
+
+     # if the encoding is provided, that should yield the correct text
+     actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow)
+     assert actual.columns[0] == text
+     assert actual[text].values[0] == text
+
+     # if the ENCODING open option is provided, that should yield the correct text
+     actual = read_dataframe(output_path, use_arrow=use_arrow, ENCODING=encoding)
+     assert actual.columns[0] == text
+     assert actual[text].values[0] == text
+
+
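For shapefiles, the test above demonstrates three equivalent ways to get the codepage applied on read; in sketch form (paths illustrative):

    from pyogrio import read_dataframe

    # 1. rely on the .cpg sidecar that GDAL wrote next to the .shp
    df = read_dataframe("test.shp")

    # 2. no .cpg present: pass the encoding parameter explicitly
    df = read_dataframe("test.shp", encoding="CP936")

    # 3. or pass the shapefile driver's ENCODING open option (but never both,
    #    as the collision tests below show)
    df = read_dataframe("test.shp", ENCODING="CP936")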
+ def test_encoding_read_option_collision_shapefile(naturalearth_lowres, use_arrow):
+     """Providing both the encoding parameter and the ENCODING open option
+     (even if blank) is not allowed."""
+
+     with pytest.raises(
+         ValueError, match='cannot provide both encoding parameter and "ENCODING" option'
+     ):
+         read_dataframe(
+             naturalearth_lowres, encoding="CP936", ENCODING="", use_arrow=use_arrow
+         )
+
+
+ def test_encoding_write_layer_option_collision_shapefile(tmp_path, encoded_text):
+     """Providing both the encoding parameter and the ENCODING layer creation
+     option (even if blank) is not allowed."""
+     encoding, text = encoded_text
+
+     output_path = tmp_path / "test.shp"
+     df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
+
+     with pytest.raises(
+         ValueError,
+         match='cannot provide both encoding parameter and "ENCODING" layer creation option',
+     ):
+         write_dataframe(
+             df, output_path, encoding=encoding, layer_options={"ENCODING": ""}
+         )
+
+
+ def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow):
+     encoding = "CP936"
+
+     output_path = tmp_path / "test.shp"
+
+     mandarin = "中文"
+     df = gp.GeoDataFrame(
+         {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326"
+     )
+     write_dataframe(df, output_path, encoding=encoding)
+
+     actual = read_dataframe(
+         output_path,
+         sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
+         use_arrow=use_arrow,
+     )
+     assert actual.columns[0] == mandarin
+     assert actual[mandarin].values[0] == mandarin
+
+     actual = read_dataframe(
+         output_path,
+         sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
+         encoding=encoding,
+         use_arrow=use_arrow,
+     )
+     assert actual.columns[0] == mandarin
+     assert actual[mandarin].values[0] == mandarin
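The closing test also illustrates OGR SQL quoting with non-ASCII identifiers: double quotes around the column name, single quotes around the string literal, with the layer name defaulting to the shapefile's basename. An illustrative query (path hypothetical):

    from pyogrio import read_dataframe

    df = read_dataframe(
        "test.shp",
        sql="select * from test where \"中文\" = '中文'",
        encoding="CP936",
    )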