pyogrio 0.12.0__cp314-cp314t-macosx_12_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- pyogrio/.dylibs/libgdal.37.3.11.4.dylib +0 -0
- pyogrio/__init__.py +57 -0
- pyogrio/_compat.py +54 -0
- pyogrio/_env.py +59 -0
- pyogrio/_err.cpython-314t-darwin.so +0 -0
- pyogrio/_geometry.cpython-314t-darwin.so +0 -0
- pyogrio/_io.cpython-314t-darwin.so +0 -0
- pyogrio/_ogr.cpython-314t-darwin.so +0 -0
- pyogrio/_version.py +21 -0
- pyogrio/_vsi.cpython-314t-darwin.so +0 -0
- pyogrio/core.py +387 -0
- pyogrio/errors.py +25 -0
- pyogrio/gdal_data/GDAL-targets-release.cmake +19 -0
- pyogrio/gdal_data/GDAL-targets.cmake +106 -0
- pyogrio/gdal_data/GDALConfig.cmake +24 -0
- pyogrio/gdal_data/GDALConfigVersion.cmake +65 -0
- pyogrio/gdal_data/GDALLogoBW.svg +138 -0
- pyogrio/gdal_data/GDALLogoColor.svg +126 -0
- pyogrio/gdal_data/GDALLogoGS.svg +126 -0
- pyogrio/gdal_data/LICENSE.TXT +467 -0
- pyogrio/gdal_data/MM_m_idofic.csv +321 -0
- pyogrio/gdal_data/copyright +467 -0
- pyogrio/gdal_data/cubewerx_extra.wkt +48 -0
- pyogrio/gdal_data/default.rsc +0 -0
- pyogrio/gdal_data/ecw_cs.wkt +1453 -0
- pyogrio/gdal_data/eedaconf.json +23 -0
- pyogrio/gdal_data/epsg.wkt +1 -0
- pyogrio/gdal_data/esri_StatePlane_extra.wkt +631 -0
- pyogrio/gdal_data/gdal_algorithm.schema.json +220 -0
- pyogrio/gdal_data/gdalg.schema.json +36 -0
- pyogrio/gdal_data/gdalicon.png +0 -0
- pyogrio/gdal_data/gdalinfo_output.schema.json +390 -0
- pyogrio/gdal_data/gdalmdiminfo_output.schema.json +326 -0
- pyogrio/gdal_data/gdaltileindex.xsd +253 -0
- pyogrio/gdal_data/gdalvrt.xsd +927 -0
- pyogrio/gdal_data/gfs.xsd +246 -0
- pyogrio/gdal_data/gml_registry.xml +117 -0
- pyogrio/gdal_data/gml_registry.xsd +66 -0
- pyogrio/gdal_data/grib2_center.csv +251 -0
- pyogrio/gdal_data/grib2_process.csv +102 -0
- pyogrio/gdal_data/grib2_subcenter.csv +63 -0
- pyogrio/gdal_data/grib2_table_4_2_0_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_13.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_14.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_15.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_16.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_17.csv +11 -0
- pyogrio/gdal_data/grib2_table_4_2_0_18.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_19.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_190.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_191.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_2.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_20.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_21.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_3.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_4.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_5.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_6.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_0_7.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_191.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_2.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_3.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_10_4.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_1_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_1_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_1_2.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_20_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_20_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_20_2.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_2_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_2_3.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_2_4.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_2_5.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_2_6.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_3_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_3_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_3_2.csv +28 -0
- pyogrio/gdal_data/grib2_table_4_2_3_3.csv +8 -0
- pyogrio/gdal_data/grib2_table_4_2_3_4.csv +14 -0
- pyogrio/gdal_data/grib2_table_4_2_3_5.csv +11 -0
- pyogrio/gdal_data/grib2_table_4_2_3_6.csv +11 -0
- pyogrio/gdal_data/grib2_table_4_2_4_0.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_1.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_10.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_2.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_3.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_4.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_5.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_6.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_7.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_8.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_4_9.csv +261 -0
- pyogrio/gdal_data/grib2_table_4_2_local_Canada.csv +5 -0
- pyogrio/gdal_data/grib2_table_4_2_local_HPC.csv +2 -0
- pyogrio/gdal_data/grib2_table_4_2_local_MRMS.csv +175 -0
- pyogrio/gdal_data/grib2_table_4_2_local_NCEP.csv +401 -0
- pyogrio/gdal_data/grib2_table_4_2_local_NDFD.csv +38 -0
- pyogrio/gdal_data/grib2_table_4_2_local_index.csv +7 -0
- pyogrio/gdal_data/grib2_table_4_5.csv +261 -0
- pyogrio/gdal_data/grib2_table_versions.csv +3 -0
- pyogrio/gdal_data/gt_datum.csv +229 -0
- pyogrio/gdal_data/gt_ellips.csv +24 -0
- pyogrio/gdal_data/header.dxf +1124 -0
- pyogrio/gdal_data/inspire_cp_BasicPropertyUnit.gfs +57 -0
- pyogrio/gdal_data/inspire_cp_CadastralBoundary.gfs +60 -0
- pyogrio/gdal_data/inspire_cp_CadastralParcel.gfs +81 -0
- pyogrio/gdal_data/inspire_cp_CadastralZoning.gfs +161 -0
- pyogrio/gdal_data/jpfgdgml_AdmArea.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_AdmBdry.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_AdmPt.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_BldA.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_BldL.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_Cntr.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_CommBdry.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_CommPt.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_Cstline.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_ElevPt.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_GCP.gfs +94 -0
- pyogrio/gdal_data/jpfgdgml_LeveeEdge.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_RailCL.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_RdASL.gfs +44 -0
- pyogrio/gdal_data/jpfgdgml_RdArea.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_RdCompt.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_RdEdg.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_RdMgtBdry.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_RdSgmtA.gfs +59 -0
- pyogrio/gdal_data/jpfgdgml_RvrMgtBdry.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_SBAPt.gfs +49 -0
- pyogrio/gdal_data/jpfgdgml_SBArea.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_SBBdry.gfs +44 -0
- pyogrio/gdal_data/jpfgdgml_WA.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_WL.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_WStrA.gfs +54 -0
- pyogrio/gdal_data/jpfgdgml_WStrL.gfs +54 -0
- pyogrio/gdal_data/leaflet_template.html +102 -0
- pyogrio/gdal_data/nitf_spec.xml +3288 -0
- pyogrio/gdal_data/nitf_spec.xsd +171 -0
- pyogrio/gdal_data/ogr_fields_override.schema.json +125 -0
- pyogrio/gdal_data/ogrinfo_output.schema.json +528 -0
- pyogrio/gdal_data/ogrvrt.xsd +528 -0
- pyogrio/gdal_data/osmconf.ini +134 -0
- pyogrio/gdal_data/ozi_datum.csv +131 -0
- pyogrio/gdal_data/ozi_ellips.csv +35 -0
- pyogrio/gdal_data/pci_datum.txt +530 -0
- pyogrio/gdal_data/pci_ellips.txt +129 -0
- pyogrio/gdal_data/pdfcomposition.xsd +703 -0
- pyogrio/gdal_data/pds4_template.xml +65 -0
- pyogrio/gdal_data/plscenesconf.json +1985 -0
- pyogrio/gdal_data/ruian_vf_ob_v1.gfs +1455 -0
- pyogrio/gdal_data/ruian_vf_st_uvoh_v1.gfs +86 -0
- pyogrio/gdal_data/ruian_vf_st_v1.gfs +1489 -0
- pyogrio/gdal_data/ruian_vf_v1.gfs +2126 -0
- pyogrio/gdal_data/s57agencies.csv +249 -0
- pyogrio/gdal_data/s57attributes.csv +484 -0
- pyogrio/gdal_data/s57expectedinput.csv +1008 -0
- pyogrio/gdal_data/s57objectclasses.csv +287 -0
- pyogrio/gdal_data/seed_2d.dgn +0 -0
- pyogrio/gdal_data/seed_3d.dgn +0 -0
- pyogrio/gdal_data/stateplane.csv +259 -0
- pyogrio/gdal_data/template_tiles.mapml +28 -0
- pyogrio/gdal_data/tms_LINZAntarticaMapTileGrid.json +190 -0
- pyogrio/gdal_data/tms_MapML_APSTILE.json +268 -0
- pyogrio/gdal_data/tms_MapML_CBMTILE.json +346 -0
- pyogrio/gdal_data/tms_NZTM2000.json +243 -0
- pyogrio/gdal_data/trailer.dxf +434 -0
- pyogrio/gdal_data/usage +4 -0
- pyogrio/gdal_data/vcpkg-cmake-wrapper.cmake +23 -0
- pyogrio/gdal_data/vcpkg.spdx.json +291 -0
- pyogrio/gdal_data/vcpkg_abi_info.txt +45 -0
- pyogrio/gdal_data/vdv452.xml +349 -0
- pyogrio/gdal_data/vdv452.xsd +45 -0
- pyogrio/gdal_data/vicar.json +164 -0
- pyogrio/geopandas.py +978 -0
- pyogrio/proj_data/CH +22 -0
- pyogrio/proj_data/GL27 +23 -0
- pyogrio/proj_data/ITRF2000 +24 -0
- pyogrio/proj_data/ITRF2008 +94 -0
- pyogrio/proj_data/ITRF2014 +55 -0
- pyogrio/proj_data/ITRF2020 +91 -0
- pyogrio/proj_data/copyright +34 -0
- pyogrio/proj_data/deformation_model.schema.json +582 -0
- pyogrio/proj_data/nad.lst +142 -0
- pyogrio/proj_data/nad27 +810 -0
- pyogrio/proj_data/nad83 +745 -0
- pyogrio/proj_data/other.extra +53 -0
- pyogrio/proj_data/proj-config-version.cmake +44 -0
- pyogrio/proj_data/proj-config.cmake +79 -0
- pyogrio/proj_data/proj-targets-release.cmake +19 -0
- pyogrio/proj_data/proj-targets.cmake +107 -0
- pyogrio/proj_data/proj.db +0 -0
- pyogrio/proj_data/proj.ini +59 -0
- pyogrio/proj_data/proj4-targets-release.cmake +19 -0
- pyogrio/proj_data/proj4-targets.cmake +107 -0
- pyogrio/proj_data/projjson.schema.json +1174 -0
- pyogrio/proj_data/triangulation.schema.json +214 -0
- pyogrio/proj_data/usage +9 -0
- pyogrio/proj_data/vcpkg.spdx.json +203 -0
- pyogrio/proj_data/vcpkg_abi_info.txt +28 -0
- pyogrio/proj_data/world +214 -0
- pyogrio/raw.py +897 -0
- pyogrio/tests/__init__.py +0 -0
- pyogrio/tests/conftest.py +588 -0
- pyogrio/tests/fixtures/README.md +108 -0
- pyogrio/tests/fixtures/curve.gpkg +0 -0
- pyogrio/tests/fixtures/curvepolygon.gpkg +0 -0
- pyogrio/tests/fixtures/line_zm.gpkg +0 -0
- pyogrio/tests/fixtures/list_field_values_file.parquet +0 -0
- pyogrio/tests/fixtures/list_nested_struct_file.parquet +0 -0
- pyogrio/tests/fixtures/multisurface.gpkg +0 -0
- pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.cpg +1 -0
- pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.dbf +0 -0
- pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.prj +1 -0
- pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp +0 -0
- pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shx +0 -0
- pyogrio/tests/fixtures/sample.osm.pbf +0 -0
- pyogrio/tests/fixtures/test_gpkg_nulls.gpkg +0 -0
- pyogrio/tests/test_arrow.py +1160 -0
- pyogrio/tests/test_core.py +702 -0
- pyogrio/tests/test_geopandas_io.py +3218 -0
- pyogrio/tests/test_path.py +374 -0
- pyogrio/tests/test_raw_io.py +1473 -0
- pyogrio/tests/test_util.py +56 -0
- pyogrio/util.py +258 -0
- pyogrio-0.12.0.dist-info/METADATA +125 -0
- pyogrio-0.12.0.dist-info/RECORD +231 -0
- pyogrio-0.12.0.dist-info/WHEEL +6 -0
- pyogrio-0.12.0.dist-info/licenses/LICENSE +21 -0
- pyogrio-0.12.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3218 @@
+import contextlib
+import locale
+import os
+import re
+import warnings
+from datetime import datetime
+from io import BytesIO
+from zipfile import ZipFile
+
+import numpy as np
+
+from pyogrio import (
+    __gdal_version__,
+    list_drivers,
+    list_layers,
+    read_info,
+    set_gdal_config_options,
+    vsi_listtree,
+    vsi_unlink,
+)
+from pyogrio._compat import (
+    GDAL_GE_37,
+    GDAL_GE_311,
+    HAS_ARROW_WRITE_API,
+    HAS_PYPROJ,
+    PANDAS_GE_15,
+    PANDAS_GE_23,
+    PANDAS_GE_30,
+    SHAPELY_GE_21,
+)
+from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
+from pyogrio.geopandas import PANDAS_GE_20, read_dataframe, write_dataframe
+from pyogrio.raw import (
+    DRIVERS_NO_MIXED_DIMENSIONS,
+    DRIVERS_NO_MIXED_SINGLE_MULTI,
+)
+from pyogrio.tests.conftest import (
+    ALL_EXTS,
+    DRIVERS,
+    GDAL_HAS_PARQUET_DRIVER,
+    START_FID,
+    requires_arrow_write_api,
+    requires_gdal_geos,
+    requires_pyarrow_api,
+    requires_pyproj,
+)
+
+import pytest
+
+try:
+    import geopandas as gp
+    import pandas as pd
+    from geopandas.array import from_wkt
+    from pandas.api.types import is_datetime64_dtype, is_object_dtype, is_string_dtype
+
+    import shapely  # if geopandas is present, shapely is expected to be present
+    from shapely.geometry import Point
+
+    from geopandas.testing import assert_geodataframe_equal
+    from pandas.testing import (
+        assert_index_equal,
+        assert_series_equal,
+    )
+
+except ImportError:
+    pass
+
+
+pytest.importorskip("geopandas")
+
+
+@pytest.fixture(
+    scope="session",
+    params=[
+        False,
+        pytest.param(True, marks=requires_pyarrow_api),
+    ],
+)
+def use_arrow(request):
+    return request.param
+
+
+@pytest.fixture(autouse=True)
+def skip_if_no_arrow_write_api(request):
+    # automatically skip tests with use_arrow=True and that require Arrow write
+    # API (marked with `@pytest.mark.requires_arrow_write_api`) if it is not available
+    use_arrow = (
+        request.getfixturevalue("use_arrow")
+        if "use_arrow" in request.fixturenames
+        else False
+    )
+    if (
+        use_arrow
+        and not HAS_ARROW_WRITE_API
+        and request.node.get_closest_marker("requires_arrow_write_api")
+    ):
+        pytest.skip("GDAL>=3.8 required for Arrow write API")
+
+
+@contextlib.contextmanager
+def use_arrow_context():
+    original = os.environ.get("PYOGRIO_USE_ARROW", None)
+    os.environ["PYOGRIO_USE_ARROW"] = "1"
+    yield
+    if original:
+        os.environ["PYOGRIO_USE_ARROW"] = original
+    else:
+        del os.environ["PYOGRIO_USE_ARROW"]
+
+
+def test_spatialite_available(test_gpkg_nulls):
+    """Check if SpatiaLite is available by running a simple SQL query."""
+    _ = read_dataframe(
+        test_gpkg_nulls, sql="select spatialite_version();", sql_dialect="SQLITE"
+    )
+
+
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_pyarrow_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_read_csv_encoding(tmp_path, encoding, arrow):
+    """Test reading CSV files with different encodings.
+
+    Arrow only supports utf-8 encoding.
+    """
+    # Write csv test file. Depending on the os this will be written in a different
+    # encoding: for linux and macos this is utf-8, for windows it is cp1252.
+    csv_path = tmp_path / "test.csv"
+    with open(csv_path, "w", encoding=encoding) as csv:
+        csv.write("näme,city\n")
+        csv.write("Wilhelm Röntgen,Zürich\n")
+
+    # Read csv. The data should be read with the same default encoding as the csv file
+    # was written in, but should have been converted to utf-8 in the dataframe returned.
+    # Hence, the asserts below, with strings in utf-8, should be OK.
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)
+
+    assert len(df) == 1
+    assert df.columns.tolist() == ["näme", "city"]
+    assert df.city.tolist() == ["Zürich"]
+    assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
+@pytest.mark.skipif(
+    locale.getpreferredencoding().upper() == "UTF-8",
+    reason="test requires non-UTF-8 default platform",
+)
+def test_read_csv_platform_encoding(tmp_path, use_arrow):
+    """Verify that read defaults to platform encoding; only works on Windows (CP1252).
+
+    When use_arrow=True, reading a non-UTF-8 file fails.
+    """
+    csv_path = tmp_path / "test.csv"
+    with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
+        csv.write("näme,city\n")
+        csv.write("Wilhelm Röntgen,Zürich\n")
+
+    if use_arrow:
+        with pytest.raises(
+            DataSourceError,
+            match="; please use_arrow=False",
+        ):
+            df = read_dataframe(csv_path, use_arrow=use_arrow)
+    else:
+        df = read_dataframe(csv_path, use_arrow=use_arrow)
+
+        assert len(df) == 1
+        assert df.columns.tolist() == ["näme", "city"]
+        assert df.city.tolist() == ["Zürich"]
+        assert df.näme.tolist() == ["Wilhelm Röntgen"]
+
+
+def test_read_dataframe(naturalearth_lowres_all_ext):
+    df = read_dataframe(naturalearth_lowres_all_ext)
+
+    if HAS_PYPROJ:
+        assert df.crs == "EPSG:4326"
+    assert len(df) == 177
+    assert df.columns.tolist() == [
+        "pop_est",
+        "continent",
+        "name",
+        "iso_a3",
+        "gdp_md_est",
+        "geometry",
+    ]
+
+
+def test_read_dataframe_vsi(naturalearth_lowres_vsi, use_arrow):
+    df = read_dataframe(naturalearth_lowres_vsi[1], use_arrow=use_arrow)
+    assert len(df) == 177
+
+
+@pytest.mark.parametrize(
+    "columns, fid_as_index, exp_len", [(None, False, 3), ([], True, 3), ([], False, 0)]
+)
+def test_read_layer_without_geometry(
+    no_geometry_file, columns, fid_as_index, use_arrow, exp_len
+):
+    result = read_dataframe(
+        no_geometry_file,
+        columns=columns,
+        fid_as_index=fid_as_index,
+        use_arrow=use_arrow,
+    )
+    assert type(result) is pd.DataFrame
+    assert len(result) == exp_len
+
+
+@pytest.mark.parametrize(
+    "naturalearth_lowres, expected_ext",
+    [(".gpkg", ".gpkg"), (".shp", ".shp")],
+    indirect=["naturalearth_lowres"],
+)
+def test_fixture_naturalearth_lowres(naturalearth_lowres, expected_ext):
+    # Test the fixture with "indirect" parameter
+    assert naturalearth_lowres.suffix == expected_ext
+    df = read_dataframe(naturalearth_lowres)
+    assert len(df) == 177
+
+
+def test_read_no_geometry(naturalearth_lowres_all_ext, use_arrow):
+    df = read_dataframe(
+        naturalearth_lowres_all_ext, use_arrow=use_arrow, read_geometry=False
+    )
+    assert isinstance(df, pd.DataFrame)
+    assert not isinstance(df, gp.GeoDataFrame)
+
+
+def test_read_no_geometry_no_columns_no_fids(naturalearth_lowres, use_arrow):
+    with pytest.raises(
+        ValueError,
+        match=(
+            "at least one of read_geometry or return_fids must be True or columns must "
+            "be None or non-empty"
+        ),
+    ):
+        _ = read_dataframe(
+            naturalearth_lowres,
+            columns=[],
+            read_geometry=False,
+            fid_as_index=False,
+            use_arrow=use_arrow,
+        )
+
+
+def test_read_force_2d(tmp_path, use_arrow):
+    filename = tmp_path / "test.gpkg"
+
+    # create a GPKG with 3D point values
+    expected = gp.GeoDataFrame(
+        geometry=[Point(0, 0, 0), Point(1, 1, 0)], crs="EPSG:4326"
+    )
+    write_dataframe(expected, filename)
+
+    df = read_dataframe(filename)
+    assert df.iloc[0].geometry.has_z
+
+    df = read_dataframe(
+        filename,
+        force_2d=True,
+        max_features=1,
+        use_arrow=use_arrow,
+    )
+    assert not df.iloc[0].geometry.has_z
+
+
+def test_read_geojson_error(naturalearth_lowres_geojson, use_arrow):
+    try:
+        set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": 0.01})
+        with pytest.raises(
+            DataSourceError,
+            match="Failed to read GeoJSON data; .* GeoJSON object too complex",
+        ):
+            read_dataframe(naturalearth_lowres_geojson, use_arrow=use_arrow)
+    finally:
+        set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": None})
+
+
+@pytest.mark.skipif(
+    "LIBKML" not in list_drivers(),
+    reason="LIBKML driver is not available and is needed to read simpledata element",
+)
+def test_read_kml_simpledata(kml_file, use_arrow):
+    """Test reading a KML file with a simpledata element.
+
+    Simpledata elements are only read by the LibKML driver, not the KML driver.
+    """
+    gdf = read_dataframe(kml_file, use_arrow=use_arrow)
+
+    # Check if the simpledata column is present.
+    assert "formation" in gdf.columns
+    assert gdf["formation"].iloc[0] == "Ton"
+
+
+def test_read_layer(tmp_path, use_arrow):
+    filename = tmp_path / "test.gpkg"
+
+    # create a multilayer GPKG
+    expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
+    if use_arrow:
+        # TODO this needs to be fixed on the geopandas side (to ensure the
+        # GeoDataFrame() constructor does this), when use_arrow we already
+        # get columns Index with string dtype
+        expected1.columns = expected1.columns.astype("str")
+    write_dataframe(
+        expected1,
+        filename,
+        layer="layer1",
+    )
+
+    expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
+    if use_arrow:
+        expected2.columns = expected2.columns.astype("str")
+    write_dataframe(expected2, filename, layer="layer2", append=True)
+
+    assert np.array_equal(
+        list_layers(filename), [["layer1", "Point"], ["layer2", "Point"]]
+    )
+
+    kwargs = {"use_arrow": use_arrow, "max_features": 1}
+
+    # The first layer is read by default, which will warn when there are multiple
+    # layers
+    with pytest.warns(UserWarning, match="More than one layer found"):
+        df = read_dataframe(filename, **kwargs)
+
+    assert_geodataframe_equal(df, expected1)
+
+    # Reading a specific layer by name should return that layer.
+    # Detected here by a known column.
+    df = read_dataframe(filename, layer="layer2", **kwargs)
+    assert_geodataframe_equal(df, expected2)
+
+    # Reading a specific layer by index should return that layer
+    df = read_dataframe(filename, layer=1, **kwargs)
+    assert_geodataframe_equal(df, expected2)
+
+
+def test_read_layer_invalid(naturalearth_lowres_all_ext, use_arrow):
+    with pytest.raises(DataLayerError, match="Layer 'wrong' could not be opened"):
+        read_dataframe(naturalearth_lowres_all_ext, layer="wrong", use_arrow=use_arrow)
+
+
+def test_read_datetime(datetime_file, use_arrow):
+    df = read_dataframe(datetime_file, use_arrow=use_arrow)
+    if PANDAS_GE_20:
+        # starting with pandas 2.0, it preserves the passed datetime resolution
+        assert df.col.dtype.name == "datetime64[ms]"
+    else:
+        assert df.col.dtype.name == "datetime64[ns]"
+
+
+def test_read_list_types(list_field_values_files, use_arrow):
+    """Test reading a geojson file containing fields with lists."""
+    if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+        pytest.skip(
+            "Skipping test for parquet as the GDAL Parquet driver is not available"
+        )
+
+    info = read_info(list_field_values_files)
+    suffix = list_field_values_files.suffix
+
+    result = read_dataframe(list_field_values_files, use_arrow=use_arrow)
+
+    # Check list_int column
+    assert "list_int" in result.columns
+    assert info["fields"][1] == "list_int"
+    assert info["ogr_types"][1] in ("OFTIntegerList", "OFTInteger64List")
+    assert result["list_int"][0].tolist() == [0, 1]
+    assert result["list_int"][1].tolist() == [2, 3]
+    assert result["list_int"][2].tolist() == []
+    assert result["list_int"][3] is None
+    assert result["list_int"][4] is None
+
+    # Check list_double column
+    assert "list_double" in result.columns
+    assert info["fields"][2] == "list_double"
+    assert info["ogr_types"][2] == "OFTRealList"
+    assert result["list_double"][0].tolist() == [0.0, 1.0]
+    assert result["list_double"][1].tolist() == [2.0, 3.0]
+    assert result["list_double"][2].tolist() == []
+    assert result["list_double"][3] is None
+    assert result["list_double"][4] is None
+
+    # Check list_string column
+    assert "list_string" in result.columns
+    assert info["fields"][3] == "list_string"
+    assert info["ogr_types"][3] == "OFTStringList"
+    assert result["list_string"][0].tolist() == ["string1", "string2"]
+    assert result["list_string"][1].tolist() == ["string3", "string4", ""]
+    assert result["list_string"][2].tolist() == []
+    assert result["list_string"][3] is None
+    assert result["list_string"][4] == [""]
+
+    # Check list_int_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+        exp_list_int_with_null_value = [0, None]
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTInteger64List"
+        exp_subtype = "OFSTNone"
+        if use_arrow:
+            exp_list_int_with_null_value = [0.0, np.nan]
+        else:
+            exp_list_int_with_null_value = [0, 0]
+            # xfail: when reading a list of int with None values without Arrow from a
+            # .parquet file, the None values become 0, which is wrong.
+            # https://github.com/OSGeo/gdal/issues/13448
+
+    assert "list_int_with_null" in result.columns
+    assert info["fields"][4] == "list_int_with_null"
+    assert info["ogr_types"][4] == exp_type
+    assert info["ogr_subtypes"][4] == exp_subtype
+    assert result["list_int_with_null"][0][0] == 0
+    if exp_list_int_with_null_value[1] == 0:
+        assert result["list_int_with_null"][0][1] == exp_list_int_with_null_value[1]
+    else:
+        assert pd.isna(result["list_int_with_null"][0][1])
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_int_with_null"][1] == [2, 3]
+        assert result["list_int_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        assert result["list_int_with_null"][1].tolist() == [2, 3]
+        assert result["list_int_with_null"][2].tolist() == []
+
+    assert pd.isna(result["list_int_with_null"][3])
+    assert pd.isna(result["list_int_with_null"][4])
+
+    # Check list_string_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON column.
+        # Because JSON columns containing JSON Arrays are also parsed to python lists,
+        # the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTStringList"
+        exp_subtype = "OFSTNone"
+
+    assert "list_string_with_null" in result.columns
+    assert info["fields"][5] == "list_string_with_null"
+    assert info["ogr_types"][5] == exp_type
+    assert info["ogr_subtypes"][5] == exp_subtype
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_string_with_null"][0] == ["string1", None]
+        assert result["list_string_with_null"][1] == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        # When use_arrow=False, the None becomes an empty string, which is wrong.
+        exp_value = ["string1", ""] if not use_arrow else ["string1", None]
+        assert result["list_string_with_null"][0].tolist() == exp_value
+        assert result["list_string_with_null"][1].tolist() == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2].tolist() == []
+
+    assert pd.isna(result["list_string_with_null"][3])
+    assert result["list_string_with_null"][4] == [""]
+
+
@pytest.mark.requires_arrow_write_api
|
|
481
|
+
@pytest.mark.skipif(
|
|
482
|
+
not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
|
|
483
|
+
)
|
|
484
|
+
def test_read_list_nested_struct_parquet_file(
|
|
485
|
+
list_nested_struct_parquet_file, use_arrow
|
|
486
|
+
):
|
|
487
|
+
"""Test reading a Parquet file containing nested struct and list types."""
|
|
488
|
+
if not use_arrow:
|
|
489
|
+
pytest.skip(
|
|
490
|
+
"When use_arrow=False, gdal flattens nested columns to seperate columns. "
|
|
491
|
+
"Not sure how we want to deal with this case, but for now just skip."
|
|
492
|
+
)
|
|
493
|
+
|
|
494
|
+
result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow)
|
|
495
|
+
|
|
496
|
+
assert "col_flat" in result.columns
|
|
497
|
+
assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2]))
|
|
498
|
+
|
|
499
|
+
assert "col_list" in result.columns
|
|
500
|
+
assert result["col_list"].dtype == object
|
|
501
|
+
assert result["col_list"][0].tolist() == [1, 2, 3]
|
|
502
|
+
assert result["col_list"][1].tolist() == [1, 2, 3]
|
|
503
|
+
assert result["col_list"][2].tolist() == [1, 2, 3]
|
|
504
|
+
|
|
505
|
+
assert "col_nested" in result.columns
|
|
506
|
+
assert result["col_nested"].dtype == object
|
|
507
|
+
assert result["col_nested"][0].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
|
|
508
|
+
assert result["col_nested"][1].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
|
|
509
|
+
assert result["col_nested"][2].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
|
|
510
|
+
|
|
511
|
+
assert "col_struct" in result.columns
|
|
512
|
+
assert result["col_struct"].dtype == object
|
|
513
|
+
assert result["col_struct"][0] == {"a": 1, "b": 2}
|
|
514
|
+
assert result["col_struct"][1] == {"a": 1, "b": 2}
|
|
515
|
+
assert result["col_struct"][2] == {"a": 1, "b": 2}
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
@pytest.mark.filterwarnings(
|
|
519
|
+
"ignore: Non-conformant content for record 1 in column dates"
|
|
520
|
+
)
|
|
521
|
+
@pytest.mark.requires_arrow_write_api
|
|
522
|
+
def test_write_datetime_mixed_offset(tmp_path, use_arrow):
|
|
523
|
+
# Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
|
|
524
|
+
dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
|
|
525
|
+
naive_col = pd.Series(pd.to_datetime(dates), name="dates")
|
|
526
|
+
localised_col = naive_col.dt.tz_localize("Australia/Sydney")
|
|
527
|
+
utc_col = localised_col.dt.tz_convert("UTC")
|
|
528
|
+
if PANDAS_GE_20:
|
|
529
|
+
utc_col = utc_col.dt.as_unit("ms")
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
533
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
534
|
+
def test_read_datetime_long_ago(
|
|
535
|
+
geojson_datetime_long_ago, use_arrow, mixed_offsets_as_utc, datetime_as_string
|
|
536
|
+
):
|
|
537
|
+
"""Test writing/reading a column with a datetime far in the past.
|
|
538
|
+
Dates from before 1678-1-1 aren't parsed correctly by pandas < 3.0, so they
|
|
539
|
+
stay strings.
|
|
540
|
+
Reported in https://github.com/geopandas/pyogrio/issues/553.
|
|
541
|
+
"""
|
|
542
|
+
handler = contextlib.nullcontext()
|
|
543
|
+
overflow_occured = False
|
|
544
|
+
if not datetime_as_string and not PANDAS_GE_30 and (not use_arrow or GDAL_GE_311):
|
|
545
|
+
# When datetimes should not be returned as string and arrow is not used or
|
|
546
|
+
# arrow is used with GDAL >= 3.11, `pandas.to_datetime` is used to parse the
|
|
547
|
+
# datetimes. However, when using pandas < 3.0, this raises an
|
|
548
|
+
# "Out of bounds nanosecond timestamp" error for very old dates.
|
|
549
|
+
# As a result, `read_dataframe` gives a warning and the datetimes stay strings.
|
|
550
|
+
handler = pytest.warns(
|
|
551
|
+
UserWarning, match="Error parsing datetimes, original strings are returned"
|
|
552
|
+
)
|
|
553
|
+
overflow_occured = True
|
|
554
|
+
# XFAIL: datetimes before 1678-1-1 give overflow with arrow=False and pandas<3.0
|
|
555
|
+
elif use_arrow and not PANDAS_GE_20 and not GDAL_GE_311:
|
|
556
|
+
# When arrow is used with pandas < 2.0 and GDAL < 3.11, an overflow occurs in
|
|
557
|
+
# pyarrow.to_pandas().
|
|
558
|
+
handler = pytest.raises(
|
|
559
|
+
Exception,
|
|
560
|
+
match=re.escape("Casting from timestamp[ms] to timestamp[ns] would result"),
|
|
561
|
+
)
|
|
562
|
+
overflow_occured = True
|
|
563
|
+
# XFAIL: datetimes before 1678-1-1 give overflow with arrow=True and pandas<2.0
|
|
564
|
+
|
|
565
|
+
with handler:
|
|
566
|
+
df = read_dataframe(
|
|
567
|
+
geojson_datetime_long_ago,
|
|
568
|
+
use_arrow=use_arrow,
|
|
569
|
+
datetime_as_string=datetime_as_string,
|
|
570
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
exp_dates_str = pd.Series(["1670-01-01T09:00:00"], name="datetime_col")
|
|
574
|
+
if datetime_as_string:
|
|
575
|
+
assert is_string_dtype(df.datetime_col.dtype)
|
|
576
|
+
assert_series_equal(df.datetime_col, exp_dates_str)
|
|
577
|
+
else:
|
|
578
|
+
# It is a single naive datetime, so regardless of mixed_offsets_as_utc the
|
|
579
|
+
# expected "ideal" result is the same: a datetime64 without time zone info.
|
|
580
|
+
if overflow_occured:
|
|
581
|
+
# Strings are returned because of an overflow.
|
|
582
|
+
assert is_string_dtype(df.datetime_col.dtype)
|
|
583
|
+
assert_series_equal(df.datetime_col, exp_dates_str)
|
|
584
|
+
else:
|
|
585
|
+
# With use_arrow or pandas >= 3.0, old datetimes are parsed correctly.
|
|
586
|
+
assert is_datetime64_dtype(df.datetime_col)
|
|
587
|
+
assert df.datetime_col.iloc[0] == pd.Timestamp(1670, 1, 1, 9, 0, 0)
|
|
588
|
+
assert df.datetime_col.iloc[0].unit == "ms"
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
|
|
592
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
593
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
594
|
+
@pytest.mark.requires_arrow_write_api
|
|
595
|
+
def test_write_read_datetime_no_tz(
|
|
596
|
+
tmp_path, ext, datetime_as_string, mixed_offsets_as_utc, use_arrow
|
|
597
|
+
):
|
|
598
|
+
"""Test writing/reading a column with naive datetimes (no time zone information)."""
|
|
599
|
+
dates_raw = ["2020-01-01T09:00:00.123", "2020-01-01T10:00:00", np.nan]
|
|
600
|
+
if PANDAS_GE_20:
|
|
601
|
+
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
|
|
602
|
+
else:
|
|
603
|
+
dates = pd.to_datetime(dates_raw)
|
|
604
|
+
df = gp.GeoDataFrame(
|
|
605
|
+
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
fpath = tmp_path / f"test{ext}"
|
|
609
|
+
write_dataframe(df, fpath, use_arrow=use_arrow)
|
|
610
|
+
result = read_dataframe(
|
|
611
|
+
fpath,
|
|
612
|
+
use_arrow=use_arrow,
|
|
613
|
+
datetime_as_string=datetime_as_string,
|
|
614
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0):
|
|
618
|
+
# With GDAL < 3.11 with arrow, columns with naive datetimes are written
|
|
619
|
+
# correctly, but when read they are wrongly interpreted as being in UTC.
|
|
620
|
+
# The reason is complicated, but more info can be found e.g. here:
|
|
621
|
+
# https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807
|
|
622
|
+
exp_dates = df.dates.dt.tz_localize("UTC")
|
|
623
|
+
if datetime_as_string:
|
|
624
|
+
exp_dates = exp_dates.astype("str").str.replace(" ", "T")
|
|
625
|
+
exp_dates[2] = np.nan
|
|
626
|
+
assert_series_equal(result.dates, exp_dates)
|
|
627
|
+
elif not mixed_offsets_as_utc:
|
|
628
|
+
assert_series_equal(result.dates, exp_dates)
|
|
629
|
+
# XFAIL: naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow
|
|
630
|
+
|
|
631
|
+
elif datetime_as_string:
|
|
632
|
+
assert is_string_dtype(result.dates.dtype)
|
|
633
|
+
if use_arrow and __gdal_version__ < (3, 11, 0):
|
|
634
|
+
dates_str = df.dates.astype("str").str.replace(" ", "T")
|
|
635
|
+
dates_str[2] = np.nan
|
|
636
|
+
else:
|
|
637
|
+
dates_str = pd.Series(dates_raw, name="dates")
|
|
638
|
+
assert_series_equal(result.dates, dates_str)
|
|
639
|
+
else:
|
|
640
|
+
assert is_datetime64_dtype(result.dates.dtype)
|
|
641
|
+
assert_geodataframe_equal(result, df)
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
|
|
645
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
646
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
647
|
+
@pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
|
|
648
|
+
@pytest.mark.requires_arrow_write_api
|
|
649
|
+
def test_write_read_datetime_tz(
|
|
650
|
+
request, tmp_path, ext, datetime_as_string, mixed_offsets_as_utc, use_arrow
|
|
651
|
+
):
|
|
652
|
+
"""Write and read file with all equal time zones.
|
|
653
|
+
|
|
654
|
+
This should result in the result being in pandas datetime64 dtype column.
|
|
655
|
+
"""
|
|
656
|
+
if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
|
|
657
|
+
# With GDAL < 3.10 with arrow, the time zone offset was applied to the datetime
|
|
658
|
+
# as well as retaining the time zone.
|
|
659
|
+
# This was fixed in https://github.com/OSGeo/gdal/pull/11049
|
|
660
|
+
request.node.add_marker(
|
|
661
|
+
pytest.mark.xfail(
|
|
662
|
+
reason="Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow"
|
|
663
|
+
)
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", np.nan]
|
|
667
|
+
if PANDAS_GE_20:
|
|
668
|
+
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
|
|
669
|
+
else:
|
|
670
|
+
dates = pd.to_datetime(dates_raw)
|
|
671
|
+
|
|
672
|
+
# Make the index non-consecutive to test this case as well. Added for issue
|
|
673
|
+
# https://github.com/geopandas/pyogrio/issues/324
|
|
674
|
+
df = gp.GeoDataFrame(
|
|
675
|
+
{"dates": dates, "geometry": [Point(1, 1)] * 3},
|
|
676
|
+
index=[0, 2, 3],
|
|
677
|
+
crs="EPSG:4326",
|
|
678
|
+
)
|
|
679
|
+
assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
|
|
680
|
+
|
|
681
|
+
fpath = tmp_path / f"test{ext}"
|
|
682
|
+
write_dataframe(df, fpath, use_arrow=use_arrow)
|
|
683
|
+
result = read_dataframe(
|
|
684
|
+
fpath,
|
|
685
|
+
use_arrow=use_arrow,
|
|
686
|
+
datetime_as_string=datetime_as_string,
|
|
687
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
# With some older versions, the offset is represented slightly differently
|
|
691
|
+
if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
|
|
692
|
+
result.dates = result.dates.astype(df.dates.dtype)
|
|
693
|
+
|
|
694
|
+
if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0):
|
|
695
|
+
# With GDAL < 3.11 with arrow, datetime columns are written as string type
|
|
696
|
+
df_exp = df.copy()
|
|
697
|
+
df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str)
|
|
698
|
+
assert_series_equal(result.dates, df_exp.dates, check_index=False)
|
|
699
|
+
# XFAIL: datetime columns written as string with GDAL < 3.11 via arrow
|
|
700
|
+
elif datetime_as_string:
|
|
701
|
+
assert is_string_dtype(result.dates.dtype)
|
|
702
|
+
if use_arrow and __gdal_version__ < (3, 11, 0):
|
|
703
|
+
dates_str = df.dates.astype("str").str.replace(" ", "T")
|
|
704
|
+
dates_str.iloc[2] = np.nan
|
|
705
|
+
elif __gdal_version__ < (3, 7, 0):
|
|
706
|
+
# With GDAL < 3.7, time zone minutes aren't included in the string
|
|
707
|
+
dates_str = [x[:-3] for x in dates_raw if pd.notna(x)] + [np.nan]
|
|
708
|
+
dates_str = pd.Series(dates_str, name="dates")
|
|
709
|
+
else:
|
|
710
|
+
dates_str = pd.Series(dates_raw, name="dates")
|
|
711
|
+
assert_series_equal(result.dates, dates_str, check_index=False)
|
|
712
|
+
else:
|
|
713
|
+
assert_series_equal(result.dates, df.dates, check_index=False)
|
|
714
|
+
|
|
715
|
+
|
|
716
|
+
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
|
|
717
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
718
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
719
|
+
@pytest.mark.filterwarnings(
|
|
720
|
+
"ignore: Non-conformant content for record 1 in column dates"
|
|
721
|
+
)
|
|
722
|
+
@pytest.mark.requires_arrow_write_api
|
|
723
|
+
def test_write_read_datetime_tz_localized_mixed_offset(
|
|
724
|
+
tmp_path, ext, datetime_as_string, mixed_offsets_as_utc, use_arrow
|
|
725
|
+
):
|
|
726
|
+
"""Test with localized dates across a different summer/winter time zone offset."""
|
|
727
|
+
# Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
|
|
728
|
+
dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", np.nan]
|
|
729
|
+
dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates")
|
|
730
|
+
dates_local = dates_naive.dt.tz_localize("Australia/Sydney")
|
|
731
|
+
dates_local_offsets_str = dates_local.astype(str)
|
|
732
|
+
if datetime_as_string:
|
|
733
|
+
exp_dates = dates_local_offsets_str.str.replace(" ", "T")
|
|
734
|
+
exp_dates = exp_dates.str.replace(".111000", ".111")
|
|
735
|
+
if __gdal_version__ < (3, 7, 0):
|
|
736
|
+
# With GDAL < 3.7, time zone minutes aren't included in the string
|
|
737
|
+
exp_dates = exp_dates.str.slice(0, -3)
|
|
738
|
+
elif mixed_offsets_as_utc:
|
|
739
|
+
exp_dates = dates_local.dt.tz_convert("UTC")
|
|
740
|
+
if PANDAS_GE_20:
|
|
741
|
+
exp_dates = exp_dates.dt.as_unit("ms")
|
|
742
|
+
else:
|
|
743
|
+
exp_dates = dates_local_offsets_str.apply(
|
|
744
|
+
lambda x: pd.Timestamp(x) if pd.notna(x) else None
|
|
745
|
+
)
|
|
746
|
+
|
|
747
|
+
df = gp.GeoDataFrame(
|
|
748
|
+
{"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
|
|
749
|
+
)
|
|
750
|
+
fpath = tmp_path / f"test{ext}"
|
|
751
|
+
write_dataframe(df, fpath, use_arrow=use_arrow)
|
|
752
|
+
result = read_dataframe(
|
|
753
|
+
fpath,
|
|
754
|
+
use_arrow=use_arrow,
|
|
755
|
+
datetime_as_string=datetime_as_string,
|
|
756
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
757
|
+
)
|
|
758
|
+
|
|
759
|
+
if use_arrow and __gdal_version__ < (3, 11, 0):
|
|
760
|
+
if ext in (".geojson", ".geojsonl"):
|
|
761
|
+
# With GDAL < 3.11 with arrow, GDAL converts mixed time zone datetimes to
|
|
762
|
+
# UTC when read as the arrow datetime column type does not support mixed tz.
|
|
763
|
+
dates_utc = dates_local.dt.tz_convert("UTC")
|
|
764
|
+
if PANDAS_GE_20:
|
|
765
|
+
dates_utc = dates_utc.dt.as_unit("ms")
|
|
766
|
+
if datetime_as_string:
|
|
767
|
+
assert is_string_dtype(result.dates.dtype)
|
|
768
|
+
dates_utc = dates_utc.astype(str).str.replace(" ", "T")
|
|
769
|
+
assert pd.isna(result.dates[2])
|
|
770
|
+
assert_series_equal(result.dates.head(2), dates_utc.head(2))
|
|
771
|
+
# XFAIL: mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow
|
|
772
|
+
return
|
|
773
|
+
|
|
774
|
+
elif ext in (".gpkg", ".fgb"):
|
|
775
|
+
# With GDAL < 3.11 with arrow, datetime columns written as string type
|
|
776
|
+
assert pd.isna(result.dates[2])
|
|
777
|
+
assert_series_equal(result.dates.head(2), dates_local_offsets_str.head(2))
|
|
778
|
+
# XFAIL: datetime columns written as string with GDAL < 3.11 + arrow
|
|
779
|
+
return
|
|
780
|
+
|
|
781
|
+
# GDAL tz only encodes offsets, not time zones
|
|
782
|
+
if datetime_as_string:
|
|
783
|
+
assert is_string_dtype(result.dates.dtype)
|
|
784
|
+
elif mixed_offsets_as_utc:
|
|
785
|
+
assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
|
|
786
|
+
else:
|
|
787
|
+
assert is_object_dtype(result.dates.dtype)
|
|
788
|
+
|
|
789
|
+
# Check isna for the third value seperately as depending on versions this is
|
|
790
|
+
# different + pandas 3.0 assert_series_equal becomes strict about this.
|
|
791
|
+
assert pd.isna(result.dates[2])
|
|
792
|
+
assert_series_equal(result.dates.head(2), exp_dates.head(2))
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
|
|
796
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
797
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
798
|
+
@pytest.mark.filterwarnings(
|
|
799
|
+
"ignore: Non-conformant content for record 1 in column dates"
|
|
800
|
+
)
|
|
801
|
+
@pytest.mark.requires_arrow_write_api
|
|
802
|
+
def test_write_read_datetime_tz_mixed_offsets(
|
|
803
|
+
tmp_path, ext, datetime_as_string, mixed_offsets_as_utc, use_arrow
|
|
804
|
+
):
|
|
805
|
+
"""Test with dates with mixed time zone offsets."""
|
|
806
|
+
# Pandas datetime64 column types doesn't support mixed time zone offsets, so
|
|
807
|
+
# it needs to be a list of pandas.Timestamp objects instead.
|
|
808
|
+
dates = [
|
|
809
|
+
pd.Timestamp("2023-01-01 11:00:01.111+01:00"),
|
|
810
|
+
pd.Timestamp("2023-06-01 10:00:01.111+05:00"),
|
|
811
|
+
np.nan,
|
|
812
|
+
]
|
|
813
|
+
|
|
814
|
+
df = gp.GeoDataFrame(
|
|
815
|
+
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
|
|
816
|
+
)
|
|
817
|
+
fpath = tmp_path / f"test{ext}"
|
|
818
|
+
write_dataframe(df, fpath, use_arrow=use_arrow)
|
|
819
|
+
result = read_dataframe(
|
|
820
|
+
fpath,
|
|
821
|
+
use_arrow=use_arrow,
|
|
822
|
+
datetime_as_string=datetime_as_string,
|
|
823
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
824
|
+
)
|
|
825
|
+
|
|
826
|
+
if use_arrow and __gdal_version__ < (3, 11, 0):
|
|
827
|
+
if ext in (".geojson", ".geojsonl"):
|
|
828
|
+
# With GDAL < 3.11 with arrow, GDAL converts mixed time zone datetimes to
|
|
829
|
+
# UTC when read as the arrow datetime column type does not support mixed tz.
|
|
830
|
+
df_exp = df.copy()
|
|
831
|
+
df_exp.dates = pd.to_datetime(dates, utc=True)
|
|
832
|
+
if PANDAS_GE_20:
|
|
833
|
+
df_exp.dates = df_exp.dates.dt.as_unit("ms")
|
|
834
|
+
if datetime_as_string:
|
|
835
|
+
df_exp.dates = df_exp.dates.astype("str").str.replace(" ", "T")
|
|
836
|
+
df_exp.loc[2, "dates"] = pd.NA
|
|
837
|
+
assert_geodataframe_equal(result, df_exp)
|
|
838
|
+
# XFAIL: mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow
|
|
839
|
+
return
|
|
840
|
+
|
|
841
|
+
elif ext in (".gpkg", ".fgb"):
|
|
842
|
+
# With arrow and GDAL < 3.11, mixed time zone datetimes are written as
|
|
843
|
+
# string type columns, so no proper roundtrip possible.
|
|
844
|
+
df_exp = df.copy()
|
|
845
|
+
df_exp.dates = df_exp.dates.astype("string").astype("O")
|
|
846
|
+
assert_geodataframe_equal(result, df_exp)
|
|
847
|
+
# XFAIL: mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow
|
|
848
|
+
return
|
|
849
|
+
|
|
850
|
+
if datetime_as_string:
|
|
851
|
+
assert is_string_dtype(result.dates.dtype)
|
|
852
|
+
dates_str = df.dates.map(
|
|
853
|
+
lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else np.nan
|
|
854
|
+
)
|
|
855
|
+
if __gdal_version__ < (3, 7, 0):
|
|
856
|
+
# With GDAL < 3.7, time zone minutes aren't included in the string
|
|
857
|
+
dates_str = dates_str.str.slice(0, -3)
|
|
858
|
+
assert_series_equal(result.dates, dates_str)
|
|
859
|
+
elif mixed_offsets_as_utc:
|
|
860
|
+
assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
|
|
861
|
+
exp_dates = pd.to_datetime(df.dates, utc=True)
|
|
862
|
+
if PANDAS_GE_20:
|
|
863
|
+
exp_dates = exp_dates.dt.as_unit("ms")
|
|
864
|
+
assert_series_equal(result.dates, exp_dates)
|
|
865
|
+
else:
|
|
866
|
+
assert is_object_dtype(result.dates.dtype)
|
|
867
|
+
assert_geodataframe_equal(result, df)
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
|
|
871
|
+
@pytest.mark.parametrize(
|
|
872
|
+
"dates_raw",
|
|
873
|
+
[
|
|
874
|
+
(
|
|
875
|
+
pd.Timestamp("2020-01-01T09:00:00.123-05:00"),
|
|
876
|
+
pd.Timestamp("2020-01-01T10:00:00-05:00"),
|
|
877
|
+
np.nan,
|
|
878
|
+
),
|
|
879
|
+
(
|
|
880
|
+
datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"),
|
|
881
|
+
datetime.fromisoformat("2020-01-01T10:00:00-05:00"),
|
|
882
|
+
np.nan,
|
|
883
|
+
),
|
|
884
|
+
],
|
|
885
|
+
)
|
|
886
|
+
@pytest.mark.parametrize("datetime_as_string", [False, True])
|
|
887
|
+
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
|
|
888
|
+
@pytest.mark.filterwarnings(
|
|
889
|
+
"ignore: Non-conformant content for record 1 in column dates"
|
|
890
|
+
)
|
|
891
|
+
@pytest.mark.requires_arrow_write_api
|
|
892
|
+
def test_write_read_datetime_tz_objects(
|
|
893
|
+
tmp_path, dates_raw, ext, use_arrow, datetime_as_string, mixed_offsets_as_utc
|
|
894
|
+
):
|
|
895
|
+
"""Datetime objects with equal offsets are read as datetime64."""
|
|
896
|
+
dates = pd.Series(dates_raw, dtype="O")
|
|
897
|
+
df = gp.GeoDataFrame(
|
|
898
|
+
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
|
|
899
|
+
)
|
|
900
|
+
|
|
901
|
+
fpath = tmp_path / f"test{ext}"
|
|
902
|
+
write_dataframe(df, fpath, use_arrow=use_arrow)
|
|
903
|
+
result = read_dataframe(
|
|
904
|
+
fpath,
|
|
905
|
+
use_arrow=use_arrow,
|
|
906
|
+
datetime_as_string=datetime_as_string,
|
|
907
|
+
mixed_offsets_as_utc=mixed_offsets_as_utc,
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
# Check result
|
|
911
|
+
if PANDAS_GE_20:
|
|
912
|
+
exp_dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
|
|
913
|
+
else:
|
|
914
|
+
exp_dates = pd.to_datetime(dates_raw)
|
|
915
|
+
exp_df = df.copy()
|
|
916
|
+
exp_df["dates"] = pd.Series(exp_dates, name="dates")
|
|
917
|
+
|
|
918
|
+
# With some older versions, the offset is represented slightly differently
|
|
919
|
+
if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
|
|
920
|
+
result["dates"] = result.dates.astype(exp_df.dates.dtype)
|
|
921
|
+
|
|
922
|
+
if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
|
|
923
|
+
# XFAIL: Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow.
|
|
924
|
+
# The time zone offset was applied to the datetime as well as retaining
|
|
925
|
+
# the time zone. This was fixed in https://github.com/OSGeo/gdal/pull/11049
|
|
926
|
+
|
|
927
|
+
# Subtract 5 hours from the expected datetimes to match the wrong result.
|
|
928
|
+
        if datetime_as_string:
            exp_df["dates"] = pd.Series(
                [
                    "2020-01-01T04:00:00.123000-05:00",
                    "2020-01-01T05:00:00-05:00",
                    np.nan,
                ]
            )
        else:
            exp_df["dates"] = exp_df.dates - pd.Timedelta(hours=5)
            if PANDAS_GE_20:
                # The unit needs to be applied again apparently
                exp_df["dates"] = exp_df.dates.dt.as_unit("ms")
        assert_geodataframe_equal(result, exp_df)
        return

    if use_arrow and __gdal_version__ < (3, 11, 0) and ext in (".fgb", ".gpkg"):
        # XFAIL: datetime columns are written as string with GDAL < 3.11 + arrow
        # -> custom formatting because the df column is object dtype and thus
        # astype(str) converted the datetime objects one by one
        exp_df["dates"] = pd.Series(
            ["2020-01-01 09:00:00.123000-05:00", "2020-01-01 10:00:00-05:00", np.nan]
        )
        assert_geodataframe_equal(result, exp_df)
        return

    if datetime_as_string:
        assert is_string_dtype(result.dates.dtype)
        if use_arrow and __gdal_version__ < (3, 11, 0):
            # With GDAL < 3.11 with arrow, datetime columns are written as string type
            exp_df["dates"] = pd.Series(
                [
                    "2020-01-01T09:00:00.123000-05:00",
                    "2020-01-01T10:00:00-05:00",
                    np.nan,
                ]
            )
        else:
            exp_df["dates"] = pd.Series(
                ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", np.nan]
            )
            if __gdal_version__ < (3, 7, 0):
                # With GDAL < 3.7, time zone minutes aren't included in the string
                exp_df["dates"] = exp_df.dates.str.slice(0, -3)
    elif mixed_offsets_as_utc:
        # the offsets are all -05:00, so the result retains the offset and not UTC
        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
        assert str(result.dates.dtype.tz) in ("UTC-05:00", "pytz.FixedOffset(-300)")
    else:
        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)

    assert_geodataframe_equal(result, exp_df)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.parametrize("datetime_as_string", [False, True])
@pytest.mark.parametrize("mixed_offsets_as_utc", [False, True])
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_utc(
    tmp_path, ext, use_arrow, datetime_as_string, mixed_offsets_as_utc
):
    """Test writing/reading a column with UTC datetimes."""
    dates_raw = ["2020-01-01T09:00:00.123Z", "2020-01-01T10:00:00Z", np.nan]
    if PANDAS_GE_20:
        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
    else:
        dates = pd.to_datetime(dates_raw)
    df = gp.GeoDataFrame(
        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
    )
    assert df.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]")

    fpath = tmp_path / f"test{ext}"
    write_dataframe(df, fpath, use_arrow=use_arrow)
    result = read_dataframe(
        fpath,
        use_arrow=use_arrow,
        datetime_as_string=datetime_as_string,
        mixed_offsets_as_utc=mixed_offsets_as_utc,
    )

    if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0):
        # With GDAL < 3.11 with arrow, time zone information is dropped when reading
        # .fgb
        if datetime_as_string:
            assert is_string_dtype(result.dates.dtype)
            dates_str = pd.Series(
                ["2020-01-01T09:00:00.123", "2020-01-01T10:00:00.000", np.nan],
                name="dates",
            )
            assert_series_equal(result.dates, dates_str)
        else:
            assert_series_equal(result.dates, df.dates.dt.tz_localize(None))
            # XFAIL: UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow
    elif datetime_as_string:
        assert is_string_dtype(result.dates.dtype)
        if use_arrow and __gdal_version__ < (3, 11, 0):
            dates_str = df.dates.astype("str").str.replace(" ", "T")
            dates_str[2] = np.nan
        else:
            dates_str = pd.Series(dates_raw, name="dates")
            if __gdal_version__ < (3, 7, 0):
                # With GDAL < 3.7, datetime ends with +00 for UTC, not Z
                dates_str = dates_str.str.replace("Z", "+00")
        assert_series_equal(result.dates, dates_str)
    else:
        assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]")
        assert_geodataframe_equal(result, df)


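# Usage sketch (illustrative, not part of the test suite): pyogrio can return
# datetime columns either as pandas datetimes or as the raw strings produced by
# GDAL. The "example.gpkg" path and "dates" column are assumptions.
def _example_read_datetimes(path="example.gpkg"):
    from pyogrio import read_dataframe

    # tz-aware pandas datetimes (default behaviour)
    as_datetime = read_dataframe(path)

    # raw ISO 8601 strings, useful when offsets vary per row
    as_string = read_dataframe(path, datetime_as_string=True)
    return as_datetime, as_string

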
def test_read_null_values(tmp_path, use_arrow):
    filename = tmp_path / "test_null_values_no_geometry.gpkg"

    # create a GPKG with no geometries and only null values
    expected = pd.DataFrame({"col": [None, None]})
    write_dataframe(expected, filename)

    df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)

    # make sure that Null values are preserved
    assert df["col"].isna().all()


def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):
    kwargs = {"use_arrow": use_arrow, "skip_features": 2, "max_features": 2}

    # default is to not set FIDs as index
    df = read_dataframe(naturalearth_lowres_all_ext, **kwargs)
    assert_index_equal(df.index, pd.RangeIndex(0, 2))

    df = read_dataframe(naturalearth_lowres_all_ext, fid_as_index=False, **kwargs)
    assert_index_equal(df.index, pd.RangeIndex(0, 2))

    df = read_dataframe(
        naturalearth_lowres_all_ext,
        fid_as_index=True,
        **kwargs,
    )
    fids_expected = pd.Index([2, 3], name="fid")
    fids_expected += START_FID[naturalearth_lowres_all_ext.suffix]
    assert_index_equal(df.index, fids_expected)


def test_read_fid_as_index_only(naturalearth_lowres, use_arrow):
    df = read_dataframe(
        naturalearth_lowres,
        columns=[],
        read_geometry=False,
        fid_as_index=True,
        use_arrow=use_arrow,
    )
    assert df is not None
    assert len(df) == 177
    assert len(df.columns) == 0


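# Usage sketch (illustrative, not part of the test suite): expose the GDAL
# feature IDs as the dataframe index. The "example.gpkg" path is an assumption.
def _example_read_fids(path="example.gpkg"):
    from pyogrio import read_dataframe

    # the FIDs become the index (named "fid"); the start value depends on the driver
    df = read_dataframe(path, fid_as_index=True)
    return df.index

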
def test_read_where(naturalearth_lowres_all_ext, use_arrow):
    # empty filter should return full set of records
    df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, where="")
    assert len(df) == 177

    # should return singular item
    df = read_dataframe(
        naturalearth_lowres_all_ext, use_arrow=use_arrow, where="iso_a3 = 'CAN'"
    )
    assert len(df) == 1
    assert df.iloc[0].iso_a3 == "CAN"

    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        where="iso_a3 IN ('CAN', 'USA', 'MEX')",
    )
    assert len(df) == 3
    assert len(set(df.iso_a3.unique()).difference(["CAN", "USA", "MEX"])) == 0

    # should return items within range
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        where="POP_EST >= 10000000 AND POP_EST < 100000000",
    )
    assert len(df) == 75
    assert df.pop_est.min() >= 10000000
    assert df.pop_est.max() < 100000000

    # should match no items
    df = read_dataframe(
        naturalearth_lowres_all_ext, use_arrow=use_arrow, where="ISO_A3 = 'INVALID'"
    )
    assert len(df) == 0


def test_read_where_invalid(request, naturalearth_lowres_all_ext, use_arrow):
    if use_arrow and naturalearth_lowres_all_ext.suffix == ".gpkg":
        # https://github.com/OSGeo/gdal/issues/8492
        request.node.add_marker(pytest.mark.xfail(reason="GDAL doesn't error for GPKG"))

    if naturalearth_lowres_all_ext.suffix == ".gpkg" and __gdal_version__ >= (3, 11, 0):
        with pytest.raises(DataLayerError, match="no such column"):
            read_dataframe(
                naturalearth_lowres_all_ext, use_arrow=use_arrow, where="invalid"
            )
    else:
        with pytest.raises(ValueError, match="Invalid SQL"):
            read_dataframe(
                naturalearth_lowres_all_ext, use_arrow=use_arrow, where="invalid"
            )


def test_read_where_ignored_field(naturalearth_lowres, use_arrow):
    # column included in where is not also included in list of columns, which means
    # GDAL will return no features
    # NOTE: this behavior is inconsistent across drivers so only shapefiles are
    # tested for this
    df = read_dataframe(
        naturalearth_lowres,
        where=""" "iso_a3" = 'CAN' """,
        columns=["name"],
        use_arrow=use_arrow,
    )

    assert len(df) == 0


@pytest.mark.parametrize("bbox", [(1,), (1, 2), (1, 2, 3)])
def test_read_bbox_invalid(naturalearth_lowres_all_ext, bbox, use_arrow):
    with pytest.raises(ValueError, match="Invalid bbox"):
        read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox)


@pytest.mark.parametrize(
    "bbox,expected",
    [
        ((0, 0, 0.00001, 0.00001), []),
        ((-85, 8, -80, 10), ["PAN", "CRI"]),
        ((-104, 54, -105, 55), ["CAN"]),
    ],
)
def test_read_bbox(naturalearth_lowres_all_ext, use_arrow, bbox, expected):
    if (
        use_arrow
        and __gdal_version__ < (3, 8, 0)
        and naturalearth_lowres_all_ext.suffix == ".gpkg"
    ):
        pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

    df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, bbox=bbox)

    assert np.array_equal(df.iso_a3, expected)


def test_read_bbox_sql(naturalearth_lowres_all_ext, use_arrow):
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        bbox=(-180, 50, -100, 90),
        sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')",
    )
    assert len(df) == 1
    assert np.array_equal(df.iso_a3, ["CAN"])


def test_read_bbox_where(naturalearth_lowres_all_ext, use_arrow):
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        bbox=(-180, 50, -100, 90),
        where="iso_a3 not in ('USA', 'RUS')",
    )
    assert len(df) == 1
    assert np.array_equal(df.iso_a3, ["CAN"])


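# Usage sketch (illustrative, not part of the test suite): spatial and attribute
# filters can be combined when reading. "countries.gpkg" and its "iso_a3" column
# are assumptions for the example.
def _example_read_filtered(path="countries.gpkg"):
    from pyogrio import read_dataframe

    return read_dataframe(
        path,
        bbox=(-180, 50, -100, 90),  # xmin, ymin, xmax, ymax in the layer's CRS
        where="iso_a3 NOT IN ('USA', 'RUS')",  # OGR SQL attribute filter
    )

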
@pytest.mark.parametrize(
    "mask",
    [
        {"type": "Point", "coordinates": [0, 0]},
        '{"type": "Point", "coordinates": [0, 0]}',
        "invalid",
    ],
)
def test_read_mask_invalid(naturalearth_lowres, use_arrow, mask):
    with pytest.raises(ValueError, match="'mask' parameter must be a Shapely geometry"):
        read_dataframe(naturalearth_lowres, use_arrow=use_arrow, mask=mask)


def test_read_bbox_mask_invalid(naturalearth_lowres, use_arrow):
    with pytest.raises(ValueError, match="cannot set both 'bbox' and 'mask'"):
        read_dataframe(
            naturalearth_lowres,
            use_arrow=use_arrow,
            bbox=(-85, 8, -80, 10),
            mask=shapely.Point(-105, 55),
        )


@pytest.mark.parametrize(
    "mask,expected",
    [
        (shapely.Point(-105, 55), ["CAN"]),
        (shapely.box(-85, 8, -80, 10), ["PAN", "CRI"]),
        (
            shapely.Polygon(
                (
                    [6.101929483362767, 50.97085041206964],
                    [5.773001596839322, 50.90661120482673],
                    [5.593156133704326, 50.642648747710325],
                    [6.059271089606312, 50.686051894002475],
                    [6.374064065737485, 50.851481340346965],
                    [6.101929483362767, 50.97085041206964],
                )
            ),
            ["DEU", "BEL", "NLD"],
        ),
        (
            shapely.GeometryCollection(
                [shapely.Point(-7.7, 53), shapely.box(-85, 8, -80, 10)]
            ),
            ["PAN", "CRI", "IRL"],
        ),
    ],
)
def test_read_mask(
    naturalearth_lowres_all_ext,
    use_arrow,
    mask,
    expected,
):
    if (
        use_arrow
        and __gdal_version__ < (3, 8, 0)
        and naturalearth_lowres_all_ext.suffix == ".gpkg"
    ):
        pytest.xfail(reason="GDAL bug: https://github.com/OSGeo/gdal/issues/8347")

    df = read_dataframe(naturalearth_lowres_all_ext, use_arrow=use_arrow, mask=mask)

    assert len(df) == len(expected)
    assert np.array_equal(df.iso_a3, expected)


def test_read_mask_sql(naturalearth_lowres_all_ext, use_arrow):
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        mask=shapely.box(-180, 50, -100, 90),
        sql="SELECT * from naturalearth_lowres where iso_a3 not in ('USA', 'RUS')",
    )
    assert len(df) == 1
    assert np.array_equal(df.iso_a3, ["CAN"])


def test_read_mask_where(naturalearth_lowres_all_ext, use_arrow):
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        use_arrow=use_arrow,
        mask=shapely.box(-180, 50, -100, 90),
        where="iso_a3 not in ('USA', 'RUS')",
    )
    assert len(df) == 1
    assert np.array_equal(df.iso_a3, ["CAN"])


@pytest.mark.parametrize("fids", [[1, 5, 10], np.array([1, 5, 10], dtype=np.int64)])
def test_read_fids(naturalearth_lowres_all_ext, fids, use_arrow):
    # ensure keyword is properly passed through
    df = read_dataframe(
        naturalearth_lowres_all_ext, fids=fids, fid_as_index=True, use_arrow=use_arrow
    )
    assert len(df) == 3
    assert np.array_equal(fids, df.index.values)


@requires_pyarrow_api
def test_read_fids_arrow_max_exception(naturalearth_lowres):
    # Maximum number at time of writing is 4997 for "OGRSQL". E.g. for SQLite based
    # formats like Geopackage, there is no limit.
    nb_fids = 4998
    fids = range(nb_fids)
    with pytest.raises(ValueError, match=f"error applying filter for {nb_fids} fids"):
        _ = read_dataframe(naturalearth_lowres, fids=fids, use_arrow=True)


@requires_pyarrow_api
@pytest.mark.skipif(
    __gdal_version__ >= (3, 8, 0), reason="GDAL >= 3.8.0 does not need to warn"
)
def test_read_fids_arrow_warning_old_gdal(naturalearth_lowres_all_ext):
    # A warning should be given for old GDAL versions, except for some file formats.
    if naturalearth_lowres_all_ext.suffix not in [".gpkg", ".geojson"]:
        handler = pytest.warns(
            UserWarning,
            match="Using 'fids' and 'use_arrow=True' with GDAL < 3.8 can be slow",
        )
    else:
        handler = contextlib.nullcontext()

    with handler:
        df = read_dataframe(naturalearth_lowres_all_ext, fids=[22], use_arrow=True)
    assert len(df) == 1


def test_read_fids_force_2d(tmp_path):
    filename = tmp_path / "test.gpkg"

    # create a GPKG with 3D point values
    expected = gp.GeoDataFrame(
        geometry=[Point(0, 0, 0), Point(1, 1, 0)], crs="EPSG:4326"
    )
    write_dataframe(expected, filename)

    df = read_dataframe(filename, fids=[1])
    assert_geodataframe_equal(df, expected.iloc[:1])

    df = read_dataframe(filename, force_2d=True, fids=[1])
    assert np.array_equal(
        df.geometry.values, shapely.force_2d(expected.iloc[:1].geometry.values)
    )


@pytest.mark.parametrize("skip_features", [10, 200])
def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_features):
    ext = naturalearth_lowres_all_ext.suffix
    expected = (
        read_dataframe(naturalearth_lowres_all_ext)
        .iloc[skip_features:]
        .reset_index(drop=True)
    )

    df = read_dataframe(
        naturalearth_lowres_all_ext, skip_features=skip_features, use_arrow=use_arrow
    )
    assert len(df) == len(expected)

    # Coordinates are not precisely equal when written to JSON
    # dtypes do not necessarily round-trip precisely through JSON
    is_json = ext in [".geojson", ".geojsonl"]
    # In .geojsonl the vertices are reordered, so normalize
    is_jsons = ext == ".geojsonl"

    if skip_features == 200 and not use_arrow:
        # result is an empty dataframe, so no proper dtype inference happens
        # for the numpy object dtype arrays
        df[["continent", "name", "iso_a3"]] = df[
            ["continent", "name", "iso_a3"]
        ].astype("str")

    assert_geodataframe_equal(
        df,
        expected,
        check_less_precise=is_json,
        check_index_type=False,
        check_dtype=not is_json,
        normalize=is_jsons,
    )


def test_read_negative_skip_features(naturalearth_lowres, use_arrow):
    with pytest.raises(ValueError, match="'skip_features' must be >= 0"):
        read_dataframe(naturalearth_lowres, skip_features=-1, use_arrow=use_arrow)


@pytest.mark.parametrize("skip_features", [0, 10, 200])
@pytest.mark.parametrize("max_features", [10, 100])
def test_read_max_features(
    naturalearth_lowres_all_ext, use_arrow, max_features, skip_features
):
    ext = naturalearth_lowres_all_ext.suffix
    expected = (
        read_dataframe(naturalearth_lowres_all_ext)
        .iloc[skip_features : skip_features + max_features]
        .reset_index(drop=True)
    )
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        skip_features=skip_features,
        max_features=max_features,
        use_arrow=use_arrow,
    )

    assert len(df) == len(expected)

    # Coordinates are not precisely equal when written to JSON
    # dtypes do not necessarily round-trip precisely through JSON
    is_json = ext in [".geojson", ".geojsonl"]
    # In .geojsonl the vertices are reordered, so normalize
    is_jsons = ext == ".geojsonl"

    if len(expected) == 0 and not use_arrow:
        # for pandas >= 3, the column has string dtype but when reading it as
        # empty result, it gets inferred as object dtype
        expected["continent"] = expected["continent"].astype("object")
        expected["name"] = expected["name"].astype("object")
        expected["iso_a3"] = expected["iso_a3"].astype("object")

    assert_geodataframe_equal(
        df,
        expected,
        check_less_precise=is_json,
        check_index_type=False,
        check_dtype=not is_json,
        normalize=is_jsons,
    )


def test_read_negative_max_features(naturalearth_lowres, use_arrow):
    with pytest.raises(ValueError, match="'max_features' must be >= 0"):
        read_dataframe(naturalearth_lowres, max_features=-1, use_arrow=use_arrow)


def test_read_non_existent_file(use_arrow):
    # ensure consistent error type / message from GDAL
    with pytest.raises(DataSourceError, match="No such file or directory"):
        read_dataframe("non-existent.shp", use_arrow=use_arrow)

    with pytest.raises(DataSourceError, match="does not exist in the file system"):
        read_dataframe("/vsizip/non-existent.zip", use_arrow=use_arrow)

    with pytest.raises(DataSourceError, match="does not exist in the file system"):
        read_dataframe("zip:///non-existent.zip", use_arrow=use_arrow)


def test_read_sql(naturalearth_lowres_all_ext, use_arrow):
    # The geometry column cannot be specified when using the
    # default OGRSQL dialect but is returned nonetheless, so 4 columns.
    sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df.columns) == 4
    assert len(df) == 177

    # Should return single row
    sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df) == 1
    assert len(df.columns) == 6
    assert df.iloc[0].iso_a3 == "CAN"

    sql = """SELECT *
               FROM naturalearth_lowres
              WHERE iso_a3 IN ('CAN', 'USA', 'MEX')"""
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df.columns) == 6
    assert len(df) == 3
    assert df.iso_a3.tolist() == ["CAN", "USA", "MEX"]

    sql = """SELECT *
               FROM naturalearth_lowres
              WHERE iso_a3 IN ('CAN', 'USA', 'MEX')
              ORDER BY name"""
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df.columns) == 6
    assert len(df) == 3
    assert df.iso_a3.tolist() == ["CAN", "MEX", "USA"]

    # Should return items within range.
    sql = """SELECT *
               FROM naturalearth_lowres
              WHERE POP_EST >= 10000000 AND POP_EST < 100000000"""
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df) == 75
    assert len(df.columns) == 6
    assert df.pop_est.min() >= 10000000
    assert df.pop_est.max() < 100000000

    # Should match no items.
    sql = "SELECT * FROM naturalearth_lowres WHERE ISO_A3 = 'INVALID'"
    df = read_dataframe(
        naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow
    )
    assert len(df) == 0


def test_read_sql_invalid(naturalearth_lowres_all_ext, use_arrow):
    if naturalearth_lowres_all_ext.suffix == ".gpkg":
        with pytest.raises(Exception, match="In ExecuteSQL().*"):
            read_dataframe(
                naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
            )
    else:
        with pytest.raises(Exception, match="SQL Expression Parsing Error"):
            read_dataframe(
                naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow
            )

    with pytest.raises(
        ValueError, match="'sql' parameter cannot be combined with 'layer'"
    ):
        read_dataframe(
            naturalearth_lowres_all_ext,
            sql="whatever",
            layer="invalid",
            use_arrow=use_arrow,
        )


def test_read_sql_columns_where(naturalearth_lowres_all_ext, use_arrow):
    sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        sql=sql,
        sql_dialect="OGRSQL",
        columns=["iso_a3_renamed", "name"],
        where="iso_a3_renamed IN ('CAN', 'USA', 'MEX')",
        use_arrow=use_arrow,
    )
    assert len(df.columns) == 3
    assert len(df) == 3
    assert df.iso_a3_renamed.tolist() == ["CAN", "USA", "MEX"]


def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext, use_arrow):
    sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres"
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        sql=sql,
        sql_dialect="OGRSQL",
        columns=["iso_a3_renamed", "name"],
        where="iso_a3_renamed IN ('CRI', 'PAN')",
        bbox=(-85, 8, -80, 10),
        use_arrow=use_arrow,
    )
    assert len(df.columns) == 3
    assert len(df) == 2
    assert df.iso_a3_renamed.tolist() == ["PAN", "CRI"]


def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow):
    sql = """SELECT *
               FROM naturalearth_lowres
              WHERE iso_a3 IN ('CAN', 'MEX', 'USA')
              ORDER BY name"""
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        sql=sql,
        skip_features=1,
        max_features=1,
        sql_dialect="OGRSQL",
        use_arrow=use_arrow,
    )
    assert len(df.columns) == 6
    assert len(df) == 1
    assert df.iso_a3.tolist() == ["MEX"]

    sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        sql=sql,
        max_features=3,
        sql_dialect="OGRSQL",
        use_arrow=use_arrow,
    )
    assert len(df) == 1

    sql = "SELECT * FROM naturalearth_lowres LIMIT 1"
    df = read_dataframe(
        naturalearth_lowres_all_ext,
        sql=sql,
        sql_dialect="OGRSQL",
        skip_features=1,
        use_arrow=use_arrow,
    )
    assert len(df) == 0


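# Usage sketch (illustrative, not part of the test suite): run a SQL query
# against a layer while reading. The file name and layer name are assumptions.
def _example_read_sql(path="countries.gpkg"):
    from pyogrio import read_dataframe

    return read_dataframe(
        path,
        sql="SELECT iso_a3, name FROM countries WHERE pop_est > 10000000",
        sql_dialect="OGRSQL",  # or "SQLITE" where the driver supports it
    )

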
@requires_gdal_geos
@pytest.mark.parametrize(
    "naturalearth_lowres",
    [ext for ext in ALL_EXTS if ext != ".gpkg"],
    indirect=["naturalearth_lowres"],
)
def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres, use_arrow):
    # Should return singular item
    sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
    df = read_dataframe(
        naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
    )
    assert len(df) == 1
    assert len(df.columns) == 6
    assert df.iloc[0].iso_a3 == "CAN"
    area_canada = df.iloc[0].geometry.area

    # Use spatialite function
    sql = """SELECT ST_Buffer(geometry, 5) AS geometry, name, pop_est, iso_a3
               FROM naturalearth_lowres
              WHERE ISO_A3 = 'CAN'"""
    df = read_dataframe(
        naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow
    )
    assert len(df) == 1
    assert len(df.columns) == 4
    assert df.iloc[0].geometry.area > area_canada


@requires_gdal_geos
@pytest.mark.parametrize(
    "naturalearth_lowres", [".gpkg"], indirect=["naturalearth_lowres"]
)
def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
    # "INDIRECT_SQL" prohibits GDAL from passing the SQL statement to sqlite.
    # Because the statement is processed within GDAL it is possible to use
    # spatialite functions even if sqlite isn't built with spatialite support.
    sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'"
    df = read_dataframe(
        naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
    )
    assert len(df) == 1
    assert len(df.columns) == 6
    assert df.iloc[0].iso_a3 == "CAN"
    area_canada = df.iloc[0].geometry.area

    # Use spatialite function
    sql = """SELECT ST_Buffer(geom, 5) AS geometry, name, pop_est, iso_a3
               FROM naturalearth_lowres
              WHERE ISO_A3 = 'CAN'"""
    df = read_dataframe(
        naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow
    )
    assert len(df) == 1
    assert len(df.columns) == 4
    assert df.iloc[0].geometry.area > area_canada


@pytest.mark.parametrize(
    "encoding, arrow",
    [
        ("utf-8", False),
        pytest.param("utf-8", True, marks=requires_arrow_write_api),
        ("cp1252", False),
        (None, False),
    ],
)
def test_write_csv_encoding(tmp_path, encoding, arrow):
    """Test if write_dataframe uses the default encoding correctly.

    Arrow only supports utf-8 encoding.
    """
    # Write csv test file. Depending on the os this will be written in a different
    # encoding: for linux and macos this is utf-8, for windows it is cp1252.
    csv_path = tmp_path / "test.csv"

    with open(csv_path, "w", encoding=encoding) as csv:
        csv.write("näme,city\n")
        csv.write("Wilhelm Röntgen,Zürich\n")

    # Write csv test file with the same data using write_dataframe. It should use the
    # same encoding as above.
    df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
    csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)

    # Check if the text files written both ways can be read again and give same result.
    with open(csv_path, encoding=encoding) as csv:
        csv_str = csv.read()
    with open(csv_pyogrio_path, encoding=encoding) as csv_pyogrio:
        csv_pyogrio_str = csv_pyogrio.read()
    assert csv_str == csv_pyogrio_str

    # Check if the files are binary identical, to be 100% sure they were written with
    # the same encoding.
    with open(csv_path, "rb") as csv:
        csv_bytes = csv.read()
    with open(csv_pyogrio_path, "rb") as csv_pyogrio:
        csv_pyogrio_bytes = csv_pyogrio.read()
    assert csv_bytes == csv_pyogrio_bytes


@pytest.mark.parametrize(
    "ext, fid_column, fid_param_value",
    [
        (".gpkg", "fid", None),
        (".gpkg", "FID", None),
        (".sqlite", "ogc_fid", None),
        (".gpkg", "fid_custom", "fid_custom"),
        (".gpkg", "FID_custom", "fid_custom"),
        (".sqlite", "ogc_fid_custom", "ogc_fid_custom"),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow):
    """Test to specify FIDs to save when writing to a file.

    Saving custom FIDs is only supported for formats that actually store the FID, like
    e.g. GPKG and SQLite. The fid_column name check is case-insensitive.

    Typically, GDAL supports using a custom FID column for these file formats via a
    `FID` layer creation option, which is also tested here. If `fid_param_value` is
    specified (not None), an `fid` parameter is passed to `write_dataframe`, causing
    GDAL to use the column name specified for the FID.
    """
    input_gdf = gp.GeoDataFrame(
        {fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326"
    )
    kwargs = {}
    if fid_param_value is not None:
        kwargs["fid"] = fid_param_value
    path = tmp_path / f"test{ext}"

    write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs)

    assert path.exists()
    output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow)
    output_gdf = output_gdf.reset_index()

    # pyogrio always sets "fid" as index name with `fid_as_index`
    expected_gdf = input_gdf.rename(columns={fid_column: "fid"})
    assert_geodataframe_equal(output_gdf, expected_gdf)


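# Usage sketch (illustrative, not part of the test suite): store explicit FIDs in
# a format that persists them (e.g. GPKG), then read them back as the index.
# The file name is an assumption.
def _example_roundtrip_fids(tmp_dir):
    import geopandas as gp
    import shapely
    from pyogrio import read_dataframe, write_dataframe

    gdf = gp.GeoDataFrame(
        {"fid": [5]}, geometry=[shapely.Point(0, 0)], crs="EPSG:4326"
    )
    path = f"{tmp_dir}/custom_fids.gpkg"
    write_dataframe(gdf, path)
    return read_dataframe(path, fid_as_index=True)

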
@pytest.mark.parametrize("ext", ALL_EXTS)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):
    input_gdf = read_dataframe(naturalearth_lowres)
    output_path = tmp_path / f"test{ext}"

    if ext == ".fgb":
        # For .fgb, spatial_index=False to avoid the rows being reordered
        write_dataframe(
            input_gdf, output_path, use_arrow=use_arrow, spatial_index=False
        )
    else:
        write_dataframe(input_gdf, output_path, use_arrow=use_arrow)

    assert output_path.exists()
    result_gdf = read_dataframe(output_path)

    geometry_types = result_gdf.geometry.type.unique()
    if DRIVERS[ext] in DRIVERS_NO_MIXED_SINGLE_MULTI:
        assert list(geometry_types) == ["MultiPolygon"]
    else:
        assert set(geometry_types) == {"MultiPolygon", "Polygon"}

    # Coordinates are not precisely equal when written to JSON
    # dtypes do not necessarily round-trip precisely through JSON
    is_json = ext in [".geojson", ".geojsonl"]
    # In .geojsonl the vertices are reordered, so normalize
    is_jsons = ext == ".geojsonl"

    assert_geodataframe_equal(
        result_gdf,
        input_gdf,
        check_less_precise=is_json,
        check_index_type=False,
        check_dtype=not is_json,
        normalize=is_jsons,
    )


@pytest.mark.filterwarnings("ignore:.*No SRS set on layer.*")
@pytest.mark.parametrize("write_geodf", [True, False])
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS + [".xlsx"] if ext != ".fgb"])
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_no_geom(
    request, tmp_path, naturalearth_lowres, write_geodf, ext, use_arrow
):
    """Test writing a (geo)dataframe without a geometry column.

    FlatGeobuf (.fgb) doesn't seem to support this, and just writes an empty file.
    """
    # Prepare test data
    input_df = read_dataframe(naturalearth_lowres, read_geometry=False)
    if write_geodf:
        input_df = gp.GeoDataFrame(input_df)

    output_path = tmp_path / f"test{ext}"

    # A shapefile without geometry column results in only a .dbf file.
    if ext == ".shp":
        output_path = output_path.with_suffix(".dbf")

    # Determine driver
    driver = DRIVERS[ext] if ext != ".xlsx" else "XLSX"

    write_dataframe(input_df, output_path, use_arrow=use_arrow, driver=driver)

    assert output_path.exists()
    result_df = read_dataframe(output_path)

    assert isinstance(result_df, pd.DataFrame)

    # some dtypes do not round-trip precisely through these file types
    check_dtype = ext not in [".geojson", ".geojsonl", ".xlsx"]

    if ext in [".gpkg", ".shp", ".xlsx"]:
        # These file types return a DataFrame when read.
        assert not isinstance(result_df, gp.GeoDataFrame)
        if isinstance(input_df, gp.GeoDataFrame):
            input_df = pd.DataFrame(input_df)

        pd.testing.assert_frame_equal(
            result_df, input_df, check_index_type=False, check_dtype=check_dtype
        )
    else:
        # These file types return a GeoDataFrame with None Geometries when read.
        input_none_geom_gdf = gp.GeoDataFrame(
            input_df, geometry=np.repeat(None, len(input_df)), crs=4326
        )
        assert_geodataframe_equal(
            result_df,
            input_none_geom_gdf,
            check_index_type=False,
            check_dtype=check_dtype,
        )


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow):
    # dataframe writing ignores the index
    input_gdf = read_dataframe(naturalearth_lowres)
    input_gdf = input_gdf.set_index("iso_a3")

    output_path = tmp_path / "test.shp"
    write_dataframe(input_gdf, output_path, use_arrow=use_arrow)

    result_gdf = read_dataframe(output_path)
    assert isinstance(result_gdf.index, pd.RangeIndex)
    assert_geodataframe_equal(result_gdf, input_gdf.reset_index(drop=True))


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
@pytest.mark.parametrize(
    "columns, dtype",
    [
        ([], None),
        (["col_int"], np.int64),
        (["col_float"], np.float64),
        (["col_object"], object),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
    """Test writing dataframe with no rows.

    With use_arrow, object type columns with no rows are converted to null type columns
    by pyarrow, but null columns are not supported by GDAL. Added to test fix for #513.
    """
    expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326)
    filename = tmp_path / f"test{ext}"
    write_dataframe(expected, filename, use_arrow=use_arrow)

    assert filename.exists()
    df = read_dataframe(filename, use_arrow=use_arrow)

    # Check result
    # For older pandas versions, the index is created as Object dtype but read as
    # RangeIndex, so don't check the index dtype in that case.
    check_index_type = True if PANDAS_GE_20 else False
    # with pandas 3+ and reading through arrow, we preserve the string dtype
    # (no proper dtype inference happens for the empty numpy object dtype arrays)
    if use_arrow and dtype is object:
        expected["col_object"] = expected["col_object"].astype("str")
    assert_geodataframe_equal(df, expected, check_index_type=check_index_type)


def test_write_empty_geometry(tmp_path):
    expected = gp.GeoDataFrame({"x": [0]}, geometry=from_wkt(["POINT EMPTY"]), crs=4326)
    filename = tmp_path / "test.gpkg"

    # Check that no warning is raised with GeoSeries.notna()
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        if not HAS_PYPROJ:
            warnings.filterwarnings("ignore", message="'crs' was not provided.")
        write_dataframe(expected, filename)
    assert filename.exists()

    # Xref GH-436: round-tripping possible with GPKG but not others
    df = read_dataframe(filename)
    assert_geodataframe_equal(df, expected)


@pytest.mark.requires_arrow_write_api
def test_write_None_string_column(tmp_path, use_arrow):
    """Test pandas object columns with all None values.

    With use_arrow, such columns are converted to null type columns by pyarrow, but null
    columns are not supported by GDAL. Added to test fix for #513.
    """
    gdf = gp.GeoDataFrame({"object_col": [None]}, geometry=[Point(0, 0)], crs=4326)
    filename = tmp_path / "test.gpkg"

    write_dataframe(gdf, filename, use_arrow=use_arrow)
    assert filename.exists()

    result_gdf = read_dataframe(filename, use_arrow=use_arrow)
    if (
        PANDAS_GE_30 or (PANDAS_GE_23 and pd.options.future.infer_string)
    ) and use_arrow:
        assert result_gdf.object_col.dtype == "str"
        gdf["object_col"] = gdf["object_col"].astype("str")
    else:
        assert result_gdf.object_col.dtype == object
    assert_geodataframe_equal(result_gdf, gdf)


@pytest.mark.parametrize("ext", [".geojsonl", ".geojsons"])
@pytest.mark.requires_arrow_write_api
def test_write_read_empty_dataframe_unsupported(tmp_path, ext, use_arrow):
    # Writing empty dataframe to .geojsons or .geojsonl results logically in a 0 byte
    # file, but gdal isn't able to read those again at the time of writing.
    # Issue logged here: https://github.com/geopandas/pyogrio/issues/94
    expected = gp.GeoDataFrame(geometry=[], crs=4326)

    filename = tmp_path / f"test{ext}"
    write_dataframe(expected, filename, use_arrow=use_arrow)

    assert filename.exists()
    with pytest.raises(
        Exception, match=".* not recognized as( being in)? a supported file format."
    ):
        _ = read_dataframe(filename, use_arrow=use_arrow)


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres, use_arrow):
    input_gdf = read_dataframe(naturalearth_lowres)
    filename = tmp_path / "test.gpkg"

    write_dataframe(
        input_gdf,
        filename,
        layer="first",
        promote_to_multi=True,
        use_arrow=use_arrow,
    )

    assert filename.exists()
    assert np.array_equal(list_layers(filename), [["first", "MultiPolygon"]])

    write_dataframe(
        input_gdf,
        filename,
        layer="second",
        promote_to_multi=True,
        use_arrow=use_arrow,
    )
    assert np.array_equal(
        list_layers(filename),
        [["first", "MultiPolygon"], ["second", "MultiPolygon"]],
    )


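# Usage sketch (illustrative, not part of the test suite): a GeoPackage can hold
# several layers; write each with layer= and inspect them with list_layers.
# The file name and layer names are assumptions.
def _example_multiple_layers(gdf, path="layers.gpkg"):
    from pyogrio import list_layers, write_dataframe

    write_dataframe(gdf, path, layer="raw")
    write_dataframe(gdf, path, layer="cleaned")
    return list_layers(path)  # array of [layer name, geometry type] pairs

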
@pytest.mark.parametrize("ext", ALL_EXTS)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_append(request, tmp_path, naturalearth_lowres, ext, use_arrow):
    if use_arrow and ext.startswith(".geojson"):
        # Bug in GDAL when appending int64 to GeoJSON
        # (https://github.com/OSGeo/gdal/issues/9792)
        request.node.add_marker(
            pytest.mark.xfail(reason="Bugs with append when writing Arrow to GeoJSON")
        )

    input_gdf = read_dataframe(naturalearth_lowres)
    filename = tmp_path / f"test{ext}"

    write_dataframe(input_gdf, filename, use_arrow=use_arrow)

    assert filename.exists()
    assert len(read_dataframe(filename)) == 177

    write_dataframe(input_gdf, filename, use_arrow=use_arrow, append=True)
    assert len(read_dataframe(filename)) == 354


@pytest.mark.parametrize("spatial_index", [False, True])
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_gdal_options(
    tmp_path, naturalearth_lowres, spatial_index, use_arrow
):
    df = read_dataframe(naturalearth_lowres)

    outfilename1 = tmp_path / "test1.shp"
    write_dataframe(
        df,
        outfilename1,
        use_arrow=use_arrow,
        SPATIAL_INDEX="YES" if spatial_index else "NO",
    )
    assert outfilename1.exists() is True
    index_filename1 = tmp_path / "test1.qix"
    assert index_filename1.exists() is spatial_index

    # using explicit layer_options instead
    outfilename2 = tmp_path / "test2.shp"
    write_dataframe(
        df,
        outfilename2,
        use_arrow=use_arrow,
        layer_options={"spatial_index": spatial_index},
    )
    assert outfilename2.exists() is True
    index_filename2 = tmp_path / "test2.qix"
    assert index_filename2.exists() is spatial_index


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_gdal_options_unknown(tmp_path, naturalearth_lowres, use_arrow):
    df = read_dataframe(naturalearth_lowres)

    # geojson has no spatial index, so passing keyword should raise
    outfilename = tmp_path / "test.geojson"
    with pytest.raises(ValueError, match="unrecognized option 'SPATIAL_INDEX'"):
        write_dataframe(df, outfilename, use_arrow=use_arrow, spatial_index=True)


def _get_gpkg_table_names(path):
    import sqlite3

    con = sqlite3.connect(path)
    cursor = con.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    result = cursor.fetchall()
    return [res[0] for res in result]


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_gdal_options_dataset(tmp_path, naturalearth_lowres, use_arrow):
    df = read_dataframe(naturalearth_lowres)

    test_default_filename = tmp_path / "test_default.gpkg"
    write_dataframe(df, test_default_filename, use_arrow=use_arrow)
    assert "gpkg_ogr_contents" in _get_gpkg_table_names(test_default_filename)

    test_no_contents_filename = tmp_path / "test_no_contents.gpkg"
    write_dataframe(
        df, test_no_contents_filename, use_arrow=use_arrow, ADD_GPKG_OGR_CONTENTS="NO"
    )
    assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename)

    test_no_contents_filename2 = tmp_path / "test_no_contents2.gpkg"
    write_dataframe(
        df,
        test_no_contents_filename2,
        use_arrow=use_arrow,
        dataset_options={"add_gpkg_ogr_contents": False},
    )
    assert "gpkg_ogr_contents" not in _get_gpkg_table_names(test_no_contents_filename2)


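# Usage sketch (illustrative, not part of the test suite): driver-specific
# creation options can be passed either as plain keyword arguments or explicitly
# via layer_options / dataset_options. The file names are assumptions.
def _example_creation_options(gdf, tmp_dir):
    from pyogrio import write_dataframe

    # shapefile layer creation option, passed as a plain keyword
    write_dataframe(gdf, f"{tmp_dir}/with_index.shp", SPATIAL_INDEX="YES")

    # equivalent, but spelled out explicitly
    write_dataframe(
        gdf,
        f"{tmp_dir}/no_index.shp",
        layer_options={"spatial_index": False},
    )

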
@pytest.mark.parametrize(
    "ext, promote_to_multi, expected_geometry_types, expected_geometry_type",
    [
        (".fgb", None, ["MultiPolygon"], "MultiPolygon"),
        (".fgb", True, ["MultiPolygon"], "MultiPolygon"),
        (".fgb", False, ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", None, ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", True, ["MultiPolygon"], "MultiPolygon"),
        (".geojson", False, ["MultiPolygon", "Polygon"], "Unknown"),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_promote_to_multi(
    tmp_path,
    naturalearth_lowres,
    ext,
    promote_to_multi,
    expected_geometry_types,
    expected_geometry_type,
    use_arrow,
):
    input_gdf = read_dataframe(naturalearth_lowres)

    output_path = tmp_path / f"test_promote{ext}"
    write_dataframe(
        input_gdf, output_path, use_arrow=use_arrow, promote_to_multi=promote_to_multi
    )

    assert output_path.exists()
    output_gdf = read_dataframe(output_path)
    geometry_types = sorted(output_gdf.geometry.type.unique())
    assert geometry_types == expected_geometry_types
    assert read_info(output_path)["geometry_type"] == expected_geometry_type


@pytest.mark.parametrize(
    "ext, promote_to_multi, geometry_type, "
    "expected_geometry_types, expected_geometry_type",
    [
        (".fgb", None, "Unknown", ["MultiPolygon"], "Unknown"),
        (".geojson", False, "Unknown", ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", None, "Unknown", ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", None, "Polygon", ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", None, "MultiPolygon", ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", None, "Point", ["MultiPolygon", "Polygon"], "Unknown"),
        (".geojson", True, "Unknown", ["MultiPolygon"], "MultiPolygon"),
        (".gpkg", False, "Unknown", ["MultiPolygon", "Polygon"], "Unknown"),
        (".gpkg", None, "Unknown", ["MultiPolygon"], "Unknown"),
        (".gpkg", None, "Polygon", ["MultiPolygon"], "Polygon"),
        (".gpkg", None, "MultiPolygon", ["MultiPolygon"], "MultiPolygon"),
        (".gpkg", None, "Point", ["MultiPolygon"], "Point"),
        (".gpkg", True, "Unknown", ["MultiPolygon"], "Unknown"),
        (".shp", False, "Unknown", ["MultiPolygon", "Polygon"], "Polygon"),
        (".shp", None, "Unknown", ["MultiPolygon", "Polygon"], "Polygon"),
        (".shp", None, "Polygon", ["MultiPolygon", "Polygon"], "Polygon"),
        (".shp", None, "MultiPolygon", ["MultiPolygon", "Polygon"], "Polygon"),
        (".shp", True, "Unknown", ["MultiPolygon", "Polygon"], "Polygon"),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_promote_to_multi_layer_geom_type(
    tmp_path,
    naturalearth_lowres,
    ext,
    promote_to_multi,
    geometry_type,
    expected_geometry_types,
    expected_geometry_type,
    use_arrow,
):
    input_gdf = read_dataframe(naturalearth_lowres)

    output_path = tmp_path / f"test_promote_layer_geom_type{ext}"

    if ext == ".gpkg" and geometry_type in ("Polygon", "Point"):
        ctx = pytest.warns(
            RuntimeWarning, match="A geometry of type MULTIPOLYGON is inserted"
        )
    else:
        ctx = contextlib.nullcontext()

    with ctx:
        write_dataframe(
            input_gdf,
            output_path,
            use_arrow=use_arrow,
            promote_to_multi=promote_to_multi,
            geometry_type=geometry_type,
        )

    assert output_path.exists()
    output_gdf = read_dataframe(output_path)
    geometry_types = sorted(output_gdf.geometry.type.unique())
    assert geometry_types == expected_geometry_types
    assert read_info(output_path)["geometry_type"] == expected_geometry_type


@pytest.mark.parametrize(
    "ext, promote_to_multi, geometry_type, expected_raises_match",
    [
        (".fgb", False, "MultiPolygon", "Mismatched geometry type"),
        (".fgb", False, "Polygon", "Mismatched geometry type"),
        (".fgb", None, "Point", "Mismatched geometry type"),
        (".fgb", None, "Polygon", "Mismatched geometry type"),
        (
            ".shp",
            None,
            "Point",
            "Could not add feature to layer at index|Error while writing batch to OGR "
            "layer",
        ),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_promote_to_multi_layer_geom_type_invalid(
    tmp_path,
    naturalearth_lowres,
    ext,
    promote_to_multi,
    geometry_type,
    expected_raises_match,
    use_arrow,
):
    input_gdf = read_dataframe(naturalearth_lowres)

    output_path = tmp_path / f"test{ext}"
    with pytest.raises((FeatureError, DataLayerError), match=expected_raises_match):
        write_dataframe(
            input_gdf,
            output_path,
            use_arrow=use_arrow,
            promote_to_multi=promote_to_multi,
            geometry_type=geometry_type,
        )


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_layer_geom_type_invalid(
    tmp_path, naturalearth_lowres, use_arrow
):
    df = read_dataframe(naturalearth_lowres)

    filename = tmp_path / "test.geojson"
    with pytest.raises(
        GeometryError, match="Geometry type is not supported: NotSupported"
    ):
        write_dataframe(df, filename, use_arrow=use_arrow, geometry_type="NotSupported")


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".shp"])
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_truly_mixed(tmp_path, ext, use_arrow):
    geometry = [
        shapely.Point(0, 0),
        shapely.LineString([(0, 0), (1, 1)]),
        shapely.box(0, 0, 1, 1),
        shapely.MultiPoint([shapely.Point(1, 1), shapely.Point(2, 2)]),
        shapely.MultiLineString(
            [shapely.LineString([(1, 1), (2, 2)]), shapely.LineString([(2, 2), (3, 3)])]
        ),
        shapely.MultiPolygon([shapely.box(1, 1, 2, 2), shapely.box(2, 2, 3, 3)]),
    ]

    df = gp.GeoDataFrame(
        {"col": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}, geometry=geometry, crs="EPSG:4326"
    )

    filename = tmp_path / f"test{ext}"

    if ext == ".fgb":
        # For .fgb, spatial_index=False to avoid the rows being reordered
        write_dataframe(df, filename, use_arrow=use_arrow, spatial_index=False)
    else:
        write_dataframe(df, filename, use_arrow=use_arrow)

    # Drivers that support mixed geometries will default to "Unknown" geometry type
    assert read_info(filename)["geometry_type"] == "Unknown"
    result = read_dataframe(filename)
    assert_geodataframe_equal(result, df, check_geom_type=True)


@pytest.mark.requires_arrow_write_api
def test_write_dataframe_truly_mixed_invalid(tmp_path, use_arrow):
    # Shapefile doesn't support generic "Geometry" / "Unknown" type
    # for mixed geometries

    df = gp.GeoDataFrame(
        {"col": [1.0, 2.0, 3.0]},
        geometry=[
            shapely.Point(0, 0),
            shapely.LineString([(0, 0), (1, 1)]),
            shapely.box(0, 0, 1, 1),
        ],
        crs="EPSG:4326",
    )

    # ensure error message from GDAL is included
    msg = (
        "Could not add feature to layer at index 1: Attempt to "
        r"write non-point \(LINESTRING\) geometry to point shapefile."
        # DataLayerError when using Arrow
        "|Error while writing batch to OGR layer: Attempt to "
        r"write non-point \(LINESTRING\) geometry to point shapefile."
    )
    with pytest.raises((FeatureError, DataLayerError), match=msg):
        write_dataframe(df, tmp_path / "test.shp", use_arrow=use_arrow)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".fgb"])
@pytest.mark.parametrize(
    "geoms",
    [
        [None, shapely.Point(1, 1)],
        [shapely.Point(1, 1), None],
        [None, shapely.Point(1, 1, 2)],
        [None, None],
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe_infer_geometry_with_nulls(tmp_path, geoms, ext, use_arrow):
    filename = tmp_path / f"test{ext}"

    df = gp.GeoDataFrame({"col": [1.0, 2.0]}, geometry=geoms, crs="EPSG:4326")
    write_dataframe(df, filename, use_arrow=use_arrow)
    result = read_dataframe(filename)
    assert_geodataframe_equal(result, df)


@pytest.mark.filterwarnings(
|
|
2306
|
+
"ignore: You will likely lose important projection information"
|
|
2307
|
+
)
|
|
2308
|
+
@pytest.mark.requires_arrow_write_api
|
|
2309
|
+
@requires_pyproj
|
|
2310
|
+
def test_custom_crs_io(tmp_path, naturalearth_lowres_all_ext, use_arrow):
|
|
2311
|
+
df = read_dataframe(naturalearth_lowres_all_ext)
|
|
2312
|
+
# project Belgium to a custom Albers Equal Area projection
|
|
2313
|
+
expected = (
|
|
2314
|
+
df.loc[df.name == "Belgium"]
|
|
2315
|
+
.reset_index(drop=True)
|
|
2316
|
+
.to_crs("+proj=aea +lat_1=49.5 +lat_2=51.5 +lon_0=4.3")
|
|
2317
|
+
)
|
|
2318
|
+
filename = tmp_path / "test.shp"
|
|
2319
|
+
    write_dataframe(expected, filename, use_arrow=use_arrow)

    assert filename.exists()

    df = read_dataframe(filename)

    crs = df.crs.to_dict()
    assert crs["lat_1"] == 49.5
    assert crs["lat_2"] == 51.5
    assert crs["lon_0"] == 4.3
    assert df.crs.equals(expected.crs)


@pytest.mark.parametrize("ext", [".gpkg.zip", ".shp.zip", ".shz"])
@pytest.mark.requires_arrow_write_api
def test_write_read_zipped_ext(tmp_path, naturalearth_lowres, ext, use_arrow):
    """Run a basic read and write test on some extra (zipped) extensions."""
    if ext == ".gpkg.zip" and not GDAL_GE_37:
        pytest.skip(".gpkg.zip support requires GDAL >= 3.7")

    input_gdf = read_dataframe(naturalearth_lowres)
    output_path = tmp_path / f"test{ext}"

    write_dataframe(input_gdf, output_path, use_arrow=use_arrow)

    assert output_path.exists()
    result_gdf = read_dataframe(output_path)

    geometry_types = result_gdf.geometry.type.unique()
    if DRIVERS[ext] in DRIVERS_NO_MIXED_SINGLE_MULTI:
        assert list(geometry_types) == ["MultiPolygon"]
    else:
        assert set(geometry_types) == {"MultiPolygon", "Polygon"}

    assert_geodataframe_equal(result_gdf, input_gdf, check_index_type=False)


def test_write_read_mixed_column_values(tmp_path):
    # use_arrow=True is tested separately below
    mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
    geoms = [shapely.Point(0, 0) for _ in mixed_values]
    test_gdf = gp.GeoDataFrame(
        {"geometry": geoms, "mixed": mixed_values}, crs="epsg:31370"
    )
    output_path = tmp_path / "test_write_mixed_column.gpkg"
    write_dataframe(test_gdf, output_path)
    output_gdf = read_dataframe(output_path)
    assert len(test_gdf) == len(output_gdf)
    # mixed values as object dtype are currently written as strings
    # (but preserving nulls)
    expected = pd.Series(
        [str(value) if value not in (None, np.nan) else None for value in mixed_values],
        name="mixed",
    )
    assert_series_equal(output_gdf["mixed"], expected)


@requires_arrow_write_api
def test_write_read_mixed_column_values_arrow(tmp_path):
    # Arrow cannot represent a column of mixed types
    mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
    geoms = [shapely.Point(0, 0) for _ in mixed_values]
    test_gdf = gp.GeoDataFrame(
        {"geometry": geoms, "mixed": mixed_values}, crs="epsg:31370"
    )
    output_path = tmp_path / "test_write_mixed_column.gpkg"
    with pytest.raises(TypeError, match=".*Conversion failed for column"):
        write_dataframe(test_gdf, output_path, use_arrow=True)


@pytest.mark.requires_arrow_write_api
def test_write_read_null(tmp_path, use_arrow):
    output_path = tmp_path / "test_write_nan.gpkg"
    geom = shapely.Point(0, 0)
    test_data = {
        "geometry": [geom, geom, geom],
        "float64": [1.0, None, np.nan],
        "object_str": ["test", None, np.nan],
    }
    test_gdf = gp.GeoDataFrame(test_data, crs="epsg:31370")
    write_dataframe(test_gdf, output_path, use_arrow=use_arrow)
    result_gdf = read_dataframe(output_path)
    assert len(test_gdf) == len(result_gdf)
    assert result_gdf["float64"][0] == 1.0
    assert pd.isna(result_gdf["float64"][1])
    assert pd.isna(result_gdf["float64"][2])
    assert result_gdf["object_str"][0] == "test"
    assert pd.isna(result_gdf["object_str"][1])
    assert pd.isna(result_gdf["object_str"][2])


@pytest.mark.requires_arrow_write_api
def test_write_read_vsimem(naturalearth_lowres_vsi, use_arrow):
    path, _ = naturalearth_lowres_vsi
    mem_path = f"/vsimem/{path.name}"

    input = read_dataframe(path, use_arrow=use_arrow)
    assert len(input) == 177

    try:
        write_dataframe(input, mem_path, use_arrow=use_arrow)
        result = read_dataframe(mem_path, use_arrow=use_arrow)
        assert len(result) == 177
    finally:
        vsi_unlink(mem_path)


@pytest.mark.parametrize(
    "wkt,geom_types",
    [
        ("Point Z (0 0 0)", ["2.5D Point", "Point Z"]),
        ("LineString Z (0 0 0, 1 1 0)", ["2.5D LineString", "LineString Z"]),
        ("Polygon Z ((0 0 0, 0 1 0, 1 1 0, 0 0 0))", ["2.5D Polygon", "Polygon Z"]),
        ("MultiPoint Z (0 0 0, 1 1 0)", ["2.5D MultiPoint", "MultiPoint Z"]),
        (
            "MultiLineString Z ((0 0 0, 1 1 0), (2 2 2, 3 3 2))",
            ["2.5D MultiLineString", "MultiLineString Z"],
        ),
        (
            "MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))",  # noqa: E501
            ["2.5D MultiPolygon", "MultiPolygon Z"],
        ),
        (
            "GeometryCollection Z (Point Z (0 0 0))",
            ["2.5D GeometryCollection", "GeometryCollection Z"],
        ),
    ],
)
@pytest.mark.requires_arrow_write_api
def test_write_geometry_z_types(tmp_path, wkt, geom_types, use_arrow):
    filename = tmp_path / "test.fgb"
    gdf = gp.GeoDataFrame(geometry=from_wkt([wkt]), crs="EPSG:4326")
    for geom_type in geom_types:
        write_dataframe(gdf, filename, use_arrow=use_arrow, geometry_type=geom_type)
        df = read_dataframe(filename)
        assert_geodataframe_equal(df, gdf)


@pytest.mark.parametrize("ext", ALL_EXTS)
@pytest.mark.parametrize(
    "test_descr, exp_geometry_type, mixed_dimensions, wkt",
    [
        ("1 Point Z", "Point Z", False, ["Point Z (0 0 0)"]),
("1 LineString Z", "LineString Z", False, ["LineString Z (0 0 0, 1 1 0)"]),
|
|
2463
|
+
(
|
|
2464
|
+
"1 Polygon Z",
|
|
2465
|
+
"Polygon Z",
|
|
2466
|
+
False,
|
|
2467
|
+
["Polygon Z ((0 0 0, 0 1 0, 1 1 0, 0 0 0))"],
|
|
2468
|
+
),
|
|
2469
|
+
("1 MultiPoint Z", "MultiPoint Z", False, ["MultiPoint Z (0 0 0, 1 1 0)"]),
|
|
2470
|
+
(
|
|
2471
|
+
"1 MultiLineString Z",
|
|
2472
|
+
"MultiLineString Z",
|
|
2473
|
+
False,
|
|
2474
|
+
["MultiLineString Z ((0 0 0, 1 1 0), (2 2 2, 3 3 2))"],
|
|
2475
|
+
),
|
|
2476
|
+
(
|
|
2477
|
+
"1 MultiLinePolygon Z",
|
|
2478
|
+
"MultiPolygon Z",
|
|
2479
|
+
False,
|
|
2480
|
+
[
|
|
2481
|
+
"MultiPolygon Z (((0 0 0, 0 1 0, 1 1 0, 0 0 0)), ((1 1 1, 1 2 1, 2 2 1, 1 1 1)))" # noqa: E501
|
|
2482
|
+
],
|
|
2483
|
+
),
|
|
2484
|
+
(
|
|
2485
|
+
"1 GeometryCollection Z",
|
|
2486
|
+
"GeometryCollection Z",
|
|
2487
|
+
False,
|
|
2488
|
+
["GeometryCollection Z (Point Z (0 0 0))"],
|
|
2489
|
+
),
|
|
2490
|
+
("Point Z + Point", "Point Z", True, ["Point Z (0 0 0)", "Point (0 0)"]),
|
|
2491
|
+
("Point Z + None", "Point Z", False, ["Point Z (0 0 0)", None]),
|
|
2492
|
+
(
|
|
2493
|
+
"Point Z + LineString Z",
|
|
2494
|
+
"Unknown",
|
|
2495
|
+
False,
|
|
2496
|
+
["LineString Z (0 0 0, 1 1 0)", "Point Z (0 0 0)"],
|
|
2497
|
+
),
|
|
2498
|
+
(
|
|
2499
|
+
"Point Z + LineString",
|
|
2500
|
+
"Unknown",
|
|
2501
|
+
True,
|
|
2502
|
+
["LineString (0 0, 1 1)", "Point Z (0 0 0)"],
|
|
2503
|
+
),
|
|
2504
|
+
],
|
|
2505
|
+
)
|
|
2506
|
+
@pytest.mark.requires_arrow_write_api
|
|
2507
|
+
def test_write_geometry_z_types_auto(
|
|
2508
|
+
tmp_path, ext, test_descr, exp_geometry_type, mixed_dimensions, wkt, use_arrow
|
|
2509
|
+
):
|
|
2510
|
+
    # Shapefile has some different behaviour than other file types
    if ext == ".shp":
        if exp_geometry_type in ("GeometryCollection Z", "Unknown"):
            pytest.skip(f"ext {ext} doesn't support {exp_geometry_type}")
        elif exp_geometry_type == "MultiLineString Z":
            exp_geometry_type = "LineString Z"
        elif exp_geometry_type == "MultiPolygon Z":
            exp_geometry_type = "Polygon Z"

    column_data = {}
    column_data["test_descr"] = [test_descr] * len(wkt)
    column_data["idx"] = [str(idx) for idx in range(len(wkt))]
    gdf = gp.GeoDataFrame(column_data, geometry=from_wkt(wkt), crs="EPSG:4326")
    filename = tmp_path / f"test{ext}"

    if ext == ".fgb":
        # writing empty / null geometries not allowed by FlatGeobuf for
        # GDAL >= 3.6.4 and were simply not written previously
        gdf = gdf.loc[~(gdf.geometry.isna() | gdf.geometry.is_empty)]

    if mixed_dimensions and DRIVERS[ext] in DRIVERS_NO_MIXED_DIMENSIONS:
        with pytest.raises(
            DataSourceError,
            match=("Mixed 2D and 3D coordinates are not supported by"),
        ):
            write_dataframe(gdf, filename, use_arrow=use_arrow)
        return
    else:
        write_dataframe(gdf, filename, use_arrow=use_arrow)

    info = read_info(filename)
    assert info["geometry_type"] == exp_geometry_type

    result_gdf = read_dataframe(filename)
    if ext == ".geojsonl":
        result_gdf.crs = "EPSG:4326"

    assert_geodataframe_equal(gdf, result_gdf)


@pytest.mark.parametrize(
    "on_invalid, message, expected_wkt",
    [
        (
            "warn",
            "Invalid WKB: geometry is returned as None. IllegalArgumentException: "
"Points of LinearRing do not form a closed linestring",
|
|
2557
|
+
None,
|
|
2558
|
+
),
|
|
2559
|
+
("raise", "Points of LinearRing do not form a closed linestring", None),
|
|
2560
|
+
("ignore", None, None),
|
|
2561
|
+
("fix", None, "POLYGON ((0 0, 0 1, 0 0))"),
|
|
2562
|
+
],
|
|
2563
|
+
)
|
|
2564
|
+
@pytest.mark.filterwarnings("ignore:Non closed ring detected:RuntimeWarning")
|
|
2565
|
+
def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expected_wkt):
|
|
2566
|
+
if on_invalid == "fix" and not SHAPELY_GE_21:
|
|
2567
|
+
pytest.skip("on_invalid=fix not available for Shapely < 2.1")
|
|
2568
|
+
|
|
2569
|
+
if on_invalid == "raise":
|
|
2570
|
+
handler = pytest.raises(shapely.errors.GEOSException, match=message)
|
|
2571
|
+
elif on_invalid == "warn":
|
|
2572
|
+
handler = pytest.warns(match=message)
|
|
2573
|
+
elif on_invalid in ("fix", "ignore"):
|
|
2574
|
+
handler = contextlib.nullcontext()
|
|
2575
|
+
else:
|
|
2576
|
+
raise ValueError(f"unknown value for on_invalid: {on_invalid}")
|
|
2577
|
+
|
|
2578
|
+
# create a GeoJSON file with an invalid exterior ring
|
|
2579
|
+
invalid_geojson = """{
|
|
2580
|
+
"type": "FeatureCollection",
|
|
2581
|
+
"features": [
|
|
2582
|
+
{
|
|
2583
|
+
"type": "Feature",
|
|
2584
|
+
"properties": {},
|
|
2585
|
+
"geometry": {
|
|
2586
|
+
"type": "Polygon",
|
|
2587
|
+
"coordinates": [ [ [0, 0], [0, 1] ] ]
|
|
2588
|
+
}
|
|
2589
|
+
}
|
|
2590
|
+
]
|
|
2591
|
+
}"""
|
|
2592
|
+
|
|
2593
|
+
filename = tmp_path / "test.geojson"
|
|
2594
|
+
with open(filename, "w") as f:
|
|
2595
|
+
_ = f.write(invalid_geojson)
|
|
2596
|
+
|
|
2597
|
+
with handler:
|
|
2598
|
+
df = read_dataframe(
|
|
2599
|
+
            filename,
            use_arrow=use_arrow,
            on_invalid=on_invalid,
        )
        if expected_wkt is None:
            assert df.geometry.iloc[0] is None
        else:
            assert df.geometry.iloc[0].wkt == expected_wkt


def test_read_multisurface(multisurface_file, use_arrow):
    if use_arrow:
        # TODO: revisit once https://github.com/geopandas/pyogrio/issues/478
        # is resolved.
        pytest.skip("Shapely + GEOS 3.13 crashes in from_wkb for this case")

        with pytest.raises(shapely.errors.GEOSException):
            # TODO(Arrow)
            # shapely fails parsing the WKB
            read_dataframe(multisurface_file, use_arrow=True)
    else:
        df = read_dataframe(multisurface_file)

        # MultiSurface should be converted to MultiPolygon
        assert df.geometry.type.tolist() == ["MultiPolygon"]


def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
    # by default, nested data are not flattened
    df = read_dataframe(nested_geojson_file, use_arrow=use_arrow)

    expected = gp.GeoDataFrame(
        {
            "top_level": ["A"],
            "intermediate_level": [{"bottom_level": "B"}],
        },
        geometry=[shapely.Point(0, 0)],
        crs="EPSG:4326",
    )
    if GDAL_GE_311 and use_arrow:
        # GDAL 3.11 started to use json extension type, which is not yet handled
        # correctly in the arrow->pandas conversion (using object instead of str dtype)
        expected["intermediate_level"] = expected["intermediate_level"].astype(object)

    assert_geodataframe_equal(df, expected)

    df = read_dataframe(
        nested_geojson_file, use_arrow=use_arrow, FLATTEN_NESTED_ATTRIBUTES="YES"
    )

    expected = gp.GeoDataFrame(
        {
            "top_level": ["A"],
            "intermediate_level_bottom_level": ["B"],
        },
        geometry=[shapely.Point(0, 0)],
        crs="EPSG:4326",
    )

    assert_geodataframe_equal(df, expected)


def test_read_invalid_dataset_kwargs(naturalearth_lowres, use_arrow):
    with pytest.warns(RuntimeWarning, match="does not support open option INVALID"):
        read_dataframe(naturalearth_lowres, use_arrow=use_arrow, INVALID="YES")


@pytest.mark.requires_arrow_write_api
def test_write_nullable_dtypes(tmp_path, use_arrow):
    path = tmp_path / "test_nullable_dtypes.gpkg"
    test_data = {
        "col1": pd.Series([1, 2, 3], dtype="int64"),
        "col2": pd.Series([1, 2, None], dtype="Int64"),
        "col3": pd.Series([0.1, None, 0.3], dtype="Float32"),
        "col4": pd.Series([True, False, None], dtype="boolean"),
        "col5": pd.Series(["a", None, "b"], dtype="string"),
    }
    input_gdf = gp.GeoDataFrame(
        test_data, geometry=[shapely.Point(0, 0)] * 3, crs="epsg:31370"
    )
    write_dataframe(input_gdf, path, use_arrow=use_arrow)
    output_gdf = read_dataframe(path)
    # We read it back as default (non-nullable) numpy dtypes, so we cast
    # to those for the expected result
    expected = input_gdf.copy()
    expected["col2"] = expected["col2"].astype("float64")
expected["col3"] = expected["col3"].astype("float32")
|
|
2686
|
+
expected["col4"] = expected["col4"].astype("float64")
|
|
2687
|
+
expected["col5"] = expected["col5"].astype("str")
|
|
2688
|
+
expected.loc[1, "col5"] = None # pandas converts to pd.NA on line above
|
|
2689
|
+
assert_geodataframe_equal(output_gdf, expected)
|
|
2690
|
+
|
|
2691
|
+
|
|
2692
|
+
@pytest.mark.parametrize(
|
|
2693
|
+
"metadata_type", ["dataset_metadata", "layer_metadata", "metadata"]
|
|
2694
|
+
)
|
|
2695
|
+
@pytest.mark.requires_arrow_write_api
|
|
2696
|
+
def test_metadata_io(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
|
|
2697
|
+
metadata = {"level": metadata_type}
|
|
2698
|
+
|
|
2699
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2700
|
+
|
|
2701
|
+
filename = tmp_path / "test.gpkg"
|
|
2702
|
+
write_dataframe(df, filename, use_arrow=use_arrow, **{metadata_type: metadata})
|
|
2703
|
+
|
|
2704
|
+
metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type
|
|
2705
|
+
|
|
2706
|
+
assert read_info(filename)[metadata_key] == metadata
|
|
2707
|
+
|
|
2708
|
+
|
|
2709
|
+
@pytest.mark.parametrize("metadata_type", ["dataset_metadata", "layer_metadata"])
|
|
2710
|
+
@pytest.mark.parametrize(
|
|
2711
|
+
"metadata",
|
|
2712
|
+
[
|
|
2713
|
+
{1: 2},
|
|
2714
|
+
{"key": None},
|
|
2715
|
+
{"key": 1},
|
|
2716
|
+
],
|
|
2717
|
+
)
|
|
2718
|
+
@pytest.mark.requires_arrow_write_api
|
|
2719
|
+
def test_invalid_metadata(
|
|
2720
|
+
tmp_path, naturalearth_lowres, metadata_type, metadata, use_arrow
|
|
2721
|
+
):
|
|
2722
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2723
|
+
with pytest.raises(ValueError, match="must be a string"):
|
|
2724
|
+
write_dataframe(
|
|
2725
|
+
            df, tmp_path / "test.gpkg", use_arrow=use_arrow, **{metadata_type: metadata}
        )


@pytest.mark.parametrize("metadata_type", ["dataset_metadata", "layer_metadata"])
@pytest.mark.requires_arrow_write_api
def test_metadata_unsupported(tmp_path, naturalearth_lowres, metadata_type, use_arrow):
    """metadata is silently ignored"""

    filename = tmp_path / "test.geojson"
    write_dataframe(
        read_dataframe(naturalearth_lowres),
        filename,
        use_arrow=use_arrow,
        **{metadata_type: {"key": "value"}},
    )

    metadata_key = "layer_metadata" if metadata_type == "metadata" else metadata_type

    assert read_info(filename)[metadata_key] is None


@pytest.mark.skipif(not PANDAS_GE_15, reason="ArrowDtype requires pandas 1.5+")
def test_read_dataframe_arrow_dtypes(tmp_path):
    # https://github.com/geopandas/pyogrio/issues/319 - ensure arrow binary
    # column can be converted with from_wkb in case of missing values
    pytest.importorskip("pyarrow")
    filename = tmp_path / "test.gpkg"
    df = gp.GeoDataFrame(
        {"col": [1.0, 2.0]}, geometry=[Point(1, 1), None], crs="EPSG:4326"
    )
    write_dataframe(df, filename)

    result = read_dataframe(
        filename,
        use_arrow=True,
        arrow_to_pandas_kwargs={
            "types_mapper": lambda pa_dtype: pd.ArrowDtype(pa_dtype)
        },
    )
    assert isinstance(result["col"].dtype, pd.ArrowDtype)
result["col"] = result["col"].astype("float64")
|
|
2767
|
+
assert_geodataframe_equal(result, df)
|
|
2768
|
+
|
|
2769
|
+
|
|
2770
|
+
@requires_pyarrow_api
|
|
2771
|
+
@pytest.mark.skipif(
|
|
2772
|
+
__gdal_version__ < (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
|
|
2773
|
+
)
|
|
2774
|
+
@pytest.mark.parametrize("ext", ALL_EXTS)
|
|
2775
|
+
def test_arrow_bool_roundtrip(tmp_path, ext):
|
|
2776
|
+
filename = tmp_path / f"test{ext}"
|
|
2777
|
+
|
|
2778
|
+
kwargs = {}
|
|
2779
|
+
|
|
2780
|
+
if ext == ".fgb":
|
|
2781
|
+
# For .fgb, spatial_index=False to avoid the rows being reordered
|
|
2782
|
+
kwargs["spatial_index"] = False
|
|
2783
|
+
|
|
2784
|
+
df = gp.GeoDataFrame(
|
|
2785
|
+
{"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
|
|
2786
|
+
crs="EPSG:4326",
|
|
2787
|
+
)
|
|
2788
|
+
|
|
2789
|
+
write_dataframe(df, filename, **kwargs)
|
|
2790
|
+
result = read_dataframe(filename, use_arrow=True)
|
|
2791
|
+
# Shapefiles do not support bool columns; these are returned as int32
|
|
2792
|
+
assert_geodataframe_equal(result, df, check_dtype=ext != ".shp")
|
|
2793
|
+
|
|
2794
|
+
|
|
2795
|
+
@requires_pyarrow_api
|
|
2796
|
+
@pytest.mark.skipif(
|
|
2797
|
+
__gdal_version__ >= (3, 8, 3), reason="Arrow bool value bug fixed in GDAL >= 3.8.3"
|
|
2798
|
+
)
|
|
2799
|
+
@pytest.mark.parametrize("ext", ALL_EXTS)
|
|
2800
|
+
def test_arrow_bool_exception(tmp_path, ext):
|
|
2801
|
+
filename = tmp_path / f"test{ext}"
|
|
2802
|
+
|
|
2803
|
+
df = gp.GeoDataFrame(
|
|
2804
|
+
{"bool_col": [True, False, True, False, True], "geometry": [Point(0, 0)] * 5},
|
|
2805
|
+
crs="EPSG:4326",
|
|
2806
|
+
)
|
|
2807
|
+
|
|
2808
|
+
write_dataframe(df, filename)
|
|
2809
|
+
|
|
2810
|
+
if ext in {".fgb", ".gpkg"}:
|
|
2811
|
+
# only raise exception for GPKG / FGB
|
|
2812
|
+
with pytest.raises(
|
|
2813
|
+
RuntimeError,
|
|
2814
|
+
match="GDAL < 3.8.3 does not correctly read boolean data values using "
|
|
2815
|
+
"the Arrow API",
|
|
2816
|
+
):
|
|
2817
|
+
read_dataframe(filename, use_arrow=True)
|
|
2818
|
+
|
|
2819
|
+
# do not raise exception if no bool columns are read
|
|
2820
|
+
read_dataframe(filename, use_arrow=True, columns=[])
|
|
2821
|
+
|
|
2822
|
+
else:
|
|
2823
|
+
_ = read_dataframe(filename, use_arrow=True)
|
|
2824
|
+
|
|
2825
|
+
|
|
2826
|
+
@requires_pyarrow_api
|
|
2827
|
+
def test_arrow_enable_with_environment_variable(tmp_path):
|
|
2828
|
+
"""Test if arrow can be enabled via an environment variable."""
|
|
2829
|
+
# Latin 1 / Western European
|
|
2830
|
+
encoding = "CP1252"
|
|
2831
|
+
text = "ÿ"
|
|
2832
|
+
test_path = tmp_path / "test.gpkg"
|
|
2833
|
+
|
|
2834
|
+
df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
|
|
2835
|
+
write_dataframe(df, test_path, encoding=encoding)
|
|
2836
|
+
|
|
2837
|
+
# Without arrow, specifying the encoding is supported
|
|
2838
|
+
result = read_dataframe(test_path, encoding="cp1252")
|
|
2839
|
+
assert result is not None
|
|
2840
|
+
|
|
2841
|
+
# With arrow enabled, specifying the encoding is not supported
|
|
2842
|
+
with use_arrow_context():
|
|
2843
|
+
with pytest.raises(
|
|
2844
|
+
ValueError, match="non-UTF-8 encoding is not supported for Arrow"
|
|
2845
|
+
):
|
|
2846
|
+
_ = read_dataframe(test_path, encoding="cp1252")
|
|
2847
|
+
|
|
2848
|
+
|
|
2849
|
+
@pytest.mark.filterwarnings("ignore:File /vsimem:RuntimeWarning")
|
|
2850
|
+
@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
|
|
2851
|
+
def test_write_memory(naturalearth_lowres, driver):
|
|
2852
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2853
|
+
|
|
2854
|
+
buffer = BytesIO()
|
|
2855
|
+
write_dataframe(df, buffer, driver=driver, layer="test")
|
|
2856
|
+
|
|
2857
|
+
assert len(buffer.getbuffer()) > 0
|
|
2858
|
+
|
|
2859
|
+
actual = read_dataframe(buffer)
|
|
2860
|
+
    assert len(actual) == len(df)

    is_json = driver == "GeoJSON"

    assert_geodataframe_equal(
        actual,
        df,
        check_less_precise=is_json,
        check_index_type=False,
        check_dtype=not is_json,
    )

    # Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
    assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []


def test_write_memory_driver_required(naturalearth_lowres):
    df = read_dataframe(naturalearth_lowres)

    buffer = BytesIO()

    with pytest.raises(
        ValueError,
        match="driver must be provided to write to in-memory file",
    ):
        write_dataframe(df.head(1), buffer, driver=None, layer="test")

    # Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
    assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []


@pytest.mark.parametrize("driver", ["ESRI Shapefile", "OpenFileGDB"])
def test_write_memory_unsupported_driver(naturalearth_lowres, driver):
    df = read_dataframe(naturalearth_lowres)

    buffer = BytesIO()

    with pytest.raises(
ValueError, match=f"writing to in-memory file is not supported for {driver}"
|
|
2899
|
+
):
|
|
2900
|
+
write_dataframe(df, buffer, driver=driver, layer="test")
|
|
2901
|
+
|
|
2902
|
+
# Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
|
|
2903
|
+
assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []
|
|
2904
|
+
|
|
2905
|
+
|
|
2906
|
+
@pytest.mark.parametrize("driver", ["GeoJSON", "GPKG"])
|
|
2907
|
+
def test_write_memory_append_unsupported(naturalearth_lowres, driver):
|
|
2908
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2909
|
+
|
|
2910
|
+
buffer = BytesIO()
|
|
2911
|
+
|
|
2912
|
+
with pytest.raises(
|
|
2913
|
+
NotImplementedError, match="append is not supported for in-memory files"
|
|
2914
|
+
):
|
|
2915
|
+
write_dataframe(df.head(1), buffer, driver=driver, layer="test", append=True)
|
|
2916
|
+
|
|
2917
|
+
# Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
|
|
2918
|
+
assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []
|
|
2919
|
+
|
|
2920
|
+
|
|
2921
|
+
def test_write_memory_existing_unsupported(naturalearth_lowres):
|
|
2922
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2923
|
+
|
|
2924
|
+
buffer = BytesIO(b"0000")
|
|
2925
|
+
with pytest.raises(
|
|
2926
|
+
NotImplementedError,
|
|
2927
|
+
match="writing to existing in-memory object is not supported",
|
|
2928
|
+
):
|
|
2929
|
+
write_dataframe(df.head(1), buffer, driver="GeoJSON", layer="test")
|
|
2930
|
+
|
|
2931
|
+
# Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
|
|
2932
|
+
assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []
|
|
2933
|
+
|
|
2934
|
+
|
|
2935
|
+
def test_write_open_file_handle(tmp_path, naturalearth_lowres):
|
|
2936
|
+
"""Verify that writing to an open file handle is not currently supported"""
|
|
2937
|
+
|
|
2938
|
+
df = read_dataframe(naturalearth_lowres)
|
|
2939
|
+
|
|
2940
|
+
# verify it fails for regular file handle
|
|
2941
|
+
with pytest.raises(
|
|
2942
|
+
NotImplementedError, match="writing to an open file handle is not yet supported"
|
|
2943
|
+
):
|
|
2944
|
+
with open(tmp_path / "test.geojson", "wb") as f:
|
|
2945
|
+
write_dataframe(df.head(1), f)
|
|
2946
|
+
|
|
2947
|
+
# verify it fails for ZipFile
|
|
2948
|
+
with pytest.raises(
|
|
2949
|
+
NotImplementedError, match="writing to an open file handle is not yet supported"
|
|
2950
|
+
):
|
|
2951
|
+
with ZipFile(tmp_path / "test.geojson.zip", "w") as z:
|
|
2952
|
+
with z.open("test.geojson", "w") as f:
|
|
2953
|
+
write_dataframe(df.head(1), f)
|
|
2954
|
+
|
|
2955
|
+
# Check temp file was cleaned up. Filter, as gdal keeps cache files in /vsimem/.
|
|
2956
|
+
assert vsi_listtree("/vsimem/", pattern="pyogrio_*") == []
|
|
2957
|
+
|
|
2958
|
+
|
|
2959
|
+
@pytest.mark.parametrize("ext", ["gpkg", "geojson"])
|
|
2960
|
+
def test_non_utf8_encoding_io(tmp_path, ext, encoded_text):
|
|
2961
|
+
"""Verify that we write non-UTF data to the data source
|
|
2962
|
+
|
|
2963
|
+
IMPORTANT: this may not be valid for the data source and will likely render
|
|
2964
|
+
them unusable in other tools, but should successfully roundtrip unless we
|
|
2965
|
+
disable writing using other encodings.
|
|
2966
|
+
|
|
2967
|
+
NOTE: FlatGeobuff driver cannot handle non-UTF data in GDAL >= 3.9
|
|
2968
|
+
|
|
2969
|
+
NOTE: pyarrow cannot handle non-UTF-8 characters in this way
|
|
2970
|
+
"""
|
|
2971
|
+
|
|
2972
|
+
encoding, text = encoded_text
|
|
2973
|
+
output_path = tmp_path / f"test.{ext}"
|
|
2974
|
+
|
|
2975
|
+
df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
|
|
2976
|
+
write_dataframe(df, output_path, encoding=encoding)
|
|
2977
|
+
|
|
2978
|
+
# cannot open these files without specifying encoding
|
|
2979
|
+
with pytest.raises(UnicodeDecodeError):
|
|
2980
|
+
read_dataframe(output_path)
|
|
2981
|
+
|
|
2982
|
+
# must provide encoding to read these properly
|
|
2983
|
+
actual = read_dataframe(output_path, encoding=encoding)
|
|
2984
|
+
    assert actual.columns[0] == text
    assert actual[text].values[0] == text


@requires_pyarrow_api
@pytest.mark.parametrize("ext", ["gpkg", "geojson"])
def test_non_utf8_encoding_io_arrow_exception(tmp_path, ext, encoded_text):
    encoding, text = encoded_text
    output_path = tmp_path / f"test.{ext}"

    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
    write_dataframe(df, output_path, encoding=encoding)

    # cannot open these files without specifying encoding
    with pytest.raises(UnicodeDecodeError):
        read_dataframe(output_path)

    with pytest.raises(
        ValueError, match="non-UTF-8 encoding is not supported for Arrow"
    ):
        read_dataframe(output_path, encoding=encoding, use_arrow=True)


def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
    encoding, text = encoded_text

    output_path = tmp_path / "test.shp"

    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")
    write_dataframe(df, output_path, encoding=encoding)

    # NOTE: GDAL automatically creates a cpg file with the encoding name, which
    # means that if we read this without specifying the encoding it uses the
    # correct one
    actual = read_dataframe(output_path, use_arrow=use_arrow)
    assert actual.columns[0] == text
    assert actual[text].values[0] == text

    # verify that if cpg file is not present, the user-provided encoding must be used
    output_path.with_suffix(".cpg").unlink()

    # We will assume ISO-8859-1, which is wrong
    miscoded = text.encode(encoding).decode("ISO-8859-1")

    if use_arrow:
        # pyarrow cannot decode column name with incorrect encoding
        with pytest.raises(
            DataSourceError,
            match="The file being read is not encoded in UTF-8; please use_arrow=False",
        ):
            read_dataframe(output_path, use_arrow=True)
    else:
        bad = read_dataframe(output_path, use_arrow=False)
        assert bad.columns[0] == miscoded
        assert bad[miscoded].values[0] == miscoded

    # If encoding is provided, that should yield correct text
    actual = read_dataframe(output_path, encoding=encoding, use_arrow=use_arrow)
    assert actual.columns[0] == text
    assert actual[text].values[0] == text

    # if the ENCODING open option is used, that should yield correct text
    actual = read_dataframe(output_path, use_arrow=use_arrow, ENCODING=encoding)
    assert actual.columns[0] == text
    assert actual[text].values[0] == text


def test_encoding_read_option_collision_shapefile(naturalearth_lowres, use_arrow):
    """Providing both encoding parameter and ENCODING open option
    (even if blank) is not allowed."""

    with pytest.raises(
        ValueError, match='cannot provide both encoding parameter and "ENCODING" option'
    ):
        read_dataframe(
            naturalearth_lowres, encoding="CP936", ENCODING="", use_arrow=use_arrow
        )


def test_encoding_write_layer_option_collision_shapefile(tmp_path, encoded_text):
    """Providing both encoding parameter and ENCODING layer creation option
    (even if blank) is not allowed."""
    encoding, text = encoded_text

    output_path = tmp_path / "test.shp"
    df = gp.GeoDataFrame({text: [text], "geometry": [Point(0, 0)]}, crs="EPSG:4326")

    with pytest.raises(
        ValueError,
        match=(
            'cannot provide both encoding parameter and "ENCODING" layer creation '
            "option"
        ),
    ):
        write_dataframe(
            df, output_path, encoding=encoding, layer_options={"ENCODING": ""}
        )


def test_non_utf8_encoding_shapefile_sql(tmp_path, use_arrow):
    encoding = "CP936"

    output_path = tmp_path / "test.shp"

    mandarin = "中文"
    df = gp.GeoDataFrame(
        {mandarin: mandarin, "geometry": [Point(0, 0)]}, crs="EPSG:4326"
    )
    write_dataframe(df, output_path, encoding=encoding)

    actual = read_dataframe(
        output_path,
        sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
        use_arrow=use_arrow,
    )
    assert actual.columns[0] == mandarin
    assert actual[mandarin].values[0] == mandarin

    actual = read_dataframe(
        output_path,
        sql=f"select * from test where \"{mandarin}\" = '{mandarin}'",
        encoding=encoding,
        use_arrow=use_arrow,
    )
    assert actual.columns[0] == mandarin
    assert actual[mandarin].values[0] == mandarin


@pytest.mark.requires_arrow_write_api
def test_write_kml_file_coordinate_order(tmp_path, use_arrow):
    # confirm KML coordinates are written in lon, lat order even if CRS axis
    # specifies otherwise
    points = [Point(10, 20), Point(30, 40), Point(50, 60)]
    gdf = gp.GeoDataFrame(geometry=points, crs="EPSG:4326")
    output_path = tmp_path / "test.kml"
    write_dataframe(
        gdf, output_path, layer="tmp_layer", driver="KML", use_arrow=use_arrow
    )

    gdf_in = read_dataframe(output_path, use_arrow=use_arrow)

    assert np.array_equal(gdf_in.geometry.values, points)


@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
    "LIBKML" not in list_drivers(),
    reason="LIBKML driver is not available and is needed to append to .kml",
)
def test_write_kml_append(tmp_path, use_arrow):
    """Append features to an existing KML file.

    Appending is only supported by the LIBKML driver, and the driver isn't
    included in the GDAL ubuntu-small images, so skip if not available.
    """
    points = [Point(10, 20), Point(30, 40), Point(50, 60)]
    gdf = gp.GeoDataFrame(geometry=points, crs="EPSG:4326")
    output_path = tmp_path / "test.kml"
    write_dataframe(
        gdf, output_path, layer="tmp_layer", driver="KML", use_arrow=use_arrow
    )

    # test appending to the existing file only if LIBKML is available
    # as it appears to fall back on LIBKML driver when appending.
    points_append = [Point(7, 8), Point(9, 10), Point(11, 12)]
    gdf_append = gp.GeoDataFrame(geometry=points_append, crs="EPSG:4326")

    write_dataframe(
        gdf_append,
        output_path,
        layer="tmp_layer",
        driver="KML",
        use_arrow=use_arrow,
        append=True,
    )
    # force_2d is used to only compare the xy dimensions of the geometry, as the LIBKML
    # driver always adds the z-dimension when the kml file is over-written.
    gdf_in_appended = read_dataframe(output_path, use_arrow=use_arrow, force_2d=True)

    assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)


@pytest.mark.requires_arrow_write_api
def test_write_geojson_rfc7946_coordinates(tmp_path, use_arrow):
    points = [Point(10, 20), Point(30, 40), Point(50, 60)]
    gdf = gp.GeoDataFrame(geometry=points, crs="EPSG:4326")
    output_path = tmp_path / "test.geojson"
    write_dataframe(
        gdf,
        output_path,
        layer="tmp_layer",
        driver="GeoJSON",
        RFC7946=True,
        use_arrow=use_arrow,
    )

    gdf_in = read_dataframe(output_path, use_arrow=use_arrow)

    assert np.array_equal(gdf_in.geometry.values, points)

    # test appending to the existing file

    points_append = [Point(70, 80), Point(90, 100), Point(110, 120)]
    gdf_append = gp.GeoDataFrame(geometry=points_append, crs="EPSG:4326")

    write_dataframe(
        gdf_append,
        output_path,
        layer="tmp_layer",
        driver="GeoJSON",
        RFC7946=True,
        use_arrow=use_arrow,
        append=True,
    )

    gdf_in_appended = read_dataframe(output_path, use_arrow=use_arrow)
    assert np.array_equal(gdf_in_appended.geometry.values, points + points_append)


@pytest.mark.requires_arrow_api
@pytest.mark.skipif(
    not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
)
def test_parquet_driver(tmp_path, use_arrow):
    """
    Simple test verifying the Parquet driver works if available
    """
    gdf = gp.GeoDataFrame(
        {"col": [1, 2, 3], "geometry": [Point(0, 0), Point(1, 1), Point(2, 2)]},
        crs="EPSG:4326",
    )
    output_path = tmp_path / "test.parquet"
    write_dataframe(gdf, output_path, use_arrow=use_arrow)
    result = read_dataframe(output_path, use_arrow=use_arrow)
    assert_geodataframe_equal(result, gdf)