pyogrio-0.10.0-cp312-cp312-manylinux_2_28_aarch64.whl → pyogrio-0.11.0-cp312-cp312-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyogrio/__init__.py +9 -9
- pyogrio/_compat.py +8 -0
- pyogrio/_err.cpython-312-aarch64-linux-gnu.so +0 -0
- pyogrio/_geometry.cpython-312-aarch64-linux-gnu.so +0 -0
- pyogrio/_io.cpython-312-aarch64-linux-gnu.so +0 -0
- pyogrio/_ogr.cpython-312-aarch64-linux-gnu.so +0 -0
- pyogrio/_version.py +3 -3
- pyogrio/_vsi.cpython-312-aarch64-linux-gnu.so +0 -0
- pyogrio/gdal_data/GDAL-targets-release.cmake +3 -3
- pyogrio/gdal_data/GDAL-targets.cmake +2 -2
- pyogrio/gdal_data/GDALConfigVersion.cmake +3 -3
- pyogrio/gdal_data/gdalinfo_output.schema.json +3 -3
- pyogrio/gdal_data/gdaltileindex.xsd +1 -17
- pyogrio/gdal_data/gdalvrt.xsd +48 -41
- pyogrio/gdal_data/nitf_spec.xml +1 -17
- pyogrio/gdal_data/nitf_spec.xsd +1 -17
- pyogrio/gdal_data/ogrvrt.xsd +1 -17
- pyogrio/gdal_data/osmconf.ini +3 -1
- pyogrio/gdal_data/pdfcomposition.xsd +1 -17
- pyogrio/gdal_data/template_tiles.mapml +28 -0
- pyogrio/gdal_data/vcpkg.spdx.json +32 -27
- pyogrio/gdal_data/vcpkg_abi_info.txt +27 -26
- pyogrio/gdal_data/vdv452.xml +1 -17
- pyogrio/gdal_data/vdv452.xsd +1 -17
- pyogrio/geopandas.py +91 -43
- pyogrio/proj_data/ITRF2014 +1 -1
- pyogrio/proj_data/ITRF2020 +91 -0
- pyogrio/proj_data/proj-config-version.cmake +3 -3
- pyogrio/proj_data/proj-config.cmake +1 -1
- pyogrio/proj_data/proj-targets.cmake +3 -3
- pyogrio/proj_data/proj.db +0 -0
- pyogrio/proj_data/proj.ini +11 -3
- pyogrio/proj_data/proj4-targets.cmake +3 -3
- pyogrio/proj_data/usage +7 -2
- pyogrio/proj_data/vcpkg.spdx.json +27 -22
- pyogrio/proj_data/vcpkg_abi_info.txt +15 -14
- pyogrio/tests/conftest.py +8 -0
- pyogrio/tests/test_arrow.py +3 -0
- pyogrio/tests/test_core.py +8 -4
- pyogrio/tests/test_geopandas_io.py +250 -42
- pyogrio/tests/test_path.py +10 -0
- pyogrio/tests/test_raw_io.py +6 -2
- pyogrio/util.py +15 -2
- {pyogrio-0.10.0.dist-info → pyogrio-0.11.0.dist-info}/METADATA +32 -37
- {pyogrio-0.10.0.dist-info → pyogrio-0.11.0.dist-info}/RECORD +202 -200
- {pyogrio-0.10.0.dist-info → pyogrio-0.11.0.dist-info}/WHEEL +1 -1
- pyogrio.libs/{libgdal-b0847c7b.so.35.3.9.1 → libgdal-4bc0d15f.so.36.3.10.3} +0 -0
- {pyogrio-0.10.0.dist-info → pyogrio-0.11.0.dist-info/licenses}/LICENSE +0 -0
- {pyogrio-0.10.0.dist-info → pyogrio-0.11.0.dist-info}/top_level.txt +0 -0
pyogrio/tests/test_geopandas_io.py
CHANGED

@@ -12,10 +12,20 @@ from pyogrio import (
     list_drivers,
     list_layers,
     read_info,
+    set_gdal_config_options,
     vsi_listtree,
     vsi_unlink,
 )
-from pyogrio._compat import
+from pyogrio._compat import (
+    GDAL_GE_37,
+    GDAL_GE_311,
+    GDAL_GE_352,
+    HAS_ARROW_WRITE_API,
+    HAS_PYPROJ,
+    PANDAS_GE_15,
+    PANDAS_GE_30,
+    SHAPELY_GE_21,
+)
 from pyogrio.errors import DataLayerError, DataSourceError, FeatureError, GeometryError
 from pyogrio.geopandas import PANDAS_GE_20, read_dataframe, write_dataframe
 from pyogrio.raw import (
@@ -93,8 +103,20 @@ def spatialite_available(path):
         return False


-@pytest.mark.parametrize(
-
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_pyarrow_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_read_csv_encoding(tmp_path, encoding, arrow):
+    """Test reading CSV files with different encodings.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -105,7 +127,7 @@ def test_read_csv_encoding(tmp_path, encoding):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, should be OK.
-    df = read_dataframe(csv_path, encoding=encoding)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)

     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
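The hunk above extends the encoding test to the Arrow read path. For reference, a minimal sketch of the behaviour it exercises (the file name is illustrative; `read_dataframe` is pyogrio's public API):

    from pyogrio import read_dataframe

    # Decode a cp1252-encoded CSV; values come back as utf-8 strings.
    df = read_dataframe("test.csv", encoding="cp1252")

    # The Arrow path only supports utf-8-encoded sources.
    df = read_dataframe("test.csv", encoding="utf-8", use_arrow=True)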
@@ -117,19 +139,29 @@ def test_read_csv_encoding(tmp_path, encoding):
     locale.getpreferredencoding().upper() == "UTF-8",
     reason="test requires non-UTF-8 default platform",
 )
-def test_read_csv_platform_encoding(tmp_path):
-    """
+def test_read_csv_platform_encoding(tmp_path, use_arrow):
+    """Verify that read defaults to platform encoding; only works on Windows (CP1252).
+
+    When use_arrow=True, reading a non-UTF8 file fails.
+    """
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")

-
+    if use_arrow:
+        with pytest.raises(
+            DataSourceError,
+            match="; please use_arrow=False",
+        ):
+            df = read_dataframe(csv_path, use_arrow=use_arrow)
+    else:
+        df = read_dataframe(csv_path, use_arrow=use_arrow)

-
-
-
-
+        assert len(df) == 1
+        assert df.columns.tolist() == ["näme", "city"]
+        assert df.city.tolist() == ["Zürich"]
+        assert df.näme.tolist() == ["Wilhelm Röntgen"]


 def test_read_dataframe(naturalearth_lowres_all_ext):
@@ -227,11 +259,32 @@ def test_read_force_2d(tmp_path, use_arrow):
     assert not df.iloc[0].geometry.has_z


+@pytest.mark.skipif(
+    not GDAL_GE_352,
+    reason="gdal >= 3.5.2 needed to use OGR_GEOJSON_MAX_OBJ_SIZE with a float value",
+)
+def test_read_geojson_error(naturalearth_lowres_geojson, use_arrow):
+    try:
+        set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": 0.01})
+        with pytest.raises(
+            DataSourceError,
+            match="Failed to read GeoJSON data; .* GeoJSON object too complex",
+        ):
+            read_dataframe(naturalearth_lowres_geojson, use_arrow=use_arrow)
+    finally:
+        set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": None})
+
+
 def test_read_layer(tmp_path, use_arrow):
     filename = tmp_path / "test.gpkg"

     # create a multilayer GPKG
     expected1 = gp.GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
+    if use_arrow:
+        # TODO this needs to be fixed on the geopandas side (to ensure the
+        # GeoDataFrame() constructor does this), when use_arrow we already
+        # get columns Index with string dtype
+        expected1.columns = expected1.columns.astype("str")
     write_dataframe(
         expected1,
         filename,
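The new test_read_geojson_error drives GDAL's OGR_GEOJSON_MAX_OBJ_SIZE option through set_gdal_config_options. A minimal sketch of that pattern (file name illustrative; per GDAL's docs the value is a size in MB, and a float value needs GDAL >= 3.5.2):

    from pyogrio import read_dataframe, set_gdal_config_options

    set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": 0.01})
    try:
        # Raises DataSourceError if a single GeoJSON object exceeds the limit.
        df = read_dataframe("large.geojson")
    finally:
        # Passing None resets the option to its default.
        set_gdal_config_options({"OGR_GEOJSON_MAX_OBJ_SIZE": None})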
@@ -239,6 +292,8 @@ def test_read_layer(tmp_path, use_arrow):
     )

     expected2 = gp.GeoDataFrame(geometry=[Point(1, 1)], crs="EPSG:4326")
+    if use_arrow:
+        expected2.columns = expected2.columns.astype("str")
     write_dataframe(expected2, filename, layer="layer2", append=True)

     assert np.array_equal(
@@ -361,7 +416,7 @@ def test_read_null_values(tmp_path, use_arrow):
     df = read_dataframe(filename, use_arrow=use_arrow, read_geometry=False)

     # make sure that Null values are preserved
-    assert
+    assert df["col"].isna().all()


 def test_read_fid_as_index(naturalearth_lowres_all_ext, use_arrow):
@@ -438,10 +493,17 @@ def test_read_where_invalid(request, naturalearth_lowres_all_ext, use_arrow):
     if use_arrow and naturalearth_lowres_all_ext.suffix == ".gpkg":
         # https://github.com/OSGeo/gdal/issues/8492
         request.node.add_marker(pytest.mark.xfail(reason="GDAL doesn't error for GPKG"))
-
-
-
-
+
+    if naturalearth_lowres_all_ext.suffix == ".gpkg" and __gdal_version__ >= (3, 11, 0):
+        with pytest.raises(DataLayerError, match="no such column"):
+            read_dataframe(
+                naturalearth_lowres_all_ext, use_arrow=use_arrow, where="invalid"
+            )
+    else:
+        with pytest.raises(ValueError, match="Invalid SQL"):
+            read_dataframe(
+                naturalearth_lowres_all_ext, use_arrow=use_arrow, where="invalid"
+            )


 def test_read_where_ignored_field(naturalearth_lowres, use_arrow):
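The hunk above pins down the error raised for an invalid `where` filter: GDAL >= 3.11 surfaces a DataLayerError ("no such column") for GPKG, while older versions raise ValueError ("Invalid SQL"). For contrast, a valid attribute filter looks like this (a sketch; the column name assumes the Natural Earth data these tests use):

    from pyogrio import read_dataframe

    df = read_dataframe("naturalearth_lowres.shp", where="pop_est > 10000000")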
@@ -675,6 +737,13 @@ def test_read_skip_features(naturalearth_lowres_all_ext, use_arrow, skip_features):
     # In .geojsonl the vertices are reordered, so normalize
     is_jsons = ext == ".geojsonl"

+    if skip_features == 200 and not use_arrow:
+        # result is an empty dataframe, so no proper dtype inference happens
+        # for the numpy object dtype arrays
+        df[["continent", "name", "iso_a3"]] = df[
+            ["continent", "name", "iso_a3"]
+        ].astype("str")
+
     assert_geodataframe_equal(
         df,
         expected,
@@ -943,9 +1012,20 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
     assert df.iloc[0].geometry.area > area_canada


-@pytest.mark.parametrize(
-
-
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_arrow_write_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_write_csv_encoding(tmp_path, encoding, arrow):
+    """Test if write_dataframe uses the default encoding correctly.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -958,7 +1038,7 @@ def test_write_csv_encoding(tmp_path, encoding):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)

     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:
@@ -976,6 +1056,48 @@ def test_write_csv_encoding(tmp_path, encoding):
     assert csv_bytes == csv_pyogrio_bytes


+@pytest.mark.parametrize(
+    "ext, fid_column, fid_param_value",
+    [
+        (".gpkg", "fid", None),
+        (".gpkg", "FID", None),
+        (".sqlite", "ogc_fid", None),
+        (".gpkg", "fid_custom", "fid_custom"),
+        (".gpkg", "FID_custom", "fid_custom"),
+        (".sqlite", "ogc_fid_custom", "ogc_fid_custom"),
+    ],
+)
+@pytest.mark.requires_arrow_write_api
+def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow):
+    """Test to specify FIDs to save when writing to a file.
+
+    Saving custom FIDs is only supported for formats that actually store the FID, like
+    e.g. GPKG and SQLite. The fid_column name check is case-insensitive.
+
+    Typically, GDAL supports using a custom FID column for these file formats via a
+    `FID` layer creation option, which is also tested here. If `fid_param_value` is
+    specified (not None), an `fid` parameter is passed to `write_dataframe`, causing
+    GDAL to use the column name specified for the FID.
+    """
+    input_gdf = gp.GeoDataFrame(
+        {fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326"
+    )
+    kwargs = {}
+    if fid_param_value is not None:
+        kwargs["fid"] = fid_param_value
+    path = tmp_path / f"test{ext}"
+
+    write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs)
+
+    assert path.exists()
+    output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow)
+    output_gdf = output_gdf.reset_index()
+
+    # pyogrio always sets "fid" as index name with `fid_as_index`
+    expected_gdf = input_gdf.rename(columns={fid_column: "fid"})
+    assert_geodataframe_equal(output_gdf, expected_gdf)
+
+
 @pytest.mark.parametrize("ext", ALL_EXTS)
 @pytest.mark.requires_arrow_write_api
 def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):
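The new test_write_custom_fids covers the `fid` keyword of write_dataframe, which maps a dataframe column onto GDAL's FID layer creation option. A minimal usage sketch based on the test above (paths illustrative):

    import geopandas as gp
    from shapely import Point
    from pyogrio import read_dataframe, write_dataframe

    gdf = gp.GeoDataFrame({"fid_custom": [5]}, geometry=[Point(0, 0)], crs="epsg:4326")
    write_dataframe(gdf, "test.gpkg", fid="fid_custom")

    # The stored FID comes back as the index, always named "fid".
    result = read_dataframe("test.gpkg", fid_as_index=True)
    assert result.index.tolist() == [5]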
@@ -1087,16 +1209,38 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow):


 @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
+@pytest.mark.parametrize(
+    "columns, dtype",
+    [
+        ([], None),
+        (["col_int"], np.int64),
+        (["col_float"], np.float64),
+        (["col_object"], object),
+    ],
+)
 @pytest.mark.requires_arrow_write_api
-def test_write_empty_dataframe(tmp_path, ext, use_arrow):
-
+def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow):
+    """Test writing dataframe with no rows.

+    With use_arrow, object type columns with no rows are converted to null type columns
+    by pyarrow, but null columns are not supported by GDAL. Added to test fix for #513.
+    """
+    expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326)
     filename = tmp_path / f"test{ext}"
     write_dataframe(expected, filename, use_arrow=use_arrow)

     assert filename.exists()
-    df = read_dataframe(filename)
-
+    df = read_dataframe(filename, use_arrow=use_arrow)
+
+    # Check result
+    # For older pandas versions, the index is created as Object dtype but read as
+    # RangeIndex, so don't check the index dtype in that case.
+    check_index_type = True if PANDAS_GE_20 else False
+    # with pandas 3+ and reading through arrow, we preserve the string dtype
+    # (no proper dtype inference happens for the empty numpy object dtype arrays)
+    if use_arrow and dtype is object:
+        expected["col_object"] = expected["col_object"].astype("str")
+    assert_geodataframe_equal(df, expected, check_index_type=check_index_type)


 def test_write_empty_geometry(tmp_path):
@@ -1116,6 +1260,28 @@ def test_write_empty_geometry(tmp_path):
     assert_geodataframe_equal(df, expected)


+@pytest.mark.requires_arrow_write_api
+def test_write_None_string_column(tmp_path, use_arrow):
+    """Test pandas object columns with all None values.
+
+    With use_arrow, such columns are converted to null type columns by pyarrow, but null
+    columns are not supported by GDAL. Added to test fix for #513.
+    """
+    gdf = gp.GeoDataFrame({"object_col": [None]}, geometry=[Point(0, 0)], crs=4326)
+    filename = tmp_path / "test.gpkg"
+
+    write_dataframe(gdf, filename, use_arrow=use_arrow)
+    assert filename.exists()
+
+    result_gdf = read_dataframe(filename, use_arrow=use_arrow)
+    if PANDAS_GE_30 and use_arrow:
+        assert result_gdf.object_col.dtype == "str"
+        gdf["object_col"] = gdf["object_col"].astype("str")
+    else:
+        assert result_gdf.object_col.dtype == object
+    assert_geodataframe_equal(result_gdf, gdf)
+
+
 @pytest.mark.parametrize("ext", [".geojsonl", ".geojsons"])
 @pytest.mark.requires_arrow_write_api
 def test_write_read_empty_dataframe_unsupported(tmp_path, ext, use_arrow):
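test_write_None_string_column guards the fix for #513: pyarrow turns an all-None object column into a null-typed column, which GDAL cannot store, so pyogrio now converts such columns before writing. The case it protects against, as a sketch (path illustrative):

    import geopandas as gp
    from shapely import Point
    from pyogrio import write_dataframe

    gdf = gp.GeoDataFrame({"object_col": [None]}, geometry=[Point(0, 0)], crs=4326)
    write_dataframe(gdf, "test.gpkg", use_arrow=True)  # worked around since #513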
@@ -1521,6 +1687,30 @@ def test_custom_crs_io(tmp_path, naturalearth_lowres_all_ext, use_arrow):
     assert df.crs.equals(expected.crs)


+@pytest.mark.parametrize("ext", [".gpkg.zip", ".shp.zip", ".shz"])
+@pytest.mark.requires_arrow_write_api
+def test_write_read_zipped_ext(tmp_path, naturalearth_lowres, ext, use_arrow):
+    """Run a basic read and write test on some extra (zipped) extensions."""
+    if ext == ".gpkg.zip" and not GDAL_GE_37:
+        pytest.skip(".gpkg.zip support requires GDAL >= 3.7")
+
+    input_gdf = read_dataframe(naturalearth_lowres)
+    output_path = tmp_path / f"test{ext}"
+
+    write_dataframe(input_gdf, output_path, use_arrow=use_arrow)
+
+    assert output_path.exists()
+    result_gdf = read_dataframe(output_path)
+
+    geometry_types = result_gdf.geometry.type.unique()
+    if DRIVERS[ext] in DRIVERS_NO_MIXED_SINGLE_MULTI:
+        assert list(geometry_types) == ["MultiPolygon"]
+    else:
+        assert set(geometry_types) == {"MultiPolygon", "Polygon"}
+
+    assert_geodataframe_equal(result_gdf, input_gdf, check_index_type=False)
+
+
 def test_write_read_mixed_column_values(tmp_path):
     # use_arrow=True is tested separately below
     mixed_values = ["test", 1.0, 1, datetime.now(), None, np.nan]
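test_write_read_zipped_ext above exercises the new zipped-extension support end to end. A round-trip sketch (file names illustrative; .gpkg.zip needs GDAL >= 3.7):

    from pyogrio import read_dataframe, write_dataframe

    gdf = read_dataframe("naturalearth_lowres.shp")
    write_dataframe(gdf, "test.gpkg.zip")
    roundtripped = read_dataframe("test.gpkg.zip")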
@@ -1532,11 +1722,13 @@ def test_write_read_mixed_column_values(tmp_path):
     write_dataframe(test_gdf, output_path)
     output_gdf = read_dataframe(output_path)
     assert len(test_gdf) == len(output_gdf)
-
-
-
-    else
-
+    # mixed values as object dtype are currently written as strings
+    # (but preserving nulls)
+    expected = pd.Series(
+        [str(value) if value not in (None, np.nan) else None for value in mixed_values],
+        name="mixed",
+    )
+    assert_series_equal(output_gdf["mixed"], expected)


 @requires_arrow_write_api
@@ -1569,8 +1761,8 @@ def test_write_read_null(tmp_path, use_arrow):
     assert pd.isna(result_gdf["float64"][1])
     assert pd.isna(result_gdf["float64"][2])
     assert result_gdf["object_str"][0] == "test"
-    assert result_gdf["object_str"][1]
-    assert result_gdf["object_str"][2]
+    assert pd.isna(result_gdf["object_str"][1])
+    assert pd.isna(result_gdf["object_str"][2])


 @pytest.mark.requires_arrow_write_api
@@ -1714,23 +1906,29 @@ def test_write_geometry_z_types_auto(


 @pytest.mark.parametrize(
-    "on_invalid, message",
+    "on_invalid, message, expected_wkt",
     [
         (
             "warn",
             "Invalid WKB: geometry is returned as None. IllegalArgumentException: "
-            "
+            "Points of LinearRing do not form a closed linestring",
+            None,
         ),
-        ("raise", "
-        ("ignore", None),
+        ("raise", "Points of LinearRing do not form a closed linestring", None),
+        ("ignore", None, None),
+        ("fix", None, "POLYGON ((0 0, 0 1, 0 0))"),
     ],
 )
-
+@pytest.mark.filterwarnings("ignore:Non closed ring detected:RuntimeWarning")
+def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message, expected_wkt):
+    if on_invalid == "fix" and not SHAPELY_GE_21:
+        pytest.skip("on_invalid=fix not available for Shapely < 2.1")
+
     if on_invalid == "raise":
         handler = pytest.raises(shapely.errors.GEOSException, match=message)
     elif on_invalid == "warn":
         handler = pytest.warns(match=message)
-    elif on_invalid
+    elif on_invalid in ("fix", "ignore"):
         handler = contextlib.nullcontext()
     else:
         raise ValueError(f"unknown value for on_invalid: {on_invalid}")
@@ -1744,7 +1942,7 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message):
             "properties": {},
             "geometry": {
                 "type": "Polygon",
-                "coordinates": [ [ [0, 0], [0,
+                "coordinates": [ [ [0, 0], [0, 1] ] ]
             }
         }
     ]
@@ -1760,7 +1958,10 @@ def test_read_invalid_poly_ring(tmp_path, use_arrow, on_invalid, message):
             use_arrow=use_arrow,
             on_invalid=on_invalid,
         )
-
+        if expected_wkt is None:
+            assert df.geometry.iloc[0] is None
+        else:
+            assert df.geometry.iloc[0].wkt == expected_wkt


 def test_read_multisurface(multisurface_file, use_arrow):
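The two hunks above add `on_invalid="fix"` alongside the existing "warn", "raise" and "ignore" modes: with Shapely >= 2.1 an unclosed ring is repaired instead of being returned as None. A sketch (file name illustrative):

    from pyogrio import read_dataframe

    df = read_dataframe("invalid_ring.geojson", on_invalid="fix")
    # e.g. an unclosed two-point ring comes back as "POLYGON ((0 0, 0 1, 0 0))"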
@@ -1792,6 +1993,10 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow):
         geometry=[shapely.Point(0, 0)],
         crs="EPSG:4326",
     )
+    if GDAL_GE_311 and use_arrow:
+        # GDAL 3.11 started to use json extension type, which is not yet handled
+        # correctly in the arrow->pandas conversion (using object instead of str dtype)
+        expected["intermediate_level"] = expected["intermediate_level"].astype(object)

     assert_geodataframe_equal(df, expected)

@@ -1837,7 +2042,7 @@ def test_write_nullable_dtypes(tmp_path, use_arrow):
     expected["col2"] = expected["col2"].astype("float64")
     expected["col3"] = expected["col3"].astype("float32")
     expected["col4"] = expected["col4"].astype("float64")
-    expected["col5"] = expected["col5"].astype(
+    expected["col5"] = expected["col5"].astype("str")
     expected.loc[1, "col5"] = None  # pandas converts to pd.NA on line above
     assert_geodataframe_equal(output_gdf, expected)

@@ -2160,7 +2365,10 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):

     if use_arrow:
         # pyarrow cannot decode column name with incorrect encoding
-        with pytest.raises(
+        with pytest.raises(
+            DataSourceError,
+            match="The file being read is not encoded in UTF-8; please use_arrow=False",
+        ):
             read_dataframe(output_path, use_arrow=True)
     else:
         bad = read_dataframe(output_path, use_arrow=False)
@@ -2257,7 +2465,7 @@ def test_write_kml_file_coordinate_order(tmp_path, use_arrow):
     if "LIBKML" in list_drivers():
         # test appending to the existing file only if LIBKML is available
         # as it appears to fall back on LIBKML driver when appending.
-        points_append = [Point(
+        points_append = [Point(7, 8), Point(9, 10), Point(11, 12)]
         gdf_append = gp.GeoDataFrame(geometry=points_append, crs="EPSG:4326")

         write_dataframe(
pyogrio/tests/test_path.py
CHANGED
@@ -33,10 +33,20 @@ def change_cwd(path):
     [
         # local file paths that should be passed through as is
         ("data.gpkg", "data.gpkg"),
+        ("data.gpkg.zip", "data.gpkg.zip"),
+        ("data.shp.zip", "data.shp.zip"),
         (Path("data.gpkg"), "data.gpkg"),
+        (Path("data.gpkg.zip"), "data.gpkg.zip"),
+        (Path("data.shp.zip"), "data.shp.zip"),
         ("/home/user/data.gpkg", "/home/user/data.gpkg"),
+        ("/home/user/data.gpkg.zip", "/home/user/data.gpkg.zip"),
+        ("/home/user/data.shp.zip", "/home/user/data.shp.zip"),
         (r"C:\User\Documents\data.gpkg", r"C:\User\Documents\data.gpkg"),
+        (r"C:\User\Documents\data.gpkg.zip", r"C:\User\Documents\data.gpkg.zip"),
+        (r"C:\User\Documents\data.shp.zip", r"C:\User\Documents\data.shp.zip"),
         ("file:///home/user/data.gpkg", "/home/user/data.gpkg"),
+        ("file:///home/user/data.gpkg.zip", "/home/user/data.gpkg.zip"),
+        ("file:///home/user/data.shp.zip", "/home/user/data.shp.zip"),
         ("/home/folder # with hash/data.gpkg", "/home/folder # with hash/data.gpkg"),
         # cloud URIs
         ("https://testing/data.gpkg", "/vsicurl/https://testing/data.gpkg"),
pyogrio/tests/test_raw_io.py
CHANGED
@@ -17,7 +17,7 @@ from pyogrio import (
     read_info,
     set_gdal_config_options,
 )
-from pyogrio._compat import HAS_PYARROW, HAS_SHAPELY
+from pyogrio._compat import GDAL_GE_37, HAS_PYARROW, HAS_SHAPELY
 from pyogrio.errors import DataLayerError, DataSourceError, FeatureError
 from pyogrio.raw import open_arrow, read, write
 from pyogrio.tests.conftest import (
@@ -63,9 +63,10 @@ def test_read(naturalearth_lowres):
 @pytest.mark.parametrize("ext", DRIVERS)
 def test_read_autodetect_driver(tmp_path, naturalearth_lowres, ext):
     # Test all supported autodetect drivers
+    if ext == ".gpkg.zip" and not GDAL_GE_37:
+        pytest.skip(".gpkg.zip not supported for gdal < 3.7.0")
     testfile = prepare_testfile(naturalearth_lowres, dst_dir=tmp_path, ext=ext)

-    assert testfile.suffix == ext
     assert testfile.exists()
     meta, _, geometry, fields = read(testfile)

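For context, the raw-level `read` used here returns metadata and numpy arrays rather than a GeoDataFrame, and with GDAL >= 3.7 the autodetected extensions now include .gpkg.zip. A sketch (file name illustrative):

    from pyogrio.raw import read

    meta, index, geometry, field_data = read("test.gpkg.zip")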
@@ -703,6 +704,9 @@ def test_write_append(tmp_path, naturalearth_lowres, ext):
     if ext in (".geojsonl", ".geojsons") and __gdal_version__ < (3, 6, 0):
         pytest.skip("Append to GeoJSONSeq only available for GDAL >= 3.6.0")

+    if ext == ".gpkg.zip":
+        pytest.skip("Append to .gpkg.zip is not supported")
+
     meta, _, geometry, field_data = read(naturalearth_lowres)

     # coerce output layer to MultiPolygon to avoid mixed type errors
pyogrio/util.py
CHANGED
@@ -9,6 +9,8 @@ from urllib.parse import urlparse

 from pyogrio._vsi import vsimem_rmtree_toplevel as _vsimem_rmtree_toplevel

+MULTI_EXTENSIONS = (".gpkg.zip", ".shp.zip")
+

 def get_vsi_path_or_buffer(path_or_buffer):
     """Get VSI-prefixed path or bytes buffer depending on type of path_or_buffer.
@@ -68,15 +70,23 @@ def vsi_path(path: Union[str, Path]) -> str:
     # Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
     # URL schemes
     if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
+        # If it is not a zip file or it is a multi-extension zip file that is directly
+        # supported by a GDAL driver, return the path as is.
         if not path.split("!")[0].endswith(".zip"):
             return path
+        if path.split("!")[0].endswith(MULTI_EXTENSIONS):
+            return path

         # prefix then allow to proceed with remaining parsing
         path = f"zip://{path}"

     path, archive, scheme = _parse_uri(path)

-    if
+    if (
+        scheme
+        or archive
+        or (path.endswith(".zip") and not path.endswith(MULTI_EXTENSIONS))
+    ):
         return _construct_vsi_path(path, archive, scheme)

     return path
@@ -146,7 +156,10 @@ def _construct_vsi_path(path, archive, scheme) -> str:
     suffix = ""
     schemes = scheme.split("+")

-    if "zip" not in schemes and (
+    if "zip" not in schemes and (
+        archive.endswith(".zip")
+        or (path.endswith(".zip") and not path.endswith(MULTI_EXTENSIONS))
+    ):
         schemes.insert(0, "zip")

     if schemes:
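Net effect of the util.py changes, as a sketch (vsi_path is internal to pyogrio.util; the plain-.zip case reflects the pre-existing behaviour that the new MULTI_EXTENSIONS check carves an exception out of):

    from pyogrio.util import vsi_path

    # Multi-extension archives handled natively by a GDAL driver pass through:
    vsi_path("/home/user/data.gpkg.zip")  # -> "/home/user/data.gpkg.zip"

    # A generic .zip archive still gets the /vsizip/ prefix:
    vsi_path("/home/user/data.zip")  # -> "/vsizip//home/user/data.zip"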