fsspec 2024.5.0__py3-none-any.whl → 2024.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. fsspec/_version.py +2 -2
  2. fsspec/caching.py +3 -2
  3. fsspec/compression.py +1 -1
  4. fsspec/generic.py +3 -0
  5. fsspec/implementations/cached.py +6 -16
  6. fsspec/implementations/dirfs.py +2 -0
  7. fsspec/implementations/github.py +12 -0
  8. fsspec/implementations/http.py +2 -1
  9. fsspec/implementations/reference.py +9 -0
  10. fsspec/implementations/smb.py +10 -0
  11. fsspec/json.py +121 -0
  12. fsspec/registry.py +24 -18
  13. fsspec/spec.py +119 -33
  14. fsspec/utils.py +1 -1
  15. {fsspec-2024.5.0.dist-info → fsspec-2024.6.1.dist-info}/METADATA +10 -5
  16. fsspec-2024.6.1.dist-info/RECORD +55 -0
  17. {fsspec-2024.5.0.dist-info → fsspec-2024.6.1.dist-info}/WHEEL +1 -1
  18. fsspec/implementations/tests/__init__.py +0 -0
  19. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_file_listing.yaml +0 -112
  20. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_mkdir.yaml +0 -582
  21. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_pyarrow_non_partitioned.yaml +0 -873
  22. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range.yaml +0 -458
  23. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range_chunked.yaml +0 -1355
  24. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_and_read.yaml +0 -795
  25. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_pyarrow_non_partitioned.yaml +0 -613
  26. fsspec/implementations/tests/conftest.py +0 -39
  27. fsspec/implementations/tests/local/__init__.py +0 -0
  28. fsspec/implementations/tests/local/local_fixtures.py +0 -18
  29. fsspec/implementations/tests/local/local_test.py +0 -14
  30. fsspec/implementations/tests/memory/__init__.py +0 -0
  31. fsspec/implementations/tests/memory/memory_fixtures.py +0 -27
  32. fsspec/implementations/tests/memory/memory_test.py +0 -14
  33. fsspec/implementations/tests/out.zip +0 -0
  34. fsspec/implementations/tests/test_archive.py +0 -382
  35. fsspec/implementations/tests/test_arrow.py +0 -259
  36. fsspec/implementations/tests/test_cached.py +0 -1306
  37. fsspec/implementations/tests/test_common.py +0 -35
  38. fsspec/implementations/tests/test_dask.py +0 -29
  39. fsspec/implementations/tests/test_data.py +0 -20
  40. fsspec/implementations/tests/test_dbfs.py +0 -268
  41. fsspec/implementations/tests/test_dirfs.py +0 -588
  42. fsspec/implementations/tests/test_ftp.py +0 -178
  43. fsspec/implementations/tests/test_git.py +0 -76
  44. fsspec/implementations/tests/test_http.py +0 -577
  45. fsspec/implementations/tests/test_jupyter.py +0 -57
  46. fsspec/implementations/tests/test_libarchive.py +0 -33
  47. fsspec/implementations/tests/test_local.py +0 -1285
  48. fsspec/implementations/tests/test_memory.py +0 -382
  49. fsspec/implementations/tests/test_reference.py +0 -720
  50. fsspec/implementations/tests/test_sftp.py +0 -233
  51. fsspec/implementations/tests/test_smb.py +0 -139
  52. fsspec/implementations/tests/test_tar.py +0 -243
  53. fsspec/implementations/tests/test_webhdfs.py +0 -197
  54. fsspec/implementations/tests/test_zip.py +0 -134
  55. fsspec/tests/__init__.py +0 -0
  56. fsspec/tests/conftest.py +0 -188
  57. fsspec/tests/data/listing.html +0 -1
  58. fsspec/tests/test_api.py +0 -498
  59. fsspec/tests/test_async.py +0 -230
  60. fsspec/tests/test_caches.py +0 -255
  61. fsspec/tests/test_callbacks.py +0 -89
  62. fsspec/tests/test_compression.py +0 -164
  63. fsspec/tests/test_config.py +0 -129
  64. fsspec/tests/test_core.py +0 -466
  65. fsspec/tests/test_downstream.py +0 -40
  66. fsspec/tests/test_file.py +0 -200
  67. fsspec/tests/test_fuse.py +0 -147
  68. fsspec/tests/test_generic.py +0 -90
  69. fsspec/tests/test_gui.py +0 -23
  70. fsspec/tests/test_mapping.py +0 -228
  71. fsspec/tests/test_parquet.py +0 -140
  72. fsspec/tests/test_registry.py +0 -134
  73. fsspec/tests/test_spec.py +0 -1167
  74. fsspec/tests/test_utils.py +0 -478
  75. fsspec-2024.5.0.dist-info/RECORD +0 -111
  76. {fsspec-2024.5.0.dist-info → fsspec-2024.6.1.dist-info}/licenses/LICENSE +0 -0
fsspec/tests/test_fuse.py DELETED
@@ -1,147 +0,0 @@
1
- import os
2
- import subprocess
3
- import time
4
- from multiprocessing import Process
5
-
6
- import pytest
7
-
8
- try:
9
- pytest.importorskip("fuse") # noqa: E402
10
- except OSError:
11
- # can succeed in importing fuse, but fail to load so
12
- pytest.importorskip("nonexistent") # noqa: E402
13
-
14
- from fsspec.fuse import main, run
15
- from fsspec.implementations.memory import MemoryFileSystem
16
-
17
-
18
- def host_fuse(mountdir):
19
- fs = MemoryFileSystem()
20
- fs.touch("/mounted/testfile")
21
- run(fs, "/mounted/", mountdir)
22
-
23
-
24
- def test_basic(tmpdir, capfd):
25
- mountdir = str(tmpdir.mkdir("mount"))
26
-
27
- fuse_process = Process(target=host_fuse, args=(str(mountdir),))
28
- fuse_process.start()
29
-
30
- try:
31
- timeout = 10
32
- while True:
33
- try:
34
- # can fail with device not ready while waiting for fuse
35
- if "testfile" in os.listdir(mountdir):
36
- break
37
- except Exception:
38
- pass
39
- timeout -= 1
40
- time.sleep(1)
41
- if not timeout > 0:
42
- import pdb
43
-
44
- pdb.set_trace()
45
- pytest.skip(msg="fuse didn't come live")
46
-
47
- fn = os.path.join(mountdir, "test")
48
- with open(fn, "wb") as f:
49
- f.write(b"data")
50
-
51
- with open(fn) as f:
52
- assert f.read() == "data"
53
-
54
- os.remove(fn)
55
-
56
- os.mkdir(fn)
57
- assert os.listdir(fn) == []
58
-
59
- os.mkdir(fn + "/inner")
60
-
61
- with pytest.raises(OSError):
62
- os.rmdir(fn)
63
-
64
- captured = capfd.readouterr()
65
- assert "Traceback" not in captured.out
66
- assert "Traceback" not in captured.err
67
-
68
- os.rmdir(fn + "/inner")
69
- os.rmdir(fn)
70
- finally:
71
- fuse_process.terminate()
72
- fuse_process.join(timeout=10)
73
- if fuse_process.is_alive():
74
- fuse_process.kill()
75
- fuse_process.join()
76
-
77
-
78
- def host_mount_local(source_dir, mount_dir, debug_log):
79
- main(["local", source_dir, mount_dir, "-l", debug_log, "--ready-file"])
80
-
81
-
82
- @pytest.fixture()
83
- def mount_local(tmpdir):
84
- source_dir = tmpdir.mkdir("source")
85
- mount_dir = tmpdir.mkdir("local")
86
- debug_log = tmpdir / "debug.log"
87
- fuse_process = Process(
88
- target=host_mount_local, args=(str(source_dir), str(mount_dir), str(debug_log))
89
- )
90
- fuse_process.start()
91
- ready_file = mount_dir / ".fuse_ready"
92
- for _ in range(20):
93
- if ready_file.exists() and open(ready_file).read() == b"ready":
94
- break
95
- time.sleep(0.1)
96
- try:
97
- yield (source_dir, mount_dir)
98
- finally:
99
- fuse_process.terminate()
100
- fuse_process.join(timeout=10)
101
- if fuse_process.is_alive():
102
- fuse_process.kill()
103
- fuse_process.join()
104
-
105
-
106
- def test_mount(mount_local):
107
- source_dir, mount_dir = mount_local
108
- assert os.listdir(mount_dir) == []
109
- assert os.listdir(source_dir) == []
110
-
111
- mount_dir.mkdir("a")
112
-
113
- assert os.listdir(mount_dir) == ["a"]
114
- assert os.listdir(source_dir) == ["a"]
115
-
116
-
117
- def test_chmod(mount_local):
118
- source_dir, mount_dir = mount_local
119
- open(mount_dir / "text", "w").write("test")
120
- assert os.listdir(source_dir) == ["text"]
121
-
122
- cp = subprocess.run(
123
- ["cp", str(mount_dir / "text"), str(mount_dir / "new")],
124
- stdout=subprocess.PIPE,
125
- stderr=subprocess.PIPE,
126
- check=False,
127
- )
128
-
129
- assert cp.stderr == b""
130
- assert cp.stdout == b""
131
- assert set(os.listdir(source_dir)) == {"text", "new"}
132
- assert open(mount_dir / "new").read() == "test"
133
-
134
-
135
- def test_seek_rw(mount_local):
136
- source_dir, mount_dir = mount_local
137
- fh = open(mount_dir / "text", "w")
138
- fh.write("teST")
139
- fh.seek(2)
140
- fh.write("st")
141
- fh.close()
142
-
143
- fh = open(mount_dir / "text", "r")
144
- assert fh.read() == "test"
145
- fh.seek(2)
146
- assert fh.read() == "st"
147
- fh.close()
fsspec/tests/test_generic.py DELETED
@@ -1,90 +0,0 @@
1
- import pytest
2
-
3
- import fsspec
4
- from fsspec.tests.conftest import data, server # noqa: F401
5
-
6
-
7
- def test_remote_async_ops(server):
8
- fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
9
- fs = fsspec.filesystem("generic", default_method="current")
10
- out = fs.info(server + "/index/realfile")
11
- assert out["size"] == len(data)
12
- assert out["type"] == "file"
13
- assert fs.isfile(server + "/index/realfile") # this method from superclass
14
-
15
-
16
- def test_touch_rm(m):
17
- m.touch("afile")
18
- m.touch("dir/afile")
19
-
20
- fs = fsspec.filesystem("generic", default_method="current")
21
- fs.rm("memory://afile")
22
- assert not m.exists("afile")
23
-
24
- fs.rm("memory://dir", recursive=True)
25
- assert not m.exists("dir/afile")
26
- assert not m.exists("dir")
27
-
28
-
29
- def test_cp_async_to_sync(server, m):
30
- fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
31
- fs = fsspec.filesystem("generic", default_method="current")
32
- fs.cp([server + "/index/realfile"], ["memory://realfile"])
33
- assert m.cat("realfile") == data
34
-
35
- fs.rm("memory://realfile")
36
- assert not m.exists("realfile")
37
-
38
-
39
- def test_pipe_cat_sync(m):
40
- fs = fsspec.filesystem("generic", default_method="current")
41
- fs.pipe("memory://afile", b"data")
42
- assert fs.cat("memory://afile") == b"data"
43
-
44
-
45
- def test_cat_async(server):
46
- fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
47
- fs = fsspec.filesystem("generic", default_method="current")
48
- assert fs.cat(server + "/index/realfile") == data
49
-
50
-
51
- def test_rsync(tmpdir, m):
52
- from fsspec.generic import GenericFileSystem, rsync
53
-
54
- fs = GenericFileSystem()
55
- fs.pipe("memory:///deep/path/afile", b"data1")
56
- fs.pipe("memory:///deep/afile", b"data2")
57
-
58
- with pytest.raises(ValueError):
59
- rsync("memory:///deep/afile", f"file://{tmpdir}")
60
- rsync("memory://", f"file://{tmpdir}")
61
-
62
- allfiles = fs.find(f"file://{tmpdir}", withdirs=True, detail=True)
63
- pos_tmpdir = fsspec.implementations.local.make_path_posix(str(tmpdir)) # for WIN
64
- assert set(allfiles) == {
65
- f"file://{pos_tmpdir}{_}"
66
- for _ in [
67
- "",
68
- "/deep",
69
- "/deep/path",
70
- "/deep/path/afile",
71
- "/deep/afile",
72
- ]
73
- }
74
- fs.rm("memory:///deep/afile")
75
- rsync("memory://", f"file://{tmpdir}", delete_missing=True)
76
- allfiles2 = fs.find(f"file://{tmpdir}", withdirs=True, detail=True)
77
- assert set(allfiles2) == {
78
- f"file://{pos_tmpdir}{_}"
79
- for _ in [
80
- "",
81
- "/deep",
82
- "/deep/path",
83
- "/deep/path/afile",
84
- ]
85
- }
86
- # the file was not updated, since size was correct
87
- assert (
88
- allfiles[f"file://{pos_tmpdir}/deep/path/afile"]
89
- == allfiles2[f"file://{pos_tmpdir}/deep/path/afile"]
90
- )
fsspec/tests/test_gui.py DELETED
@@ -1,23 +0,0 @@
1
- import pytest
2
-
3
- panel = pytest.importorskip("panel")
4
-
5
-
6
- def test_basic():
7
- import fsspec.gui
8
-
9
- gui = fsspec.gui.FileSelector()
10
- assert "url" in str(gui.panel)
11
-
12
-
13
- def test_kwargs(tmpdir):
14
- """confirm kwargs are passed to the filesystem instance"""
15
- import fsspec.gui
16
-
17
- gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs="{'auto_mkdir': True}")
18
-
19
- assert gui.fs.auto_mkdir
20
-
21
- gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs={"auto_mkdir": True})
22
-
23
- assert gui.fs.auto_mkdir
fsspec/tests/test_mapping.py DELETED
@@ -1,228 +0,0 @@
1
- import os
2
- import pickle
3
- import platform
4
- import sys
5
- import uuid
6
-
7
- import pytest
8
-
9
- import fsspec
10
- from fsspec.implementations.local import LocalFileSystem
11
- from fsspec.implementations.memory import MemoryFileSystem
12
-
13
-
14
- def test_mapping_prefix(tmpdir):
15
- tmpdir = str(tmpdir)
16
- os.makedirs(os.path.join(tmpdir, "afolder"))
17
- open(os.path.join(tmpdir, "afile"), "w").write("test")
18
- open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
19
-
20
- m = fsspec.get_mapper(f"file://{tmpdir}")
21
- assert "afile" in m
22
- assert m["afolder/anotherfile"] == b"test2"
23
-
24
- fs = fsspec.filesystem("file")
25
- m2 = fs.get_mapper(tmpdir)
26
- m3 = fs.get_mapper(f"file://{tmpdir}")
27
-
28
- assert m == m2 == m3
29
-
30
-
31
- def test_getitems_errors(tmpdir):
32
- tmpdir = str(tmpdir)
33
- os.makedirs(os.path.join(tmpdir, "afolder"))
34
- open(os.path.join(tmpdir, "afile"), "w").write("test")
35
- open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
36
- m = fsspec.get_mapper(f"file://{tmpdir}")
37
- assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
38
- with pytest.raises(KeyError):
39
- m.getitems(["afile", "bfile"])
40
- out = m.getitems(["afile", "bfile"], on_error="return")
41
- assert isinstance(out["bfile"], KeyError)
42
- m = fsspec.get_mapper(f"file://{tmpdir}", missing_exceptions=())
43
- assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
44
- with pytest.raises(FileNotFoundError):
45
- m.getitems(["afile", "bfile"])
46
-
47
-
48
- def test_ops():
49
- MemoryFileSystem.store.clear()
50
- m = fsspec.get_mapper("memory://")
51
- assert not m
52
- assert list(m) == []
53
-
54
- with pytest.raises(KeyError):
55
- m["hi"]
56
-
57
- assert m.pop("key", 0) == 0
58
-
59
- m["key0"] = b"data"
60
- assert list(m) == ["key0"]
61
- assert m["key0"] == b"data"
62
-
63
- m.clear()
64
-
65
- assert list(m) == []
66
-
67
-
68
- def test_pickle():
69
- m = fsspec.get_mapper("memory://")
70
- assert isinstance(m.fs, MemoryFileSystem)
71
- m["key"] = b"data"
72
- m2 = pickle.loads(pickle.dumps(m))
73
- assert list(m) == list(m2)
74
- assert m.missing_exceptions == m2.missing_exceptions
75
-
76
-
77
- def test_keys_view():
78
- # https://github.com/fsspec/filesystem_spec/issues/186
79
- m = fsspec.get_mapper("memory://")
80
- m["key"] = b"data"
81
-
82
- keys = m.keys()
83
- assert len(keys) == 1
84
- # check that we don't consume the keys
85
- assert len(keys) == 1
86
- m.clear()
87
-
88
-
89
- def test_multi():
90
- m = fsspec.get_mapper("memory:///")
91
- data = {"a": b"data1", "b": b"data2"}
92
- m.setitems(data)
93
-
94
- assert m.getitems(list(data)) == data
95
- m.delitems(list(data))
96
- assert not list(m)
97
-
98
-
99
- def test_setitem_types():
100
- import array
101
-
102
- m = fsspec.get_mapper("memory://")
103
- m["a"] = array.array("i", [1])
104
- if sys.byteorder == "little":
105
- assert m["a"] == b"\x01\x00\x00\x00"
106
- else:
107
- assert m["a"] == b"\x00\x00\x00\x01"
108
- m["b"] = bytearray(b"123")
109
- assert m["b"] == b"123"
110
- m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")})
111
- if sys.byteorder == "little":
112
- assert m["c"] == b"\x01\x00\x00\x00"
113
- else:
114
- assert m["c"] == b"\x00\x00\x00\x01"
115
- assert m["d"] == b"123"
116
-
117
-
118
- def test_setitem_numpy():
119
- m = fsspec.get_mapper("memory://")
120
- np = pytest.importorskip("numpy")
121
- m["c"] = np.array(1, dtype="<i4") # scalar
122
- assert m["c"] == b"\x01\x00\x00\x00"
123
- m["c"] = np.array([1, 2], dtype="<i4") # array
124
- assert m["c"] == b"\x01\x00\x00\x00\x02\x00\x00\x00"
125
- m["c"] = np.array(
126
- np.datetime64("2000-01-01T23:59:59.999999999"), dtype="<M8[ns]"
127
- ) # datetime64 scalar
128
- assert m["c"] == b"\xff\xff\x91\xe3c\x9b#\r"
129
- m["c"] = np.array(
130
- [
131
- np.datetime64("1900-01-01T23:59:59.999999999"),
132
- np.datetime64("2000-01-01T23:59:59.999999999"),
133
- ],
134
- dtype="<M8[ns]",
135
- ) # datetime64 array
136
- assert m["c"] == b"\xff\xff}p\xf8fX\xe1\xff\xff\x91\xe3c\x9b#\r"
137
- m["c"] = np.array(
138
- np.timedelta64(3155673612345678901, "ns"), dtype="<m8[ns]"
139
- ) # timedelta64 scalar
140
- assert m["c"] == b"5\x1c\xf0Rn4\xcb+"
141
- m["c"] = np.array(
142
- [
143
- np.timedelta64(450810516049382700, "ns"),
144
- np.timedelta64(3155673612345678901, "ns"),
145
- ],
146
- dtype="<m8[ns]",
147
- ) # timedelta64 scalar
148
- assert m["c"] == b',M"\x9e\xc6\x99A\x065\x1c\xf0Rn4\xcb+'
149
-
150
-
151
- def test_empty_url():
152
- m = fsspec.get_mapper()
153
- assert isinstance(m.fs, LocalFileSystem)
154
-
155
-
156
- def test_fsmap_access_with_root_prefix(tmp_path):
157
- # "/a" and "a" are the same for LocalFileSystem
158
- tmp_path.joinpath("a").write_bytes(b"data")
159
- m = fsspec.get_mapper(f"file://{tmp_path}")
160
- assert m["/a"] == m["a"] == b"data"
161
-
162
- # "/a" and "a" differ for MemoryFileSystem
163
- m = fsspec.get_mapper(f"memory://{uuid.uuid4()}")
164
- m["/a"] = b"data"
165
-
166
- assert m["/a"] == b"data"
167
- with pytest.raises(KeyError):
168
- _ = m["a"]
169
-
170
-
171
- @pytest.mark.parametrize(
172
- "key",
173
- [
174
- pytest.param(b"k", id="bytes"),
175
- pytest.param(1234, id="int"),
176
- pytest.param((1,), id="tuple"),
177
- pytest.param([""], id="list"),
178
- ],
179
- )
180
- def test_fsmap_non_str_keys(key):
181
- m = fsspec.get_mapper()
182
-
183
- # Once the deprecation period passes
184
- # FSMap.__getitem__ should raise TypeError for non-str keys
185
- # with pytest.raises(TypeError):
186
- # _ = m[key]
187
-
188
- with pytest.warns(DeprecationWarning):
189
- with pytest.raises(KeyError):
190
- _ = m[key]
191
-
192
-
193
- def test_fsmap_error_on_protocol_keys():
194
- root = uuid.uuid4()
195
- m = fsspec.get_mapper(f"memory://{root}", create=True)
196
- m["a"] = b"data"
197
-
198
- assert m["a"] == b"data"
199
- with pytest.raises(KeyError):
200
- _ = m[f"memory://{root}/a"]
201
-
202
-
203
- def test_fsmap_access_with_suffix(tmp_path):
204
- tmp_path.joinpath("b").mkdir()
205
- tmp_path.joinpath("b", "a").write_bytes(b"data")
206
- if platform.system() == "Windows":
207
- # on Windows opening a directory will raise PermissionError
208
- # see: https://bugs.python.org/issue43095
209
- missing_exceptions = (
210
- FileNotFoundError,
211
- IsADirectoryError,
212
- NotADirectoryError,
213
- PermissionError,
214
- )
215
- else:
216
- missing_exceptions = None
217
- m = fsspec.get_mapper(f"file://{tmp_path}", missing_exceptions=missing_exceptions)
218
- with pytest.raises(KeyError):
219
- _ = m["b/"]
220
- assert m["b/a/"] == b"data"
221
-
222
-
223
- def test_fsmap_dirfs():
224
- m = fsspec.get_mapper("memory://")
225
-
226
- fs = m.dirfs
227
- assert isinstance(fs, fsspec.implementations.dirfs.DirFileSystem)
228
- assert fs.path == m.root
fsspec/tests/test_parquet.py DELETED
@@ -1,140 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
- try:
6
- import fastparquet
7
- except ImportError:
8
- fastparquet = None
9
- try:
10
- import pyarrow.parquet as pq
11
- except ImportError:
12
- pq = None
13
-
14
- from fsspec.core import url_to_fs
15
- from fsspec.parquet import _get_parquet_byte_ranges, open_parquet_file
16
-
17
- # Define `engine` fixture
18
- FASTPARQUET_MARK = pytest.mark.skipif(not fastparquet, reason="fastparquet not found")
19
- PYARROW_MARK = pytest.mark.skipif(not pq, reason="pyarrow not found")
20
- ANY_ENGINE_MARK = pytest.mark.skipif(
21
- not (fastparquet or pq),
22
- reason="No parquet engine (fastparquet or pyarrow) found",
23
- )
24
-
25
-
26
- @pytest.fixture(
27
- params=[
28
- pytest.param("fastparquet", marks=FASTPARQUET_MARK),
29
- pytest.param("pyarrow", marks=PYARROW_MARK),
30
- pytest.param("auto", marks=ANY_ENGINE_MARK),
31
- ]
32
- )
33
- def engine(request):
34
- return request.param
35
-
36
-
37
- @pytest.mark.parametrize("columns", [None, ["x"], ["x", "y"], ["z"]])
38
- @pytest.mark.parametrize("max_gap", [0, 64])
39
- @pytest.mark.parametrize("max_block", [64, 256_000_000])
40
- @pytest.mark.parametrize("footer_sample_size", [8, 1_000])
41
- @pytest.mark.parametrize("range_index", [True, False])
42
- def test_open_parquet_file(
43
- tmpdir, engine, columns, max_gap, max_block, footer_sample_size, range_index
44
- ):
45
- # Pandas required for this test
46
- pd = pytest.importorskip("pandas")
47
-
48
- # Write out a simple DataFrame
49
- path = os.path.join(str(tmpdir), "test.parquet")
50
- nrows = 40
51
- df = pd.DataFrame(
52
- {
53
- "x": [i * 7 % 5 for i in range(nrows)],
54
- "y": [[0, i] for i in range(nrows)], # list
55
- "z": [{"a": i, "b": "cat"} for i in range(nrows)], # struct
56
- },
57
- index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
58
- )
59
- if range_index:
60
- df = df.reset_index(drop=True)
61
- df.index.name = "myindex"
62
- df.to_parquet(path)
63
-
64
- # "Traditional read" (without `open_parquet_file`)
65
- expect = pd.read_parquet(path, columns=columns)
66
-
67
- # Use `_get_parquet_byte_ranges` to re-write a
68
- # place-holder file with all bytes NOT required
69
- # to read `columns` set to b"0". The purpose of
70
- # this step is to make sure the read will fail
71
- # if the correct bytes have not been accurately
72
- # selected by `_get_parquet_byte_ranges`. If this
73
- # test were reading from remote storage, we would
74
- # not need this logic to capture errors.
75
- fs = url_to_fs(path)[0]
76
- data = _get_parquet_byte_ranges(
77
- [path],
78
- fs,
79
- columns=columns,
80
- engine=engine,
81
- max_gap=max_gap,
82
- max_block=max_block,
83
- footer_sample_size=footer_sample_size,
84
- )[path]
85
- file_size = fs.size(path)
86
- with open(path, "wb") as f:
87
- f.write(b"0" * file_size)
88
-
89
- if footer_sample_size == 8:
90
- # We know 8 bytes is too small to include
91
- # the footer metadata, so there should NOT
92
- # be a key for the last 8 bytes of the file
93
- bad_key = (file_size - 8, file_size)
94
- assert bad_key not in data.keys()
95
-
96
- for (start, stop), byte_data in data.items():
97
- f.seek(start)
98
- f.write(byte_data)
99
-
100
- # Read back the modified file with `open_parquet_file`
101
- with open_parquet_file(
102
- path,
103
- columns=columns,
104
- engine=engine,
105
- max_gap=max_gap,
106
- max_block=max_block,
107
- footer_sample_size=footer_sample_size,
108
- ) as f:
109
- result = pd.read_parquet(f, columns=columns)
110
-
111
- # Check that `result` matches `expect`
112
- pd.testing.assert_frame_equal(expect, result)
113
-
114
- # Try passing metadata
115
- if engine == "fastparquet":
116
- # Should work fine for "fastparquet"
117
- pf = fastparquet.ParquetFile(path)
118
- with open_parquet_file(
119
- path,
120
- metadata=pf,
121
- columns=columns,
122
- engine=engine,
123
- max_gap=max_gap,
124
- max_block=max_block,
125
- footer_sample_size=footer_sample_size,
126
- ) as f:
127
- result = pd.read_parquet(f, columns=columns)
128
- pd.testing.assert_frame_equal(expect, result)
129
- elif engine == "pyarrow":
130
- # Should raise ValueError for "pyarrow"
131
- with pytest.raises(ValueError):
132
- open_parquet_file(
133
- path,
134
- metadata=["Not-None"],
135
- columns=columns,
136
- engine=engine,
137
- max_gap=max_gap,
138
- max_block=max_block,
139
- footer_sample_size=footer_sample_size,
140
- )