fsspec 2024.3.0__py3-none-any.whl → 2024.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. fsspec/__init__.py +2 -3
  2. fsspec/_version.py +14 -19
  3. fsspec/caching.py +83 -14
  4. fsspec/compression.py +1 -0
  5. fsspec/core.py +31 -6
  6. fsspec/exceptions.py +1 -0
  7. fsspec/generic.py +1 -1
  8. fsspec/gui.py +1 -1
  9. fsspec/implementations/arrow.py +0 -2
  10. fsspec/implementations/cache_mapper.py +1 -2
  11. fsspec/implementations/cache_metadata.py +7 -7
  12. fsspec/implementations/dirfs.py +2 -2
  13. fsspec/implementations/http.py +9 -9
  14. fsspec/implementations/local.py +97 -48
  15. fsspec/implementations/memory.py +9 -0
  16. fsspec/implementations/smb.py +3 -1
  17. fsspec/implementations/tests/__init__.py +0 -0
  18. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_file_listing.yaml +112 -0
  19. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_mkdir.yaml +582 -0
  20. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_pyarrow_non_partitioned.yaml +873 -0
  21. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range.yaml +458 -0
  22. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range_chunked.yaml +1355 -0
  23. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_and_read.yaml +795 -0
  24. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_pyarrow_non_partitioned.yaml +613 -0
  25. fsspec/implementations/tests/conftest.py +39 -0
  26. fsspec/implementations/tests/local/__init__.py +0 -0
  27. fsspec/implementations/tests/local/local_fixtures.py +18 -0
  28. fsspec/implementations/tests/local/local_test.py +14 -0
  29. fsspec/implementations/tests/memory/__init__.py +0 -0
  30. fsspec/implementations/tests/memory/memory_fixtures.py +27 -0
  31. fsspec/implementations/tests/memory/memory_test.py +14 -0
  32. fsspec/implementations/tests/out.zip +0 -0
  33. fsspec/implementations/tests/test_archive.py +382 -0
  34. fsspec/implementations/tests/test_arrow.py +259 -0
  35. fsspec/implementations/tests/test_cached.py +1306 -0
  36. fsspec/implementations/tests/test_common.py +35 -0
  37. fsspec/implementations/tests/test_dask.py +29 -0
  38. fsspec/implementations/tests/test_data.py +20 -0
  39. fsspec/implementations/tests/test_dbfs.py +268 -0
  40. fsspec/implementations/tests/test_dirfs.py +588 -0
  41. fsspec/implementations/tests/test_ftp.py +178 -0
  42. fsspec/implementations/tests/test_git.py +76 -0
  43. fsspec/implementations/tests/test_http.py +577 -0
  44. fsspec/implementations/tests/test_jupyter.py +57 -0
  45. fsspec/implementations/tests/test_libarchive.py +33 -0
  46. fsspec/implementations/tests/test_local.py +1285 -0
  47. fsspec/implementations/tests/test_memory.py +382 -0
  48. fsspec/implementations/tests/test_reference.py +720 -0
  49. fsspec/implementations/tests/test_sftp.py +233 -0
  50. fsspec/implementations/tests/test_smb.py +139 -0
  51. fsspec/implementations/tests/test_tar.py +243 -0
  52. fsspec/implementations/tests/test_webhdfs.py +197 -0
  53. fsspec/implementations/tests/test_zip.py +134 -0
  54. fsspec/implementations/webhdfs.py +1 -3
  55. fsspec/mapping.py +2 -2
  56. fsspec/parquet.py +0 -8
  57. fsspec/registry.py +4 -0
  58. fsspec/spec.py +21 -4
  59. fsspec/tests/__init__.py +0 -0
  60. fsspec/tests/abstract/mv.py +57 -0
  61. fsspec/tests/conftest.py +188 -0
  62. fsspec/tests/data/listing.html +1 -0
  63. fsspec/tests/test_api.py +498 -0
  64. fsspec/tests/test_async.py +230 -0
  65. fsspec/tests/test_caches.py +255 -0
  66. fsspec/tests/test_callbacks.py +89 -0
  67. fsspec/tests/test_compression.py +164 -0
  68. fsspec/tests/test_config.py +129 -0
  69. fsspec/tests/test_core.py +466 -0
  70. fsspec/tests/test_downstream.py +40 -0
  71. fsspec/tests/test_file.py +200 -0
  72. fsspec/tests/test_fuse.py +147 -0
  73. fsspec/tests/test_generic.py +90 -0
  74. fsspec/tests/test_gui.py +23 -0
  75. fsspec/tests/test_mapping.py +228 -0
  76. fsspec/tests/test_parquet.py +140 -0
  77. fsspec/tests/test_registry.py +134 -0
  78. fsspec/tests/test_spec.py +1167 -0
  79. fsspec/tests/test_utils.py +478 -0
  80. fsspec/utils.py +0 -2
  81. fsspec-2024.5.0.dist-info/METADATA +273 -0
  82. fsspec-2024.5.0.dist-info/RECORD +111 -0
  83. {fsspec-2024.3.0.dist-info → fsspec-2024.5.0.dist-info}/WHEEL +1 -2
  84. fsspec-2024.3.0.dist-info/METADATA +0 -167
  85. fsspec-2024.3.0.dist-info/RECORD +0 -54
  86. fsspec-2024.3.0.dist-info/top_level.txt +0 -1
  87. {fsspec-2024.3.0.dist-info → fsspec-2024.5.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,35 @@
1
+ import datetime
2
+ import time
3
+
4
+ import pytest
5
+
6
+ from fsspec import AbstractFileSystem
7
+ from fsspec.implementations.tests.conftest import READ_ONLY_FILESYSTEMS
8
+
9
+
10
@pytest.mark.parametrize("fs", ["local"], indirect=["fs"])
def test_created(fs: AbstractFileSystem, temp_file):
    """Touching a file and asking for its creation time yields a datetime."""
    try:
        fs.touch(temp_file)
        timestamp = fs.created(path=temp_file)
        assert isinstance(timestamp, datetime.datetime)
    finally:
        # Only clean up on filesystems that support deletion.
        writable = not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS))
        if writable:
            fs.rm(temp_file)
19
+
20
+
21
@pytest.mark.parametrize("fs", ["local", "memory", "arrow"], indirect=["fs"])
def test_modified(fs: AbstractFileSystem, temp_file):
    """A second touch moves the modification time past an earlier baseline.

    The baseline is wall-clock UTC rather than ``fs.created`` because pyarrow
    filesystems only expose a modification time.
    """
    try:
        fs.touch(temp_file)
        # Baseline taken between the two touches; pyarrow only has "modified".
        baseline = datetime.datetime.now(tz=datetime.timezone.utc)
        time.sleep(0.05)
        fs.touch(temp_file)
        modified = fs.modified(path=temp_file)
        assert isinstance(modified, datetime.datetime)
        assert modified > baseline
    finally:
        # Guard cleanup the same way test_created does, so read-only
        # filesystems do not raise during teardown.
        if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)):
            fs.rm(temp_file)
@@ -0,0 +1,29 @@
1
+ import pytest
2
+
3
+ import fsspec
4
+
5
+ pytest.importorskip("distributed")
6
+
7
+
8
@pytest.fixture()
def cli(tmpdir):
    """Yield a one-worker dask client whose worker holds an in-memory file."""
    import dask.distributed

    client = dask.distributed.Client(n_workers=1)

    def seed_memory_fs():
        # Runs on the worker: write the fixture file into its memory FS.
        mem = fsspec.filesystem("memory")
        with mem.open("afile", "wb") as fh:
            fh.write(b"data")

    client.run(seed_memory_fs)
    try:
        yield client
    finally:
        client.shutdown()
24
+
25
+
26
def test_basic(cli):
    """The dask filesystem proxies listing and reads to the worker's memory FS."""
    fs = fsspec.filesystem("dask", target_protocol="memory")
    listing = fs.ls("", detail=False)
    assert listing == ["/afile"]
    contents = fs.cat("/afile")
    assert contents == b"data"
@@ -0,0 +1,20 @@
1
+ import fsspec
2
+
3
+
4
def test_1():
    """Base64- and percent-encoded data URLs both decode to the same bytes."""
    urls = (
        "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==",
        "data:,Hello%2C%20World%21",
    )
    for url in urls:
        with fsspec.open(url) as fh:
            assert fh.read() == b"Hello, World!"
10
+
11
+
12
def test_info():
    """info() on a data URL reports name, decoded size, type and mimetype."""
    fs = fsspec.filesystem("data")
    expected = {
        "name": "%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E",
        "size": 22,
        "type": "file",
        "mimetype": "text/html",
    }
    info = fs.info("data:text/html,%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E")
    assert info == expected
@@ -0,0 +1,268 @@
1
+ """
2
+ Test-Cases for the DataBricks Filesystem.
3
+ This test case is somewhat special, as there is no "mock" databricks
4
+ API available. We use the [vcr](https://github.com/kevin1024/vcrpy)
5
+ package to record the requests and responses to the real databricks API and
6
+ replay them on tests.
7
+
8
+ This however means, that when you change the tests (or when the API
9
+ itself changes, which is very unlikely to occur as it is versioned),
10
+ you need to re-record the answers. This can be done as follows:
11
+
12
+ 1. Delete all cassette files in the "./cassettes/test_dbfs" folder
13
+ 2. Spin up a databricks cluster. For example,
14
+ you can use an Azure Databricks instance for this.
15
+ 3. Take note of the instance details (the instance URL. For example for an Azure
16
+ databricks cluster, this has the form
17
+ adb-<some-number>.<two digits>.azuredatabricks.net)
18
+ and your personal token (Find out more here:
19
+ https://docs.databricks.com/dev-tools/api/latest/authentication.html)
20
+ 4. Set the two environment variables `DBFS_INSTANCE` and `DBFS_TOKEN`
21
+ 5. Now execute the tests as normal. The results of the API calls will be recorded.
22
+ 6. Unset the environment variables and replay the tests.
23
+ """
24
+
25
+ import os
26
+ import sys
27
+ from urllib.parse import urlparse
28
+
29
+ import numpy
30
+ import pytest
31
+
32
+ import fsspec
33
+
34
+ if sys.version_info >= (3, 10):
35
+ pytest.skip("These tests need to be re-recorded.", allow_module_level=True)
36
+
37
+ DUMMY_INSTANCE = "my_instance.com"
38
+ INSTANCE = os.getenv("DBFS_INSTANCE", DUMMY_INSTANCE)
39
+ TOKEN = os.getenv("DBFS_TOKEN", "")
40
+
41
+
42
@pytest.fixture(scope="module")
def vcr_config():
    """
    Build the VCR configuration used to record/replay DBFS API calls.

    Sensitive details (instance URL, token, org id) and the volatile date
    header are scrubbed before anything is written to a cassette.  With
    DBFS_TOKEN set we record ("once"); without it we strictly replay
    ("none") so a wrong URL can never be recorded by accident.
    """

    def scrub_response(response):
        # Drop org id and date headers; order matters — if the org id is
        # absent the date is left alone, mirroring the original behaviour.
        try:
            del response["headers"]["x-databricks-org-id"]
            del response["headers"]["date"]
        except KeyError:
            pass
        return response

    def scrub_request(request):
        # Swap the real instance host for the dummy one.
        parts = urlparse(request.uri)
        request.uri = parts._replace(netloc=DUMMY_INSTANCE).geturl()
        return request

    if not TOKEN:
        return {"record_mode": "none"}
    return {
        "record_mode": "once",
        "filter_headers": [("authorization", "DUMMY")],
        "before_record_response": scrub_response,
        "before_record_request": scrub_request,
    }
81
+
82
+
83
@pytest.fixture
def dbfsFS():
    """A DBFS filesystem pointed at the configured (or dummy) instance."""
    return fsspec.filesystem("dbfs", instance=INSTANCE, token=TOKEN)
88
+
89
+
90
@pytest.fixture
def make_mock_diabetes_ds():
    """Build a 25-row random pyarrow table shaped like the diabetes dataset."""
    pa = pytest.importorskip("pyarrow")

    rng = numpy.random
    rows = 25
    # NOTE(review): randint's upper bound is exclusive, so "Outcome" is
    # always 0 here — presumably high=2 was intended; the tests below only
    # check shape/schema, so the values do not matter.
    columns = [
        ("Pregnancies", pa.array(rng.randint(low=0, high=17, size=rows))),
        ("Glucose", pa.array(rng.randint(low=0, high=199, size=rows))),
        ("BloodPressure", pa.array(rng.randint(low=0, high=122, size=rows))),
        ("SkinThickness", pa.array(rng.randint(low=0, high=99, size=rows))),
        ("Insulin", pa.array(rng.randint(low=0, high=846, size=rows))),
        ("BMI", pa.array(rng.uniform(0.0, 67.1, size=rows))),
        (
            "DiabetesPedigreeFunction",
            pa.array(rng.uniform(0.08, 2.42, size=rows)),
        ),
        ("Age", pa.array(rng.randint(low=21, high=81, size=rows))),
        ("Outcome", pa.array(rng.randint(low=0, high=1, size=rows))),
    ]
    return pa.Table.from_arrays(
        arrays=[array for _, array in columns],
        names=[name for name, _ in columns],
    )
129
+
130
+
131
@pytest.mark.vcr()
def test_dbfs_file_listing(dbfsFS):
    """Root listing contains /FileStore, both as a name and as a detail dict."""
    names = dbfsFS.ls("/", detail=False)
    assert "/FileStore" in names
    details = dbfsFS.ls("/", detail=True)
    assert {"name": "/FileStore", "size": 0, "type": "directory"} in details
137
+
138
+
139
@pytest.mark.vcr()
def test_dbfs_mkdir(dbfsFS):
    """mkdir creates missing parents; rm refuses non-recursive tree removal."""
    parent = "/FileStore/my"
    nested = "/FileStore/my/dir"

    # Start from a clean slate.
    dbfsFS.rm(parent, recursive=True)
    assert parent not in dbfsFS.ls("/FileStore/", detail=False)

    dbfsFS.mkdir(nested, create_parents=True)
    assert parent in dbfsFS.ls("/FileStore/", detail=False)
    assert nested in dbfsFS.ls("/FileStore/my/", detail=False)

    # Re-creating an existing directory without exist_ok must fail.
    with pytest.raises(FileExistsError):
        dbfsFS.mkdir(nested, create_parents=True, exist_ok=False)

    # A non-empty directory cannot be removed non-recursively.
    with pytest.raises(OSError):
        dbfsFS.rm(parent, recursive=False)
    assert parent in dbfsFS.ls("/FileStore/", detail=False)

    dbfsFS.rm(parent, recursive=True)
    assert parent not in dbfsFS.ls("/FileStore/", detail=False)
159
+
160
+
161
@pytest.mark.vcr()
def test_dbfs_write_and_read(dbfsFS):
    """A file written through the DBFS API reads back byte-identical."""
    path = "/FileStore/file.csv"
    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)

    payload = b"This is a test\n" * 100000 + b"For this is the end\n"
    with dbfsFS.open(path, "wb") as fh:
        fh.write(payload)
    assert path in dbfsFS.ls("/FileStore", detail=False)

    with dbfsFS.open(path, "rb") as fh:
        roundtrip = fh.read()
        assert roundtrip == payload

    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)
178
+
179
+
180
@pytest.mark.vcr()
def test_dbfs_read_range(dbfsFS):
    """cat_file with start/end returns exactly that slice of the file."""
    path = "/FileStore/file.txt"
    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)

    payload = b"This is a test\n"
    with dbfsFS.open(path, "wb") as fh:
        fh.write(payload)
    assert path in dbfsFS.ls("/FileStore", detail=False)

    assert dbfsFS.cat_file(path, start=8, end=14) == payload[8:14]

    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)
191
+
192
+
193
@pytest.mark.vcr()
def test_dbfs_read_range_chunked(dbfsFS):
    """An open-ended cat_file range spans multiple download chunks correctly."""
    path = "/FileStore/large_file.txt"
    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)

    # Large enough to force chunked reads on the way back.
    payload = b"This is a test\n" * (1 * 2**18) + b"For this is the end\n"
    with dbfsFS.open(path, "wb") as fh:
        fh.write(payload)
    assert path in dbfsFS.ls("/FileStore", detail=False)

    assert dbfsFS.cat_file(path, start=8) == payload[8:]

    dbfsFS.rm(path)
    assert path not in dbfsFS.ls("/FileStore/", detail=False)
204
+
205
+
206
@pytest.mark.vcr()
def test_dbfs_write_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
    """write_to_dataset lands exactly one .parquet file under the root path.

    The listing is fetched once and reused instead of issuing three
    identical remote ls calls as before.
    """
    pytest.importorskip("pyarrow.dataset")
    pq = pytest.importorskip("pyarrow.parquet")

    dbfsFS.rm("/FileStore/pyarrow", recursive=True)
    assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)

    pq.write_to_dataset(
        make_mock_diabetes_ds,
        filesystem=dbfsFS,
        compression="none",
        existing_data_behavior="error",
        root_path="/FileStore/pyarrow/diabetes",
        use_threads=False,
    )

    # One listing call; the original repeated the same ls three times.
    entries = dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)
    assert len(entries) == 1
    assert "/FileStore/pyarrow/diabetes" in entries[0]
    assert ".parquet" in entries[0]

    dbfsFS.rm("/FileStore/pyarrow", recursive=True)
    assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)
232
+
233
+
234
@pytest.mark.vcr()
def test_dbfs_read_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
    """A dataset written through DBFS reads back with the same shape/schema.

    The listing is fetched once and reused instead of issuing three
    identical remote ls calls as before.
    """
    ds = pytest.importorskip("pyarrow.dataset")
    pq = pytest.importorskip("pyarrow.parquet")

    dbfsFS.rm("/FileStore/pyarrow", recursive=True)
    assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)

    pq.write_to_dataset(
        make_mock_diabetes_ds,
        filesystem=dbfsFS,
        compression="none",
        existing_data_behavior="error",
        root_path="/FileStore/pyarrow/diabetes",
        use_threads=False,
    )

    # One listing call; the original repeated the same ls three times.
    entries = dbfsFS.ls("/FileStore/pyarrow/diabetes", detail=False)
    assert len(entries) == 1
    assert "/FileStore/pyarrow/diabetes" in entries[0]
    assert ".parquet" in entries[0]

    arr_res = ds.dataset(
        source="/FileStore/pyarrow/diabetes",
        filesystem=dbfsFS,
    ).to_table()

    assert arr_res.num_rows == make_mock_diabetes_ds.num_rows
    assert arr_res.num_columns == make_mock_diabetes_ds.num_columns
    assert set(arr_res.schema).difference(set(make_mock_diabetes_ds.schema)) == set()

    dbfsFS.rm("/FileStore/pyarrow", recursive=True)
    assert "/FileStore/pyarrow" not in dbfsFS.ls("/FileStore/", detail=False)