fsspec 2024.3.1__py3-none-any.whl → 2024.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. fsspec/__init__.py +2 -3
  2. fsspec/_version.py +14 -19
  3. fsspec/caching.py +83 -14
  4. fsspec/compression.py +1 -0
  5. fsspec/core.py +32 -8
  6. fsspec/exceptions.py +1 -0
  7. fsspec/generic.py +1 -1
  8. fsspec/gui.py +1 -1
  9. fsspec/implementations/arrow.py +0 -2
  10. fsspec/implementations/cache_mapper.py +1 -2
  11. fsspec/implementations/cache_metadata.py +7 -7
  12. fsspec/implementations/dirfs.py +2 -2
  13. fsspec/implementations/http.py +9 -9
  14. fsspec/implementations/local.py +78 -45
  15. fsspec/implementations/memory.py +9 -0
  16. fsspec/implementations/smb.py +3 -1
  17. fsspec/implementations/tests/__init__.py +0 -0
  18. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_file_listing.yaml +112 -0
  19. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_mkdir.yaml +582 -0
  20. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_pyarrow_non_partitioned.yaml +873 -0
  21. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range.yaml +458 -0
  22. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_read_range_chunked.yaml +1355 -0
  23. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_and_read.yaml +795 -0
  24. fsspec/implementations/tests/cassettes/test_dbfs/test_dbfs_write_pyarrow_non_partitioned.yaml +613 -0
  25. fsspec/implementations/tests/conftest.py +39 -0
  26. fsspec/implementations/tests/local/__init__.py +0 -0
  27. fsspec/implementations/tests/local/local_fixtures.py +18 -0
  28. fsspec/implementations/tests/local/local_test.py +14 -0
  29. fsspec/implementations/tests/memory/__init__.py +0 -0
  30. fsspec/implementations/tests/memory/memory_fixtures.py +27 -0
  31. fsspec/implementations/tests/memory/memory_test.py +14 -0
  32. fsspec/implementations/tests/out.zip +0 -0
  33. fsspec/implementations/tests/test_archive.py +382 -0
  34. fsspec/implementations/tests/test_arrow.py +259 -0
  35. fsspec/implementations/tests/test_cached.py +1306 -0
  36. fsspec/implementations/tests/test_common.py +35 -0
  37. fsspec/implementations/tests/test_dask.py +29 -0
  38. fsspec/implementations/tests/test_data.py +20 -0
  39. fsspec/implementations/tests/test_dbfs.py +268 -0
  40. fsspec/implementations/tests/test_dirfs.py +588 -0
  41. fsspec/implementations/tests/test_ftp.py +178 -0
  42. fsspec/implementations/tests/test_git.py +76 -0
  43. fsspec/implementations/tests/test_http.py +577 -0
  44. fsspec/implementations/tests/test_jupyter.py +57 -0
  45. fsspec/implementations/tests/test_libarchive.py +33 -0
  46. fsspec/implementations/tests/test_local.py +1285 -0
  47. fsspec/implementations/tests/test_memory.py +382 -0
  48. fsspec/implementations/tests/test_reference.py +720 -0
  49. fsspec/implementations/tests/test_sftp.py +233 -0
  50. fsspec/implementations/tests/test_smb.py +139 -0
  51. fsspec/implementations/tests/test_tar.py +243 -0
  52. fsspec/implementations/tests/test_webhdfs.py +197 -0
  53. fsspec/implementations/tests/test_zip.py +134 -0
  54. fsspec/implementations/webhdfs.py +1 -3
  55. fsspec/parquet.py +0 -8
  56. fsspec/registry.py +4 -0
  57. fsspec/spec.py +21 -4
  58. fsspec/tests/__init__.py +0 -0
  59. fsspec/tests/abstract/mv.py +57 -0
  60. fsspec/tests/conftest.py +188 -0
  61. fsspec/tests/data/listing.html +1 -0
  62. fsspec/tests/test_api.py +498 -0
  63. fsspec/tests/test_async.py +230 -0
  64. fsspec/tests/test_caches.py +255 -0
  65. fsspec/tests/test_callbacks.py +89 -0
  66. fsspec/tests/test_compression.py +164 -0
  67. fsspec/tests/test_config.py +129 -0
  68. fsspec/tests/test_core.py +466 -0
  69. fsspec/tests/test_downstream.py +40 -0
  70. fsspec/tests/test_file.py +200 -0
  71. fsspec/tests/test_fuse.py +147 -0
  72. fsspec/tests/test_generic.py +90 -0
  73. fsspec/tests/test_gui.py +23 -0
  74. fsspec/tests/test_mapping.py +228 -0
  75. fsspec/tests/test_parquet.py +140 -0
  76. fsspec/tests/test_registry.py +134 -0
  77. fsspec/tests/test_spec.py +1167 -0
  78. fsspec/tests/test_utils.py +478 -0
  79. fsspec/utils.py +0 -2
  80. fsspec-2024.5.0.dist-info/METADATA +273 -0
  81. fsspec-2024.5.0.dist-info/RECORD +111 -0
  82. {fsspec-2024.3.1.dist-info → fsspec-2024.5.0.dist-info}/WHEEL +1 -2
  83. fsspec-2024.3.1.dist-info/METADATA +0 -167
  84. fsspec-2024.3.1.dist-info/RECORD +0 -54
  85. fsspec-2024.3.1.dist-info/top_level.txt +0 -1
  86. {fsspec-2024.3.1.dist-info → fsspec-2024.5.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,720 @@
1
+ import json
2
+ import os
3
+
4
+ import pytest
5
+
6
+ import fsspec
7
+ from fsspec.implementations.local import LocalFileSystem
8
+ from fsspec.implementations.reference import (
9
+ LazyReferenceMapper,
10
+ ReferenceFileSystem,
11
+ ReferenceNotReachable,
12
+ )
13
+ from fsspec.tests.conftest import data, realfile, reset_files, server, win # noqa: F401
14
+
15
+
16
def test_simple(server):  # noqa: F811
    """Read inline, ranged and base64 references through ``cat`` and ``open``."""
    references = {
        "a": b"data",
        "b": (realfile, 0, 5),
        "c": (realfile, 1, 5),
        "d": b"base64:aGVsbG8=",
    }
    http_fs = fsspec.filesystem("http")
    ref_fs = fsspec.filesystem("reference", fo=references, fs=http_fs)

    expected = {
        "a": b"data",
        "b": data[:5],
        "c": data[1 : 1 + 5],
        "d": b"hello",
    }
    for key, content in expected.items():
        assert ref_fs.cat(key) == content

    # Text-mode open of the base64 entry yields the decoded string.
    with ref_fs.open("d", "rt") as text_file:
        assert text_file.read(2) == "he"
32
+
33
+
34
def test_target_options(m):
    """Load a gzip-compressed reference JSON by passing ``target_options``."""
    m.pipe("data/0", b"hello")
    ref_url = "memory://refs.json.gz"
    with fsspec.open(ref_url, "wt", compression="gzip") as out:
        json.dump({"a": ["memory://data/0"]}, out)

    ref_fs = fsspec.filesystem(
        "reference", fo=ref_url, target_options={"compression": "gzip"}
    )
    assert ref_fs.cat("a") == b"hello"
43
+
44
+
45
def test_ls(server):  # noqa: F811
    """Check ``ls`` and ``find`` output, including the implied directory ``c``."""
    references = {"a": b"data", "b": (realfile, 0, 5), "c/d": (realfile, 1, 6)}
    http_fs = fsspec.filesystem("http")
    ref_fs = fsspec.filesystem("reference", fo=references, fs=http_fs)

    assert ref_fs.ls("", detail=False) == ["a", "b", "c"]

    dir_entry = {"name": "c", "type": "directory", "size": 0}
    assert dir_entry in ref_fs.ls("", detail=True)

    assert ref_fs.find("") == ["a", "b", "c/d"]
    assert ref_fs.find("", withdirs=True) == ["a", "b", "c", "c/d"]

    nested = {"c/d": {"name": "c/d", "size": 6, "type": "file"}}
    assert ref_fs.find("c", detail=True) == nested
57
+
58
+
59
def test_nested_dirs_ls():
    """List only direct children when directories are nested (issue #1430)."""
    references = {"a": "A", "B/C/b": "B", "B/C/d": "d", "B/_": "_"}
    ref_fs = fsspec.filesystem("reference", fo=references)

    def names(path):
        return {entry["name"] for entry in ref_fs.ls(path)}

    assert len(ref_fs.ls("")) == 2
    assert names("") == {"a", "B"}
    assert len(ref_fs.ls("B")) == 2
    assert names("B") == {"B/C", "B/_"}
67
+
68
+
69
def test_info(server):  # noqa: F811
    """Check reported sizes for inline, ranged and whole-file references."""
    references = {
        "a": b"data",
        "b": (realfile, 0, 5),
        "c/d": (realfile, 1, 6),
        "e": (realfile,),
    }
    http_fs = fsspec.filesystem(
        "http", headers={"give_length": "true", "head_ok": "true"}
    )
    ref_fs = fsspec.filesystem("reference", fo=references, fs=http_fs)

    for key, nbytes in (("a", 4), ("b", 5), ("c/d", 6)):
        assert ref_fs.size(key) == nbytes

    # "e" carries no offset/length of its own in the reference set.
    assert ref_fs.info("e")["size"] == len(data)
82
+
83
+
84
def test_mutable(server, m):
    """Remove and add references, save them as JSON, then reload and re-check."""
    references = {
        "a": b"data",
        "b": (realfile, 0, 5),
        "c/d": (realfile, 1, 6),
        "e": (realfile,),
    }
    http_fs = fsspec.filesystem(
        "http", headers={"give_length": "true", "head_ok": "true"}
    )
    ref_fs = fsspec.filesystem("reference", fo=references, fs=http_fs)

    ref_fs.rm("a")
    assert not ref_fs.exists("a")

    payload = b"bin data"
    ref_fs.pipe("aa", payload)
    assert ref_fs.cat("aa") == payload

    ref_fs.save_json("memory://refs.json")
    assert m.exists("refs.json")

    # A filesystem built from the saved JSON must show the same edits.
    reloaded = fsspec.filesystem(
        "reference", fo="memory://refs.json", remote_protocol="http"
    )
    assert not reloaded.exists("a")
    assert reloaded.cat("aa") == payload
106
+
107
+
108
def test_put_get(tmpdir):
    """Round-trip a local directory through a reference filesystem.

    Two files are written under ``d1``, uploaded recursively with ``put``,
    downloaded recursively into ``d2`` with ``get`` and compared.
    """
    d1 = f"{tmpdir}/d1"
    os.mkdir(d1)
    with open(f"{d1}/a", "wb") as f:
        f.write(b"1")
    with open(f"{d1}/b", "wb") as f:
        f.write(b"2")
    d2 = f"{tmpdir}/d2"

    fs = fsspec.filesystem("reference", fo={}, remote_protocol="file")
    fs.put(d1, "out", recursive=True)

    fs.get("out", d2, recursive=True)
    # Read back inside ``with`` blocks so the handles are closed immediately
    # rather than whenever the garbage collector gets to them.
    with open(f"{d2}/a", "rb") as f:
        assert f.read() == b"1"
    with open(f"{d2}/b", "rb") as f:
        assert f.read() == b"2"
123
+
124
+
125
def test_put_get_single(tmpdir):
    """Round-trip a single file with ``put_file``/``get_file``, then pipe a dict."""
    d1 = f"{tmpdir}/f1"
    d2 = f"{tmpdir}/f2"
    with open(d1, "wb") as f:
        f.write(b"1")

    # skip instance cache since this is the same kwargs as previous test
    fs = fsspec.filesystem(
        "reference", fo={}, remote_protocol="file", skip_instance_cache=True
    )
    fs.put_file(d1, "out")

    fs.get_file("out", d2)
    # Read back inside ``with`` so the handle is closed deterministically.
    with open(d2, "rb") as f:
        assert f.read() == b"1"

    fs.pipe({"hi": b"data"})
    assert fs.cat("hi") == b"data"
141
+
142
+
143
def test_defaults(server):  # noqa: F811
    """Resolve a reference whose URL is ``None`` using the ``target`` argument."""
    references = {"a": b"data", "b": (None, 0, 5)}
    ref_fs = fsspec.filesystem(
        "reference",
        fo=references,
        target_protocol="http",
        target=realfile,
        remote_protocol="http",
    )

    inline = ref_fs.cat("a")
    ranged = ref_fs.cat("b")
    assert inline == b"data"
    assert ranged == data[:5]
155
+
156
+
157
+ # Sample consolidated-zarr style metadata document; no test in this module section references it.
+ jdata = """{
158
+ "metadata": {
159
+ ".zattrs": {
160
+ "Conventions": "UGRID-0.9.0"
161
+ },
162
+ ".zgroup": {
163
+ "zarr_format": 2
164
+ },
165
+ "adcirc_mesh/.zarray": {
166
+ "chunks": [
167
+ 1
168
+ ],
169
+ "dtype": "<i4",
170
+ "shape": [
171
+ 1
172
+ ],
173
+ "zarr_format": 2
174
+ },
175
+ "adcirc_mesh/.zattrs": {
176
+ "_ARRAY_DIMENSIONS": [
177
+ "mesh"
178
+ ],
179
+ "cf_role": "mesh_topology"
180
+ },
181
+ "adcirc_mesh/.zchunkstore": {
182
+ "adcirc_mesh/0": {
183
+ "offset": 8928,
184
+ "size": 4
185
+ },
186
+ "source": {
187
+ "array_name": "/adcirc_mesh",
188
+ "uri": "https://url"
189
+ }
190
+ }
191
+ },
192
+ "zarr_consolidated_format": 1
193
+ }
194
+ """
195
+
196
+
197
def test_spec1_expand():
    """Expand a version-1 spec with jinja templates and ``gen`` sections."""
    pytest.importorskip("jinja2")

    ranged_gen = {
        "key": "gen_key{{i}}",
        "url": "http://{{u}}_{{i}}",
        "offset": "{{(i + 1) * 1000}}",
        "length": "1000",
        "dimensions": {"i": {"stop": 5}},
    }
    url_only_gen = {
        "key": "gen_key{{i}}",
        "url": "http://{{u}}_{{i}}",
        "dimensions": {"i": {"start": 5, "stop": 7}},
    }
    spec = {
        "version": 1,
        "templates": {"u": "server.domain/path", "f": "{{c}}"},
        "gen": [ranged_gen, url_only_gen],
        "refs": {
            "key0": "data",
            "key1": ["http://target_url", 10000, 100],
            "key2": ["http://{{u}}", 10000, 100],
            "key3": ["http://{{f(c='text')}}", 10000, 100],
            "key4": ["http://target_url"],
        },
    }

    ref_fs = fsspec.filesystem(
        "reference", fo=spec, target_protocol="http", simple_templates=False
    )

    expected = {
        "key0": "data",
        "key1": ["http://target_url", 10000, 100],
        "key2": ["http://server.domain/path", 10000, 100],
        "key3": ["http://text", 10000, 100],
        "key4": ["http://target_url"],
        "gen_key0": ["http://server.domain/path_0", 1000, 1000],
        "gen_key1": ["http://server.domain/path_1", 2000, 1000],
        "gen_key2": ["http://server.domain/path_2", 3000, 1000],
        "gen_key3": ["http://server.domain/path_3", 4000, 1000],
        "gen_key4": ["http://server.domain/path_4", 5000, 1000],
        "gen_key5": ["http://server.domain/path_5"],
        "gen_key6": ["http://server.domain/path_6"],
    }
    assert ref_fs.references == expected
241
+
242
+
243
def test_spec1_expand_simple():
    """Expand templates on the default path and honour ``template_overrides``."""
    pytest.importorskip("jinja2")
    spec = {
        "version": 1,
        "templates": {"u": "server.domain/path"},
        "refs": {
            "key0": "base64:ZGF0YQ==",
            "key2": ["http://{{u}}", 10000, 100],
            "key4": ["http://target_url"],
        },
    }

    plain = fsspec.filesystem("reference", fo=spec, target_protocol="http")
    assert plain.references["key2"] == ["http://server.domain/path", 10000, 100]

    overridden = fsspec.filesystem(
        "reference",
        fo=spec,
        target_protocol="http",
        template_overrides={"u": "not.org/p"},
    )
    assert overridden.references["key2"] == ["http://not.org/p", 10000, 100]
    assert overridden.cat("key0") == b"data"
264
+
265
+
266
def test_spec1_gen_variants():
    """Reject ``gen`` entries with only one of offset/length; accept URL-only."""
    pytest.importorskip("jinja2")

    def spec_with(gen_entry):
        # Common version-1 wrapper around a single ``gen`` entry.
        return {
            "version": 1,
            "templates": {"u": "server.domain/path"},
            "gen": [gen_entry],
        }

    missing_length_spec = spec_with(
        {
            "key": "gen_key{{i}}",
            "url": "http://{{u}}_{{i}}",
            "offset": "{{(i + 1) * 1000}}",
            "dimensions": {"i": {"stop": 2}},
        }
    )
    with pytest.raises(ValueError):
        fsspec.filesystem("reference", fo=missing_length_spec, target_protocol="http")

    missing_offset_spec = spec_with(
        {
            "key": "gen_key{{i}}",
            "url": "http://{{u}}_{{i}}",
            "length": "1000",
            "dimensions": {"i": {"stop": 2}},
        }
    )
    with pytest.raises(ValueError):
        fsspec.filesystem("reference", fo=missing_offset_spec, target_protocol="http")

    url_only_gen_spec = spec_with(
        {
            "key": "gen_key{{i}}",
            "url": "http://{{u}}_{{i}}",
            "dimensions": {"i": {"stop": 2}},
        }
    )
    ref_fs = fsspec.filesystem(
        "reference", fo=url_only_gen_spec, target_protocol="http"
    )
    assert ref_fs.references == {
        "gen_key0": ["http://server.domain/path_0"],
        "gen_key1": ["http://server.domain/path_1"],
    }
315
+
316
+
317
def test_empty():
    """A spec holding only a version number produces an empty reference set."""
    pytest.importorskip("jinja2")
    bare_spec = {"version": 1}
    ref_fs = fsspec.filesystem("reference", fo=bare_spec, target_protocol="http")
    assert ref_fs.references == {}
321
+
322
+
323
def test_get_sync(tmpdir):
    """Download references with ``get`` when backed by a local filesystem."""
    local_fs = LocalFileSystem()

    source = tmpdir / "file"
    source.write_binary(b"0123456789")
    source_path = str(source)

    references = {
        "a": b"data",
        "b": (source_path, 0, 5),
        "c/d": (source_path, 1, 6),
    }
    ref_fs = fsspec.filesystem("reference", fo=references, fs=local_fs)

    # (reference key, local file name, expected bytes)
    single_files = [
        ("a", "a", b"data"),
        ("b", "b", b"01234"),
        ("c/d", "d", b"123456"),
    ]
    for key, local_name, content in single_files:
        ref_fs.get(key, str(tmpdir / local_name))
        assert (tmpdir / local_name).read_binary() == content

    # Recursive download of the implied directory "c".
    ref_fs.get("c", str(tmpdir / "c"), recursive=True)
    assert (tmpdir / "c").isdir()
    assert (tmpdir / "c" / "d").read_binary() == b"123456"
341
+
342
+
343
def test_multi_fs_provided(m, tmpdir):
    """Resolve references via filesystem instances supplied per protocol."""
    local_fs = LocalFileSystem()

    source = tmpdir / "file"
    source.write_binary(b"0123456789")
    m.pipe("afile", b"hello")

    # local URLs are file:// by default
    file_url = f"file://{source}"
    references = {
        "a": b"data",
        "b": (file_url, 0, 5),
        "c/d": (file_url, 1, 6),
        "c/e": ["memory://afile"],
    }

    ref_fs = fsspec.filesystem(
        "reference", fo=references, fs={"file": local_fs, "memory": m}
    )
    assert ref_fs.cat("c/e") == b"hello"

    fetched = ref_fs.cat(["c/e", "a", "b"])
    assert fetched == {"a": b"data", "b": b"01234", "c/e": b"hello"}
366
+
367
+
368
def test_multi_fs_created(m, tmpdir):
    """Resolve references when ``fs`` maps each protocol to constructor kwargs."""
    source = tmpdir / "file"
    source.write_binary(b"0123456789")
    m.pipe("afile", b"hello")

    # local URLs are file:// by default
    file_url = f"file://{source}"
    references = {
        "a": b"data",
        "b": (file_url, 0, 5),
        "c/d": (file_url, 1, 6),
        "c/e": ["memory://afile"],
    }

    ref_fs = fsspec.filesystem(
        "reference", fo=references, fs={"file": {}, "memory": {}}
    )
    assert ref_fs.cat("c/e") == b"hello"

    fetched = ref_fs.cat(["c/e", "a", "b"])
    assert fetched == {"a": b"data", "b": b"01234", "c/e": b"hello"}
389
+
390
+
391
def test_missing_nonasync(m):
    """Open a zarr array whose chunks have no references and read the fill value."""
    zarr = pytest.importorskip("zarr")
    array_meta = {
        "chunks": [1],
        "compressor": None,
        "dtype": "<f8",
        "fill_value": "NaN",
        "filters": [],
        "order": "C",
        "shape": [10],
        "zarr_format": 2,
    }
    references = {".zarray": json.dumps(array_meta)}

    store = fsspec.get_mapper("reference://", fo=references, remote_protocol="memory")

    arr = zarr.open_array(store)
    assert str(arr[0]) == "nan"
409
+
410
+
411
def test_fss_has_defaults(m):
    """Check how the ``None`` (default) entry of ``fss`` is chosen per input."""
    # No hints at all: a default entry must still exist.
    ref_fs = fsspec.filesystem("reference", fo={})
    assert None in ref_fs.fss

    # Explicit remote protocol fills both the default and the named entry.
    ref_fs = fsspec.filesystem("reference", fo={}, remote_protocol="memory")
    for key in (None, "memory"):
        assert ref_fs.fss[key].protocol == "memory"

    # A single filesystem instance becomes the default.
    ref_fs = fsspec.filesystem("reference", fs=m, fo={})
    assert ref_fs.fss[None] is m

    # A dict keyed by protocol leaves the default on the local filesystem.
    ref_fs = fsspec.filesystem("reference", fs={"memory": m}, fo={})
    assert ref_fs.fss["memory"] is m
    assert ref_fs.fss[None].protocol == ("file", "local")

    # A dict may set the default explicitly.
    ref_fs = fsspec.filesystem("reference", fs={None: m}, fo={})
    assert ref_fs.fss[None] is m

    # The default is taken from the protocol of the references themselves.
    ref_fs = fsspec.filesystem("reference", fo={"key": ["memory://a"]})
    assert ref_fs.fss[None] is ref_fs.fss["memory"]

    mixed = {"key": ["memory://a"], "blah": ["path"]}
    ref_fs = fsspec.filesystem("reference", fo=mixed)
    assert ref_fs.fss[None] is ref_fs.fss["memory"]
434
+
435
+
436
def test_merging(m):
    """Fetch several ranged and whole-file references in one ``cat`` call."""
    m.pipe("/a", b"test data")
    second = b"other test data"
    m.pipe("/b", second)

    references = {
        "a": ["memory://a", 1, 1],
        "b": ["memory://a", 2, 1],
        "c": ["memory://b"],
        "d": ["memory://b", 4, 6],
    }
    ref_fs = fsspec.filesystem("reference", fo=references)

    fetched = ref_fs.cat(["a", "b", "c", "d"])
    expected = {"a": b"e", "b": b"s", "c": second, "d": second[4:10]}
    assert fetched == expected
451
+
452
+
453
def test_cat_file_ranges(m):
    """Apply ``start``/``end`` in ``cat_file`` relative to the referenced bytes."""
    blob = b"other test data"
    m.pipe("/b", blob)

    references = {
        "c": ["memory://b"],
        "d": ["memory://b", 4, 6],
    }
    ref_fs = fsspec.filesystem("reference", fo=references)

    # "c" refers to the whole file.
    assert ref_fs.cat_file("c") == blob
    assert ref_fs.cat_file("c", start=1) == blob[1:]
    assert ref_fs.cat_file("c", start=-5) == blob[-5:]
    assert ref_fs.cat_file("c", 1, -5) == blob[1:-5]

    # "d" refers to bytes 4..10; further slicing is relative to that window.
    window = blob[4:10]
    assert ref_fs.cat_file("d") == window
    assert ref_fs.cat_file("d", start=1) == window[1:]
    assert ref_fs.cat_file("d", start=-5) == window[-5:]
    assert ref_fs.cat_file("d", 1, -3) == window[1:-3]
473
+
474
+
475
@pytest.mark.parametrize(
    "fo",
    [
        {
            "c": ["memory://b"],
            "d": ["memory://unknown", 4, 6],
        },
        {
            "c": ["memory://b"],
            "d": ["//unknown", 4, 6],
        },
    ],
    ids=["memory protocol", "mixed protocols: memory and unspecified"],
)
def test_cat_missing(m, fo):
    """Check errors for unknown keys and for references with a missing target.

    Unknown keys raise ``FileNotFoundError`` from the filesystem and
    ``KeyError`` from the mapper. The known key ``"d"`` points at a target
    that was never written and raises ``ReferenceNotReachable``.
    """
    blob = b"other test data"
    m.pipe("/b", blob)
    ref_fs = fsspec.filesystem(
        "reference",
        fo=fo,
    )

    # Keys absent from the reference set, via the filesystem API.
    for missing in ("notafile", ["notone", "nottwo"]):
        with pytest.raises(FileNotFoundError):
            ref_fs.cat(missing)

    mapper = ref_fs.get_mapper("")

    # Keys absent from the reference set, via the mapping API.
    with pytest.raises(KeyError):
        mapper["notakey"]

    with pytest.raises(KeyError):
        mapper.getitems(["notone", "nottwo"])

    # Known key, unreachable target.
    with pytest.raises(ReferenceNotReachable) as fs_err:
        ref_fs.cat("d")
    assert fs_err.value.__cause__
    returned = ref_fs.cat("d", on_error="return")
    assert isinstance(returned, ReferenceNotReachable)

    with pytest.raises(ReferenceNotReachable) as map_err:
        mapper["d"]
    message = str(map_err.value)
    assert '"d"' in message
    assert "//unknown" in message

    with pytest.raises(ReferenceNotReachable):
        mapper.getitems(["c", "d"])

    returned = mapper.getitems(["c", "d"], on_error="return")
    assert isinstance(returned["d"], ReferenceNotReachable)

    returned = ref_fs.cat(["notone", "c", "d"], on_error="return")
    assert isinstance(returned["notone"], FileNotFoundError)
    assert returned["c"] == blob
    assert isinstance(returned["d"], ReferenceNotReachable)

    returned = mapper.getitems(["c", "d"], on_error="omit")
    assert list(returned) == ["c"]
534
+
535
+
536
def test_df_single(m):
    """Read references stored as a single parquet record file plus ``.zmetadata``."""
    pd = pytest.importorskip("pandas")
    pytest.importorskip("fastparquet")
    payload = b"data0data1data2"
    m.pipe({"data": payload})

    records = pd.DataFrame(
        {
            "path": [None, "memory://data", "memory://data"],
            "offset": [0, 0, 4],
            "size": [0, 0, 4],
            "raw": [b"raw", None, None],
        }
    )
    records.to_parquet("memory://stuff/refs.0.parq")

    zmetadata = b"""{
    "metadata": {
        ".zgroup": {
            "zarr_format": 2
        },
        "stuff/.zarray": {
            "chunks": [1],
            "compressor": null,
            "dtype": "i8",
            "filters": null,
            "shape": [3],
            "zarr_format": 2
        }
    },
    "zarr_consolidated_format": 1,
    "record_size": 10
}
"""
    m.pipe(".zmetadata", zmetadata)

    ref_fs = ReferenceFileSystem(fo="memory:///", remote_protocol="memory")
    listing = ref_fs.find("")
    for name in (".zmetadata", ".zgroup", "stuff/2"):
        assert name in listing

    assert ref_fs.cat("stuff/0") == b"raw"
    assert ref_fs.cat("stuff/1") == payload
    assert ref_fs.cat("stuff/2") == payload[4:8]
580
+
581
+
582
def test_df_multi(m):
    """Read references split across two parquet record files (``record_size`` 3)."""
    pd = pytest.importorskip("pandas")
    pytest.importorskip("fastparquet")
    payload = b"data0data1data2"
    m.pipe({"data": payload})

    first_records = pd.DataFrame(
        {
            "path": [None, "memory://data", "memory://data"],
            "offset": [0, 0, 4],
            "size": [0, 0, 4],
            "raw": [b"raw1", None, None],
        }
    )
    first_records.to_parquet("memory://stuff/refs.0.parq")

    second_records = pd.DataFrame(
        {
            "path": [None, "memory://data", "memory://data"],
            "offset": [0, 0, 2],
            "size": [0, 0, 2],
            "raw": [b"raw2", None, None],
        }
    )
    second_records.to_parquet("memory://stuff/refs.1.parq")

    zmetadata = b"""{
    "metadata": {
        ".zgroup": {
            "zarr_format": 2
        },
        "stuff/.zarray": {
            "chunks": [1],
            "compressor": null,
            "dtype": "i8",
            "filters": null,
            "shape": [6],
            "zarr_format": 2
        }
    },
    "zarr_consolidated_format": 1,
    "record_size": 3
}
"""
    m.pipe(".zmetadata", zmetadata)

    ref_fs = ReferenceFileSystem(
        fo="memory:///", remote_protocol="memory", skip_instance_cache=True
    )
    listing = ref_fs.find("")
    for name in (".zmetadata", ".zgroup", "stuff/2", "stuff/4"):
        assert name in listing

    # Keys 0-2 are served by refs.0.parq.
    assert ref_fs.cat("stuff/0") == b"raw1"
    assert ref_fs.cat("stuff/1") == payload
    assert ref_fs.cat("stuff/2") == payload[4:8]
    # Keys 3-5 are served by refs.1.parq.
    assert ref_fs.cat("stuff/3") == b"raw2"
    assert ref_fs.cat("stuff/4") == payload
    assert ref_fs.cat("stuff/5") == payload[2:4]
641
+
642
+
643
def test_mapping_getitems(m):
    """Fetch several keys at once through the mapper's ``getitems``."""
    m.pipe({"a": b"A", "b": b"B"})

    references = {
        "a": ["a"],
        "b": ["b"],
    }
    memory_fs = fsspec.filesystem("memory")
    ref_fs = fsspec.filesystem("reference", fo=references, fs=memory_fs)

    store = ref_fs.get_mapper("")
    fetched = store.getitems(["b", "a"])
    assert fetched == {"a": b"A", "b": b"B"}
654
+
655
+
656
def test_cached(m, tmpdir):
    """Load the reference JSON through ``simplecache`` and reuse the local copy."""
    cached_copy = f"{tmpdir}/ref.json"

    m.pipe({"a": b"A", "b": b"B"})
    m.pipe("ref.json", b"""{"a": ["a"], "b": ["b"]}""")

    first = fsspec.filesystem(
        "reference",
        fo="simplecache::memory://ref.json",
        fs=m,
        target_options={"cache_storage": str(tmpdir), "same_names": True},
    )
    assert first.cat("a") == b"A"
    assert os.path.exists(cached_copy)

    # truncate original file to show we are loading from the cached version
    m.pipe("ref.json", b"")
    second = fsspec.filesystem(
        "reference",
        fo="simplecache::memory://ref.json",
        fs=m,
        target_options={"cache_storage": str(tmpdir), "same_names": True},
        skip_instance_cache=True,
    )
    assert second.cat("a") == b"A"
681
+
682
+
683
@pytest.fixture()
def lazy_refs(m):
    """Return a new ``LazyReferenceMapper`` at ``memory://refs`` with one dataset.

    The mapper is opened as a zarr group in write mode and given a 100-element
    ``int64`` dataset named ``data`` with chunks of 10.
    """
    zarr = pytest.importorskip("zarr")
    mapper = LazyReferenceMapper.create("memory://refs", fs=m)
    group = zarr.open(mapper, mode="w")
    group.create_dataset(name="data", shape=(100,), chunks=(10,), dtype="int64")
    return mapper
690
+
691
+
692
def test_append_parquet(lazy_refs, m):
    """Write, overwrite and delete keys across flush/reopen cycles."""
    pytest.importorskip("kerchunk")

    # Round 1: add data/0 through the fixture's mapper and flush.
    with pytest.raises(KeyError):
        lazy_refs["data/0"]
    lazy_refs["data/0"] = b"data"
    assert lazy_refs["data/0"] == b"data"
    lazy_refs.flush()

    # Round 2: reopen, confirm data/0 was saved, add data/1.
    reopened = LazyReferenceMapper("memory://refs", fs=m)
    assert reopened["data/0"] == b"data"
    # NOTE(review): this checks the fixture's mapper rather than the reopened
    # one; kept as in the original, confirm whether that is intentional.
    with pytest.raises(KeyError):
        lazy_refs["data/1"]
    reopened["data/1"] = b"Bdata"
    assert reopened["data/1"] == b"Bdata"
    reopened.flush()

    # Round 3: reopen, overwrite data/1 and delete data/0.
    reopened = LazyReferenceMapper("memory://refs", fs=m)
    assert reopened["data/0"] == b"data"
    assert reopened["data/1"] == b"Bdata"
    reopened["data/1"] = b"Adata"
    del reopened["data/0"]
    assert reopened["data/1"] == b"Adata"
    assert "data/0" not in reopened
    reopened.flush()

    # Round 4: reopen and confirm both changes were saved.
    reopened = LazyReferenceMapper("memory://refs", fs=m)
    with pytest.raises(KeyError):
        reopened["data/0"]
    assert reopened["data/1"] == b"Adata"