fsspec 2024.10.0__py3-none-any.whl → 2025.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- fsspec/_version.py +2 -2
- fsspec/archive.py +3 -1
- fsspec/asyn.py +5 -7
- fsspec/caching.py +34 -19
- fsspec/core.py +15 -13
- fsspec/implementations/asyn_wrapper.py +99 -0
- fsspec/implementations/cached.py +1 -1
- fsspec/implementations/dbfs.py +3 -3
- fsspec/implementations/ftp.py +1 -1
- fsspec/implementations/http.py +4 -22
- fsspec/implementations/local.py +6 -1
- fsspec/implementations/memory.py +8 -3
- fsspec/implementations/reference.py +124 -17
- fsspec/implementations/webhdfs.py +2 -1
- fsspec/mapping.py +1 -1
- fsspec/parquet.py +1 -1
- fsspec/registry.py +7 -3
- fsspec/spec.py +209 -33
- fsspec/tests/abstract/__init__.py +3 -1
- fsspec/tests/abstract/open.py +11 -0
- fsspec/tests/abstract/pipe.py +11 -0
- fsspec/utils.py +4 -2
- {fsspec-2024.10.0.dist-info → fsspec-2025.2.0.dist-info}/METADATA +3 -3
- {fsspec-2024.10.0.dist-info → fsspec-2025.2.0.dist-info}/RECORD +26 -23
- {fsspec-2024.10.0.dist-info → fsspec-2025.2.0.dist-info}/WHEEL +1 -1
- {fsspec-2024.10.0.dist-info → fsspec-2025.2.0.dist-info}/licenses/LICENSE +0 -0
fsspec/_version.py
CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '2024.10.0'
-__version_tuple__ = version_tuple = (2024, 10, 0)
+__version__ = version = '2025.2.0'
+__version_tuple__ = version_tuple = (2025, 2, 0)
fsspec/archive.py
CHANGED
@@ -1,3 +1,5 @@
+import operator
+
 from fsspec import AbstractFileSystem
 from fsspec.utils import tokenize
 
@@ -67,7 +69,7 @@ class AbstractArchiveFileSystem(AbstractFileSystem):
             out = {"name": ppath, "size": 0, "type": "directory"}
             paths[ppath] = out
         if detail:
-            out = sorted(paths.values(), key=lambda _: _["name"])
+            out = sorted(paths.values(), key=operator.itemgetter("name"))
             return out
         else:
             return sorted(paths)
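The lambda-to-`operator.itemgetter` swap above is behavior-preserving: both key functions extract the "name" field from each entry dict. A standalone illustration:

```python
# itemgetter("name") and the lambda produce identical orderings;
# itemgetter is the idiomatic (and slightly faster) spelling.
import operator

entries = [{"name": "b", "size": 0}, {"name": "a", "size": 0}]
assert sorted(entries, key=operator.itemgetter("name")) == sorted(
    entries, key=lambda e: e["name"]
)
```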
fsspec/asyn.py
CHANGED
@@ -408,7 +408,7 @@ class AsyncFileSystem(AbstractFileSystem):
                 continue
             raise ex
 
-    async def _pipe_file(self, path, value, **kwargs):
+    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
         raise NotImplementedError
 
     async def _pipe(self, path, value=None, batch_size=None, **kwargs):
@@ -517,7 +517,7 @@ class AsyncFileSystem(AbstractFileSystem):
             coros, batch_size=batch_size, nofiles=True, return_exceptions=True
         )
 
-    async def _put_file(self, lpath, rpath, **kwargs):
+    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
         raise NotImplementedError
 
     async def _put(
@@ -816,11 +816,9 @@ class AsyncFileSystem(AbstractFileSystem):
                 p: info
                 for p, info in sorted(allpaths.items())
                 if pattern.match(
-                    (
-                        p + "/"
-                        if append_slash_to_dirname and info["type"] == "directory"
-                        else p
-                    )
+                    p + "/"
+                    if append_slash_to_dirname and info["type"] == "directory"
+                    else p
                 )
             }
 
fsspec/caching.py
CHANGED
@@ -8,6 +8,8 @@ import os
 import threading
 import warnings
 from concurrent.futures import Future, ThreadPoolExecutor
+from itertools import groupby
+from operator import itemgetter
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -85,12 +87,7 @@ class BaseCache:
         if self.hit_count == 0 and self.miss_count == 0:
             # a cache that does nothing, this is for logs only
             return ""
-        return " , %s: %d hits, %d misses, %d total requested bytes" % (
-            self.name,
-            self.hit_count,
-            self.miss_count,
-            self.total_requested_bytes,
-        )
+        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"
 
     def __repr__(self) -> str:
         # TODO: use rich for better formatting
@@ -161,21 +158,39 @@ class MMapCache(BaseCache):
             return b""
         start_block = start // self.blocksize
         end_block = end // self.blocksize
-        ...  # previous block-by-block fetch loop (11 lines, illegible in the source rendering)
+        block_range = range(start_block, end_block + 1)
+        # Determine which blocks need to be fetched. This sequence is sorted by construction.
+        need = (i for i in block_range if i not in self.blocks)
+        # Count the number of blocks already cached
+        self.hit_count += sum(1 for i in block_range if i in self.blocks)
+
+        # Consolidate needed blocks.
+        # Algorithm adapted from Python 2.x itertools documentation.
+        # We are grouping an enumerated sequence of blocks. By comparing the difference
+        # between an ascending counter (provided by enumerate) and the needed block numbers,
+        # we can detect when the block number skips values. The key computes this difference.
+        # Whenever the difference changes, we know that we have previously cached block(s),
+        # and a new group is started. In other words, this algorithm neatly groups
+        # runs of consecutive block numbers so they can be fetched together.
+        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
+            # Extract the blocks from the enumerated sequence
+            _blocks = tuple(map(itemgetter(1), _blocks))
+            # Compute start of first block
+            sstart = _blocks[0] * self.blocksize
+            # Compute the end of the last block. Last block may not be full size.
+            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
+
+            # Fetch bytes (could be multiple consecutive blocks)
             self.total_requested_bytes += send - sstart
-            logger.debug(...)
+            logger.debug(
+                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
+            )
             self.cache[sstart:send] = self.fetcher(sstart, send)
-            self.blocks.add(i)
+
+            # Update set of cached blocks
+            self.blocks.update(_blocks)
+            # Update cache statistics with number of blocks we had to cache
+            self.miss_count += len(_blocks)
 
         return self.cache[start:end]
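The rewritten `MMapCache._fetch` coalesces runs of consecutive missing blocks into single `fetcher` calls using the itertools grouping idiom described in its comments. In isolation the trick looks like this:

```python
# enumerate() supplies an ascending counter, so counter - value stays
# constant along a run of consecutive integers; groupby splits on jumps.
from itertools import groupby
from operator import itemgetter

need = [2, 3, 4, 9, 10, 15]
runs = [
    tuple(map(itemgetter(1), grp))
    for _, grp in groupby(enumerate(need), key=lambda x: x[0] - x[1])
]
assert runs == [(2, 3, 4), (9, 10), (15,)]  # three fetches instead of six
```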
fsspec/core.py
CHANGED
@@ -329,12 +329,19 @@ def open_files(
 
 
 def _un_chain(path, kwargs):
-    x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
-    bits = (
-        [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
-        if "::" in path
-        else [path]
-    )
+    # Avoid a circular import
+    from fsspec.implementations.cached import CachingFileSystem
+
+    if "::" in path:
+        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+        bits = []
+        for p in path.split("::"):
+            if "://" in p or x.match(p):
+                bits.append(p)
+            else:
+                bits.append(p + "://")
+    else:
+        bits = [path]
     # [[url, protocol, kwargs], ...]
     out = []
     previous_bit = None
@@ -351,10 +358,7 @@ def _un_chain(path, kwargs):
             **kws,
         )
         bit = cls._strip_protocol(bit)
-        if (
-            protocol in {"blockcache", "filecache", "simplecache"}
-            and "target_protocol" not in kw
-        ):
+        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
             bit = previous_bit
         out.append((bit, protocol, kw))
         previous_bit = bit
@@ -676,9 +680,7 @@ def get_fs_token_paths(
     elif not isinstance(paths, list):
         paths = list(paths)
     else:
-        if "w" in mode and expand:
-            paths = _expand_paths(paths, name_function, num)
-        elif "x" in mode and expand:
+        if ("w" in mode or "x" in mode) and expand:
             paths = _expand_paths(paths, name_function, num)
         elif "*" in paths:
             paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
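`_un_chain` is the function that peels a chained URL into its layers; the `issubclass(cls, CachingFileSystem)` test now recognizes any caching filesystem, including third-party subclasses, instead of three hard-coded protocol names. A small sketch of the chaining it parses (the memory backend keeps it self-contained):

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.pipe_file("/demo.bin", b"payload")

# "simplecache::memory://..." unchains into a caching layer over memory;
# simplecache inherits the inner URL because it is a CachingFileSystem.
with fsspec.open("simplecache::memory://demo.bin", "rb") as f:
    assert f.read() == b"payload"
```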
fsspec/implementations/asyn_wrapper.py
ADDED
@@ -0,0 +1,99 @@
+import asyncio
+import functools
+import inspect
+
+from fsspec.asyn import AsyncFileSystem
+
+
+def async_wrapper(func, obj=None):
+    """
+    Wraps a synchronous function to make it awaitable.
+
+    Parameters
+    ----------
+    func : callable
+        The synchronous function to wrap.
+    obj : object, optional
+        The instance to bind the function to, if applicable.
+
+    Returns
+    -------
+    coroutine
+        An awaitable version of the function.
+    """
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        return await asyncio.to_thread(func, *args, **kwargs)
+
+    return wrapper
+
+
+class AsyncFileSystemWrapper(AsyncFileSystem):
+    """
+    A wrapper class to convert a synchronous filesystem into an asynchronous one.
+
+    This class takes an existing synchronous filesystem implementation and wraps all
+    its methods to provide an asynchronous interface.
+
+    Parameters
+    ----------
+    sync_fs : AbstractFileSystem
+        The synchronous filesystem instance to wrap.
+    """
+
+    def __init__(self, sync_fs, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.asynchronous = True
+        self.sync_fs = sync_fs
+        self.protocol = self.sync_fs.protocol
+        self._wrap_all_sync_methods()
+
+    @property
+    def fsid(self):
+        return f"async_{self.sync_fs.fsid}"
+
+    def _wrap_all_sync_methods(self):
+        """
+        Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
+        """
+        excluded_methods = {"open"}
+        for method_name in dir(self.sync_fs):
+            if method_name.startswith("_") or method_name in excluded_methods:
+                continue
+
+            attr = inspect.getattr_static(self.sync_fs, method_name)
+            if isinstance(attr, property):
+                continue
+
+            method = getattr(self.sync_fs, method_name)
+            if callable(method) and not asyncio.iscoroutinefunction(method):
+                async_method = async_wrapper(method, obj=self)
+                setattr(self, f"_{method_name}", async_method)
+
+    @classmethod
+    def wrap_class(cls, sync_fs_class):
+        """
+        Create a new class that can be used to instantiate an AsyncFileSystemWrapper
+        with lazy instantiation of the underlying synchronous filesystem.
+
+        Parameters
+        ----------
+        sync_fs_class : type
+            The class of the synchronous filesystem to wrap.
+
+        Returns
+        -------
+        type
+            A new class that wraps the provided synchronous filesystem class.
+        """
+
+        class GeneratedAsyncFileSystemWrapper(cls):
+            def __init__(self, *args, **kwargs):
+                sync_fs = sync_fs_class(*args, **kwargs)
+                super().__init__(sync_fs)
+
+        GeneratedAsyncFileSystemWrapper.__name__ = (
+            f"Async{sync_fs_class.__name__}Wrapper"
+        )
+        return GeneratedAsyncFileSystemWrapper
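A possible use of the new wrapper (a sketch, not from the package itself): expose a synchronous filesystem through awaitable `_`-prefixed methods, each executed in a worker thread by `asyncio.to_thread`.

```python
import asyncio

import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

async def main():
    # wrap the (synchronous) in-memory filesystem; sync pipe_file/cat_file
    # become awaitable _pipe_file/_cat_file on the wrapper instance
    afs = AsyncFileSystemWrapper(fsspec.filesystem("memory"))
    await afs._pipe_file("/demo", b"hello")
    return await afs._cat_file("/demo")

assert asyncio.run(main()) == b"hello"
```

Note that `asyncio.to_thread` requires Python 3.9+, and `open` is deliberately excluded from wrapping.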
fsspec/implementations/cached.py
CHANGED
@@ -612,7 +612,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
         **kwargs,
     ):
         paths = self.expand_path(
-            path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+            path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
        )
         getpaths = []
         storepaths = []
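The only change here drops a redundant default: `dict.get` already returns `None` for a missing key.

```python
kwargs = {}
assert kwargs.get("maxdepth") is None
assert kwargs.get("maxdepth") == kwargs.get("maxdepth", None)
```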
fsspec/implementations/dbfs.py
CHANGED
@@ -412,9 +412,9 @@ class DatabricksFile(AbstractBufferedFile):
         if block_size is None or block_size == "default":
             block_size = self.DEFAULT_BLOCK_SIZE
 
-        assert (
-            block_size == self.DEFAULT_BLOCK_SIZE
-        ), f"Only the default block size is allowed, not {block_size}"
+        assert block_size == self.DEFAULT_BLOCK_SIZE, (
+            f"Only the default block size is allowed, not {block_size}"
+        )
 
         super().__init__(
             fs,
fsspec/implementations/ftp.py
CHANGED
fsspec/implementations/http.py
CHANGED
@@ -273,8 +273,12 @@ class HTTPFileSystem(AsyncFileSystem):
         chunk_size=5 * 2**20,
         callback=DEFAULT_CALLBACK,
         method="post",
+        mode="overwrite",
         **kwargs,
     ):
+        if mode != "overwrite":
+            raise NotImplementedError("Exclusive write")
+
         async def gen_chunks():
             # Support passing arbitrary file-like objects
             # and use them instead of streams.
@@ -692,25 +696,6 @@ class HTTPFile(AbstractBufferedFile):
 
     _fetch_range = sync_wrapper(async_fetch_range)
 
-    def __reduce__(self):
-        return (
-            reopen,
-            (
-                self.fs,
-                self.url,
-                self.mode,
-                self.blocksize,
-                self.cache.name if self.cache else "none",
-                self.size,
-            ),
-        )
-
-
-def reopen(fs, url, mode, blocksize, cache_type, size=None):
-    return fs.open(
-        url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
-    )
-
 
 magic_check = re.compile("([*[])")
 
@@ -760,9 +745,6 @@ class HTTPStreamFile(AbstractBufferedFile):
         asyncio.run_coroutine_threadsafe(self._close(), self.loop)
         super().close()
 
-    def __reduce__(self):
-        return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
-
 
 class AsyncStreamFile(AbstractAsyncStreamedFile):
     def __init__(
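Two things happen in http.py: `_pipe_file` gains the new `mode` argument but implements only `"overwrite"`, and the `__reduce__`/`reopen` pickling shims for `HTTPFile` and `HTTPStreamFile` are removed. A sketch of the visible `mode` behavior (hypothetical URL; the check fires before any request is made):

```python
import fsspec

fs = fsspec.filesystem("http")
try:
    fs.pipe_file("https://example.org/data.bin", b"...", mode="create")
except NotImplementedError:
    print("HTTP upload supports only mode='overwrite'")
```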
fsspec/implementations/local.py
CHANGED
@@ -60,7 +60,12 @@ class LocalFileSystem(AbstractFileSystem):
         info = self.info(path)
         if info["type"] == "directory":
             with os.scandir(path) as it:
-                infos = [self.info(f) for f in it]
+                infos = []
+                for f in it:
+                    try:
+                        infos.append(self.info(f))
+                    except FileNotFoundError:
+                        pass
         else:
             infos = [info]
 
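This closes a listing race: an entry yielded by `os.scandir` can be deleted before `info` stats it, so stale entries are now skipped rather than failing the whole `ls`. The pattern in isolation:

```python
# Tolerate entries that vanish between enumeration and stat().
import os

def sizes(path):
    out = {}
    with os.scandir(path) as it:
        for entry in it:
            try:
                out[entry.name] = entry.stat().st_size
            except FileNotFoundError:
                continue  # removed by another process mid-listing
    return out
```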
fsspec/implementations/memory.py
CHANGED
@@ -126,12 +126,13 @@ class MemoryFileSystem(AbstractFileSystem):
             if not exist_ok:
                 raise
 
-    def pipe_file(self, path, value, **kwargs):
+    def pipe_file(self, path, value, mode="overwrite", **kwargs):
         """Set the bytes of given file
 
         Avoids copies of the data if possible
         """
-        self.open(path, mode="wb", data=value)
+        mode = "xb" if mode == "create" else "wb"
+        self.open(path, mode=mode, data=value)
 
     def rmdir(self, path):
         path = self._strip_protocol(path)
@@ -178,6 +179,8 @@ class MemoryFileSystem(AbstractFileSystem):
         **kwargs,
     ):
         path = self._strip_protocol(path)
+        if "x" in mode and self.exists(path):
+            raise FileExistsError
         if path in self.pseudo_dirs:
             raise IsADirectoryError(path)
         parent = path
@@ -197,7 +200,9 @@ class MemoryFileSystem(AbstractFileSystem):
                 return f
             else:
                 raise FileNotFoundError(path)
-        elif mode == "wb":
+        elif mode in {"wb", "xb"}:
+            if mode == "xb" and self.exists(path):
+                raise FileExistsError
             m = MemoryFile(self, path, kwargs.get("data"))
             if not self._intrans:
                 m.commit()
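Together these changes give `MemoryFileSystem` end-to-end exclusive-create semantics: `mode="create"` maps to an `"xb"` open, which refuses to clobber existing keys.

```python
import fsspec

m = fsspec.filesystem("memory")
m.pipe_file("/a", b"first")  # default mode="overwrite"
try:
    m.pipe_file("/a", b"second", mode="create")
except FileExistsError:
    pass
assert m.cat_file("/a") == b"first"
```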
fsspec/implementations/reference.py
CHANGED
@@ -5,11 +5,12 @@ import itertools
 import logging
 import math
 import os
-from itertools import chain
 from functools import lru_cache
+from itertools import chain
 from typing import TYPE_CHECKING, Literal
 
 import fsspec.core
+from fsspec.spec import AbstractBufferedFile
 
 try:
     import ujson as json
@@ -20,6 +21,7 @@ except ImportError:
 from fsspec.asyn import AsyncFileSystem
 from fsspec.callbacks import DEFAULT_CALLBACK
 from fsspec.core import filesystem, open, split_protocol
+from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
 
 logger = logging.getLogger("fsspec.reference")
@@ -41,7 +43,7 @@ def _first(d):
 
 def _prot_in_references(path, references):
     ref = references.get(path)
-    if isinstance(ref, (list, tuple)):
+    if isinstance(ref, (list, tuple)) and isinstance(ref[0], str):
         return split_protocol(ref[0])[0] if ref[0] else ref[0]
 
 
@@ -173,8 +175,11 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             """cached parquet file loader"""
             path = self.url.format(field=field, record=record)
             data = io.BytesIO(self.fs.cat_file(path))
-            df = self.pd.read_parquet(data, engine=self.engine)
-            refs = {c: df[c].to_numpy() for c in df.columns}
+            try:
+                df = self.pd.read_parquet(data, engine=self.engine)
+                refs = {c: df[c].to_numpy() for c in df.columns}
+            except OSError:
+                refs = None
             return refs
 
         self.open_refs = open_refs
@@ -390,10 +395,14 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             self.write(field, record)
         else:
             # metadata or top-level
-            self._items[key] = value
-            new_value = json.loads(
-                value.decode() if isinstance(value, bytes) else value
-            )
+            if hasattr(value, "to_bytes"):
+                val = value.to_bytes().decode()
+            elif isinstance(value, bytes):
+                val = value.decode()
+            else:
+                val = value
+            self._items[key] = val
+            new_value = json.loads(val)
             self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
 
     @staticmethod
@@ -428,7 +437,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         if len(partition) < self.record_size:
             try:
                 original = self.open_refs(field, record)
-            except IOError:
+            except OSError:
                 pass
 
         if original:
@@ -591,8 +600,7 @@ class ReferenceFileSystem(AsyncFileSystem):
     async, and must allow start and end args in _cat_file. Later versions
     may allow multiple arbitrary URLs for the targets.
     This FileSystem is read-only. It is designed to be used with async
-    targets (for now). This FileSystem only allows whole-file access, no
-    ``open``. We do not get original file details from the target FS.
+    targets (for now). We do not get original file details from the target FS.
     Configuration is by passing a dict of references at init, or a URL to
     a JSON file containing the same; this dict
     can also contain concrete data for some set of paths.
@@ -602,6 +610,7 @@ class ReferenceFileSystem(AsyncFileSystem):
     """
 
     protocol = "reference"
+    cachable = False
 
     def __init__(
         self,
@@ -754,6 +763,15 @@ class ReferenceFileSystem(AsyncFileSystem):
             self.fss[remote_protocol] = fs
 
         self.fss[None] = fs or filesystem("file")  # default one
+        # Wrap any non-async filesystems to ensure async methods are available below
+        for k, f in self.fss.items():
+            if not f.async_impl:
+                self.fss[k] = AsyncFileSystemWrapper(f)
+            elif self.asynchronous ^ f.asynchronous:
+                raise ValueError(
+                    "Reference-FS's target filesystem must have same value "
+                    "of asynchronous"
+                )
 
     def _cat_common(self, path, start=None, end=None):
         path = self._strip_protocol(path)
@@ -764,6 +782,8 @@ class ReferenceFileSystem(AsyncFileSystem):
             raise FileNotFoundError(path) from exc
         if isinstance(part, str):
             part = part.encode()
+        if hasattr(part, "to_bytes"):
+            part = part.to_bytes()
         if isinstance(part, bytes):
             logger.debug(f"Reference: {path}, type bytes")
             if part.startswith(b"base64:"):
@@ -803,7 +823,9 @@ class ReferenceFileSystem(AsyncFileSystem):
             return part_or_url[start:end]
         protocol, _ = split_protocol(part_or_url)
         try:
-            await self.fss[protocol]._cat_file(part_or_url, start=start0, end=end0)
+            return await self.fss[protocol]._cat_file(
+                part_or_url, start=start0, end=end0
+            )
         except Exception as e:
             raise ReferenceNotReachable(path, part_or_url) from e
 
@@ -871,6 +893,9 @@ class ReferenceFileSystem(AsyncFileSystem):
             # found and on_error is "raise"
             try:
                 u, s, e = self._cat_common(p)
+                if not isinstance(u, (bytes, str)):
+                    # nan/None from parquet
+                    continue
             except FileNotFoundError as err:
                 if on_error == "raise":
                     raise
@@ -1060,7 +1085,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         self.dircache = {"": []}
         it = self.references.items()
         for path, part in it:
-            if isinstance(part, (bytes, str)):
+            if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
                 size = len(part)
             elif len(part) == 1:
                 size = None
@@ -1087,10 +1112,33 @@ class ReferenceFileSystem(AsyncFileSystem):
         self.dircache[par].append({"name": path, "type": "file", "size": size})
 
     def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
-        data = self.cat_file(path)  # load whole chunk into memory
-        return io.BytesIO(data)
+        part_or_url, start0, end0 = self._cat_common(path)
+        # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection.
+        # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`.
+        if isinstance(part_or_url, bytes):
+            return io.BytesIO(part_or_url[start0:end0])
+
+        protocol, _ = split_protocol(part_or_url)
+        if start0 is None and end0 is None:
+            return self.fss[protocol]._open(
+                part_or_url,
+                mode,
+                block_size=block_size,
+                cache_options=cache_options,
+                **kwargs,
+            )
+
+        return ReferenceFile(
+            self,
+            path,
+            mode,
+            block_size=block_size,
+            cache_options=cache_options,
+            **kwargs,
+        )
 
     def ls(self, path, detail=True, **kwargs):
+        logger.debug("list %s", path)
         path = self._strip_protocol(path)
         if isinstance(self.references, LazyReferenceMapper):
             try:
@@ -1173,13 +1221,17 @@ class ReferenceFileSystem(AsyncFileSystem):
         )  # ignores FileNotFound, just as well for directories
         self.dircache.clear()  # this is a bit heavy handed
 
-    async def _pipe_file(self, path, data):
+    async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
+        if mode == "create" and self.exists(path):
+            raise FileExistsError
         # can be str or bytes
         self.references[path] = data
         self.dircache.clear()  # this is a bit heavy handed
 
-    async def _put_file(self, lpath, rpath, **kwargs):
+    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
         # puts binary
+        if mode == "create" and self.exists(rpath):
+            raise FileExistsError
         with open(lpath, "rb") as f:
             self.references[rpath] = f.read()
         self.dircache.clear()  # this is a bit heavy handed
@@ -1197,3 +1249,58 @@ class ReferenceFileSystem(AsyncFileSystem):
             out[k] = v
     with fsspec.open(url, "wb", **storage_options) as f:
         f.write(json.dumps({"version": 1, "refs": out}).encode())
+
+
+class ReferenceFile(AbstractBufferedFile):
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        super().__init__(
+            fs,
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            size=size,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+        part_or_url, self.start, self.end = self.fs._cat_common(self.path)
+        protocol, _ = split_protocol(part_or_url)
+        self.src_fs = self.fs.fss[protocol]
+        self.src_path = part_or_url
+        self._f = None
+
+    @property
+    def f(self):
+        if self._f is None or self._f.closed:
+            self._f = self.src_fs._open(
+                self.src_path,
+                mode=self.mode,
+                block_size=self.blocksize,
+                autocommit=self.autocommit,
+                cache_type="none",
+                **self.kwargs,
+            )
+        return self._f
+
+    def close(self):
+        if self._f is not None:
+            self._f.close()
+        return super().close()
+
+    def _fetch_range(self, start, end):
+        start = start + self.start
+        end = min(end + self.start, self.end)
+        self.f.seek(start)
+        return self.f.read(end - start)