fsspec 2024.9.0__py3-none-any.whl → 2024.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fsspec/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '2024.9.0'
- __version_tuple__ = version_tuple = (2024, 9, 0)
+ __version__ = version = '2024.12.0'
+ __version_tuple__ = version_tuple = (2024, 12, 0)
fsspec/asyn.py CHANGED
@@ -344,6 +344,10 @@ class AsyncFileSystem(AbstractFileSystem):
      async def _cp_file(self, path1, path2, **kwargs):
          raise NotImplementedError

+     async def _mv_file(self, path1, path2):
+         await self._cp_file(path1, path2)
+         await self._rm_file(path1)
+
      async def _copy(
          self,
          path1,
@@ -404,7 +408,7 @@ class AsyncFileSystem(AbstractFileSystem):
                  continue
              raise ex

-     async def _pipe_file(self, path, value, **kwargs):
+     async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
          raise NotImplementedError

      async def _pipe(self, path, value=None, batch_size=None, **kwargs):
@@ -513,7 +517,7 @@ class AsyncFileSystem(AbstractFileSystem):
              coros, batch_size=batch_size, nofiles=True, return_exceptions=True
          )

-     async def _put_file(self, lpath, rpath, **kwargs):
+     async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
          raise NotImplementedError

      async def _put(
@@ -812,11 +816,9 @@ class AsyncFileSystem(AbstractFileSystem):
              p: info
              for p, info in sorted(allpaths.items())
              if pattern.match(
-                 (
-                     p + "/"
-                     if append_slash_to_dirname and info["type"] == "directory"
-                     else p
-                 )
+                 p + "/"
+                 if append_slash_to_dirname and info["type"] == "directory"
+                 else p
              )
          }
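
The new _mv_file default implements move as copy-then-delete, and _pipe_file/_put_file now take a mode argument ("overwrite" or "create") so backends can offer exclusive writes. A minimal sketch of a subclass honoring that convention (MyAsyncFS and _write_bytes are hypothetical; _exists is a real AsyncFileSystem method):

    from fsspec.asyn import AsyncFileSystem

    class MyAsyncFS(AsyncFileSystem):  # hypothetical backend
        async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
            if mode == "create" and await self._exists(path):
                # exclusive write requested and the target already exists
                raise FileExistsError(path)
            await self._write_bytes(path, value)  # hypothetical backend call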
 
fsspec/caching.py CHANGED
@@ -8,6 +8,8 @@ import os
  import threading
  import warnings
  from concurrent.futures import Future, ThreadPoolExecutor
+ from itertools import groupby
+ from operator import itemgetter
  from typing import (
      TYPE_CHECKING,
      Any,
@@ -85,12 +87,7 @@ class BaseCache:
          if self.hit_count == 0 and self.miss_count == 0:
              # a cache that does nothing, this is for logs only
              return ""
-         return " , %s: %d hits, %d misses, %d total requested bytes" % (
-             self.name,
-             self.hit_count,
-             self.miss_count,
-             self.total_requested_bytes,
-         )
+         return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

      def __repr__(self) -> str:
          # TODO: use rich for better formatting
@@ -161,21 +158,39 @@ class MMapCache(BaseCache):
              return b""
          start_block = start // self.blocksize
          end_block = end // self.blocksize
-         need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
-         hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
-         self.miss_count += len(need)
-         self.hit_count += len(hits)
-         while need:
-             # TODO: not a for loop so we can consolidate blocks later to
-             # make fewer fetch calls; this could be parallel
-             i = need.pop(0)
-
-             sstart = i * self.blocksize
-             send = min(sstart + self.blocksize, self.size)
+         block_range = range(start_block, end_block + 1)
+         # Determine which blocks need to be fetched. This sequence is sorted by construction.
+         need = (i for i in block_range if i not in self.blocks)
+         # Count the number of blocks already cached
+         self.hit_count += sum(1 for i in block_range if i in self.blocks)
+
+         # Consolidate needed blocks.
+         # Algorithm adapted from the Python 2.x itertools documentation.
+         # We group an enumerated sequence of blocks. By comparing the difference
+         # between an ascending range (provided by enumerate) and the needed block
+         # numbers, we can detect when the block number skips values; the key
+         # computes this difference. Whenever the difference changes, we know that
+         # we have previously cached block(s), and a new group is started. In other
+         # words, this algorithm neatly groups runs of consecutive block numbers so
+         # they can be fetched together.
+         for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
+             # Extract the blocks from the enumerated sequence
+             _blocks = tuple(map(itemgetter(1), _blocks))
+             # Compute start of first block
+             sstart = _blocks[0] * self.blocksize
+             # Compute the end of the last block. Last block may not be full size.
+             send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
+
+             # Fetch bytes (could be multiple consecutive blocks)
              self.total_requested_bytes += send - sstart
-             logger.debug(f"MMap get block #{i} ({sstart}-{send})")
+             logger.debug(
+                 f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
+             )
              self.cache[sstart:send] = self.fetcher(sstart, send)
-             self.blocks.add(i)
+
+             # Update set of cached blocks
+             self.blocks.update(_blocks)
+             # Update cache statistics with number of blocks we had to cache
+             self.miss_count += len(_blocks)

          return self.cache[start:end]
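
The consolidation trick above is the classic "group consecutive integers" idiom: enumerate the sorted block numbers and group by index minus value, which is constant within a run. A standalone illustration (the block numbers are made up):

    from itertools import groupby
    from operator import itemgetter

    need = [2, 3, 4, 7, 8, 12]  # blocks missing from the cache
    runs = [
        tuple(map(itemgetter(1), grp))
        for _, grp in groupby(enumerate(need), key=lambda x: x[0] - x[1])
    ]
    print(runs)  # [(2, 3, 4), (7, 8), (12,)] -> three fetches instead of six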
 
fsspec/core.py CHANGED
@@ -329,12 +329,19 @@ def open_files(


  def _un_chain(path, kwargs):
-     x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
-     bits = (
-         [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
-         if "::" in path
-         else [path]
-     )
+     # Avoid a circular import
+     from fsspec.implementations.cached import CachingFileSystem
+
+     if "::" in path:
+         x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+         bits = []
+         for p in path.split("::"):
+             if "://" in p or x.match(p):
+                 bits.append(p)
+             else:
+                 bits.append(p + "://")
+     else:
+         bits = [path]
      # [[url, protocol, kwargs], ...]
      out = []
      previous_bit = None
@@ -346,12 +353,12 @@ def _un_chain(path, kwargs):
          kws = kwargs.pop(protocol, {})
          if bit is bits[0]:
              kws.update(kwargs)
-         kw = dict(**extra_kwargs, **kws)
+         kw = dict(
+             **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
+             **kws,
+         )
          bit = cls._strip_protocol(bit)
-         if (
-             protocol in {"blockcache", "filecache", "simplecache"}
-             and "target_protocol" not in kw
-         ):
+         if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
              bit = previous_bit
          out.append((bit, protocol, kw))
          previous_bit = bit
@@ -578,7 +585,7 @@ def expand_paths_if_needed(paths, mode, num, fs, name_function):
      paths = list(paths)

      if "w" in mode:  # read mode
-         if sum([1 for p in paths if "*" in p]) > 1:
+         if sum(1 for p in paths if "*" in p) > 1:
              raise ValueError(
                  "When writing data, only one filename mask can be specified."
              )
@@ -673,9 +680,7 @@ def get_fs_token_paths(
          elif not isinstance(paths, list):
              paths = list(paths)
      else:
-         if "w" in mode and expand:
-             paths = _expand_paths(paths, name_function, num)
-         elif "x" in mode and expand:
+         if ("w" in mode or "x" in mode) and expand:
              paths = _expand_paths(paths, name_function, num)
          elif "*" in paths:
              paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
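
With caching filesystems now detected by subclass rather than by a hard-coded name set, any CachingFileSystem subclass in a chained URL automatically receives the previous segment as its target. A typical chained open that exercises this path (the URL and cache directory are placeholders):

    import fsspec

    # "filecache" resolves to a CachingFileSystem subclass, so _un_chain
    # wires the "https" segment in as its target.
    with fsspec.open(
        "filecache::https://example.com/data.csv",
        filecache={"cache_storage": "/tmp/fsspec-cache"},
    ) as f:
        header = f.readline()
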
fsspec/implementations/asyn_wrapper.py ADDED
@@ -0,0 +1,98 @@
+ import asyncio
+ import functools
+ import inspect
+
+ from fsspec.asyn import AsyncFileSystem
+
+
+ def async_wrapper(func, obj=None):
+     """
+     Wraps a synchronous function to make it awaitable.
+
+     Parameters
+     ----------
+     func : callable
+         The synchronous function to wrap.
+     obj : object, optional
+         The instance to bind the function to, if applicable.
+
+     Returns
+     -------
+     coroutine
+         An awaitable version of the function.
+     """
+
+     @functools.wraps(func)
+     async def wrapper(*args, **kwargs):
+         return await asyncio.to_thread(func, *args, **kwargs)
+
+     return wrapper
+
+
+ class AsyncFileSystemWrapper(AsyncFileSystem):
+     """
+     A wrapper class to convert a synchronous filesystem into an asynchronous one.
+
+     This class takes an existing synchronous filesystem implementation and wraps all
+     its methods to provide an asynchronous interface.
+
+     Parameters
+     ----------
+     sync_fs : AbstractFileSystem
+         The synchronous filesystem instance to wrap.
+     """
+
+     def __init__(self, sync_fs, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.asynchronous = True
+         self.sync_fs = sync_fs
+         self.protocol = self.sync_fs.protocol
+         self._wrap_all_sync_methods()
+
+     @property
+     def fsid(self):
+         return f"async_{self.sync_fs.fsid}"
+
+     def _wrap_all_sync_methods(self):
+         """
+         Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
+         """
+         for method_name in dir(self.sync_fs):
+             if method_name.startswith("_"):
+                 continue
+
+             attr = inspect.getattr_static(self.sync_fs, method_name)
+             if isinstance(attr, property):
+                 continue
+
+             method = getattr(self.sync_fs, method_name)
+             if callable(method) and not asyncio.iscoroutinefunction(method):
+                 async_method = async_wrapper(method, obj=self)
+                 setattr(self, f"_{method_name}", async_method)
+
+     @classmethod
+     def wrap_class(cls, sync_fs_class):
+         """
+         Create a new class that can be used to instantiate an AsyncFileSystemWrapper
+         with lazy instantiation of the underlying synchronous filesystem.
+
+         Parameters
+         ----------
+         sync_fs_class : type
+             The class of the synchronous filesystem to wrap.
+
+         Returns
+         -------
+         type
+             A new class that wraps the provided synchronous filesystem class.
+         """
+
+         class GeneratedAsyncFileSystemWrapper(cls):
+             def __init__(self, *args, **kwargs):
+                 sync_fs = sync_fs_class(*args, **kwargs)
+                 super().__init__(sync_fs)
+
+         GeneratedAsyncFileSystemWrapper.__name__ = (
+             f"Async{sync_fs_class.__name__}Wrapper"
+         )
+         return GeneratedAsyncFileSystemWrapper
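
This new module converts any synchronous filesystem into an async one by pushing each public method onto a worker thread via asyncio.to_thread. A possible usage sketch (assuming the module lands at fsspec.implementations.asyn_wrapper, as above; the file path is a placeholder):

    import asyncio
    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
    from fsspec.implementations.local import LocalFileSystem

    async def main():
        afs = AsyncFileSystemWrapper(LocalFileSystem())
        # wrapped methods follow fsspec's async naming convention: cat_file -> _cat_file
        data = await afs._cat_file("/etc/hostname")
        print(data)

    asyncio.run(main())
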
fsspec/implementations/cached.py CHANGED
@@ -612,7 +612,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
          **kwargs,
      ):
          paths = self.expand_path(
-             path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+             path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
          )
          getpaths = []
          storepaths = []
fsspec/implementations/dirfs.py CHANGED
@@ -370,3 +370,15 @@ class DirFileSystem(AsyncFileSystem):
              *args,
              **kwargs,
          )
+
+     async def open_async(
+         self,
+         path,
+         *args,
+         **kwargs,
+     ):
+         return await self.fs.open_async(
+             self._join(path),
+             *args,
+             **kwargs,
+         )
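
DirFileSystem previously forwarded only the synchronous open; this addition delegates open_async to the wrapped filesystem after prefixing the path. A sketch of how it might be used over an async backend that implements open_async, such as HTTP (the URL is a placeholder):

    import asyncio
    from fsspec.implementations.dirfs import DirFileSystem
    from fsspec.implementations.http import HTTPFileSystem

    async def main():
        fs = DirFileSystem(
            path="https://example.com/data",
            fs=HTTPFileSystem(asynchronous=True),
        )
        f = await fs.open_async("raw/file.bin")  # opens .../data/raw/file.bin
        try:
            payload = await f.read()
        finally:
            await f.close()

    asyncio.run(main())
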
fsspec/implementations/ftp.py CHANGED
@@ -387,7 +387,7 @@ def _mlsd2(ftp, path="."):
                  "size": split_line[4],
              },
          )
-         if "d" == this[1]["unix.mode"][0]:
+         if this[1]["unix.mode"][0] == "d":
              this[1]["type"] = "dir"
          else:
              this[1]["type"] = "file"
fsspec/implementations/git.py CHANGED
@@ -55,6 +55,8 @@ class GitFileSystem(AbstractFileSystem):
          tree = comm.tree
          for part in parts:
              if part and isinstance(tree, pygit2.Tree):
+                 if part not in tree:
+                     raise FileNotFoundError(path)
                  tree = tree[part]
          return tree

@@ -69,46 +71,32 @@ class GitFileSystem(AbstractFileSystem):
              out["ref"], path = path.split("@", 1)
          return out

+     @staticmethod
+     def _object_to_info(obj, path=None):
+         # obj.name and obj.filemode are None for the root tree!
+         is_dir = isinstance(obj, pygit2.Tree)
+         return {
+             "type": "directory" if is_dir else "file",
+             "name": (
+                 "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
+             ),
+             "hex": str(obj.id),
+             "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
+             "size": 0 if is_dir else obj.size,
+         }
+
      def ls(self, path, detail=True, ref=None, **kwargs):
-         path = self._strip_protocol(path)
-         tree = self._path_to_object(path, ref)
-         if isinstance(tree, pygit2.Tree):
-             out = []
-             for obj in tree:
-                 if isinstance(obj, pygit2.Tree):
-                     out.append(
-                         {
-                             "type": "directory",
-                             "name": "/".join([path, obj.name]).lstrip("/"),
-                             "hex": obj.hex,
-                             "mode": f"{obj.filemode:o}",
-                             "size": 0,
-                         }
-                     )
-                 else:
-                     out.append(
-                         {
-                             "type": "file",
-                             "name": "/".join([path, obj.name]).lstrip("/"),
-                             "hex": obj.hex,
-                             "mode": f"{obj.filemode:o}",
-                             "size": obj.size,
-                         }
-                     )
-         else:
-             obj = tree
-             out = [
-                 {
-                     "type": "file",
-                     "name": obj.name,
-                     "hex": obj.hex,
-                     "mode": f"{obj.filemode:o}",
-                     "size": obj.size,
-                 }
-             ]
-         if detail:
-             return out
-         return [o["name"] for o in out]
+         tree = self._path_to_object(self._strip_protocol(path), ref)
+         return [
+             GitFileSystem._object_to_info(obj, path)
+             if detail
+             else GitFileSystem._object_to_info(obj, path)["name"]
+             for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
+         ]
+
+     def info(self, path, ref=None, **kwargs):
+         tree = self._path_to_object(self._strip_protocol(path), ref)
+         return GitFileSystem._object_to_info(tree, path)

      def ukey(self, path, ref=None):
          return self.info(path, ref=ref)["hex"]
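
The refactor funnels all metadata through the new _object_to_info helper, adds a direct info(), and turns a lookup of a missing path into FileNotFoundError instead of a pygit2 KeyError. Typical calls against a local repository (the repo path, ref, and file names are placeholders):

    from fsspec.implementations.git import GitFileSystem

    fs = GitFileSystem("/path/to/repo", ref="main")
    print(fs.info("README.md"))        # {'type': 'file', 'name': 'README.md', ...}
    print(fs.ls("src", detail=False))  # just the names
    fs.info("no/such/file")            # now raises FileNotFoundError
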
fsspec/implementations/http.py CHANGED
@@ -273,8 +273,12 @@ class HTTPFileSystem(AsyncFileSystem):
          chunk_size=5 * 2**20,
          callback=DEFAULT_CALLBACK,
          method="post",
+         mode="overwrite",
          **kwargs,
      ):
+         if mode != "overwrite":
+             raise NotImplementedError("Exclusive write")
+
          async def gen_chunks():
              # Support passing arbitrary file-like objects
              # and use them instead of streams.
@@ -358,9 +362,10 @@ class HTTPFileSystem(AsyncFileSystem):
          kw = self.kwargs.copy()
          kw["asynchronous"] = self.asynchronous
          kw.update(kwargs)
-         size = size or self.info(path, **kwargs)["size"]
+         info = {}
+         size = size or info.update(self.info(path, **kwargs)) or info["size"]
          session = sync(self.loop, self.set_session)
-         if block_size and size:
+         if block_size and size and info.get("partial", True):
              return HTTPFile(
                  self,
                  path,
@@ -520,9 +525,9 @@ class HTTPFileSystem(AsyncFileSystem):

  class HTTPFile(AbstractBufferedFile):
      """
-     A file-like object pointing to a remove HTTP(S) resource
+     A file-like object pointing to a remote HTTP(S) resource

-     Supports only reading, with read-ahead of a predermined block-size.
+     Supports only reading, with read-ahead of a predetermined block-size.

      In the case that the server does not supply the filesize, only reading of
      the complete file in one go is supported.
@@ -691,25 +696,6 @@ class HTTPFile(AbstractBufferedFile):

      _fetch_range = sync_wrapper(async_fetch_range)

-     def __reduce__(self):
-         return (
-             reopen,
-             (
-                 self.fs,
-                 self.url,
-                 self.mode,
-                 self.blocksize,
-                 self.cache.name if self.cache else "none",
-                 self.size,
-             ),
-         )
-
-
- def reopen(fs, url, mode, blocksize, cache_type, size=None):
-     return fs.open(
-         url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
-     )
-

  magic_check = re.compile("([*[])")

@@ -759,9 +745,6 @@ class HTTPStreamFile(AbstractBufferedFile):
          asyncio.run_coroutine_threadsafe(self._close(), self.loop)
          super().close()

-     def __reduce__(self):
-         return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
-

  class AsyncStreamFile(AbstractAsyncStreamedFile):
      def __init__(
@@ -835,10 +818,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
      async with r:
          r.raise_for_status()

-         # TODO:
-         # recognise lack of 'Accept-Ranges',
-         # or 'Accept-Ranges': 'none' (not 'bytes')
-         # to mean streaming only, no random access => return None
          if "Content-Length" in r.headers:
              # Some servers may choose to ignore Accept-Encoding and return
              # compressed content, in which case the returned size is unreliable.
@@ -853,6 +832,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
          if "Content-Type" in r.headers:
              info["mimetype"] = r.headers["Content-Type"].partition(";")[0]

+         if r.headers.get("Accept-Ranges") == "none":
+             # Some servers may explicitly discourage partial content requests, but
+             # the lack of "Accept-Ranges" does not always indicate they would fail
+             info["partial"] = False
+
          info["url"] = str(r.url)

          for checksum_field in ["ETag", "Content-MD5", "Digest"]:
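
Together these hunks make range support explicit: _file_info records partial=False when a server sends "Accept-Ranges: none", and _open then falls back to a streaming file instead of a random-access HTTPFile (pickling support via __reduce__/reopen was dropped along the way). The observable behavior, roughly (the URL is a placeholder):

    import fsspec

    fs = fsspec.filesystem("http")
    f = fs.open("https://example.com/file.bin")
    # -> HTTPFile (seekable) when the server allows byte ranges,
    #    HTTPStreamFile (sequential only) when it advertises Accept-Ranges: none
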
fsspec/implementations/local.py CHANGED
@@ -60,7 +60,12 @@ class LocalFileSystem(AbstractFileSystem):
          info = self.info(path)
          if info["type"] == "directory":
              with os.scandir(path) as it:
-                 infos = [self.info(f) for f in it]
+                 infos = []
+                 for f in it:
+                     try:
+                         infos.append(self.info(f))
+                     except FileNotFoundError:
+                         pass
          else:
              infos = [info]

fsspec/implementations/memory.py CHANGED
@@ -126,12 +126,13 @@ class MemoryFileSystem(AbstractFileSystem):
          if not exist_ok:
              raise

-     def pipe_file(self, path, value, **kwargs):
+     def pipe_file(self, path, value, mode="overwrite", **kwargs):
          """Set the bytes of given file

          Avoids copies of the data if possible
          """
-         self.open(path, "wb", data=value)
+         mode = "xb" if mode == "create" else "wb"
+         self.open(path, mode=mode, data=value)

      def rmdir(self, path):
          path = self._strip_protocol(path)
@@ -178,6 +179,8 @@ class MemoryFileSystem(AbstractFileSystem):
          **kwargs,
      ):
          path = self._strip_protocol(path)
+         if "x" in mode and self.exists(path):
+             raise FileExistsError
          if path in self.pseudo_dirs:
              raise IsADirectoryError(path)
          parent = path
@@ -197,7 +200,9 @@ class MemoryFileSystem(AbstractFileSystem):
                  return f
              else:
                  raise FileNotFoundError(path)
-         elif mode == "wb":
+         elif mode in {"wb", "xb"}:
+             if mode == "xb" and self.exists(path):
+                 raise FileExistsError
              m = MemoryFile(self, path, kwargs.get("data"))
              if not self._intrans:
                  m.commit()
@@ -248,6 +253,10 @@ class MemoryFileSystem(AbstractFileSystem):
          except KeyError as e:
              raise FileNotFoundError(path) from e

+     def isfile(self, path):
+         path = self._strip_protocol(path)
+         return path in self.store
+
      def rm(self, path, recursive=False, maxdepth=None):
          if isinstance(path, str):
              path = self._strip_protocol(path)
@@ -255,14 +264,14 @@ class MemoryFileSystem(AbstractFileSystem):
              path = [self._strip_protocol(p) for p in path]
          paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
          for p in reversed(paths):
+             if self.isfile(p):
+                 self.rm_file(p)
              # If the expanded path doesn't exist, it is only because the expanded
              # path was a directory that does not exist in self.pseudo_dirs. This
              # is possible if you directly create files without making the
              # directories first.
-             if not self.exists(p):
+             elif not self.exists(p):
                  continue
-             if self.isfile(p):
-                 self.rm_file(p)
              else:
                  self.rmdir(p)
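
The memory filesystem now honors exclusive writes end to end: mode="create" maps to open mode "xb", which refuses to clobber an existing key. A quick demonstration:

    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe_file("/a.bin", b"one", mode="create")   # creates the file
    fs.pipe_file("/a.bin", b"two")                  # default mode="overwrite"
    try:
        fs.pipe_file("/a.bin", b"three", mode="create")
    except FileExistsError:
        print("exclusive create refused to overwrite /a.bin")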