fsspec 2024.9.0__py3-none-any.whl → 2024.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fsspec/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '2024.9.0'
- __version_tuple__ = version_tuple = (2024, 9, 0)
+ __version__ = version = '2024.12.0'
+ __version_tuple__ = version_tuple = (2024, 12, 0)
fsspec/asyn.py CHANGED
@@ -344,6 +344,10 @@ class AsyncFileSystem(AbstractFileSystem):
      async def _cp_file(self, path1, path2, **kwargs):
          raise NotImplementedError

+     async def _mv_file(self, path1, path2):
+         await self._cp_file(path1, path2)
+         await self._rm_file(path1)
+
      async def _copy(
          self,
          path1,
@@ -404,7 +408,7 @@ class AsyncFileSystem(AbstractFileSystem):
                  continue
              raise ex

-     async def _pipe_file(self, path, value, **kwargs):
+     async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
          raise NotImplementedError

      async def _pipe(self, path, value=None, batch_size=None, **kwargs):
@@ -513,7 +517,7 @@ class AsyncFileSystem(AbstractFileSystem):
              coros, batch_size=batch_size, nofiles=True, return_exceptions=True
          )

-     async def _put_file(self, lpath, rpath, **kwargs):
+     async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
          raise NotImplementedError

      async def _put(
@@ -812,11 +816,9 @@ class AsyncFileSystem(AbstractFileSystem):
              p: info
              for p, info in sorted(allpaths.items())
              if pattern.match(
-                 (
-                     p + "/"
-                     if append_slash_to_dirname and info["type"] == "directory"
-                     else p
-                 )
+                 p + "/"
+                 if append_slash_to_dirname and info["type"] == "directory"
+                 else p
              )
          }
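
The new _mv_file default implements move as copy-then-delete, and _pipe_file/_put_file now take a mode argument ("overwrite" or "create") so backends can offer exclusive writes. A minimal sketch of a subclass honoring that convention (MyAsyncFS and _write_bytes are hypothetical; _exists is a real AsyncFileSystem method):

    from fsspec.asyn import AsyncFileSystem

    class MyAsyncFS(AsyncFileSystem):  # hypothetical backend
        async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
            if mode == "create" and await self._exists(path):
                # exclusive write requested and the target already exists
                raise FileExistsError(path)
            await self._write_bytes(path, value)  # hypothetical backend call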
 
fsspec/caching.py CHANGED
@@ -8,6 +8,8 @@ import os
  import threading
  import warnings
  from concurrent.futures import Future, ThreadPoolExecutor
+ from itertools import groupby
+ from operator import itemgetter
  from typing import (
      TYPE_CHECKING,
      Any,
@@ -85,12 +87,7 @@ class BaseCache:
          if self.hit_count == 0 and self.miss_count == 0:
              # a cache that does nothing, this is for logs only
              return ""
-         return " , %s: %d hits, %d misses, %d total requested bytes" % (
-             self.name,
-             self.hit_count,
-             self.miss_count,
-             self.total_requested_bytes,
-         )
+         return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"

      def __repr__(self) -> str:
          # TODO: use rich for better formatting
@@ -161,21 +158,39 @@ class MMapCache(BaseCache):
              return b""
          start_block = start // self.blocksize
          end_block = end // self.blocksize
-         need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
-         hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
-         self.miss_count += len(need)
-         self.hit_count += len(hits)
-         while need:
-             # TODO: not a for loop so we can consolidate blocks later to
-             # make fewer fetch calls; this could be parallel
-             i = need.pop(0)
-
-             sstart = i * self.blocksize
-             send = min(sstart + self.blocksize, self.size)
+         block_range = range(start_block, end_block + 1)
+         # Determine which blocks need to be fetched. This sequence is sorted by construction.
+         need = (i for i in block_range if i not in self.blocks)
+         # Count the number of blocks already cached
+         self.hit_count += sum(1 for i in block_range if i in self.blocks)
+
+         # Consolidate needed blocks.
+         # Algorithm adapted from the Python 2.x itertools documentation.
+         # We group an enumerated sequence of blocks. By comparing the difference
+         # between an ascending range (provided by enumerate) and the needed block
+         # numbers, we can detect when the block number skips values; the key
+         # computes this difference. Whenever the difference changes, we know that
+         # we have previously cached block(s), and a new group is started. In other
+         # words, this algorithm neatly groups runs of consecutive block numbers so
+         # they can be fetched together.
+         for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
+             # Extract the blocks from the enumerated sequence
+             _blocks = tuple(map(itemgetter(1), _blocks))
+             # Compute start of first block
+             sstart = _blocks[0] * self.blocksize
+             # Compute the end of the last block. Last block may not be full size.
+             send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
+
+             # Fetch bytes (could be multiple consecutive blocks)
              self.total_requested_bytes += send - sstart
-             logger.debug(f"MMap get block #{i} ({sstart}-{send})")
+             logger.debug(
+                 f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
+             )
              self.cache[sstart:send] = self.fetcher(sstart, send)
-             self.blocks.add(i)
+
+             # Update set of cached blocks
+             self.blocks.update(_blocks)
+             # Update cache statistics with number of blocks we had to cache
+             self.miss_count += len(_blocks)

          return self.cache[start:end]
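
The consolidation trick above is the classic "group consecutive integers" idiom: enumerate the sorted block numbers and group by index minus value, which is constant within a run. A standalone illustration (the block numbers are made up):

    from itertools import groupby
    from operator import itemgetter

    need = [2, 3, 4, 7, 8, 12]  # blocks missing from the cache
    runs = [
        tuple(map(itemgetter(1), grp))
        for _, grp in groupby(enumerate(need), key=lambda x: x[0] - x[1])
    ]
    print(runs)  # [(2, 3, 4), (7, 8), (12,)] -> three fetches instead of six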
 
fsspec/core.py CHANGED
@@ -329,12 +329,19 @@ def open_files(


  def _un_chain(path, kwargs):
-     x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
-     bits = (
-         [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
-         if "::" in path
-         else [path]
-     )
+     # Avoid a circular import
+     from fsspec.implementations.cached import CachingFileSystem
+
+     if "::" in path:
+         x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+         bits = []
+         for p in path.split("::"):
+             if "://" in p or x.match(p):
+                 bits.append(p)
+             else:
+                 bits.append(p + "://")
+     else:
+         bits = [path]
      # [[url, protocol, kwargs], ...]
      out = []
      previous_bit = None
@@ -346,12 +353,12 @@ def _un_chain(path, kwargs):
          kws = kwargs.pop(protocol, {})
          if bit is bits[0]:
              kws.update(kwargs)
-         kw = dict(**extra_kwargs, **kws)
+         kw = dict(
+             **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
+             **kws,
+         )
          bit = cls._strip_protocol(bit)
-         if (
-             protocol in {"blockcache", "filecache", "simplecache"}
-             and "target_protocol" not in kw
-         ):
+         if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
              bit = previous_bit
          out.append((bit, protocol, kw))
          previous_bit = bit
@@ -578,7 +585,7 @@ def expand_paths_if_needed(paths, mode, num, fs, name_function):
      paths = list(paths)

      if "w" in mode:  # read mode
-         if sum([1 for p in paths if "*" in p]) > 1:
+         if sum(1 for p in paths if "*" in p) > 1:
              raise ValueError(
                  "When writing data, only one filename mask can be specified."
              )
@@ -673,9 +680,7 @@ def get_fs_token_paths(
          elif not isinstance(paths, list):
              paths = list(paths)
      else:
-         if "w" in mode and expand:
-             paths = _expand_paths(paths, name_function, num)
-         elif "x" in mode and expand:
+         if ("w" in mode or "x" in mode) and expand:
              paths = _expand_paths(paths, name_function, num)
          elif "*" in paths:
              paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
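
With caching filesystems now detected by subclass rather than by a hard-coded name set, any CachingFileSystem subclass in a chained URL automatically receives the previous segment as its target. A typical chained open that exercises this path (the URL and cache directory are placeholders):

    import fsspec

    # "filecache" resolves to a CachingFileSystem subclass, so _un_chain
    # wires the "https" segment in as its target.
    with fsspec.open(
        "filecache::https://example.com/data.csv",
        filecache={"cache_storage": "/tmp/fsspec-cache"},
    ) as f:
        header = f.readline()
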
fsspec/implementations/asyn_wrapper.py ADDED
@@ -0,0 +1,98 @@
+ import asyncio
+ import functools
+ import inspect
+
+ from fsspec.asyn import AsyncFileSystem
+
+
+ def async_wrapper(func, obj=None):
+     """
+     Wraps a synchronous function to make it awaitable.
+
+     Parameters
+     ----------
+     func : callable
+         The synchronous function to wrap.
+     obj : object, optional
+         The instance to bind the function to, if applicable.
+
+     Returns
+     -------
+     coroutine
+         An awaitable version of the function.
+     """
+
+     @functools.wraps(func)
+     async def wrapper(*args, **kwargs):
+         return await asyncio.to_thread(func, *args, **kwargs)
+
+     return wrapper
+
+
+ class AsyncFileSystemWrapper(AsyncFileSystem):
+     """
+     A wrapper class to convert a synchronous filesystem into an asynchronous one.
+
+     This class takes an existing synchronous filesystem implementation and wraps all
+     its methods to provide an asynchronous interface.
+
+     Parameters
+     ----------
+     sync_fs : AbstractFileSystem
+         The synchronous filesystem instance to wrap.
+     """
+
+     def __init__(self, sync_fs, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.asynchronous = True
+         self.sync_fs = sync_fs
+         self.protocol = self.sync_fs.protocol
+         self._wrap_all_sync_methods()
+
+     @property
+     def fsid(self):
+         return f"async_{self.sync_fs.fsid}"
+
+     def _wrap_all_sync_methods(self):
+         """
+         Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
+         """
+         for method_name in dir(self.sync_fs):
+             if method_name.startswith("_"):
+                 continue
+
+             attr = inspect.getattr_static(self.sync_fs, method_name)
+             if isinstance(attr, property):
+                 continue
+
+             method = getattr(self.sync_fs, method_name)
+             if callable(method) and not asyncio.iscoroutinefunction(method):
+                 async_method = async_wrapper(method, obj=self)
+                 setattr(self, f"_{method_name}", async_method)
+
+     @classmethod
+     def wrap_class(cls, sync_fs_class):
+         """
+         Create a new class that can be used to instantiate an AsyncFileSystemWrapper
+         with lazy instantiation of the underlying synchronous filesystem.
+
+         Parameters
+         ----------
+         sync_fs_class : type
+             The class of the synchronous filesystem to wrap.
+
+         Returns
+         -------
+         type
+             A new class that wraps the provided synchronous filesystem class.
+         """
+
+         class GeneratedAsyncFileSystemWrapper(cls):
+             def __init__(self, *args, **kwargs):
+                 sync_fs = sync_fs_class(*args, **kwargs)
+                 super().__init__(sync_fs)
+
+         GeneratedAsyncFileSystemWrapper.__name__ = (
+             f"Async{sync_fs_class.__name__}Wrapper"
+         )
+         return GeneratedAsyncFileSystemWrapper
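
This new module converts any synchronous filesystem into an async one by pushing each public method onto a worker thread via asyncio.to_thread. A possible usage sketch (assuming the module lands at fsspec.implementations.asyn_wrapper, as above; the file path is a placeholder):

    import asyncio
    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
    from fsspec.implementations.local import LocalFileSystem

    async def main():
        afs = AsyncFileSystemWrapper(LocalFileSystem())
        # wrapped methods follow fsspec's async naming convention: cat_file -> _cat_file
        data = await afs._cat_file("/etc/hostname")
        print(data)

    asyncio.run(main())
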
fsspec/implementations/cached.py CHANGED
@@ -612,7 +612,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
          **kwargs,
      ):
          paths = self.expand_path(
-             path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+             path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
          )
          getpaths = []
          storepaths = []
fsspec/implementations/dirfs.py CHANGED
@@ -370,3 +370,15 @@ class DirFileSystem(AsyncFileSystem):
              *args,
              **kwargs,
          )
+
+     async def open_async(
+         self,
+         path,
+         *args,
+         **kwargs,
+     ):
+         return await self.fs.open_async(
+             self._join(path),
+             *args,
+             **kwargs,
+         )
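
DirFileSystem previously forwarded only the synchronous open; this addition delegates open_async to the wrapped filesystem after prefixing the path. A sketch of how it might be used over an async backend that implements open_async, such as HTTP (the URL is a placeholder):

    import asyncio
    from fsspec.implementations.dirfs import DirFileSystem
    from fsspec.implementations.http import HTTPFileSystem

    async def main():
        fs = DirFileSystem(
            path="https://example.com/data",
            fs=HTTPFileSystem(asynchronous=True),
        )
        f = await fs.open_async("raw/file.bin")  # opens .../data/raw/file.bin
        try:
            payload = await f.read()
        finally:
            await f.close()

    asyncio.run(main())
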
fsspec/implementations/ftp.py CHANGED
@@ -387,7 +387,7 @@ def _mlsd2(ftp, path="."):
                  "size": split_line[4],
              },
          )
-         if "d" == this[1]["unix.mode"][0]:
+         if this[1]["unix.mode"][0] == "d":
              this[1]["type"] = "dir"
          else:
              this[1]["type"] = "file"
fsspec/implementations/git.py CHANGED
@@ -55,6 +55,8 @@ class GitFileSystem(AbstractFileSystem):
          tree = comm.tree
          for part in parts:
              if part and isinstance(tree, pygit2.Tree):
+                 if part not in tree:
+                     raise FileNotFoundError(path)
                  tree = tree[part]
          return tree

@@ -69,46 +71,32 @@ class GitFileSystem(AbstractFileSystem):
              out["ref"], path = path.split("@", 1)
          return out

+     @staticmethod
+     def _object_to_info(obj, path=None):
+         # obj.name and obj.filemode are None for the root tree!
+         is_dir = isinstance(obj, pygit2.Tree)
+         return {
+             "type": "directory" if is_dir else "file",
+             "name": (
+                 "/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
+             ),
+             "hex": str(obj.id),
+             "mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
+             "size": 0 if is_dir else obj.size,
+         }
+
      def ls(self, path, detail=True, ref=None, **kwargs):
-         path = self._strip_protocol(path)
-         tree = self._path_to_object(path, ref)
-         if isinstance(tree, pygit2.Tree):
-             out = []
-             for obj in tree:
-                 if isinstance(obj, pygit2.Tree):
-                     out.append(
-                         {
-                             "type": "directory",
-                             "name": "/".join([path, obj.name]).lstrip("/"),
-                             "hex": obj.hex,
-                             "mode": f"{obj.filemode:o}",
-                             "size": 0,
-                         }
-                     )
-                 else:
-                     out.append(
-                         {
-                             "type": "file",
-                             "name": "/".join([path, obj.name]).lstrip("/"),
-                             "hex": obj.hex,
-                             "mode": f"{obj.filemode:o}",
-                             "size": obj.size,
-                         }
-                     )
-         else:
-             obj = tree
-             out = [
-                 {
-                     "type": "file",
-                     "name": obj.name,
-                     "hex": obj.hex,
-                     "mode": f"{obj.filemode:o}",
-                     "size": obj.size,
-                 }
-             ]
-         if detail:
-             return out
-         return [o["name"] for o in out]
+         tree = self._path_to_object(self._strip_protocol(path), ref)
+         return [
+             GitFileSystem._object_to_info(obj, path)
+             if detail
+             else GitFileSystem._object_to_info(obj, path)["name"]
+             for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
+         ]
+
+     def info(self, path, ref=None, **kwargs):
+         tree = self._path_to_object(self._strip_protocol(path), ref)
+         return GitFileSystem._object_to_info(tree, path)

      def ukey(self, path, ref=None):
          return self.info(path, ref=ref)["hex"]
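
The refactor funnels all metadata through the new _object_to_info helper, adds a direct info(), and turns a lookup of a missing path into FileNotFoundError instead of a pygit2 KeyError. Typical calls against a local repository (the repo path, ref, and file names are placeholders):

    from fsspec.implementations.git import GitFileSystem

    fs = GitFileSystem("/path/to/repo", ref="main")
    print(fs.info("README.md"))        # {'type': 'file', 'name': 'README.md', ...}
    print(fs.ls("src", detail=False))  # just the names
    fs.info("no/such/file")            # now raises FileNotFoundError
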
fsspec/implementations/http.py CHANGED
@@ -273,8 +273,12 @@ class HTTPFileSystem(AsyncFileSystem):
          chunk_size=5 * 2**20,
          callback=DEFAULT_CALLBACK,
          method="post",
+         mode="overwrite",
          **kwargs,
      ):
+         if mode != "overwrite":
+             raise NotImplementedError("Exclusive write")
+
          async def gen_chunks():
              # Support passing arbitrary file-like objects
              # and use them instead of streams.
@@ -358,9 +362,10 @@ class HTTPFileSystem(AsyncFileSystem):
          kw = self.kwargs.copy()
          kw["asynchronous"] = self.asynchronous
          kw.update(kwargs)
-         size = size or self.info(path, **kwargs)["size"]
+         info = {}
+         size = size or info.update(self.info(path, **kwargs)) or info["size"]
          session = sync(self.loop, self.set_session)
-         if block_size and size:
+         if block_size and size and info.get("partial", True):
              return HTTPFile(
                  self,
                  path,
@@ -520,9 +525,9 @@ class HTTPFileSystem(AsyncFileSystem):

  class HTTPFile(AbstractBufferedFile):
      """
-     A file-like object pointing to a remove HTTP(S) resource
+     A file-like object pointing to a remote HTTP(S) resource

-     Supports only reading, with read-ahead of a predermined block-size.
+     Supports only reading, with read-ahead of a predetermined block-size.

      In the case that the server does not supply the filesize, only reading of
      the complete file in one go is supported.
@@ -691,25 +696,6 @@ class HTTPFile(AbstractBufferedFile):

      _fetch_range = sync_wrapper(async_fetch_range)

-     def __reduce__(self):
-         return (
-             reopen,
-             (
-                 self.fs,
-                 self.url,
-                 self.mode,
-                 self.blocksize,
-                 self.cache.name if self.cache else "none",
-                 self.size,
-             ),
-         )
-
-
- def reopen(fs, url, mode, blocksize, cache_type, size=None):
-     return fs.open(
-         url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
-     )
-

  magic_check = re.compile("([*[])")

@@ -759,9 +745,6 @@ class HTTPStreamFile(AbstractBufferedFile):
          asyncio.run_coroutine_threadsafe(self._close(), self.loop)
          super().close()

-     def __reduce__(self):
-         return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
-

  class AsyncStreamFile(AbstractAsyncStreamedFile):
      def __init__(
@@ -835,10 +818,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
      async with r:
          r.raise_for_status()

-         # TODO:
-         # recognise lack of 'Accept-Ranges',
-         # or 'Accept-Ranges': 'none' (not 'bytes')
-         # to mean streaming only, no random access => return None
          if "Content-Length" in r.headers:
              # Some servers may choose to ignore Accept-Encoding and return
              # compressed content, in which case the returned size is unreliable.
@@ -853,6 +832,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
          if "Content-Type" in r.headers:
              info["mimetype"] = r.headers["Content-Type"].partition(";")[0]

+         if r.headers.get("Accept-Ranges") == "none":
+             # Some servers may explicitly discourage partial content requests, but
+             # the lack of "Accept-Ranges" does not always indicate they would fail
+             info["partial"] = False
+
          info["url"] = str(r.url)

          for checksum_field in ["ETag", "Content-MD5", "Digest"]:
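
Together these hunks make range support explicit: _file_info records partial=False when a server sends "Accept-Ranges: none", and _open then falls back to a streaming file instead of a random-access HTTPFile (pickling support via __reduce__/reopen was dropped along the way). The observable behavior, roughly (the URL is a placeholder):

    import fsspec

    fs = fsspec.filesystem("http")
    f = fs.open("https://example.com/file.bin")
    # -> HTTPFile (seekable) when the server allows byte ranges,
    #    HTTPStreamFile (sequential only) when it advertises Accept-Ranges: none
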
fsspec/implementations/local.py CHANGED
@@ -60,7 +60,12 @@ class LocalFileSystem(AbstractFileSystem):
          info = self.info(path)
          if info["type"] == "directory":
              with os.scandir(path) as it:
-                 infos = [self.info(f) for f in it]
+                 infos = []
+                 for f in it:
+                     try:
+                         infos.append(self.info(f))
+                     except FileNotFoundError:
+                         pass
          else:
              infos = [info]

fsspec/implementations/memory.py CHANGED
@@ -126,12 +126,13 @@ class MemoryFileSystem(AbstractFileSystem):
          if not exist_ok:
              raise

-     def pipe_file(self, path, value, **kwargs):
+     def pipe_file(self, path, value, mode="overwrite", **kwargs):
          """Set the bytes of given file

          Avoids copies of the data if possible
          """
-         self.open(path, "wb", data=value)
+         mode = "xb" if mode == "create" else "wb"
+         self.open(path, mode=mode, data=value)

      def rmdir(self, path):
          path = self._strip_protocol(path)
@@ -178,6 +179,8 @@ class MemoryFileSystem(AbstractFileSystem):
          **kwargs,
      ):
          path = self._strip_protocol(path)
+         if "x" in mode and self.exists(path):
+             raise FileExistsError
          if path in self.pseudo_dirs:
              raise IsADirectoryError(path)
          parent = path
@@ -197,7 +200,9 @@ class MemoryFileSystem(AbstractFileSystem):
                  return f
              else:
                  raise FileNotFoundError(path)
-         elif mode == "wb":
+         elif mode in {"wb", "xb"}:
+             if mode == "xb" and self.exists(path):
+                 raise FileExistsError
              m = MemoryFile(self, path, kwargs.get("data"))
              if not self._intrans:
                  m.commit()
@@ -248,6 +253,10 @@ class MemoryFileSystem(AbstractFileSystem):
          except KeyError as e:
              raise FileNotFoundError(path) from e

+     def isfile(self, path):
+         path = self._strip_protocol(path)
+         return path in self.store
+
      def rm(self, path, recursive=False, maxdepth=None):
          if isinstance(path, str):
              path = self._strip_protocol(path)
@@ -255,14 +264,14 @@ class MemoryFileSystem(AbstractFileSystem):
              path = [self._strip_protocol(p) for p in path]
          paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
          for p in reversed(paths):
+             if self.isfile(p):
+                 self.rm_file(p)
              # If the expanded path doesn't exist, it is only because the expanded
              # path was a directory that does not exist in self.pseudo_dirs. This
              # is possible if you directly create files without making the
              # directories first.
-             if not self.exists(p):
+             elif not self.exists(p):
                  continue
-             if self.isfile(p):
-                 self.rm_file(p)
              else:
                  self.rmdir(p)
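
The memory filesystem now honors exclusive writes end to end: mode="create" maps to open mode "xb", which refuses to clobber an existing key. A quick demonstration:

    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe_file("/a.bin", b"one", mode="create")   # creates the file
    fs.pipe_file("/a.bin", b"two")                  # default mode="overwrite"
    try:
        fs.pipe_file("/a.bin", b"three", mode="create")
    except FileExistsError:
        print("exclusive create refused to overwrite /a.bin")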