fsspec 2024.10.0__py3-none-any.whl → 2025.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fsspec/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '2024.10.0'
-__version_tuple__ = version_tuple = (2024, 10, 0)
+__version__ = version = '2025.2.0'
+__version_tuple__ = version_tuple = (2025, 2, 0)
fsspec/archive.py CHANGED
@@ -1,3 +1,5 @@
+import operator
+
 from fsspec import AbstractFileSystem
 from fsspec.utils import tokenize
 
@@ -67,7 +69,7 @@ class AbstractArchiveFileSystem(AbstractFileSystem):
                     out = {"name": ppath, "size": 0, "type": "directory"}
                     paths[ppath] = out
         if detail:
-            out = sorted(paths.values(), key=lambda _: _["name"])
+            out = sorted(paths.values(), key=operator.itemgetter("name"))
             return out
         else:
             return sorted(paths)
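
For context, `operator.itemgetter` performs the same key lookup as the removed lambda; a quick standalone illustration (sample data made up):

```python
import operator

entries = [{"name": "b.txt", "size": 1}, {"name": "a.txt", "size": 2}]

# equivalent to key=lambda e: e["name"], without a throwaway lambda
print(sorted(entries, key=operator.itemgetter("name")))
# -> [{'name': 'a.txt', 'size': 2}, {'name': 'b.txt', 'size': 1}]
```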
fsspec/asyn.py CHANGED
@@ -408,7 +408,7 @@ class AsyncFileSystem(AbstractFileSystem):
                 continue
             raise ex
 
-    async def _pipe_file(self, path, value, **kwargs):
+    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
         raise NotImplementedError
 
     async def _pipe(self, path, value=None, batch_size=None, **kwargs):
@@ -517,7 +517,7 @@ class AsyncFileSystem(AbstractFileSystem):
             coros, batch_size=batch_size, nofiles=True, return_exceptions=True
         )
 
-    async def _put_file(self, lpath, rpath, **kwargs):
+    async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
         raise NotImplementedError
 
     async def _put(
@@ -816,11 +816,9 @@ class AsyncFileSystem(AbstractFileSystem):
             p: info
             for p, info in sorted(allpaths.items())
             if pattern.match(
-                (
-                    p + "/"
-                    if append_slash_to_dirname and info["type"] == "directory"
-                    else p
-                )
+                p + "/"
+                if append_slash_to_dirname and info["type"] == "directory"
+                else p
             )
         }
 
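
The new `mode` keyword on `_pipe_file`/`_put_file` distinguishes plain overwrite from exclusive create. A minimal sketch of how a subclass might honor it; `DemoFileSystem` and its dict-backed store are hypothetical, not part of fsspec:

```python
from fsspec.asyn import AsyncFileSystem


class DemoFileSystem(AsyncFileSystem):
    """Hypothetical async filesystem honoring the new ``mode`` keyword."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.store = {}  # path -> bytes

    async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
        if mode == "create" and path in self.store:
            # exclusive create: refuse to clobber an existing file
            raise FileExistsError(path)
        self.store[path] = bytes(value)
```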
fsspec/caching.py CHANGED
@@ -8,6 +8,8 @@ import os
 import threading
 import warnings
 from concurrent.futures import Future, ThreadPoolExecutor
+from itertools import groupby
+from operator import itemgetter
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -85,12 +87,7 @@ class BaseCache:
         if self.hit_count == 0 and self.miss_count == 0:
             # a cache that does nothing, this is for logs only
             return ""
-        return " , %s: %d hits, %d misses, %d total requested bytes" % (
-            self.name,
-            self.hit_count,
-            self.miss_count,
-            self.total_requested_bytes,
-        )
+        return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"
 
     def __repr__(self) -> str:
         # TODO: use rich for better formatting
@@ -161,21 +158,39 @@ class MMapCache(BaseCache):
             return b""
         start_block = start // self.blocksize
         end_block = end // self.blocksize
-        need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
-        hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
-        self.miss_count += len(need)
-        self.hit_count += len(hits)
-        while need:
-            # TODO: not a for loop so we can consolidate blocks later to
-            # make fewer fetch calls; this could be parallel
-            i = need.pop(0)
-
-            sstart = i * self.blocksize
-            send = min(sstart + self.blocksize, self.size)
+        block_range = range(start_block, end_block + 1)
+        # Determine which blocks need to be fetched. This sequence is sorted by construction.
+        need = (i for i in block_range if i not in self.blocks)
+        # Count the number of blocks already cached
+        self.hit_count += sum(1 for i in block_range if i in self.blocks)
+
+        # Consolidate needed blocks.
+        # Algorithm adapted from the Python 2.x itertools documentation.
+        # We group an enumerated sequence of needed blocks: within a run of
+        # consecutive block numbers, the difference between the ascending
+        # count from enumerate and the block number is constant; the key
+        # computes this difference. Whenever it changes, the block numbers
+        # have skipped over previously cached block(s) and a new group starts.
+        # This neatly groups runs of consecutive blocks for fetching together.
+        for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
+            # Extract the blocks from the enumerated sequence
+            _blocks = tuple(map(itemgetter(1), _blocks))
+            # Compute start of first block
+            sstart = _blocks[0] * self.blocksize
+            # Compute the end of the last block. Last block may not be full size.
+            send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
+
+            # Fetch bytes (could be multiple consecutive blocks)
             self.total_requested_bytes += send - sstart
-            logger.debug(f"MMap get block #{i} ({sstart}-{send})")
+            logger.debug(
+                f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
+            )
             self.cache[sstart:send] = self.fetcher(sstart, send)
-            self.blocks.add(i)
+
+            # Update set of cached blocks
+            self.blocks.update(_blocks)
+            # Update cache statistics with number of blocks we had to cache
+            self.miss_count += len(_blocks)
 
         return self.cache[start:end]
 
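
The run-consolidation idiom above is easier to see in isolation; a standalone sketch with made-up block numbers:

```python
from itertools import groupby
from operator import itemgetter

need = [3, 4, 5, 9, 10, 14]  # sorted block numbers still missing from cache

# enumerate pairs each block with an ascending index; within a run of
# consecutive blocks, index - block stays constant, so groupby splits on gaps
runs = [
    tuple(map(itemgetter(1), grp))
    for _, grp in groupby(enumerate(need), key=lambda x: x[0] - x[1])
]
print(runs)  # [(3, 4, 5), (9, 10), (14,)]
```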
fsspec/core.py CHANGED
@@ -329,12 +329,19 @@ def open_files(
 
 
 def _un_chain(path, kwargs):
-    x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
-    bits = (
-        [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
-        if "::" in path
-        else [path]
-    )
+    # Avoid a circular import
+    from fsspec.implementations.cached import CachingFileSystem
+
+    if "::" in path:
+        x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+        bits = []
+        for p in path.split("::"):
+            if "://" in p or x.match(p):
+                bits.append(p)
+            else:
+                bits.append(p + "://")
+    else:
+        bits = [path]
     # [[url, protocol, kwargs], ...]
     out = []
     previous_bit = None
@@ -351,10 +358,7 @@ def _un_chain(path, kwargs):
             **kws,
         )
         bit = cls._strip_protocol(bit)
-        if (
-            protocol in {"blockcache", "filecache", "simplecache"}
-            and "target_protocol" not in kw
-        ):
+        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
             bit = previous_bit
         out.append((bit, protocol, kw))
         previous_bit = bit
@@ -676,9 +680,7 @@ def get_fs_token_paths(
     elif not isinstance(paths, list):
         paths = list(paths)
     else:
-        if "w" in mode and expand:
-            paths = _expand_paths(paths, name_function, num)
-        elif "x" in mode and expand:
+        if ("w" in mode or "x" in mode) and expand:
             paths = _expand_paths(paths, name_function, num)
         elif "*" in paths:
             paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
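
For reference, `_un_chain` is what resolves chained URLs such as the one below, and the `issubclass(cls, CachingFileSystem)` test now recognizes any caching subclass rather than only the three built-in protocol names. A usage sketch (URL and cache directory are made up):

```python
import fsspec

# "simplecache::" is split off by _un_chain and wired to the target protocol;
# per-layer options are passed as dicts keyed by protocol name
with fsspec.open(
    "simplecache::https://example.com/data.csv",
    simplecache={"cache_storage": "/tmp/fsspec-demo-cache"},
) as f:
    header = f.readline()
```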
fsspec/implementations/asyn_wrapper.py ADDED
@@ -0,0 +1,99 @@
+import asyncio
+import functools
+import inspect
+
+from fsspec.asyn import AsyncFileSystem
+
+
+def async_wrapper(func, obj=None):
+    """
+    Wraps a synchronous function to make it awaitable.
+
+    Parameters
+    ----------
+    func : callable
+        The synchronous function to wrap.
+    obj : object, optional
+        The instance to bind the function to, if applicable.
+
+    Returns
+    -------
+    coroutine
+        An awaitable version of the function.
+    """
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        return await asyncio.to_thread(func, *args, **kwargs)
+
+    return wrapper
+
+
+class AsyncFileSystemWrapper(AsyncFileSystem):
+    """
+    A wrapper class to convert a synchronous filesystem into an asynchronous one.
+
+    This class takes an existing synchronous filesystem implementation and wraps all
+    its methods to provide an asynchronous interface.
+
+    Parameters
+    ----------
+    sync_fs : AbstractFileSystem
+        The synchronous filesystem instance to wrap.
+    """
+
+    def __init__(self, sync_fs, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.asynchronous = True
+        self.sync_fs = sync_fs
+        self.protocol = self.sync_fs.protocol
+        self._wrap_all_sync_methods()
+
+    @property
+    def fsid(self):
+        return f"async_{self.sync_fs.fsid}"
+
+    def _wrap_all_sync_methods(self):
+        """
+        Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
+        """
+        excluded_methods = {"open"}
+        for method_name in dir(self.sync_fs):
+            if method_name.startswith("_") or method_name in excluded_methods:
+                continue
+
+            attr = inspect.getattr_static(self.sync_fs, method_name)
+            if isinstance(attr, property):
+                continue
+
+            method = getattr(self.sync_fs, method_name)
+            if callable(method) and not asyncio.iscoroutinefunction(method):
+                async_method = async_wrapper(method, obj=self)
+                setattr(self, f"_{method_name}", async_method)
+
+    @classmethod
+    def wrap_class(cls, sync_fs_class):
+        """
+        Create a new class that can be used to instantiate an AsyncFileSystemWrapper
+        with lazy instantiation of the underlying synchronous filesystem.
+
+        Parameters
+        ----------
+        sync_fs_class : type
+            The class of the synchronous filesystem to wrap.
+
+        Returns
+        -------
+        type
+            A new class that wraps the provided synchronous filesystem class.
+        """
+
+        class GeneratedAsyncFileSystemWrapper(cls):
+            def __init__(self, *args, **kwargs):
+                sync_fs = sync_fs_class(*args, **kwargs)
+                super().__init__(sync_fs)
+
+        GeneratedAsyncFileSystemWrapper.__name__ = (
+            f"Async{sync_fs_class.__name__}Wrapper"
+        )
+        return GeneratedAsyncFileSystemWrapper
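
A short usage sketch for the new wrapper, driving the synchronous `LocalFileSystem` from async code (the temp path is made up):

```python
import asyncio

from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
from fsspec.implementations.local import LocalFileSystem


async def main():
    afs = AsyncFileSystemWrapper(LocalFileSystem())
    # each wrapped call runs the sync method in a worker thread
    await afs._pipe_file("/tmp/asyn_wrapper_demo.txt", b"hello")
    print(await afs._cat_file("/tmp/asyn_wrapper_demo.txt"))  # b'hello'


asyncio.run(main())
```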
fsspec/implementations/cached.py CHANGED
@@ -612,7 +612,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
         **kwargs,
     ):
         paths = self.expand_path(
-            path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+            path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
         )
         getpaths = []
         storepaths = []
fsspec/implementations/dbfs.py CHANGED
@@ -412,9 +412,9 @@ class DatabricksFile(AbstractBufferedFile):
         if block_size is None or block_size == "default":
             block_size = self.DEFAULT_BLOCK_SIZE
 
-        assert (
-            block_size == self.DEFAULT_BLOCK_SIZE
-        ), f"Only the default block size is allowed, not {block_size}"
+        assert block_size == self.DEFAULT_BLOCK_SIZE, (
+            f"Only the default block size is allowed, not {block_size}"
+        )
 
         super().__init__(
             fs,
fsspec/implementations/ftp.py CHANGED
@@ -387,7 +387,7 @@ def _mlsd2(ftp, path="."):
                 "size": split_line[4],
             },
         )
-        if "d" == this[1]["unix.mode"][0]:
+        if this[1]["unix.mode"][0] == "d":
             this[1]["type"] = "dir"
         else:
             this[1]["type"] = "file"
fsspec/implementations/http.py CHANGED
@@ -273,8 +273,12 @@ class HTTPFileSystem(AsyncFileSystem):
         chunk_size=5 * 2**20,
         callback=DEFAULT_CALLBACK,
         method="post",
+        mode="overwrite",
         **kwargs,
     ):
+        if mode != "overwrite":
+            raise NotImplementedError("Exclusive write")
+
         async def gen_chunks():
             # Support passing arbitrary file-like objects
             # and use them instead of streams.
@@ -692,25 +696,6 @@ class HTTPFile(AbstractBufferedFile):
 
     _fetch_range = sync_wrapper(async_fetch_range)
 
-    def __reduce__(self):
-        return (
-            reopen,
-            (
-                self.fs,
-                self.url,
-                self.mode,
-                self.blocksize,
-                self.cache.name if self.cache else "none",
-                self.size,
-            ),
-        )
-
-
-def reopen(fs, url, mode, blocksize, cache_type, size=None):
-    return fs.open(
-        url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
-    )
-
 
 magic_check = re.compile("([*[])")
 
@@ -760,9 +745,6 @@ class HTTPStreamFile(AbstractBufferedFile):
         asyncio.run_coroutine_threadsafe(self._close(), self.loop)
         super().close()
 
-    def __reduce__(self):
-        return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
-
 
 class AsyncStreamFile(AbstractAsyncStreamedFile):
     def __init__(
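
HTTP has no natural exclusive-create semantics, so `_put_file` accepts the new `mode` argument but implements only `"overwrite"`, failing fast otherwise. A sketch of the observable behavior (URL and local file are made up):

```python
import fsspec

fs = fsspec.filesystem("http")
try:
    fs.put_file("report.bin", "https://example.com/upload", mode="create")
except NotImplementedError:
    print("exclusive create is not supported over HTTP")
```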
fsspec/implementations/local.py CHANGED
@@ -60,7 +60,12 @@ class LocalFileSystem(AbstractFileSystem):
         info = self.info(path)
         if info["type"] == "directory":
             with os.scandir(path) as it:
-                infos = [self.info(f) for f in it]
+                infos = []
+                for f in it:
+                    try:
+                        infos.append(self.info(f))
+                    except FileNotFoundError:
+                        pass
         else:
             infos = [info]
 
fsspec/implementations/memory.py CHANGED
@@ -126,12 +126,13 @@ class MemoryFileSystem(AbstractFileSystem):
             if not exist_ok:
                 raise
 
-    def pipe_file(self, path, value, **kwargs):
+    def pipe_file(self, path, value, mode="overwrite", **kwargs):
         """Set the bytes of given file
 
         Avoids copies of the data if possible
         """
-        self.open(path, "wb", data=value)
+        mode = "xb" if mode == "create" else "wb"
+        self.open(path, mode=mode, data=value)
 
     def rmdir(self, path):
         path = self._strip_protocol(path)
@@ -178,6 +179,8 @@ class MemoryFileSystem(AbstractFileSystem):
         **kwargs,
     ):
         path = self._strip_protocol(path)
+        if "x" in mode and self.exists(path):
+            raise FileExistsError
         if path in self.pseudo_dirs:
             raise IsADirectoryError(path)
         parent = path
@@ -197,7 +200,9 @@ class MemoryFileSystem(AbstractFileSystem):
                 return f
             else:
                 raise FileNotFoundError(path)
-        elif mode == "wb":
+        elif mode in {"wb", "xb"}:
+            if mode == "xb" and self.exists(path):
+                raise FileExistsError
             m = MemoryFile(self, path, kwargs.get("data"))
             if not self._intrans:
                 m.commit()
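
With the memory filesystem honoring exclusive-create, a second `mode="create"` write to the same path now raises; a quick sketch:

```python
import fsspec

fs = fsspec.filesystem("memory")
fs.pipe_file("/demo.bin", b"first", mode="create")  # ok: path is new
try:
    fs.pipe_file("/demo.bin", b"second", mode="create")
except FileExistsError:
    print("refused to overwrite existing file")
```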
@@ -5,11 +5,12 @@ import itertools
5
5
  import logging
6
6
  import math
7
7
  import os
8
- from itertools import chain
9
8
  from functools import lru_cache
9
+ from itertools import chain
10
10
  from typing import TYPE_CHECKING, Literal
11
11
 
12
12
  import fsspec.core
13
+ from fsspec.spec import AbstractBufferedFile
13
14
 
14
15
  try:
15
16
  import ujson as json
@@ -20,6 +21,7 @@ except ImportError:
20
21
  from fsspec.asyn import AsyncFileSystem
21
22
  from fsspec.callbacks import DEFAULT_CALLBACK
22
23
  from fsspec.core import filesystem, open, split_protocol
24
+ from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
23
25
  from fsspec.utils import isfilelike, merge_offset_ranges, other_paths
24
26
 
25
27
  logger = logging.getLogger("fsspec.reference")
@@ -41,7 +43,7 @@ def _first(d):
41
43
 
42
44
  def _prot_in_references(path, references):
43
45
  ref = references.get(path)
44
- if isinstance(ref, (list, tuple)):
46
+ if isinstance(ref, (list, tuple)) and isinstance(ref[0], str):
45
47
  return split_protocol(ref[0])[0] if ref[0] else ref[0]
46
48
 
47
49
 
@@ -173,8 +175,11 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
173
175
  """cached parquet file loader"""
174
176
  path = self.url.format(field=field, record=record)
175
177
  data = io.BytesIO(self.fs.cat_file(path))
176
- df = self.pd.read_parquet(data, engine=self.engine)
177
- refs = {c: df[c].to_numpy() for c in df.columns}
178
+ try:
179
+ df = self.pd.read_parquet(data, engine=self.engine)
180
+ refs = {c: df[c].to_numpy() for c in df.columns}
181
+ except OSError:
182
+ refs = None
178
183
  return refs
179
184
 
180
185
  self.open_refs = open_refs
@@ -390,10 +395,14 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
390
395
  self.write(field, record)
391
396
  else:
392
397
  # metadata or top-level
393
- self._items[key] = value
394
- new_value = json.loads(
395
- value.decode() if isinstance(value, bytes) else value
396
- )
398
+ if hasattr(value, "to_bytes"):
399
+ val = value.to_bytes().decode()
400
+ elif isinstance(value, bytes):
401
+ val = value.decode()
402
+ else:
403
+ val = value
404
+ self._items[key] = val
405
+ new_value = json.loads(val)
397
406
  self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
398
407
 
399
408
  @staticmethod
@@ -428,7 +437,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
428
437
  if len(partition) < self.record_size:
429
438
  try:
430
439
  original = self.open_refs(field, record)
431
- except IOError:
440
+ except OSError:
432
441
  pass
433
442
 
434
443
  if original:
@@ -591,8 +600,7 @@ class ReferenceFileSystem(AsyncFileSystem):
591
600
  async, and must allow start and end args in _cat_file. Later versions
592
601
  may allow multiple arbitrary URLs for the targets.
593
602
  This FileSystem is read-only. It is designed to be used with async
594
- targets (for now). This FileSystem only allows whole-file access, no
595
- ``open``. We do not get original file details from the target FS.
603
+ targets (for now). We do not get original file details from the target FS.
596
604
  Configuration is by passing a dict of references at init, or a URL to
597
605
  a JSON file containing the same; this dict
598
606
  can also contain concrete data for some set of paths.
@@ -602,6 +610,7 @@ class ReferenceFileSystem(AsyncFileSystem):
602
610
  """
603
611
 
604
612
  protocol = "reference"
613
+ cachable = False
605
614
 
606
615
  def __init__(
607
616
  self,
@@ -754,6 +763,15 @@ class ReferenceFileSystem(AsyncFileSystem):
754
763
  self.fss[remote_protocol] = fs
755
764
 
756
765
  self.fss[None] = fs or filesystem("file") # default one
766
+ # Wrap any non-async filesystems to ensure async methods are available below
767
+ for k, f in self.fss.items():
768
+ if not f.async_impl:
769
+ self.fss[k] = AsyncFileSystemWrapper(f)
770
+ elif self.asynchronous ^ f.asynchronous:
771
+ raise ValueError(
772
+ "Reference-FS's target filesystem must have same value"
773
+ "of asynchronous"
774
+ )
757
775
 
758
776
  def _cat_common(self, path, start=None, end=None):
759
777
  path = self._strip_protocol(path)
@@ -764,6 +782,8 @@ class ReferenceFileSystem(AsyncFileSystem):
764
782
  raise FileNotFoundError(path) from exc
765
783
  if isinstance(part, str):
766
784
  part = part.encode()
785
+ if hasattr(part, "to_bytes"):
786
+ part = part.to_bytes()
767
787
  if isinstance(part, bytes):
768
788
  logger.debug(f"Reference: {path}, type bytes")
769
789
  if part.startswith(b"base64:"):
@@ -803,7 +823,9 @@ class ReferenceFileSystem(AsyncFileSystem):
803
823
  return part_or_url[start:end]
804
824
  protocol, _ = split_protocol(part_or_url)
805
825
  try:
806
- await self.fss[protocol]._cat_file(part_or_url, start=start, end=end)
826
+ return await self.fss[protocol]._cat_file(
827
+ part_or_url, start=start0, end=end0
828
+ )
807
829
  except Exception as e:
808
830
  raise ReferenceNotReachable(path, part_or_url) from e
809
831
 
@@ -871,6 +893,9 @@ class ReferenceFileSystem(AsyncFileSystem):
871
893
  # found and on_error is "raise"
872
894
  try:
873
895
  u, s, e = self._cat_common(p)
896
+ if not isinstance(u, (bytes, str)):
897
+ # nan/None from parquet
898
+ continue
874
899
  except FileNotFoundError as err:
875
900
  if on_error == "raise":
876
901
  raise
@@ -1060,7 +1085,7 @@ class ReferenceFileSystem(AsyncFileSystem):
1060
1085
  self.dircache = {"": []}
1061
1086
  it = self.references.items()
1062
1087
  for path, part in it:
1063
- if isinstance(part, (bytes, str)):
1088
+ if isinstance(part, (bytes, str)) or hasattr(part, "to_bytes"):
1064
1089
  size = len(part)
1065
1090
  elif len(part) == 1:
1066
1091
  size = None
@@ -1087,10 +1112,33 @@ class ReferenceFileSystem(AsyncFileSystem):
1087
1112
  self.dircache[par].append({"name": path, "type": "file", "size": size})
1088
1113
 
1089
1114
  def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
1090
- data = self.cat_file(path) # load whole chunk into memory
1091
- return io.BytesIO(data)
1115
+ part_or_url, start0, end0 = self._cat_common(path)
1116
+ # This logic is kept outside `ReferenceFile` to avoid unnecessary redirection.
1117
+ # That does mean `_cat_common` gets called twice if it eventually reaches `ReferenceFile`.
1118
+ if isinstance(part_or_url, bytes):
1119
+ return io.BytesIO(part_or_url[start0:end0])
1120
+
1121
+ protocol, _ = split_protocol(part_or_url)
1122
+ if start0 is None and end0 is None:
1123
+ return self.fss[protocol]._open(
1124
+ part_or_url,
1125
+ mode,
1126
+ block_size=block_size,
1127
+ cache_options=cache_options,
1128
+ **kwargs,
1129
+ )
1130
+
1131
+ return ReferenceFile(
1132
+ self,
1133
+ path,
1134
+ mode,
1135
+ block_size=block_size,
1136
+ cache_options=cache_options,
1137
+ **kwargs,
1138
+ )
1092
1139
 
1093
1140
  def ls(self, path, detail=True, **kwargs):
1141
+ logger.debug("list %s", path)
1094
1142
  path = self._strip_protocol(path)
1095
1143
  if isinstance(self.references, LazyReferenceMapper):
1096
1144
  try:
@@ -1173,13 +1221,17 @@ class ReferenceFileSystem(AsyncFileSystem):
1173
1221
  ) # ignores FileNotFound, just as well for directories
1174
1222
  self.dircache.clear() # this is a bit heavy handed
1175
1223
 
1176
- async def _pipe_file(self, path, data):
1224
+ async def _pipe_file(self, path, data, mode="overwrite", **kwargs):
1225
+ if mode == "create" and self.exists(path):
1226
+ raise FileExistsError
1177
1227
  # can be str or bytes
1178
1228
  self.references[path] = data
1179
1229
  self.dircache.clear() # this is a bit heavy handed
1180
1230
 
1181
- async def _put_file(self, lpath, rpath, **kwargs):
1231
+ async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
1182
1232
  # puts binary
1233
+ if mode == "create" and self.exists(rpath):
1234
+ raise FileExistsError
1183
1235
  with open(lpath, "rb") as f:
1184
1236
  self.references[rpath] = f.read()
1185
1237
  self.dircache.clear() # this is a bit heavy handed
@@ -1197,3 +1249,58 @@ class ReferenceFileSystem(AsyncFileSystem):
1197
1249
  out[k] = v
1198
1250
  with fsspec.open(url, "wb", **storage_options) as f:
1199
1251
  f.write(json.dumps({"version": 1, "refs": out}).encode())
1252
+
1253
+
1254
+ class ReferenceFile(AbstractBufferedFile):
1255
+ def __init__(
1256
+ self,
1257
+ fs,
1258
+ path,
1259
+ mode="rb",
1260
+ block_size="default",
1261
+ autocommit=True,
1262
+ cache_type="readahead",
1263
+ cache_options=None,
1264
+ size=None,
1265
+ **kwargs,
1266
+ ):
1267
+ super().__init__(
1268
+ fs,
1269
+ path,
1270
+ mode=mode,
1271
+ block_size=block_size,
1272
+ autocommit=autocommit,
1273
+ size=size,
1274
+ cache_type=cache_type,
1275
+ cache_options=cache_options,
1276
+ **kwargs,
1277
+ )
1278
+ part_or_url, self.start, self.end = self.fs._cat_common(self.path)
1279
+ protocol, _ = split_protocol(part_or_url)
1280
+ self.src_fs = self.fs.fss[protocol]
1281
+ self.src_path = part_or_url
1282
+ self._f = None
1283
+
1284
+ @property
1285
+ def f(self):
1286
+ if self._f is None or self._f.closed:
1287
+ self._f = self.src_fs._open(
1288
+ self.src_path,
1289
+ mode=self.mode,
1290
+ block_size=self.blocksize,
1291
+ autocommit=self.autocommit,
1292
+ cache_type="none",
1293
+ **self.kwargs,
1294
+ )
1295
+ return self._f
1296
+
1297
+ def close(self):
1298
+ if self._f is not None:
1299
+ self._f.close()
1300
+ return super().close()
1301
+
1302
+ def _fetch_range(self, start, end):
1303
+ start = start + self.start
1304
+ end = min(end + self.start, self.end)
1305
+ self.f.seek(start)
1306
+ return self.f.read(end - start)