fsspec 2025.10.0.tar.gz → 2025.12.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/main.yaml +6 -6
  2. {fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/pypipublish.yaml +2 -2
  3. {fsspec-2025.10.0 → fsspec-2025.12.0}/.pre-commit-config.yaml +1 -2
  4. {fsspec-2025.10.0 → fsspec-2025.12.0}/PKG-INFO +4 -4
  5. {fsspec-2025.10.0 → fsspec-2025.12.0}/README.md +1 -1
  6. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/environment.yml +1 -1
  7. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/changelog.rst +20 -0
  8. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/_version.py +2 -2
  9. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/asyn.py +7 -1
  10. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/caching.py +52 -45
  11. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/core.py +20 -3
  12. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/arrow.py +6 -3
  13. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/asyn_wrapper.py +3 -1
  14. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cache_metadata.py +1 -3
  15. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cached.py +2 -1
  16. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/data.py +1 -2
  17. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dirfs.py +2 -1
  18. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http.py +7 -1
  19. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http_sync.py +7 -1
  20. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/libarchive.py +1 -1
  21. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/reference.py +1 -1
  22. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/json.py +7 -12
  23. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/parquet.py +100 -61
  24. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/registry.py +3 -0
  25. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/utils.py +3 -10
  26. {fsspec-2025.10.0 → fsspec-2025.12.0}/pyproject.toml +4 -4
  27. {fsspec-2025.10.0 → fsspec-2025.12.0}/.codespellrc +0 -0
  28. {fsspec-2025.10.0 → fsspec-2025.12.0}/.coveragerc +0 -0
  29. {fsspec-2025.10.0 → fsspec-2025.12.0}/.gitattributes +0 -0
  30. {fsspec-2025.10.0 → fsspec-2025.12.0}/.gitignore +0 -0
  31. {fsspec-2025.10.0 → fsspec-2025.12.0}/LICENSE +0 -0
  32. {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-downstream.yml +0 -0
  33. {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-friends.yml +0 -0
  34. {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-linux.yml +0 -0
  35. {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-win.yml +0 -0
  36. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/Makefile +0 -0
  37. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/README.md +0 -0
  38. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/make.bat +0 -0
  39. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/_static/custom.css +0 -0
  40. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/api.rst +0 -0
  41. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/async.rst +0 -0
  42. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/code-of-conduct.rst +0 -0
  43. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/conf.py +0 -0
  44. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/copying.rst +0 -0
  45. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/developer.rst +0 -0
  46. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/features.rst +0 -0
  47. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/img/gui.png +0 -0
  48. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/index.rst +0 -0
  49. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/intro.rst +0 -0
  50. {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/usage.rst +0 -0
  51. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/__init__.py +0 -0
  52. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/archive.py +0 -0
  53. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/callbacks.py +0 -0
  54. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/compression.py +0 -0
  55. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/config.py +0 -0
  56. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/conftest.py +0 -0
  57. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/dircache.py +0 -0
  58. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/exceptions.py +0 -0
  59. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/fuse.py +0 -0
  60. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/generic.py +0 -0
  61. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/gui.py +0 -0
  62. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/__init__.py +0 -0
  63. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cache_mapper.py +0 -0
  64. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/chained.py +0 -0
  65. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dask.py +0 -0
  66. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dbfs.py +0 -0
  67. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/ftp.py +0 -0
  68. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/gist.py +0 -0
  69. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/git.py +0 -0
  70. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/github.py +0 -0
  71. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/jupyter.py +0 -0
  72. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/local.py +0 -0
  73. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/memory.py +0 -0
  74. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/sftp.py +0 -0
  75. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/smb.py +0 -0
  76. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/tar.py +0 -0
  77. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/webhdfs.py +0 -0
  78. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/zip.py +0 -0
  79. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/mapping.py +0 -0
  80. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/spec.py +0 -0
  81. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/__init__.py +0 -0
  82. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/common.py +0 -0
  83. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/copy.py +0 -0
  84. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/get.py +0 -0
  85. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/mv.py +0 -0
  86. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/open.py +0 -0
  87. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/pipe.py +0 -0
  88. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/put.py +0 -0
  89. {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/transaction.py +0 -0
  90. {fsspec-2025.10.0 → fsspec-2025.12.0}/install_s3fs.sh +0 -0
  91. {fsspec-2025.10.0 → fsspec-2025.12.0}/readthedocs.yml +0 -0
  92. {fsspec-2025.10.0 → fsspec-2025.12.0}/setup.cfg +0 -0
{fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/main.yaml

@@ -14,18 +14,18 @@ jobs:
       fail-fast: false
       matrix:
         PY:
-          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
           - "3.13"
+          - "3.14"

     env:
       CIRUN: true

     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0

@@ -50,7 +50,7 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0

@@ -81,7 +81,7 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0

@@ -124,7 +124,7 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5

       - name: Setup conda
        uses: conda-incubator/setup-miniconda@v3
@@ -145,5 +145,5 @@ jobs:
        shell: bash -l {0}
        run: |
          cd ${{ matrix.FRIEND }}
-         pytest -v
+         pytest -v -W ignore::pytest.PytestRemovedIn9Warning
          cd ..
{fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/pypipublish.yaml

@@ -8,9 +8,9 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v6
        with:
          python-version: "3.x"
      - name: Install dependencies
{fsspec-2025.10.0 → fsspec-2025.12.0}/.pre-commit-config.yaml

@@ -13,9 +13,8 @@ repos:
       - id: check-json
       - id: check-yaml
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+    rev: v0.14.3
     hooks:
-      # Run the linter.
       - id: ruff-check
         args: [ --fix, "--show-fixes"]
       - id: ruff-format
{fsspec-2025.10.0 → fsspec-2025.12.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fsspec
-Version: 2025.10.0
+Version: 2025.12.0
 Summary: File-system specification
 Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
 Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
@@ -12,12 +12,12 @@ Keywords: file
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.9
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.10
 Provides-Extra: abfs
 Requires-Dist: adlfs; extra == 'abfs'
 Provides-Extra: adl
@@ -197,7 +197,7 @@ CI runtime. For local use, pick a version suitable for you.

 ```bash
 # For a new environment (mamba / conda).
-mamba create -n fsspec -c conda-forge python=3.9 -y
+mamba create -n fsspec -c conda-forge python=3.10 -y
 conda activate fsspec

 # Standard dev install with docs and tests.
{fsspec-2025.10.0 → fsspec-2025.12.0}/README.md

@@ -47,7 +47,7 @@ CI runtime. For local use, pick a version suitable for you.

 ```bash
 # For a new environment (mamba / conda).
-mamba create -n fsspec -c conda-forge python=3.9 -y
+mamba create -n fsspec -c conda-forge python=3.10 -y
 conda activate fsspec

 # Standard dev install with docs and tests.
{fsspec-2025.10.0 → fsspec-2025.12.0}/docs/environment.yml

@@ -2,4 +2,4 @@ name: fsspec
 channels:
   - defaults
 dependencies:
-  - python=3.9
+  - python=3.10
{fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/changelog.rst

@@ -1,6 +1,26 @@
 Changelog
 =========

+2025.12.0
+---------
+
+Enhancements
+
+- fsspec.parquet to support filters and multiple files (#1945)
+
+Fixes
+
+- passing withdirs in async _glob() (#1953)
+- fix _rm_file/_rm redirection in async (#1951)
+- allow ArrowFile to be seekable (#1950)
+- add size attribute to ArrowFile (#1944)
+
+
+Other
+
+- support py3.14 and drop 3.9 (#1946)
+- avoid ruff warning (#1942)
+
 2025.10.0
 ---------

{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/_version.py

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '2025.10.0'
-__version_tuple__ = version_tuple = (2025, 10, 0)
+__version__ = version = '2025.12.0'
+__version_tuple__ = version_tuple = (2025, 12, 0)

 __commit_id__ = commit_id = None
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/asyn.py

@@ -328,6 +328,11 @@ class AsyncFileSystem(AbstractFileSystem):
         return self._loop

     async def _rm_file(self, path, **kwargs):
+        if (
+            inspect.iscoroutinefunction(self._rm)
+            and type(self)._rm is not AsyncFileSystem._rm
+        ):
+            return await self._rm(path, recursive=False, batch_size=1, **kwargs)
         raise NotImplementedError

     async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
@@ -776,6 +781,7 @@ class AsyncFileSystem(AbstractFileSystem):
         min_idx = min(idx_star, idx_qmark, idx_brace)

         detail = kwargs.pop("detail", False)
+        withdirs = kwargs.pop("withdirs", True)

         if not has_magic(path):
             if await self._exists(path, **kwargs):
@@ -805,7 +811,7 @@
             depth = None

         allpaths = await self._find(
-            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+            root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
         )

         pattern = glob_translate(path + ("/" if ends_with_sep else ""))
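Usage note: the `_rm_file` fallback above means an async implementation that only overrides `_rm` now handles single-file deletes instead of raising. A minimal sketch — the `OnlyRm` class is hypothetical, purely for illustration:

```python
import asyncio

from fsspec.asyn import AsyncFileSystem


class OnlyRm(AsyncFileSystem):
    """Hypothetical filesystem overriding _rm but not _rm_file."""

    async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
        print(f"removing {path}")


async def main():
    fs = OnlyRm(asynchronous=True)
    # 2025.10.0 raised NotImplementedError here; now it forwards to _rm
    await fs._rm_file("some/key")


asyncio.run(main())
```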
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/caching.py

@@ -6,20 +6,12 @@ import logging
 import math
 import os
 import threading
-import warnings
 from collections import OrderedDict
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from itertools import groupby
 from operator import itemgetter
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ClassVar,
-    Generic,
-    NamedTuple,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar

 if TYPE_CHECKING:
     import mmap
@@ -629,7 +621,7 @@ class KnownPartsOfAFile(BaseCache):
         fetcher: Fetcher,
         size: int,
         data: dict[tuple[int, int], bytes] | None = None,
-        strict: bool = True,
+        strict: bool = False,
         **_: Any,
     ):
         super().__init__(blocksize, fetcher, size)
@@ -653,50 +645,65 @@ class KnownPartsOfAFile(BaseCache):
         else:
             self.data = {}

+    @property
+    def size(self):
+        return sum(_[1] - _[0] for _ in self.data)
+
+    @size.setter
+    def size(self, value):
+        pass
+
+    @property
+    def nblocks(self):
+        return len(self.data)
+
+    @nblocks.setter
+    def nblocks(self, value):
+        pass
+
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
             start = 0
         if stop is None:
             stop = self.size
+        self.total_requested_bytes += stop - start

         out = b""
-        for (loc0, loc1), data in self.data.items():
-            # If self.strict=False, use zero-padded data
-            # for reads beyond the end of a "known" buffer
+        started = False
+        loc_old = 0
+        for loc0, loc1 in sorted(self.data):
+            if (loc0 <= start < loc1) and (loc0 <= stop <= loc1):
+                # entirely within the block
+                off = start - loc0
+                self.hit_count += 1
+                return self.data[(loc0, loc1)][off : off + stop - start]
+            if stop <= loc0:
+                break
+            if started and loc0 > loc_old:
+                # a gap where we need data
+                self.miss_count += 1
+                if self.strict:
+                    raise ValueError
+                out += b"\x00" * (loc0 - loc_old)
             if loc0 <= start < loc1:
+                # found the start
+                self.hit_count += 1
                 off = start - loc0
-                out = data[off : off + stop - start]
-                if not self.strict or loc0 <= stop <= loc1:
-                    # The request is within a known range, or
-                    # it begins within a known range, and we
-                    # are allowed to pad reads beyond the
-                    # buffer with zero
-                    out += b"\x00" * (stop - start - len(out))
-                    self.hit_count += 1
-                    return out
-                else:
-                    # The request ends outside a known range,
-                    # and we are being "strict" about reads
-                    # beyond the buffer
-                    start = loc1
-                    break
-
-        # We only get here if there is a request outside the
-        # known parts of the file. In an ideal world, this
-        # should never happen
-        if self.fetcher is None:
-            # We cannot fetch the data, so raise an error
-            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
-        # We can fetch the data, but should warn the user
-        # that this may be slow
-        warnings.warn(
-            f"Read is outside the known file parts: {(start, stop)}. "
-            f"IO/caching performance may be poor!"
-        )
-        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
-        self.total_requested_bytes += stop - start
+                out = self.data[(loc0, loc1)][off : off + stop - start]
+                started = True
+            elif start < loc0 and stop > loc1:
+                # the whole block
+                self.hit_count += 1
+                out += self.data[(loc0, loc1)]
+            elif loc0 <= stop <= loc1:
+                # end block
+                self.hit_count += 1
+                return out + self.data[(loc0, loc1)][: stop - loc0]
+            loc_old = loc1
         self.miss_count += 1
-        return out + super()._fetch(start, stop)
+        if started and not self.strict:
+            return out + b"\x00" * (stop - loc_old)
+        raise ValueError


 class UpdatableLRU(Generic[P, T]):
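Behaviour sketch for the rewritten `_fetch`: blocks are walked in offset order, full hits are returned directly, and with the new default `strict=False` any gap or tail beyond the known parts is zero-filled rather than fetched (there is no fallback fetch any more; with `strict=True` a gap raises `ValueError`). A hand-made example:

```python
from fsspec.caching import KnownPartsOfAFile

# Two known blocks with a hole between bytes 4 and 8; no fetcher needed
cache = KnownPartsOfAFile(
    blocksize=0,
    fetcher=None,
    size=None,
    data={(0, 4): b"abcd", (8, 12): b"wxyz"},
)

print(cache._fetch(0, 4))   # b'abcd' -- read entirely inside one block
print(cache._fetch(2, 10))  # b'cd\x00\x00\x00\x00wx' -- gap zero-filled
```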
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/core.py

@@ -18,7 +18,7 @@ from fsspec.caching import (  # noqa: F401
 )
 from fsspec.compression import compr
 from fsspec.config import conf
-from fsspec.registry import filesystem, get_filesystem_class
+from fsspec.registry import available_protocols, filesystem, get_filesystem_class
 from fsspec.utils import (
     _unstrip_protocol,
     build_name_function,
@@ -334,34 +334,51 @@ def _un_chain(path, kwargs):

     if "::" in path:
         x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+        known_protocols = set(available_protocols())
         bits = []
+
+        # split on '::', then ensure each bit has a protocol
         for p in path.split("::"):
-            if "://" in p or x.match(p):
+            if p in known_protocols:
+                bits.append(p + "://")
+            elif "://" in p or x.match(p):
                 bits.append(p)
             else:
                 bits.append(p + "://")
     else:
         bits = [path]
+
     # [[url, protocol, kwargs], ...]
     out = []
     previous_bit = None
     kwargs = kwargs.copy()
+
     for bit in reversed(bits):
         protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
         cls = get_filesystem_class(protocol)
         extra_kwargs = cls._get_kwargs_from_urls(bit)
         kws = kwargs.pop(protocol, {})
+
         if bit is bits[0]:
             kws.update(kwargs)
+
         kw = dict(
             **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
             **kws,
         )
         bit = cls._strip_protocol(bit)
-        if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):
+
+        if (
+            "target_protocol" not in kw
+            and issubclass(cls, ChainedFileSystem)
+            and not bit
+        ):
+            # replace bit if we are chaining and no path given
             bit = previous_bit
+
         out.append((bit, protocol, kw))
         previous_bit = bit
+
     out.reverse()
     return out

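As the reworked loop shows, each `::`-separated segment is now checked against the registry first, so a segment that is exactly a registered protocol name — including names like `async_wrapper` whose underscore failed the old all-lowercase regex heuristic — is expanded to `name://`. A small sketch of the chained-URL syntax this parses, using only built-in filesystems:

```python
import fsspec

# Write something via the in-memory filesystem
with fsspec.open("memory://demo.txt", "wb") as f:
    f.write(b"hello")

# "simplecache" is a bare segment; _un_chain looks it up among the
# registered protocols and expands it to "simplecache://" before chaining
with fsspec.open("simplecache::memory://demo.txt", "rb") as f:
    print(f.read())  # b'hello'
```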
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/arrow.py

@@ -205,11 +205,11 @@ class ArrowFSWrapper(AbstractFileSystem):
         return self.fs.get_file_info(path).mtime

     def cat_file(self, path, start=None, end=None, **kwargs):
-        kwargs["seekable"] = start not in [None, 0]
+        kwargs.setdefault("seekable", start not in [None, 0])
         return super().cat_file(path, start=None, end=None, **kwargs)

     def get_file(self, rpath, lpath, **kwargs):
-        kwargs["seekable"] = False
+        kwargs.setdefault("seekable", False)
         super().get_file(rpath, lpath, **kwargs)


@@ -223,7 +223,6 @@ class ArrowFSWrapper(AbstractFileSystem):
             "readable",
             "writable",
             "close",
-            "size",
             "seekable",
         ],
     )
@@ -241,6 +240,10 @@ class ArrowFile(io.IOBase):
     def __enter__(self):
         return self

+    @property
+    def size(self):
+        return self.stream.size()
+
     def __exit__(self, *args):
         return self.close()
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/asyn_wrapper.py

@@ -5,6 +5,8 @@ import inspect
 import fsspec
 from fsspec.asyn import AsyncFileSystem, running_async

+from .chained import ChainedFileSystem
+

 def async_wrapper(func, obj=None, semaphore=None):
     """
@@ -35,7 +37,7 @@ def async_wrapper(func, obj=None, semaphore=None):
     return wrapper


-class AsyncFileSystemWrapper(AsyncFileSystem):
+class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
     """
     A wrapper class to convert a synchronous filesystem into an asynchronous one.

{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cache_metadata.py

@@ -15,9 +15,7 @@

 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from typing import Any, Literal
-
-    from typing_extensions import TypeAlias
+    from typing import Any, Literal, TypeAlias

     from .cached import CachingFileSystem
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cached.py

@@ -6,8 +6,9 @@ import os
 import tempfile
 import time
 import weakref
+from collections.abc import Callable
 from shutil import rmtree
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar

 from fsspec import filesystem
 from fsspec.callbacks import DEFAULT_CALLBACK
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/data.py

@@ -1,6 +1,5 @@
 import base64
 import io
-from typing import Optional
 from urllib.parse import unquote

 from fsspec import AbstractFileSystem
@@ -50,7 +49,7 @@ class DataFileSystem(AbstractFileSystem):
         return io.BytesIO(self.cat_file(path))

     @staticmethod
-    def encode(data: bytes, mime: Optional[str] = None):
+    def encode(data: bytes, mime: str | None = None):
         """Format the given data into data-URL syntax

         This version always base64 encodes, even when the data is ascii/url-safe.
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dirfs.py

@@ -1,8 +1,9 @@
 from .. import filesystem
 from ..asyn import AsyncFileSystem
+from .chained import ChainedFileSystem


-class DirFileSystem(AsyncFileSystem):
+class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
     """Directory prefix filesystem

     The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http.py

@@ -327,7 +327,7 @@ class HTTPFileSystem(AsyncFileSystem):
         async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
             self._raise_not_found_for_status(resp, rpath)

-    async def _exists(self, path, **kwargs):
+    async def _exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
@@ -335,8 +335,14 @@
             session = await self.set_session()
             r = await session.get(self.encode_url(path), **kw)
             async with r:
+                if strict:
+                    self._raise_not_found_for_status(r, path)
                 return r.status < 400
+        except FileNotFoundError:
+            return False
         except aiohttp.ClientError:
+            if strict:
+                raise
             return False

     async def _isfile(self, path, **kwargs):
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http_sync.py

@@ -463,14 +463,20 @@ class HTTPFileSystem(AbstractFileSystem):
             end -= 1  # bytes range is inclusive
         return f"bytes={start}-{end}"

-    def exists(self, path, **kwargs):
+    def exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
             logger.debug(path)
             r = self.session.get(self.encode_url(path), **kw)
+            if strict:
+                self._raise_not_found_for_status(r, path)
             return r.status_code < 400
+        except FileNotFoundError:
+            return False
         except Exception:
+            if strict:
+                raise
             return False

     def isfile(self, path, **kwargs):
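Both the async (`http.py`) and sync (`http_sync.py`) variants gain the same `strict` flag: by default any client error is folded into `False`, while `strict=True` re-raises it, so callers can tell "absent" (still `False`, via the caught `FileNotFoundError`) from "unreachable". A sketch, assuming network access and the `aiohttp` dependency; the URLs are illustrative:

```python
import fsspec

fs = fsspec.filesystem("http")

print(fs.exists("https://example.com/"))         # True if reachable
print(fs.exists("https://example.com/missing"))  # False (404)

# strict=True: a 404 is still False, but DNS failures, timeouts and other
# client errors now propagate instead of silently becoming False
fs.exists("https://no-such-host.invalid/", strict=True)
```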
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/libarchive.py

@@ -195,7 +195,7 @@ class LibArchiveFileSystem(AbstractArchiveFileSystem):
         if mode != "rb":
             raise NotImplementedError

-        data = bytes()
+        data = b""
         with self._open_archive() as arc:
             for entry in arc:
                 if entry.pathname != path:
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/reference.py

@@ -219,7 +219,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)

-    @lru_cache()
+    @lru_cache
     def listdir(self):
         """List top-level directories"""
         dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/json.py

@@ -1,13 +1,8 @@
 import json
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from contextlib import suppress
 from pathlib import PurePath
-from typing import (
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-)
+from typing import Any, ClassVar

 from .registry import _import_class, get_filesystem_class
 from .spec import AbstractFileSystem
@@ -45,12 +40,12 @@ class FilesystemJSONDecoder(json.JSONDecoder):
     def __init__(
         self,
         *,
-        object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
-        parse_float: Optional[Callable[[str], Any]] = None,
-        parse_int: Optional[Callable[[str], Any]] = None,
-        parse_constant: Optional[Callable[[str], Any]] = None,
+        object_hook: Callable[[dict[str, Any]], Any] | None = None,
+        parse_float: Callable[[str], Any] | None = None,
+        parse_int: Callable[[str], Any] | None = None,
+        parse_constant: Callable[[str], Any] | None = None,
         strict: bool = True,
-        object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
+        object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None,
     ) -> None:
         self.original_object_hook = object_hook

{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/parquet.py

@@ -1,8 +1,12 @@
 import io
 import json
 import warnings
+from typing import Literal
+
+import fsspec

 from .core import url_to_fs
+from .spec import AbstractBufferedFile
 from .utils import merge_offset_ranges

 # Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
 # on remote file systems.


-def open_parquet_file(
-    path,
-    mode="rb",
-    fs=None,
+class AlreadyBufferedFile(AbstractBufferedFile):
+    def _fetch_range(self, start, end):
+        raise NotImplementedError
+
+
+def open_parquet_files(
+    path: list[str],
+    mode: Literal["rb"] = "rb",
+    fs: None | fsspec.AbstractFileSystem = None,
     metadata=None,
-    columns=None,
-    row_groups=None,
-    storage_options=None,
-    strict=False,
-    engine="auto",
-    max_gap=64_000,
-    max_block=256_000_000,
-    footer_sample_size=1_000_000,
+    columns: None | list[str] = None,
+    row_groups: None | list[int] = None,
+    storage_options: None | dict = None,
+    engine: str = "auto",
+    max_gap: int = 64_000,
+    max_block: int = 256_000_000,
+    footer_sample_size: int = 1_000_000,
+    filters: None | list[list[list[str]]] = None,
     **kwargs,
 ):
     """
@@ -72,12 +81,6 @@
     storage_options : dict, optional
         Used to generate an `AbstractFileSystem` object if `fs` was
         not specified.
-    strict : bool, optional
-        Whether the resulting `KnownPartsOfAFile` cache should
-        fetch reads that go beyond a known byte-range boundary.
-        If `False` (the default), any read that ends outside a
-        known part will be zero padded. Note that using
-        `strict=True` may be useful for debugging.
     max_gap : int, optional
         Neighboring byte ranges will only be merged when their
         inter-range gap is <= `max_gap`. Default is 64KB.
@@ -89,6 +92,10 @@
         for the footer metadata. If the sampled bytes do not contain
         the footer, a second read request will be required, and
         performance will suffer. Default is 1MB.
+    filters : list[list], optional
+        List of filters to apply to prevent reading row groups, of the
+        same format as accepted by the loading engines. Ignored if
+        ``row_groups`` is specified.
     **kwargs :
         Optional key-word arguments to pass to `fs.open`
     """
@@ -96,20 +103,36 @@
     # Make sure we have an `AbstractFileSystem` object
     # to work with
     if fs is None:
-        fs = url_to_fs(path, **(storage_options or {}))[0]
+        path0 = path
+        if isinstance(path, (list, tuple)):
+            path = path[0]
+        fs, path = url_to_fs(path, **(storage_options or {}))
+    else:
+        path0 = path

-    # For now, `columns == []` not supported. Just use
-    # default `open` command with `path` input
+    # For now, `columns == []` not supported, is the same
+    # as all columns
     if columns is not None and len(columns) == 0:
-        return fs.open(path, mode=mode)
+        columns = None

     # Set the engine
     engine = _set_engine(engine)

-    # Fetch the known byte ranges needed to read
-    # `columns` and/or `row_groups`
+    if isinstance(path0, (list, tuple)):
+        paths = path0
+    elif "*" in path:
+        paths = fs.glob(path)
+    elif path0.endswith("/"):  # or fs.isdir(path):
+        paths = [
+            _
+            for _ in fs.find(path, withdirs=False, detail=False)
+            if _.endswith((".parquet", ".parq"))
+        ]
+    else:
+        paths = [path]
+
     data = _get_parquet_byte_ranges(
-        [path],
+        paths,
         fs,
         metadata=metadata,
         columns=columns,
@@ -118,24 +141,37 @@
         max_gap=max_gap,
         max_block=max_block,
         footer_sample_size=footer_sample_size,
+        filters=filters,
     )

-    # Extract file name from `data`
-    fn = next(iter(data)) if data else path
-
     # Call self.open with "parts" caching
     options = kwargs.pop("cache_options", {}).copy()
-    return fs.open(
-        fn,
-        mode=mode,
-        cache_type="parts",
-        cache_options={
-            **options,
-            "data": data.get(fn, {}),
-            "strict": strict,
-        },
-        **kwargs,
-    )
+    return [
+        AlreadyBufferedFile(
+            fs=None,
+            path=fn,
+            mode=mode,
+            cache_type="parts",
+            cache_options={
+                **options,
+                "data": data.get(fn, {}),
+            },
+            size=max(_[1] for _ in data.get(fn, {})),
+            **kwargs,
+        )
+        for fn in data
+    ]
+
+
+def open_parquet_file(*args, **kwargs):
+    """Create files tailored to reading specific parts of parquet files
+
+    Please see ``open_parquet_files`` for details of the arguments. The
+    difference is, this function always returns a single ``AlreadyBufferedFile``,
+    whereas ``open_parquet_files`` always returns a list of files, even if
+    only one or zero parquet files match.
+    """
+    return open_parquet_files(*args, **kwargs)[0]


 def _get_parquet_byte_ranges(
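Taken together: `open_parquet_files` plans the byte ranges needed for a column/row-group/filter selection over many files, fetches them up front, and returns `AlreadyBufferedFile` objects whose "parts" caches never trigger further remote reads (`_fetch_range` raises). A hedged usage sketch — the dataset URL is hypothetical, and remote access needs the matching backend (e.g. `s3fs`) plus `fastparquet` for filter support:

```python
import fsspec.parquet

files = fsspec.parquet.open_parquet_files(
    "s3://bucket/dataset/",     # trailing "/" -> find *.parquet files below it
    columns=["a", "b"],
    filters=[[("a", ">", 0)]],  # engine-style filters; ignored if row_groups given
    engine="fastparquet",
)
for f in files:
    ...  # hand each pre-buffered file to the parquet engine of choice
```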
@@ -148,6 +184,7 @@
     max_block=256_000_000,
     footer_sample_size=1_000_000,
     engine="auto",
+    filters=None,
 ):
     """Get a dictionary of the known byte ranges needed
     to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@
             row_groups=row_groups,
             max_gap=max_gap,
             max_block=max_block,
+            filters=filters,
         )

     # Get file sizes asynchronously
@@ -183,17 +221,16 @@
     data_starts = []
     data_ends = []
     add_header_magic = True
-    if columns is None and row_groups is None:
+    if columns is None and row_groups is None and filters is None:
         # We are NOT selecting specific columns or row-groups.
         #
         # We can avoid sampling the footers, and just transfer
         # all file data with cat_ranges
         for i, path in enumerate(paths):
             result[path] = {}
-            for b in range(0, file_sizes[i], max_block):
-                data_paths.append(path)
-                data_starts.append(b)
-                data_ends.append(min(b + max_block, file_sizes[i]))
+            data_paths.append(path)
+            data_starts.append(0)
+            data_ends.append(file_sizes[i])
         add_header_magic = False  # "Magic" should already be included
     else:
         # We ARE selecting specific columns or row-groups.
@@ -235,29 +272,21 @@

     # Calculate required byte ranges for each path
     for i, path in enumerate(paths):
-        # Deal with small-file case.
-        # Just include all remaining bytes of the file
-        # in a single range.
-        if file_sizes[i] < max_block:
-            if footer_starts[i] > 0:
-                # Only need to transfer the data if the
-                # footer sample isn't already the whole file
-                data_paths.append(path)
-                data_starts.append(0)
-                data_ends.append(footer_starts[i])
-            continue
-
         # Use "engine" to collect data byte ranges
         path_data_starts, path_data_ends = engine._parquet_byte_ranges(
             columns,
             row_groups=row_groups,
             footer=footer_samples[i],
             footer_start=footer_starts[i],
+            filters=filters,
         )

         data_paths += [path] * len(path_data_starts)
         data_starts += path_data_starts
         data_ends += path_data_ends
+        result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
+            footer_samples[i]
+        )

     # Merge adjacent offset ranges
     data_paths, data_starts, data_ends = merge_offset_ranges(
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
     row_groups=None,
     max_gap=64_000,
     max_block=256_000_000,
+    filters=None,
 ):
     """Simplified version of `_get_parquet_byte_ranges` for
     the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@

     # Use "engine" to collect data byte ranges
     data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
-        columns,
-        row_groups=row_groups,
-        metadata=metadata,
+        columns, row_groups=row_groups, metadata=metadata, filters=filters
     )

     # Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         # Initialize offset ranges and define ParquetFile metadata
         pf = metadata
         data_paths, data_starts, data_ends = [], [], []
+        if filters and row_groups:
+            raise ValueError("filters and row_groups cannot be used together")
         if pf is None:
             pf = self.fp.ParquetFile(io.BytesIO(footer))

         # Convert columns to a set and add any index columns
         # specified in the pandas metadata (just in case)
-        column_set = None if columns is None else set(columns)
+        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
         if column_set is not None and hasattr(pf, "pandas_metadata"):
             md_index = [
                 ind
@@ -422,7 +453,12 @@

         # Check if row_groups is a list of integers
         # or a list of row-group metadata
-        if row_groups and not isinstance(row_groups[0], int):
+        if filters:
+            from fastparquet.api import filter_row_groups
+
+            row_group_indices = None
+            row_groups = filter_row_groups(pf, filters)
+        elif row_groups and not isinstance(row_groups[0], int):
             # Input row_groups contains row-group metadata
             row_group_indices = None
         else:
@@ -486,9 +522,12 @@ class PyarrowEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         if metadata is not None:
             raise ValueError("metadata input not supported for PyarrowEngine")
+        if filters:
+            raise NotImplementedError

         data_starts, data_ends = [], []
         md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/registry.py

@@ -72,6 +72,9 @@ known_implementations = {
         "class": "fsspec.implementations.arrow.HadoopFileSystem",
         "err": "pyarrow and local java libraries required for HDFS",
     },
+    "async_wrapper": {
+        "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
+    },
     "asynclocal": {
        "class": "morefs.asyn_local.AsyncLocalFileSystem",
        "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
{fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/utils.py

@@ -7,23 +7,16 @@ import os
 import re
 import sys
 import tempfile
-from collections.abc import Iterable, Iterator, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
 from functools import partial
 from hashlib import md5
 from importlib.metadata import version
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, TypeVar
 from urllib.parse import urlsplit

 if TYPE_CHECKING:
     import pathlib
-
-    from typing_extensions import TypeGuard
+    from typing import TypeGuard

 from fsspec.spec import AbstractFileSystem

{fsspec-2025.10.0 → fsspec-2025.12.0}/pyproject.toml

@@ -9,18 +9,18 @@ description = "File-system specification"
 readme = "README.md"
 license = "BSD-3-Clause"
 license-files = ["LICENSE"]
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 maintainers = [{ name = "Martin Durant", email = "mdurant@anaconda.com" }]
 keywords = ["file"]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Operating System :: OS Independent",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]

 [project.optional-dependencies]
@@ -194,6 +194,8 @@ ignore = [
     "B026",
     # No explicit `stacklevel` keyword argument found
     "B028",
+    # `zip` without explicit `strict` keyword
+    "B905",
     # Assigning lambda expression
     "E731",
     # Ambiguous variable names
@@ -220,8 +222,6 @@ ignore = [
     "SIM115",
     "SIM117",
     "TC003",
-    # https://github.com/astral-sh/ruff/issues/7871
-    "UP038",
     # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
     "W191",
     "E111",
File without changes