fsspec 2025.10.0.tar.gz → 2025.12.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/main.yaml +6 -6
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.github/workflows/pypipublish.yaml +2 -2
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.pre-commit-config.yaml +1 -2
- {fsspec-2025.10.0 → fsspec-2025.12.0}/PKG-INFO +4 -4
- {fsspec-2025.10.0 → fsspec-2025.12.0}/README.md +1 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/environment.yml +1 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/changelog.rst +20 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/_version.py +2 -2
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/asyn.py +7 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/caching.py +52 -45
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/core.py +20 -3
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/arrow.py +6 -3
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/asyn_wrapper.py +3 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cache_metadata.py +1 -3
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cached.py +2 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/data.py +1 -2
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dirfs.py +2 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http.py +7 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/http_sync.py +7 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/libarchive.py +1 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/reference.py +1 -1
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/json.py +7 -12
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/parquet.py +100 -61
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/registry.py +3 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/utils.py +3 -10
- {fsspec-2025.10.0 → fsspec-2025.12.0}/pyproject.toml +4 -4
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.codespellrc +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.coveragerc +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.gitattributes +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/.gitignore +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/LICENSE +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-downstream.yml +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-friends.yml +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-linux.yml +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/ci/environment-win.yml +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/Makefile +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/README.md +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/make.bat +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/_static/custom.css +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/api.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/async.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/code-of-conduct.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/conf.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/copying.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/developer.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/features.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/img/gui.png +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/index.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/intro.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/docs/source/usage.rst +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/__init__.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/archive.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/callbacks.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/compression.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/config.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/conftest.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/dircache.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/exceptions.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/fuse.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/generic.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/gui.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/__init__.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/cache_mapper.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/chained.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dask.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/dbfs.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/ftp.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/gist.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/git.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/github.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/jupyter.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/local.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/memory.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/sftp.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/smb.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/tar.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/webhdfs.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/implementations/zip.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/mapping.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/spec.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/__init__.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/common.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/copy.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/get.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/mv.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/open.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/pipe.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/tests/abstract/put.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/fsspec/transaction.py +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/install_s3fs.sh +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/readthedocs.yml +0 -0
- {fsspec-2025.10.0 → fsspec-2025.12.0}/setup.cfg +0 -0
--- fsspec-2025.10.0/.github/workflows/main.yaml
+++ fsspec-2025.12.0/.github/workflows/main.yaml
@@ -14,18 +14,18 @@ jobs:
       fail-fast: false
       matrix:
         PY:
-          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
           - "3.13"
+          - "3.14"
 
     env:
       CIRUN: true
 
     steps:
       - name: Checkout
-        uses: actions/checkout@
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0
 
@@ -50,7 +50,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@
+        uses: actions/checkout@v5
         with:
           fetch-depth: 0
 
@@ -81,7 +81,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@
+        uses: actions/checkout@v5
         with:
          fetch-depth: 0
 
@@ -124,7 +124,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@
+        uses: actions/checkout@v5
 
       - name: Setup conda
        uses: conda-incubator/setup-miniconda@v3
@@ -145,5 +145,5 @@ jobs:
        shell: bash -l {0}
        run: |
          cd ${{ matrix.FRIEND }}
-          pytest -v
+          pytest -v -W ignore::pytest.PytestRemovedIn9Warning
          cd ..
--- fsspec-2025.10.0/.github/workflows/pypipublish.yaml
+++ fsspec-2025.12.0/.github/workflows/pypipublish.yaml
@@ -8,9 +8,9 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
       - name: Set up Python
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
        with:
          python-version: "3.x"
      - name: Install dependencies
--- fsspec-2025.10.0/PKG-INFO
+++ fsspec-2025.12.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fsspec
-Version: 2025.10.0
+Version: 2025.12.0
 Summary: File-system specification
 Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
 Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
@@ -12,12 +12,12 @@ Keywords: file
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.9
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.10
 Provides-Extra: abfs
 Requires-Dist: adlfs; extra == 'abfs'
 Provides-Extra: adl
@@ -197,7 +197,7 @@ CI runtime. For local use, pick a version suitable for you.
 
 ```bash
 # For a new environment (mamba / conda).
-mamba create -n fsspec -c conda-forge python=3.9 -y
+mamba create -n fsspec -c conda-forge python=3.10 -y
 conda activate fsspec
 
 # Standard dev install with docs and tests.
--- fsspec-2025.10.0/README.md
+++ fsspec-2025.12.0/README.md
@@ -47,7 +47,7 @@ CI runtime. For local use, pick a version suitable for you.
 
 ```bash
 # For a new environment (mamba / conda).
-mamba create -n fsspec -c conda-forge python=3.9 -y
+mamba create -n fsspec -c conda-forge python=3.10 -y
 conda activate fsspec
 
 # Standard dev install with docs and tests.
--- fsspec-2025.10.0/docs/source/changelog.rst
+++ fsspec-2025.12.0/docs/source/changelog.rst
@@ -1,6 +1,26 @@
 Changelog
 =========
 
+2025.12.0
+---------
+
+Enhancements
+
+- fsspec.parquet to support filters and multiple files (#1945)
+
+Fixes
+
+- passing withdirs in aync _glob() (#1953)
+- fix _rm_file/_rm redirection in async (#1951)
+- allow arrowFile to be seekable (#1950)
+- add size attribute to arrowFile (#1944)
+
+
+Other
+
+- support py3.14 and drop 3.9 (#1946)
+- avoid ruff warning (#1942)
+
 2025.10.0
 ---------
 
--- fsspec-2025.10.0/fsspec/_version.py
+++ fsspec-2025.12.0/fsspec/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '2025.10.0'
-__version_tuple__ = version_tuple = (2025, 10, 0)
+__version__ = version = '2025.12.0'
+__version_tuple__ = version_tuple = (2025, 12, 0)
 
 __commit_id__ = commit_id = None
--- fsspec-2025.10.0/fsspec/asyn.py
+++ fsspec-2025.12.0/fsspec/asyn.py
@@ -328,6 +328,11 @@ class AsyncFileSystem(AbstractFileSystem):
         return self._loop
 
     async def _rm_file(self, path, **kwargs):
+        if (
+            inspect.iscoroutinefunction(self._rm)
+            and type(self)._rm is not AsyncFileSystem._rm
+        ):
+            return await self._rm(path, recursive=False, batch_size=1, **kwargs)
         raise NotImplementedError
 
     async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
@@ -776,6 +781,7 @@ class AsyncFileSystem(AbstractFileSystem):
         min_idx = min(idx_star, idx_qmark, idx_brace)
 
         detail = kwargs.pop("detail", False)
+        withdirs = kwargs.pop("withdirs", True)
 
         if not has_magic(path):
             if await self._exists(path, **kwargs):
@@ -805,7 +811,7 @@ class AsyncFileSystem(AbstractFileSystem):
             depth = None
 
         allpaths = await self._find(
-            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+            root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
         )
 
         pattern = glob_translate(path + ("/" if ends_with_sep else ""))
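The net effect of the two `_glob` lines is that `withdirs` now travels from the caller down to `_find()`, where it used to be hard-coded to `True`. A minimal sketch, using `AsyncFileSystemWrapper` (also touched in this release) to route a plain memory filesystem through the async `_glob` path:

```python
import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

# Wrap the synchronous in-memory filesystem so that the patched
# AsyncFileSystem._glob() is the code path actually exercised.
mem = fsspec.filesystem("memory")
mem.pipe("/data/a/part-0.txt", b"x")
mem.pipe("/data/b/part-1.txt", b"y")

afs = AsyncFileSystemWrapper(mem)
# withdirs=False drops directory entries such as /data/a from the results;
# withdirs=True reproduces the previous hard-coded behaviour.
print(afs.glob("/data/**", withdirs=False))
print(afs.glob("/data/**", withdirs=True))
```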
--- fsspec-2025.10.0/fsspec/caching.py
+++ fsspec-2025.12.0/fsspec/caching.py
@@ -6,20 +6,12 @@ import logging
 import math
 import os
 import threading
-import warnings
 from collections import OrderedDict
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from itertools import groupby
 from operator import itemgetter
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ClassVar,
-    Generic,
-    NamedTuple,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar
 
 if TYPE_CHECKING:
     import mmap
@@ -629,7 +621,7 @@ class KnownPartsOfAFile(BaseCache):
         fetcher: Fetcher,
         size: int,
         data: dict[tuple[int, int], bytes] | None = None,
-        strict: bool = True,
+        strict: bool = False,
         **_: Any,
     ):
         super().__init__(blocksize, fetcher, size)
@@ -653,50 +645,65 @@ class KnownPartsOfAFile(BaseCache):
         else:
             self.data = {}
 
+    @property
+    def size(self):
+        return sum(_[1] - _[0] for _ in self.data)
+
+    @size.setter
+    def size(self, value):
+        pass
+
+    @property
+    def nblocks(self):
+        return len(self.data)
+
+    @nblocks.setter
+    def nblocks(self, value):
+        pass
+
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
             start = 0
         if stop is None:
             stop = self.size
+        self.total_requested_bytes += stop - start
 
         out = b""
-        for (loc0, loc1), data in self.data.items():
-            # If self.strict=False, use zero-padded data
-            # for reads beyond the end of a "known" buffer
+        started = False
+        loc_old = 0
+        for loc0, loc1 in sorted(self.data):
+            if (loc0 <= start < loc1) and (loc0 <= stop <= loc1):
+                # entirely within the block
+                off = start - loc0
+                self.hit_count += 1
+                return self.data[(loc0, loc1)][off : off + stop - start]
+            if stop <= loc0:
+                break
+            if started and loc0 > loc_old:
+                # a gap where we need data
+                self.miss_count += 1
+                if self.strict:
+                    raise ValueError
+                out += b"\x00" * (loc0 - loc_old)
             if loc0 <= start < loc1:
+                # found the start
+                self.hit_count += 1
                 off = start - loc0
-                out = data[off : off + stop - start]
-                if not self.strict or loc1 >= stop:
-                    # The request is within a known range, or
-                    # it begins within a known range, and we
-                    # are allowed to pad reads beyond the
-                    # buffer with zero
-                    out += b"\x00" * (stop - loc1)
-                    self.hit_count += 1
-                    return out
-                else:
-                    # The request ends outside a known range,
-                    # and we are being "strict" about reads
-                    # beyond the buffer
-                    start = loc1
-                    break
-
-        # We only get here if there is a request outside the
-        # known parts of the file. In an ideal world, this
-        # should never happen
-        if self.fetcher is None:
-            # We cannot fetch the data, so raise an error
-            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
-        # We can fetch the data, but should warn the user
-        # that this may be slow
-        warnings.warn(
-            f"Read is outside the known file parts: {(start, stop)}. "
-            f"IO/caching performance may be poor!"
-        )
-        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
-        self.total_requested_bytes += stop - start
+                out = self.data[(loc0, loc1)][off : off + stop - start]
+                started = True
+            elif start < loc0 and stop > loc1:
+                # the whole block
+                self.hit_count += 1
+                out += self.data[(loc0, loc1)]
+            elif loc0 <= stop <= loc1:
+                # end block
+                self.hit_count += 1
+                return out + self.data[(loc0, loc1)][: stop - loc0]
+            loc_old = loc1
         self.miss_count += 1
-        return out + super()._fetch(start, stop)
+        if started and not self.strict:
+            return out + b"\x00" * (stop - loc_old)
+        raise ValueError
 
 
 class UpdatableLRU(Generic[P, T]):
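The rewritten `_fetch` walks the known blocks in sorted order, returning early for reads contained in a single block, concatenating whole blocks, and, with `strict=False` (now the default), zero-padding gaps instead of warning and delegating to the fetcher. A small sketch of the cache in isolation, calling the internal `_fetch` directly for illustration:

```python
from fsspec.caching import KnownPartsOfAFile

# Two known byte ranges of a nominal 30-byte file; with no fetcher, every
# read must be served from (or zero-padded around) these parts.
parts = {(0, 10): b"A" * 10, (20, 30): b"B" * 10}
cache = KnownPartsOfAFile(blocksize=0, fetcher=None, size=30, data=parts)

print(cache._fetch(2, 8))   # b"AAAAAA": entirely inside the first block
print(cache._fetch(5, 25))  # b"AAAAA" + 10 zero bytes + b"BBBBB": gap padded

strict = KnownPartsOfAFile(0, None, 30, data=parts, strict=True)
try:
    strict._fetch(5, 25)    # this read spans an unknown gap
except ValueError:
    print("strict=True raises instead of padding")
```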
--- fsspec-2025.10.0/fsspec/core.py
+++ fsspec-2025.12.0/fsspec/core.py
@@ -18,7 +18,7 @@ from fsspec.caching import (  # noqa: F401
 )
 from fsspec.compression import compr
 from fsspec.config import conf
-from fsspec.registry import filesystem, get_filesystem_class
+from fsspec.registry import available_protocols, filesystem, get_filesystem_class
 from fsspec.utils import (
     _unstrip_protocol,
     build_name_function,
@@ -334,34 +334,51 @@ def _un_chain(path, kwargs):
 
     if "::" in path:
         x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+        known_protocols = set(available_protocols())
         bits = []
+
+        # split on '::', then ensure each bit has a protocol
         for p in path.split("::"):
-            if "://" in p or x.match(p):
+            if p in known_protocols:
+                bits.append(p + "://")
+            elif "://" in p or x.match(p):
                 bits.append(p)
             else:
                 bits.append(p + "://")
     else:
         bits = [path]
+
     # [[url, protocol, kwargs], ...]
     out = []
     previous_bit = None
     kwargs = kwargs.copy()
+
     for bit in reversed(bits):
         protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
         cls = get_filesystem_class(protocol)
         extra_kwargs = cls._get_kwargs_from_urls(bit)
         kws = kwargs.pop(protocol, {})
+
         if bit is bits[0]:
             kws.update(kwargs)
+
         kw = dict(
             **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
             **kws,
         )
         bit = cls._strip_protocol(bit)
-        if "target_protocol" not in kw and issubclass(cls, ChainedFileSystem):
+
+        if (
+            "target_protocol" not in kw
+            and issubclass(cls, ChainedFileSystem)
+            and not bit
+        ):
+            # replace bit if we are chaining and no path given
             bit = previous_bit
+
         out.append((bit, protocol, kw))
         previous_bit = bit
+
     out.reverse()
     return out
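`_un_chain` is internal, but it shows the parsing directly: a bare segment that matches a registered protocol is now normalised to `protocol://` up front, and the `ChainedFileSystem` check lets a chaining layer with no path of its own inherit the path of the layer it wraps. A sketch, assuming `SimpleCacheFileSystem` counts as a `ChainedFileSystem` in this release:

```python
from fsspec.core import _un_chain

# "simplecache" is a bare registered protocol name, so it becomes
# "simplecache://"; its empty path is then replaced by the wrapped
# layer's path ("/afile").
for path, protocol, kw in _un_chain("simplecache::memory://afile", {}):
    print(protocol, repr(path), kw)
# simplecache '/afile' {}
# memory '/afile' {}
```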
--- fsspec-2025.10.0/fsspec/implementations/arrow.py
+++ fsspec-2025.12.0/fsspec/implementations/arrow.py
@@ -205,11 +205,11 @@ class ArrowFSWrapper(AbstractFileSystem):
         return self.fs.get_file_info(path).mtime
 
     def cat_file(self, path, start=None, end=None, **kwargs):
-        kwargs["seekable"] = start not in [None, 0]
+        kwargs.setdefault("seekable", start not in [None, 0])
         return super().cat_file(path, start=None, end=None, **kwargs)
 
     def get_file(self, rpath, lpath, **kwargs):
-        kwargs["seekable"] = False
+        kwargs.setdefault("seekable", False)
         super().get_file(rpath, lpath, **kwargs)
 
 
@@ -223,7 +223,6 @@ class ArrowFSWrapper(AbstractFileSystem):
         "readable",
         "writable",
         "close",
-        "size",
         "seekable",
     ],
 )
@@ -241,6 +240,10 @@ class ArrowFile(io.IOBase):
     def __enter__(self):
         return self
 
+    @property
+    def size(self):
+        return self.stream.size()
+
     def __exit__(self, *args):
         return self.close()
 
--- fsspec-2025.10.0/fsspec/implementations/asyn_wrapper.py
+++ fsspec-2025.12.0/fsspec/implementations/asyn_wrapper.py
@@ -5,6 +5,8 @@ import inspect
 import fsspec
 from fsspec.asyn import AsyncFileSystem, running_async
 
+from .chained import ChainedFileSystem
+
 
 def async_wrapper(func, obj=None, semaphore=None):
     """
@@ -35,7 +37,7 @@ def async_wrapper(func, obj=None, semaphore=None):
     return wrapper
 
 
-class AsyncFileSystemWrapper(AsyncFileSystem):
+class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
     """
     A wrapper class to convert a synchronous filesystem into an asynchronous one.
 
--- fsspec-2025.10.0/fsspec/implementations/cache_metadata.py
+++ fsspec-2025.12.0/fsspec/implementations/cache_metadata.py
@@ -15,9 +15,7 @@ except ImportError:
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from typing import Any, Literal
-
-    from typing_extensions import TypeAlias
+    from typing import Any, Literal, TypeAlias
 
     from .cached import CachingFileSystem
 
--- fsspec-2025.10.0/fsspec/implementations/cached.py
+++ fsspec-2025.12.0/fsspec/implementations/cached.py
@@ -6,8 +6,9 @@ import os
 import tempfile
 import time
 import weakref
+from collections.abc import Callable
 from shutil import rmtree
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from fsspec import filesystem
 from fsspec.callbacks import DEFAULT_CALLBACK
--- fsspec-2025.10.0/fsspec/implementations/data.py
+++ fsspec-2025.12.0/fsspec/implementations/data.py
@@ -1,6 +1,5 @@
 import base64
 import io
-from typing import Optional
 from urllib.parse import unquote
 
 from fsspec import AbstractFileSystem
@@ -50,7 +49,7 @@ class DataFileSystem(AbstractFileSystem):
         return io.BytesIO(self.cat_file(path))
 
     @staticmethod
-    def encode(data: bytes, mime: Optional[str] = None):
+    def encode(data: bytes, mime: str | None = None):
        """Format the given data into data-URL syntax
 
        This version always base64 encodes, even when the data is ascii/url-safe.
--- fsspec-2025.10.0/fsspec/implementations/dirfs.py
+++ fsspec-2025.12.0/fsspec/implementations/dirfs.py
@@ -1,8 +1,9 @@
 from .. import filesystem
 from ..asyn import AsyncFileSystem
+from .chained import ChainedFileSystem
 
 
-class DirFileSystem(AsyncFileSystem):
+class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
     """Directory prefix filesystem
 
     The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
--- fsspec-2025.10.0/fsspec/implementations/http.py
+++ fsspec-2025.12.0/fsspec/implementations/http.py
@@ -327,7 +327,7 @@ class HTTPFileSystem(AsyncFileSystem):
         async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
             self._raise_not_found_for_status(resp, rpath)
 
-    async def _exists(self, path, **kwargs):
+    async def _exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
@@ -335,8 +335,14 @@ class HTTPFileSystem(AsyncFileSystem):
             session = await self.set_session()
             r = await session.get(self.encode_url(path), **kw)
             async with r:
+                if strict:
+                    self._raise_not_found_for_status(r, path)
                 return r.status < 400
+        except FileNotFoundError:
+            return False
         except aiohttp.ClientError:
+            if strict:
+                raise
             return False
 
     async def _isfile(self, path, **kwargs):
--- fsspec-2025.10.0/fsspec/implementations/http_sync.py
+++ fsspec-2025.12.0/fsspec/implementations/http_sync.py
@@ -463,14 +463,20 @@ class HTTPFileSystem(AbstractFileSystem):
             end -= 1  # bytes range is inclusive
         return f"bytes={start}-{end}"
 
-    def exists(self, path, **kwargs):
+    def exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
             logger.debug(path)
             r = self.session.get(self.encode_url(path), **kw)
+            if strict:
+                self._raise_not_found_for_status(r, path)
             return r.status_code < 400
+        except FileNotFoundError:
+            return False
         except Exception:
+            if strict:
+                raise
             return False
 
     def isfile(self, path, **kwargs):
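The async and sync HTTP filesystems now share the same contract: `exists` defaults to the old swallow-everything boolean, while `strict=True` turns a 404 into `False` (via the internal `FileNotFoundError`) but re-raises transport failures. A hedged sketch against a hypothetical URL:

```python
import fsspec

fs = fsspec.filesystem("http")
url = "https://example.com/some/file"  # hypothetical URL

# Default behaviour: any failure, including DNS errors or timeouts,
# collapses to False.
print(fs.exists(url))

# strict=True: a 404 still returns False, but connection-level errors
# (aiohttp.ClientError on the async side) now propagate.
try:
    print(fs.exists(url, strict=True))
except Exception as exc:
    print("existence check failed:", exc)
```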
--- fsspec-2025.10.0/fsspec/implementations/reference.py
+++ fsspec-2025.12.0/fsspec/implementations/reference.py
@@ -219,7 +219,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)
 
-    @lru_cache()
+    @lru_cache
     def listdir(self):
         """List top-level directories"""
         dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
--- fsspec-2025.10.0/fsspec/json.py
+++ fsspec-2025.12.0/fsspec/json.py
@@ -1,13 +1,8 @@
 import json
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from contextlib import suppress
 from pathlib import PurePath
-from typing import (
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-)
+from typing import Any, ClassVar
 
 from .registry import _import_class, get_filesystem_class
 from .spec import AbstractFileSystem
@@ -45,12 +40,12 @@ class FilesystemJSONDecoder(json.JSONDecoder):
     def __init__(
         self,
         *,
-        object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
-        parse_float: Optional[Callable[[str], Any]] = None,
-        parse_int: Optional[Callable[[str], Any]] = None,
-        parse_constant: Optional[Callable[[str], Any]] = None,
+        object_hook: Callable[[dict[str, Any]], Any] | None = None,
+        parse_float: Callable[[str], Any] | None = None,
+        parse_int: Callable[[str], Any] | None = None,
+        parse_constant: Callable[[str], Any] | None = None,
         strict: bool = True,
-        object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
+        object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None,
     ) -> None:
         self.original_object_hook = object_hook
 
--- fsspec-2025.10.0/fsspec/parquet.py
+++ fsspec-2025.12.0/fsspec/parquet.py
@@ -1,8 +1,12 @@
 import io
 import json
 import warnings
+from typing import Literal
+
+import fsspec
 
 from .core import url_to_fs
+from .spec import AbstractBufferedFile
 from .utils import merge_offset_ranges
 
 # Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
 # on remote file systems.
 
 
-def open_parquet_file(
-    path,
-    mode="rb",
-    fs=None,
+class AlreadyBufferedFile(AbstractBufferedFile):
+    def _fetch_range(self, start, end):
+        raise NotImplementedError
+
+
+def open_parquet_files(
+    path: list[str],
+    mode: Literal["rb"] = "rb",
+    fs: None | fsspec.AbstractFileSystem = None,
     metadata=None,
-    columns=None,
-    row_groups=None,
-    storage_options=None,
-    strict=False,
-    engine="auto",
-    max_gap=64_000,
-    max_block=256_000_000,
-    footer_sample_size=1_000_000,
+    columns: None | list[str] = None,
+    row_groups: None | list[int] = None,
+    storage_options: None | dict = None,
+    engine: str = "auto",
+    max_gap: int = 64_000,
+    max_block: int = 256_000_000,
+    footer_sample_size: int = 1_000_000,
+    filters: None | list[list[list[str]]] = None,
     **kwargs,
 ):
     """
@@ -72,12 +81,6 @@ def open_parquet_file(
     storage_options : dict, optional
         Used to generate an `AbstractFileSystem` object if `fs` was
         not specified.
-    strict : bool, optional
-        Whether the resulting `KnownPartsOfAFile` cache should
-        fetch reads that go beyond a known byte-range boundary.
-        If `False` (the default), any read that ends outside a
-        known part will be zero padded. Note that using
-        `strict=True` may be useful for debugging.
     max_gap : int, optional
         Neighboring byte ranges will only be merged when their
         inter-range gap is <= `max_gap`. Default is 64KB.
@@ -89,6 +92,10 @@ def open_parquet_file(
         for the footer metadata. If the sampled bytes do not contain
         the footer, a second read request will be required, and
         performance will suffer. Default is 1MB.
+    filters : list[list], optional
+        List of filters to apply to prevent reading row groups, of the
+        same format as accepted by the loading engines. Ignored if
+        ``row_groups`` is specified.
     **kwargs :
         Optional key-word arguments to pass to `fs.open`
     """
@@ -96,20 +103,36 @@ def open_parquet_file(
     # Make sure we have an `AbstractFileSystem` object
     # to work with
     if fs is None:
-        fs, path = url_to_fs(path, **(storage_options or {}))
+        path0 = path
+        if isinstance(path, (list, tuple)):
+            path = path[0]
+        fs, path = url_to_fs(path, **(storage_options or {}))
+    else:
+        path0 = path
 
-    # For now, `columns == []` not supported
-    # (Default to `columns=None`)
+    # For now, `columns == []` not supported, is the same
+    # as all columns
     if columns is not None and len(columns) == 0:
-        columns = None
+        columns = None
 
     # Set the engine
     engine = _set_engine(engine)
 
-    # Fetch the known byte ranges needed to read
-    # `columns` and/or `row_groups`
+    if isinstance(path0, (list, tuple)):
+        paths = path0
+    elif "*" in path:
+        paths = fs.glob(path)
+    elif path0.endswith("/"):  # or fs.isdir(path):
+        paths = [
+            _
+            for _ in fs.find(path, withdirs=False, detail=False)
+            if _.endswith((".parquet", ".parq"))
+        ]
+    else:
+        paths = [path]
+
     data = _get_parquet_byte_ranges(
-        [path],
+        paths,
         fs,
         metadata=metadata,
         columns=columns,
@@ -118,24 +141,37 @@ def open_parquet_file(
         max_gap=max_gap,
         max_block=max_block,
         footer_sample_size=footer_sample_size,
+        filters=filters,
     )
 
-    # Extract file name from `data`
-    fn = next(iter(data)) if data else path
-
     # Call self.open with "parts" caching
     options = kwargs.pop("cache_options", {}).copy()
-    return fs.open(
-        fn,
-        mode=mode,
-        cache_type="parts",
-        cache_options={
-            **options,
-            "data": data.get(fn, {}),
-            "strict": strict,
-        },
-        **kwargs,
-    )
+    return [
+        AlreadyBufferedFile(
+            fs=None,
+            path=fn,
+            mode=mode,
+            cache_type="parts",
+            cache_options={
+                **options,
+                "data": data.get(fn, {}),
+            },
+            size=max(_[1] for _ in data.get(fn, {})),
+            **kwargs,
+        )
+        for fn in data
+    ]
+
+
+def open_parquet_file(*args, **kwargs):
+    """Create files tailed to reading specific parts of parquet files
+
+    Please see ``open_parquet_files`` for details of the arguments. The
+    difference is, this function always returns a single ``AleadyBufferedFile``,
+    whereas `open_parquet_files`` always returns a list of files, even if
+    there are one or zero matching parquet files.
+    """
+    return open_parquet_files(*args, **kwargs)[0]
 
 
 def _get_parquet_byte_ranges(
@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
     max_block=256_000_000,
     footer_sample_size=1_000_000,
     engine="auto",
+    filters=None,
 ):
     """Get a dictionary of the known byte ranges needed
     to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
             row_groups=row_groups,
             max_gap=max_gap,
             max_block=max_block,
+            filters=filters,
         )
 
     # Get file sizes asynchronously
@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
     data_starts = []
     data_ends = []
     add_header_magic = True
-    if columns is None and row_groups is None:
+    if columns is None and row_groups is None and filters is None:
         # We are NOT selecting specific columns or row-groups.
         #
         # We can avoid sampling the footers, and just transfer
         # all file data with cat_ranges
         for i, path in enumerate(paths):
             result[path] = {}
-            for b in range(0, file_sizes[i], max_block):
-                data_paths.append(path)
-                data_starts.append(b)
-                data_ends.append(min(b + max_block, file_sizes[i]))
+            data_paths.append(path)
+            data_starts.append(0)
+            data_ends.append(file_sizes[i])
         add_header_magic = False  # "Magic" should already be included
     else:
         # We ARE selecting specific columns or row-groups.
@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(
 
     # Calculate required byte ranges for each path
     for i, path in enumerate(paths):
-        # Deal with small-file case.
-        # Just include all remaining bytes of the file
-        # in a single range.
-        if file_sizes[i] < max_block:
-            if footer_starts[i] > 0:
-                # Only need to transfer the data if the
-                # footer sample isn't already the whole file
-                data_paths.append(path)
-                data_starts.append(0)
-                data_ends.append(footer_starts[i])
-            continue
-
         # Use "engine" to collect data byte ranges
         path_data_starts, path_data_ends = engine._parquet_byte_ranges(
             columns,
             row_groups=row_groups,
            footer=footer_samples[i],
            footer_start=footer_starts[i],
+            filters=filters,
        )
 
        data_paths += [path] * len(path_data_starts)
        data_starts += path_data_starts
        data_ends += path_data_ends
+        result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
+            footer_samples[i]
+        )
 
     # Merge adjacent offset ranges
     data_paths, data_starts, data_ends = merge_offset_ranges(
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
     row_groups=None,
     max_gap=64_000,
     max_block=256_000_000,
+    filters=None,
 ):
     """Simplified version of `_get_parquet_byte_ranges` for
     the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(
 
     # Use "engine" to collect data byte ranges
     data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
-        columns,
-        row_groups=row_groups,
-        metadata=metadata,
+        columns, row_groups=row_groups, metadata=metadata, filters=filters
     )
 
     # Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         # Initialize offset ranges and define ParqetFile metadata
         pf = metadata
         data_paths, data_starts, data_ends = [], [], []
+        if filters and row_groups:
+            raise ValueError("filters and row_groups cannot be used together")
         if pf is None:
             pf = self.fp.ParquetFile(io.BytesIO(footer))
 
         # Convert columns to a set and add any index columns
         # specified in the pandas metadata (just in case)
-        column_set = None if columns is None else set(columns)
+        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
         if column_set is not None and hasattr(pf, "pandas_metadata"):
             md_index = [
                 ind
@@ -422,7 +453,12 @@ class FastparquetEngine:
 
         # Check if row_groups is a list of integers
         # or a list of row-group metadata
-        if row_groups and not isinstance(row_groups[0], int):
+        if filters:
+            from fastparquet.api import filter_row_groups
+
+            row_group_indices = None
+            row_groups = filter_row_groups(pf, filters)
+        elif row_groups and not isinstance(row_groups[0], int):
             # Input row_groups contains row-group metadata
             row_group_indices = None
         else:
@@ -486,9 +522,12 @@ class PyarrowEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         if metadata is not None:
             raise ValueError("metadata input not supported for PyarrowEngine")
+        if filters:
+            raise NotImplementedError
 
         data_starts, data_ends = [], []
         md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
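Taken together, the parquet changes replace the single-path helper with `open_parquet_files`, which expands a list, glob, or directory prefix into `AlreadyBufferedFile` objects whose "parts" caches already hold the byte ranges implied by `columns`, `row_groups`, or the new `filters`. A hedged sketch; the bucket, column, and filter values are hypothetical, and per the engine code above `filters` currently works only with fastparquet:

```python
from fsspec.parquet import open_parquet_file, open_parquet_files

# A trailing "/" makes the helper fs.find() all *.parquet / *.parq files
# under the prefix; a list of paths or a "*" glob is handled the same way.
files = open_parquet_files(
    "s3://bucket/dataset/",
    columns=["value"],                   # only fetch this column's byte ranges
    filters=[[("year", "==", "2025")]],  # row-group pruning, fastparquet only
    engine="fastparquet",
    storage_options={"anon": True},
)
for f in files:
    # AlreadyBufferedFile serves reads from its cached parts, so a parquet
    # engine can consume it without extra remote requests.
    print(f.path, f.size)

# The old entry point remains as a thin wrapper returning files[0]:
single = open_parquet_file("s3://bucket/dataset/part-0.parquet")
```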
--- fsspec-2025.10.0/fsspec/registry.py
+++ fsspec-2025.12.0/fsspec/registry.py
@@ -72,6 +72,9 @@ known_implementations = {
         "class": "fsspec.implementations.arrow.HadoopFileSystem",
         "err": "pyarrow and local java libraries required for HDFS",
     },
+    "async_wrapper": {
+        "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
+    },
     "asynclocal": {
         "class": "morefs.asyn_local.AsyncLocalFileSystem",
         "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
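Registering the wrapper under a protocol name means it can be constructed through the ordinary registry machinery rather than by importing the class. A minimal sketch, assuming the wrapped filesystem is passed as the `fs` keyword:

```python
import fsspec
from fsspec.implementations.local import LocalFileSystem

# "async_wrapper" now resolves via known_implementations.
afs = fsspec.filesystem("async_wrapper", fs=LocalFileSystem())

print(type(afs).__name__)  # AsyncFileSystemWrapper
print(afs.exists("/"))     # sync facade over the wrapped local filesystem
```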
--- fsspec-2025.10.0/fsspec/utils.py
+++ fsspec-2025.12.0/fsspec/utils.py
@@ -7,23 +7,16 @@ import os
 import re
 import sys
 import tempfile
-from collections.abc import Iterable, Iterator, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
 from functools import partial
 from hashlib import md5
 from importlib.metadata import version
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, TypeVar
 from urllib.parse import urlsplit
 
 if TYPE_CHECKING:
     import pathlib
-
-    from typing_extensions import TypeGuard
+    from typing import TypeGuard
 
     from fsspec.spec import AbstractFileSystem
 
--- fsspec-2025.10.0/pyproject.toml
+++ fsspec-2025.12.0/pyproject.toml
@@ -9,18 +9,18 @@ description = "File-system specification"
 readme = "README.md"
 license = "BSD-3-Clause"
 license-files = ["LICENSE"]
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 maintainers = [{ name = "Martin Durant", email = "mdurant@anaconda.com" }]
 keywords = ["file"]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Operating System :: OS Independent",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]
 
 [project.optional-dependencies]
@@ -194,6 +194,8 @@ ignore = [
     "B026",
     # No explicit `stacklevel` keyword argument found
     "B028",
+    # `zip` without explicit `strict` keyword
+    "B905",
     # Assigning lambda expression
     "E731",
     # Ambiguous variable names
@@ -220,8 +222,6 @@ ignore = [
     "SIM115",
     "SIM117",
     "TC003",
-    # https://github.com/astral-sh/ruff/issues/7871
-    "UP038",
     # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
     "W191",
     "E111",