fsspec 2024.10.0__tar.gz → 2025.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. {fsspec-2024.10.0 → fsspec-2025.2.0}/.github/workflows/main.yaml +16 -8
  2. {fsspec-2024.10.0 → fsspec-2025.2.0}/.pre-commit-config.yaml +6 -2
  3. {fsspec-2024.10.0 → fsspec-2025.2.0}/PKG-INFO +3 -3
  4. {fsspec-2024.10.0 → fsspec-2025.2.0}/ci/environment-friends.yml +4 -2
  5. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/api.rst +44 -12
  6. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/async.rst +34 -0
  7. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/changelog.rst +44 -0
  8. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/features.rst +18 -0
  9. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/_version.py +2 -2
  10. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/archive.py +3 -1
  11. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/asyn.py +5 -7
  12. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/caching.py +34 -19
  13. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/core.py +15 -13
  14. fsspec-2025.2.0/fsspec/implementations/asyn_wrapper.py +99 -0
  15. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/cached.py +1 -1
  16. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/dbfs.py +3 -3
  17. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/ftp.py +1 -1
  18. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/http.py +4 -22
  19. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/local.py +6 -1
  20. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/memory.py +8 -3
  21. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/reference.py +124 -17
  22. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/webhdfs.py +2 -1
  23. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/mapping.py +1 -1
  24. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/parquet.py +1 -1
  25. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/registry.py +7 -3
  26. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/spec.py +209 -33
  27. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/__init__.py +3 -1
  28. fsspec-2025.2.0/fsspec/tests/abstract/open.py +11 -0
  29. fsspec-2025.2.0/fsspec/tests/abstract/pipe.py +11 -0
  30. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/utils.py +4 -2
  31. {fsspec-2024.10.0 → fsspec-2025.2.0}/pyproject.toml +15 -3
  32. fsspec-2024.10.0/.github/workflows/codespell.yml +0 -19
  33. {fsspec-2024.10.0 → fsspec-2025.2.0}/.codespellrc +0 -0
  34. {fsspec-2024.10.0 → fsspec-2025.2.0}/.coveragerc +0 -0
  35. {fsspec-2024.10.0 → fsspec-2025.2.0}/.gitattributes +0 -0
  36. {fsspec-2024.10.0 → fsspec-2025.2.0}/.github/workflows/pypipublish.yaml +0 -0
  37. {fsspec-2024.10.0 → fsspec-2025.2.0}/.gitignore +0 -0
  38. {fsspec-2024.10.0 → fsspec-2025.2.0}/LICENSE +0 -0
  39. {fsspec-2024.10.0 → fsspec-2025.2.0}/README.md +0 -0
  40. {fsspec-2024.10.0 → fsspec-2025.2.0}/ci/environment-downstream.yml +0 -0
  41. {fsspec-2024.10.0 → fsspec-2025.2.0}/ci/environment-py38.yml +0 -0
  42. {fsspec-2024.10.0 → fsspec-2025.2.0}/ci/environment-typecheck.yml +0 -0
  43. {fsspec-2024.10.0 → fsspec-2025.2.0}/ci/environment-win.yml +0 -0
  44. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/Makefile +0 -0
  45. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/README.md +0 -0
  46. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/environment.yml +0 -0
  47. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/make.bat +0 -0
  48. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/_static/custom.css +0 -0
  49. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/conf.py +0 -0
  50. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/copying.rst +0 -0
  51. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/developer.rst +0 -0
  52. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/img/gui.png +0 -0
  53. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/index.rst +0 -0
  54. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/intro.rst +0 -0
  55. {fsspec-2024.10.0 → fsspec-2025.2.0}/docs/source/usage.rst +0 -0
  56. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/__init__.py +0 -0
  57. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/callbacks.py +0 -0
  58. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/compression.py +0 -0
  59. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/config.py +0 -0
  60. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/conftest.py +0 -0
  61. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/dircache.py +0 -0
  62. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/exceptions.py +0 -0
  63. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/fuse.py +0 -0
  64. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/generic.py +0 -0
  65. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/gui.py +0 -0
  66. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/__init__.py +0 -0
  67. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/arrow.py +0 -0
  68. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/cache_mapper.py +0 -0
  69. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/cache_metadata.py +0 -0
  70. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/dask.py +0 -0
  71. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/data.py +0 -0
  72. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/dirfs.py +0 -0
  73. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/git.py +0 -0
  74. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/github.py +0 -0
  75. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/jupyter.py +0 -0
  76. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/libarchive.py +0 -0
  77. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/sftp.py +0 -0
  78. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/smb.py +0 -0
  79. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/tar.py +0 -0
  80. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/implementations/zip.py +0 -0
  81. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/json.py +0 -0
  82. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/common.py +0 -0
  83. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/copy.py +0 -0
  84. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/get.py +0 -0
  85. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/mv.py +0 -0
  86. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/tests/abstract/put.py +0 -0
  87. {fsspec-2024.10.0 → fsspec-2025.2.0}/fsspec/transaction.py +0 -0
  88. {fsspec-2024.10.0 → fsspec-2025.2.0}/install_s3fs.sh +0 -0
  89. {fsspec-2024.10.0 → fsspec-2025.2.0}/readthedocs.yml +0 -0
  90. {fsspec-2024.10.0 → fsspec-2025.2.0}/setup.cfg +0 -0
@@ -9,11 +9,16 @@ on:
9
9
  jobs:
10
10
  linux:
11
11
  name: ${{ matrix.PY }}-pytest
12
- runs-on: ubuntu-latest
12
+ runs-on: ubuntu-24.04
13
13
  strategy:
14
14
  fail-fast: false
15
15
  matrix:
16
- PY: ["3.8", "3.9", "3.10", "3.11", "3.12"]
16
+ PY:
17
+ - "3.9"
18
+ - "3.10"
19
+ - "3.11"
20
+ - "3.12"
21
+ - "3.13"
17
22
 
18
23
  env:
19
24
  CIRUN: true
@@ -62,7 +67,7 @@ jobs:
62
67
 
63
68
  lint:
64
69
  name: lint
65
- runs-on: ubuntu-latest
70
+ runs-on: ubuntu-24.04
66
71
  steps:
67
72
  - uses: actions/checkout@main
68
73
  - uses: actions/setup-python@main
@@ -88,7 +93,7 @@ jobs:
88
93
  #
89
94
  downstream:
90
95
  name: downstream
91
- runs-on: ubuntu-latest
96
+ runs-on: ubuntu-24.04
92
97
 
93
98
  steps:
94
99
  - name: Checkout
@@ -121,11 +126,11 @@ jobs:
121
126
 
122
127
  fsspec_friends:
123
128
  name: ${{ matrix.FRIEND }}-pytest
124
- runs-on: ubuntu-latest
129
+ runs-on: ubuntu-24.04
125
130
  strategy:
126
131
  fail-fast: false
127
132
  matrix:
128
- FRIEND: [gcsfs, s3fs]
133
+ FRIEND: [s3fs, gcsfs]
129
134
 
130
135
  env:
131
136
  CIRUN: true
@@ -150,8 +155,11 @@ jobs:
150
155
  shell: bash -l {0}
151
156
  run: |
152
157
  pip install -e . --no-deps
153
- pip install -e ./${{ matrix.FRIEND }} --no-deps
158
+ pip list
154
159
 
155
160
  - name: Test
156
161
  shell: bash -l {0}
157
- run: pytest -v ${{ matrix.FRIEND }}
162
+ run: |
163
+ cd ${{ matrix.FRIEND }}
164
+ pytest -v
165
+ cd ..
@@ -5,7 +5,7 @@ exclude: >
5
5
  repos:
6
6
 
7
7
  - repo: https://github.com/pre-commit/pre-commit-hooks
8
- rev: v4.5.0
8
+ rev: v5.0.0
9
9
  hooks:
10
10
  - id: trailing-whitespace
11
11
  - id: end-of-file-fixer
@@ -14,10 +14,14 @@ repos:
14
14
  - id: check-yaml
15
15
  - repo: https://github.com/astral-sh/ruff-pre-commit
16
16
  # Ruff version.
17
- rev: v0.4.4
17
+ rev: v0.9.2
18
18
  hooks:
19
19
  # Run the linter.
20
20
  - id: ruff
21
21
  args: [ --fix, "--show-fixes"]
22
22
  - id: ruff-format
23
23
  types_or: [python]
24
+ - repo: https://github.com/codespell-project/codespell
25
+ rev: v2.4.0
26
+ hooks:
27
+ - id: codespell
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: fsspec
3
- Version: 2024.10.0
3
+ Version: 2025.2.0
4
4
  Summary: File-system specification
5
5
  Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
6
6
  Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
@@ -46,6 +46,7 @@ Classifier: Programming Language :: Python :: 3.9
46
46
  Classifier: Programming Language :: Python :: 3.10
47
47
  Classifier: Programming Language :: Python :: 3.11
48
48
  Classifier: Programming Language :: Python :: 3.12
49
+ Classifier: Programming Language :: Python :: 3.13
49
50
  Requires-Python: >=3.8
50
51
  Provides-Extra: abfs
51
52
  Requires-Dist: adlfs; extra == 'abfs'
@@ -130,7 +131,6 @@ Requires-Dist: pytest-rerunfailures; extra == 'test'
130
131
  Requires-Dist: requests; extra == 'test'
131
132
  Provides-Extra: test-downstream
132
133
  Requires-Dist: aiobotocore<3.0.0,>=2.5.4; extra == 'test-downstream'
133
- Requires-Dist: dask-expr; extra == 'test-downstream'
134
134
  Requires-Dist: dask[dataframe,test]; extra == 'test-downstream'
135
135
  Requires-Dist: moto[server]<5,>4; extra == 'test-downstream'
136
136
  Requires-Dist: pytest-timeout; extra == 'test-downstream'
@@ -9,7 +9,6 @@ dependencies:
9
9
  - pytest-cov
10
10
  - pytest-mock
11
11
  - pip
12
- - pytest<8
13
12
  - ujson
14
13
  - requests
15
14
  - decorator
@@ -25,5 +24,8 @@ dependencies:
25
24
  - google-api-python-client
26
25
  - httpretty
27
26
  - aiobotocore
28
- - "moto >=4,<5"
27
+ - moto
29
28
  - flask
29
+ - pip:
30
+ - git+https://github.com/fsspec/s3fs
31
+ - git+https://github.com/fsspec/gcsfs
@@ -209,41 +209,73 @@ Built-in Implementations
209
209
  Other Known Implementations
210
210
  ---------------------------
211
211
 
212
- - `abfs`_ for Azure Blob service
213
- - `adl`_ for Azure DataLake storage
212
+
213
+ Note that most of these projects are hosted outside of the `fsspec` organisation. Please read their
214
+ documentation carefully before using any particular package.
215
+
216
+ - `abfs`_ for Azure Blob service, with protocol "abfs://"
217
+ - `adl`_ for Azure DataLake storage, with protocol "adl://"
214
218
  - `alluxiofs`_ to access fsspec implemented filesystem with Alluxio distributed cache
215
- - `boxfs`_ for access to Box file storage
216
- - `dropbox`_ for access to dropbox shares
219
+ - `boxfs`_ for access to Box file storage, with protocol "box://"
220
+ - `csvbase`_ for access to csvbase.com hosted CSV files, with protocol "csvbase://"
221
+ - `dropbox`_ for access to dropbox shares, with protocol "dropbox://"
217
222
  - `dvc`_ to access DVC/Git repository as a filesystem
218
- - `gcsfs`_ for Google Cloud Storage
223
+ - `fsspec-encrypted`_ for transparent encryption on top of other fsspec filesystems.
224
+ - `gcsfs`_ for Google Cloud Storage, with protocol "gcs://"
219
225
  - `gdrive`_ to access Google Drive and shares (experimental)
226
+ - `git`_ to access Git repositories
220
227
  - `huggingface_hub`_ to access the Hugging Face Hub filesystem, with protocol "hf://"
221
- - `lakefs`_ for lakeFS data lakes
222
- - `ocifs`_ for access to Oracle Cloud Object Storage
228
+ - `hdfs-native`_ to access Hadoop filesystem, with protocol "hdfs://"
229
+ - `httpfs-sync`_ to access HTTP(s) files in a synchronous manner to offer an alternative to the aiohttp-based implementation.
230
+ - `ipfsspec`_ for the InterPlanetary File System (IPFS), with protocol "ipfs://"
231
+ - `irods`_ for access to iRODS servers, with protocol "irods://"
232
+ - `lakefs`_ for lakeFS data lakes, with protocol "lakefs://"
233
+ - `morefs`_ for `OverlayFileSystem`, `DictFileSystem`, and others
234
+ - `ocifs`_ for access to Oracle Cloud Object Storage, with protocol "oci://"
223
235
  - `ocilake`_ for OCI Data Lake storage
224
236
  - `ossfs`_ for Alibaba Cloud (Aliyun) Object Storage System (OSS)
225
237
  - `p9fs`_ for 9P (Plan 9 Filesystem Protocol) servers
226
- - `s3fs`_ for Amazon S3 and other compatible stores
238
+ - `PyAthena`_ for S3 access to Amazon Athena, with protocol "s3://" or "s3a://"
239
+ - `PyDrive2`_ for Google Drive access
240
+ - `s3fs`_ for Amazon S3 and other compatible stores, with protocol "s3://"
241
+ - `sshfs`_ for access to SSH servers, with protocol "ssh://" or "sftp://"
242
+ - `swiftspec`_ for OpenStack SWIFT, with protocol "swift://"
243
+ - `tosfs`_ for ByteDance volcano engine Tinder Object Storage (TOS)
227
244
  - `wandbfs`_ to access Wandb run data (experimental)
228
- - `webdav4`_ for WebDAV
245
+ - `wandbfsspec`_ to access Weights & Biases (experimental)
246
+ - `webdav4`_ for WebDAV, with protocol "webdav://" or "dav://"
229
247
  - `xrootd`_ for xrootd, with protocol "root://"
230
248
 
231
249
  .. _abfs: https://github.com/dask/adlfs
232
250
  .. _adl: https://github.com/dask/adlfs
233
251
  .. _alluxiofs: https://github.com/fsspec/alluxiofs
234
252
  .. _boxfs: https://github.com/IBM/boxfs
235
- .. _dropbox: https://github.com/MarineChap/intake_dropbox
253
+ .. _csvbase: https://github.com/calpaterson/csvbase-client
254
+ .. _dropbox: https://github.com/fsspec/dropboxdrivefs
236
255
  .. _dvc: https://github.com/iterative/dvc
256
+ .. _fsspec-encrypted: https://github.com/thevgergroup/fsspec-encrypted
237
257
  .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/
238
258
  .. _gdrive: https://github.com/fsspec/gdrivefs
259
+ .. _git: https://github.com/iterative/scmrepo
260
+ .. _hdfs-native: https://github.com/Kimahriman/hdfs-native/blob/master/python/hdfs_native/fsspec.py
261
+ .. _httpfs-sync: https://github.com/moradology/httpfs-sync
239
262
  .. _huggingface_hub: https://huggingface.co/docs/huggingface_hub/main/en/guides/hf_file_system
240
- .. _lakefs: https://github.com/appliedAI-Initiative/lakefs-spec
241
- .. _ocifs: https://pypi.org/project/ocifs
263
+ .. _ipfsspec: https://github.com/fsspec/ipfsspec
264
+ .. _irods: https://github.com/xwcl/irods_fsspec
265
+ .. _lakefs: https://github.com/aai-institute/lakefs-spec
266
+ .. _morefs: https://github.com/iterative/morefs
267
+ .. _ocifs: https://ocifs.readthedocs.io/en/latest/
242
268
  .. _ocilake: https://github.com/oracle/ocifs
243
269
  .. _ossfs: https://github.com/fsspec/ossfs
244
270
  .. _p9fs: https://github.com/pbchekin/p9fs-py
271
+ .. _PyAthena: https://github.com/laughingman7743/PyAthena
272
+ .. _PyDrive2: https://github.com/iterative/PyDrive2
245
273
  .. _s3fs: https://s3fs.readthedocs.io/en/latest/
274
+ .. _sshfs: https://github.com/fsspec/sshfs
275
+ .. _swiftspec: https://github.com/fsspec/swiftspec
276
+ .. _tosfs: https://tosfs.readthedocs.io/en/latest/
246
277
  .. _wandbfs: https://github.com/jkulhanek/wandbfs
278
+ .. _wandbfsspec: https://github.com/alvarobartt/wandbfsspec
247
279
  .. _webdav4: https://github.com/skshetry/webdav4
248
280
  .. _xrootd: https://github.com/CoffeaTeam/fsspec-xrootd
249
281
 
@@ -152,3 +152,37 @@ available as the attribute ``.loop``.
152
152
 
153
153
  <script data-goatcounter="https://fsspec.goatcounter.com/count"
154
154
  async src="//gc.zgo.at/count.js"></script>
155
+
156
+ AsyncFileSystemWrapper
157
+ ----------------------
158
+
159
+ The `AsyncFileSystemWrapper` class is an experimental feature that allows you to convert
160
+ a synchronous filesystem into an asynchronous one. This is useful for quickly integrating
161
+ synchronous filesystems into workflows that may expect `AsyncFileSystem` instances.
162
+
163
+ Basic Usage
164
+ ~~~~~~~~~~~
165
+
166
+ To use `AsyncFileSystemWrapper`, wrap any synchronous filesystem to work in an asynchronous context.
167
+ In this example, the synchronous `LocalFileSystem` is wrapped, creating an `AsyncFileSystem` instance
168
+ backed by the normal, synchronous methods of `LocalFileSystem`:
169
+
170
+ .. code-block:: python
171
+
172
+ import asyncio
173
+ import fsspec
174
+ from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
175
+
176
+ async def async_copy_file():
177
+ sync_fs = fsspec.filesystem('file') # by-default synchronous, local filesystem
178
+ async_fs = AsyncFileSystemWrapper(sync_fs)
179
+ return await async_fs._copy('/source/file.txt', '/destination/file.txt')
180
+
181
+ asyncio.run(async_copy_file())
182
+
183
+ Limitations
184
+ ~~~~~~~~~~~
185
+
186
+ This is experimental. Users should not expect this wrapper to magically make things faster.
187
+ It is primarily provided to allow usage of synchronous filesystems with interfaces that expect
188
+ `AsyncFileSystem` instances.
@@ -1,6 +1,50 @@
1
1
  Changelog
2
2
  =========
3
3
 
4
+ 2025.2.0
5
+ --------
6
+
7
+ Enhancements
8
+
9
+ - add open() to referenceFS (#1778)
10
+
11
+ Fixes
12
+
13
+ - don't make async open() in async-wrapper (#1769)
14
+ - fix CI following dask-expr upstream change (#1781)
15
+ - cope with zarr3 "Buffer" objects in referenceFS (#1784)
16
+
17
+ Other
18
+
19
+ - use itemgetter in archiveFS (#1764)
20
+ - document that newline is included in readline(s) (#1770)
21
+ - format/spelling (#1774, 1779, 1780)
22
+
23
+ 2024.12.0
24
+ ---------
25
+
26
+ Enhancements
27
+
28
+ - "exclusive" mode for writing (#1762, 1756, 174+)
29
+ - "tree" text display of filesystem contents (#1750)
30
+ - async wrapper for sync FSs (#1745)
31
+ - new known implementation: tosfs (#1739)
32
+ - consolidate block fetch requests (#1733)
33
+
34
+ Fixes
35
+
36
+ - better webHDFS proxies (#
37
+ - sync FSs in referenceFS (#1755)
38
+ - don't serialize file caches (#1753)
39
+ - race condition in local ls() (#1744)
40
+ - missing/nan references in parquet (#1738)
41
+ - _un_chain kwargs (#1736)
42
+ - async _cat_file in referenceFS (#1734)
43
+
44
+ Other
45
+
46
+ - fallback implementation for _fetch_range (#1732)
47
+
4
48
  2024.10.0
5
49
  ---------
6
50
 
@@ -408,3 +408,21 @@ tqdm.
408
408
 
409
409
  <script data-goatcounter="https://fsspec.goatcounter.com/count"
410
410
  async src="//gc.zgo.at/count.js"></script>
411
+
412
+
413
+ Exclusive write
414
+ ---------------
415
+
416
+ Some backends support writing to a file only if it doesn't already exist. This may be
417
+ implemented for the following methods:
418
+ - pipe_file (with argument ``mode=='create'``)
419
+ - put_file (with argument ``mode=='create'``)
420
+ - open (with argument ``mode="xb"``)
421
+ Since some writes will be achieved in blocks, the timing of when the check is done is
422
+ not defined - it may be at the start or at the completion of the operation, depending
423
+ on the backend.
424
+
425
+ If using exclusive mode on a file that does already exist, a ``FileExistsError`` will
426
+ be raised.
427
+
428
+ This feature is currently included on a trial basis and may change in the future.
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2024.10.0'
16
- __version_tuple__ = version_tuple = (2024, 10, 0)
15
+ __version__ = version = '2025.2.0'
16
+ __version_tuple__ = version_tuple = (2025, 2, 0)
@@ -1,3 +1,5 @@
1
+ import operator
2
+
1
3
  from fsspec import AbstractFileSystem
2
4
  from fsspec.utils import tokenize
3
5
 
@@ -67,7 +69,7 @@ class AbstractArchiveFileSystem(AbstractFileSystem):
67
69
  out = {"name": ppath, "size": 0, "type": "directory"}
68
70
  paths[ppath] = out
69
71
  if detail:
70
- out = sorted(paths.values(), key=lambda _: _["name"])
72
+ out = sorted(paths.values(), key=operator.itemgetter("name"))
71
73
  return out
72
74
  else:
73
75
  return sorted(paths)
@@ -408,7 +408,7 @@ class AsyncFileSystem(AbstractFileSystem):
408
408
  continue
409
409
  raise ex
410
410
 
411
- async def _pipe_file(self, path, value, **kwargs):
411
+ async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
412
412
  raise NotImplementedError
413
413
 
414
414
  async def _pipe(self, path, value=None, batch_size=None, **kwargs):
@@ -517,7 +517,7 @@ class AsyncFileSystem(AbstractFileSystem):
517
517
  coros, batch_size=batch_size, nofiles=True, return_exceptions=True
518
518
  )
519
519
 
520
- async def _put_file(self, lpath, rpath, **kwargs):
520
+ async def _put_file(self, lpath, rpath, mode="overwrite", **kwargs):
521
521
  raise NotImplementedError
522
522
 
523
523
  async def _put(
@@ -816,11 +816,9 @@ class AsyncFileSystem(AbstractFileSystem):
816
816
  p: info
817
817
  for p, info in sorted(allpaths.items())
818
818
  if pattern.match(
819
- (
820
- p + "/"
821
- if append_slash_to_dirname and info["type"] == "directory"
822
- else p
823
- )
819
+ p + "/"
820
+ if append_slash_to_dirname and info["type"] == "directory"
821
+ else p
824
822
  )
825
823
  }
826
824
 
@@ -8,6 +8,8 @@ import os
8
8
  import threading
9
9
  import warnings
10
10
  from concurrent.futures import Future, ThreadPoolExecutor
11
+ from itertools import groupby
12
+ from operator import itemgetter
11
13
  from typing import (
12
14
  TYPE_CHECKING,
13
15
  Any,
@@ -85,12 +87,7 @@ class BaseCache:
85
87
  if self.hit_count == 0 and self.miss_count == 0:
86
88
  # a cache that does nothing, this is for logs only
87
89
  return ""
88
- return " , %s: %d hits, %d misses, %d total requested bytes" % (
89
- self.name,
90
- self.hit_count,
91
- self.miss_count,
92
- self.total_requested_bytes,
93
- )
90
+ return f" , {self.name}: {self.hit_count} hits, {self.miss_count} misses, {self.total_requested_bytes} total requested bytes"
94
91
 
95
92
  def __repr__(self) -> str:
96
93
  # TODO: use rich for better formatting
@@ -161,21 +158,39 @@ class MMapCache(BaseCache):
161
158
  return b""
162
159
  start_block = start // self.blocksize
163
160
  end_block = end // self.blocksize
164
- need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
165
- hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
166
- self.miss_count += len(need)
167
- self.hit_count += len(hits)
168
- while need:
169
- # TODO: not a for loop so we can consolidate blocks later to
170
- # make fewer fetch calls; this could be parallel
171
- i = need.pop(0)
172
-
173
- sstart = i * self.blocksize
174
- send = min(sstart + self.blocksize, self.size)
161
+ block_range = range(start_block, end_block + 1)
162
+ # Determine which blocks need to be fetched. This sequence is sorted by construction.
163
+ need = (i for i in block_range if i not in self.blocks)
164
+ # Count the number of blocks already cached
165
+ self.hit_count += sum(1 for i in block_range if i in self.blocks)
166
+
167
+ # Consolidate needed blocks.
168
+ # Algorithm adapted from Python 2.x itertools documentation.
169
+ # We are grouping an enumerated sequence of blocks. By comparing when the difference
170
+ # between an ascending range (provided by enumerate) and the needed block numbers
171
+ # we can detect when the block number skips values. The key computes this difference.
172
+ # Whenever the difference changes, we know that we have previously cached block(s),
173
+ # and a new group is started. In other words, this algorithm neatly groups
174
+ # runs of consecutive block numbers so they can be fetched together.
175
+ for _, _blocks in groupby(enumerate(need), key=lambda x: x[0] - x[1]):
176
+ # Extract the blocks from the enumerated sequence
177
+ _blocks = tuple(map(itemgetter(1), _blocks))
178
+ # Compute start of first block
179
+ sstart = _blocks[0] * self.blocksize
180
+ # Compute the end of the last block. Last block may not be full size.
181
+ send = min(_blocks[-1] * self.blocksize + self.blocksize, self.size)
182
+
183
+ # Fetch bytes (could be multiple consecutive blocks)
175
184
  self.total_requested_bytes += send - sstart
176
- logger.debug(f"MMap get block #{i} ({sstart}-{send})")
185
+ logger.debug(
186
+ f"MMap get blocks {_blocks[0]}-{_blocks[-1]} ({sstart}-{send})"
187
+ )
177
188
  self.cache[sstart:send] = self.fetcher(sstart, send)
178
- self.blocks.add(i)
189
+
190
+ # Update set of cached blocks
191
+ self.blocks.update(_blocks)
192
+ # Update cache statistics with number of blocks we had to cache
193
+ self.miss_count += len(_blocks)
179
194
 
180
195
  return self.cache[start:end]
181
196
 
@@ -329,12 +329,19 @@ def open_files(
329
329
 
330
330
 
331
331
  def _un_chain(path, kwargs):
332
- x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
333
- bits = (
334
- [p if "://" in p or x.match(p) else p + "://" for p in path.split("::")]
335
- if "::" in path
336
- else [path]
337
- )
332
+ # Avoid a circular import
333
+ from fsspec.implementations.cached import CachingFileSystem
334
+
335
+ if "::" in path:
336
+ x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
337
+ bits = []
338
+ for p in path.split("::"):
339
+ if "://" in p or x.match(p):
340
+ bits.append(p)
341
+ else:
342
+ bits.append(p + "://")
343
+ else:
344
+ bits = [path]
338
345
  # [[url, protocol, kwargs], ...]
339
346
  out = []
340
347
  previous_bit = None
@@ -351,10 +358,7 @@ def _un_chain(path, kwargs):
351
358
  **kws,
352
359
  )
353
360
  bit = cls._strip_protocol(bit)
354
- if (
355
- protocol in {"blockcache", "filecache", "simplecache"}
356
- and "target_protocol" not in kw
357
- ):
361
+ if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
358
362
  bit = previous_bit
359
363
  out.append((bit, protocol, kw))
360
364
  previous_bit = bit
@@ -676,9 +680,7 @@ def get_fs_token_paths(
676
680
  elif not isinstance(paths, list):
677
681
  paths = list(paths)
678
682
  else:
679
- if "w" in mode and expand:
680
- paths = _expand_paths(paths, name_function, num)
681
- elif "x" in mode and expand:
683
+ if ("w" in mode or "x" in mode) and expand:
682
684
  paths = _expand_paths(paths, name_function, num)
683
685
  elif "*" in paths:
684
686
  paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
@@ -0,0 +1,99 @@
1
+ import asyncio
2
+ import functools
3
+ import inspect
4
+
5
+ from fsspec.asyn import AsyncFileSystem
6
+
7
+
8
+ def async_wrapper(func, obj=None):
9
+ """
10
+ Wraps a synchronous function to make it awaitable.
11
+
12
+ Parameters
13
+ ----------
14
+ func : callable
15
+ The synchronous function to wrap.
16
+ obj : object, optional
17
+ The instance to bind the function to, if applicable.
18
+
19
+ Returns
20
+ -------
21
+ coroutine
22
+ An awaitable version of the function.
23
+ """
24
+
25
+ @functools.wraps(func)
26
+ async def wrapper(*args, **kwargs):
27
+ return await asyncio.to_thread(func, *args, **kwargs)
28
+
29
+ return wrapper
30
+
31
+
32
+ class AsyncFileSystemWrapper(AsyncFileSystem):
33
+ """
34
+ A wrapper class to convert a synchronous filesystem into an asynchronous one.
35
+
36
+ This class takes an existing synchronous filesystem implementation and wraps all
37
+ its methods to provide an asynchronous interface.
38
+
39
+ Parameters
40
+ ----------
41
+ sync_fs : AbstractFileSystem
42
+ The synchronous filesystem instance to wrap.
43
+ """
44
+
45
+ def __init__(self, sync_fs, *args, **kwargs):
46
+ super().__init__(*args, **kwargs)
47
+ self.asynchronous = True
48
+ self.sync_fs = sync_fs
49
+ self.protocol = self.sync_fs.protocol
50
+ self._wrap_all_sync_methods()
51
+
52
+ @property
53
+ def fsid(self):
54
+ return f"async_{self.sync_fs.fsid}"
55
+
56
+ def _wrap_all_sync_methods(self):
57
+ """
58
+ Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
59
+ """
60
+ excluded_methods = {"open"}
61
+ for method_name in dir(self.sync_fs):
62
+ if method_name.startswith("_") or method_name in excluded_methods:
63
+ continue
64
+
65
+ attr = inspect.getattr_static(self.sync_fs, method_name)
66
+ if isinstance(attr, property):
67
+ continue
68
+
69
+ method = getattr(self.sync_fs, method_name)
70
+ if callable(method) and not asyncio.iscoroutinefunction(method):
71
+ async_method = async_wrapper(method, obj=self)
72
+ setattr(self, f"_{method_name}", async_method)
73
+
74
+ @classmethod
75
+ def wrap_class(cls, sync_fs_class):
76
+ """
77
+ Create a new class that can be used to instantiate an AsyncFileSystemWrapper
78
+ with lazy instantiation of the underlying synchronous filesystem.
79
+
80
+ Parameters
81
+ ----------
82
+ sync_fs_class : type
83
+ The class of the synchronous filesystem to wrap.
84
+
85
+ Returns
86
+ -------
87
+ type
88
+ A new class that wraps the provided synchronous filesystem class.
89
+ """
90
+
91
+ class GeneratedAsyncFileSystemWrapper(cls):
92
+ def __init__(self, *args, **kwargs):
93
+ sync_fs = sync_fs_class(*args, **kwargs)
94
+ super().__init__(sync_fs)
95
+
96
+ GeneratedAsyncFileSystemWrapper.__name__ = (
97
+ f"Async{sync_fs_class.__name__}Wrapper"
98
+ )
99
+ return GeneratedAsyncFileSystemWrapper
@@ -612,7 +612,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
612
612
  **kwargs,
613
613
  ):
614
614
  paths = self.expand_path(
615
- path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
615
+ path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
616
616
  )
617
617
  getpaths = []
618
618
  storepaths = []
@@ -412,9 +412,9 @@ class DatabricksFile(AbstractBufferedFile):
412
412
  if block_size is None or block_size == "default":
413
413
  block_size = self.DEFAULT_BLOCK_SIZE
414
414
 
415
- assert (
416
- block_size == self.DEFAULT_BLOCK_SIZE
417
- ), f"Only the default block size is allowed, not {block_size}"
415
+ assert block_size == self.DEFAULT_BLOCK_SIZE, (
416
+ f"Only the default block size is allowed, not {block_size}"
417
+ )
418
418
 
419
419
  super().__init__(
420
420
  fs,
@@ -387,7 +387,7 @@ def _mlsd2(ftp, path="."):
387
387
  "size": split_line[4],
388
388
  },
389
389
  )
390
- if "d" == this[1]["unix.mode"][0]:
390
+ if this[1]["unix.mode"][0] == "d":
391
391
  this[1]["type"] = "dir"
392
392
  else:
393
393
  this[1]["type"] = "file"