fsspec 2024.9.0__tar.gz → 2024.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fsspec-2024.9.0 → fsspec-2024.10.0}/PKG-INFO +1 -1
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/changelog.rst +21 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/_version.py +2 -2
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/asyn.py +4 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/core.py +5 -2
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/dirfs.py +12 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/git.py +27 -39
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/http.py +10 -8
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/memory.py +7 -3
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/reference.py +34 -8
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/zip.py +2 -1
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/spec.py +3 -5
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.codespellrc +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.coveragerc +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.gitattributes +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.github/workflows/codespell.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.github/workflows/main.yaml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.github/workflows/pypipublish.yaml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.gitignore +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/.pre-commit-config.yaml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/LICENSE +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/README.md +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/ci/environment-downstream.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/ci/environment-friends.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/ci/environment-py38.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/ci/environment-typecheck.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/ci/environment-win.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/Makefile +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/README.md +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/environment.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/make.bat +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/_static/custom.css +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/api.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/async.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/conf.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/copying.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/developer.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/features.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/img/gui.png +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/index.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/intro.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/docs/source/usage.rst +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/__init__.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/archive.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/caching.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/callbacks.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/compression.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/config.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/conftest.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/dircache.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/exceptions.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/fuse.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/generic.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/gui.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/__init__.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/arrow.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/cache_mapper.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/cache_metadata.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/cached.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/dask.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/data.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/dbfs.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/ftp.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/github.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/jupyter.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/libarchive.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/local.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/sftp.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/smb.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/tar.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/implementations/webhdfs.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/json.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/mapping.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/parquet.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/registry.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/__init__.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/common.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/copy.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/get.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/mv.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/tests/abstract/put.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/transaction.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/fsspec/utils.py +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/install_s3fs.sh +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/pyproject.toml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/readthedocs.yml +0 -0
- {fsspec-2024.9.0 → fsspec-2024.10.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: fsspec
|
|
3
|
-
Version: 2024.9.0
|
|
3
|
+
Version: 2024.10.0
|
|
4
4
|
Summary: File-system specification
|
|
5
5
|
Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
|
|
6
6
|
Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
|
|
@@ -1,6 +1,27 @@
|
|
|
1
1
|
Changelog
|
|
2
2
|
=========
|
|
3
3
|
|
|
4
|
+
2024.10.0
|
|
5
|
+
---------
|
|
6
|
+
|
|
7
|
+
Fixes
|
|
8
|
+
|
|
9
|
+
- Performance of memoryFS rm (#1725)
|
|
10
|
+
- Performance of git FS info (#1712)
|
|
11
|
+
- Avoid git hex for newer pygit (#1703)
|
|
12
|
+
- tests fix for zip (#1700, 1691)
|
|
13
|
+
- missing open_async for dirFS (#1698)
|
|
14
|
+
- handle pathlib in zip (#1689)
|
|
15
|
+
- skip tests needing kerchunk if not installed (#1689)
|
|
16
|
+
- allow repeated kwargs in unchain (#1673)
|
|
17
|
+
|
|
18
|
+
Other
|
|
19
|
+
|
|
20
|
+
- Code style (#1704, 1706)
|
|
21
|
+
- allow pyarrow in referenceFS parquet (#1692)
|
|
22
|
+
- don't hardcode test port for parallel runs (#1690)
|
|
23
|
+
|
|
24
|
+
|
|
4
25
|
2024.9.0
|
|
5
26
|
--------
|
|
6
27
|
|
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '2024.9.0'
|
|
16
|
-
__version_tuple__ = version_tuple = (2024, 9, 0)
|
|
15
|
+
__version__ = version = '2024.10.0'
|
|
16
|
+
__version_tuple__ = version_tuple = (2024, 10, 0)
|
|
@@ -344,6 +344,10 @@ class AsyncFileSystem(AbstractFileSystem):
|
|
|
344
344
|
async def _cp_file(self, path1, path2, **kwargs):
|
|
345
345
|
raise NotImplementedError
|
|
346
346
|
|
|
347
|
+
async def _mv_file(self, path1, path2):
|
|
348
|
+
await self._cp_file(path1, path2)
|
|
349
|
+
await self._rm_file(path1)
|
|
350
|
+
|
|
347
351
|
async def _copy(
|
|
348
352
|
self,
|
|
349
353
|
path1,
|
|
@@ -346,7 +346,10 @@ def _un_chain(path, kwargs):
|
|
|
346
346
|
kws = kwargs.pop(protocol, {})
|
|
347
347
|
if bit is bits[0]:
|
|
348
348
|
kws.update(kwargs)
|
|
349
|
-
kw = dict(**extra_kwargs, **kws)
|
|
349
|
+
kw = dict(
|
|
350
|
+
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
|
|
351
|
+
**kws,
|
|
352
|
+
)
|
|
350
353
|
bit = cls._strip_protocol(bit)
|
|
351
354
|
if (
|
|
352
355
|
protocol in {"blockcache", "filecache", "simplecache"}
|
|
@@ -578,7 +581,7 @@ def expand_paths_if_needed(paths, mode, num, fs, name_function):
|
|
|
578
581
|
paths = list(paths)
|
|
579
582
|
|
|
580
583
|
if "w" in mode: # read mode
|
|
581
|
-
if sum([1 for p in paths if "*" in p]) > 1:
|
|
584
|
+
if sum(1 for p in paths if "*" in p) > 1:
|
|
582
585
|
raise ValueError(
|
|
583
586
|
"When writing data, only one filename mask can be specified."
|
|
584
587
|
)
|
|
@@ -55,6 +55,8 @@ class GitFileSystem(AbstractFileSystem):
|
|
|
55
55
|
tree = comm.tree
|
|
56
56
|
for part in parts:
|
|
57
57
|
if part and isinstance(tree, pygit2.Tree):
|
|
58
|
+
if part not in tree:
|
|
59
|
+
raise FileNotFoundError(path)
|
|
58
60
|
tree = tree[part]
|
|
59
61
|
return tree
|
|
60
62
|
|
|
@@ -69,46 +71,32 @@ class GitFileSystem(AbstractFileSystem):
|
|
|
69
71
|
out["ref"], path = path.split("@", 1)
|
|
70
72
|
return out
|
|
71
73
|
|
|
74
|
+
@staticmethod
|
|
75
|
+
def _object_to_info(obj, path=None):
|
|
76
|
+
# obj.name and obj.filemode are None for the root tree!
|
|
77
|
+
is_dir = isinstance(obj, pygit2.Tree)
|
|
78
|
+
return {
|
|
79
|
+
"type": "directory" if is_dir else "file",
|
|
80
|
+
"name": (
|
|
81
|
+
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
|
|
82
|
+
),
|
|
83
|
+
"hex": str(obj.id),
|
|
84
|
+
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
|
|
85
|
+
"size": 0 if is_dir else obj.size,
|
|
86
|
+
}
|
|
87
|
+
|
|
72
88
|
def ls(self, path, detail=True, ref=None, **kwargs):
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
"mode": f"{obj.filemode:o}",
|
|
85
|
-
"size": 0,
|
|
86
|
-
}
|
|
87
|
-
)
|
|
88
|
-
else:
|
|
89
|
-
out.append(
|
|
90
|
-
{
|
|
91
|
-
"type": "file",
|
|
92
|
-
"name": "/".join([path, obj.name]).lstrip("/"),
|
|
93
|
-
"hex": obj.hex,
|
|
94
|
-
"mode": f"{obj.filemode:o}",
|
|
95
|
-
"size": obj.size,
|
|
96
|
-
}
|
|
97
|
-
)
|
|
98
|
-
else:
|
|
99
|
-
obj = tree
|
|
100
|
-
out = [
|
|
101
|
-
{
|
|
102
|
-
"type": "file",
|
|
103
|
-
"name": obj.name,
|
|
104
|
-
"hex": obj.hex,
|
|
105
|
-
"mode": f"{obj.filemode:o}",
|
|
106
|
-
"size": obj.size,
|
|
107
|
-
}
|
|
108
|
-
]
|
|
109
|
-
if detail:
|
|
110
|
-
return out
|
|
111
|
-
return [o["name"] for o in out]
|
|
89
|
+
tree = self._path_to_object(self._strip_protocol(path), ref)
|
|
90
|
+
return [
|
|
91
|
+
GitFileSystem._object_to_info(obj, path)
|
|
92
|
+
if detail
|
|
93
|
+
else GitFileSystem._object_to_info(obj, path)["name"]
|
|
94
|
+
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
def info(self, path, ref=None, **kwargs):
|
|
98
|
+
tree = self._path_to_object(self._strip_protocol(path), ref)
|
|
99
|
+
return GitFileSystem._object_to_info(tree, path)
|
|
112
100
|
|
|
113
101
|
def ukey(self, path, ref=None):
|
|
114
102
|
return self.info(path, ref=ref)["hex"]
|
|
@@ -358,9 +358,10 @@ class HTTPFileSystem(AsyncFileSystem):
|
|
|
358
358
|
kw = self.kwargs.copy()
|
|
359
359
|
kw["asynchronous"] = self.asynchronous
|
|
360
360
|
kw.update(kwargs)
|
|
361
|
-
|
|
361
|
+
info = {}
|
|
362
|
+
size = size or info.update(self.info(path, **kwargs)) or info["size"]
|
|
362
363
|
session = sync(self.loop, self.set_session)
|
|
363
|
-
if block_size and size:
|
|
364
|
+
if block_size and size and info.get("partial", True):
|
|
364
365
|
return HTTPFile(
|
|
365
366
|
self,
|
|
366
367
|
path,
|
|
@@ -520,9 +521,9 @@ class HTTPFileSystem(AsyncFileSystem):
|
|
|
520
521
|
|
|
521
522
|
class HTTPFile(AbstractBufferedFile):
|
|
522
523
|
"""
|
|
523
|
-
A file-like object pointing to a
|
|
524
|
+
A file-like object pointing to a remote HTTP(S) resource
|
|
524
525
|
|
|
525
|
-
Supports only reading, with read-ahead of a
|
|
526
|
+
Supports only reading, with read-ahead of a predetermined block-size.
|
|
526
527
|
|
|
527
528
|
In the case that the server does not supply the filesize, only reading of
|
|
528
529
|
the complete file in one go is supported.
|
|
@@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
|
|
|
835
836
|
async with r:
|
|
836
837
|
r.raise_for_status()
|
|
837
838
|
|
|
838
|
-
# TODO:
|
|
839
|
-
# recognise lack of 'Accept-Ranges',
|
|
840
|
-
# or 'Accept-Ranges': 'none' (not 'bytes')
|
|
841
|
-
# to mean streaming only, no random access => return None
|
|
842
839
|
if "Content-Length" in r.headers:
|
|
843
840
|
# Some servers may choose to ignore Accept-Encoding and return
|
|
844
841
|
# compressed content, in which case the returned size is unreliable.
|
|
@@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
|
|
|
853
850
|
if "Content-Type" in r.headers:
|
|
854
851
|
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
|
|
855
852
|
|
|
853
|
+
if r.headers.get("Accept-Ranges") == "none":
|
|
854
|
+
# Some servers may explicitly discourage partial content requests, but
|
|
855
|
+
# the lack of "Accept-Ranges" does not always indicate they would fail
|
|
856
|
+
info["partial"] = False
|
|
857
|
+
|
|
856
858
|
info["url"] = str(r.url)
|
|
857
859
|
|
|
858
860
|
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
|
|
@@ -248,6 +248,10 @@ class MemoryFileSystem(AbstractFileSystem):
|
|
|
248
248
|
except KeyError as e:
|
|
249
249
|
raise FileNotFoundError(path) from e
|
|
250
250
|
|
|
251
|
+
def isfile(self, path):
|
|
252
|
+
path = self._strip_protocol(path)
|
|
253
|
+
return path in self.store
|
|
254
|
+
|
|
251
255
|
def rm(self, path, recursive=False, maxdepth=None):
|
|
252
256
|
if isinstance(path, str):
|
|
253
257
|
path = self._strip_protocol(path)
|
|
@@ -255,14 +259,14 @@ class MemoryFileSystem(AbstractFileSystem):
|
|
|
255
259
|
path = [self._strip_protocol(p) for p in path]
|
|
256
260
|
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
|
257
261
|
for p in reversed(paths):
|
|
262
|
+
if self.isfile(p):
|
|
263
|
+
self.rm_file(p)
|
|
258
264
|
# If the expanded path doesn't exist, it is only because the expanded
|
|
259
265
|
# path was a directory that does not exist in self.pseudo_dirs. This
|
|
260
266
|
# is possible if you directly create files without making the
|
|
261
267
|
# directories first.
|
|
262
|
-
|
|
268
|
+
elif not self.exists(p):
|
|
263
269
|
continue
|
|
264
|
-
if self.isfile(p):
|
|
265
|
-
self.rm_file(p)
|
|
266
270
|
else:
|
|
267
271
|
self.rmdir(p)
|
|
268
272
|
|
|
@@ -7,7 +7,7 @@ import math
|
|
|
7
7
|
import os
|
|
8
8
|
from itertools import chain
|
|
9
9
|
from functools import lru_cache
|
|
10
|
-
from typing import TYPE_CHECKING
|
|
10
|
+
from typing import TYPE_CHECKING, Literal
|
|
11
11
|
|
|
12
12
|
import fsspec.core
|
|
13
13
|
|
|
@@ -104,7 +104,13 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
|
|
|
104
104
|
return pd
|
|
105
105
|
|
|
106
106
|
def __init__(
|
|
107
|
-
self,
|
|
107
|
+
self,
|
|
108
|
+
root,
|
|
109
|
+
fs=None,
|
|
110
|
+
out_root=None,
|
|
111
|
+
cache_size=128,
|
|
112
|
+
categorical_threshold=10,
|
|
113
|
+
engine: Literal["fastparquet", "pyarrow"] = "fastparquet",
|
|
108
114
|
):
|
|
109
115
|
"""
|
|
110
116
|
|
|
@@ -126,16 +132,25 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
|
|
|
126
132
|
Encode urls as pandas.Categorical to reduce memory footprint if the ratio
|
|
127
133
|
of the number of unique urls to total number of refs for each variable
|
|
128
134
|
is greater than or equal to this number. (default 10)
|
|
135
|
+
engine: Literal["fastparquet","pyarrow"]
|
|
136
|
+
Engine choice for reading parquet files. (default is "fastparquet")
|
|
129
137
|
"""
|
|
138
|
+
|
|
130
139
|
self.root = root
|
|
131
140
|
self.chunk_sizes = {}
|
|
132
141
|
self.out_root = out_root or self.root
|
|
133
142
|
self.cat_thresh = categorical_threshold
|
|
143
|
+
self.engine = engine
|
|
134
144
|
self.cache_size = cache_size
|
|
135
145
|
self.url = self.root + "/{field}/refs.{record}.parq"
|
|
136
146
|
# TODO: derive fs from `root`
|
|
137
147
|
self.fs = fsspec.filesystem("file") if fs is None else fs
|
|
138
148
|
|
|
149
|
+
from importlib.util import find_spec
|
|
150
|
+
|
|
151
|
+
if self.engine == "pyarrow" and find_spec("pyarrow") is None:
|
|
152
|
+
raise ImportError("engine choice `pyarrow` is not installed.")
|
|
153
|
+
|
|
139
154
|
def __getattr__(self, item):
|
|
140
155
|
if item in ("_items", "record_size", "zmetadata"):
|
|
141
156
|
self.setup()
|
|
@@ -158,7 +173,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
|
|
|
158
173
|
"""cached parquet file loader"""
|
|
159
174
|
path = self.url.format(field=field, record=record)
|
|
160
175
|
data = io.BytesIO(self.fs.cat_file(path))
|
|
161
|
-
df = self.pd.read_parquet(data, engine="fastparquet")
|
|
176
|
+
df = self.pd.read_parquet(data, engine=self.engine)
|
|
162
177
|
refs = {c: df[c].to_numpy() for c in df.columns}
|
|
163
178
|
return refs
|
|
164
179
|
|
|
@@ -463,18 +478,28 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
|
|
|
463
478
|
|
|
464
479
|
fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
|
|
465
480
|
self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
|
|
481
|
+
|
|
482
|
+
if self.engine == "pyarrow":
|
|
483
|
+
df_backend_kwargs = {"write_statistics": False}
|
|
484
|
+
elif self.engine == "fastparquet":
|
|
485
|
+
df_backend_kwargs = {
|
|
486
|
+
"stats": False,
|
|
487
|
+
"object_encoding": object_encoding,
|
|
488
|
+
"has_nulls": has_nulls,
|
|
489
|
+
}
|
|
490
|
+
else:
|
|
491
|
+
raise NotImplementedError(f"{self.engine} not supported")
|
|
492
|
+
|
|
466
493
|
df.to_parquet(
|
|
467
494
|
fn,
|
|
468
|
-
engine="fastparquet",
|
|
495
|
+
engine=self.engine,
|
|
469
496
|
storage_options=storage_options
|
|
470
497
|
or getattr(self.fs, "storage_options", None),
|
|
471
498
|
compression="zstd",
|
|
472
499
|
index=False,
|
|
473
|
-
|
|
474
|
-
object_encoding=object_encoding,
|
|
475
|
-
has_nulls=has_nulls,
|
|
476
|
-
# **kwargs,
|
|
500
|
+
**df_backend_kwargs,
|
|
477
501
|
)
|
|
502
|
+
|
|
478
503
|
partition.clear()
|
|
479
504
|
self._items.pop((field, record))
|
|
480
505
|
|
|
@@ -486,6 +511,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
|
|
|
486
511
|
base_url: str
|
|
487
512
|
Location of the output
|
|
488
513
|
"""
|
|
514
|
+
|
|
489
515
|
# write what we have so far and clear sub chunks
|
|
490
516
|
for thing in list(self._items):
|
|
491
517
|
if isinstance(thing, tuple):
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import zipfile
|
|
2
3
|
|
|
3
4
|
import fsspec
|
|
@@ -48,7 +49,7 @@ class ZipFileSystem(AbstractArchiveFileSystem):
|
|
|
48
49
|
if mode not in set("rwa"):
|
|
49
50
|
raise ValueError(f"mode '{mode}' no understood")
|
|
50
51
|
self.mode = mode
|
|
51
|
-
if isinstance(fo, str):
|
|
52
|
+
if isinstance(fo, (str, os.PathLike)):
|
|
52
53
|
if mode == "a":
|
|
53
54
|
m = "r+b"
|
|
54
55
|
else:
|
|
@@ -428,11 +428,9 @@ class AbstractFileSystem(metaclass=_Cached):
|
|
|
428
428
|
except (FileNotFoundError, OSError) as e:
|
|
429
429
|
if on_error == "raise":
|
|
430
430
|
raise
|
|
431
|
-
|
|
431
|
+
if callable(on_error):
|
|
432
432
|
on_error(e)
|
|
433
|
-
|
|
434
|
-
return path, {}, {}
|
|
435
|
-
return path, [], []
|
|
433
|
+
return
|
|
436
434
|
|
|
437
435
|
for info in listing:
|
|
438
436
|
# each info name must be at least [path]/part , but here
|
|
@@ -650,7 +648,7 @@ class AbstractFileSystem(metaclass=_Cached):
|
|
|
650
648
|
Returns a single dictionary, with exactly the same information as ``ls``
|
|
651
649
|
would with ``detail=True``.
|
|
652
650
|
|
|
653
|
-
The default implementation
|
|
651
|
+
The default implementation calls ls and could be overridden by a
|
|
654
652
|
shortcut. kwargs are passed on to ```ls()``.
|
|
655
653
|
|
|
656
654
|
Some file systems might not be able to measure the file's size, in
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|