fsspec 2023.10.0__py3-none-any.whl → 2024.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fsspec/_version.py +3 -3
- fsspec/archive.py +4 -4
- fsspec/asyn.py +43 -53
- fsspec/caching.py +1 -1
- fsspec/callbacks.py +98 -12
- fsspec/compression.py +3 -3
- fsspec/core.py +16 -3
- fsspec/exceptions.py +0 -4
- fsspec/generic.py +11 -4
- fsspec/gui.py +4 -3
- fsspec/implementations/arrow.py +9 -0
- fsspec/implementations/cache_mapper.py +2 -6
- fsspec/implementations/cached.py +92 -18
- fsspec/implementations/data.py +48 -0
- fsspec/implementations/dbfs.py +14 -4
- fsspec/implementations/dirfs.py +6 -0
- fsspec/implementations/ftp.py +18 -13
- fsspec/implementations/github.py +17 -5
- fsspec/implementations/http.py +42 -51
- fsspec/implementations/libarchive.py +2 -3
- fsspec/implementations/local.py +11 -4
- fsspec/implementations/memory.py +2 -2
- fsspec/implementations/reference.py +127 -56
- fsspec/implementations/sftp.py +6 -5
- fsspec/implementations/smb.py +0 -1
- fsspec/implementations/tar.py +2 -1
- fsspec/implementations/webhdfs.py +46 -5
- fsspec/implementations/zip.py +11 -3
- fsspec/parquet.py +3 -5
- fsspec/registry.py +2 -1
- fsspec/spec.py +51 -61
- fsspec/tests/abstract/common.py +5 -5
- fsspec/tests/abstract/copy.py +21 -7
- fsspec/tests/abstract/put.py +21 -7
- fsspec/transaction.py +8 -4
- fsspec/utils.py +114 -1
- {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/METADATA +1 -2
- fsspec-2024.2.0.dist-info/RECORD +54 -0
- {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/WHEEL +1 -1
- fsspec-2023.10.0.dist-info/RECORD +0 -53
- {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/LICENSE +0 -0
- {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/top_level.txt +0 -0
fsspec/implementations/local.py
CHANGED
@@ -3,7 +3,6 @@ import io
 import logging
 import os
 import os.path as osp
-import posixpath
 import re
 import shutil
 import stat
@@ -59,11 +58,16 @@ class LocalFileSystem(AbstractFileSystem):

     def ls(self, path, detail=False, **kwargs):
         path = self._strip_protocol(path)
-
+        info = self.info(path)
+        if info["type"] == "directory":
             with os.scandir(path) as it:
-
+                infos = [self.info(f) for f in it]
         else:
-
+            infos = [info]
+
+        if not detail:
+            return [i["name"] for i in infos]
+        return infos

     def info(self, path, **kwargs):
         if isinstance(path, os.DirEntry):
@@ -386,6 +390,9 @@ class LocalFileOpener(io.IOBase):
     def close(self):
         return self.f.close()

+    def truncate(self, size=None) -> int:
+        return self.f.truncate(size)
+
     @property
     def closed(self):
         return self.f.closed
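LocalFileSystem.ls() now routes every entry through info(), and LocalFileOpener gains a truncate() passthrough. A minimal usage sketch under those assumptions (the temp-file path is purely illustrative):

    import fsspec

    fs = fsspec.filesystem("file")

    # detail=False returns just the names; detail=True returns the info dicts
    names = fs.ls("/tmp", detail=False)
    entries = fs.ls("/tmp", detail=True)

    # truncate() is now forwarded to the wrapped file object
    with fs.open("/tmp/example.bin", "wb") as f:
        f.write(b"0123456789")
        f.truncate(4)  # leaves a 4-byte file on disk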
fsspec/implementations/memory.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, ClassVar

 from fsspec import AbstractFileSystem

-logger = logging.
+logger = logging.getLogger("fsspec.memoryfs")


 class MemoryFileSystem(AbstractFileSystem):
@@ -175,7 +175,7 @@ class MemoryFileSystem(AbstractFileSystem):
         parent = self._parent(parent)
         if self.isfile(parent):
             raise FileExistsError(parent)
-        if mode in ["rb", "ab", "
+        if mode in ["rb", "ab", "r+b"]:
             if path in self.store:
                 f = self.store[path]
                 if mode == "ab":
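The mode check above now accepts "r+b" for existing in-memory files. A short sketch (the path is arbitrary):

    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe_file("/a.bin", b"hello")

    # "r+b" is now in the accepted mode list alongside "rb" and "ab"
    with fs.open("/a.bin", "r+b") as f:
        data = f.read()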
fsspec/implementations/reference.py
CHANGED
@@ -17,7 +17,7 @@ except ImportError:
 import json

 from ..asyn import AsyncFileSystem
-from ..callbacks import
+from ..callbacks import DEFAULT_CALLBACK
 from ..core import filesystem, open, split_protocol
 from ..utils import isfilelike, merge_offset_ranges, other_paths

@@ -106,6 +106,12 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
     ):
         """
+
+        This instance will be writable, storing changes in memory until full partitions
+        are accumulated or .flush() is called.
+
+        To create an empty lazy store, use .create()
+
         Parameters
         ----------
         root : str
@@ -119,26 +125,35 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             Encode urls as pandas.Categorical to reduce memory footprint if the ratio
             of the number of unique urls to total number of refs for each variable
             is greater than or equal to this number. (default 10)
-
-
         """
         self.root = root
         self.chunk_sizes = {}
-        self.
+        self.out_root = out_root or self.root
+        self.cat_thresh = categorical_threshold
+        self.cache_size = cache_size
         self.dirs = None
+        self.url = self.root + "/{field}/refs.{record}.parq"
+        # TODO: derive fs from `root`
         self.fs = fsspec.filesystem("file") if fs is None else fs
+
+    def __getattr__(self, item):
+        if item in ("_items", "record_size", "zmetadata"):
+            self.setup()
+            # avoid possible recursion if setup fails somehow
+            return self.__dict__[item]
+        raise AttributeError(item)
+
+    def setup(self):
+        self._items = {}
         self._items[".zmetadata"] = self.fs.cat_file(
             "/".join([self.root, ".zmetadata"])
         )
         met = json.loads(self._items[".zmetadata"])
         self.record_size = met["record_size"]
         self.zmetadata = met["metadata"]
-        self.url = self.root + "/{field}/refs.{record}.parq"
-        self.out_root = out_root or self.root
-        self.cat_thresh = categorical_threshold

         # Define function to open and decompress refs
-        @lru_cache(maxsize=cache_size)
+        @lru_cache(maxsize=self.cache_size)
         def open_refs(field, record):
             """cached parquet file loader"""
             path = self.url.format(field=field, record=record)
@@ -150,13 +165,39 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self.open_refs = open_refs

     @staticmethod
-    def create(
+    def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
+        """Make empty parquet reference set
+
+        First deletes the contents of the given directory, if it exists.
+
+        Parameters
+        ----------
+        root: str
+            Directory to contain the output; will be created
+        storage_options: dict | None
+            For making the filesystem to use for writing is fs is None
+        fs: FileSystem | None
+            Filesystem for writing
+        record_size: int
+            Number of references per parquet file
+        kwargs: passed to __init__
+
+        Returns
+        -------
+        LazyReferenceMapper instance
+        """
         met = {"metadata": {}, "record_size": record_size}
+        if fs is None:
+            fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+        if fs.exists(root):
+            fs.rm(root, recursive=True)
+        fs.makedirs(root, exist_ok=True)
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)

     def listdir(self, basename=True):
         """List top-level directories"""
+        # cache me?
         if self.dirs is None:
             dirs = [p.split("/", 1)[0] for p in self.zmetadata]
             self.dirs = {p for p in dirs if p and not p.startswith(".")}
@@ -237,19 +278,18 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         elif "/" not in key or self._is_meta(key):
             raise KeyError(key)
         field, sub_key = key.split("/")
-        record,
-        maybe = self._items.get((field,
+        record, ri, chunk_size = self._key_to_record(key)
+        maybe = self._items.get((field, record), {}).get(ri, False)
         if maybe is None:
             # explicitly deleted
             raise KeyError
         elif maybe:
             return maybe
+        elif chunk_size == 0:
+            return b""

         # Chunk keys can be loaded from row group and cached in LRU cache
         try:
-            record, ri, chunk_size = self._key_to_record(key)
-            if chunk_size == 0:
-                return b""
             refs = self.open_refs(field, record)
         except (ValueError, TypeError, FileNotFoundError):
             raise KeyError(key)
@@ -259,7 +299,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         if raw is not None:
             return raw
         if selection[0] is None:
-            raise KeyError("This reference has been deleted")
+            raise KeyError("This reference does not exist or has been deleted")
         if selection[1:3] == [0, 0]:
             # URL only
             return selection[:1]
@@ -286,13 +326,13 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             size_ratio = [
                 math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
             ]
-            self.chunk_sizes[field] = size_ratio
+            self.chunk_sizes[field] = size_ratio or [1]
         return self.chunk_sizes[field]

     def _generate_record(self, field, record):
         """The references for a given parquet file of a given field"""
         refs = self.open_refs(field, record)
-        it = iter(zip(refs.values()))
+        it = iter(zip(*refs.values()))
         if len(refs) == 3:
             # All urls
             return (list(t) for t in it)
@@ -321,7 +361,6 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
     def __hash__(self):
         return id(self)

-    @lru_cache(20)
     def __getitem__(self, key):
         return self._load_one_key(key)

@@ -336,9 +375,10 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             # metadata or top-level
             self._items[key] = value
-
+            new_value = json.loads(
                 value.decode() if isinstance(value, bytes) else value
             )
+            self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}

     @staticmethod
     def _is_meta(key):
@@ -352,9 +392,9 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             if "/" in key and not self._is_meta(key):
                 field, chunk = key.split("/")
-                record,
+                record, i, _ = self._key_to_record(key)
                 subdict = self._items.setdefault((field, record), {})
-                subdict[
+                subdict[i] = None
                 if len(subdict) == self.record_size:
                     self.write(field, record)
             else:
@@ -367,26 +407,43 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         import numpy as np
         import pandas as pd

-        # TODO: if the dict is incomplete, also load records and merge in
         partition = self._items[(field, record)]
-
+        original = False
+        if len(partition) < self.record_size:
+            try:
+                original = self.open_refs(field, record)
+            except IOError:
+                pass

-
-
-
-
-
-
-
+        if original:
+            paths = original["path"]
+            offsets = original["offset"]
+            sizes = original["size"]
+            raws = original["raw"]
+        else:
+            paths = np.full(self.record_size, np.nan, dtype="O")
+            offsets = np.zeros(self.record_size, dtype="int64")
+            sizes = np.zeros(self.record_size, dtype="int64")
+            raws = np.full(self.record_size, np.nan, dtype="O")
         for j, data in partition.items():
             if isinstance(data, list):
-
+                if (
+                    str(paths.dtype) == "category"
+                    and data[0] not in paths.dtype.categories
+                ):
+                    paths = paths.add_categories(data[0])
                 paths[j] = data[0]
                 if len(data) > 1:
                     offsets[j] = data[1]
                     sizes[j] = data[2]
+            elif data is None:
+                # delete
+                paths[j] = None
+                offsets[j] = 0
+                sizes[j] = 0
+                raws[j] = None
             else:
-
+                # this is the only call into kerchunk, could remove
                 raws[j] = kerchunk.df._proc_raw(data)
         # TODO: only save needed columns
         df = pd.DataFrame(
@@ -403,6 +460,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         object_encoding = {"raw": "bytes", "path": "utf8"}
         has_nulls = ["path", "raw"]

+        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
         self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
         df.to_parquet(
             fn,
@@ -453,29 +511,30 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self.open_refs.cache_clear()

     def __len__(self):
-        # Caveat: This counts expected references, not actual
+        # Caveat: This counts expected references, not actual - but is fast
         count = 0
         for field in self.listdir():
             if field.startswith("."):
                 count += 1
             else:
-
-                nchunks = self.np.product(chunk_sizes)
-                count += nchunks
+                count += math.prod(self._get_chunk_sizes(field))
         count += len(self.zmetadata)  # all metadata keys
-
+        # any other files not in reference partitions
+        count += sum(1 for _ in self._items if not isinstance(_, tuple))
         return count

     def __iter__(self):
-        # Caveat:
-        #
+        # Caveat: returns only existing keys, so the number of these does not
+        # match len(self)
         metas = set(self.zmetadata)
         metas.update(self._items)
         for bit in metas:
             if isinstance(bit, str):
                 yield bit
         for field in self.listdir():
-
+            for k in self._keys_in_field(field):
+                if k in self:
+                    yield k

     def __contains__(self, item):
         try:
@@ -603,7 +662,7 @@ class ReferenceFileSystem(AsyncFileSystem):
                 **(ref_storage_args or target_options or {}), protocol=target_protocol
             )
             ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
-            if ref_fs.isfile(
+            if ref_fs.isfile(fo2):
                 # text JSON
                 with fsspec.open(fo, "rb", **dic) as f:
                     logger.info("Read reference from URL %s", fo)
@@ -650,6 +709,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             self.fss[protocol] = fs
         if remote_protocol is None:
             # get single protocol from references
+            # TODO: warning here, since this can be very expensive?
             for ref in self.references.values():
                 if callable(ref):
                     ref = ref()
@@ -740,7 +800,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         with open(lpath, "wb") as f:
             f.write(data)

-    def get_file(self, rpath, lpath, callback=
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
         if self.isdir(rpath):
             return os.makedirs(lpath, exist_ok=True)
         data = self.cat_file(rpath, **kwargs)
@@ -772,24 +832,27 @@ class ReferenceFileSystem(AsyncFileSystem):
             raise NotImplementedError
         if isinstance(path, list) and (recursive or any("*" in p for p in path)):
             raise NotImplementedError
+        # TODO: if references is lazy, pre-fetch all paths in batch before access
         proto_dict = _protocol_groups(path, self.references)
         out = {}
         for proto, paths in proto_dict.items():
             fs = self.fss[proto]
-            urls, starts, ends = [], [], []
+            urls, starts, ends, valid_paths = [], [], [], []
             for p in paths:
                 # find references or label not-found. Early exit if any not
                 # found and on_error is "raise"
                 try:
                     u, s, e = self._cat_common(p)
-                    urls.append(u)
-                    starts.append(s)
-                    ends.append(e)
                 except FileNotFoundError as err:
                     if on_error == "raise":
                         raise
                     if on_error != "omit":
                         out[p] = err
+                else:
+                    urls.append(u)
+                    starts.append(s)
+                    ends.append(e)
+                    valid_paths.append(p)

             # process references into form for merging
             urls2 = []
@@ -797,7 +860,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             ends2 = []
             paths2 = []
             whole_files = set()
-            for u, s, e, p in zip(urls, starts, ends,
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 if isinstance(u, bytes):
                     # data
                     out[p] = u
@@ -809,7 +872,7 @@ class ReferenceFileSystem(AsyncFileSystem):
                     starts2.append(s)
                     ends2.append(e)
                     paths2.append(p)
-            for u, s, e, p in zip(urls, starts, ends,
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 # second run to account for files that are to be loaded whole
                 if s is not None and u not in whole_files:
                     urls2.append(u)
@@ -829,7 +892,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)

             # unbundle from merged bytes - simple approach
-            for u, s, e, p in zip(urls, starts, ends,
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 if p in out:
                     continue  # was bytes, already handled
                 for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
@@ -963,16 +1026,24 @@ class ReferenceFileSystem(AsyncFileSystem):
             elif len(part) == 1:
                 size = None
             else:
-                _,
+                _, _, size = part
             par = path.rsplit("/", 1)[0] if "/" in path else ""
             par0 = par
+            subdirs = [par0]
             while par0 and par0 not in self.dircache:
-                #
-                self.dircache[par0] = []
-                self.dircache.setdefault(
-                    par0.rsplit("/", 1)[0] if "/" in par0 else "", []
-                ).append({"name": par0, "type": "directory", "size": 0})
+                # collect parent directories
                 par0 = self._parent(par0)
+                subdirs.append(par0)
+
+            subdirs = subdirs[::-1]
+            for parent, child in zip(subdirs, subdirs[1:]):
+                # register newly discovered directories
+                assert child not in self.dircache
+                assert parent in self.dircache
+                self.dircache[parent].append(
+                    {"name": child, "type": "directory", "size": 0}
+                )
+                self.dircache[child] = []

             self.dircache[par].append({"name": path, "type": "file", "size": size})

@@ -1068,7 +1139,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         self.references[path] = data
         self.dircache.clear()  # this is a bit heavy handed

-    async def _put_file(self, lpath, rpath):
+    async def _put_file(self, lpath, rpath, **kwargs):
         # puts binary
         with open(lpath, "rb") as f:
             self.references[rpath] = f.read()
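Putting the new LazyReferenceMapper.create() and the lazy setup() path together, a hedged sketch of building an empty parquet reference set (the output directory and the key written are made up for illustration):

    from fsspec.implementations.reference import LazyReferenceMapper

    # create() wipes/creates the directory and writes an empty .zmetadata
    refs = LazyReferenceMapper.create("/tmp/refs.parq", record_size=10000)

    # the mapper behaves like a MutableMapping; writes are buffered in memory
    # until a partition fills up or flush() is called
    refs[".zgroup"] = b'{"zarr_format": 2}'
    refs.flush()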
fsspec/implementations/sftp.py
CHANGED
@@ -65,7 +65,7 @@ class SFTPFileSystem(AbstractFileSystem):
         out.pop("protocol", None)
         return out

-    def mkdir(self, path, create_parents=
+    def mkdir(self, path, create_parents=True, mode=511):
         logger.debug("Creating folder %s", path)
         if self.exists(path):
             raise FileExistsError(f"File exists: {path}")
@@ -80,12 +80,13 @@
                 raise FileExistsError(f"File exists: {path}")

         parts = path.split("/")
-
+        new_path = "/" if path[:1] == "/" else ""

         for part in parts:
-
-
-            self.
+            if part:
+                new_path = f"{new_path}/{part}" if new_path else part
+                if not self.exists(new_path):
+                    self.ftp.mkdir(new_path, mode)

     def rmdir(self, path):
         logger.debug("Removing folder %s", path)
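With mkdir() now defaulting to create_parents=True and makedirs() walking the path one component at a time, nested directories can be created in a single call. A sketch with placeholder connection details:

    import fsspec

    fs = fsspec.filesystem("sftp", host="sftp.example.com", username="user")

    # only the missing components of the path are created
    fs.makedirs("/upload/2024/02/data", exist_ok=True)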
fsspec/implementations/smb.py
CHANGED
fsspec/implementations/tar.py
CHANGED
@@ -106,11 +106,12 @@ class TarFileSystem(AbstractArchiveFileSystem):

         # This enables ls to get directories as children as well as files
         self.dir_cache = {
-            dirname
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
             for dirname in self._all_dirnames(self.tar.getnames())
         }
         for member in self.tar.getmembers():
             info = member.get_info()
+            info["name"] = info["name"].rstrip("/")
             info["type"] = typemap.get(info["type"], "file")
             self.dir_cache[info["name"]] = info

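Directory members now have their trailing "/" stripped, so tar listings use the same names as the synthetic directory entries. An illustrative sketch (the archive name is hypothetical):

    import fsspec

    fs = fsspec.filesystem("tar", fo="archive.tar")
    print(fs.ls("", detail=False))  # directory names appear without a trailing slash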
fsspec/implementations/webhdfs.py
CHANGED
@@ -21,7 +21,7 @@ class WebHDFS(AbstractFileSystem):
    """
    Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.

-
+    Four auth mechanisms are supported:

    insecure: no auth is done, and the user is assumed to be whoever they
        say they are (parameter ``user``), or a predefined value such as
@@ -34,6 +34,8 @@ class WebHDFS(AbstractFileSystem):
        service. Indeed, this client can also generate such tokens when
        not insecure. Note that tokens expire, but can be renewed (by a
        previously specified user) and may allow for proxying.
+    basic-auth: used when both parameter ``user`` and parameter ``password``
+        are provided.

    """
@@ -47,10 +49,13 @@ class WebHDFS(AbstractFileSystem):
        kerberos=False,
        token=None,
        user=None,
+        password=None,
        proxy_to=None,
        kerb_kwargs=None,
        data_proxy=None,
        use_https=False,
+        session_cert=None,
+        session_verify=True,
        **kwargs,
    ):
        """
@@ -68,6 +73,9 @@ class WebHDFS(AbstractFileSystem):
            given
        user: str or None
            If given, assert the user name to connect with
+        password: str or None
+            If given, assert the password to use for basic auth. If password
+            is provided, user must be provided also
        proxy_to: str or None
            If given, the user has the authority to proxy, and this value is
            the user in who's name actions are taken
@@ -84,12 +92,19 @@ class WebHDFS(AbstractFileSystem):
            ``url->data_proxy(url)``.
        use_https: bool
            Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
        kwargs
        """
        if self._cached:
            return
        super().__init__(**kwargs)
-        self.url =
+        self.url = (
+            f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"  # noqa
+        )
        self.kerb = kerberos
        self.kerb_kwargs = kerb_kwargs or {}
        self.pars = {}
@@ -102,8 +117,19 @@ class WebHDFS(AbstractFileSystem):
                " token"
            )
            self.pars["delegation"] = token
-
-
+        self.user = user
+        self.password = password
+
+        if password is not None:
+            if user is None:
+                raise ValueError(
+                    "If passing a password, the user must also be"
+                    "set in order to set up the basic-auth"
+                )
+        else:
+            if user is not None:
+                self.pars["user.name"] = user
+
        if proxy_to is not None:
            self.pars["doas"] = proxy_to
        if kerberos and user is not None:
@@ -111,6 +137,10 @@ class WebHDFS(AbstractFileSystem):
                "If using Kerberos auth, do not specify the "
                "user, this is handled by kinit."
            )
+
+        self.session_cert = session_cert
+        self.session_verify = session_verify
+
        self._connect()

        self._fsid = f"webhdfs_{tokenize(host, port)}"
@@ -121,13 +151,24 @@ class WebHDFS(AbstractFileSystem):

    def _connect(self):
        self.session = requests.Session()
+
+        if self.session_cert:
+            self.session.cert = self.session_cert
+
+        self.session.verify = self.session_verify
+
        if self.kerb:
            from requests_kerberos import HTTPKerberosAuth

            self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)

+        if self.user is not None and self.password is not None:
+            from requests.auth import HTTPBasicAuth
+
+            self.session.auth = HTTPBasicAuth(self.user, self.password)
+
    def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
-        url = self.url + quote(path or "")
+        url = self._apply_proxy(self.url + quote(path or "", safe="/="))
        args = kwargs.copy()
        args.update(self.pars)
        args["op"] = op.upper()
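A hedged sketch of the new basic-auth and TLS session options added above (host, port, credentials and certificate paths are placeholders):

    import fsspec

    fs = fsspec.filesystem(
        "webhdfs",
        host="namenode.example.com",
        port=9870,
        user="alice",
        password="s3cret",  # together with user, switches the session to HTTP basic auth
        use_https=True,
        session_verify="/etc/ssl/certs/ca.pem",  # forwarded to requests.Session.verify
    )
    fs.ls("/")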
fsspec/implementations/zip.py
CHANGED
@@ -49,8 +49,12 @@ class ZipFileSystem(AbstractArchiveFileSystem):
            raise ValueError(f"mode '{mode}' no understood")
        self.mode = mode
        if isinstance(fo, str):
+            if mode == "a":
+                m = "r+b"
+            else:
+                m = mode + "b"
            fo = fsspec.open(
-                fo, mode=
+                fo, mode=m, protocol=target_protocol, **(target_options or {})
            )
            self.of = fo
            self.fo = fo.__enter__()  # the whole instance is a context
@@ -83,14 +87,18 @@ class ZipFileSystem(AbstractArchiveFileSystem):
        # not read from the file.
        files = self.zip.infolist()
        self.dir_cache = {
-            dirname
+            dirname.rstrip("/"): {
+                "name": dirname.rstrip("/"),
+                "size": 0,
+                "type": "directory",
+            }
            for dirname in self._all_dirnames(self.zip.namelist())
        }
        for z in files:
            f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
            f.update(
                {
-                    "name": z.filename,
+                    "name": z.filename.rstrip("/"),
                    "size": z.file_size,
                    "type": ("directory" if z.is_dir() else "file"),
                }
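The mode="a" handling above opens an existing archive as "r+b" instead of truncating it. A short sketch of appending a member (file names are illustrative):

    from fsspec.implementations.zip import ZipFileSystem

    fs = ZipFileSystem("existing.zip", mode="a")
    with fs.open("added.txt", "wb") as f:
        f.write(b"appended via fsspec")
    fs.close()  # commits the new member to the archive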