fsspec 2023.12.2__py3-none-any.whl → 2024.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fsspec/_version.py +3 -3
- fsspec/asyn.py +9 -9
- fsspec/callbacks.py +98 -12
- fsspec/compression.py +3 -3
- fsspec/exceptions.py +0 -4
- fsspec/generic.py +2 -2
- fsspec/gui.py +3 -2
- fsspec/implementations/arrow.py +9 -0
- fsspec/implementations/cache_mapper.py +2 -6
- fsspec/implementations/cached.py +25 -7
- fsspec/implementations/dbfs.py +14 -4
- fsspec/implementations/dirfs.py +6 -0
- fsspec/implementations/ftp.py +18 -13
- fsspec/implementations/github.py +17 -5
- fsspec/implementations/http.py +14 -10
- fsspec/implementations/local.py +8 -4
- fsspec/implementations/memory.py +1 -1
- fsspec/implementations/reference.py +78 -40
- fsspec/implementations/sftp.py +1 -1
- fsspec/implementations/webhdfs.py +20 -1
- fsspec/parquet.py +3 -5
- fsspec/spec.py +15 -13
- fsspec/tests/abstract/copy.py +21 -7
- fsspec/tests/abstract/put.py +21 -7
- {fsspec-2023.12.2.dist-info → fsspec-2024.2.0.dist-info}/METADATA +1 -2
- fsspec-2024.2.0.dist-info/RECORD +54 -0
- fsspec-2023.12.2.dist-info/RECORD +0 -54
- {fsspec-2023.12.2.dist-info → fsspec-2024.2.0.dist-info}/LICENSE +0 -0
- {fsspec-2023.12.2.dist-info → fsspec-2024.2.0.dist-info}/WHEEL +0 -0
- {fsspec-2023.12.2.dist-info → fsspec-2024.2.0.dist-info}/top_level.txt +0 -0
fsspec/implementations/http.py
CHANGED
@@ -7,11 +7,10 @@ from copy import copy
 from urllib.parse import urlparse
 
 import aiohttp
-import requests
 import yarl
 
 from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
-from fsspec.callbacks import _DEFAULT_CALLBACK
+from fsspec.callbacks import DEFAULT_CALLBACK
 from fsspec.exceptions import FSTimeoutError
 from fsspec.spec import AbstractBufferedFile
 from fsspec.utils import (
@@ -124,7 +123,7 @@ class HTTPFileSystem(AsyncFileSystem):
             try:
                 sync(loop, session.close, timeout=0.1)
                 return
-            except (TimeoutError, FSTimeoutError):
+            except (TimeoutError, FSTimeoutError, NotImplementedError):
                 pass
         connector = getattr(session, "_connector", None)
         if connector is not None:
@@ -235,7 +234,7 @@ class HTTPFileSystem(AsyncFileSystem):
         return out
 
     async def _get_file(
-        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
+        self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
     ):
         kw = self.kwargs.copy()
         kw.update(kwargs)
@@ -252,7 +251,7 @@ class HTTPFileSystem(AsyncFileSystem):
             if isfilelike(lpath):
                 outfile = lpath
             else:
-                outfile = open(lpath, "wb")
+                outfile = open(lpath, "wb")  # noqa: ASYNC101
 
             try:
                 chunk = True
@@ -269,7 +268,7 @@ class HTTPFileSystem(AsyncFileSystem):
         lpath,
         rpath,
         chunk_size=5 * 2**20,
-        callback=_DEFAULT_CALLBACK,
+        callback=DEFAULT_CALLBACK,
         method="post",
         **kwargs,
     ):
@@ -280,7 +279,7 @@ class HTTPFileSystem(AsyncFileSystem):
             context = nullcontext(lpath)
             use_seek = False  # might not support seeking
         else:
-            context = open(lpath, "rb")
+            context = open(lpath, "rb")  # noqa: ASYNC101
             use_seek = True
 
         with context as f:
@@ -319,7 +318,7 @@ class HTTPFileSystem(AsyncFileSystem):
             r = await session.get(self.encode_url(path), **kw)
             async with r:
                 return r.status < 400
-        except (requests.exceptions.RequestException, aiohttp.ClientError):
+        except aiohttp.ClientError:
             return False
 
     async def _isfile(self, path, **kwargs):
@@ -529,7 +528,7 @@ class HTTPFile(AbstractBufferedFile):
     ----------
     url: str
         Full URL of the remote resource, including the protocol
-    session: requests.Session or None
+    session: aiohttp.ClientSession or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
@@ -802,7 +801,7 @@ async def get_range(session, url, start, end, file=None, **kwargs):
     async with r:
         out = await r.read()
         if file:
-            with open(file, "r+b") as f:
+            with open(file, "r+b") as f:  # noqa: ASYNC101
                 f.seek(start)
                 f.write(out)
         else:
@@ -847,6 +846,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
         elif "Content-Range" in r.headers:
             info["size"] = int(r.headers["Content-Range"].split("/")[1])
 
+        if "Content-Type" in r.headers:
+            info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
+
+        info["url"] = str(r.url)
+
         for checksum_field in ["ETag", "Content-MD5", "Digest"]:
             if r.headers.get(checksum_field):
                 info[checksum_field] = r.headers[checksum_field]
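With the `_file_info` additions above, `HTTPFileSystem.info` can now include a `mimetype` (taken from `Content-Type`, with parameters such as `charset` stripped) and the final `url` after redirects, alongside `size`. A minimal sketch; the address below is a placeholder, and which keys appear depends entirely on the response headers:

```python
import fsspec

fs = fsspec.filesystem("http")

# Placeholder URL, for illustration only.
info = fs.info("https://example.com/data.csv")

# "size" needs Content-Length or Content-Range; "mimetype" comes from
# Content-Type; "url" is the final URL after any redirects.
print(info.get("size"), info.get("mimetype"), info.get("url"))
```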
fsspec/implementations/local.py
CHANGED
@@ -3,7 +3,6 @@ import io
 import logging
 import os
 import os.path as osp
-import posixpath
 import re
 import shutil
 import stat
@@ -59,11 +58,16 @@ class LocalFileSystem(AbstractFileSystem):
 
     def ls(self, path, detail=False, **kwargs):
         path = self._strip_protocol(path)
-        if detail:
+        info = self.info(path)
+        if info["type"] == "directory":
             with os.scandir(path) as it:
-                return [self.info(f) for f in it]
+                infos = [self.info(f) for f in it]
         else:
-            return [posixpath.join(path, f) for f in os.listdir(path)]
+            infos = [info]
+
+        if not detail:
+            return [i["name"] for i in infos]
+        return infos
 
     def info(self, path, **kwargs):
         if isinstance(path, os.DirEntry):
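The rewritten `LocalFileSystem.ls` first inspects the path: a directory is scanned entry by entry, while a plain file now yields a one-element listing of its own info dict, and `detail=False` returns just the `name` of each entry. A small sketch of the expected behaviour:

```python
import os
import tempfile

import fsspec

fs = fsspec.filesystem("file")

with tempfile.TemporaryDirectory() as d:
    fn = os.path.join(d, "a.txt")
    with open(fn, "w") as f:
        f.write("hello")

    # Directory: one entry per child; names only when detail=False.
    print(fs.ls(d, detail=False))

    # File path: a single-element listing describing the file itself.
    print(fs.ls(fn, detail=True)[0]["type"])  # "file"
```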
fsspec/implementations/reference.py
CHANGED
@@ -17,7 +17,7 @@ except ImportError:
     import json
 
 from ..asyn import AsyncFileSystem
-from ..callbacks import _DEFAULT_CALLBACK
+from ..callbacks import DEFAULT_CALLBACK
 from ..core import filesystem, open, split_protocol
 from ..utils import isfilelike, merge_offset_ranges, other_paths
 
@@ -106,6 +106,12 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
     ):
         """
+
+        This instance will be writable, storing changes in memory until full partitions
+        are accumulated or .flush() is called.
+
+        To create an empty lazy store, use .create()
+
         Parameters
         ----------
         root : str
@@ -119,26 +125,35 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             Encode urls as pandas.Categorical to reduce memory footprint if the ratio
             of the number of unique urls to total number of refs for each variable
             is greater than or equal to this number. (default 10)
-
-
         """
         self.root = root
         self.chunk_sizes = {}
-        self._items = {}
+        self.out_root = out_root or self.root
+        self.cat_thresh = categorical_threshold
+        self.cache_size = cache_size
         self.dirs = None
+        self.url = self.root + "/{field}/refs.{record}.parq"
+        # TODO: derive fs from `root`
         self.fs = fsspec.filesystem("file") if fs is None else fs
+
+    def __getattr__(self, item):
+        if item in ("_items", "record_size", "zmetadata"):
+            self.setup()
+            # avoid possible recursion if setup fails somehow
+            return self.__dict__[item]
+        raise AttributeError(item)
+
+    def setup(self):
+        self._items = {}
         self._items[".zmetadata"] = self.fs.cat_file(
             "/".join([self.root, ".zmetadata"])
         )
         met = json.loads(self._items[".zmetadata"])
         self.record_size = met["record_size"]
         self.zmetadata = met["metadata"]
-        self.url = self.root + "/{field}/refs.{record}.parq"
-        self.out_root = out_root or self.root
-        self.cat_thresh = categorical_threshold
 
         # Define function to open and decompress refs
-        @lru_cache(maxsize=cache_size)
+        @lru_cache(maxsize=self.cache_size)
         def open_refs(field, record):
             """cached parquet file loader"""
             path = self.url.format(field=field, record=record)
@@ -153,6 +168,8 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
     def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
         """Make empty parquet reference set
 
+        First deletes the contents of the given directory, if it exists.
+
         Parameters
         ----------
         root: str
@@ -172,12 +189,15 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         met = {"metadata": {}, "record_size": record_size}
         if fs is None:
             fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+        if fs.exists(root):
+            fs.rm(root, recursive=True)
         fs.makedirs(root, exist_ok=True)
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)
 
     def listdir(self, basename=True):
         """List top-level directories"""
+        # cache me?
         if self.dirs is None:
             dirs = [p.split("/", 1)[0] for p in self.zmetadata]
             self.dirs = {p for p in dirs if p and not p.startswith(".")}
@@ -258,19 +278,18 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         elif "/" not in key or self._is_meta(key):
             raise KeyError(key)
         field, sub_key = key.split("/")
-        record,
-        maybe = self._items.get((field,
+        record, ri, chunk_size = self._key_to_record(key)
+        maybe = self._items.get((field, record), {}).get(ri, False)
         if maybe is None:
             # explicitly deleted
             raise KeyError
         elif maybe:
             return maybe
+        elif chunk_size == 0:
+            return b""
 
         # Chunk keys can be loaded from row group and cached in LRU cache
         try:
-            record, ri, chunk_size = self._key_to_record(key)
-            if chunk_size == 0:
-                return b""
             refs = self.open_refs(field, record)
         except (ValueError, TypeError, FileNotFoundError):
             raise KeyError(key)
@@ -280,7 +299,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         if raw is not None:
             return raw
         if selection[0] is None:
-            raise KeyError("This reference has been deleted")
+            raise KeyError("This reference does not exist or has been deleted")
         if selection[1:3] == [0, 0]:
             # URL only
             return selection[:1]
@@ -307,7 +326,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         size_ratio = [
             math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
         ]
-        self.chunk_sizes[field] = size_ratio
+        self.chunk_sizes[field] = size_ratio or [1]
         return self.chunk_sizes[field]
 
     def _generate_record(self, field, record):
@@ -342,7 +361,6 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
     def __hash__(self):
         return id(self)
 
-    @lru_cache(20)
     def __getitem__(self, key):
         return self._load_one_key(key)
 
@@ -357,9 +375,10 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             # metadata or top-level
             self._items[key] = value
-            self.zmetadata[key] = json.loads(
+            new_value = json.loads(
                 value.decode() if isinstance(value, bytes) else value
             )
+            self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
 
     @staticmethod
     def _is_meta(key):
@@ -373,9 +392,9 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             if "/" in key and not self._is_meta(key):
                 field, chunk = key.split("/")
-                record,
+                record, i, _ = self._key_to_record(key)
                 subdict = self._items.setdefault((field, record), {})
-                subdict[
+                subdict[i] = None
                 if len(subdict) == self.record_size:
                     self.write(field, record)
             else:
@@ -388,26 +407,43 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         import numpy as np
         import pandas as pd
 
-        # TODO: if the dict is incomplete, also load records and merge in
         partition = self._items[(field, record)]
-
+        original = False
+        if len(partition) < self.record_size:
+            try:
+                original = self.open_refs(field, record)
+            except IOError:
+                pass
 
-
-
-
-
-
-
-
+        if original:
+            paths = original["path"]
+            offsets = original["offset"]
+            sizes = original["size"]
+            raws = original["raw"]
+        else:
+            paths = np.full(self.record_size, np.nan, dtype="O")
+            offsets = np.zeros(self.record_size, dtype="int64")
+            sizes = np.zeros(self.record_size, dtype="int64")
+            raws = np.full(self.record_size, np.nan, dtype="O")
         for j, data in partition.items():
             if isinstance(data, list):
-
+                if (
+                    str(paths.dtype) == "category"
+                    and data[0] not in paths.dtype.categories
+                ):
+                    paths = paths.add_categories(data[0])
                 paths[j] = data[0]
                 if len(data) > 1:
                     offsets[j] = data[1]
                     sizes[j] = data[2]
+            elif data is None:
+                # delete
+                paths[j] = None
+                offsets[j] = 0
+                sizes[j] = 0
+                raws[j] = None
             else:
-
+                # this is the only call into kerchunk, could remove
                 raws[j] = kerchunk.df._proc_raw(data)
         # TODO: only save needed columns
         df = pd.DataFrame(
@@ -424,6 +460,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         object_encoding = {"raw": "bytes", "path": "utf8"}
         has_nulls = ["path", "raw"]
 
+        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
         self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
         df.to_parquet(
             fn,
@@ -474,29 +511,30 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self.open_refs.cache_clear()
 
     def __len__(self):
-        # Caveat: This counts expected references, not actual
+        # Caveat: This counts expected references, not actual - but is fast
         count = 0
         for field in self.listdir():
             if field.startswith("."):
                 count += 1
             else:
-                chunk_sizes = self._get_chunk_sizes(field)
-                nchunks = self.np.product(chunk_sizes)
-                count += nchunks
+                count += math.prod(self._get_chunk_sizes(field))
         count += len(self.zmetadata)  # all metadata keys
-
+        # any other files not in reference partitions
+        count += sum(1 for _ in self._items if not isinstance(_, tuple))
         return count
 
     def __iter__(self):
-        # Caveat:
-        #
+        # Caveat: returns only existing keys, so the number of these does not
+        # match len(self)
         metas = set(self.zmetadata)
         metas.update(self._items)
         for bit in metas:
             if isinstance(bit, str):
                 yield bit
         for field in self.listdir():
-
+            for k in self._keys_in_field(field):
+                if k in self:
+                    yield k
 
     def __contains__(self, item):
         try:
@@ -762,7 +800,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         with open(lpath, "wb") as f:
             f.write(data)
 
-    def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, **kwargs):
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
         if self.isdir(rpath):
             return os.makedirs(lpath, exist_ok=True)
         data = self.cat_file(rpath, **kwargs)
@@ -1101,7 +1139,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             self.references[path] = data
         self.dircache.clear()  # this is a bit heavy handed
 
-    async def _put_file(self, lpath, rpath):
+    async def _put_file(self, lpath, rpath, **kwargs):
         # puts binary
         with open(lpath, "rb") as f:
             self.references[rpath] = f.read()
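Two behavioural points fall out of the hunks above: `LazyReferenceMapper.create()` now clears the target directory before writing a fresh `.zmetadata`, and constructing a mapper no longer touches storage at all, because `.zmetadata` is only read once `_items`, `record_size` or `zmetadata` is first accessed (via `__getattr__` calling `setup()`). A sketch using the in-memory filesystem as a stand-in for real storage:

```python
import fsspec
from fsspec.implementations.reference import LazyReferenceMapper

fs = fsspec.filesystem("memory")

# create() deletes any existing contents under the root, then writes .zmetadata.
refs = LazyReferenceMapper.create("/refs", fs=fs, record_size=1000)

# Re-opening is cheap: nothing is loaded until a lazy attribute is touched.
lazy = LazyReferenceMapper("/refs", fs=fs)
print(lazy.record_size)  # first access triggers setup() and reads .zmetadata
```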
fsspec/implementations/sftp.py
CHANGED
@@ -65,7 +65,7 @@ class SFTPFileSystem(AbstractFileSystem):
         out.pop("protocol", None)
         return out
 
-    def mkdir(self, path, create_parents=False, mode=511):
+    def mkdir(self, path, create_parents=True, mode=511):
         logger.debug("Creating folder %s", path)
         if self.exists(path):
             raise FileExistsError(f"File exists: {path}")
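`SFTPFileSystem.mkdir` now defaults to `create_parents=True`, so missing intermediate directories are created by default; pass `create_parents=False` to create only the leaf directory. A sketch with hypothetical connection details:

```python
import fsspec

# Hypothetical host and credentials, for illustration only.
fs = fsspec.filesystem("sftp", host="sftp.example.com", username="user")

# Intermediate directories are now created by default.
fs.mkdir("/upload/2024/02/data")

# Only create the final directory, failing if the parents are missing.
fs.mkdir("/upload/flat", create_parents=False)
```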
fsspec/implementations/webhdfs.py
CHANGED
@@ -54,6 +54,8 @@ class WebHDFS(AbstractFileSystem):
         kerb_kwargs=None,
         data_proxy=None,
         use_https=False,
+        session_cert=None,
+        session_verify=True,
         **kwargs,
     ):
         """
@@ -90,12 +92,19 @@ class WebHDFS(AbstractFileSystem):
            ``url->data_proxy(url)``.
        use_https: bool
            Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
        kwargs
        """
        if self._cached:
            return
        super().__init__(**kwargs)
-        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
+        self.url = (
+            f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"  # noqa
+        )
        self.kerb = kerberos
        self.kerb_kwargs = kerb_kwargs or {}
        self.pars = {}
@@ -128,6 +137,10 @@ class WebHDFS(AbstractFileSystem):
                "If using Kerberos auth, do not specify the "
                "user, this is handled by kinit."
            )
+
+        self.session_cert = session_cert
+        self.session_verify = session_verify
+
        self._connect()
 
        self._fsid = f"webhdfs_{tokenize(host, port)}"
@@ -138,6 +151,12 @@ class WebHDFS(AbstractFileSystem):
 
     def _connect(self):
         self.session = requests.Session()
+
+        if self.session_cert:
+            self.session.cert = self.session_cert
+
+        self.session.verify = self.session_verify
+
         if self.kerb:
             from requests_kerberos import HTTPKerberosAuth
 
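The new `session_cert` and `session_verify` arguments are simply forwarded to the underlying `requests.Session` as its `cert` and `verify` attributes. A sketch with hypothetical host and certificate paths:

```python
import fsspec

fs = fsspec.filesystem(
    "webhdfs",
    host="namenode.example.com",  # hypothetical NameNode
    port=9871,
    use_https=True,
    session_cert=("/etc/ssl/client.crt", "/etc/ssl/client.key"),
    session_verify="/etc/ssl/ca-bundle.crt",
)
# Equivalent to setting session.cert and session.verify on the requests.Session.
```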
fsspec/parquet.py
CHANGED
@@ -131,10 +131,8 @@ def open_parquet_file(
         cache_type="parts",
         cache_options={
             **options,
-            **{
-                "data": data.get(fn, {}),
-                "strict": strict,
-            },
+            "data": data.get(fn, {}),
+            "strict": strict,
         },
         **kwargs,
     )
@@ -338,7 +336,7 @@ def _transfer_ranges(fs, blocks, paths, starts, ends):
 
 def _add_header_magic(data):
     # Add b"PAR1" to file headers
-    for i, path in enumerate(list(data.keys())):
+    for path in list(data.keys()):
         add_magic = True
         for k in data[path].keys():
             if k[0] == 0 and k[1] >= 4:
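The calling convention of `open_parquet_file` is unchanged; the hunk above only flattens how the pre-fetched byte ranges (`data`) and the `strict` flag are passed into the `parts` cache options. A usage sketch with a placeholder remote path, assuming a parquet engine such as fastparquet is installed:

```python
import fsspec.parquet

# Placeholder URL; any fsspec-compatible path works the same way.
with fsspec.parquet.open_parquet_file(
    "s3://bucket/dataset/part-0.parquet",
    columns=["a", "b"],
    engine="fastparquet",
) as f:
    print(f.read(4))  # served from the pre-fetched "parts" cache, b"PAR1"
```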
fsspec/spec.py
CHANGED
@@ -11,7 +11,7 @@ from glob import has_magic
 from hashlib import sha256
 from typing import ClassVar
 
-from .callbacks import _DEFAULT_CALLBACK
+from .callbacks import DEFAULT_CALLBACK
 from .config import apply_config, conf
 from .dircache import DirCache
 from .transaction import Transaction
@@ -876,9 +876,7 @@ class AbstractFileSystem(metaclass=_Cached):
         else:
             return self.cat_file(paths[0], **kwargs)
 
-    def get_file(
-        self, rpath, lpath, callback=_DEFAULT_CALLBACK, outfile=None, **kwargs
-    ):
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs):
         """Copy single remote file to local"""
         from .implementations.local import LocalFileSystem
 
@@ -913,7 +911,7 @@ class AbstractFileSystem(metaclass=_Cached):
         rpath,
         lpath,
         recursive=False,
-        callback=_DEFAULT_CALLBACK,
+        callback=DEFAULT_CALLBACK,
         maxdepth=None,
         **kwargs,
     ):
@@ -967,10 +965,10 @@ class AbstractFileSystem(metaclass=_Cached):
 
         callback.set_size(len(lpaths))
         for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
-            callback.branch(rpath, lpath, kwargs)
-            self.get_file(rpath, lpath, **kwargs)
+            with callback.branched(rpath, lpath) as child:
+                self.get_file(rpath, lpath, callback=child, **kwargs)
 
-    def put_file(self, lpath, rpath, callback=_DEFAULT_CALLBACK, **kwargs):
+    def put_file(self, lpath, rpath, callback=DEFAULT_CALLBACK, **kwargs):
         """Copy single file to remote"""
         if os.path.isdir(lpath):
             self.makedirs(rpath, exist_ok=True)
@@ -995,7 +993,7 @@ class AbstractFileSystem(metaclass=_Cached):
         lpath,
         rpath,
         recursive=False,
-        callback=_DEFAULT_CALLBACK,
+        callback=DEFAULT_CALLBACK,
         maxdepth=None,
         **kwargs,
     ):
@@ -1053,8 +1051,8 @@ class AbstractFileSystem(metaclass=_Cached):
 
         callback.set_size(len(rpaths))
         for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
-            callback.branch(lpath, rpath, kwargs)
-            self.put_file(lpath, rpath, **kwargs)
+            with callback.branched(lpath, rpath) as child:
+                self.put_file(lpath, rpath, callback=child, **kwargs)
 
     def head(self, path, size=1024):
         """Get the first ``size`` bytes from file"""
@@ -1134,7 +1132,7 @@ class AbstractFileSystem(metaclass=_Cached):
         if maxdepth is not None and maxdepth < 1:
             raise ValueError("maxdepth must be at least 1")
 
-        if isinstance(path, str):
+        if isinstance(path, (str, os.PathLike)):
             out = self.expand_path([path], recursive, maxdepth)
         else:
             out = set()
@@ -1400,7 +1398,9 @@ class AbstractFileSystem(metaclass=_Cached):
         )
         return json.dumps(
             dict(
-
+                cls=cls,
+                protocol=proto,
+                args=self.storage_args,
                 **self.storage_options,
             )
         )
@@ -1691,6 +1691,8 @@ class AbstractBufferedFile(io.IOBase):
 
     def __eq__(self, other):
         """Files are equal if they have the same checksum, only in read mode"""
+        if self is other:
+            return True
         return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other)
 
     def commit(self):
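Bulk `get`/`put` now branch the parent callback once per file through the `callback.branched(...)` context manager and pass the child explicitly, instead of injecting it into `kwargs` via the old `branch` hook. In practice a single callback passed to a recursive transfer is branched per file automatically. A sketch using the bundled `TqdmCallback` (requires `tqdm`; the paths are placeholders):

```python
import fsspec
from fsspec.callbacks import TqdmCallback

fs = fsspec.filesystem("file")

# One parent bar for the whole transfer; each copied file gets a branched
# child callback handed to get_file() under the hood.
fs.get(
    "/tmp/source_dir/",
    "/tmp/dest_dir/",
    recursive=True,
    callback=TqdmCallback(tqdm_kwargs={"desc": "files"}),
)
```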
fsspec/tests/abstract/copy.py
CHANGED
@@ -128,7 +128,9 @@ class AbstractCopyTests:
 
         # Without recursive does nothing
         fs.cp(s, t)
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
         # With recursive
         fs.cp(s, t, recursive=True)
@@ -155,7 +157,9 @@
         assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
 
         fs.rm(fs_join(target, "subdir"), recursive=True)
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
         # Limit recursive by maxdepth
         fs.cp(s, t, recursive=True, maxdepth=1)
@@ -179,7 +183,9 @@
         assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
 
         fs.rm(fs_join(target, "subdir"), recursive=True)
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
     def test_copy_directory_to_new_directory(
         self,
@@ -271,7 +277,9 @@
             ],
             recursive=True,
         )
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
         # With recursive
         for glob, recursive in zip(["*", "**"], [True, False]):
@@ -290,7 +298,9 @@
             ],
             recursive=True,
         )
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
         # Limit recursive by maxdepth
         fs.cp(
@@ -308,7 +318,9 @@
             ],
             recursive=True,
         )
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
     def test_copy_glob_to_new_directory(
         self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target
@@ -451,7 +463,9 @@
             ],
             recursive=True,
         )
-        assert fs.ls(target) == ([] if supports_empty_directories else [dummy])
+        assert fs.ls(target, detail=False) == (
+            [] if supports_empty_directories else [dummy]
+        )
 
     def test_copy_list_of_files_to_new_directory(
         self, fs, fs_join, fs_bulk_operations_scenario_0, fs_target