fsspec 2023.10.0__py3-none-any.whl → 2024.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. fsspec/_version.py +3 -3
  2. fsspec/archive.py +4 -4
  3. fsspec/asyn.py +43 -53
  4. fsspec/caching.py +1 -1
  5. fsspec/callbacks.py +98 -12
  6. fsspec/compression.py +3 -3
  7. fsspec/core.py +16 -3
  8. fsspec/exceptions.py +0 -4
  9. fsspec/generic.py +11 -4
  10. fsspec/gui.py +4 -3
  11. fsspec/implementations/arrow.py +9 -0
  12. fsspec/implementations/cache_mapper.py +2 -6
  13. fsspec/implementations/cached.py +92 -18
  14. fsspec/implementations/data.py +48 -0
  15. fsspec/implementations/dbfs.py +14 -4
  16. fsspec/implementations/dirfs.py +6 -0
  17. fsspec/implementations/ftp.py +18 -13
  18. fsspec/implementations/github.py +17 -5
  19. fsspec/implementations/http.py +42 -51
  20. fsspec/implementations/libarchive.py +2 -3
  21. fsspec/implementations/local.py +11 -4
  22. fsspec/implementations/memory.py +2 -2
  23. fsspec/implementations/reference.py +127 -56
  24. fsspec/implementations/sftp.py +6 -5
  25. fsspec/implementations/smb.py +0 -1
  26. fsspec/implementations/tar.py +2 -1
  27. fsspec/implementations/webhdfs.py +46 -5
  28. fsspec/implementations/zip.py +11 -3
  29. fsspec/parquet.py +3 -5
  30. fsspec/registry.py +2 -1
  31. fsspec/spec.py +51 -61
  32. fsspec/tests/abstract/common.py +5 -5
  33. fsspec/tests/abstract/copy.py +21 -7
  34. fsspec/tests/abstract/put.py +21 -7
  35. fsspec/transaction.py +8 -4
  36. fsspec/utils.py +114 -1
  37. {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/METADATA +1 -2
  38. fsspec-2024.2.0.dist-info/RECORD +54 -0
  39. {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/WHEEL +1 -1
  40. fsspec-2023.10.0.dist-info/RECORD +0 -53
  41. {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/LICENSE +0 -0
  42. {fsspec-2023.10.0.dist-info → fsspec-2024.2.0.dist-info}/top_level.txt +0 -0
fsspec/implementations/local.py

@@ -3,7 +3,6 @@ import io
 import logging
 import os
 import os.path as osp
-import posixpath
 import re
 import shutil
 import stat
@@ -59,11 +58,16 @@ class LocalFileSystem(AbstractFileSystem):
 
     def ls(self, path, detail=False, **kwargs):
         path = self._strip_protocol(path)
-        if detail:
+        info = self.info(path)
+        if info["type"] == "directory":
             with os.scandir(path) as it:
-                return [self.info(f) for f in it]
+                infos = [self.info(f) for f in it]
         else:
-            return [posixpath.join(path, f) for f in os.listdir(path)]
+            infos = [info]
+
+        if not detail:
+            return [i["name"] for i in infos]
+        return infos
 
     def info(self, path, **kwargs):
         if isinstance(path, os.DirEntry):
@@ -386,6 +390,9 @@ class LocalFileOpener(io.IOBase):
     def close(self):
         return self.f.close()
 
+    def truncate(self, size=None) -> int:
+        return self.f.truncate(size)
+
     @property
     def closed(self):
         return self.f.closed
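A minimal sketch (not part of the diff) of the user-visible effect of the local.py changes above; the file name is a placeholder.

    import fsspec

    fs = fsspec.filesystem("file")

    # LocalFileOpener now forwards truncate() to the underlying file object
    with fs.open("data.bin", "wb") as f:
        f.write(b"0123456789")
        f.truncate(4)

    # ls() now stats the path first, so listing a regular file returns that
    # file's own entry rather than attempting a directory listing
    print(fs.ls("data.bin", detail=True))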
fsspec/implementations/memory.py

@@ -8,7 +8,7 @@ from typing import Any, ClassVar
 
 from fsspec import AbstractFileSystem
 
-logger = logging.Logger("fsspec.memoryfs")
+logger = logging.getLogger("fsspec.memoryfs")
 
 
 class MemoryFileSystem(AbstractFileSystem):
@@ -175,7 +175,7 @@ class MemoryFileSystem(AbstractFileSystem):
             parent = self._parent(parent)
             if self.isfile(parent):
                 raise FileExistsError(parent)
-        if mode in ["rb", "ab", "rb+"]:
+        if mode in ["rb", "ab", "r+b"]:
             if path in self.store:
                 f = self.store[path]
                 if mode == "ab":
fsspec/implementations/reference.py

@@ -17,7 +17,7 @@ except ImportError:
         import json
 
 from ..asyn import AsyncFileSystem
-from ..callbacks import _DEFAULT_CALLBACK
+from ..callbacks import DEFAULT_CALLBACK
 from ..core import filesystem, open, split_protocol
 from ..utils import isfilelike, merge_offset_ranges, other_paths
 
@@ -106,6 +106,12 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
     ):
         """
+
+        This instance will be writable, storing changes in memory until full partitions
+        are accumulated or .flush() is called.
+
+        To create an empty lazy store, use .create()
+
         Parameters
         ----------
         root : str
@@ -119,26 +125,35 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
             Encode urls as pandas.Categorical to reduce memory footprint if the ratio
             of the number of unique urls to total number of refs for each variable
             is greater than or equal to this number. (default 10)
-
-
         """
         self.root = root
         self.chunk_sizes = {}
-        self._items = {}
+        self.out_root = out_root or self.root
+        self.cat_thresh = categorical_threshold
+        self.cache_size = cache_size
         self.dirs = None
+        self.url = self.root + "/{field}/refs.{record}.parq"
+        # TODO: derive fs from `root`
         self.fs = fsspec.filesystem("file") if fs is None else fs
+
+    def __getattr__(self, item):
+        if item in ("_items", "record_size", "zmetadata"):
+            self.setup()
+            # avoid possible recursion if setup fails somehow
+            return self.__dict__[item]
+        raise AttributeError(item)
+
+    def setup(self):
+        self._items = {}
         self._items[".zmetadata"] = self.fs.cat_file(
             "/".join([self.root, ".zmetadata"])
         )
         met = json.loads(self._items[".zmetadata"])
         self.record_size = met["record_size"]
         self.zmetadata = met["metadata"]
-        self.url = self.root + "/{field}/refs.{record}.parq"
-        self.out_root = out_root or self.root
-        self.cat_thresh = categorical_threshold
 
         # Define function to open and decompress refs
-        @lru_cache(maxsize=cache_size)
+        @lru_cache(maxsize=self.cache_size)
         def open_refs(field, record):
             """cached parquet file loader"""
             path = self.url.format(field=field, record=record)
@@ -150,13 +165,39 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self.open_refs = open_refs
 
     @staticmethod
-    def create(record_size, root, fs, **kwargs):
+    def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
+        """Make empty parquet reference set
+
+        First deletes the contents of the given directory, if it exists.
+
+        Parameters
+        ----------
+        root: str
+            Directory to contain the output; will be created
+        storage_options: dict | None
+            For making the filesystem to use for writing is fs is None
+        fs: FileSystem | None
+            Filesystem for writing
+        record_size: int
+            Number of references per parquet file
+        kwargs: passed to __init__
+
+        Returns
+        -------
+        LazyReferenceMapper instance
+        """
         met = {"metadata": {}, "record_size": record_size}
+        if fs is None:
+            fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+        if fs.exists(root):
+            fs.rm(root, recursive=True)
+        fs.makedirs(root, exist_ok=True)
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)
 
     def listdir(self, basename=True):
         """List top-level directories"""
+        # cache me?
         if self.dirs is None:
             dirs = [p.split("/", 1)[0] for p in self.zmetadata]
             self.dirs = {p for p in dirs if p and not p.startswith(".")}
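A minimal sketch (not part of the diff) of how the reworked create() entry point above might be called; the output directory name and the metadata key are illustrative only.

    from fsspec.implementations.reference import LazyReferenceMapper

    # makes (first clearing) "refs.parq/" on local disk and returns a writable,
    # lazily-loaded mapper; pass fs= or storage_options= for remote targets
    refs = LazyReferenceMapper.create("refs.parq", record_size=1000)

    # behaves like a mutable mapping: changes accumulate in memory until a
    # partition fills up or flush() is called
    refs[".zgroup"] = b'{"zarr_format": 2}'
    refs.flush()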
@@ -237,19 +278,18 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         elif "/" not in key or self._is_meta(key):
             raise KeyError(key)
         field, sub_key = key.split("/")
-        record, _, _ = self._key_to_record(key)
-        maybe = self._items.get((field, key), {}).get(sub_key, False)
+        record, ri, chunk_size = self._key_to_record(key)
+        maybe = self._items.get((field, record), {}).get(ri, False)
         if maybe is None:
             # explicitly deleted
             raise KeyError
         elif maybe:
             return maybe
+        elif chunk_size == 0:
+            return b""
 
         # Chunk keys can be loaded from row group and cached in LRU cache
         try:
-            record, ri, chunk_size = self._key_to_record(key)
-            if chunk_size == 0:
-                return b""
             refs = self.open_refs(field, record)
         except (ValueError, TypeError, FileNotFoundError):
             raise KeyError(key)
@@ -259,7 +299,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         if raw is not None:
             return raw
         if selection[0] is None:
-            raise KeyError("This reference has been deleted")
+            raise KeyError("This reference does not exist or has been deleted")
         if selection[1:3] == [0, 0]:
             # URL only
             return selection[:1]
@@ -286,13 +326,13 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         size_ratio = [
             math.ceil(s / c) for s, c in zip(zarray["shape"], zarray["chunks"])
         ]
-        self.chunk_sizes[field] = size_ratio
+        self.chunk_sizes[field] = size_ratio or [1]
         return self.chunk_sizes[field]
 
     def _generate_record(self, field, record):
         """The references for a given parquet file of a given field"""
         refs = self.open_refs(field, record)
-        it = iter(zip(refs.values()))
+        it = iter(zip(*refs.values()))
         if len(refs) == 3:
             # All urls
             return (list(t) for t in it)
@@ -321,7 +361,6 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
     def __hash__(self):
         return id(self)
 
-    @lru_cache(20)
     def __getitem__(self, key):
         return self._load_one_key(key)
 
@@ -336,9 +375,10 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             # metadata or top-level
             self._items[key] = value
-            self.zmetadata[key] = json.loads(
+            new_value = json.loads(
                 value.decode() if isinstance(value, bytes) else value
             )
+            self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
 
     @staticmethod
     def _is_meta(key):
@@ -352,9 +392,9 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         else:
             if "/" in key and not self._is_meta(key):
                 field, chunk = key.split("/")
-                record, _, _ = self._key_to_record(key)
+                record, i, _ = self._key_to_record(key)
                 subdict = self._items.setdefault((field, record), {})
-                subdict[chunk] = None
+                subdict[i] = None
                 if len(subdict) == self.record_size:
                     self.write(field, record)
             else:
@@ -367,26 +407,43 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         import numpy as np
         import pandas as pd
 
-        # TODO: if the dict is incomplete, also load records and merge in
         partition = self._items[(field, record)]
-        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
+        original = False
+        if len(partition) < self.record_size:
+            try:
+                original = self.open_refs(field, record)
+            except IOError:
+                pass
 
-        ####
-        paths = np.full(self.record_size, np.nan, dtype="O")
-        offsets = np.zeros(self.record_size, dtype="int64")
-        sizes = np.zeros(self.record_size, dtype="int64")
-        raws = np.full(self.record_size, np.nan, dtype="O")
-        nraw = 0
-        npath = 0
+        if original:
+            paths = original["path"]
+            offsets = original["offset"]
+            sizes = original["size"]
+            raws = original["raw"]
+        else:
+            paths = np.full(self.record_size, np.nan, dtype="O")
+            offsets = np.zeros(self.record_size, dtype="int64")
+            sizes = np.zeros(self.record_size, dtype="int64")
+            raws = np.full(self.record_size, np.nan, dtype="O")
         for j, data in partition.items():
             if isinstance(data, list):
-                npath += 1
+                if (
+                    str(paths.dtype) == "category"
+                    and data[0] not in paths.dtype.categories
+                ):
+                    paths = paths.add_categories(data[0])
                 paths[j] = data[0]
                 if len(data) > 1:
                     offsets[j] = data[1]
                     sizes[j] = data[2]
+            elif data is None:
+                # delete
+                paths[j] = None
+                offsets[j] = 0
+                sizes[j] = 0
+                raws[j] = None
             else:
-                nraw += 1
+                # this is the only call into kerchunk, could remove
                 raws[j] = kerchunk.df._proc_raw(data)
         # TODO: only save needed columns
         df = pd.DataFrame(
@@ -403,6 +460,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         object_encoding = {"raw": "bytes", "path": "utf8"}
         has_nulls = ["path", "raw"]
 
+        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
         self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
         df.to_parquet(
             fn,
@@ -453,29 +511,30 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         self.open_refs.cache_clear()
 
     def __len__(self):
-        # Caveat: This counts expected references, not actual
+        # Caveat: This counts expected references, not actual - but is fast
         count = 0
         for field in self.listdir():
             if field.startswith("."):
                 count += 1
             else:
-                chunk_sizes = self._get_chunk_sizes(field)
-                nchunks = self.np.product(chunk_sizes)
-                count += nchunks
+                count += math.prod(self._get_chunk_sizes(field))
         count += len(self.zmetadata)  # all metadata keys
-        count += len(self._items)  # the metadata file itself
+        # any other files not in reference partitions
+        count += sum(1 for _ in self._items if not isinstance(_, tuple))
         return count
 
     def __iter__(self):
-        # Caveat: Note that this generates all expected keys, but does not
-        # account for reference keys that are missing.
+        # Caveat: returns only existing keys, so the number of these does not
+        # match len(self)
         metas = set(self.zmetadata)
         metas.update(self._items)
         for bit in metas:
             if isinstance(bit, str):
                 yield bit
         for field in self.listdir():
-            yield from self._keys_in_field(field)
+            for k in self._keys_in_field(field):
+                if k in self:
+                    yield k
 
     def __contains__(self, item):
         try:
@@ -603,7 +662,7 @@ class ReferenceFileSystem(AsyncFileSystem):
                 **(ref_storage_args or target_options or {}), protocol=target_protocol
             )
             ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
-            if ref_fs.isfile(fo):
+            if ref_fs.isfile(fo2):
                 # text JSON
                 with fsspec.open(fo, "rb", **dic) as f:
                     logger.info("Read reference from URL %s", fo)
@@ -650,6 +709,7 @@ class ReferenceFileSystem(AsyncFileSystem):
                 self.fss[protocol] = fs
         if remote_protocol is None:
             # get single protocol from references
+            # TODO: warning here, since this can be very expensive?
             for ref in self.references.values():
                 if callable(ref):
                     ref = ref()
@@ -740,7 +800,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         with open(lpath, "wb") as f:
             f.write(data)
 
-    def get_file(self, rpath, lpath, callback=_DEFAULT_CALLBACK, **kwargs):
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
         if self.isdir(rpath):
             return os.makedirs(lpath, exist_ok=True)
         data = self.cat_file(rpath, **kwargs)
@@ -772,24 +832,27 @@ class ReferenceFileSystem(AsyncFileSystem):
             raise NotImplementedError
         if isinstance(path, list) and (recursive or any("*" in p for p in path)):
             raise NotImplementedError
+        # TODO: if references is lazy, pre-fetch all paths in batch before access
         proto_dict = _protocol_groups(path, self.references)
         out = {}
         for proto, paths in proto_dict.items():
             fs = self.fss[proto]
-            urls, starts, ends = [], [], []
+            urls, starts, ends, valid_paths = [], [], [], []
             for p in paths:
                 # find references or label not-found. Early exit if any not
                 # found and on_error is "raise"
                 try:
                     u, s, e = self._cat_common(p)
-                    urls.append(u)
-                    starts.append(s)
-                    ends.append(e)
                 except FileNotFoundError as err:
                     if on_error == "raise":
                         raise
                     if on_error != "omit":
                         out[p] = err
+                else:
+                    urls.append(u)
+                    starts.append(s)
+                    ends.append(e)
+                    valid_paths.append(p)
 
             # process references into form for merging
             urls2 = []
@@ -797,7 +860,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             ends2 = []
             paths2 = []
             whole_files = set()
-            for u, s, e, p in zip(urls, starts, ends, paths):
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 if isinstance(u, bytes):
                     # data
                     out[p] = u
@@ -809,7 +872,7 @@ class ReferenceFileSystem(AsyncFileSystem):
                     starts2.append(s)
                     ends2.append(e)
                     paths2.append(p)
-            for u, s, e, p in zip(urls, starts, ends, paths):
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 # second run to account for files that are to be loaded whole
                 if s is not None and u not in whole_files:
                     urls2.append(u)
@@ -829,7 +892,7 @@ class ReferenceFileSystem(AsyncFileSystem):
             bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
 
             # unbundle from merged bytes - simple approach
-            for u, s, e, p in zip(urls, starts, ends, paths):
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
                 if p in out:
                     continue  # was bytes, already handled
                 for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
@@ -963,16 +1026,24 @@ class ReferenceFileSystem(AsyncFileSystem):
             elif len(part) == 1:
                 size = None
             else:
-                _, start, size = part
+                _, _, size = part
             par = path.rsplit("/", 1)[0] if "/" in path else ""
             par0 = par
+            subdirs = [par0]
             while par0 and par0 not in self.dircache:
-                # build parent directories
-                self.dircache[par0] = []
-                self.dircache.setdefault(
-                    par0.rsplit("/", 1)[0] if "/" in par0 else "", []
-                ).append({"name": par0, "type": "directory", "size": 0})
+                # collect parent directories
                 par0 = self._parent(par0)
+                subdirs.append(par0)
+
+            subdirs = subdirs[::-1]
+            for parent, child in zip(subdirs, subdirs[1:]):
+                # register newly discovered directories
+                assert child not in self.dircache
+                assert parent in self.dircache
+                self.dircache[parent].append(
+                    {"name": child, "type": "directory", "size": 0}
+                )
+                self.dircache[child] = []
 
             self.dircache[par].append({"name": path, "type": "file", "size": size})
 
@@ -1068,7 +1139,7 @@ class ReferenceFileSystem(AsyncFileSystem):
         self.references[path] = data
         self.dircache.clear()  # this is a bit heavy handed
 
-    async def _put_file(self, lpath, rpath):
+    async def _put_file(self, lpath, rpath, **kwargs):
         # puts binary
         with open(lpath, "rb") as f:
             self.references[rpath] = f.read()
fsspec/implementations/sftp.py

@@ -65,7 +65,7 @@ class SFTPFileSystem(AbstractFileSystem):
         out.pop("protocol", None)
         return out
 
-    def mkdir(self, path, create_parents=False, mode=511):
+    def mkdir(self, path, create_parents=True, mode=511):
         logger.debug("Creating folder %s", path)
         if self.exists(path):
             raise FileExistsError(f"File exists: {path}")
@@ -80,12 +80,13 @@ class SFTPFileSystem(AbstractFileSystem):
             raise FileExistsError(f"File exists: {path}")
 
         parts = path.split("/")
-        path = ""
+        new_path = "/" if path[:1] == "/" else ""
 
         for part in parts:
-            path += f"/{part}"
-            if not self.exists(path):
-                self.ftp.mkdir(path, mode)
+            if part:
+                new_path = f"{new_path}/{part}" if new_path else part
+                if not self.exists(new_path):
+                    self.ftp.mkdir(new_path, mode)
 
     def rmdir(self, path):
         logger.debug("Removing folder %s", path)
fsspec/implementations/smb.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 This module contains SMBFileSystem class responsible for handling access to
 Windows Samba network shares by using package smbprotocol
fsspec/implementations/tar.py

@@ -106,11 +106,12 @@ class TarFileSystem(AbstractArchiveFileSystem):
 
         # This enables ls to get directories as children as well as files
         self.dir_cache = {
-            dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"}
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
             for dirname in self._all_dirnames(self.tar.getnames())
         }
         for member in self.tar.getmembers():
             info = member.get_info()
+            info["name"] = info["name"].rstrip("/")
             info["type"] = typemap.get(info["type"], "file")
             self.dir_cache[info["name"]] = info
 
fsspec/implementations/webhdfs.py

@@ -21,7 +21,7 @@ class WebHDFS(AbstractFileSystem):
     """
     Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
 
-    Three auth mechanisms are supported:
+    Four auth mechanisms are supported:
 
     insecure: no auth is done, and the user is assumed to be whoever they
         say they are (parameter ``user``), or a predefined value such as
@@ -34,6 +34,8 @@ class WebHDFS(AbstractFileSystem):
         service. Indeed, this client can also generate such tokens when
         not insecure. Note that tokens expire, but can be renewed (by a
        previously specified user) and may allow for proxying.
+    basic-auth: used when both parameter ``user`` and parameter ``password``
+        are provided.
 
     """
 
@@ -47,10 +49,13 @@ class WebHDFS(AbstractFileSystem):
         kerberos=False,
         token=None,
         user=None,
+        password=None,
         proxy_to=None,
         kerb_kwargs=None,
         data_proxy=None,
         use_https=False,
+        session_cert=None,
+        session_verify=True,
         **kwargs,
     ):
         """
@@ -68,6 +73,9 @@ class WebHDFS(AbstractFileSystem):
             given
         user: str or None
             If given, assert the user name to connect with
+        password: str or None
+            If given, assert the password to use for basic auth. If password
+            is provided, user must be provided also
         proxy_to: str or None
             If given, the user has the authority to proxy, and this value is
             the user in who's name actions are taken
@@ -84,12 +92,19 @@ class WebHDFS(AbstractFileSystem):
            ``url->data_proxy(url)``.
         use_https: bool
             Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
         kwargs
         """
         if self._cached:
             return
         super().__init__(**kwargs)
-        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
+        self.url = (
+            f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"  # noqa
+        )
         self.kerb = kerberos
         self.kerb_kwargs = kerb_kwargs or {}
         self.pars = {}
@@ -102,8 +117,19 @@ class WebHDFS(AbstractFileSystem):
                     " token"
                 )
             self.pars["delegation"] = token
-        if user is not None:
-            self.pars["user.name"] = user
+        self.user = user
+        self.password = password
+
+        if password is not None:
+            if user is None:
+                raise ValueError(
+                    "If passing a password, the user must also be"
+                    "set in order to set up the basic-auth"
+                )
+        else:
+            if user is not None:
+                self.pars["user.name"] = user
+
         if proxy_to is not None:
             self.pars["doas"] = proxy_to
         if kerberos and user is not None:
@@ -111,6 +137,10 @@ class WebHDFS(AbstractFileSystem):
                 "If using Kerberos auth, do not specify the "
                 "user, this is handled by kinit."
             )
+
+        self.session_cert = session_cert
+        self.session_verify = session_verify
+
         self._connect()
 
         self._fsid = f"webhdfs_{tokenize(host, port)}"
@@ -121,13 +151,24 @@ class WebHDFS(AbstractFileSystem):
 
     def _connect(self):
         self.session = requests.Session()
+
+        if self.session_cert:
+            self.session.cert = self.session_cert
+
+        self.session.verify = self.session_verify
+
         if self.kerb:
             from requests_kerberos import HTTPKerberosAuth
 
             self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
 
+        if self.user is not None and self.password is not None:
+            from requests.auth import HTTPBasicAuth
+
+            self.session.auth = HTTPBasicAuth(self.user, self.password)
+
     def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
-        url = self.url + quote(path or "")
+        url = self._apply_proxy(self.url + quote(path or "", safe="/="))
         args = kwargs.copy()
         args.update(self.pars)
         args["op"] = op.upper()
fsspec/implementations/zip.py

@@ -49,8 +49,12 @@ class ZipFileSystem(AbstractArchiveFileSystem):
             raise ValueError(f"mode '{mode}' no understood")
         self.mode = mode
         if isinstance(fo, str):
+            if mode == "a":
+                m = "r+b"
+            else:
+                m = mode + "b"
             fo = fsspec.open(
-                fo, mode=mode + "b", protocol=target_protocol, **(target_options or {})
+                fo, mode=m, protocol=target_protocol, **(target_options or {})
             )
         self.of = fo
         self.fo = fo.__enter__()  # the whole instance is a context
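A minimal sketch (not part of the diff) of the append mode this hunk enables; the archive and member names are illustrative.

    from fsspec.implementations.zip import ZipFileSystem

    # mode="a" now opens the underlying file as "r+b", so members can be added
    # to an existing archive rather than overwriting it
    fs = ZipFileSystem("example.zip", mode="a")
    with fs.open("added/note.txt", "wb") as f:
        f.write(b"appended member")
    fs.close()  # writes out the updated archive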
@@ -83,14 +87,18 @@ class ZipFileSystem(AbstractArchiveFileSystem):
             # not read from the file.
             files = self.zip.infolist()
             self.dir_cache = {
-                dirname + "/": {"name": dirname + "/", "size": 0, "type": "directory"}
+                dirname.rstrip("/"): {
+                    "name": dirname.rstrip("/"),
+                    "size": 0,
+                    "type": "directory",
+                }
                 for dirname in self._all_dirnames(self.zip.namelist())
             }
             for z in files:
                 f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
                 f.update(
                     {
-                        "name": z.filename,
+                        "name": z.filename.rstrip("/"),
                         "size": z.file_size,
                         "type": ("directory" if z.is_dir() else "file"),
                     }