fsspec 2023.6.0__tar.gz → 2023.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {fsspec-2023.6.0/fsspec.egg-info → fsspec-2023.9.1}/PKG-INFO +1 -1
  2. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/_version.py +3 -3
  3. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/asyn.py +154 -92
  4. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/caching.py +1 -1
  5. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/compression.py +7 -2
  6. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/core.py +16 -8
  7. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/generic.py +111 -17
  8. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/gui.py +4 -2
  9. fsspec-2023.9.1/fsspec/implementations/cache_mapper.py +80 -0
  10. fsspec-2023.9.1/fsspec/implementations/cache_metadata.py +232 -0
  11. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/cached.py +74 -157
  12. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/dirfs.py +3 -1
  13. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/http.py +36 -19
  14. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/local.py +4 -21
  15. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/memory.py +8 -9
  16. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/reference.py +8 -8
  17. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/sftp.py +6 -2
  18. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/smb.py +39 -23
  19. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/mapping.py +8 -0
  20. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/registry.py +22 -0
  21. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/spec.py +164 -96
  22. fsspec-2023.9.1/fsspec/tests/abstract/__init__.py +287 -0
  23. fsspec-2023.9.1/fsspec/tests/abstract/common.py +175 -0
  24. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/tests/abstract/copy.py +250 -56
  25. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/tests/abstract/get.py +248 -38
  26. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/tests/abstract/put.py +246 -66
  27. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/utils.py +25 -8
  28. {fsspec-2023.6.0 → fsspec-2023.9.1/fsspec.egg-info}/PKG-INFO +1 -1
  29. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec.egg-info/SOURCES.txt +3 -0
  30. {fsspec-2023.6.0 → fsspec-2023.9.1}/setup.cfg +1 -1
  31. fsspec-2023.6.0/fsspec/tests/abstract/__init__.py +0 -140
  32. {fsspec-2023.6.0 → fsspec-2023.9.1}/LICENSE +0 -0
  33. {fsspec-2023.6.0 → fsspec-2023.9.1}/MANIFEST.in +0 -0
  34. {fsspec-2023.6.0 → fsspec-2023.9.1}/README.md +0 -0
  35. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/__init__.py +0 -0
  36. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/archive.py +0 -0
  37. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/callbacks.py +0 -0
  38. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/config.py +0 -0
  39. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/conftest.py +0 -0
  40. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/dircache.py +0 -0
  41. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/exceptions.py +0 -0
  42. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/fuse.py +0 -0
  43. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/__init__.py +0 -0
  44. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/arrow.py +0 -0
  45. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/dask.py +0 -0
  46. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/dbfs.py +0 -0
  47. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/ftp.py +0 -0
  48. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/git.py +0 -0
  49. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/github.py +0 -0
  50. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/jupyter.py +0 -0
  51. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/libarchive.py +0 -0
  52. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/tar.py +0 -0
  53. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/webhdfs.py +0 -0
  54. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/implementations/zip.py +0 -0
  55. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/parquet.py +0 -0
  56. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec/transaction.py +0 -0
  57. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec.egg-info/dependency_links.txt +0 -0
  58. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec.egg-info/not-zip-safe +0 -0
  59. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec.egg-info/requires.txt +0 -0
  60. {fsspec-2023.6.0 → fsspec-2023.9.1}/fsspec.egg-info/top_level.txt +0 -0
  61. {fsspec-2023.6.0 → fsspec-2023.9.1}/pyproject.toml +0 -0
  62. {fsspec-2023.6.0 → fsspec-2023.9.1}/requirements.txt +0 -0
  63. {fsspec-2023.6.0 → fsspec-2023.9.1}/setup.py +0 -0
  64. {fsspec-2023.6.0 → fsspec-2023.9.1}/versioneer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fsspec
3
- Version: 2023.6.0
3
+ Version: 2023.9.1
4
4
  Summary: File-system specification
5
5
  Home-page: http://github.com/fsspec/filesystem_spec
6
6
  Maintainer: Martin Durant
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2023-06-09T13:30:57-0400",
11
+ "date": "2023-09-15T16:17:21-0400",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "9a1e624022f3ad39071de5b17bafa23214b8662b",
15
- "version": "2023.6.0"
14
+ "full-revisionid": "247b249a008990c584d2619f030bd42916a82e4a",
15
+ "version": "2023.9.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -13,12 +13,7 @@ from typing import TYPE_CHECKING, Iterable
13
13
 
14
14
  from .callbacks import _DEFAULT_CALLBACK
15
15
  from .exceptions import FSTimeoutError
16
- from .implementations.local import (
17
- LocalFileSystem,
18
- make_path_posix,
19
- trailing_sep,
20
- trailing_sep_maybe_asterisk,
21
- )
16
+ from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
22
17
  from .spec import AbstractBufferedFile, AbstractFileSystem
23
18
  from .utils import is_exception, other_paths
24
19
 
@@ -83,6 +78,8 @@ def sync(loop, func, *args, timeout=None, **kwargs):
83
78
  loop0 = asyncio.events.get_running_loop()
84
79
  if loop0 is loop:
85
80
  raise NotImplementedError("Calling sync() from within a running loop")
81
+ except NotImplementedError:
82
+ raise
86
83
  except RuntimeError:
87
84
  pass
88
85
  coro = func(*args, **kwargs)
@@ -347,26 +344,42 @@ class AsyncFileSystem(AbstractFileSystem):
347
344
  elif on_error is None:
348
345
  on_error = "raise"
349
346
 
350
- source_is_str = isinstance(path1, str)
351
- paths = await self._expand_path(path1, maxdepth=maxdepth, recursive=recursive)
352
- if source_is_str and (not recursive or maxdepth is not None):
353
- # Non-recursive glob does not copy directories
354
- paths = [p for p in paths if not (trailing_sep(p) or await self._isdir(p))]
355
- if not paths:
356
- return
347
+ if isinstance(path1, list) and isinstance(path2, list):
348
+ # No need to expand paths when both source and destination
349
+ # are provided as lists
350
+ paths1 = path1
351
+ paths2 = path2
352
+ else:
353
+ source_is_str = isinstance(path1, str)
354
+ paths1 = await self._expand_path(
355
+ path1, maxdepth=maxdepth, recursive=recursive
356
+ )
357
+ if source_is_str and (not recursive or maxdepth is not None):
358
+ # Non-recursive glob does not copy directories
359
+ paths1 = [
360
+ p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
361
+ ]
362
+ if not paths1:
363
+ return
364
+
365
+ source_is_file = len(paths1) == 1
366
+ dest_is_dir = isinstance(path2, str) and (
367
+ trailing_sep(path2) or await self._isdir(path2)
368
+ )
369
+
370
+ exists = source_is_str and (
371
+ (has_magic(path1) and source_is_file)
372
+ or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
373
+ )
374
+ paths2 = other_paths(
375
+ paths1,
376
+ path2,
377
+ exists=exists,
378
+ flatten=not source_is_str,
379
+ )
357
380
 
358
- isdir = isinstance(path2, str) and (
359
- trailing_sep(path2) or await self._isdir(path2)
360
- )
361
- path2 = other_paths(
362
- paths,
363
- path2,
364
- exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(path1),
365
- is_dir=isdir,
366
- flatten=not source_is_str,
367
- )
368
381
  batch_size = batch_size or self.batch_size
369
- coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths, path2)]
382
+ coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
370
383
  result = await _run_coros_in_chunks(
371
384
  coros, batch_size=batch_size, return_exceptions=True, nofiles=True
372
385
  )
@@ -501,28 +514,39 @@ class AsyncFileSystem(AbstractFileSystem):
501
514
  constructor, or for all instances by setting the "gather_batch_size" key
502
515
  in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
503
516
  """
504
- source_is_str = isinstance(lpath, str)
505
- if source_is_str:
506
- lpath = make_path_posix(lpath)
507
- fs = LocalFileSystem()
508
- lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
509
- if source_is_str and (not recursive or maxdepth is not None):
510
- # Non-recursive glob does not copy directories
511
- lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
512
- if not lpaths:
513
- return
517
+ if isinstance(lpath, list) and isinstance(rpath, list):
518
+ # No need to expand paths when both source and destination
519
+ # are provided as lists
520
+ rpaths = rpath
521
+ lpaths = lpath
522
+ else:
523
+ source_is_str = isinstance(lpath, str)
524
+ if source_is_str:
525
+ lpath = make_path_posix(lpath)
526
+ fs = LocalFileSystem()
527
+ lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
528
+ if source_is_str and (not recursive or maxdepth is not None):
529
+ # Non-recursive glob does not copy directories
530
+ lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
531
+ if not lpaths:
532
+ return
533
+
534
+ source_is_file = len(lpaths) == 1
535
+ dest_is_dir = isinstance(rpath, str) and (
536
+ trailing_sep(rpath) or await self._isdir(rpath)
537
+ )
514
538
 
515
- isdir = isinstance(rpath, str) and (
516
- trailing_sep(rpath) or await self._isdir(rpath)
517
- )
518
- rpath = self._strip_protocol(rpath)
519
- rpaths = other_paths(
520
- lpaths,
521
- rpath,
522
- exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(lpath),
523
- is_dir=isdir,
524
- flatten=not source_is_str,
525
- )
539
+ rpath = self._strip_protocol(rpath)
540
+ exists = source_is_str and (
541
+ (has_magic(lpath) and source_is_file)
542
+ or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
543
+ )
544
+ rpaths = other_paths(
545
+ lpaths,
546
+ rpath,
547
+ exists=exists,
548
+ flatten=not source_is_str,
549
+ )
526
550
 
527
551
  is_dir = {l: os.path.isdir(l) for l in lpaths}
528
552
  rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
@@ -567,32 +591,44 @@ class AsyncFileSystem(AbstractFileSystem):
567
591
  constructor, or for all instances by setting the "gather_batch_size" key
568
592
  in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
569
593
  """
570
- source_is_str = isinstance(rpath, str)
571
- # First check for rpath trailing slash as _strip_protocol removes it.
572
- source_not_trailing_sep = source_is_str and not trailing_sep_maybe_asterisk(
573
- rpath
574
- )
575
- rpath = self._strip_protocol(rpath)
576
- rpaths = await self._expand_path(rpath, recursive=recursive)
577
- if source_is_str and (not recursive or maxdepth is not None):
578
- # Non-recursive glob does not copy directories
579
- rpaths = [
580
- p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
581
- ]
582
- if not rpaths:
583
- return
594
+ if isinstance(lpath, list) and isinstance(rpath, list):
595
+ # No need to expand paths when both source and destination
596
+ # are provided as lists
597
+ rpaths = rpath
598
+ lpaths = lpath
599
+ else:
600
+ source_is_str = isinstance(rpath, str)
601
+ # First check for rpath trailing slash as _strip_protocol removes it.
602
+ source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
603
+ rpath = self._strip_protocol(rpath)
604
+ rpaths = await self._expand_path(
605
+ rpath, recursive=recursive, maxdepth=maxdepth
606
+ )
607
+ if source_is_str and (not recursive or maxdepth is not None):
608
+ # Non-recursive glob does not copy directories
609
+ rpaths = [
610
+ p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
611
+ ]
612
+ if not rpaths:
613
+ return
614
+
615
+ lpath = make_path_posix(lpath)
616
+ source_is_file = len(rpaths) == 1
617
+ dest_is_dir = isinstance(lpath, str) and (
618
+ trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
619
+ )
620
+
621
+ exists = source_is_str and (
622
+ (has_magic(rpath) and source_is_file)
623
+ or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
624
+ )
625
+ lpaths = other_paths(
626
+ rpaths,
627
+ lpath,
628
+ exists=exists,
629
+ flatten=not source_is_str,
630
+ )
584
631
 
585
- lpath = make_path_posix(lpath)
586
- isdir = isinstance(lpath, str) and (
587
- trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
588
- )
589
- lpaths = other_paths(
590
- rpaths,
591
- lpath,
592
- exists=isdir and source_not_trailing_sep,
593
- is_dir=isdir,
594
- flatten=not source_is_str,
595
- )
596
632
  [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
597
633
  batch_size = kwargs.pop("batch_size", self.batch_size)
598
634
 
@@ -639,7 +675,7 @@ class AsyncFileSystem(AbstractFileSystem):
639
675
  async def _ls(self, path, detail=True, **kwargs):
640
676
  raise NotImplementedError
641
677
 
642
- async def _walk(self, path, maxdepth=None, **kwargs):
678
+ async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
643
679
  if maxdepth is not None and maxdepth < 1:
644
680
  raise ValueError("maxdepth must be at least 1")
645
681
 
@@ -651,7 +687,11 @@ class AsyncFileSystem(AbstractFileSystem):
651
687
  detail = kwargs.pop("detail", False)
652
688
  try:
653
689
  listing = await self._ls(path, detail=True, **kwargs)
654
- except (FileNotFoundError, OSError):
690
+ except (FileNotFoundError, OSError) as e:
691
+ if on_error == "raise":
692
+ raise
693
+ elif callable(on_error):
694
+ on_error(e)
655
695
  if detail:
656
696
  yield path, {}, {}
657
697
  else:
@@ -689,25 +729,24 @@ class AsyncFileSystem(AbstractFileSystem):
689
729
  ):
690
730
  yield _
691
731
 
692
- async def _glob(self, path, **kwargs):
732
+ async def _glob(self, path, maxdepth=None, **kwargs):
733
+ if maxdepth is not None and maxdepth < 1:
734
+ raise ValueError("maxdepth must be at least 1")
735
+
693
736
  import re
694
737
 
695
738
  ends = path.endswith("/")
696
739
  path = self._strip_protocol(path)
697
- indstar = path.find("*") if path.find("*") >= 0 else len(path)
698
- indques = path.find("?") if path.find("?") >= 0 else len(path)
699
- indbrace = path.find("[") if path.find("[") >= 0 else len(path)
740
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
741
+ idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
742
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
700
743
 
701
- ind = min(indstar, indques, indbrace)
744
+ min_idx = min(idx_star, idx_qmark, idx_brace)
702
745
 
703
746
  detail = kwargs.pop("detail", False)
704
747
 
705
748
  if not has_magic(path):
706
- root = path
707
- depth = 1
708
- if ends:
709
- path += "/*"
710
- elif await self._exists(path):
749
+ if await self._exists(path):
711
750
  if not detail:
712
751
  return [path]
713
752
  else:
@@ -717,13 +756,21 @@ class AsyncFileSystem(AbstractFileSystem):
717
756
  return [] # glob of non-existent returns empty
718
757
  else:
719
758
  return {}
720
- elif "/" in path[:ind]:
721
- ind2 = path[:ind].rindex("/")
722
- root = path[: ind2 + 1]
723
- depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
759
+ elif "/" in path[:min_idx]:
760
+ min_idx = path[:min_idx].rindex("/")
761
+ root = path[: min_idx + 1]
762
+ depth = path[min_idx + 1 :].count("/") + 1
724
763
  else:
725
764
  root = ""
726
- depth = None if "**" in path else path[ind + 1 :].count("/") + 1
765
+ depth = path[min_idx + 1 :].count("/") + 1
766
+
767
+ if "**" in path:
768
+ if maxdepth is not None:
769
+ idx_double_stars = path.find("**")
770
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
771
+ depth = depth - depth_double_stars + maxdepth
772
+ else:
773
+ depth = None
727
774
 
728
775
  allpaths = await self._find(
729
776
  root, maxdepth=depth, withdirs=True, detail=True, **kwargs
@@ -751,14 +798,23 @@ class AsyncFileSystem(AbstractFileSystem):
751
798
  )
752
799
  + "$"
753
800
  )
754
- pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
801
+ pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
802
+ pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
755
803
  pattern = re.sub("[*]", "[^/]*", pattern)
756
- pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
804
+ pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
805
+ pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
806
+ pattern = re.compile(pattern)
757
807
  out = {
758
808
  p: allpaths[p]
759
809
  for p in sorted(allpaths)
760
810
  if pattern.match(p.replace("//", "/").rstrip("/"))
761
811
  }
812
+
813
+ # Return directories only when the glob ends with a slash
814
+ # This is needed for posix glob compliance
815
+ if ends:
816
+ out = {k: v for k, v in out.items() if v["type"] == "directory"}
817
+
762
818
  if detail:
763
819
  return out
764
820
  else:
@@ -777,8 +833,14 @@ class AsyncFileSystem(AbstractFileSystem):
777
833
 
778
834
  async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
779
835
  path = self._strip_protocol(path)
780
- out = dict()
836
+ out = {}
781
837
  detail = kwargs.pop("detail", False)
838
+
839
+ # Add the root directory if withdirs is requested
840
+ # This is needed for posix glob compliance
841
+ if withdirs and path != "" and await self._isdir(path):
842
+ out[path] = await self._info(path)
843
+
782
844
  # async for?
783
845
  async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
784
846
  if withdirs:
@@ -805,7 +867,7 @@ class AsyncFileSystem(AbstractFileSystem):
805
867
  path = [self._strip_protocol(p) for p in path]
806
868
  for p in path: # can gather here
807
869
  if has_magic(p):
808
- bit = set(await self._glob(p))
870
+ bit = set(await self._glob(p, maxdepth=maxdepth))
809
871
  out |= bit
810
872
  if recursive:
811
873
  # glob call above expanded one depth so if maxdepth is defined
@@ -829,7 +891,7 @@ class AsyncFileSystem(AbstractFileSystem):
829
891
  out.add(p)
830
892
  if not out:
831
893
  raise FileNotFoundError(path)
832
- return list(sorted(out))
894
+ return sorted(out)
833
895
 
834
896
  async def _mkdir(self, path, create_parents=True, **kwargs):
835
897
  pass # not necessary to implement, may not have directories
@@ -456,7 +456,7 @@ class KnownPartsOfAFile(BaseCache):
456
456
 
457
457
  # simple consolidation of contiguous blocks
458
458
  if data:
459
- old_offsets = sorted(list(data.keys()))
459
+ old_offsets = sorted(data.keys())
460
460
  offsets = [old_offsets[0]]
461
461
  blocks = [data.pop(old_offsets[0])]
462
462
  for start, stop in old_offsets[1:]:
@@ -1,5 +1,4 @@
1
1
  """Helper functions for a standard streaming compression API"""
2
- from bz2 import BZ2File
3
2
  from zipfile import ZipFile
4
3
 
5
4
  import fsspec.utils
@@ -68,7 +67,13 @@ def unzip(infile, mode="rb", filename=None, **kwargs):
68
67
 
69
68
 
70
69
  register_compression("zip", unzip, "zip")
71
- register_compression("bz2", BZ2File, "bz2")
70
+
71
+ try:
72
+ from bz2 import BZ2File
73
+ except ImportError:
74
+ pass
75
+ else:
76
+ register_compression("bz2", BZ2File, "bz2")
72
77
 
73
78
  try: # pragma: no cover
74
79
  from isal import igzip
@@ -210,7 +210,6 @@ def open_files(
210
210
  num=1,
211
211
  protocol=None,
212
212
  newline=None,
213
- auto_mkdir=True,
214
213
  expand=True,
215
214
  **kwargs,
216
215
  ):
@@ -249,9 +248,6 @@ def open_files(
249
248
  newline: bytes or None
250
249
  Used for line terminator in text mode. If None, uses system default;
251
250
  if blank, uses no translation.
252
- auto_mkdir: bool (True)
253
- If in write mode, this will ensure the target directory exists before
254
- writing, by calling ``fs.mkdirs(exist_ok=True)``.
255
251
  expand: bool
256
252
  **kwargs: dict
257
253
  Extra options that make sense to a particular storage connection, e.g.
@@ -288,9 +284,6 @@ def open_files(
288
284
  protocol=protocol,
289
285
  expand=expand,
290
286
  )
291
- if "r" not in mode and auto_mkdir:
292
- parents = {fs._parent(path) for path in paths}
293
- [fs.makedirs(parent, exist_ok=True) for parent in parents]
294
287
  return OpenFiles(
295
288
  [
296
289
  OpenFile(
@@ -360,6 +353,19 @@ def url_to_fs(url, **kwargs):
360
353
  urlpath : str
361
354
  The file-systems-specific URL for ``url``.
362
355
  """
356
+ # non-FS arguments that appear in fsspec.open()
357
+ # inspect could keep this in sync with open()'s signature
358
+ known_kwargs = {
359
+ "compression",
360
+ "encoding",
361
+ "errors",
362
+ "expand",
363
+ "mode",
364
+ "name_function",
365
+ "newline",
366
+ "num",
367
+ }
368
+ kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
363
369
  chain = _un_chain(url, kwargs)
364
370
  inkwargs = {}
365
371
  # Reverse iterate the chain, creating a nested target_* structure
@@ -611,7 +617,7 @@ def get_fs_token_paths(
611
617
  pchains = [
612
618
  _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
613
619
  ]
614
- if len(set(pc[1] for pc in pchains)) > 1:
620
+ if len({pc[1] for pc in pchains}) > 1:
615
621
  raise ValueError("Protocol mismatch getting fs from %s", urlpath)
616
622
  paths = [pc[0] for pc in pchains]
617
623
  else:
@@ -621,6 +627,8 @@ def get_fs_token_paths(
621
627
  else:
622
628
  if "w" in mode and expand:
623
629
  paths = _expand_paths(paths, name_function, num)
630
+ elif "x" in mode and expand:
631
+ paths = _expand_paths(paths, name_function, num)
624
632
  elif "*" in paths:
625
633
  paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
626
634
  else:
@@ -1,9 +1,15 @@
1
+ from __future__ import annotations
2
+
1
3
  import inspect
2
4
  import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from typing import Optional
3
9
 
4
- from .asyn import AsyncFileSystem
10
+ from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
5
11
  from .callbacks import _DEFAULT_CALLBACK
6
- from .core import filesystem, get_filesystem_class, split_protocol
12
+ from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
7
13
 
8
14
  _generic_fs = {}
9
15
  logger = logging.getLogger("fsspec.generic")
@@ -29,7 +35,8 @@ def _resolve_fs(url, method=None, protocol=None, storage_options=None):
29
35
  cls = get_filesystem_class(protocol)
30
36
  return cls.current()
31
37
  if method == "options":
32
- return filesystem(protocol, **storage_options.get(protocol, {}))
38
+ fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
39
+ return fs
33
40
  raise ValueError(f"Unknown FS resolution method: {method}")
34
41
 
35
42
 
@@ -51,19 +58,23 @@ def rsync(
51
58
  Parameters
52
59
  ----------
53
60
  source: str
54
- Root of the directory tree to take files from.
61
+ Root of the directory tree to take files from. This must be a directory, but
62
+ do not include any terminating "/" character.
55
63
  destination: str
56
64
  Root path to copy into. The contents of this location should be
57
- identical to the contents of ``source`` when done.
65
+ identical to the contents of ``source`` when done. This will be made a
66
+ directory, and the terminal "/" should not be included.
58
67
  delete_missing: bool
59
68
  If there are paths in the destination that don't exist in the
60
69
  source and this is True, delete them. Otherwise, leave them alone.
61
- source_field: str
70
+ source_field: str | callable
62
71
  If ``update_field`` is "different", this is the key in the info
63
- of source files to consider for difference.
64
- dest_field: str
72
+ of source files to consider for difference. May be a function of the
73
+ info dict.
74
+ dest_field: str | callable
65
75
  If ``update_field`` is "different", this is the key in the info
66
- of destination files to consider for difference.
76
+ of destination files to consider for difference. May be a function of
77
+ the info dict.
67
78
  update_cond: "different"|"always"|"never"
68
79
  If "always", every file is copied, regardless of whether it exists in
69
80
  the destination. If "never", files that exist in the destination are
@@ -90,9 +101,10 @@ def rsync(
90
101
  if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
91
102
  ]
92
103
  logger.debug(f"{len(dirs)} directories to create")
93
- for dirn in dirs:
94
- # no async
95
- fs.mkdirs(dirn.replace(source, destination), exist_ok=True)
104
+ if dirs:
105
+ fs.make_many_dirs(
106
+ [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
107
+ )
96
108
  allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
97
109
  logger.debug(f"{len(allfiles)} files to consider for copy")
98
110
  to_delete = [
@@ -106,7 +118,10 @@ def rsync(
106
118
  if update_cond == "always":
107
119
  allfiles[k] = otherfile
108
120
  elif update_cond == "different":
109
- if v[source_field] != otherfiles[otherfile][dest_field]:
121
+ inf1 = source_field(v) if callable(source_field) else v[source_field]
122
+ v2 = otherfiles[otherfile]
123
+ inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
124
+ if inf1 != inf2:
110
125
  # details mismatch, make copy
111
126
  allfiles[k] = otherfile
112
127
  else:
@@ -115,12 +130,12 @@ def rsync(
115
130
  else:
116
131
  # file not in target yet
117
132
  allfiles[k] = otherfile
133
+ logger.debug(f"{len(allfiles)} files to copy")
118
134
  if allfiles:
119
135
  source_files, target_files = zip(*allfiles.items())
120
- logger.debug(f"{len(source_files)} files to copy")
121
136
  fs.cp(source_files, target_files, **kwargs)
137
+ logger.debug(f"{len(to_delete)} files to delete")
122
138
  if delete_missing:
123
- logger.debug(f"{len(to_delete)} files to delete")
124
139
  fs.rm(to_delete)
125
140
 
126
141
 
@@ -165,11 +180,11 @@ class GenericFileSystem(AsyncFileSystem):
165
180
  fs = _resolve_fs(path, self.method)
166
181
  if fs.async_impl:
167
182
  out = await fs._find(
168
- path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
183
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
169
184
  )
170
185
  else:
171
186
  out = fs.find(
172
- path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
187
+ path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
173
188
  )
174
189
  result = {}
175
190
  for k, v in out.items():
@@ -238,6 +253,7 @@ class GenericFileSystem(AsyncFileSystem):
238
253
  fs.rm(url, **kwargs)
239
254
 
240
255
  async def _makedirs(self, path, exist_ok=False):
256
+ logger.debug("Make dir %s", path)
241
257
  fs = _resolve_fs(path, self.method)
242
258
  if fs.async_impl:
243
259
  await fs._makedirs(path, exist_ok=exist_ok)
@@ -294,6 +310,84 @@ class GenericFileSystem(AsyncFileSystem):
294
310
  # fail while opening f1 or f2
295
311
  pass
296
312
 
313
+ async def _make_many_dirs(self, urls, exist_ok=True):
314
+ fs = _resolve_fs(urls[0], self.method)
315
+ if fs.async_impl:
316
+ coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
317
+ await _run_coros_in_chunks(coros)
318
+ else:
319
+ for u in urls:
320
+ fs.makedirs(u, exist_ok=exist_ok)
321
+
322
+ make_many_dirs = sync_wrapper(_make_many_dirs)
323
+
324
+ async def _copy(
325
+ self,
326
+ path1: list[str],
327
+ path2: list[str],
328
+ recursive: bool = False,
329
+ on_error: str = "ignore",
330
+ maxdepth: Optional[int] = None,
331
+ batch_size: Optional[int] = None,
332
+ tempdir: Optional[str] = None,
333
+ **kwargs,
334
+ ):
335
+ if recursive:
336
+ raise NotImplementedError
337
+ fs = _resolve_fs(path1[0], self.method)
338
+ fs2 = _resolve_fs(path2[0], self.method)
339
+ # not expanding paths atm., assume call is from rsync()
340
+ if fs is fs2:
341
+ # pure remote
342
+ if fs.async_impl:
343
+ return await fs._copy(path1, path2, **kwargs)
344
+ else:
345
+ return fs.copy(path1, path2, **kwargs)
346
+ await copy_file_op(
347
+ fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
348
+ )
349
+
350
+
351
+ async def copy_file_op(
352
+ fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
353
+ ):
354
+ import tempfile
355
+
356
+ tempdir = tempdir or tempfile.mkdtemp()
357
+ try:
358
+ coros = [
359
+ _copy_file_op(
360
+ fs1,
361
+ u1,
362
+ fs2,
363
+ u2,
364
+ os.path.join(tempdir, uuid.uuid4().hex),
365
+ on_error=on_error,
366
+ )
367
+ for u1, u2 in zip(url1, url2)
368
+ ]
369
+ await _run_coros_in_chunks(coros, batch_size=batch_size)
370
+ finally:
371
+ shutil.rmtree(tempdir)
372
+
373
+
374
+ async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
375
+ ex = () if on_error == "raise" else Exception
376
+ logger.debug("Copy %s -> %s", url1, url2)
377
+ try:
378
+ if fs1.async_impl:
379
+ await fs1._get_file(url1, local)
380
+ else:
381
+ fs1.get_file(url1, local)
382
+ if fs2.async_impl:
383
+ await fs2._put_file(local, url2)
384
+ else:
385
+ fs2.put_file(local, url2)
386
+ os.unlink(local)
387
+ logger.debug("Copy %s -> %s; done", url1, url2)
388
+ except ex as e:
389
+ logger.debug("ignoring cp exception for %s: %s", url1, e)
390
+
297
391
 
298
392
  async def maybe_await(cor):
299
393
  if inspect.iscoroutine(cor):