cloud-files 5.8.2.tar.gz → 6.0.0.tar.gz

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. {cloud_files-5.8.2 → cloud_files-6.0.0}/.github/workflows/test-suite.yml +2 -2
  2. {cloud_files-5.8.2 → cloud_files-6.0.0}/ChangeLog +20 -0
  3. {cloud_files-5.8.2 → cloud_files-6.0.0}/PKG-INFO +1 -1
  4. {cloud_files-5.8.2 → cloud_files-6.0.0}/automated_test.py +1 -1
  5. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/PKG-INFO +1 -1
  6. cloud_files-6.0.0/cloud_files.egg-info/pbr.json +1 -0
  7. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/cloudfiles.py +56 -1
  8. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/interfaces.py +111 -15
  9. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/cloudfiles_cli.py +22 -4
  10. cloud_files-5.8.2/cloud_files.egg-info/pbr.json +0 -1
  11. {cloud_files-5.8.2 → cloud_files-6.0.0}/AUTHORS +0 -0
  12. {cloud_files-5.8.2 → cloud_files-6.0.0}/LICENSE +0 -0
  13. {cloud_files-5.8.2 → cloud_files-6.0.0}/MANIFEST.in +0 -0
  14. {cloud_files-5.8.2 → cloud_files-6.0.0}/README.md +0 -0
  15. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/SOURCES.txt +0 -0
  16. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/dependency_links.txt +0 -0
  17. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/entry_points.txt +0 -0
  18. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/not-zip-safe +0 -0
  19. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/requires.txt +0 -0
  20. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/top_level.txt +0 -0
  21. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/__init__.py +0 -0
  22. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/compression.py +0 -0
  23. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/connectionpools.py +0 -0
  24. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/exceptions.py +0 -0
  25. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/gcs.py +0 -0
  26. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/lib.py +0 -0
  27. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/monitoring.py +0 -0
  28. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/paths.py +0 -0
  29. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/resumable_tools.py +0 -0
  30. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/scheduler.py +0 -0
  31. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/secrets.py +0 -0
  32. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/test.py +0 -0
  33. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/threaded_queue.py +0 -0
  34. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/typing.py +0 -0
  35. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/LICENSE +0 -0
  36. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/__init__.py +0 -0
  37. {cloud_files-5.8.2 → cloud_files-6.0.0}/requirements.txt +0 -0
  38. {cloud_files-5.8.2 → cloud_files-6.0.0}/setup.cfg +0 -0
  39. {cloud_files-5.8.2 → cloud_files-6.0.0}/setup.py +0 -0
.github/workflows/test-suite.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

     steps:
     - uses: actions/checkout@v2
@@ -25,7 +25,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools wheel
         if [ -f requirements.txt ]; then pip install -e ".[test,monitoring]"; fi
     - name: Test with pytest
       run: |
ChangeLog
@@ -1,6 +1,26 @@
 CHANGES
 =======

+6.0.0
+-----
+
+* feat: add file counts to du as -N flag
+
+5.9.0
+-----
+
+* perf: reduce data loading for list_blobs for GCS
+* perf: memory efficient listing on GCS
+* fix: errant print statement
+* feat: add CloudFile(s).sep
+* fix(https): allow "raw=True" to work
+* fix: don't retry on 403,404 for http head
+* ci: drop py3.9 add py3.14
+* fixtest: try upgrading setuptools
+* feat: add constructor args to CloudFile that are present in CloudFiles
+* perf: faster du using listing (#120)
+* test: change target for nokura
+
 5.8.2
 -----

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.8.2
+Version: 6.0.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
automated_test.py
@@ -757,7 +757,7 @@ def test_to_https_protocol():
   assert pth == "https://s3-hpcrc.rc.princeton.edu/my_bucket/to/heaven"

   pth = to_https_protocol("nokura://my_bucket/to/heaven")
-  assert pth == "https://nokura.pni.princeton.edu/my_bucket/to/heaven"
+  assert pth == "https://c10s.pni.princeton.edu/my_bucket/to/heaven"

   pth = to_https_protocol("tigerdata://my_bucket/to/heaven")
   assert pth == "https://td.princeton.edu/my_bucket/to/heaven"
cloud_files.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.8.2
+Version: 6.0.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
cloud_files-6.0.0/cloud_files.egg-info/pbr.json (added)
@@ -0,0 +1 @@
+{"git_version": "38a2b59", "is_release": true}
cloudfiles/cloudfiles.py
@@ -18,6 +18,7 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
 import time

@@ -1007,6 +1008,40 @@ class CloudFiles:
       return results
     return first(results.values())

+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+        with lock:
+          total_files += subtree_files
+          total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
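For orientation, a minimal usage sketch of the new method (the bucket path is hypothetical). It returns a dict with "N" (file count) and "num_bytes" (total size), fanning the per-prefix work out over the interface implementations shown further below:

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical path
    stats = cf.subtree_size()
    print(stats["N"], "files,", stats["num_bytes"], "bytes")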
@@ -1666,6 +1701,12 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)

+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
   def dirname(self, path:str) -> str:
     if self._path.protocol == "file":
       return os.path.dirname(path)
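A short sketch of the new property's behavior, following the protocol branch above (paths are invented):

    from cloudfiles import CloudFiles

    CloudFiles("gs://my-bucket/").sep   # "/" (posixpath.sep for cloud protocols)
    CloudFiles("file:///tmp/data").sep  # os.sep ("\\" on Windows, "/" elsewhere)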
@@ -1706,11 +1747,17 @@ class CloudFiles:

 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1719,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)

@@ -1726,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None

+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
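The widened signature simply forwards the new arguments to the underlying CloudFiles instance. A hedged sketch (the object name and endpoint are invented):

    from cloudfiles import CloudFile

    cf = CloudFile(
      "s3://my-bucket/data.bin",          # hypothetical object
      endpoint="https://s3.example.com",  # hypothetical S3-compatible endpoint
      no_sign_request=True,               # make unsigned/anonymous requests
    )
    data = cf.get()  # read the whole object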
cloudfiles/interfaces.py
@@ -48,6 +48,7 @@ MEM_POOL = None

 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }

 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -303,6 +304,22 @@ class FileInterface(StorageInterface):

     return self.io_with_lock(do_size, path, exclusive=False)

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    total_bytes = 0
+    total_files = 0
+
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+    for root, dirs, files in os.walk(subdir):
+      for f in files:
+        path = os.path.join(root, f)
+        total_files += 1
+        total_bytes += os.path.getsize(path)
+
+    return (total_files, total_bytes)
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
@@ -580,8 +597,7 @@ class MemoryInterface(StorageInterface):

     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")

     remove = layer_path
     if len(remove) and remove[-1] != '/':
@@ -615,6 +631,23 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+
+    total_bytes = 0
+    total_files = 0
+    for filename, binary in self._data.items():
+      f_prefix = filename.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+        total_files += 1
+
+    return (total_files, total_bytes)
+
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -816,6 +849,8 @@ class GoogleCloudStorageInterface(StorageInterface):
     blobs = self._bucket.list_blobs(
       prefix=path,
       delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
     )

     for page in blobs.pages:
@@ -835,6 +870,26 @@ class GoogleCloudStorageInterface(StorageInterface):
       yield filename


+  @retry
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+
+    total_bytes = 0
+    total_files = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+        total_files += 1
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global GC_POOL
     with GCS_BUCKET_POOL_LOCK:
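The fields argument requests a partial response, so each listing page carries only the attributes actually used. A standalone sketch of the same idea with google-cloud-storage (bucket and prefix are invented):

    from google.cloud import storage

    client = storage.Client()
    blobs = client.list_blobs(
      "my-bucket",                              # hypothetical bucket
      prefix="dataset/",
      page_size=5000,
      fields="items(name,size),nextPageToken",  # only name and size per item
    )
    total_bytes = sum(blob.size for blob in blobs)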
@@ -882,6 +937,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers

@@ -889,6 +946,9 @@ class HttpInterface(StorageInterface):
     headers = self.head(file_path)
     return int(headers["Content-Length"])

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    raise NotImplementedError()
+
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -899,24 +959,20 @@ class HttpInterface(StorageInterface):
     end = int(end - 1) if end is not None else ''
     headers["Range"] = f"bytes={start}-{end}"

-    resp = self.session.get(key, headers=headers)
-
-    if resp.status_code in (404, 403):
-      return (None, None, None, None)
-    resp.close()
-    resp.raise_for_status()
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)

     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None

-    return (resp.content, content_encoding, None, None)
+    return (content, content_encoding, None, None)

   @retry
   def save_file(self, src, dest, resumable) -> tuple[bool, int]:
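For context: with plain resp.content, requests transparently decodes gzip/deflate bodies, so the Content-Encoding header no longer matches the bytes returned. Reading the raw stream preserves the wire bytes; a sketch of the pattern (URL invented):

    import requests

    with requests.get("https://example.com/data.gz", stream=True) as resp:
      resp.raise_for_status()
      resp.raw.decode_content = False                  # keep bytes exactly as sent
      payload = resp.raw.read()                        # undecoded body
      encoding = resp.headers.get("Content-Encoding")  # e.g. "gzip"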
@@ -1017,7 +1073,6 @@ class HttpInterface(StorageInterface):
     )

     for res in results.get("items", []):
-      print(res["name"])
       yield res["name"].removeprefix(strip)

     token = results.get("nextPageToken", None)
@@ -1490,6 +1545,47 @@ class S3Interface(StorageInterface):
     for filename in iterate(resp):
       yield filename

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+
+      return self._conn.list_objects_v2(**kwargs)
+
+    resp = s3lst(path)
+
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+
+    total_bytes = 0
+    total_files = 0
+    for num_bytes in iterate(resp):
+      total_files += 1
+      total_bytes += num_bytes
+
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+
+      for num_bytes in iterate(resp):
+        total_files += 1
+        total_bytes += num_bytes
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'
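The manual ContinuationToken loop is equivalent to boto3's built-in paginator; a hedged sketch of that alternative formulation (bucket and prefix are invented):

    import boto3

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    total_files = total_bytes = 0
    for page in paginator.paginate(Bucket="my-bucket", Prefix="dataset/"):
      for obj in page.get("Contents", []):
        total_files += 1
        total_bytes += obj["Size"]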
cloudfiles_cli/cloudfiles_cli.py
@@ -802,14 +802,22 @@ def __rm(cloudpath, progress, paths):
 @click.option('-c', '--grand-total', is_flag=True, default=False, help="Sum a grand total of all inputs.")
 @click.option('-s', '--summarize', is_flag=True, default=False, help="Sum a total for each input argument.")
 @click.option('-h', '--human-readable', is_flag=True, default=False, help='"Human-readable" output. Use unit suffixes: Bytes, KiB, MiB, GiB, TiB, PiB, and EiB.')
-def du(paths, grand_total, summarize, human_readable):
+@click.option('-N', '--count-files', is_flag=True, default=False, help='Also report the number of files.')
+def du(paths, grand_total, summarize, human_readable, count_files):
   """Display disk usage statistics."""
   results = []
+
+  list_data = False
+
   for path in paths:
     npath = normalize_path(path)
     if ispathdir(path):
       cf = CloudFiles(npath)
-      results.append(cf.size(cf.list()))
+      if summarize:
+        results.append(cf.subtree_size())
+      else:
+        list_data = True
+        results.append(cf.size(cf.list()))
     else:
       cf = CloudFiles(os.path.dirname(npath))
       sz = cf.size(os.path.basename(npath))
@@ -838,8 +846,15 @@ def du(paths, grand_total, summarize, human_readable):
       return f"{(val / 2**60):.2f} EiB"

   summary = {}
+  num_files = 0
   for path, res in zip(paths, results):
-    summary[path] = sum(res.values())
+    if list_data:
+      summary[path] = sum(res.values())
+      num_files += len(res)
+    else:
+      summary[path] = res["num_bytes"]
+      num_files += res["N"]
+
     if summarize:
       print(f"{SI(summary[path])}\t{path}")

@@ -849,7 +864,10 @@ def du(paths, grand_total, summarize, human_readable):
       print(f"{SI(size)}\t{pth}")

   if grand_total:
-    print(f"{SI(sum(summary.values()))}\ttotal")
+    print(f"{SI(sum(summary.values()))}\tbytes total")
+
+  if count_files:
+    print(f"{num_files}\tfiles total")

 @main.command()
 @click.argument('paths', nargs=-1)
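A hypothetical invocation of the updated command (bucket and sizes invented; output shape follows the print statements above). With -s the new subtree_size() fast path is used, and -N appends the file count:

    $ cloudfiles du -s -c -h -N gs://my-bucket/dataset/
    1.50 GiB    gs://my-bucket/dataset/
    1.50 GiB    bytes total
    1042    files total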
cloud_files-5.8.2/cloud_files.egg-info/pbr.json (removed)
@@ -1 +0,0 @@
-{"git_version": "99528f8", "is_release": true}