cloud-files 5.8.1__tar.gz → 5.9.0__tar.gz

This diff shows the changes between package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. {cloud_files-5.8.1 → cloud_files-5.9.0}/.github/workflows/test-suite.yml +2 -2
  2. {cloud_files-5.8.1 → cloud_files-5.9.0}/ChangeLog +20 -0
  3. {cloud_files-5.8.1 → cloud_files-5.9.0}/PKG-INFO +2 -14
  4. {cloud_files-5.8.1 → cloud_files-5.9.0}/automated_test.py +1 -1
  5. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/PKG-INFO +2 -14
  6. cloud_files-5.9.0/cloud_files.egg-info/pbr.json +1 -0
  7. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/cloudfiles.py +50 -1
  8. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/interfaces.py +101 -15
  9. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/paths.py +1 -1
  10. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles_cli/cloudfiles_cli.py +8 -2
  11. cloud_files-5.8.1/cloud_files.egg-info/pbr.json +0 -1
  12. {cloud_files-5.8.1 → cloud_files-5.9.0}/AUTHORS +0 -0
  13. {cloud_files-5.8.1 → cloud_files-5.9.0}/LICENSE +0 -0
  14. {cloud_files-5.8.1 → cloud_files-5.9.0}/MANIFEST.in +0 -0
  15. {cloud_files-5.8.1 → cloud_files-5.9.0}/README.md +0 -0
  16. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/SOURCES.txt +0 -0
  17. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/dependency_links.txt +0 -0
  18. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/entry_points.txt +0 -0
  19. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/not-zip-safe +0 -0
  20. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/requires.txt +0 -0
  21. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/top_level.txt +0 -0
  22. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/__init__.py +0 -0
  23. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/compression.py +0 -0
  24. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/connectionpools.py +0 -0
  25. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/exceptions.py +0 -0
  26. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/gcs.py +0 -0
  27. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/lib.py +0 -0
  28. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/monitoring.py +0 -0
  29. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/resumable_tools.py +0 -0
  30. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/scheduler.py +0 -0
  31. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/secrets.py +0 -0
  32. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/test.py +0 -0
  33. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/threaded_queue.py +0 -0
  34. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/typing.py +0 -0
  35. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles_cli/LICENSE +0 -0
  36. {cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles_cli/__init__.py +0 -0
  37. {cloud_files-5.8.1 → cloud_files-5.9.0}/requirements.txt +0 -0
  38. {cloud_files-5.8.1 → cloud_files-5.9.0}/setup.cfg +0 -0
  39. {cloud_files-5.8.1 → cloud_files-5.9.0}/setup.py +0 -0
{cloud_files-5.8.1 → cloud_files-5.9.0}/.github/workflows/test-suite.yml

@@ -15,7 +15,7 @@ jobs:
      runs-on: ubuntu-latest
      strategy:
        matrix:
-         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+         python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

      steps:
      - uses: actions/checkout@v2
@@ -25,7 +25,7 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-         python -m pip install --upgrade pip
+         python -m pip install --upgrade pip setuptools wheel
          if [ -f requirements.txt ]; then pip install -e ".[test,monitoring]"; fi
      - name: Test with pytest
        run: |
{cloud_files-5.8.1 → cloud_files-5.9.0}/ChangeLog

@@ -1,6 +1,26 @@
  CHANGES
  =======

+ 5.9.0
+ -----
+
+ * perf: reduce data loading for list\_blobs for GCS
+ * perf: memory efficient listing on GCS
+ * fix: errant print statement
+ * feat: add CloudFile(s).sep
+ * fix(https): allow "raw=True" to work
+ * fix: don't retry on 403,404 for http head
+ * ci: drop py3.9 add py3.14
+ * fixtest: try upgrading setuptools
+ * feat: add constructor args to CloudFile that are present in CloudFiles
+ * perf: faster du using listing (#120)
+ * test: change target for nokura
+
+ 5.8.2
+ -----
+
+ * redesign: change nokura endpoint to c10s.pni.princeton.edu
+
  5.8.1
  -----

{cloud_files-5.8.1 → cloud_files-5.9.0}/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: cloud-files
- Version: 5.8.1
+ Version: 5.9.0
  Summary: Fast access to cloud storage and local FS.
  Home-page: https://github.com/seung-lab/cloud-files/
  Author: William Silversmith
@@ -54,18 +54,6 @@ Requires-Dist: intervaltree; extra == "monitoring"
  Requires-Dist: matplotlib; extra == "monitoring"
  Provides-Extra: apache
  Requires-Dist: lxml; extra == "apache"
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license
- Dynamic: license-file
- Dynamic: provides-extra
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary

  [![PyPI version](https://badge.fury.io/py/cloud-files.svg)](https://badge.fury.io/py/cloud-files) [![Test Suite](https://github.com/seung-lab/cloud-files/workflows/Test%20Suite/badge.svg)](https://github.com/seung-lab/cloud-files/actions?query=workflow%3A%22Test+Suite%22)

{cloud_files-5.8.1 → cloud_files-5.9.0}/automated_test.py

@@ -757,7 +757,7 @@ def test_to_https_protocol():
    assert pth == "https://s3-hpcrc.rc.princeton.edu/my_bucket/to/heaven"

    pth = to_https_protocol("nokura://my_bucket/to/heaven")
-   assert pth == "https://nokura.pni.princeton.edu/my_bucket/to/heaven"
+   assert pth == "https://c10s.pni.princeton.edu/my_bucket/to/heaven"

    pth = to_https_protocol("tigerdata://my_bucket/to/heaven")
    assert pth == "https://td.princeton.edu/my_bucket/to/heaven"
{cloud_files-5.8.1 → cloud_files-5.9.0}/cloud_files.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: cloud-files
- Version: 5.8.1
+ Version: 5.9.0
  Summary: Fast access to cloud storage and local FS.
  Home-page: https://github.com/seung-lab/cloud-files/
  Author: William Silversmith
@@ -54,18 +54,6 @@ Requires-Dist: intervaltree; extra == "monitoring"
  Requires-Dist: matplotlib; extra == "monitoring"
  Provides-Extra: apache
  Requires-Dist: lxml; extra == "apache"
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license
- Dynamic: license-file
- Dynamic: provides-extra
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary

  [![PyPI version](https://badge.fury.io/py/cloud-files.svg)](https://badge.fury.io/py/cloud-files) [![Test Suite](https://github.com/seung-lab/cloud-files/workflows/Test%20Suite/badge.svg)](https://github.com/seung-lab/cloud-files/actions?query=workflow%3A%22Test+Suite%22)

cloud_files-5.9.0/cloud_files.egg-info/pbr.json (new file)

@@ -0,0 +1 @@
+ {"git_version": "623052c", "is_release": true}
{cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/cloudfiles.py

@@ -18,6 +18,7 @@ import platform
  import posixpath
  import re
  import shutil
+ import threading
  import types
  import time

@@ -1007,6 +1008,34 @@ class CloudFiles:
        return results
      return first(results.values())

+   def subtree_size(self, prefix:GetPathType = "") -> int:
+     """High performance size calculation for directory trees."""
+     prefix, return_multiple = toiter(prefix, is_iter=True)
+     total_bytes = 0
+
+     total = totalfn(prefix, None)
+
+     lock = threading.Lock()
+
+     def size_thunk(prefix):
+       nonlocal total_bytes
+       nonlocal lock
+
+       with self._get_connection() as conn:
+         subtree_bytes = conn.subtree_size(prefix)
+         with lock:
+           total_bytes += subtree_bytes
+
+     schedule_jobs(
+       fns=( partial(size_thunk, path) for path in prefix ),
+       concurrency=self.num_threads,
+       progress=self.progress,
+       green=self.green,
+       total=total,
+     )
+
+     return total_bytes
+
    @parallelize(desc="Delete")
    def delete(
      self, paths:GetPathType, total:Optional[int] = None,
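
A minimal usage sketch of the new method (the bucket path is hypothetical): subtree_size fans prefixes out across the thread pool and sums sizes from each backend's native listing.

    from cloudfiles import CloudFiles

    # hypothetical bucket path; any supported protocol works
    cf = CloudFiles("gs://my-bucket/dataset")

    # sums object sizes under the prefix without materializing
    # a full file listing client-side
    total_bytes = cf.subtree_size("images")
    print(total_bytes)
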
@@ -1666,6 +1695,12 @@ class CloudFiles:
        return os.path.join(*paths)
      return posixpath.join(*paths)

+   @property
+   def sep(self) -> str:
+     if self._path.protocol == "file":
+       return os.sep
+     return posixpath.sep
+
    def dirname(self, path:str) -> str:
      if self._path.protocol == "file":
        return os.path.dirname(path)
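
As a usage sketch (paths hypothetical): sep mirrors os.sep or posixpath.sep depending on the protocol, so callers can assemble keys portably.

    import os
    from cloudfiles import CloudFiles

    cf = CloudFiles("s3://my-bucket/dataset")  # hypothetical path
    print(cf.sep)  # "/" for cloud protocols (POSIX separators)

    cf = CloudFiles("file:///tmp/dataset")
    print(cf.sep)  # os.sep: "/" on POSIX, "\\" on Windows
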
@@ -1706,11 +1741,17 @@

  class CloudFile:
    def __init__(
-     self, path:str, cache_meta:bool = False,
+     self,
+     path:str,
+     cache_meta:bool = False,
      secrets:SecretsType = None,
      composite_upload_threshold:int = int(1e8),
      locking:bool = True,
      lock_dir:Optional[str] = None,
+     endpoint:Optional[str] = None,
+     no_sign_request:bool = False,
+     request_payer:Optional[str] = None,
+     use_https:bool = False,
    ):
      path = paths.normalize(path)
      self.cf = CloudFiles(
@@ -1719,6 +1760,10 @@ class CloudFile:
        composite_upload_threshold=composite_upload_threshold,
        locking=locking,
        lock_dir=lock_dir,
+       use_https=use_https,
+       endpoint=endpoint,
+       request_payer=request_payer,
+       no_sign_request=no_sign_request,
      )
      self.filename = paths.basename(path)

@@ -1726,6 +1771,10 @@ class CloudFile:
      self._size:Optional[int] = None
      self._head = None

+   @property
+   def sep(self) -> str:
+     return self.cf.sep
+
    @property
    def protocol(self):
      return self.cf.protocol
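
With the forwarded constructor arguments, single-file access can now carry the same connection options as CloudFiles. A sketch with hypothetical endpoint and path values, assuming the usual CloudFile read accessor:

    from cloudfiles import CloudFile

    cf = CloudFile(
      "s3://my-bucket/data.bin",          # hypothetical path
      endpoint="https://s3.example.com",  # hypothetical endpoint
      no_sign_request=True,  # anonymous access for public buckets
      use_https=True,
    )
    binary = cf.get()
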
{cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/interfaces.py

@@ -48,6 +48,7 @@ MEM_POOL = None

  S3_ACLS = {
    "tigerdata": "private",
+   "nokura": "public-read",
  }

  S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -303,6 +304,19 @@ class FileInterface(StorageInterface):

      return self.io_with_lock(do_size, path, exclusive=False)

+   def subtree_size(self, prefix:str = "") -> int:
+     total_bytes = 0
+
+     subdir = self.get_path_to_file("")
+     if prefix:
+       subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+     for root, dirs, files in os.walk(subdir):
+       files = ( os.path.join(root, f) for f in files )
+       total_bytes += sum(( os.path.getsize(f) for f in files ))
+
+     return total_bytes
+
    def exists(self, file_path):
      path = self.get_path_to_file(file_path)
      def do_exists():
@@ -580,8 +594,7 @@ class MemoryInterface(StorageInterface):

      Returns: iterator
      """
-     layer_path = self.get_path_to_file("")
-     path = os.path.join(layer_path, prefix) + '*'
+     layer_path = self.get_path_to_file("")

      remove = layer_path
      if len(remove) and remove[-1] != '/':
@@ -615,6 +628,21 @@ class MemoryInterface(StorageInterface):
      filenames.sort()
      return iter(filenames)

+   def subtree_size(self, prefix:str = "") -> int:
+     layer_path = self.get_path_to_file("")
+
+     remove = layer_path
+     if len(remove) and remove[-1] != '/':
+       remove += '/'
+
+     total_bytes = 0
+     for filename, binary in self._data.items():
+       f_prefix = filename.removeprefix(remove)[:len(prefix)]
+       if f_prefix == prefix:
+         total_bytes += len(binary)
+
+     return total_bytes
+
  class GoogleCloudStorageInterface(StorageInterface):
    exists_batch_size = Batch._MAX_BATCH_SIZE
    delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -816,6 +844,8 @@ class GoogleCloudStorageInterface(StorageInterface):
      blobs = self._bucket.list_blobs(
        prefix=path,
        delimiter=delimiter,
+       page_size=2500,
+       fields="items(name),nextPageToken",
      )

      for page in blobs.pages:
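
Here fields requests a partial response from the GCS JSON API (object names plus the pagination token, nothing else) and page_size bounds each page; both are standard google-cloud-storage list_blobs parameters. A standalone sketch of the same technique, with a hypothetical bucket:

    from google.cloud import storage

    client = storage.Client()
    bucket = client.bucket("my-bucket")  # hypothetical bucket

    # partial response: fetch names only, not full object metadata
    blobs = bucket.list_blobs(
      prefix="dataset/",
      page_size=2500,
      fields="items(name),nextPageToken",
    )
    names = [blob.name for blob in blobs]
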
@@ -835,6 +865,24 @@ class GoogleCloudStorageInterface(StorageInterface):
        yield filename

+   @retry
+   def subtree_size(self, prefix:str = "") -> int:
+     layer_path = self.get_path_to_file("")
+     path = posixpath.join(layer_path, prefix)
+
+     blobs = self._bucket.list_blobs(
+       prefix=path,
+       page_size=5000,
+       fields="items(name,size),nextPageToken",
+     )
+
+     total_bytes = 0
+     for page in blobs.pages:
+       for blob in page:
+         total_bytes += blob.size
+
+     return total_bytes
+
    def release_connection(self):
      global GC_POOL
      with GCS_BUCKET_POOL_LOCK:
@@ -882,6 +930,8 @@ class HttpInterface(StorageInterface):
      key = self.get_path_to_file(file_path)
      headers = self.default_headers()
      with self.session.head(key, headers=headers) as resp:
+       if resp.status_code in (404, 403):
+         return None
        resp.raise_for_status()
        return resp.headers

@@ -889,6 +939,9 @@ class HttpInterface(StorageInterface):
      headers = self.head(file_path)
      return int(headers["Content-Length"])

+   def subtree_size(self, prefix:str = "") -> int:
+     raise NotImplementedError()
+
    @retry
    def get_file(self, file_path, start=None, end=None, part_size=None):
      key = self.get_path_to_file(file_path)
@@ -899,24 +952,20 @@ class HttpInterface(StorageInterface):
      end = int(end - 1) if end is not None else ''
      headers["Range"] = f"bytes={start}-{end}"

-     resp = self.session.get(key, headers=headers)
-
-     if resp.status_code in (404, 403):
-       return (None, None, None, None)
-     resp.close()
-     resp.raise_for_status()
+     with self.session.get(key, headers=headers, stream=True) as resp:
+       if resp.status_code in (404, 403):
+         return (None, None, None, None)
+       resp.raise_for_status()
+       resp.raw.decode_content = False
+       content = resp.raw.read()
+       content_encoding = resp.headers.get('Content-Encoding', None)

      # Don't check MD5 for http because the etag can come in many
      # forms from either GCS, S3 or another service entirely. We
      # probably won't figure out how to decode it right.
      # etag = resp.headers.get('etag', None)
-     content_encoding = resp.headers.get('Content-Encoding', None)
-
-     # requests automatically decodes these
-     if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-       content_encoding = None

-     return (resp.content, content_encoding, None, None)
+     return (content, content_encoding, None, None)

    @retry
    def save_file(self, src, dest, resumable) -> tuple[bool, int]:
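
stream=True defers the body download, and clearing resp.raw.decode_content before resp.raw.read() yields the bytes exactly as sent on the wire, without requests transparently decompressing gzip/deflate/br. That is what lets raw=True round-trip compressed content. A minimal sketch of the pattern with a hypothetical URL:

    import requests

    with requests.get("https://example.com/data.gz", stream=True) as resp:
      resp.raise_for_status()
      resp.raw.decode_content = False  # keep the body exactly as sent
      payload = resp.raw.read()
      encoding = resp.headers.get("Content-Encoding")  # e.g. "gzip"
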
@@ -1017,7 +1066,6 @@ class HttpInterface(StorageInterface):
      )

      for res in results.get("items", []):
-       print(res["name"])
        yield res["name"].removeprefix(strip)

      token = results.get("nextPageToken", None)

@@ -1490,6 +1538,44 @@ class S3Interface(StorageInterface):
      for filename in iterate(resp):
        yield filename

+   def subtree_size(self, prefix:str = "") -> int:
+     layer_path = self.get_path_to_file("")
+     path = posixpath.join(layer_path, prefix)
+
+     @retry
+     def s3lst(path, continuation_token=None):
+       kwargs = {
+         'Bucket': self._path.bucket,
+         'Prefix': path,
+         **self._additional_attrs
+       }
+
+       if continuation_token:
+         kwargs['ContinuationToken'] = continuation_token
+
+       return self._conn.list_objects_v2(**kwargs)
+
+     resp = s3lst(path)
+
+     def iterate(resp):
+       if 'Contents' not in resp.keys():
+         resp['Contents'] = []
+
+       for item in resp['Contents']:
+         yield item.get('Size', 0)
+
+     total_bytes = 0
+     for num_bytes in iterate(resp):
+       total_bytes += num_bytes
+
+     while resp['IsTruncated'] and resp['NextContinuationToken']:
+       resp = s3lst(path, resp['NextContinuationToken'])
+
+       for num_bytes in iterate(resp):
+         total_bytes += num_bytes
+
+     return total_bytes
+
    def release_connection(self):
      global S3_POOL
      service = self._path.alias or 's3'
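
list_objects_v2 returns at most 1000 keys per call, so the loop follows NextContinuationToken while IsTruncated is set; each Contents entry already includes Size, so no per-object HEAD requests are needed. The same pagination, sketched with a plain boto3 client and a hypothetical bucket:

    import boto3

    s3 = boto3.client("s3")
    total = 0
    kwargs = {"Bucket": "my-bucket", "Prefix": "dataset/"}  # hypothetical

    while True:
      resp = s3.list_objects_v2(**kwargs)
      total += sum(item["Size"] for item in resp.get("Contents", []))
      if not resp.get("IsTruncated"):
        break
      kwargs["ContinuationToken"] = resp["NextContinuationToken"]
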
{cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles/paths.py

@@ -22,7 +22,7 @@ PRECOMPUTED_SUFFIX = '|neuroglancer-precomputed:'

  ALIAS_FILE = os.path.join(CLOUD_FILES_DIR, "aliases.json")
  OFFICIAL_ALIASES = {
-   "nokura": "s3://https://nokura.pni.princeton.edu/",
+   "nokura": "s3://https://c10s.pni.princeton.edu/",
    "matrix": "s3://https://s3-hpcrc.rc.princeton.edu/",
    "tigerdata": "s3://https://td.princeton.edu/",
  }
{cloud_files-5.8.1 → cloud_files-5.9.0}/cloudfiles_cli/cloudfiles_cli.py

@@ -809,7 +809,10 @@ def du(paths, grand_total, summarize, human_readable):
      npath = normalize_path(path)
      if ispathdir(path):
        cf = CloudFiles(npath)
-       results.append(cf.size(cf.list()))
+       if summarize:
+         results.append(cf.subtree_size())
+       else:
+         results.append(cf.size(cf.list()))
      else:
        cf = CloudFiles(os.path.dirname(npath))
        sz = cf.size(os.path.basename(npath))
@@ -839,7 +842,10 @@ def du(paths, grand_total, summarize, human_readable):

    summary = {}
    for path, res in zip(paths, results):
-     summary[path] = sum(res.values())
+     if isinstance(res, int):
+       summary[path] = res
+     else:
+       summary[path] = sum(res.values())
    if summarize:
      print(f"{SI(summary[path])}\t{path}")
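
Assuming the du command exposes summarize as a -s/--summarize flag in the style of coreutils du (the flag spelling is an assumption inferred from the function signature, and the bucket path is hypothetical), an invocation such as `cloudfiles du -s gs://my-bucket/dataset/` now takes the subtree_size() fast path instead of listing every object and summing sizes client-side.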
 
cloud_files-5.8.1/cloud_files.egg-info/pbr.json (deleted)

@@ -1 +0,0 @@
- {"git_version": "add0684", "is_release": true}