cloud-files 5.8.2__py3-none-any.whl → 5.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.8.2
+Version: 5.9.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
@@ -1,10 +1,10 @@
 cloudfiles/__init__.py,sha256=pLB4CcV2l3Jgv_ni1520Np1pfzFj8Cpr87vNxFT3rNI,493
-cloudfiles/cloudfiles.py,sha256=tPG1PBLEjABPu-KLe93yf6xW_zbafPsQ6z5NuofyUoU,56743
+cloudfiles/cloudfiles.py,sha256=eUFf_PKaLtOIkDmGjDRggPMkMY46BHrXOvNSoAnsDWU,57930
 cloudfiles/compression.py,sha256=WXJHnoNLJ_NWyoY9ygZmFA2qMou35_9xS5dzF7-2H-M,6262
 cloudfiles/connectionpools.py,sha256=aL8RiSjRepECfgAFmJcz80aJFKbou7hsbuEgugDKwB8,4814
 cloudfiles/exceptions.py,sha256=N0oGQNG-St6RvnT8e5p_yC_E61q2kgAe2scwAL0F49c,843
 cloudfiles/gcs.py,sha256=unqu5KxGKaPq6N4QeHSpCDdtnK1BzPOAerTZ8FLt2_4,3820
-cloudfiles/interfaces.py,sha256=M62UdugtWcF-J4iQMClHNDEQYu7xxCSc1aT7WW2C1lU,44942
+cloudfiles/interfaces.py,sha256=Eurpmwv6sbn44AfPGp1Pahb2drhqN9lo5J7CRDTyzWU,47190
 cloudfiles/lib.py,sha256=HHjCvjmOjA0nZWSvHGoSeYpxqd6FAG8xk8LM212LAUA,5382
 cloudfiles/monitoring.py,sha256=N5Xq0PYZK1OxoYtwBFsnnfaq7dElTgY8Rn2Ez_I3aoo,20897
 cloudfiles/paths.py,sha256=FLdShqkOg1XlkHurU9eiKzLadx2JFYG1EmleCpOFsYQ,12229
@@ -16,12 +16,12 @@ cloudfiles/threaded_queue.py,sha256=Nl4vfXhQ6nDLF8PZpSSBpww0M2zWtcd4DLs3W3BArBw,
 cloudfiles/typing.py,sha256=f3ZYkNfN9poxhGu5j-P0KCxjCCqSn9HAg5KiIPkjnCg,416
 cloudfiles_cli/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
 cloudfiles_cli/__init__.py,sha256=Wftt3R3F21QsHtWqx49ODuqT9zcSr0em7wk48kcH0WM,29
-cloudfiles_cli/cloudfiles_cli.py,sha256=JlP9ocqxZbMANAZhZCQSvvjwe6syovQ1asUzSeAlNYk,38459
-cloud_files-5.8.2.dist-info/AUTHORS,sha256=BFVmobgAhaVFI5fqbuqAY5XmBQxe09ZZAsAOTy87hKQ,318
-cloud_files-5.8.2.dist-info/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
-cloud_files-5.8.2.dist-info/METADATA,sha256=iMhQdNleZM5bNnHKDZ97QNjVuA-7GIWIMHa_wZtePLU,30530
-cloud_files-5.8.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-cloud_files-5.8.2.dist-info/entry_points.txt,sha256=xlirb1FVhn1mbcv4IoyMEGumDqKOA4VMVd3drsRQxIg,51
-cloud_files-5.8.2.dist-info/pbr.json,sha256=geeaELiKgs-Cl3LBIxRry_acNuF2kxgqEmZPNhjainY,46
-cloud_files-5.8.2.dist-info/top_level.txt,sha256=xPyrST3okJbsmdCF5IC2gYAVxg_aD5AYVTnNo8UuoZU,26
-cloud_files-5.8.2.dist-info/RECORD,,
+cloudfiles_cli/cloudfiles_cli.py,sha256=GTQj0UZB34Cfy4q-hIbXqRUnbLYCTQ6OeXjAb930i5Q,38602
+cloud_files-5.9.0.dist-info/AUTHORS,sha256=BFVmobgAhaVFI5fqbuqAY5XmBQxe09ZZAsAOTy87hKQ,318
+cloud_files-5.9.0.dist-info/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
+cloud_files-5.9.0.dist-info/METADATA,sha256=4qhGrbkuqEdwCuq-Nqedo7nBNn_QkA5qHFLxfskqid4,30530
+cloud_files-5.9.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+cloud_files-5.9.0.dist-info/entry_points.txt,sha256=xlirb1FVhn1mbcv4IoyMEGumDqKOA4VMVd3drsRQxIg,51
+cloud_files-5.9.0.dist-info/pbr.json,sha256=9M5V77fSgk_LF2IUco2G8NcksQ_1cmz7cGYU3OSqRzY,46
+cloud_files-5.9.0.dist-info/top_level.txt,sha256=xPyrST3okJbsmdCF5IC2gYAVxg_aD5AYVTnNo8UuoZU,26
+cloud_files-5.9.0.dist-info/RECORD,,
cloud_files-5.9.0.dist-info/pbr.json ADDED
@@ -0,0 +1 @@
+{"git_version": "623052c", "is_release": true}
cloudfiles/cloudfiles.py CHANGED
@@ -18,6 +18,7 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
 import time
 
@@ -1007,6 +1008,34 @@ class CloudFiles:
       return results
     return first(results.values())
 
+  def subtree_size(self, prefix:GetPathType = "") -> int:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_bytes = conn.subtree_size(prefix)
+        with lock:
+          total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return total_bytes
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
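
The new CloudFiles.subtree_size fans out one conn.subtree_size(prefix) call per requested prefix across the thread (or green thread) pool and accumulates the per-prefix byte counts under a lock. A minimal usage sketch, assuming a hypothetical bucket path:

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset")    # hypothetical path
    print(cf.subtree_size())                     # total bytes under the path
    print(cf.subtree_size(["raw/", "meshes/"]))  # several subtrees, summed in parallel
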
@@ -1666,6 +1695,12 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
 
+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
   def dirname(self, path:str) -> str:
     if self._path.protocol == "file":
       return os.path.dirname(path)
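
The sep property mirrors the existing join/dirname logic: local "file" paths use the platform separator, while every cloud protocol is POSIX. An illustration with hypothetical paths:

    CloudFiles("file:///tmp/data").sep  # os.sep: "/" on Linux/macOS, "\\" on Windows
    CloudFiles("s3://bucket/data").sep  # always "/"
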
@@ -1706,11 +1741,17 @@ class CloudFiles:
 
 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1719,6 +1760,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)
 
@@ -1726,6 +1771,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None
 
+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
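
CloudFile now accepts the same connection options as CloudFiles and forwards them directly to its internal CloudFiles instance. A usage sketch, assuming a hypothetical S3-compatible endpoint and object:

    from cloudfiles import CloudFile

    cf = CloudFile(
      "s3://my-bucket/image.npy",            # hypothetical object
      endpoint="https://minio.example.com",  # e.g. a MinIO server
      no_sign_request=True,                  # anonymous reads on a public bucket
    )
    data = cf.get()
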
cloudfiles/interfaces.py CHANGED
@@ -48,6 +48,7 @@ MEM_POOL = None
 
 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }
 
 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -303,6 +304,19 @@ class FileInterface(StorageInterface):
 
     return self.io_with_lock(do_size, path, exclusive=False)
 
+  def subtree_size(self, prefix:str = "") -> int:
+    total_bytes = 0
+
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+    for root, dirs, files in os.walk(subdir):
+      files = ( os.path.join(root, f) for f in files )
+      total_bytes += sum(( os.path.getsize(f) for f in files ))
+
+    return total_bytes
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
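
On the local filesystem the size calculation reduces to an os.walk over the subtree. A self-contained sketch of the same pattern, with a hypothetical directory:

    import os

    def dir_size(subdir: str) -> int:
      total = 0
      for root, dirs, files in os.walk(subdir):
        # sum the on-disk size of every regular file below subdir
        total += sum(os.path.getsize(os.path.join(root, f)) for f in files)
      return total

    print(dir_size("/tmp/dataset"))
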
@@ -580,8 +594,7 @@ class MemoryInterface(StorageInterface):
 
     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")
 
     remove = layer_path
     if len(remove) and remove[-1] != '/':
@@ -615,6 +628,21 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)
 
+  def subtree_size(self, prefix:str = "") -> int:
+    layer_path = self.get_path_to_file("")
+
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+
+    total_bytes = 0
+    for filename, binary in self._data.items():
+      f_prefix = filename.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+
+    return total_bytes
+
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -816,6 +844,8 @@ class GoogleCloudStorageInterface(StorageInterface):
     blobs = self._bucket.list_blobs(
       prefix=path,
       delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
     )
 
     for page in blobs.pages:
@@ -835,6 +865,24 @@ class GoogleCloudStorageInterface(StorageInterface):
       yield filename
 
 
+  @retry
+  def subtree_size(self, prefix:str = "") -> int:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+
+    total_bytes = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+
+    return total_bytes
+
   def release_connection(self):
     global GC_POOL
     with GCS_BUCKET_POOL_LOCK:
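
Restricting fields to the object name and size keeps each listing page small, since sizes are all this method needs. A standalone sketch of the same pattern with google-cloud-storage, using a hypothetical bucket and prefix:

    from google.cloud import storage

    client = storage.Client()
    blobs = client.bucket("my-bucket").list_blobs(
      prefix="dataset/",
      page_size=5000,
      fields="items(name,size),nextPageToken",  # fetch only what we sum
    )
    total_bytes = sum(blob.size for blob in blobs)
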
@@ -882,6 +930,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers
 
@@ -889,6 +939,9 @@ class HttpInterface(StorageInterface):
     headers = self.head(file_path)
     return int(headers["Content-Length"])
 
+  def subtree_size(self, prefix:str = "") -> int:
+    raise NotImplementedError()
+
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -899,24 +952,20 @@ class HttpInterface(StorageInterface):
       end = int(end - 1) if end is not None else ''
       headers["Range"] = f"bytes={start}-{end}"
 
-    resp = self.session.get(key, headers=headers)
-
-    if resp.status_code in (404, 403):
-      return (None, None, None, None)
-    resp.close()
-    resp.raise_for_status()
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)
 
     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None
 
-    return (resp.content, content_encoding, None, None)
+    return (content, content_encoding, None, None)
 
   @retry
   def save_file(self, src, dest, resumable) -> tuple[bool, int]:
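
Reading resp.raw with decode_content disabled hands back the bytes exactly as the server sent them, so a gzip or br Content-Encoding survives for the caller to decode instead of requests silently inflating it. A minimal sketch of the pattern, with a hypothetical URL:

    import requests

    with requests.get("https://example.com/data.gz", stream=True) as resp:
      resp.raise_for_status()
      resp.raw.decode_content = False  # do NOT let urllib3 inflate the body
      content = resp.raw.read()        # still compressed if sent that way
      encoding = resp.headers.get("Content-Encoding")
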
@@ -1017,7 +1066,6 @@ class HttpInterface(StorageInterface):
     )
 
     for res in results.get("items", []):
-      print(res["name"])
       yield res["name"].removeprefix(strip)
 
     token = results.get("nextPageToken", None)
@@ -1490,6 +1538,44 @@ class S3Interface(StorageInterface):
     for filename in iterate(resp):
       yield filename
 
+  def subtree_size(self, prefix:str = "") -> int:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+
+      return self._conn.list_objects_v2(**kwargs)
+
+    resp = s3lst(path)
+
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+
+    total_bytes = 0
+    for num_bytes in iterate(resp):
+      total_bytes += num_bytes
+
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+
+      for num_bytes in iterate(resp):
+        total_bytes += num_bytes
+
+    return total_bytes
+
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'
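
The hand-rolled ContinuationToken loop can also be expressed with boto3's built-in paginator; a sketch under the assumption of a hypothetical bucket and prefix:

    import boto3

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    total_bytes = 0
    for page in paginator.paginate(Bucket="my-bucket", Prefix="dataset/"):
      total_bytes += sum(obj["Size"] for obj in page.get("Contents", []))
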
cloudfiles_cli/cloudfiles_cli.py CHANGED
@@ -809,7 +809,10 @@ def du(paths, grand_total, summarize, human_readable):
     npath = normalize_path(path)
     if ispathdir(path):
       cf = CloudFiles(npath)
-      results.append(cf.size(cf.list()))
+      if summarize:
+        results.append(cf.subtree_size())
+      else:
+        results.append(cf.size(cf.list()))
     else:
       cf = CloudFiles(os.path.dirname(npath))
       sz = cf.size(os.path.basename(npath))
@@ -839,7 +842,10 @@ def du(paths, grand_total, summarize, human_readable):
 
   summary = {}
   for path, res in zip(paths, results):
-    summary[path] = sum(res.values())
+    if isinstance(res, int):
+      summary[path] = res
+    else:
+      summary[path] = sum(res.values())
     if summarize:
       print(f"{SI(summary[path])}\t{path}")
 
cloud_files-5.8.2.dist-info/pbr.json DELETED
@@ -1 +0,0 @@
-{"git_version": "99528f8", "is_release": true}