PyPI - cloud-files - Versions diffs - 5.7.0__py3-none-any.whl → 6.0.0__py3-none-any.whl - Mend

cloud-files 5.7.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.7.0
+Version: 6.0.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 cloudfiles/__init__.py,sha256=pLB4CcV2l3Jgv_ni1520Np1pfzFj8Cpr87vNxFT3rNI,493
-cloudfiles/cloudfiles.py,sha256=tPG1PBLEjABPu-KLe93yf6xW_zbafPsQ6z5NuofyUoU,56743
+cloudfiles/cloudfiles.py,sha256=SlSkGKCBdnne7vE7Y5_tvsMcFNx_coUWMtDajEnbHfY,58093
 cloudfiles/compression.py,sha256=WXJHnoNLJ_NWyoY9ygZmFA2qMou35_9xS5dzF7-2H-M,6262
 cloudfiles/connectionpools.py,sha256=aL8RiSjRepECfgAFmJcz80aJFKbou7hsbuEgugDKwB8,4814
 cloudfiles/exceptions.py,sha256=N0oGQNG-St6RvnT8e5p_yC_E61q2kgAe2scwAL0F49c,843
 cloudfiles/gcs.py,sha256=unqu5KxGKaPq6N4QeHSpCDdtnK1BzPOAerTZ8FLt2_4,3820
-cloudfiles/interfaces.py,sha256=5rUh2DWOVlg13fAxyZ0wAaQyfW04xc2zlUfTItFV-zQ,45325
+cloudfiles/interfaces.py,sha256=Kg5t2-tWD0EoJ0qK-wid7zdxLgs7q0mDduPxAzyUUL0,47499
 cloudfiles/lib.py,sha256=HHjCvjmOjA0nZWSvHGoSeYpxqd6FAG8xk8LM212LAUA,5382
 cloudfiles/monitoring.py,sha256=N5Xq0PYZK1OxoYtwBFsnnfaq7dElTgY8Rn2Ez_I3aoo,20897
-cloudfiles/paths.py,sha256=HOvtdLSIYGwlwvnZt9d_Ez3TXOe7WWd18bZNDpExUDQ,12231
+cloudfiles/paths.py,sha256=FLdShqkOg1XlkHurU9eiKzLadx2JFYG1EmleCpOFsYQ,12229
 cloudfiles/resumable_tools.py,sha256=NyuSoGh1SaP5akrHCpd9kgy2-JruEWrHW9lvJxV7jpE,6711
 cloudfiles/scheduler.py,sha256=ioqBT5bMPCVHEHlnL-SW_wHulxGgjeThiKHlnaDOydo,3831
 cloudfiles/secrets.py,sha256=IuYKHmmvFmQTyG2Zcmbx7e8U2LIv-woG5d8qyOlyCD8,5431
@@ -16,12 +16,12 @@ cloudfiles/threaded_queue.py,sha256=Nl4vfXhQ6nDLF8PZpSSBpww0M2zWtcd4DLs3W3BArBw,
 cloudfiles/typing.py,sha256=f3ZYkNfN9poxhGu5j-P0KCxjCCqSn9HAg5KiIPkjnCg,416
 cloudfiles_cli/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
 cloudfiles_cli/__init__.py,sha256=Wftt3R3F21QsHtWqx49ODuqT9zcSr0em7wk48kcH0WM,29
-cloudfiles_cli/cloudfiles_cli.py,sha256=k5_bMUcjDM2o-HjgwSaK6rT51t91nYjSAy3xZHf-qSs,38128
-cloud_files-5.7.0.dist-info/AUTHORS,sha256=BFVmobgAhaVFI5fqbuqAY5XmBQxe09ZZAsAOTy87hKQ,318
-cloud_files-5.7.0.dist-info/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
-cloud_files-5.7.0.dist-info/METADATA,sha256=oiedYRc-OIb1u8yRqMKOOKtvEJHj5phtQd-0V-cEqfI,30530
-cloud_files-5.7.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-cloud_files-5.7.0.dist-info/entry_points.txt,sha256=xlirb1FVhn1mbcv4IoyMEGumDqKOA4VMVd3drsRQxIg,51
-cloud_files-5.7.0.dist-info/pbr.json,sha256=btfjSn_FM-LMcr5pxgz5jRJ_ImTfyOwfBZgFRavgNP8,46
-cloud_files-5.7.0.dist-info/top_level.txt,sha256=xPyrST3okJbsmdCF5IC2gYAVxg_aD5AYVTnNo8UuoZU,26
-cloud_files-5.7.0.dist-info/RECORD,,
+cloudfiles_cli/cloudfiles_cli.py,sha256=jHbQasZb5DB_g8nGxS3Y0ekAdIPcSVrHN5mvEedUl0k,38908
+cloud_files-6.0.0.dist-info/AUTHORS,sha256=BFVmobgAhaVFI5fqbuqAY5XmBQxe09ZZAsAOTy87hKQ,318
+cloud_files-6.0.0.dist-info/LICENSE,sha256=Jna4xYE8CCQmaxjr5Fs-wmUBnIQJ1DGcNn9MMjbkprk,1538
+cloud_files-6.0.0.dist-info/METADATA,sha256=SJw22OqzxSN3BvyacUjQgJ1trdAWs4mJv9hC0LYKQZk,30530
+cloud_files-6.0.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+cloud_files-6.0.0.dist-info/entry_points.txt,sha256=xlirb1FVhn1mbcv4IoyMEGumDqKOA4VMVd3drsRQxIg,51
+cloud_files-6.0.0.dist-info/pbr.json,sha256=P1Yg68JWbSeMCxsbPR-QhAUj2p8rzNNuqgMHtcFAveo,46
+cloud_files-6.0.0.dist-info/top_level.txt,sha256=xPyrST3okJbsmdCF5IC2gYAVxg_aD5AYVTnNo8UuoZU,26
+cloud_files-6.0.0.dist-info/RECORD,,

cloud_files-6.0.0.dist-info/pbr.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"git_version": "38a2b59", "is_release": true}

cloudfiles/cloudfiles.py CHANGED Viewed

@@ -18,6 +18,7 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
 import time
@@ -1007,6 +1008,40 @@ class CloudFiles:
       return results
     return first(results.values())
+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+    total = totalfn(prefix, None)
+    lock = threading.Lock()
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+        with lock:
+          total_files += subtree_files
+          total_bytes += subtree_bytes
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
@@ -1666,6 +1701,12 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
   def dirname(self, path:str) -> str:
     if self._path.protocol == "file":
       return os.path.dirname(path)
@@ -1706,11 +1747,17 @@ class CloudFiles:
 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1719,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)
@@ -1726,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None
+  @property
+  def sep(self) -> str:
+    return self.cf.sep
   @property
   def protocol(self):
     return self.cf.protocol

cloudfiles/interfaces.py CHANGED Viewed

@@ -48,6 +48,7 @@ MEM_POOL = None
 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }
 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -303,6 +304,22 @@ class FileInterface(StorageInterface):
     return self.io_with_lock(do_size, path, exclusive=False)
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    total_bytes = 0
+    total_files = 0
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+    for root, dirs, files in os.walk(subdir):
+      for f in files:
+          path = os.path.join(root, f)
+          total_files += 1
+          total_bytes += os.path.getsize(path)
+    return (total_files, total_bytes)
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
@@ -580,8 +597,7 @@ class MemoryInterface(StorageInterface):
     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")
     remove = layer_path
     if len(remove) and remove[-1] != '/':
@@ -615,6 +631,23 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+    total_bytes = 0
+    total_files = 0
+    for filename, binary in self._data.items():
+      f_prefix = f.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+        total_files += 1
+    return (total_files, total_bytes)
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -798,6 +831,7 @@ class GoogleCloudStorageInterface(StorageInterface):
       except google.cloud.exceptions.NotFound:
         pass
   @retry
   def list_files(self, prefix, flat=False):
     """
@@ -815,35 +849,46 @@ class GoogleCloudStorageInterface(StorageInterface):
     blobs = self._bucket.list_blobs(
       prefix=path,
       delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
     )
-    first = True
-    for blob in blobs:
-      # This awkward construction is necessary
-      # because the request that populates prefixes
-      # isn't made until the iterator is activated.
-      if first and blobs.prefixes:
+    for page in blobs.pages:
+      if page.prefixes:
         yield from (
           item.removeprefix(path)
-          for item in blobs.prefixes
+          for item in page.prefixes
         )
-        first = False
-      filename = blob.name.removeprefix(layer_path)
-      if not filename:
-        continue
-      elif not flat and filename[-1] != '/':
-        yield filename
-      elif flat and '/' not in blob.name.removeprefix(path):
-        yield filename
+      for blob in page:
+        filename = blob.name.removeprefix(layer_path)
+        if not filename:
+          continue
+        elif not flat and filename[-1] != '/':
+          yield filename
+        elif flat and '/' not in blob.name.removeprefix(path):
+          yield filename
-    # When there are no regular items at this level
-    # we need to still print the directories.
-    if first and blobs.prefixes:
-      yield from (
-        item.removeprefix(path)
-        for item in blobs.prefixes
-      )
+  @retry
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+    total_bytes = 0
+    total_files = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+        total_files += 1
+    return (total_files, total_bytes)
   def release_connection(self):
     global GC_POOL
@@ -892,6 +937,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers
@@ -899,6 +946,9 @@ class HttpInterface(StorageInterface):
     headers = self.head(file_path)
     return int(headers["Content-Length"])
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    raise NotImplementedError()
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -909,24 +959,20 @@ class HttpInterface(StorageInterface):
       end = int(end - 1) if end is not None else ''
       headers["Range"] = f"bytes={start}-{end}"
-    resp = self.session.get(key, headers=headers)
-    if resp.status_code in (404, 403):
-      return (None, None, None, None)
-    resp.close()
-    resp.raise_for_status()
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)
     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None
-    return (resp.content, content_encoding, None, None)
+    return (content, content_encoding, None, None)
   @retry
   def save_file(self, src, dest, resumable) -> tuple[bool, int]:
@@ -1027,7 +1073,6 @@ class HttpInterface(StorageInterface):
         )
       for res in results.get("items", []):
-        print(res["name"])
         yield res["name"].removeprefix(strip)
       token = results.get("nextPageToken", None)
@@ -1500,6 +1545,47 @@ class S3Interface(StorageInterface):
       for filename in iterate(resp):
         yield filename
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+      return self._conn.list_objects_v2(**kwargs)
+    resp = s3lst(path)
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+    total_bytes = 0
+    total_files = 0
+    for num_bytes in iterate(resp):
+      total_files += 1
+      total_bytes += num_bytes
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+      for num_bytes in iterate(resp):
+        total_files += 1
+        total_bytes += num_bytes
+    return (total_files, total_bytes)
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'

cloudfiles/paths.py CHANGED Viewed

@@ -22,7 +22,7 @@ PRECOMPUTED_SUFFIX = '|neuroglancer-precomputed:'
 ALIAS_FILE = os.path.join(CLOUD_FILES_DIR, "aliases.json")
 OFFICIAL_ALIASES = {
-  "nokura": "s3://https://nokura.pni.princeton.edu/",
+  "nokura": "s3://https://c10s.pni.princeton.edu/",
   "matrix": "s3://https://s3-hpcrc.rc.princeton.edu/",
   "tigerdata": "s3://https://td.princeton.edu/",
 }

cloudfiles_cli/cloudfiles_cli.py CHANGED Viewed

@@ -182,6 +182,19 @@ def get_mfp(path, recursive):
   return (many, flat, prefix, suffix)
+@main.command("mkdir")
+@click.argument("paths", nargs=-1)
+def _mkdir(paths):
+  """
+  Create paths on the local file system.
+  """
+  for path in paths:
+    path = normalize_path(path)
+    protocol = get_protocol(path)
+    if protocol == "file":
+      mkdir(path.replace("file://", "", 1))
 @main.command()
 @click.argument("source", nargs=-1)
 @click.argument("destination", nargs=1)
@@ -588,6 +601,7 @@ def touch(
   ctx, sources,
   progress, no_sign_request,
 ):
+  """Create file if it doesn't exist."""
   sources = list(map(normalize_path, sources))
   sources = [ src.replace("precomputed://", "") for src in sources ]
   pbar = tqdm(total=len(sources), desc="Touch", disable=(not progress))
@@ -788,14 +802,22 @@ def __rm(cloudpath, progress, paths):
 @click.option('-c', '--grand-total', is_flag=True, default=False, help="Sum a grand total of all inputs.")
 @click.option('-s', '--summarize', is_flag=True, default=False, help="Sum a total for each input argument.")
 @click.option('-h', '--human-readable', is_flag=True, default=False, help='"Human-readable" output. Use unit suffixes: Bytes, KiB, MiB, GiB, TiB, PiB, and EiB.')
-def du(paths, grand_total, summarize, human_readable):
+@click.option('-N', '--count-files', is_flag=True, default=False, help='Also report the number of files.')
+def du(paths, grand_total, summarize, human_readable, count_files):
   """Display disk usage statistics."""
   results = []
+  list_data = False
   for path in paths:
     npath = normalize_path(path)
     if ispathdir(path):
       cf = CloudFiles(npath)
-      results.append(cf.size(cf.list()))
+      if summarize:
+        results.append(cf.subtree_size())
+      else:
+        list_data = True
+        results.append(cf.size(cf.list()))
     else:
       cf = CloudFiles(os.path.dirname(npath))
       sz = cf.size(os.path.basename(npath))
@@ -824,8 +846,15 @@ def du(paths, grand_total, summarize, human_readable):
       return f"{(val / 2**60):.2f} EiB"
   summary = {}
+  num_files = 0
   for path, res in zip(paths, results):
-    summary[path] = sum(res.values())
+    if list_data:
+      summary[path] = sum(res.values())
+      num_files += len(res)
+    else:
+      summary[path] = res["num_bytes"]
+      num_files += res["N"]
     if summarize:
       print(f"{SI(summary[path])}\t{path}")
@@ -835,7 +864,10 @@ def du(paths, grand_total, summarize, human_readable):
         print(f"{SI(size)}\t{pth}")
   if grand_total:
-    print(f"{SI(sum(summary.values()))}\ttotal")
+    print(f"{SI(sum(summary.values()))}\tbytes total")
+  if count_files:
+    print(f"{num_files}\tfiles total")
 @main.command()
 @click.argument('paths', nargs=-1)

cloud_files-5.7.0.dist-info/pbr.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"git_version": "cab2668", "is_release": true}

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/AUTHORS RENAMED Viewed

File without changes

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{cloud_files-5.7.0.dist-info → cloud_files-6.0.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

cloud-files 5.7.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

cloud-files 5.7.0__py3-none-any.whl → 6.0.0__py3-none-any.whl