cloud-files 5.8.2.tar.gz → 6.0.0.tar.gz

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. {cloud_files-5.8.2 → cloud_files-6.0.0}/.github/workflows/test-suite.yml +2 -2
  2. {cloud_files-5.8.2 → cloud_files-6.0.0}/ChangeLog +20 -0
  3. {cloud_files-5.8.2 → cloud_files-6.0.0}/PKG-INFO +1 -1
  4. {cloud_files-5.8.2 → cloud_files-6.0.0}/automated_test.py +1 -1
  5. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/PKG-INFO +1 -1
  6. cloud_files-6.0.0/cloud_files.egg-info/pbr.json +1 -0
  7. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/cloudfiles.py +56 -1
  8. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/interfaces.py +111 -15
  9. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/cloudfiles_cli.py +22 -4
  10. cloud_files-5.8.2/cloud_files.egg-info/pbr.json +0 -1
  11. {cloud_files-5.8.2 → cloud_files-6.0.0}/AUTHORS +0 -0
  12. {cloud_files-5.8.2 → cloud_files-6.0.0}/LICENSE +0 -0
  13. {cloud_files-5.8.2 → cloud_files-6.0.0}/MANIFEST.in +0 -0
  14. {cloud_files-5.8.2 → cloud_files-6.0.0}/README.md +0 -0
  15. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/SOURCES.txt +0 -0
  16. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/dependency_links.txt +0 -0
  17. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/entry_points.txt +0 -0
  18. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/not-zip-safe +0 -0
  19. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/requires.txt +0 -0
  20. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloud_files.egg-info/top_level.txt +0 -0
  21. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/__init__.py +0 -0
  22. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/compression.py +0 -0
  23. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/connectionpools.py +0 -0
  24. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/exceptions.py +0 -0
  25. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/gcs.py +0 -0
  26. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/lib.py +0 -0
  27. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/monitoring.py +0 -0
  28. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/paths.py +0 -0
  29. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/resumable_tools.py +0 -0
  30. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/scheduler.py +0 -0
  31. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/secrets.py +0 -0
  32. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/test.py +0 -0
  33. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/threaded_queue.py +0 -0
  34. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles/typing.py +0 -0
  35. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/LICENSE +0 -0
  36. {cloud_files-5.8.2 → cloud_files-6.0.0}/cloudfiles_cli/__init__.py +0 -0
  37. {cloud_files-5.8.2 → cloud_files-6.0.0}/requirements.txt +0 -0
  38. {cloud_files-5.8.2 → cloud_files-6.0.0}/setup.cfg +0 -0
  39. {cloud_files-5.8.2 → cloud_files-6.0.0}/setup.py +0 -0
.github/workflows/test-suite.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

     steps:
     - uses: actions/checkout@v2
@@ -25,7 +25,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools wheel
         if [ -f requirements.txt ]; then pip install -e ".[test,monitoring]"; fi
     - name: Test with pytest
       run: |
ChangeLog
@@ -1,6 +1,26 @@
 CHANGES
 =======

+6.0.0
+-----
+
+* feat: add file counts to du as -N flag
+
+5.9.0
+-----
+
+* perf: reduce data loading for list_blobs for GCS
+* perf: memory efficient listing on GCS
+* fix: errant print statement
+* feat: add CloudFile(s).sep
+* fix(https): allow "raw=True" to work
+* fix: don't retry on 403,404 for http head
+* ci: drop py3.9 add py3.14
+* fixtest: try upgrading setuptools
+* feat: add constructor args to CloudFile that are present in CloudFiles
+* perf: faster du using listing (#120)
+* test: change target for nokura
+
 5.8.2
 -----

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.8.2
+Version: 6.0.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
automated_test.py
@@ -757,7 +757,7 @@ def test_to_https_protocol():
   assert pth == "https://s3-hpcrc.rc.princeton.edu/my_bucket/to/heaven"

   pth = to_https_protocol("nokura://my_bucket/to/heaven")
-  assert pth == "https://nokura.pni.princeton.edu/my_bucket/to/heaven"
+  assert pth == "https://c10s.pni.princeton.edu/my_bucket/to/heaven"

   pth = to_https_protocol("tigerdata://my_bucket/to/heaven")
   assert pth == "https://td.princeton.edu/my_bucket/to/heaven"
cloud_files.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.8.2
+Version: 6.0.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
cloud_files-6.0.0/cloud_files.egg-info/pbr.json (added)
@@ -0,0 +1 @@
+{"git_version": "38a2b59", "is_release": true}
cloudfiles/cloudfiles.py
@@ -18,6 +18,7 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
 import time

@@ -1007,6 +1008,40 @@ class CloudFiles:
       return results
     return first(results.values())

+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+        with lock:
+          total_files += subtree_files
+          total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
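For orientation, a minimal usage sketch of the new method (the bucket path is hypothetical). It returns a dict with "N" (file count) and "num_bytes" (total size), fanning the per-prefix work out over the interface implementations shown further below:

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical path
    stats = cf.subtree_size()
    print(stats["N"], "files,", stats["num_bytes"], "bytes")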
@@ -1666,6 +1701,12 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)

+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
   def dirname(self, path:str) -> str:
     if self._path.protocol == "file":
       return os.path.dirname(path)
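A short sketch of the new property's behavior, following the protocol branch above (paths are invented):

    from cloudfiles import CloudFiles

    CloudFiles("gs://my-bucket/").sep   # "/" (posixpath.sep for cloud protocols)
    CloudFiles("file:///tmp/data").sep  # os.sep ("\\" on Windows, "/" elsewhere)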
@@ -1706,11 +1747,17 @@ class CloudFiles:

 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1719,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)

@@ -1726,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None

+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
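The widened signature simply forwards the new arguments to the underlying CloudFiles instance. A hedged sketch (the object name and endpoint are invented):

    from cloudfiles import CloudFile

    cf = CloudFile(
      "s3://my-bucket/data.bin",          # hypothetical object
      endpoint="https://s3.example.com",  # hypothetical S3-compatible endpoint
      no_sign_request=True,               # make unsigned/anonymous requests
    )
    data = cf.get()  # read the whole object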
cloudfiles/interfaces.py
@@ -48,6 +48,7 @@ MEM_POOL = None

 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }

 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -303,6 +304,22 @@ class FileInterface(StorageInterface):

     return self.io_with_lock(do_size, path, exclusive=False)

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    total_bytes = 0
+    total_files = 0
+
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+    for root, dirs, files in os.walk(subdir):
+      for f in files:
+        path = os.path.join(root, f)
+        total_files += 1
+        total_bytes += os.path.getsize(path)
+
+    return (total_files, total_bytes)
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
@@ -580,8 +597,7 @@ class MemoryInterface(StorageInterface):

     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")

     remove = layer_path
     if len(remove) and remove[-1] != '/':
@@ -615,6 +631,23 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+
+    total_bytes = 0
+    total_files = 0
+    for filename, binary in self._data.items():
+      f_prefix = filename.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+        total_files += 1
+
+    return (total_files, total_bytes)
+
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -816,6 +849,8 @@ class GoogleCloudStorageInterface(StorageInterface):
     blobs = self._bucket.list_blobs(
       prefix=path,
       delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
     )

     for page in blobs.pages:
@@ -835,6 +870,26 @@ class GoogleCloudStorageInterface(StorageInterface):
       yield filename


+  @retry
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+
+    total_bytes = 0
+    total_files = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+        total_files += 1
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global GC_POOL
     with GCS_BUCKET_POOL_LOCK:
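The fields argument requests a partial response, so each listing page carries only the attributes actually used. A standalone sketch of the same idea with google-cloud-storage (bucket and prefix are invented):

    from google.cloud import storage

    client = storage.Client()
    blobs = client.list_blobs(
      "my-bucket",                              # hypothetical bucket
      prefix="dataset/",
      page_size=5000,
      fields="items(name,size),nextPageToken",  # only name and size per item
    )
    total_bytes = sum(blob.size for blob in blobs)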
@@ -882,6 +937,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers

@@ -889,6 +946,9 @@ class HttpInterface(StorageInterface):
     headers = self.head(file_path)
     return int(headers["Content-Length"])

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    raise NotImplementedError()
+
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -899,24 +959,20 @@ class HttpInterface(StorageInterface):
     end = int(end - 1) if end is not None else ''
     headers["Range"] = f"bytes={start}-{end}"

-    resp = self.session.get(key, headers=headers)
-
-    if resp.status_code in (404, 403):
-      return (None, None, None, None)
-    resp.close()
-    resp.raise_for_status()
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)

     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None

-    return (resp.content, content_encoding, None, None)
+    return (content, content_encoding, None, None)

   @retry
   def save_file(self, src, dest, resumable) -> tuple[bool, int]:
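For context: with plain resp.content, requests transparently decodes gzip/deflate bodies, so the Content-Encoding header no longer matches the bytes returned. Reading the raw stream preserves the wire bytes; a sketch of the pattern (URL invented):

    import requests

    with requests.get("https://example.com/data.gz", stream=True) as resp:
      resp.raise_for_status()
      resp.raw.decode_content = False                  # keep bytes exactly as sent
      payload = resp.raw.read()                        # undecoded body
      encoding = resp.headers.get("Content-Encoding")  # e.g. "gzip"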
@@ -1017,7 +1073,6 @@ class HttpInterface(StorageInterface):
     )

     for res in results.get("items", []):
-      print(res["name"])
       yield res["name"].removeprefix(strip)

     token = results.get("nextPageToken", None)
@@ -1490,6 +1545,47 @@ class S3Interface(StorageInterface):
     for filename in iterate(resp):
       yield filename

+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+
+      return self._conn.list_objects_v2(**kwargs)
+
+    resp = s3lst(path)
+
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+
+    total_bytes = 0
+    total_files = 0
+    for num_bytes in iterate(resp):
+      total_files += 1
+      total_bytes += num_bytes
+
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+
+      for num_bytes in iterate(resp):
+        total_files += 1
+        total_bytes += num_bytes
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'
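The manual ContinuationToken loop is equivalent to boto3's built-in paginator; a hedged sketch of that alternative formulation (bucket and prefix are invented):

    import boto3

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    total_files = total_bytes = 0
    for page in paginator.paginate(Bucket="my-bucket", Prefix="dataset/"):
      for obj in page.get("Contents", []):
        total_files += 1
        total_bytes += obj["Size"]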
cloudfiles_cli/cloudfiles_cli.py
@@ -802,14 +802,22 @@ def __rm(cloudpath, progress, paths):
 @click.option('-c', '--grand-total', is_flag=True, default=False, help="Sum a grand total of all inputs.")
 @click.option('-s', '--summarize', is_flag=True, default=False, help="Sum a total for each input argument.")
 @click.option('-h', '--human-readable', is_flag=True, default=False, help='"Human-readable" output. Use unit suffixes: Bytes, KiB, MiB, GiB, TiB, PiB, and EiB.')
-def du(paths, grand_total, summarize, human_readable):
+@click.option('-N', '--count-files', is_flag=True, default=False, help='Also report the number of files.')
+def du(paths, grand_total, summarize, human_readable, count_files):
   """Display disk usage statistics."""
   results = []
+
+  list_data = False
+
   for path in paths:
     npath = normalize_path(path)
     if ispathdir(path):
       cf = CloudFiles(npath)
-      results.append(cf.size(cf.list()))
+      if summarize:
+        results.append(cf.subtree_size())
+      else:
+        list_data = True
+        results.append(cf.size(cf.list()))
     else:
       cf = CloudFiles(os.path.dirname(npath))
       sz = cf.size(os.path.basename(npath))
@@ -838,8 +846,15 @@ def du(paths, grand_total, summarize, human_readable):
       return f"{(val / 2**60):.2f} EiB"

   summary = {}
+  num_files = 0
   for path, res in zip(paths, results):
-    summary[path] = sum(res.values())
+    if list_data:
+      summary[path] = sum(res.values())
+      num_files += len(res)
+    else:
+      summary[path] = res["num_bytes"]
+      num_files += res["N"]
+
     if summarize:
       print(f"{SI(summary[path])}\t{path}")

@@ -849,7 +864,10 @@ def du(paths, grand_total, summarize, human_readable):
       print(f"{SI(size)}\t{pth}")

   if grand_total:
-    print(f"{SI(sum(summary.values()))}\ttotal")
+    print(f"{SI(sum(summary.values()))}\tbytes total")
+
+  if count_files:
+    print(f"{num_files}\tfiles total")

 @main.command()
 @click.argument('paths', nargs=-1)
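A hypothetical invocation of the updated command (bucket and sizes invented; output shape follows the print statements above). With -s the new subtree_size() fast path is used, and -N appends the file count:

    $ cloudfiles du -s -c -h -N gs://my-bucket/dataset/
    1.50 GiB    gs://my-bucket/dataset/
    1.50 GiB    bytes total
    1042    files total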
cloud_files-5.8.2/cloud_files.egg-info/pbr.json (removed)
@@ -1 +0,0 @@
-{"git_version": "99528f8", "is_release": true}