cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff compares the published contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
cloudfiles/paths.py CHANGED
@@ -1,5 +1,5 @@
  from functools import lru_cache
- from collections import namedtuple
+ from collections import namedtuple, defaultdict
  import orjson
  import os.path
  import posixpath
@@ -8,17 +8,21 @@ import sys
  import urllib.parse

  from typing import Tuple, Optional
+ from .typing import GetPathType

  from .exceptions import UnsupportedProtocolError
- from .lib import yellow, toabs, jsonify, mkdir
+ from .lib import yellow, toabs, jsonify, mkdir, toiter
  from .secrets import CLOUD_FILES_DIR

  ExtractedPath = namedtuple('ExtractedPath',
    ('format', 'protocol', 'bucket', 'path', 'host', 'alias')
  )

+ PRECOMPUTED_SUFFIX = '|neuroglancer-precomputed:'
+
  ALIAS_FILE = os.path.join(CLOUD_FILES_DIR, "aliases.json")
  OFFICIAL_ALIASES = {
+   "nokura": "s3://https://c10s.pni.princeton.edu/",
    "matrix": "s3://https://s3-hpcrc.rc.princeton.edu/",
    "tigerdata": "s3://https://td.princeton.edu/",
  }
@@ -37,6 +41,10 @@ ALLOWED_FORMATS = [
    'render', 'vtk', 'nifti', 'dvid',
  ]

+ CLOUD_PROTOCOLS = [
+   "gs", "s3"
+ ] + list(OFFICIAL_ALIASES.keys())
+
  def update_aliases_from_file():
    global ALIASES_FROM_FILE
    global ALIAS_FILE
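
The new CLOUD_PROTOCOLS list folds the official aliases (matrix, tigerdata, and the newly added nokura) in with gs and s3, so the bucket-handling branches in extract() and to_https_protocol() further down no longer hardcode ('gs', 's3', 'matrix'). A minimal sketch of the effect; the bucket name is hypothetical and the expected output simply follows the replace chain shown in the to_https_protocol() hunk below:

from cloudfiles.paths import to_https_protocol

# Alias protocols now convert the same way gs/s3 do: "nokura://" expands
# to its host entry, then the leading "s3://" wrapper is stripped.
print(to_https_protocol("nokura://my-bucket/layer"))
# expected: https://c10s.pni.princeton.edu/my-bucket/layer
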
@@ -55,11 +63,14 @@ def update_aliases_from_file():

  def cloudpath_error(cloudpath):
    return yellow(f"""
- Cloud Path must conform to [FORMAT://]PROTOCOL://PATH
+ Cloud Path must conform to one of:
+   (a) [FORMAT://]PROTOCOL://PATH
+   (b) PROTOCOL://PATH|FORMAT:
  Examples:
    precomputed://gs://test_bucket/em
    gs://test_bucket/em
    graphene://https://example.com/image/em
+   gs://text_bucket/em/|zarr2:

  Supported Formats: None (precomputed), {", ".join(ALLOWED_FORMATS)}
  Supported Protocols: {", ".join(ALLOWED_PROTOCOLS)}
@@ -84,6 +95,9 @@ def mkregexp():
  CLOUDPATH_REGEXP = re.compile(mkregexp())
  BUCKET_RE = re.compile(r'^(/?[~\d\w_\.\-]+(?::\d+)?)(?:/|$)') # posix /what/a/great/path

+ # |neuroglancer-precomputed: or |zarr2: suffixes etc
+ TAIL_FORMAT_REGEXP = re.compile('\\|(?P<fmt>[\\w\\d-]+):$')
+
  def add_alias(alias:str, host:str):
    global ALIASES
    global ALLOWED_PROTOCOLS
@@ -158,7 +172,15 @@ for alias, host in OFFICIAL_ALIASES.items():

  ## Other Path Library Functions

+ @lru_cache(maxsize=10, typed=False)
  def normalize(path):
+
+   path = path.removesuffix(PRECOMPUTED_SUFFIX)
+   m = re.search(TAIL_FORMAT_REGEXP, path)
+   if m is not None:
+     path = re.sub(TAIL_FORMAT_REGEXP, "", path)
+     path = f"{m.group('fmt')}://{path}"
+
    fmt, proto, endpoint, cloudpath, alias = extract_format_protocol(
      path, allow_defaults=False
    )
@@ -284,6 +306,13 @@ def pop_protocol(cloudpath):
  def extract_format_protocol(cloudpath:str, allow_defaults=True) -> tuple:
    error = UnsupportedProtocolError(cloudpath_error(cloudpath))

+   cloudpath = cloudpath.removesuffix(PRECOMPUTED_SUFFIX)
+
+   m = re.search(TAIL_FORMAT_REGEXP, cloudpath)
+   if m is not None:
+     cloudpath = re.sub(TAIL_FORMAT_REGEXP, "", cloudpath)
+     cloudpath = f"{m.group('fmt')}://{cloudpath}"
+
    alias, cloudpath = resolve_alias(cloudpath)

    m = re.match(CLOUDPATH_REGEXP, cloudpath)
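
With these two hunks, both normalize() and extract_format_protocol() understand the Neuroglancer-style trailing format syntax: a "|FORMAT:" suffix is stripped and reattached as a leading "FORMAT://" before ordinary parsing, and the "|neuroglancer-precomputed:" suffix is simply dropped since precomputed is the default format. A hedged sketch with a hypothetical path; the rewritten intermediate form is exactly what the code above constructs:

from cloudfiles import paths

# "gs://test_bucket/em|zarr2:" is first rewritten internally to
# "zarr2://gs://test_bucket/em", then parsed as usual.
ep = paths.extract("gs://test_bucket/em|zarr2:")
print(ep.format)  # expected: zarr2
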
@@ -350,7 +379,7 @@ def extract(cloudpath:str, windows=None) -> ExtractedPath:
      cloudpath = toabs(cloudpath)

    bucket = None
-   if protocol in ('gs', 's3', 'matrix', 'mem'):
+   if protocol in CLOUD_PROTOCOLS + ['mem']:
      match = re.match(BUCKET_RE, cloudpath)
      if not match:
        raise error
@@ -374,7 +403,7 @@ def extract(cloudpath:str, windows=None) -> ExtractedPath:

  def to_https_protocol(cloudpath):
    if isinstance(cloudpath, ExtractedPath):
-     if cloudpath.protocol in ('gs', 's3', 'matrix'):
+     if cloudpath.protocol in CLOUD_PROTOCOLS:
        return extract(to_https_protocol(ascloudpath(cloudpath)))
      return cloudpath

@@ -390,3 +419,30 @@ def to_https_protocol(cloudpath):
      cloudpath = cloudpath.replace(f"{alias}://", host, 1)

    return cloudpath.replace("s3://", "", 1)
+
+ def find_common_buckets(cloudpaths:GetPathType):
+   cloudpaths, is_multiple = toiter(cloudpaths, is_iter=True)
+   clustered = defaultdict(list)
+
+   for path in cloudpaths:
+     pth = path
+     byte_range = None
+     if isinstance(path, dict):
+       pth = path["path"]
+       byte_range = path["byte_range"]
+
+     epath = extract(pth)
+     if epath.protocol == "file":
+       path = os.sep.join(asfilepath(epath).split(os.sep)[2:])
+       bucketpath = "file://" + os.sep.join(asfilepath(epath).split(os.sep)[:2])
+     else:
+       path = epath.path
+       bucketpath = asbucketpath(epath)
+
+     clustered[bucketpath].append({
+       "path": path,
+       "start": (byte_range[0] if byte_range else None), # type: ignore
+       "end": (byte_range[1] if byte_range else None), # type: ignore
+     })
+
+   return clustered
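
find_common_buckets() groups a mixed sequence of plain path strings and {path, byte_range} dicts by their containing bucket, so callers can issue batched per-bucket requests. A usage sketch with hypothetical paths; the exact bucket-key strings are whatever asbucketpath() produces:

from cloudfiles.paths import find_common_buckets

reqs = [
  "gs://bucket_a/image/1.bin",
  { "path": "gs://bucket_a/image/2.bin", "byte_range": (0, 1024) },
  "s3://bucket_b/vol/info",
]
for bucket, files in find_common_buckets(reqs).items():
  print(bucket, files)  # each entry: {"path": ..., "start": ..., "end": ...}
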
cloudfiles/resumable_tools.py CHANGED
@@ -39,6 +39,9 @@ class ResumableFileSet:
      self.conn = sqlite3.connect(db_path)
      self.lease_msec = int(lease_msec)

+     self._total = 0
+     self._total_dirty = True
+
    def __del__(self):
      self.conn.close()

@@ -46,6 +49,7 @@ class ResumableFileSet:
      cur = self.conn.cursor()
      cur.execute("""DROP TABLE IF EXISTS filelist""")
      cur.execute("""DROP TABLE IF EXISTS xfermeta""")
+     cur.execute("""DROP TABLE IF EXISTS stats""")
      cur.close()

    def create(self, src, dest, reencode=None):
@@ -53,6 +57,7 @@ class ResumableFileSet:

      cur.execute("""DROP TABLE IF EXISTS filelist""")
      cur.execute("""DROP TABLE IF EXISTS xfermeta""")
+     cur.execute("""DROP TABLE IF EXISTS stats""")

      cur.execute(f"""
        CREATE TABLE xfermeta (
@@ -78,6 +83,18 @@ class ResumableFileSet:
      """)
      cur.execute("CREATE INDEX resumableidxfin ON filelist(finished,lease)")
      cur.execute("CREATE INDEX resumableidxfile ON filelist(filename)")
+
+     cur.execute(f"""
+       CREATE TABLE stats (
+         id {INTEGER} PRIMARY KEY {AUTOINC},
+         key TEXT NOT NULL,
+         value {INTEGER}
+       )
+     """)
+     cur.execute(
+       "INSERT INTO stats(id, key, value) VALUES (?,?,?)",
+       [1, 'finished', 0]
+     )
      cur.close()

    def insert(self, fname_iter):
@@ -91,7 +108,9 @@ class ResumableFileSet:
        cur.execute(f"INSERT INTO filelist(filename,finished,lease) VALUES {bindlist}", filenames)
      cur.execute("commit")

-     cur.close()
+     cur.close()
+
+     self._total_dirty = True

    def metadata(self):
      cur = self.conn.cursor()
@@ -111,6 +130,7 @@ class ResumableFileSet:
      for filenames in sip(fname_iter, SQLITE_MAX_PARAMS):
        bindlist = ",".join([f"{BIND}"] * len(filenames))
        cur.execute(f"UPDATE filelist SET finished = 1 WHERE filename in ({bindlist})", filenames)
+       cur.execute(f"UPDATE stats SET value = value + {len(filenames)} WHERE id = 1")
      cur.execute("commit")
      cur.close()

@@ -120,7 +140,7 @@ class ResumableFileSet:
      N = 0

      while True:
-       ts = now_msec() + self.lease_msec
+       ts = now_msec()
        cur.execute(f"""SELECT filename FROM filelist WHERE finished = 0 AND lease <= {ts} LIMIT {int(block_size)}""")
        rows = cur.fetchmany(block_size)
        N += len(rows)
@@ -140,31 +160,46 @@ class ResumableFileSet:

      cur.close()

-   def total(self):
+   def _scalar_query(self, sql:str) -> int:
      cur = self.conn.cursor()
-     cur.execute(f"SELECT count(filename) FROM filelist")
+     cur.execute(sql)
      res = cur.fetchone()
      cur.close()
      return int(res[0])

+   def total(self):
+     """Returns the total number of tasks (both processed and unprocessed)."""
+     if not self._total_dirty:
+       return self._total
+
+     self._total = self._scalar_query(f"SELECT max(id) FROM filelist")
+     self._total_dirty = False
+     return self._total
+
+   def finished(self):
+     return self._scalar_query(f"SELECT value FROM stats WHERE id = 1")
+
    def remaining(self):
-     cur = self.conn.cursor()
-     cur.execute(f"SELECT count(filename) FROM filelist WHERE finished = 0")
-     res = cur.fetchone()
-     cur.close()
-     return int(res[0])
+     return self.total() - self.finished()
+
+   def num_leased(self):
+     ts = int(now_msec())
+     return self._scalar_query(
+       f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease > {ts}"
+     )

    def available(self):
-     cur = self.conn.cursor()
-     ts = int(now_msec() + self.lease_msec)
-     cur.execute(f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease < {ts}")
-     res = cur.fetchone()
-     cur.close()
-     return int(res[0])
+     ts = int(now_msec())
+     return self._scalar_query(
+       f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease <= {ts}"
+     )

    def release(self):
+     cur = self.conn.cursor()
      cur.execute(f"UPDATE filelist SET lease = 0")
      cur.execute("commit")
+     cur.close()
+

    def __len__(self):
      return self.remaining()
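
Two behavioral changes are bundled into these hunks. Leasing now compares against now_msec() directly, so a file counts as available only once its lease has actually expired; previously the comparison point was offset by lease_msec. Progress queries also stop scanning with count(filename): total() becomes max(id) cached behind a dirty flag, and finished() reads a single-row stats table that mark_finished() increments in step with its updates. A standalone sketch of that counter pattern, independent of cloud-files and assuming rows are only ever inserted (as in this class):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE filelist (id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT, finished INTEGER DEFAULT 0)")
cur.execute("CREATE TABLE stats (id INTEGER PRIMARY KEY, key TEXT NOT NULL, value INTEGER)")
cur.execute("INSERT INTO stats(id, key, value) VALUES (1, 'finished', 0)")
cur.executemany("INSERT INTO filelist(filename) VALUES (?)", [("a",), ("b",), ("c",)])

# Marking work done costs two O(1) writes instead of a later O(N) count.
cur.execute("UPDATE filelist SET finished = 1 WHERE filename = 'a'")
cur.execute("UPDATE stats SET value = value + 1 WHERE id = 1")

total = cur.execute("SELECT max(id) FROM filelist").fetchone()[0]
finished = cur.execute("SELECT value FROM stats WHERE id = 1").fetchone()[0]
print(total, finished, total - finished)  # 3 1 2
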
cloudfiles/scheduler.py CHANGED
@@ -137,7 +137,12 @@ def schedule_jobs(
      or (hasattr(fns, "__len__") and len(fns) <= 1)
    ):
      return schedule_single_threaded_jobs(fns, progress, total, count_return)
-
+
+   if isinstance(total, int):
+     concurrency = min(concurrency, max(total, 1))
+   elif hasattr(fns, "__len__"):
+     concurrency = min(concurrency, max(len(fns), 1))
+
    if green == True or (green is None and gevent.monkey.saved):
      return schedule_green_jobs(fns, concurrency, progress, total, count_return)
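
The added clamp keeps schedule_jobs() from spinning up more workers than there are jobs while still guaranteeing at least one. The same arithmetic as a standalone helper (illustrative only, not the library's API):

def clamp_concurrency(concurrency, num_jobs):
  # never more workers than jobs, never fewer than one
  return min(concurrency, max(num_jobs, 1))

assert clamp_concurrency(40, 3) == 3
assert clamp_concurrency(40, 0) == 1
assert clamp_concurrency(2, 100) == 2
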
cloudfiles/secrets.py CHANGED
@@ -137,23 +137,27 @@ def aws_credentials(bucket = '', service = 'aws', skip_files=False):
    AWS_CREDENTIALS_CACHE[service][bucket] = aws_credentials
    return aws_credentials

- CAVE_CREDENTIALS = None
- def cave_credentials():
+ CAVE_CREDENTIALS:CredentialCacheType = {}
+ def cave_credentials(server = ''):
    global CAVE_CREDENTIALS
-   default_file_path = 'cave-secret.json'
-   path = secretpath(default_file_path)

-   if CAVE_CREDENTIALS:
-     return CAVE_CREDENTIALS
+   paths = [
+     secretpath('cave-secret.json')
+   ]

-   if os.path.exists(path):
-     with open(path, 'rt') as f:
-       CAVE_CREDENTIALS = json.loads(f.read())
-   else:
-     CAVE_CREDENTIALS = None
+   if server:
+     paths = [ secretpath(f'{server}-cave-secret.json') ] + paths
+
+   if server in CAVE_CREDENTIALS:
+     return CAVE_CREDENTIALS.get(server, None)

-   return CAVE_CREDENTIALS
+   for path in paths:
+     if os.path.exists(path):
+       with open(path, 'rt') as f:
+         CAVE_CREDENTIALS[server] = json.loads(f.read())
+       break

+   return CAVE_CREDENTIALS.get(server, None)

  HTTP_CREDENTIALS = None
  def http_credentials():
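
cave_credentials() now takes an optional server argument and caches per server: a server-specific "{server}-cave-secret.json" is consulted before the generic "cave-secret.json", and None is returned when neither file exists. A hedged usage sketch; the server name is hypothetical and the search directories are whatever secretpath() resolves:

from cloudfiles.secrets import cave_credentials

# Tries "my.cave.server-cave-secret.json" first, then "cave-secret.json".
creds = cave_credentials("my.cave.server")
if creds is not None:
  token = creds.get("token")  # assuming the usual CAVE secret layout
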
cloudfiles/test.py ADDED
@@ -0,0 +1,28 @@
+ import igneous.task_creation as tc
+ import os
+ from taskqueue import totask
+ import igneous.tasks
+
+
+ def process_task(msg):
+   task = totask(msg)
+   task.execute()
+   return None
+
+ def submit_tasks():
+   paths = [
+     "gs://ng_scratch_ranl_7/make_cv_happy/seg/20250403024338",]
+   all_tasks = []
+   for img_path in paths:
+     mip = 0
+     num_mips = 2
+     tasks = tc.create_downsampling_tasks(img_path,
+       fill_missing=False,
+       delete_black_uploads=True,
+       mip=mip, num_mips=num_mips)
+     all_tasks += list(tasks)
+   return all_tasks
+
+ if __name__ == "__main__":
+   tasks = submit_tasks()
+   process_task(tasks[0])