cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cloudfiles/cloudfiles.py CHANGED
@@ -2,13 +2,14 @@ from typing import (
   Any, Dict, Optional,
   Union, List, Tuple,
   Callable, Generator,
-  Iterable, cast, BinaryIO
+  Sequence, cast, BinaryIO
 )
 
 from queue import Queue
 from collections import defaultdict
-from functools import partial, wraps
+from functools import partial, wraps, reduce
 import inspect
+import io
 import math
 import multiprocessing
 import itertools
@@ -17,7 +18,9 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
+import time
 
 import orjson
 import pathos.pools
@@ -29,10 +32,11 @@ from . import compression, paths, gcs
 from .exceptions import UnsupportedProtocolError, MD5IntegrityError, CRC32CIntegrityError
 from .lib import (
   mkdir, totalfn, toiter, scatter, jsonify, nvl,
-  duplicates, first, sip,
+  duplicates, first, sip, touch,
   md5, crc32c, decode_crc32c_b64
 )
-from .paths import ALIASES
+from .monitoring import TransmissionMonitor, IOEnum
+from .paths import ALIASES, find_common_buckets
 from .secrets import CLOUD_FILES_DIR, CLOUD_FILES_LOCK_DIR
 from .threaded_queue import ThreadedQueue, DEFAULT_THREADS
 from .typing import (
@@ -149,26 +153,42 @@ def parallel_execute(
   if platform.system().lower() == "darwin":
     os.environ["no_proxy"] = "*"
 
+  # Don't fork, spawn entirely new processes. This
+  # avoids accidental deadlocks.
+  multiprocessing.set_start_method("spawn", force=True)
+
   results = []
+  tms = []
   try:
     with pathos.pools.ProcessPool(parallel) as executor:
       for res in executor.imap(fn, sip(inputs, block_size)):
-        if isinstance(res, int):
-          pbar.update(res)
-        elif isinstance(res, list):
-          pbar.update(len(res))
+        update = res
+        if isinstance(res, tuple):
+          update = res[0]
+
+        if isinstance(update, int):
+          pbar.update(update)
+        elif isinstance(update, list):
+          pbar.update(len(update))
         else:
           pbar.update(block_size)
 
         if returns_list:
-          results.extend(res)
+          if isinstance(res, tuple):
+            results.extend(res[0])
+            tms.append(res[1])
+          else:
+            results.extend(res)
   finally:
     if platform.system().lower() == "darwin":
       os.environ["no_proxy"] = no_proxy
     pbar.close()
 
   if returns_list:
-    return results
+    if len(tms):
+      return (results, TransmissionMonitor.merge(tms))
+    else:
+      return results
 
 def get_interface_class(protocol):
   if protocol in INTERFACES:
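Note: per the hunk above, parallel_execute now accepts worker results that are
either a plain list or a (list, TransmissionMonitor) tuple, and merges the
monitors across processes. A minimal sketch of a conforming worker, with the
monitor signatures inferred only from this diff (fetch_one is a hypothetical
download helper, not part of the package):

    from cloudfiles.monitoring import TransmissionMonitor, IOEnum

    def worker(block):
      # return (items, tm) so parallel_execute can merge the monitors
      tm = TransmissionMonitor(IOEnum.RX)
      items = []
      for path in block:
        flight_id = tm.start_io(1)         # register an in-flight request
        data = fetch_one(path)             # hypothetical download helper
        tm.end_io(flight_id, len(data))    # record transmitted byte count
        items.append(data)
      return (items, tm)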
@@ -182,7 +202,7 @@ def path_to_byte_range_tags(path):
   if isinstance(path, str):
     return (path, None, None, None)
   return (path['path'], path.get('start', None), path.get('end', None), path.get('tags', None))
-
+
 def dl(
   cloudpaths:GetPathType, raw:bool=False, **kwargs
 ) -> Union[bytes,List[dict]]:
@@ -193,23 +213,8 @@ def dl(
   dict.
   """
   cloudpaths, is_multiple = toiter(cloudpaths, is_iter=True)
-  clustered = defaultdict(list)
-  total = 0
-  for path in cloudpaths:
-    pth = path
-    byte_range = None
-    if isinstance(path, dict):
-      pth = path["path"]
-      byte_range = path["byte_range"]
-
-    epath = paths.extract(pth)
-    bucketpath = paths.asbucketpath(epath)
-    clustered[bucketpath].append({
-      "path": epath.path,
-      "start": (byte_range[0] if byte_range else None), # type: ignore
-      "end": (byte_range[1] if byte_range else None), # type: ignore
-    })
-    total += 1
+  clustered = find_common_buckets(cloudpaths)
+  total = sum([ len(bucket) for bucket in clustered.values() ])
 
   progress = kwargs.get("progress", False) and total > 1
   pbar = tqdm(total=total, desc="Downloading", disable=(not progress))
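Note: the inlined bucket-clustering loop in dl() was factored out into
paths.find_common_buckets. Judging strictly from the code it replaces, it
groups inputs by bucket path and normalizes byte ranges into start/end keys.
A sketch of the expected shape (bucket and file names hypothetical):

    from cloudfiles.paths import find_common_buckets

    clustered = find_common_buckets([
      "gs://bucket/a.bin",
      { "path": "gs://bucket/b.bin", "byte_range": (0, 100) },
    ])
    # expected shape, per the removed loop:
    # { "gs://bucket": [
    #   { "path": "a.bin", "start": None, "end": None },
    #   { "path": "b.bin", "start": 0, "end": 100 },
    # ]}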
@@ -243,12 +248,55 @@ class CloudFiles:
   currently supports local filesystem, Google Cloud Storage,
   Amazon S3 interfaces, and reading from arbitrary HTTP
   servers.
+
+  cloudpath: a parent directory of the files you want to fetch
+    specified as e.g.:
+      gs://bucket/dir/
+      s3://bucket/dir/
+      s3://https://myendpoint.com/dir/
+      file://./dir
+      ./dir
+      https://some.host.edu/dir/
+      mem://bucket/dir
+    Key:
+      gs: Google Cloud Storage
+      s3: Amazon S3
+      file: Local Filesystem (including network mounts)
+      mem: In-Memory storage
+
+  progress: display a progress bar measured in files
+  green: whether to use green threads (uses the gevent library)
+  secrets: you can provide GCS, S3, CAVE, etc. credentials
+    via the constructor here instead of the default secrets files
+  num_threads: number of threads to launch for remote server
+    IO. No effect on local file fetching (always single threaded
+    for maximum performance).
+  use_https: use the public https API for GCS and S3 instead of
+    boto or google-storage-python
+  endpoint: for S3 emulators, you can provide a different endpoint
+    like https://s3-storage.university.edu. This can also be specified
+    in the secrets file.
+  parallel: number of separate processes to launch (each will use num_threads)
+  request_payer: bill your s3 usage to someone other than the bucket owner
+  locking: for local filesystems, use advisory file locking to prevent
+    separate cloudfiles instances from interfering with each other
+  lock_dir: you can specify your own directory for the advisory lock files
+  composite_upload_threshold: GCS and S3 both support multi-part uploads.
+    For files larger than this threshold, use that facility.
+  no_sign_request: (s3 only) don't sign the request with credentials
   """
   def __init__(
-    self, cloudpath:str, progress:bool = False,
-    green:Optional[bool] = None, secrets:SecretsType = None, num_threads:int = 20,
-    use_https:bool = False, endpoint:Optional[str] = None,
-    parallel:ParallelType = 1, request_payer:Optional[str] = None,
+    self,
+    cloudpath:str,
+    progress:bool = False,
+    green:Optional[bool] = None,
+    secrets:SecretsType = None,
+    num_threads:int = 20,
+    use_https:bool = False,
+    endpoint:Optional[str] = None,
+    parallel:ParallelType = 1,
+    request_payer:Optional[str] = None,
     locking:Optional[bool] = None,
    lock_dir:Optional[str] = None,
    composite_upload_threshold:int = int(1e8),
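Note: the constructor's behavior is unchanged apart from the path
normalization added below; the docstring now documents each parameter. A
typical instantiation might look like this (bucket name hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles(
      "gs://my-bucket/dataset/",  # hypothetical bucket
      progress=True,
      num_threads=20,             # the default
    )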
@@ -257,6 +305,8 @@ class CloudFiles:
     if use_https:
       cloudpath = paths.to_https_protocol(cloudpath)
 
+    cloudpath = paths.normalize(cloudpath)
+
     self.cloudpath = cloudpath
     self.progress = progress
     self.secrets = secrets
@@ -332,11 +382,16 @@ class CloudFiles:
 
   @parallelize(desc="Download", returns_list=True)
   def get(
-    self, paths:GetPathType, total:Optional[int] = None,
-    raw:bool = False, progress:Optional[bool] = None,
+    self,
+    paths:GetPathType,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
     parallel:Optional[ParallelType] = None,
-    return_dict:bool = False, raise_errors:bool = True,
-    part_size:Optional[int] = None
+    return_dict:bool = False,
+    raise_errors:bool = True,
+    part_size:Optional[int] = None,
+    return_recording:bool = False,
   ) -> Union[dict,bytes,List[dict]]:
     """
     Download one or more files. Return order is not guaranteed to match input.
@@ -362,6 +417,10 @@ class CloudFiles:
       extra information. Errors will be raised immediately.
     raise_errors: Raise the first error immediately instead
       of returning them as part of the output.
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
 
     Returns:
       if return_dict:
@@ -379,12 +438,18 @@ class CloudFiles:
           'raw': boolean,
         }
       ]
+
+      if return_recording:
+        return (ABOVE, TransmissionMonitor)
+      else:
+        return ABOVE
     """
     paths, multiple_return = toiter(paths, is_iter=True)
     progress = nvl(progress, self.progress)
     # return_dict prevents the user from having a chance
     # to inspect errors, so we must raise here.
     raise_errors = raise_errors or return_dict or (not multiple_return)
+    tm = TransmissionMonitor(IOEnum.RX)
 
     def check_md5(path, content, server_hash):
       if server_hash is None:
@@ -414,12 +479,17 @@ class CloudFiles:
       encoding = None
       server_hash = None
       server_hash_type = None
+      num_bytes_rx = 0
       try:
+        flight_id = tm.start_io(1)
+
         with self._get_connection() as conn:
           content, encoding, server_hash, server_hash_type = conn.get_file(
             path, start=start, end=end, part_size=part_size
           )
 
+        num_bytes_rx = len(content) if content is not None else 0
+
         # md5s don't match for partial reads
         if start is None and end is None:
           if server_hash_type == "md5":
@@ -431,6 +501,9 @@ class CloudFiles:
           content = compression.decompress(content, encoding, filename=path)
       except Exception as err:
         error = err
+        tm.end_error(flight_id)
+
+      tm.end_io(flight_id, num_bytes_rx)
 
       if raise_errors and error:
         raise error
@@ -450,11 +523,16 @@ class CloudFiles:
     if total == 1:
       ret = download(first(paths))
       if return_dict:
-        return { ret["path"]: ret["content"] }
+        ret = { ret["path"]: ret["content"] }
      elif multiple_return:
-        return [ ret ]
+        ret = [ ret ]
      else:
-        return ret['content']
+        ret = ret['content']
+
+      if return_recording:
+        return (ret, tm)
+      else:
+        return ret
 
     num_threads = self.num_threads
     if self.protocol == "file":
@@ -470,10 +548,14 @@ class CloudFiles:
       green=self.green,
     )
 
+    ret = results
     if return_dict:
-      return { res["path"]: res["content"] for res in results }
+      ret = { res["path"]: res["content"] for res in results }
+
+    if return_recording:
+      return (ret, tm)
 
-    return results
+    return ret
 
   def get_json(
     self, paths:GetPathType, total:Optional[int] = None
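Note: a sketch of the new recording path on get(), assuming the API exactly as
shown in this diff (paths hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    files, tm = cf.get([ "chunk_0.bin", "chunk_1.bin" ], return_recording=True)
    # tm is a TransmissionMonitor (IOEnum.RX) holding per-object start/end
    # times and transmitted byte counts in an interval tree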
@@ -520,12 +602,19 @@ class CloudFiles:
 
   @parallelize(desc="Upload")
   def puts(
-    self, files:PutType,
-    content_type:Optional[str] = None, compress:CompressType = None,
-    compression_level:Optional[int] = None, cache_control:Optional[str] = None,
-    total:Optional[int] = None, raw:bool = False, progress:Optional[bool] = None,
-    parallel:ParallelType = 1, storage_class:Optional[str] = None
-  ) -> int:
+    self,
+    files:PutType,
+    content_type:Optional[str] = None,
+    compress:CompressType = None,
+    compression_level:Optional[int] = None,
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
+  ) -> Union[int, tuple[int,TransmissionMonitor]]:
     """
     Writes one or more files at a given location.
 
@@ -560,11 +649,22 @@ class CloudFiles:
       function call. If progress is a string, it sets the
       text of the progress bar.
     parallel: number of concurrent processes (0 means all cores)
-
-    Returns: number of files uploaded
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
+
+    Returns:
+      N = number of files uploaded
+      tm = TransmissionMonitor
+      if return_recording:
+        return (N, tm)
+      else:
+        return N
     """
     files = toiter(files)
     progress = nvl(progress, self.progress)
+    tm = TransmissionMonitor(IOEnum.TX)
 
     def todict(file):
       if isinstance(file, tuple):
@@ -572,6 +672,7 @@ class CloudFiles:
       return file
 
     def uploadfn(file):
+      start_time = time.monotonic()
       file = todict(file)
 
       file_compress = file.get('compress', compress)
@@ -586,11 +687,19 @@ class CloudFiles:
         compress_level=file.get('compression_level', compression_level),
       )
 
+      num_bytes_tx = 0
+      if hasattr(content, "__len__"):
+        num_bytes_tx = len(content)
+      elif isinstance(content, io.IOBase):
+        num_bytes_tx = os.fstat(content.fileno()).st_size
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       if (
         self.protocol == "gs"
         and (
           (hasattr(content, "read") and hasattr(content, "seek"))
-          or (hasattr(content, "__len__") and len(content) > self.composite_upload_threshold)
+          or (num_bytes_tx > self.composite_upload_threshold)
         )
       ):
         gcs.composite_upload(
@@ -603,6 +712,7 @@ class CloudFiles:
           cache_control=cache_control,
           storage_class=storage_class,
          compress=file_compress,
+          skip_compress=True,
        )
        return
 
@@ -616,6 +726,8 @@ class CloudFiles:
         storage_class=file.get('storage_class', storage_class)
       )
 
+      tm.end_io(flight_id, num_bytes_tx)
+
     if not isinstance(files, (types.GeneratorType, zip)):
       dupes = duplicates([ todict(file)['path'] for file in files ])
       if dupes:
@@ -625,7 +737,10 @@ class CloudFiles:
 
     if total == 1:
       uploadfn(first(files))
-      return 1
+      if return_recording:
+        return (1,tm)
+      else:
+        return 1
 
     fns = ( partial(uploadfn, file) for file in files )
     desc = self._progress_description("Upload")
@@ -636,7 +751,11 @@ class CloudFiles:
       total=total,
       green=self.green,
     )
-    return len(results)
+
+    if return_recording:
+      return (len(results), tm)
+    else:
+      return len(results)
 
   def put(
     self,
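Note: the same recording convention applies to puts(). A sketch under the
assumed 6.0.0 API (file contents hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    ct, tm = cf.puts(
      [ ("hello.txt", b"hello"), ("world.txt", b"world") ],
      return_recording=True,
    )
    # ct == 2; tm is a TransmissionMonitor (IOEnum.TX)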
@@ -674,9 +793,13 @@ class CloudFiles:
     self, files:PutType,
     compress:CompressType = None,
     compression_level:Optional[int] = None,
-    cache_control:Optional[str] = None, total:Optional[int] = None,
-    raw:bool = False, progress:Optional[bool] = None, parallel:ParallelType = 1,
-    storage_class:Optional[str] = None
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
   ) -> int:
     """
     Write one or more files as JSON.
@@ -705,7 +828,7 @@ class CloudFiles:
       compress=compress, compression_level=compression_level,
       content_type='application/json', storage_class=storage_class,
       total=total, raw=raw,
-      progress=progress, parallel=parallel
+      progress=progress, parallel=parallel, return_recording=return_recording,
     )
 
   def put_json(
@@ -755,9 +878,11 @@ class CloudFiles:
       return True
     elif prefix[-1] == "/":
       return True
-
-    res = first(self.list(prefix=prefix))
-    return res is not None
+    try:
+      res = first(self.list(prefix=prefix))
+      return res is not None
+    except NotImplementedError as err:
+      return not CloudFile(self.cloudpath).exists()
 
   def exists(
     self, paths:GetPathType,
@@ -852,8 +977,10 @@ class CloudFiles:
 
   def size(
     self, paths:GetPathType,
-    total:Optional[int] = None, progress:Optional[bool] = None
-  ) -> Union[Dict[str,int],List[Dict[str,int]]]:
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+    return_sum:bool = False,
+  ) -> Union[Dict[str,int],List[Dict[str,int]],int]:
     """
     Get the size in bytes of one or more files in its stored state.
     """
@@ -874,10 +1001,47 @@ class CloudFiles:
       green=self.green,
     )
 
+    if return_sum:
+      return sum(( sz for sz in results.values() ))
+
     if return_multiple:
       return results
     return first(results.values())
 
+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+      with lock:
+        total_files += subtree_files
+        total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
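Note: subtree_size fans one per-prefix size query out across threads and
aggregates the counts under a lock. Per the return statement above, a call
looks like this (prefix hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    stats = cf.subtree_size("images")
    # e.g. { "N": 1204, "num_bytes": 73400320 }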
@@ -919,6 +1083,60 @@ class CloudFiles:
     )
     return len(results)
 
+  def touch(
+    self,
+    paths:GetPathType,
+    progress:Optional[bool] = None,
+    total:Optional[int] = None,
+    nocopy:bool = False,
+  ):
+    """
+    Create a zero byte file if it doesn't exist.
+    """
+    paths = toiter(paths)
+    progress = nvl(progress, self.progress)
+    total = totalfn(paths, total)
+
+    if self.protocol == "file":
+      basepath = self.cloudpath.replace("file://", "")
+      for path in tqdm(paths, disable=(not progress), total=total):
+        pth = path
+        if isinstance(path, dict):
+          pth = path["path"]
+        touch(self.join(basepath, pth))
+      return
+
+    results = self.exists(paths, total=total, progress=progress)
+
+    dne = [
+      (fname, b'')
+      for fname, exists in results.items()
+      if not exists
+    ]
+
+    self.puts(dne, progress=progress)
+
+    # def thunk_copy(path):
+    #   with self._get_connection() as conn:
+    #     conn.copy_file(path, self._path.bucket, self.join(self._path.path, path))
+    #   return 1
+
+    # if not nocopy:
+    #   already_exists = (
+    #     fname
+    #     for fname, exists in results.items()
+    #     if exists
+    #   )
+
+    #   results = schedule_jobs(
+    #     fns=( partial(thunk_copy, path) for path in already_exists ),
+    #     progress=progress,
+    #     total=(total - len(dne)),
+    #     concurrency=self.num_threads,
+    #     green=self.green,
+    #     count_return=True,
+    #   )
+
   def list(
     self, prefix:str = "", flat:bool = False
   ) -> Generator[str,None,None]:
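Note: touch() writes zero byte files directly on the local filesystem and, for
remote protocols, checks existence and uploads empty objects only for the
missing keys. A sketch (keys hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/flags/")  # hypothetical bucket
    cf.touch([ "done_0", "done_1" ])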
@@ -953,7 +1171,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-  ) -> None:
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from this CloudFiles storage
     to the destination CloudFiles in batches sized
@@ -969,7 +1189,7 @@ class CloudFiles:
     - gs->gs: Uses GCS copy API to minimize data movement
     - s3->s3: Uses boto s3 copy API to minimize data movement
 
-    cf_src: another CloudFiles instance or cloudpath
+    cf_dest: another CloudFiles instance or cloudpath
     paths: if None transfer all files from src, else if
       an iterable, transfer only these files.
 
@@ -987,6 +1207,11 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_dest, str):
       cf_dest = CloudFiles(
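Note: transfer_to and transfer_from now return a TransmissionMonitor and
accept progress and resumable arguments. Per the docstring, resumable
currently applies only to https->file. A sketch (destination hypothetical):

    from cloudfiles import CloudFiles

    src = CloudFiles("https://some.host.edu/dir/")
    tm = src.transfer_to("file://./mirror", resumable=True)  # hypothetical dest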
@@ -997,7 +1222,8 @@ class CloudFiles:
     return cf_dest.transfer_from(
       self, paths, block_size,
       reencode, content_type,
-      allow_missing,
+      allow_missing,
+      progress, resumable,
     )
 
   def transfer_from(
@@ -1008,7 +1234,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-  ) -> None:
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from the source CloudFiles storage
     to this CloudFiles in batches sized in the
@@ -1042,6 +1270,10 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_src, str):
       cf_src = CloudFiles(
@@ -1054,22 +1286,40 @@ class CloudFiles:
 
     total = totalfn(paths, None)
 
-    with tqdm(desc="Transferring", total=total, disable=(not self.progress)) as pbar:
+    disable = progress
+    if disable is None:
+      disable = self.progress
+    if disable is None:
+      disable = False
+    else:
+      disable = not disable
+
+    with tqdm(desc="Transferring", total=total, disable=disable) as pbar:
       if (
         cf_src.protocol == "file"
         and self.protocol == "file"
         and reencode is None
       ):
-        self.__transfer_file_to_file(
+        return self.__transfer_file_to_file(
           cf_src, self, paths, total,
           pbar, block_size, allow_missing
         )
+      elif (
+        cf_src.protocol != "file"
+        and self.protocol == "file"
+        and reencode is None
+      ):
+        return self.__transfer_remote_to_file(
+          cf_src, self, paths, total,
+          pbar, block_size, content_type,
+          allow_missing, resumable,
+        )
       elif (
         cf_src.protocol == "file"
         and self.protocol != "file"
         and reencode is None
       ):
-        self.__transfer_file_to_remote(
+        return self.__transfer_file_to_remote(
          cf_src, self, paths, total,
          pbar, block_size, content_type,
          allow_missing,
@@ -1085,13 +1335,13 @@ class CloudFiles:
         )
         and reencode is None
       ):
-        self.__transfer_cloud_internal(
+        return self.__transfer_cloud_internal(
           cf_src, self, paths,
           total, pbar, block_size,
           allow_missing,
         )
       else:
-        self.__transfer_general(
+        return self.__transfer_general(
           cf_src, self, paths, total,
           pbar, block_size,
           reencode, content_type,
@@ -1103,7 +1353,7 @@ class CloudFiles:
     total, pbar, block_size,
     reencode, content_type,
     allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     Downloads the file into RAM, transforms
     the data, and uploads it. This is the slowest and
@@ -1112,6 +1362,7 @@ class CloudFiles:
     pair of endpoints as well as transcoding compression
     formats.
     """
+    upload_tms = []
     for block_paths in sip(paths, block_size):
       for path in block_paths:
         if isinstance(path, dict):
@@ -1135,26 +1386,32 @@ class CloudFiles:
             item["path"] = item["tags"]["dest_path"]
             del item["tags"]["dest_path"]
           yield item
-      self.puts(
+      (ct, batch_tm) = self.puts(
         renameiter(),
         raw=True,
         progress=False,
         compress=reencode,
         content_type=content_type,
+        return_recording=True,
       )
       pbar.update(len(block_paths))
+      upload_tms.append(batch_tm)
+
+    return TransmissionMonitor.merge(upload_tms)
 
   def __transfer_file_to_file(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     shutil.copyfile, starting in Python 3.8, uses
     special OS kernel functions to accelerate file copies
     """
+    tm = TransmissionMonitor(IOEnum.TX)
     srcdir = cf_src.cloudpath.replace("file://", "")
     destdir = mkdir(cf_dest.cloudpath.replace("file://", ""))
     for path in paths:
+      start_time = time.monotonic()
       if isinstance(path, dict):
         src = os.path.join(srcdir, path["path"])
         dest = os.path.join(destdir, path["dest_path"])
@@ -1168,6 +1425,15 @@ class CloudFiles:
       if dest_ext_compress != dest_ext:
         dest += dest_ext_compress
 
+      num_bytes_tx = 0
+      try:
+        if src:
+          num_bytes_tx = os.path.getsize(src)
+      except FileNotFoundError:
+        pass
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       try:
         shutil.copyfile(src, dest) # avoids user space
       except FileNotFoundError:
@@ -1175,10 +1441,55 @@ class CloudFiles:
           with open(dest, "wb") as f:
             f.write(b'')
         else:
+          tm.end_error(flight_id)
           raise
+      finally:
+        tm.end_io(flight_id, num_bytes_tx)
 
       pbar.update(1)
 
+    return tm
+
+  def __transfer_remote_to_file(
+    self, cf_src, cf_dest, paths,
+    total, pbar, block_size, content_type,
+    allow_missing, resumable,
+  ) -> TransmissionMonitor:
+
+    tm = TransmissionMonitor(IOEnum.RX)
+
+    def thunk_save(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
+      with cf_src._get_connection() as conn:
+        if isinstance(key, dict):
+          dest_key = key.get("dest_path", key["path"])
+          src_key = key["path"]
+        else:
+          src_key = key
+          dest_key = key
+
+        dest_key = os.path.join(cf_dest._path.path, dest_key)
+        (found, num_bytes_rx) = conn.save_file(src_key, dest_key, resumable=resumable)
+
+      tm.end_io(flight_id, num_bytes_rx)
+
+      if found == False and not allow_missing:
+        tm.end_error(flight_id)
+        raise FileNotFoundError(src_key)
+
+      return int(found)
+
+    schedule_jobs(
+      fns=( partial(thunk_save, path) for path in paths ),
+      progress=pbar,
+      concurrency=self.num_threads,
+      total=totalfn(paths, total),
+      green=self.green,
+      count_return=True,
+    )
+    return tm
+
   def __transfer_file_to_remote(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, content_type,
@@ -1189,6 +1500,7 @@ class CloudFiles:
     so that GCS and S3 can do low-memory chunked multi-part
     uploads if necessary.
     """
+    tms = []
     srcdir = cf_src.cloudpath.replace("file://", "")
     for block_paths in sip(paths, block_size):
       to_upload = []
@@ -1211,18 +1523,30 @@ class CloudFiles:
         else:
           raise
 
+        if dest_path == '':
+          dest_path = src_path
+
         to_upload.append({
           "path": dest_path,
           "content": handle,
           "compress": encoding,
         })
-      cf_dest.puts(to_upload, raw=True, progress=False, content_type=content_type)
+      (ct, batch_tm) = cf_dest.puts(
+        to_upload,
+        raw=True,
+        progress=False,
+        content_type=content_type,
+        return_recording=True,
+      )
       for item in to_upload:
         handle = item["content"]
         if hasattr(handle, "close"):
           handle.close()
+      tms.append(batch_tm)
       pbar.update(len(block_paths))
 
+    return TransmissionMonitor.merge(tms)
+
   def __transfer_cloud_internal(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
@@ -1235,7 +1559,11 @@ class CloudFiles:
     of the cloud, this is much slower and more expensive
     than necessary.
     """
+    tm = TransmissionMonitor(IOEnum.TX)
+
     def thunk_copy(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
       with cf_src._get_connection() as conn:
         if isinstance(key, dict):
          dest_key = key.get("dest_path", key["path"])
@@ -1245,14 +1573,17 @@ class CloudFiles:
          src_key = key["path"]
 
        dest_key = posixpath.join(cf_dest._path.path, dest_key)
-        found = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+        (found, num_bytes_tx) = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+
+      tm.end_io(flight_id, num_bytes_tx)
 
      if found == False and not allow_missing:
+        tm.end_error(flight_id)
        raise FileNotFoundError(src_key)
 
      return int(found)
 
-    results = schedule_jobs(
+    schedule_jobs(
      fns=( partial(thunk_copy, path) for path in paths ),
      progress=pbar,
      concurrency=self.num_threads,
@@ -1260,7 +1591,100 @@ class CloudFiles:
       green=self.green,
       count_return=True,
     )
-    return len(results)
+    return tm
+
+  def move(self, src:str, dest:str):
+    """Move (rename) src to dest.
+
+    src and dest do not have to be on the same filesystem.
+    """
+    epath = paths.extract(dest)
+    full_cloudpath = paths.asprotocolpath(epath)
+    dest_cloudpath = paths.dirname(full_cloudpath)
+    base_dest = paths.basename(full_cloudpath)
+
+    return self.moves(dest_cloudpath, [
+      (src, base_dest)
+    ], block_size=1, progress=False)
+
+  def moves(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str, str]]],
+    block_size:int = 64,
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+  ):
+    """
+    Move (rename) files.
+
+    pairs: [ (src, dest), (src, dest), ... ]
+    """
+    if isinstance(cf_dest, str):
+      cf_dest = CloudFiles(
+        cf_dest, progress=False,
+        green=self.green, num_threads=self.num_threads,
+      )
+
+    total = totalfn(paths, total)
+
+    disable = not (self.progress if progress is None else progress)
+
+    if self.protocol == "file" and cf_dest.protocol == "file":
+      self.__moves_file_to_file(
+        cf_dest, paths, total,
+        disable, block_size
+      )
+      return
+
+    pbar = tqdm(total=total, disable=disable, desc="Moving")
+
+    with pbar:
+      for subpairs in sip(paths, block_size):
+        subpairs = [
+          ((pair, pair) if isinstance(pair, str) else pair)
+          for pair in subpairs
+        ]
+
+        self.transfer_to(cf_dest, paths=(
+          {
+            "path": src,
+            "dest_path": dest,
+          }
+          for src, dest in subpairs
+        ), progress=False)
+        self.delete(( src for src, dest in subpairs ), progress=False)
+        pbar.update(len(subpairs))
+
+  def __moves_file_to_file(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str,str]]],
+    total:Optional[int],
+    disable:bool,
+    block_size:int,
+  ):
+    for pair in tqdm(paths, total=total, disable=disable, desc="Moving"):
+      if isinstance(pair, str):
+        src = pair
+        dest = pair
+      else:
+        (src, dest) = pair
+
+      src = self.join(self.cloudpath, src).replace("file://", "")
+      dest = cf_dest.join(cf_dest.cloudpath, dest).replace("file://", "")
+
+      if os.path.isdir(dest):
+        dest = cf_dest.join(dest, os.path.basename(src))
+      else:
+        mkdir(os.path.dirname(dest))
+
+      src, encoding = FileInterface.get_encoded_file_path(src)
+      _, dest_ext = os.path.splitext(dest)
+      dest_ext_compress = FileInterface.get_extension(encoding)
+      if dest_ext_compress != dest_ext:
+        dest += dest_ext_compress
+      shutil.move(src, dest)
 
   def join(self, *paths:str) -> str:
     """
@@ -1277,6 +1701,22 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
 
+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
+  def dirname(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.dirname(path)
+    return posixpath.dirname(path)
+
+  def basename(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.basename(path)
+    return posixpath.basename(path)
+
   def __getitem__(self, key) -> Union[dict,bytes,List[dict]]:
     if isinstance(key, tuple) and len(key) == 2 and isinstance(key[1], slice) and isinstance(key[0], str):
       return self.get({ 'path': key[0], 'start': key[1].start, 'end': key[1].stop })
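Note: sep, dirname, and basename select os.path semantics for the file
protocol and posixpath semantics otherwise. A sketch:

    from cloudfiles import CloudFiles

    CloudFiles("gs://my-bucket/dir").dirname("a/b/c.bin")  # -> "a/b"
    CloudFiles("./dir").sep                                # os.sep locally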
@@ -1307,11 +1747,17 @@ class CloudFiles:
 
 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1320,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)
 
@@ -1327,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None
 
+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
@@ -1440,6 +1894,22 @@ class CloudFile:
       reencode=reencode,
     )
 
+  def join(self, *args):
+    return self.cf.join(*args)
+
+  def dirname(self, *args):
+    return self.cf.dirname(*args)
+
+  def basename(self, *args):
+    return self.cf.basename(*args)
+
+  def touch(self):
+    return self.cf.touch(self.filename)
+
+  def move(self, dest):
+    """Move (rename) this file to dest."""
+    return self.cf.move(self.filename, dest)
+
   def __len__(self):
     return self.size()
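Note: CloudFile gains thin wrappers delegating to the new CloudFiles methods.
A sketch (path hypothetical):

    from cloudfiles import CloudFile

    f = CloudFile("gs://my-bucket/dataset/a.bin")  # hypothetical path
    f.touch()                                      # ensure it exists
    f.move("gs://my-bucket/dataset/b.bin")         # rename via transfer+delete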