cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cloudfiles/cloudfiles.py CHANGED
@@ -2,13 +2,14 @@ from typing import (
   Any, Dict, Optional,
   Union, List, Tuple,
   Callable, Generator,
-  Iterable, cast, BinaryIO
+  Sequence, cast, BinaryIO
 )
 
 from queue import Queue
 from collections import defaultdict
-from functools import partial, wraps
+from functools import partial, wraps, reduce
 import inspect
+import io
 import math
 import multiprocessing
 import itertools
@@ -17,7 +18,9 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
+import time
 
 import orjson
 import pathos.pools
@@ -29,10 +32,11 @@ from . import compression, paths, gcs
 from .exceptions import UnsupportedProtocolError, MD5IntegrityError, CRC32CIntegrityError
 from .lib import (
   mkdir, totalfn, toiter, scatter, jsonify, nvl,
-  duplicates, first, sip,
+  duplicates, first, sip, touch,
   md5, crc32c, decode_crc32c_b64
 )
-from .paths import ALIASES
+from .monitoring import TransmissionMonitor, IOEnum
+from .paths import ALIASES, find_common_buckets
 from .secrets import CLOUD_FILES_DIR, CLOUD_FILES_LOCK_DIR
 from .threaded_queue import ThreadedQueue, DEFAULT_THREADS
 from .typing import (
@@ -149,26 +153,42 @@ def parallel_execute(
   if platform.system().lower() == "darwin":
     os.environ["no_proxy"] = "*"
 
+  # Don't fork, spawn entirely new processes. This
+  # avoids accidental deadlocks.
+  multiprocessing.set_start_method("spawn", force=True)
+
   results = []
+  tms = []
   try:
     with pathos.pools.ProcessPool(parallel) as executor:
       for res in executor.imap(fn, sip(inputs, block_size)):
-        if isinstance(res, int):
-          pbar.update(res)
-        elif isinstance(res, list):
-          pbar.update(len(res))
+        update = res
+        if isinstance(res, tuple):
+          update = res[0]
+
+        if isinstance(update, int):
+          pbar.update(update)
+        elif isinstance(update, list):
+          pbar.update(len(update))
         else:
           pbar.update(block_size)
 
         if returns_list:
-          results.extend(res)
+          if isinstance(res, tuple):
+            results.extend(res[0])
+            tms.append(res[1])
+          else:
+            results.extend(res)
   finally:
     if platform.system().lower() == "darwin":
       os.environ["no_proxy"] = no_proxy
     pbar.close()
 
   if returns_list:
-    return results
+    if len(tms):
+      return (results, TransmissionMonitor.merge(tms))
+    else:
+      return results
 
 def get_interface_class(protocol):
   if protocol in INTERFACES:
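Note: per the hunk above, parallel_execute now accepts worker results that are
either a plain list or a (list, TransmissionMonitor) tuple, and merges the
monitors across processes. A minimal sketch of a conforming worker, with the
monitor signatures inferred only from this diff (fetch_one is a hypothetical
download helper, not part of the package):

    from cloudfiles.monitoring import TransmissionMonitor, IOEnum

    def worker(block):
      # return (items, tm) so parallel_execute can merge the monitors
      tm = TransmissionMonitor(IOEnum.RX)
      items = []
      for path in block:
        flight_id = tm.start_io(1)         # register an in-flight request
        data = fetch_one(path)             # hypothetical download helper
        tm.end_io(flight_id, len(data))    # record transmitted byte count
        items.append(data)
      return (items, tm)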
@@ -182,7 +202,7 @@ def path_to_byte_range_tags(path):
   if isinstance(path, str):
     return (path, None, None, None)
   return (path['path'], path.get('start', None), path.get('end', None), path.get('tags', None))
-
+
 def dl(
   cloudpaths:GetPathType, raw:bool=False, **kwargs
 ) -> Union[bytes,List[dict]]:
@@ -193,23 +213,8 @@ def dl(
   dict.
   """
   cloudpaths, is_multiple = toiter(cloudpaths, is_iter=True)
-  clustered = defaultdict(list)
-  total = 0
-  for path in cloudpaths:
-    pth = path
-    byte_range = None
-    if isinstance(path, dict):
-      pth = path["path"]
-      byte_range = path["byte_range"]
-
-    epath = paths.extract(pth)
-    bucketpath = paths.asbucketpath(epath)
-    clustered[bucketpath].append({
-      "path": epath.path,
-      "start": (byte_range[0] if byte_range else None), # type: ignore
-      "end": (byte_range[1] if byte_range else None), # type: ignore
-    })
-    total += 1
+  clustered = find_common_buckets(cloudpaths)
+  total = sum([ len(bucket) for bucket in clustered.values() ])
 
   progress = kwargs.get("progress", False) and total > 1
   pbar = tqdm(total=total, desc="Downloading", disable=(not progress))
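Note: the inlined bucket-clustering loop in dl() was factored out into
paths.find_common_buckets. Judging strictly from the code it replaces, it
groups inputs by bucket path and normalizes byte ranges into start/end keys.
A sketch of the expected shape (bucket and file names hypothetical):

    from cloudfiles.paths import find_common_buckets

    clustered = find_common_buckets([
      "gs://bucket/a.bin",
      { "path": "gs://bucket/b.bin", "byte_range": (0, 100) },
    ])
    # expected shape, per the removed loop:
    # { "gs://bucket": [
    #   { "path": "a.bin", "start": None, "end": None },
    #   { "path": "b.bin", "start": 0, "end": 100 },
    # ]}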
@@ -243,12 +248,55 @@ class CloudFiles:
   currently supports local filesystem, Google Cloud Storage,
   Amazon S3 interfaces, and reading from arbitrary HTTP
   servers.
+
+  cloudpath: a parent directory of the files you want to fetch
+    specified as e.g.:
+      gs://bucket/dir/
+      s3://bucket/dir/
+      s3://https://myendpoint.com/dir/
+      file://./dir
+      ./dir
+      https://some.host.edu/dir/
+      mem://bucket/dir
+    Key:
+      gs: Google Cloud Storage
+      s3: Amazon S3
+      file: Local Filesystem (including network mounts)
+      mem: In-Memory storage
+
+  progress: display a progress bar measured in files
+  green: whether to use green threads (uses the gevent library)
+  secrets: you can provide GCS, S3, CAVE, etc. credentials
+    via the constructor here instead of the default secrets files
+  num_threads: number of threads to launch for remote server
+    IO. No effect on local file fetching (always single threaded
+    for maximum performance).
+  use_https: use the public https API for GCS and S3 instead of
+    boto or google-storage-python
+  endpoint: for S3 emulators, you can provide a different endpoint
+    like https://s3-storage.university.edu. This can also be specified
+    in the secrets file.
+  parallel: number of separate processes to launch (each will use num_threads)
+  request_payer: bill your s3 usage to someone other than the bucket owner
+  locking: for local filesystems, use advisory file locking to prevent
+    separate cloudfiles instances from interfering with each other
+  lock_dir: you can specify your own directory for the advisory lock files
+  composite_upload_threshold: GCS and S3 both support multi-part uploads.
+    For files larger than this threshold, use that facility.
+  no_sign_request: (s3 only) don't sign the request with credentials
   """
   def __init__(
-    self, cloudpath:str, progress:bool = False,
-    green:Optional[bool] = None, secrets:SecretsType = None, num_threads:int = 20,
-    use_https:bool = False, endpoint:Optional[str] = None,
-    parallel:ParallelType = 1, request_payer:Optional[str] = None,
+    self,
+    cloudpath:str,
+    progress:bool = False,
+    green:Optional[bool] = None,
+    secrets:SecretsType = None,
+    num_threads:int = 20,
+    use_https:bool = False,
+    endpoint:Optional[str] = None,
+    parallel:ParallelType = 1,
+    request_payer:Optional[str] = None,
     locking:Optional[bool] = None,
    lock_dir:Optional[str] = None,
    composite_upload_threshold:int = int(1e8),
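Note: the constructor's behavior is unchanged apart from the path
normalization added below; the docstring now documents each parameter. A
typical instantiation might look like this (bucket name hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles(
      "gs://my-bucket/dataset/",  # hypothetical bucket
      progress=True,
      num_threads=20,             # the default
    )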
@@ -257,6 +305,8 @@ class CloudFiles:
     if use_https:
       cloudpath = paths.to_https_protocol(cloudpath)
 
+    cloudpath = paths.normalize(cloudpath)
+
     self.cloudpath = cloudpath
     self.progress = progress
     self.secrets = secrets
@@ -332,11 +382,16 @@ class CloudFiles:
 
   @parallelize(desc="Download", returns_list=True)
   def get(
-    self, paths:GetPathType, total:Optional[int] = None,
-    raw:bool = False, progress:Optional[bool] = None,
+    self,
+    paths:GetPathType,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
     parallel:Optional[ParallelType] = None,
-    return_dict:bool = False, raise_errors:bool = True,
-    part_size:Optional[int] = None
+    return_dict:bool = False,
+    raise_errors:bool = True,
+    part_size:Optional[int] = None,
+    return_recording:bool = False,
   ) -> Union[dict,bytes,List[dict]]:
     """
     Download one or more files. Return order is not guaranteed to match input.
@@ -362,6 +417,10 @@ class CloudFiles:
       extra information. Errors will be raised immediately.
     raise_errors: Raise the first error immediately instead
       of returning them as part of the output.
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
 
     Returns:
       if return_dict:
@@ -379,12 +438,18 @@ class CloudFiles:
           'raw': boolean,
         }
       ]
+
+      if return_recording:
+        return (ABOVE, TransmissionMonitor)
+      else:
+        return ABOVE
     """
     paths, multiple_return = toiter(paths, is_iter=True)
     progress = nvl(progress, self.progress)
     # return_dict prevents the user from having a chance
     # to inspect errors, so we must raise here.
     raise_errors = raise_errors or return_dict or (not multiple_return)
+    tm = TransmissionMonitor(IOEnum.RX)
 
     def check_md5(path, content, server_hash):
       if server_hash is None:
@@ -414,12 +479,17 @@ class CloudFiles:
       encoding = None
       server_hash = None
       server_hash_type = None
+      num_bytes_rx = 0
       try:
+        flight_id = tm.start_io(1)
+
         with self._get_connection() as conn:
           content, encoding, server_hash, server_hash_type = conn.get_file(
             path, start=start, end=end, part_size=part_size
           )
 
+        num_bytes_rx = len(content) if content is not None else 0
+
         # md5s don't match for partial reads
         if start is None and end is None:
           if server_hash_type == "md5":
@@ -431,6 +501,9 @@ class CloudFiles:
           content = compression.decompress(content, encoding, filename=path)
       except Exception as err:
         error = err
+        tm.end_error(flight_id)
+
+      tm.end_io(flight_id, num_bytes_rx)
 
       if raise_errors and error:
         raise error
@@ -450,11 +523,16 @@ class CloudFiles:
     if total == 1:
       ret = download(first(paths))
       if return_dict:
-        return { ret["path"]: ret["content"] }
+        ret = { ret["path"]: ret["content"] }
      elif multiple_return:
-        return [ ret ]
+        ret = [ ret ]
      else:
-        return ret['content']
+        ret = ret['content']
+
+      if return_recording:
+        return (ret, tm)
+      else:
+        return ret
 
     num_threads = self.num_threads
     if self.protocol == "file":
@@ -470,10 +548,14 @@ class CloudFiles:
       green=self.green,
     )
 
+    ret = results
     if return_dict:
-      return { res["path"]: res["content"] for res in results }
+      ret = { res["path"]: res["content"] for res in results }
+
+    if return_recording:
+      return (ret, tm)
 
-    return results
+    return ret
 
   def get_json(
     self, paths:GetPathType, total:Optional[int] = None
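Note: a sketch of the new recording path on get(), assuming the API exactly as
shown in this diff (paths hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    files, tm = cf.get([ "chunk_0.bin", "chunk_1.bin" ], return_recording=True)
    # tm is a TransmissionMonitor (IOEnum.RX) holding per-object start/end
    # times and transmitted byte counts in an interval tree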
@@ -520,12 +602,19 @@ class CloudFiles:
 
   @parallelize(desc="Upload")
   def puts(
-    self, files:PutType,
-    content_type:Optional[str] = None, compress:CompressType = None,
-    compression_level:Optional[int] = None, cache_control:Optional[str] = None,
-    total:Optional[int] = None, raw:bool = False, progress:Optional[bool] = None,
-    parallel:ParallelType = 1, storage_class:Optional[str] = None
-  ) -> int:
+    self,
+    files:PutType,
+    content_type:Optional[str] = None,
+    compress:CompressType = None,
+    compression_level:Optional[int] = None,
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
+  ) -> Union[int, tuple[int,TransmissionMonitor]]:
     """
     Writes one or more files at a given location.
 
@@ -560,11 +649,22 @@ class CloudFiles:
       function call. If progress is a string, it sets the
       text of the progress bar.
     parallel: number of concurrent processes (0 means all cores)
-
-    Returns: number of files uploaded
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
+
+    Returns:
+      N = number of files uploaded
+      tm = TransmissionMonitor
+      if return_recording:
+        return (N, tm)
+      else:
+        return N
     """
     files = toiter(files)
     progress = nvl(progress, self.progress)
+    tm = TransmissionMonitor(IOEnum.TX)
 
     def todict(file):
       if isinstance(file, tuple):
@@ -572,6 +672,7 @@ class CloudFiles:
       return file
 
     def uploadfn(file):
+      start_time = time.monotonic()
       file = todict(file)
 
       file_compress = file.get('compress', compress)
@@ -586,11 +687,19 @@ class CloudFiles:
         compress_level=file.get('compression_level', compression_level),
       )
 
+      num_bytes_tx = 0
+      if hasattr(content, "__len__"):
+        num_bytes_tx = len(content)
+      elif isinstance(content, io.IOBase):
+        num_bytes_tx = os.fstat(content.fileno()).st_size
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       if (
         self.protocol == "gs"
         and (
           (hasattr(content, "read") and hasattr(content, "seek"))
-          or (hasattr(content, "__len__") and len(content) > self.composite_upload_threshold)
+          or (num_bytes_tx > self.composite_upload_threshold)
         )
       ):
         gcs.composite_upload(
@@ -603,6 +712,7 @@ class CloudFiles:
           cache_control=cache_control,
           storage_class=storage_class,
          compress=file_compress,
+          skip_compress=True,
        )
        return
 
@@ -616,6 +726,8 @@ class CloudFiles:
         storage_class=file.get('storage_class', storage_class)
       )
 
+      tm.end_io(flight_id, num_bytes_tx)
+
     if not isinstance(files, (types.GeneratorType, zip)):
       dupes = duplicates([ todict(file)['path'] for file in files ])
       if dupes:
@@ -625,7 +737,10 @@ class CloudFiles:
 
     if total == 1:
       uploadfn(first(files))
-      return 1
+      if return_recording:
+        return (1,tm)
+      else:
+        return 1
 
     fns = ( partial(uploadfn, file) for file in files )
     desc = self._progress_description("Upload")
@@ -636,7 +751,11 @@ class CloudFiles:
       total=total,
       green=self.green,
     )
-    return len(results)
+
+    if return_recording:
+      return (len(results), tm)
+    else:
+      return len(results)
 
   def put(
     self,
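Note: the same recording convention applies to puts(). A sketch under the
assumed 6.0.0 API (file contents hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    ct, tm = cf.puts(
      [ ("hello.txt", b"hello"), ("world.txt", b"world") ],
      return_recording=True,
    )
    # ct == 2; tm is a TransmissionMonitor (IOEnum.TX)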
@@ -674,9 +793,13 @@ class CloudFiles:
     self, files:PutType,
     compress:CompressType = None,
     compression_level:Optional[int] = None,
-    cache_control:Optional[str] = None, total:Optional[int] = None,
-    raw:bool = False, progress:Optional[bool] = None, parallel:ParallelType = 1,
-    storage_class:Optional[str] = None
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
   ) -> int:
     """
     Write one or more files as JSON.
@@ -705,7 +828,7 @@ class CloudFiles:
       compress=compress, compression_level=compression_level,
       content_type='application/json', storage_class=storage_class,
       total=total, raw=raw,
-      progress=progress, parallel=parallel
+      progress=progress, parallel=parallel, return_recording=return_recording,
     )
 
   def put_json(
@@ -755,9 +878,11 @@ class CloudFiles:
       return True
     elif prefix[-1] == "/":
       return True
-
-    res = first(self.list(prefix=prefix))
-    return res is not None
+    try:
+      res = first(self.list(prefix=prefix))
+      return res is not None
+    except NotImplementedError as err:
+      return not CloudFile(self.cloudpath).exists()
 
   def exists(
     self, paths:GetPathType,
@@ -852,8 +977,10 @@ class CloudFiles:
 
   def size(
     self, paths:GetPathType,
-    total:Optional[int] = None, progress:Optional[bool] = None
-  ) -> Union[Dict[str,int],List[Dict[str,int]]]:
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+    return_sum:bool = False,
+  ) -> Union[Dict[str,int],List[Dict[str,int]],int]:
     """
     Get the size in bytes of one or more files in its stored state.
     """
@@ -874,10 +1001,47 @@ class CloudFiles:
       green=self.green,
     )
 
+    if return_sum:
+      return sum(( sz for sz in results.values() ))
+
     if return_multiple:
       return results
     return first(results.values())
 
+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+      with lock:
+        total_files += subtree_files
+        total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
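Note: subtree_size fans one per-prefix size query out across threads and
aggregates the counts under a lock. Per the return statement above, a call
looks like this (prefix hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/dataset/")  # hypothetical bucket
    stats = cf.subtree_size("images")
    # e.g. { "N": 1204, "num_bytes": 73400320 }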
@@ -919,6 +1083,60 @@ class CloudFiles:
     )
     return len(results)
 
+  def touch(
+    self,
+    paths:GetPathType,
+    progress:Optional[bool] = None,
+    total:Optional[int] = None,
+    nocopy:bool = False,
+  ):
+    """
+    Create a zero byte file if it doesn't exist.
+    """
+    paths = toiter(paths)
+    progress = nvl(progress, self.progress)
+    total = totalfn(paths, total)
+
+    if self.protocol == "file":
+      basepath = self.cloudpath.replace("file://", "")
+      for path in tqdm(paths, disable=(not progress), total=total):
+        pth = path
+        if isinstance(path, dict):
+          pth = path["path"]
+        touch(self.join(basepath, pth))
+      return
+
+    results = self.exists(paths, total=total, progress=progress)
+
+    dne = [
+      (fname, b'')
+      for fname, exists in results.items()
+      if not exists
+    ]
+
+    self.puts(dne, progress=progress)
+
+    # def thunk_copy(path):
+    #   with self._get_connection() as conn:
+    #     conn.copy_file(path, self._path.bucket, self.join(self._path.path, path))
+    #   return 1
+
+    # if not nocopy:
+    #   already_exists = (
+    #     fname
+    #     for fname, exists in results.items()
+    #     if exists
+    #   )
+
+    #   results = schedule_jobs(
+    #     fns=( partial(thunk_copy, path) for path in already_exists ),
+    #     progress=progress,
+    #     total=(total - len(dne)),
+    #     concurrency=self.num_threads,
+    #     green=self.green,
+    #     count_return=True,
+    #   )
+
   def list(
     self, prefix:str = "", flat:bool = False
   ) -> Generator[str,None,None]:
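Note: touch() writes zero byte files directly on the local filesystem and, for
remote protocols, checks existence and uploads empty objects only for the
missing keys. A sketch (keys hypothetical):

    from cloudfiles import CloudFiles

    cf = CloudFiles("gs://my-bucket/flags/")  # hypothetical bucket
    cf.touch([ "done_0", "done_1" ])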
@@ -953,7 +1171,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-  ) -> None:
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from this CloudFiles storage
     to the destination CloudFiles in batches sized
@@ -969,7 +1189,7 @@ class CloudFiles:
     - gs->gs: Uses GCS copy API to minimize data movement
     - s3->s3: Uses boto s3 copy API to minimize data movement
 
-    cf_src: another CloudFiles instance or cloudpath
+    cf_dest: another CloudFiles instance or cloudpath
     paths: if None transfer all files from src, else if
       an iterable, transfer only these files.
 
@@ -987,6 +1207,11 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_dest, str):
       cf_dest = CloudFiles(
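Note: transfer_to and transfer_from now return a TransmissionMonitor and
accept progress and resumable arguments. Per the docstring, resumable
currently applies only to https->file. A sketch (destination hypothetical):

    from cloudfiles import CloudFiles

    src = CloudFiles("https://some.host.edu/dir/")
    tm = src.transfer_to("file://./mirror", resumable=True)  # hypothetical dest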
@@ -997,7 +1222,8 @@ class CloudFiles:
     return cf_dest.transfer_from(
       self, paths, block_size,
       reencode, content_type,
-      allow_missing,
+      allow_missing,
+      progress, resumable,
     )
 
   def transfer_from(
@@ -1008,7 +1234,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-  ) -> None:
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from the source CloudFiles storage
     to this CloudFiles in batches sized in the
@@ -1042,6 +1270,10 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_src, str):
       cf_src = CloudFiles(
@@ -1054,22 +1286,40 @@ class CloudFiles:
 
     total = totalfn(paths, None)
 
-    with tqdm(desc="Transferring", total=total, disable=(not self.progress)) as pbar:
+    disable = progress
+    if disable is None:
+      disable = self.progress
+    if disable is None:
+      disable = False
+    else:
+      disable = not disable
+
+    with tqdm(desc="Transferring", total=total, disable=disable) as pbar:
       if (
         cf_src.protocol == "file"
         and self.protocol == "file"
         and reencode is None
       ):
-        self.__transfer_file_to_file(
+        return self.__transfer_file_to_file(
           cf_src, self, paths, total,
           pbar, block_size, allow_missing
         )
+      elif (
+        cf_src.protocol != "file"
+        and self.protocol == "file"
+        and reencode is None
+      ):
+        return self.__transfer_remote_to_file(
+          cf_src, self, paths, total,
+          pbar, block_size, content_type,
+          allow_missing, resumable,
+        )
       elif (
         cf_src.protocol == "file"
         and self.protocol != "file"
         and reencode is None
       ):
-        self.__transfer_file_to_remote(
+        return self.__transfer_file_to_remote(
          cf_src, self, paths, total,
          pbar, block_size, content_type,
          allow_missing,
@@ -1085,13 +1335,13 @@ class CloudFiles:
         )
         and reencode is None
       ):
-        self.__transfer_cloud_internal(
+        return self.__transfer_cloud_internal(
           cf_src, self, paths,
           total, pbar, block_size,
           allow_missing,
         )
       else:
-        self.__transfer_general(
+        return self.__transfer_general(
           cf_src, self, paths, total,
           pbar, block_size,
           reencode, content_type,
@@ -1103,7 +1353,7 @@ class CloudFiles:
     total, pbar, block_size,
     reencode, content_type,
     allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     Downloads the file into RAM, transforms
     the data, and uploads it. This is the slowest and
@@ -1112,6 +1362,7 @@ class CloudFiles:
     pair of endpoints as well as transcoding compression
     formats.
     """
+    upload_tms = []
     for block_paths in sip(paths, block_size):
       for path in block_paths:
         if isinstance(path, dict):
@@ -1135,26 +1386,32 @@ class CloudFiles:
             item["path"] = item["tags"]["dest_path"]
             del item["tags"]["dest_path"]
           yield item
-      self.puts(
+      (ct, batch_tm) = self.puts(
         renameiter(),
         raw=True,
         progress=False,
         compress=reencode,
         content_type=content_type,
+        return_recording=True,
       )
       pbar.update(len(block_paths))
+      upload_tms.append(batch_tm)
+
+    return TransmissionMonitor.merge(upload_tms)
 
   def __transfer_file_to_file(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     shutil.copyfile, starting in Python 3.8, uses
     special OS kernel functions to accelerate file copies
     """
+    tm = TransmissionMonitor(IOEnum.TX)
     srcdir = cf_src.cloudpath.replace("file://", "")
     destdir = mkdir(cf_dest.cloudpath.replace("file://", ""))
     for path in paths:
+      start_time = time.monotonic()
       if isinstance(path, dict):
         src = os.path.join(srcdir, path["path"])
         dest = os.path.join(destdir, path["dest_path"])
@@ -1168,6 +1425,15 @@ class CloudFiles:
       if dest_ext_compress != dest_ext:
         dest += dest_ext_compress
 
+      num_bytes_tx = 0
+      try:
+        if src:
+          num_bytes_tx = os.path.getsize(src)
+      except FileNotFoundError:
+        pass
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       try:
         shutil.copyfile(src, dest) # avoids user space
       except FileNotFoundError:
@@ -1175,10 +1441,55 @@ class CloudFiles:
           with open(dest, "wb") as f:
             f.write(b'')
         else:
+          tm.end_error(flight_id)
           raise
+      finally:
+        tm.end_io(flight_id, num_bytes_tx)
 
       pbar.update(1)
 
+    return tm
+
+  def __transfer_remote_to_file(
+    self, cf_src, cf_dest, paths,
+    total, pbar, block_size, content_type,
+    allow_missing, resumable,
+  ) -> TransmissionMonitor:
+
+    tm = TransmissionMonitor(IOEnum.RX)
+
+    def thunk_save(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
+      with cf_src._get_connection() as conn:
+        if isinstance(key, dict):
+          dest_key = key.get("dest_path", key["path"])
+          src_key = key["path"]
+        else:
+          src_key = key
+          dest_key = key
+
+        dest_key = os.path.join(cf_dest._path.path, dest_key)
+        (found, num_bytes_rx) = conn.save_file(src_key, dest_key, resumable=resumable)
+
+      tm.end_io(flight_id, num_bytes_rx)
+
+      if found == False and not allow_missing:
+        tm.end_error(flight_id)
+        raise FileNotFoundError(src_key)
+
+      return int(found)
+
+    schedule_jobs(
+      fns=( partial(thunk_save, path) for path in paths ),
+      progress=pbar,
+      concurrency=self.num_threads,
+      total=totalfn(paths, total),
+      green=self.green,
+      count_return=True,
+    )
+    return tm
+
   def __transfer_file_to_remote(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, content_type,
@@ -1189,6 +1500,7 @@ class CloudFiles:
     so that GCS and S3 can do low-memory chunked multi-part
     uploads if necessary.
     """
+    tms = []
     srcdir = cf_src.cloudpath.replace("file://", "")
     for block_paths in sip(paths, block_size):
       to_upload = []
@@ -1211,18 +1523,30 @@ class CloudFiles:
         else:
           raise
 
+        if dest_path == '':
+          dest_path = src_path
+
         to_upload.append({
           "path": dest_path,
           "content": handle,
           "compress": encoding,
         })
-      cf_dest.puts(to_upload, raw=True, progress=False, content_type=content_type)
+      (ct, batch_tm) = cf_dest.puts(
+        to_upload,
+        raw=True,
+        progress=False,
+        content_type=content_type,
+        return_recording=True,
+      )
       for item in to_upload:
         handle = item["content"]
         if hasattr(handle, "close"):
           handle.close()
+      tms.append(batch_tm)
       pbar.update(len(block_paths))
 
+    return TransmissionMonitor.merge(tms)
+
   def __transfer_cloud_internal(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
@@ -1235,7 +1559,11 @@ class CloudFiles:
     of the cloud, this is much slower and more expensive
     than necessary.
     """
+    tm = TransmissionMonitor(IOEnum.TX)
+
     def thunk_copy(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
       with cf_src._get_connection() as conn:
         if isinstance(key, dict):
          dest_key = key.get("dest_path", key["path"])
@@ -1245,14 +1573,17 @@ class CloudFiles:
          src_key = key["path"]
 
        dest_key = posixpath.join(cf_dest._path.path, dest_key)
-        found = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+        (found, num_bytes_tx) = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+
+      tm.end_io(flight_id, num_bytes_tx)
 
      if found == False and not allow_missing:
+        tm.end_error(flight_id)
        raise FileNotFoundError(src_key)
 
      return int(found)
 
-    results = schedule_jobs(
+    schedule_jobs(
      fns=( partial(thunk_copy, path) for path in paths ),
      progress=pbar,
      concurrency=self.num_threads,
@@ -1260,7 +1591,100 @@ class CloudFiles:
       green=self.green,
       count_return=True,
     )
-    return len(results)
+    return tm
+
+  def move(self, src:str, dest:str):
+    """Move (rename) src to dest.
+
+    src and dest do not have to be on the same filesystem.
+    """
+    epath = paths.extract(dest)
+    full_cloudpath = paths.asprotocolpath(epath)
+    dest_cloudpath = paths.dirname(full_cloudpath)
+    base_dest = paths.basename(full_cloudpath)
+
+    return self.moves(dest_cloudpath, [
+      (src, base_dest)
+    ], block_size=1, progress=False)
+
+  def moves(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str, str]]],
+    block_size:int = 64,
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+  ):
+    """
+    Move (rename) files.
+
+    pairs: [ (src, dest), (src, dest), ... ]
+    """
+    if isinstance(cf_dest, str):
+      cf_dest = CloudFiles(
+        cf_dest, progress=False,
+        green=self.green, num_threads=self.num_threads,
+      )
+
+    total = totalfn(paths, total)
+
+    disable = not (self.progress if progress is None else progress)
+
+    if self.protocol == "file" and cf_dest.protocol == "file":
+      self.__moves_file_to_file(
+        cf_dest, paths, total,
+        disable, block_size
+      )
+      return
+
+    pbar = tqdm(total=total, disable=disable, desc="Moving")
+
+    with pbar:
+      for subpairs in sip(paths, block_size):
+        subpairs = [
+          ((pair, pair) if isinstance(pair, str) else pair)
+          for pair in subpairs
+        ]
+
+        self.transfer_to(cf_dest, paths=(
+          {
+            "path": src,
+            "dest_path": dest,
+          }
+          for src, dest in subpairs
+        ), progress=False)
+        self.delete(( src for src, dest in subpairs ), progress=False)
+        pbar.update(len(subpairs))
+
+  def __moves_file_to_file(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str,str]]],
+    total:Optional[int],
+    disable:bool,
+    block_size:int,
+  ):
+    for pair in tqdm(paths, total=total, disable=disable, desc="Moving"):
+      if isinstance(pair, str):
+        src = pair
+        dest = pair
+      else:
+        (src, dest) = pair
+
+      src = self.join(self.cloudpath, src).replace("file://", "")
+      dest = cf_dest.join(cf_dest.cloudpath, dest).replace("file://", "")
+
+      if os.path.isdir(dest):
+        dest = cf_dest.join(dest, os.path.basename(src))
+      else:
+        mkdir(os.path.dirname(dest))
+
+      src, encoding = FileInterface.get_encoded_file_path(src)
+      _, dest_ext = os.path.splitext(dest)
+      dest_ext_compress = FileInterface.get_extension(encoding)
+      if dest_ext_compress != dest_ext:
+        dest += dest_ext_compress
+      shutil.move(src, dest)
 
   def join(self, *paths:str) -> str:
     """
@@ -1277,6 +1701,22 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
 
+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
+  def dirname(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.dirname(path)
+    return posixpath.dirname(path)
+
+  def basename(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.basename(path)
+    return posixpath.basename(path)
+
   def __getitem__(self, key) -> Union[dict,bytes,List[dict]]:
     if isinstance(key, tuple) and len(key) == 2 and isinstance(key[1], slice) and isinstance(key[0], str):
       return self.get({ 'path': key[0], 'start': key[1].start, 'end': key[1].stop })
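Note: sep, dirname, and basename select os.path semantics for the file
protocol and posixpath semantics otherwise. A sketch:

    from cloudfiles import CloudFiles

    CloudFiles("gs://my-bucket/dir").dirname("a/b/c.bin")  # -> "a/b"
    CloudFiles("./dir").sep                                # os.sep locally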
@@ -1307,11 +1747,17 @@ class CloudFiles:
 
 class CloudFile:
   def __init__(
-    self, path:str, cache_meta:bool = False,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
@@ -1320,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)
 
@@ -1327,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None
 
+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
@@ -1440,6 +1894,22 @@ class CloudFile:
       reencode=reencode,
     )
 
+  def join(self, *args):
+    return self.cf.join(*args)
+
+  def dirname(self, *args):
+    return self.cf.dirname(*args)
+
+  def basename(self, *args):
+    return self.cf.basename(*args)
+
+  def touch(self):
+    return self.cf.touch(self.filename)
+
+  def move(self, dest):
+    """Move (rename) this file to dest."""
+    return self.cf.move(self.filename, dest)
+
   def __len__(self):
     return self.size()
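Note: CloudFile gains thin wrappers delegating to the new CloudFiles methods.
A sketch (path hypothetical):

    from cloudfiles import CloudFile

    f = CloudFile("gs://my-bucket/dataset/a.bin")  # hypothetical path
    f.touch()                                      # ensure it exists
    f.move("gs://my-bucket/dataset/b.bin")         # rename via transfer+delete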