cloud-files 5.5.0__py3-none-any.whl → 5.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/METADATA +73 -2
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/RECORD +12 -11
- cloud_files-5.6.1.dist-info/pbr.json +1 -0
- cloudfiles/cloudfiles.py +245 -53
- cloudfiles/interfaces.py +55 -38
- cloudfiles/monitoring.py +724 -0
- cloudfiles_cli/cloudfiles_cli.py +87 -14
- cloud_files-5.5.0.dist-info/pbr.json +0 -1
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/AUTHORS +0 -0
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/LICENSE +0 -0
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/WHEEL +0 -0
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/entry_points.txt +0 -0
- {cloud_files-5.5.0.dist-info → cloud_files-5.6.1.dist-info}/top_level.txt +0 -0
cloudfiles/cloudfiles.py
CHANGED
@@ -7,8 +7,9 @@ from typing import (
 
 from queue import Queue
 from collections import defaultdict
-from functools import partial, wraps
+from functools import partial, wraps, reduce
 import inspect
+import io
 import math
 import multiprocessing
 import itertools

@@ -18,6 +19,7 @@ import posixpath
 import re
 import shutil
 import types
+import time
 
 import orjson
 import pathos.pools

@@ -32,6 +34,7 @@ from .lib import (
   duplicates, first, sip, touch,
   md5, crc32c, decode_crc32c_b64
 )
+from .monitoring import TransmissionMonitor, IOEnum
 from .paths import ALIASES, find_common_buckets
 from .secrets import CLOUD_FILES_DIR, CLOUD_FILES_LOCK_DIR
 from .threaded_queue import ThreadedQueue, DEFAULT_THREADS
@@ -154,25 +157,37 @@ def parallel_execute(
     multiprocessing.set_start_method("spawn", force=True)
 
   results = []
+  tms = []
   try:
     with pathos.pools.ProcessPool(parallel) as executor:
       for res in executor.imap(fn, sip(inputs, block_size)):
-        if isinstance(res, int):
-          pbar.update(res)
-        elif isinstance(res, list):
-          pbar.update(len(res))
+        update = res
+        if isinstance(res, tuple):
+          update = res[0]
+
+        if isinstance(update, int):
+          pbar.update(update)
+        elif isinstance(update, list):
+          pbar.update(len(update))
         else:
           pbar.update(block_size)
 
         if returns_list:
-          results.extend(res)
+          if isinstance(res, tuple):
+            results.extend(res[0])
+            tms.append(res[1])
+          else:
+            results.extend(res)
   finally:
     if platform.system().lower() == "darwin":
       os.environ["no_proxy"] = no_proxy
     pbar.close()
 
   if returns_list:
-    return results
+    if len(tms):
+      return (results, TransmissionMonitor.merge(tms))
+    else:
+      return results
 
 def get_interface_class(protocol):
   if protocol in INTERFACES:
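Note: parallel_execute now accepts worker results shaped either as a plain list or as a (list, TransmissionMonitor) tuple, and merges the per-process monitors before returning. A minimal sketch of the recording API as it surfaces in this diff (monitoring.py itself is not shown, so treat the exact signatures as assumptions):

from cloudfiles.monitoring import TransmissionMonitor, IOEnum

tms = []
for payload_size in (1024, 2048, 4096):
  tm = TransmissionMonitor(IOEnum.RX)    # RX for downloads, TX for uploads
  flight_id = tm.start_io(payload_size)  # open an in-flight interval
  tm.end_io(flight_id, payload_size)     # close it with the bytes moved
  tms.append(tm)

merged = TransmissionMonitor.merge(tms)  # combine per-process recordings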
@@ -232,12 +247,55 @@ class CloudFiles:
   currently supports local filesystem, Google Cloud Storage,
   Amazon S3 interfaces, and reading from arbitrary HTTP
   servers.
+
+  cloudpath: a parent directory of the files you want to fetch
+    specified as:
+      e.g. gs://bucket/dir/
+           s3://bucket/dir/
+           s3://https://myendpoint.com/dir/
+           file://./dir
+           ./dir
+           https://some.host.edu/dir/
+           mem://bucket/dir
+    Key:
+      gs: Google Cloud Storage
+      s3: Amazon S3
+      file: Local Filesystem (including network mounts)
+      mem: In-Memory storage
+
+  progress: display progress bar measured in files
+  green: whether to use green threads (uses gevent library)
+  secrets: you can provide GCS, S3, CAVE, etc credentials
+    via the constructor here instead of the default secrets
+    files
+  num_threads: number of threads to launch for remote server
+    IO. No effect on local file fetching (always single threaded
+    for maximum performance).
+  use_https: use the public https API for GCS and S3 instead of
+    boto or google-storage-python
+  endpoint: for S3 emulators, you can provide a different endpoint
+    like https://s3-storage.university.edu. This can also be specified
+    in the secrets file.
+  parallel: number of separate processes to launch (each will use num_threads)
+  request_payer: bill your s3 usage to someone other than the bucket owner
+  locking: for local filesystems, you can use advisory file locking to avoid
+    separate cloudfiles instances from interfering with each other
+  lock_dir: you can specify your own directory for the advisory lock files
+  composite_upload_threshold: GCS and S3 both support multi-part uploads.
+    For files larger than this threshold, use that facility.
+  no_sign_request: (s3 only) don't sign the request with credentials
   """
   def __init__(
-    self,
-
-
-
+    self,
+    cloudpath:str,
+    progress:bool = False,
+    green:Optional[bool] = None,
+    secrets:SecretsType = None,
+    num_threads:int = 20,
+    use_https:bool = False,
+    endpoint:Optional[str] = None,
+    parallel:ParallelType = 1,
+    request_payer:Optional[str] = None,
     locking:Optional[bool] = None,
     lock_dir:Optional[str] = None,
     composite_upload_threshold:int = int(1e8),
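The constructor docstring now documents the supported protocols and tuning knobs inline. A short usage sketch; bucket and directory names are hypothetical:

from cloudfiles import CloudFiles

cf_gs = CloudFiles("gs://my-bucket/dataset/")   # Google Cloud Storage
cf_fs = CloudFiles("file://./dataset")          # local filesystem
cf_mem = CloudFiles("mem://scratch/dataset")    # in-memory storage

cf_s3 = CloudFiles(
  "s3://my-bucket/dataset/",
  num_threads=20,                       # threads used for remote server IO
  parallel=1,                           # processes, each using num_threads
  composite_upload_threshold=int(1e8),  # larger files use multi-part upload
)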
@@ -323,11 +381,16 @@ class CloudFiles:
 
   @parallelize(desc="Download", returns_list=True)
   def get(
-    self,
-
+    self,
+    paths:GetPathType,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
     parallel:Optional[ParallelType] = None,
-    return_dict:bool = False,
-
+    return_dict:bool = False,
+    raise_errors:bool = True,
+    part_size:Optional[int] = None,
+    return_recording:bool = False,
   ) -> Union[dict,bytes,List[dict]]:
     """
     Download one or more files. Return order is not guaranteed to match input.

@@ -353,6 +416,10 @@ class CloudFiles:
       extra information. Errors will be raised immediately.
     raise_errors: Raise the first error immediately instead
       of returning them as part of the output.
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
 
     Returns:
       if return_dict:

@@ -370,12 +437,18 @@ class CloudFiles:
         'raw': boolean,
       }
     ]
+
+    if return_recording:
+      return (ABOVE, TransmissionMonitor)
+    else:
+      return ABOVE
     """
     paths, multiple_return = toiter(paths, is_iter=True)
     progress = nvl(progress, self.progress)
     # return_dict prevents the user from having a chance
     # to inspect errors, so we must raise here.
     raise_errors = raise_errors or return_dict or (not multiple_return)
+    tm = TransmissionMonitor(IOEnum.RX)
 
     def check_md5(path, content, server_hash):
       if server_hash is None:

@@ -405,12 +478,17 @@ class CloudFiles:
       encoding = None
       server_hash = None
       server_hash_type = None
+      num_bytes_rx = 0
       try:
+        flight_id = tm.start_io(1)
+
         with self._get_connection() as conn:
           content, encoding, server_hash, server_hash_type = conn.get_file(
             path, start=start, end=end, part_size=part_size
           )
 
+        num_bytes_rx = len(content) if content is not None else 0
+
         # md5s don't match for partial reads
         if start is None and end is None:
           if server_hash_type == "md5":

@@ -422,6 +500,9 @@ class CloudFiles:
           content = compression.decompress(content, encoding, filename=path)
       except Exception as err:
         error = err
+        tm.end_error(flight_id)
+
+      tm.end_io(flight_id, num_bytes_rx)
 
       if raise_errors and error:
         raise error

@@ -441,11 +522,16 @@ class CloudFiles:
     if total == 1:
       ret = download(first(paths))
       if return_dict:
-
+        ret = { ret["path"]: ret["content"] }
       elif multiple_return:
-
+        ret = [ ret ]
+      else:
+        ret = ret['content']
+
+      if return_recording:
+        return (ret, tm)
       else:
-        return ret
+        return ret
 
     num_threads = self.num_threads
     if self.protocol == "file":

@@ -461,10 +547,14 @@ class CloudFiles:
       green=self.green,
     )
 
+    ret = results
     if return_dict:
-
+      ret = { res["path"]: res["content"] for res in results }
+
+    if return_recording:
+      return (ret, tm)
 
-    return
+    return ret
 
   def get_json(
     self, paths:GetPathType, total:Optional[int] = None
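Per the updated docstring, passing return_recording=True makes get() return a (results, TransmissionMonitor) pair instead of the results alone. A usage sketch; the bucket and file names are hypothetical:

from cloudfiles import CloudFiles

cf = CloudFiles("gs://my-bucket/dataset/")

# ordinary download: a list of { 'path': ..., 'content': ..., ... } dicts
results = cf.get(["info", "chunk_0.bin"])

# with recording: also returns the monitor holding each object's
# transfer interval and received byte count (measured before decompression)
results, tm = cf.get(["info", "chunk_0.bin"], return_recording=True)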
@@ -511,12 +601,19 @@ class CloudFiles:
 
   @parallelize(desc="Upload")
   def puts(
-    self,
-
-
-
-
-
+    self,
+    files:PutType,
+    content_type:Optional[str] = None,
+    compress:CompressType = None,
+    compression_level:Optional[int] = None,
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
+  ) -> Union[int, tuple[int,TransmissionMonitor]]:
     """
     Writes one or more files at a given location.
 

@@ -551,11 +648,22 @@ class CloudFiles:
       function call. If progress is a string, it sets the
       text of the progress bar.
     parallel: number of concurrent processes (0 means all cores)
-
-
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
+
+    Returns:
+      N = number of files uploaded
+      tm = TransmissionMonitor
+      if return_recording:
+        return (N, tm)
+      else:
+        return N
     """
     files = toiter(files)
     progress = nvl(progress, self.progress)
+    tm = TransmissionMonitor(IOEnum.TX)
 
     def todict(file):
       if isinstance(file, tuple):

@@ -563,6 +671,7 @@ class CloudFiles:
       return file
 
     def uploadfn(file):
+      start_time = time.monotonic()
       file = todict(file)
 
       file_compress = file.get('compress', compress)

@@ -577,11 +686,19 @@ class CloudFiles:
         compress_level=file.get('compression_level', compression_level),
       )
 
+      num_bytes_tx = 0
+      if hasattr(content, "__len__"):
+        num_bytes_tx = len(content)
+      elif isinstance(content, io.IOBase):
+        num_bytes_tx = os.fstat(content.fileno()).st_size
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       if (
         self.protocol == "gs"
         and (
           (hasattr(content, "read") and hasattr(content, "seek"))
-          or (
+          or (num_bytes_tx > self.composite_upload_threshold)
         )
       ):
         gcs.composite_upload(
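The upload path above probes the payload size before transmission: len() for bytes-like content and os.fstat on the descriptor for real file handles. A standalone sketch of that probe, mirroring the diff's logic (note an in-memory io.BytesIO passes the IOBase check yet has no OS-level fileno):

import io
import os

def payload_size(content) -> int:
  if hasattr(content, "__len__"):       # bytes, bytearray, str report length
    return len(content)
  elif isinstance(content, io.IOBase):  # real file handles measured via the OS
    return os.fstat(content.fileno()).st_size
  return 0

assert payload_size(b"hello") == 5
with open(__file__, "rb") as f:
  print(payload_size(f))  # size of this script on disk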
@@ -608,6 +725,8 @@ class CloudFiles:
         storage_class=file.get('storage_class', storage_class)
       )
 
+      tm.end_io(flight_id, num_bytes_tx)
+
     if not isinstance(files, (types.GeneratorType, zip)):
       dupes = duplicates([ todict(file)['path'] for file in files ])
       if dupes:

@@ -617,7 +736,10 @@ class CloudFiles:
 
     if total == 1:
       uploadfn(first(files))
-      return 1
+      if return_recording:
+        return (1,tm)
+      else:
+        return 1
 
     fns = ( partial(uploadfn, file) for file in files )
     desc = self._progress_description("Upload")

@@ -628,7 +750,11 @@ class CloudFiles:
       total=total,
       green=self.green,
     )
-    return len(results)
+
+    if return_recording:
+      return (len(results), tm)
+    else:
+      return len(results)
 
   def put(
     self,
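puts() mirrors get(): it still returns the number of files uploaded, and with return_recording=True it returns (N, TransmissionMonitor). A usage sketch; bucket and paths are hypothetical:

from cloudfiles import CloudFiles

cf = CloudFiles("s3://my-bucket/out/")

files = [
  { "path": "a.bin", "content": b"\x00" * 100 },
  { "path": "b.bin", "content": b"\x01" * 200 },
]

n = cf.puts(files, compress="gzip")  # returns 2
n, tm = cf.puts(files, compress="gzip", return_recording=True)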
@@ -666,9 +792,13 @@ class CloudFiles:
     self, files:PutType,
     compress:CompressType = None,
     compression_level:Optional[int] = None,
-    cache_control:Optional[str] = None,
-
-
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
   ) -> int:
     """
     Write one or more files as JSON.

@@ -697,7 +827,7 @@ class CloudFiles:
       compress=compress, compression_level=compression_level,
       content_type='application/json', storage_class=storage_class,
       total=total, raw=raw,
-      progress=progress, parallel=parallel
+      progress=progress, parallel=parallel, return_recording=return_recording,
     )
 
   def put_json(
@@ -1008,7 +1138,7 @@ class CloudFiles:
     allow_missing:bool = False,
     progress:Optional[bool] = None,
     resumable:bool = False,
-  ) ->
+  ) -> TransmissionMonitor:
     """
     Transfer all files from this CloudFiles storage
     to the destination CloudFiles in batches sized

@@ -1071,7 +1201,7 @@ class CloudFiles:
     allow_missing:bool = False,
     progress:Optional[bool] = None,
     resumable:bool = False,
-  ) ->
+  ) -> TransmissionMonitor:
     """
     Transfer all files from the source CloudFiles storage
     to this CloudFiles in batches sized in the
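Both transfer entry points are now annotated to return a TransmissionMonitor, assembled by whichever strategy handles the transfer (file-to-file, remote-to-file, file-to-remote, cloud-internal, or general). A hedged usage sketch; the argument forms follow the existing transfer_to/transfer_from API and the cloudpaths are hypothetical:

from cloudfiles import CloudFiles

src = CloudFiles("gs://my-bucket/raw/")
tm = src.transfer_to("file:///tmp/mirror")  # monitor for the whole transfer

dest = CloudFiles("file:///tmp/mirror2")
tm2 = dest.transfer_from("gs://my-bucket/raw/")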
@@ -1135,7 +1265,7 @@ class CloudFiles:
       and self.protocol == "file"
       and reencode is None
     ):
-      self.__transfer_file_to_file(
+      return self.__transfer_file_to_file(
         cf_src, self, paths, total,
         pbar, block_size, allow_missing
       )

@@ -1144,7 +1274,7 @@ class CloudFiles:
       and self.protocol == "file"
       and reencode is None
     ):
-      self.__transfer_remote_to_file(
+      return self.__transfer_remote_to_file(
         cf_src, self, paths, total,
         pbar, block_size, content_type,
         allow_missing, resumable,

@@ -1154,7 +1284,7 @@ class CloudFiles:
       and self.protocol != "file"
       and reencode is None
     ):
-      self.__transfer_file_to_remote(
+      return self.__transfer_file_to_remote(
         cf_src, self, paths, total,
         pbar, block_size, content_type,
         allow_missing,

@@ -1170,13 +1300,13 @@ class CloudFiles:
       )
       and reencode is None
     ):
-      self.__transfer_cloud_internal(
+      return self.__transfer_cloud_internal(
         cf_src, self, paths,
         total, pbar, block_size,
         allow_missing,
       )
     else:
-      self.__transfer_general(
+      return self.__transfer_general(
         cf_src, self, paths, total,
         pbar, block_size,
         reencode, content_type,

@@ -1188,7 +1318,7 @@ class CloudFiles:
     total, pbar, block_size,
     reencode, content_type,
     allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     Downloads the file into RAM, transforms
     the data, and uploads it. This is the slowest and

@@ -1197,6 +1327,7 @@ class CloudFiles:
     pair of endpoints as well as transcoding compression
     formats.
     """
+    upload_tms = []
     for block_paths in sip(paths, block_size):
       for path in block_paths:
         if isinstance(path, dict):
@@ -1220,26 +1351,32 @@ class CloudFiles:
           item["path"] = item["tags"]["dest_path"]
           del item["tags"]["dest_path"]
           yield item
-      self.puts(
+      (ct, batch_tm) = self.puts(
         renameiter(),
         raw=True,
         progress=False,
         compress=reencode,
         content_type=content_type,
+        return_recording=True,
       )
       pbar.update(len(block_paths))
+      upload_tms.append(batch_tm)
+
+    return TransmissionMonitor.merge(upload_tms)
 
   def __transfer_file_to_file(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     shutil.copyfile, starting in Python 3.8, uses
     special OS kernel functions to accelerate file copies
     """
+    tm = TransmissionMonitor(IOEnum.TX)
     srcdir = cf_src.cloudpath.replace("file://", "")
     destdir = mkdir(cf_dest.cloudpath.replace("file://", ""))
     for path in paths:
+      start_time = time.monotonic()
       if isinstance(path, dict):
         src = os.path.join(srcdir, path["path"])
         dest = os.path.join(destdir, path["dest_path"])
@@ -1253,6 +1390,15 @@ class CloudFiles:
       if dest_ext_compress != dest_ext:
         dest += dest_ext_compress
 
+      num_bytes_tx = 0
+      try:
+        if src:
+          num_bytes_tx = os.path.getsize(src)
+      except FileNotFoundError:
+        pass
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       try:
         shutil.copyfile(src, dest) # avoids user space
       except FileNotFoundError:

@@ -1260,16 +1406,26 @@ class CloudFiles:
           with open(dest, "wb") as f:
             f.write(b'')
         else:
+          tm.end_error(flight_id)
           raise
+      finally:
+        tm.end_io(flight_id, num_bytes_tx)
 
       pbar.update(1)
 
+    return tm
+
   def __transfer_remote_to_file(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, content_type,
     allow_missing, resumable,
-  ):
+  ) -> TransmissionMonitor:
+
+    tm = TransmissionMonitor(IOEnum.RX)
+
     def thunk_save(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
       with cf_src._get_connection() as conn:
         if isinstance(key, dict):
           dest_key = key.get("dest_path", key["path"])
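Each local copy above is wrapped in the full monitor lifecycle: start_io opens an interval, end_error flags a failed flight, and end_io in a finally block always closes it. A hedged sketch of that instrumentation pattern, with signatures inferred from this diff and do_copy standing in for the real transfer call:

import time
from cloudfiles.monitoring import TransmissionMonitor, IOEnum

def monitored_copy(tm, num_bytes, do_copy):
  start_time = time.monotonic()
  flight_id = tm.start_io(num_bytes, start_time)  # open the interval
  try:
    do_copy()
  except Exception:
    tm.end_error(flight_id)  # mark the flight as failed
    raise
  finally:
    tm.end_io(flight_id, num_bytes)  # always close the interval

tm = TransmissionMonitor(IOEnum.TX)
monitored_copy(tm, 1024, lambda: None)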
@@ -1279,14 +1435,17 @@ class CloudFiles:
           dest_key = key
 
         dest_key = os.path.join(cf_dest._path.path, dest_key)
-        found = conn.save_file(src_key, dest_key, resumable=resumable)
+        (found, num_bytes_rx) = conn.save_file(src_key, dest_key, resumable=resumable)
+
+        tm.end_io(flight_id, num_bytes_rx)
 
         if found == False and not allow_missing:
+          tm.end_error(flight_id)
           raise FileNotFoundError(src_key)
 
         return int(found)
 
-
+    schedule_jobs(
       fns=( partial(thunk_save, path) for path in paths ),
       progress=pbar,
       concurrency=self.num_threads,

@@ -1294,7 +1453,7 @@ class CloudFiles:
       green=self.green,
       count_return=True,
     )
-    return
+    return tm
 
   def __transfer_file_to_remote(
     self, cf_src, cf_dest, paths,
@@ -1306,6 +1465,7 @@ class CloudFiles:
     so that GCS and S3 can do low-memory chunked multi-part
     uploads if necessary.
     """
+    tms = []
     srcdir = cf_src.cloudpath.replace("file://", "")
     for block_paths in sip(paths, block_size):
       to_upload = []

@@ -1336,13 +1496,22 @@ class CloudFiles:
           "content": handle,
           "compress": encoding,
         })
-
+      (ct, batch_tm) = cf_dest.puts(
+        to_upload,
+        raw=True,
+        progress=False,
+        content_type=content_type,
+        return_recording=True,
+      )
       for item in to_upload:
         handle = item["content"]
         if hasattr(handle, "close"):
           handle.close()
+      tms.append(batch_tm)
       pbar.update(len(block_paths))
 
+    return TransmissionMonitor.merge(tms)
+
   def __transfer_cloud_internal(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
@@ -1355,7 +1524,11 @@ class CloudFiles:
     of the cloud, this is much slower and more expensive
     than necessary.
     """
+    tm = TransmissionMonitor(IOEnum.TX)
+
     def thunk_copy(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
       with cf_src._get_connection() as conn:
         if isinstance(key, dict):
           dest_key = key.get("dest_path", key["path"])

@@ -1365,14 +1538,17 @@ class CloudFiles:
           dest_key = key
 
         dest_key = posixpath.join(cf_dest._path.path, dest_key)
-        found = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+        (found, num_bytes_tx) = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+
+        tm.end_io(flight_id, num_bytes_tx)
 
         if found == False and not allow_missing:
+          tm.end_error(flight_id)
           raise FileNotFoundError(src_key)
 
         return int(found)
 
-
+    schedule_jobs(
       fns=( partial(thunk_copy, path) for path in paths ),
       progress=pbar,
       concurrency=self.num_threads,
@@ -1380,7 +1556,7 @@ class CloudFiles:
       green=self.green,
       count_return=True,
     )
-    return
+    return tm
 
   def move(self, src:str, dest:str):
     """Move (rename) src to dest.
@@ -1490,6 +1666,16 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
 
+  def dirname(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.dirname(path)
+    return posixpath.dirname(path)
+
+  def basename(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.basename(path)
+    return posixpath.basename(path)
+
   def __getitem__(self, key) -> Union[dict,bytes,List[dict]]:
     if isinstance(key, tuple) and len(key) == 2 and isinstance(key[1], slice) and isinstance(key[0], str):
       return self.get({ 'path': key[0], 'start': key[1].start, 'end': key[1].stop })
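The new dirname/basename helpers dispatch on protocol the same way join does: os.path semantics for file:// paths, posixpath for everything else. Usage, with a hypothetical cloudpath:

from cloudfiles import CloudFiles

cf = CloudFiles("gs://my-bucket/dataset/")
cf.join("a", "b", "c.bin")   # "a/b/c.bin"
cf.dirname("a/b/c.bin")      # "a/b"
cf.basename("a/b/c.bin")     # "c.bin"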
@@ -1656,6 +1842,12 @@ class CloudFile:
   def join(self, *args):
     return self.cf.join(*args)
 
+  def dirname(self, *args):
+    return self.cf.dirname(*args)
+
+  def basename(self, *args):
+    return self.cf.basename(*args)
+
   def touch(self):
     return self.cf.touch(self.filename)
 