cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/AUTHORS +1 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/METADATA +101 -21
- cloud_files-6.0.0.dist-info/RECORD +27 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/WHEEL +1 -1
- cloud_files-6.0.0.dist-info/pbr.json +1 -0
- cloudfiles/cloudfiles.py +548 -78
- cloudfiles/compression.py +8 -3
- cloudfiles/exceptions.py +4 -0
- cloudfiles/gcs.py +7 -3
- cloudfiles/interfaces.py +462 -69
- cloudfiles/lib.py +12 -2
- cloudfiles/monitoring.py +724 -0
- cloudfiles/paths.py +61 -5
- cloudfiles/resumable_tools.py +50 -15
- cloudfiles/scheduler.py +6 -1
- cloudfiles/secrets.py +16 -12
- cloudfiles/test.py +28 -0
- cloudfiles_cli/cloudfiles_cli.py +349 -41
- cloud_files-4.27.0.dist-info/RECORD +0 -26
- cloud_files-4.27.0.dist-info/pbr.json +0 -1
- cloudfiles/buckets.py +0 -10
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/LICENSE +0 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/entry_points.txt +0 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/top_level.txt +0 -0
cloudfiles/cloudfiles.py (CHANGED)

```diff
@@ -2,13 +2,14 @@ from typing import (
   Any, Dict, Optional,
   Union, List, Tuple,
   Callable, Generator,
-
+  Sequence, cast, BinaryIO
 )
 
 from queue import Queue
 from collections import defaultdict
-from functools import partial, wraps
+from functools import partial, wraps, reduce
 import inspect
+import io
 import math
 import multiprocessing
 import itertools
```
```diff
@@ -17,7 +18,9 @@ import platform
 import posixpath
 import re
 import shutil
+import threading
 import types
+import time
 
 import orjson
 import pathos.pools
```
```diff
@@ -29,10 +32,11 @@ from . import compression, paths, gcs
 from .exceptions import UnsupportedProtocolError, MD5IntegrityError, CRC32CIntegrityError
 from .lib import (
   mkdir, totalfn, toiter, scatter, jsonify, nvl,
-  duplicates, first, sip,
+  duplicates, first, sip, touch,
   md5, crc32c, decode_crc32c_b64
 )
-from .
+from .monitoring import TransmissionMonitor, IOEnum
+from .paths import ALIASES, find_common_buckets
 from .secrets import CLOUD_FILES_DIR, CLOUD_FILES_LOCK_DIR
 from .threaded_queue import ThreadedQueue, DEFAULT_THREADS
 from .typing import (
```
```diff
@@ -149,26 +153,42 @@ def parallel_execute(
   if platform.system().lower() == "darwin":
     os.environ["no_proxy"] = "*"
 
+  # Don't fork, spawn entirely new processes. This
+  # avoids accidental deadlocks.
+  multiprocessing.set_start_method("spawn", force=True)
+
   results = []
+  tms = []
   try:
     with pathos.pools.ProcessPool(parallel) as executor:
       for res in executor.imap(fn, sip(inputs, block_size)):
-
-
-
-
+        update = res
+        if isinstance(res, tuple):
+          update = res[0]
+
+        if isinstance(update, int):
+          pbar.update(update)
+        elif isinstance(update, list):
+          pbar.update(len(update))
         else:
          pbar.update(block_size)
 
        if returns_list:
-
+          if isinstance(res, tuple):
+            results.extend(res[0])
+            tms.append(res[1])
+          else:
+            results.extend(res)
   finally:
     if platform.system().lower() == "darwin":
       os.environ["no_proxy"] = no_proxy
     pbar.close()
 
   if returns_list:
-
+    if len(tms):
+      return (results, TransmissionMonitor.merge(tms))
+    else:
+      return results
 
 def get_interface_class(protocol):
   if protocol in INTERFACES:
```
```diff
@@ -182,7 +202,7 @@ def path_to_byte_range_tags(path):
   if isinstance(path, str):
     return (path, None, None, None)
   return (path['path'], path.get('start', None), path.get('end', None), path.get('tags', None))
-
+
 def dl(
   cloudpaths:GetPathType, raw:bool=False, **kwargs
 ) -> Union[bytes,List[dict]]:
```
```diff
@@ -193,23 +213,8 @@ def dl(
   dict.
   """
   cloudpaths, is_multiple = toiter(cloudpaths, is_iter=True)
-  clustered =
-  total =
-  for path in cloudpaths:
-    pth = path
-    byte_range = None
-    if isinstance(path, dict):
-      pth = path["path"]
-      byte_range = path["byte_range"]
-
-    epath = paths.extract(pth)
-    bucketpath = paths.asbucketpath(epath)
-    clustered[bucketpath].append({
-      "path": epath.path,
-      "start": (byte_range[0] if byte_range else None), # type: ignore
-      "end": (byte_range[1] if byte_range else None), # type: ignore
-    })
-    total += 1
+  clustered = find_common_buckets(cloudpaths)
+  total = sum([ len(bucket) for bucket in clustered.values() ])
 
   progress = kwargs.get("progress", False) and total > 1
   pbar = tqdm(total=total, desc="Downloading", disable=(not progress))
```
```diff
@@ -243,12 +248,55 @@ class CloudFiles:
   currently supports local filesystem, Google Cloud Storage,
   Amazon S3 interfaces, and reading from arbitrary HTTP
   servers.
+
+  cloudpath: a parent directory of the files you want to fetch
+    specified as:
+      e.g. gs://bucket/dir/
+           s3://bucket/dir/
+           s3://https://myendpoint.com/dir/
+           file://./dir
+           ./dir
+           https://some.host.edu/dir/
+           mem://bucket/dir
+    Key:
+      gs: Google Cloud Storage
+      s3: Amazon S3
+      file: Local Filesystem (including network mounts)
+      mem: In-Memory storage
+
+  progress: display progress bar measured in files
+  green: whether to use green threads (uses gevent library)
+  secrets: you can provide GCS, S3, CAVE, etc credentials
+    via the constructor here instead of the default secrets
+    files
+  num_threads: number of threads to launch for remote server
+    IO. No effect on local file fetching (always single threaded
+    for maximum performance).
+  use_https: use the public https API for GCS and S3 instead of
+    boto or google-storage-python
+  endpoint: for S3 emulators, you can provide a different endpoint
+    like https://s3-storage.university.edu. This can also be specified
+    in the secrets file.
+  parallel: number of separate processes to launch (each will use num_threads)
+  request_payer: bill your s3 usage to someone other than the bucket owner
+  locking: for local filesystems, you can use advisory file locking to avoid
+    separate cloudfiles instances from interfering with each other
+  lock_dir: you can specify your own directory for the advisory lock files
+  composite_upload_threshold: GCS and S3 both support multi-part uploads.
+    For files larger than this threshold, use that facility.
+  no_sign_request: (s3 only) don't sign the request with credentials
   """
   def __init__(
-    self,
-
-
-
+    self,
+    cloudpath:str,
+    progress:bool = False,
+    green:Optional[bool] = None,
+    secrets:SecretsType = None,
+    num_threads:int = 20,
+    use_https:bool = False,
+    endpoint:Optional[str] = None,
+    parallel:ParallelType = 1,
+    request_payer:Optional[str] = None,
     locking:Optional[bool] = None,
     lock_dir:Optional[str] = None,
     composite_upload_threshold:int = int(1e8),
```
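For orientation, a minimal usage sketch of the constructor options documented in this hunk; the bucket names and the endpoint URL are placeholders, not values from the package:

```python
from cloudfiles import CloudFiles

# Standard GCS usage with a per-file progress bar.
cf = CloudFiles("gs://bucket/dir/", progress=True)

# An S3 emulator reached through a custom endpoint, without
# signing requests. The endpoint URL is a placeholder.
cf_emulator = CloudFiles(
  "s3://bucket/dir/",
  endpoint="https://s3-storage.university.edu",
  no_sign_request=True,
  num_threads=20,
)
```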
```diff
@@ -257,6 +305,8 @@ class CloudFiles:
     if use_https:
       cloudpath = paths.to_https_protocol(cloudpath)
 
+    cloudpath = paths.normalize(cloudpath)
+
     self.cloudpath = cloudpath
     self.progress = progress
     self.secrets = secrets
```
```diff
@@ -332,11 +382,16 @@ class CloudFiles:
 
   @parallelize(desc="Download", returns_list=True)
   def get(
-    self,
-
+    self,
+    paths:GetPathType,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
     parallel:Optional[ParallelType] = None,
-    return_dict:bool = False,
-
+    return_dict:bool = False,
+    raise_errors:bool = True,
+    part_size:Optional[int] = None,
+    return_recording:bool = False,
   ) -> Union[dict,bytes,List[dict]]:
     """
     Download one or more files. Return order is not guaranteed to match input.
```
```diff
@@ -362,6 +417,10 @@ class CloudFiles:
       extra information. Errors will be raised immediately.
     raise_errors: Raise the first error immediately instead
       of returning them as part of the output.
+    return_recording: Also return a TransmissionMonitor object that
+      records the start and end times and the transmitted size of
+      each object (i.e. before decompression) stored in an interval
+      tree. This enables post-hoc analysis of performance.
 
     Returns:
       if return_dict:
```
```diff
@@ -379,12 +438,18 @@ class CloudFiles:
         'raw': boolean,
       }
     ]
+
+    if return_recording:
+      return (ABOVE, TransmissionMonitor)
+    else:
+      return ABOVE
     """
     paths, multiple_return = toiter(paths, is_iter=True)
     progress = nvl(progress, self.progress)
     # return_dict prevents the user from having a chance
     # to inspect errors, so we must raise here.
     raise_errors = raise_errors or return_dict or (not multiple_return)
+    tm = TransmissionMonitor(IOEnum.RX)
 
     def check_md5(path, content, server_hash):
       if server_hash is None:
```
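A sketch of the return shapes described above, assuming a bucket containing files `a.bin` and `b.bin`:

```python
cf = CloudFiles("gs://bucket/dir/")

content = cf.get("a.bin")             # single path: raw bytes
results = cf.get(["a.bin", "b.bin"])  # multiple paths: list of result dicts

# With return_recording, a TransmissionMonitor rides along in a tuple.
results, tm = cf.get(["a.bin", "b.bin"], return_recording=True)
```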
```diff
@@ -414,12 +479,17 @@ class CloudFiles:
       encoding = None
       server_hash = None
       server_hash_type = None
+      num_bytes_rx = 0
       try:
+        flight_id = tm.start_io(1)
+
         with self._get_connection() as conn:
           content, encoding, server_hash, server_hash_type = conn.get_file(
             path, start=start, end=end, part_size=part_size
           )
 
+        num_bytes_rx = len(content) if content is not None else 0
+
         # md5s don't match for partial reads
         if start is None and end is None:
           if server_hash_type == "md5":
```
```diff
@@ -431,6 +501,9 @@ class CloudFiles:
         content = compression.decompress(content, encoding, filename=path)
       except Exception as err:
         error = err
+        tm.end_error(flight_id)
+
+      tm.end_io(flight_id, num_bytes_rx)
 
       if raise_errors and error:
         raise error
```
```diff
@@ -450,11 +523,16 @@ class CloudFiles:
     if total == 1:
       ret = download(first(paths))
       if return_dict:
-
+        ret = { ret["path"]: ret["content"] }
       elif multiple_return:
-
+        ret = [ ret ]
       else:
-
+        ret = ret['content']
+
+      if return_recording:
+        return (ret, tm)
+      else:
+        return ret
 
     num_threads = self.num_threads
     if self.protocol == "file":
```
```diff
@@ -470,10 +548,14 @@ class CloudFiles:
       green=self.green,
     )
 
+    ret = results
     if return_dict:
-
+      ret = { res["path"]: res["content"] for res in results }
+
+    if return_recording:
+      return (ret, tm)
 
-    return
+    return ret
 
   def get_json(
     self, paths:GetPathType, total:Optional[int] = None
```
```diff
@@ -520,12 +602,19 @@ class CloudFiles:
 
   @parallelize(desc="Upload")
   def puts(
-    self,
-
-
-
-
-
+    self,
+    files:PutType,
+    content_type:Optional[str] = None,
+    compress:CompressType = None,
+    compression_level:Optional[int] = None,
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
+  ) -> Union[int, tuple[int,TransmissionMonitor]]:
     """
     Writes one or more files at a given location.
```
|
|
|
560
649
|
function call. If progress is a string, it sets the
|
|
561
650
|
text of the progress bar.
|
|
562
651
|
parallel: number of concurrent processes (0 means all cores)
|
|
563
|
-
|
|
564
|
-
|
|
652
|
+
return_recording: Also return a TransmissionMonitor object that
|
|
653
|
+
records the start and end times and the transmitted size of
|
|
654
|
+
each object (i.e. before decompression) stored in an interval
|
|
655
|
+
tree. This enables post-hoc analysis of performance.
|
|
656
|
+
|
|
657
|
+
Returns:
|
|
658
|
+
N = number of files uploaded
|
|
659
|
+
tm = TransmissionMonitor
|
|
660
|
+
if return_recording:
|
|
661
|
+
return (N, tm)
|
|
662
|
+
else:
|
|
663
|
+
return N
|
|
565
664
|
"""
|
|
566
665
|
files = toiter(files)
|
|
567
666
|
progress = nvl(progress, self.progress)
|
|
667
|
+
tm = TransmissionMonitor(IOEnum.TX)
|
|
568
668
|
|
|
569
669
|
def todict(file):
|
|
570
670
|
if isinstance(file, tuple):
|
|
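A sketch of the upload return values per the Returns block above; the file names and contents are placeholders:

```python
cf = CloudFiles("gs://bucket/dir/")

files = [
  ("hello.txt", b"hello world"),
  ("data.json", b"{}"),
]

n = cf.puts(files, compress="gzip")           # N files uploaded

# (N, TransmissionMonitor) when a recording is requested.
n, tm = cf.puts(files, return_recording=True)
```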
```diff
@@ -572,6 +672,7 @@ class CloudFiles:
         return file
 
     def uploadfn(file):
+      start_time = time.monotonic()
       file = todict(file)
 
       file_compress = file.get('compress', compress)
```
```diff
@@ -586,11 +687,19 @@ class CloudFiles:
         compress_level=file.get('compression_level', compression_level),
       )
 
+      num_bytes_tx = 0
+      if hasattr(content, "__len__"):
+        num_bytes_tx = len(content)
+      elif isinstance(content, io.IOBase):
+        num_bytes_tx = os.fstat(content.fileno()).st_size
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       if (
         self.protocol == "gs"
         and (
           (hasattr(content, "read") and hasattr(content, "seek"))
-          or (
+          or (num_bytes_tx > self.composite_upload_threshold)
         )
       ):
         gcs.composite_upload(
```
```diff
@@ -603,6 +712,7 @@ class CloudFiles:
           cache_control=cache_control,
           storage_class=storage_class,
           compress=file_compress,
+          skip_compress=True,
         )
         return
 
```
```diff
@@ -616,6 +726,8 @@ class CloudFiles:
           storage_class=file.get('storage_class', storage_class)
         )
 
+      tm.end_io(flight_id, num_bytes_tx)
+
     if not isinstance(files, (types.GeneratorType, zip)):
       dupes = duplicates([ todict(file)['path'] for file in files ])
       if dupes:
```
```diff
@@ -625,7 +737,10 @@ class CloudFiles:
 
     if total == 1:
       uploadfn(first(files))
-
+      if return_recording:
+        return (1,tm)
+      else:
+        return 1
 
     fns = ( partial(uploadfn, file) for file in files )
     desc = self._progress_description("Upload")
```
```diff
@@ -636,7 +751,11 @@ class CloudFiles:
       total=total,
       green=self.green,
     )
-
+
+    if return_recording:
+      return (len(results), tm)
+    else:
+      return len(results)
 
   def put(
     self,
```
```diff
@@ -674,9 +793,13 @@ class CloudFiles:
     self, files:PutType,
     compress:CompressType = None,
     compression_level:Optional[int] = None,
-    cache_control:Optional[str] = None,
-
-
+    cache_control:Optional[str] = None,
+    total:Optional[int] = None,
+    raw:bool = False,
+    progress:Optional[bool] = None,
+    parallel:ParallelType = 1,
+    storage_class:Optional[str] = None,
+    return_recording:bool = False,
   ) -> int:
     """
     Write one or more files as JSON.
```
```diff
@@ -705,7 +828,7 @@ class CloudFiles:
       compress=compress, compression_level=compression_level,
       content_type='application/json', storage_class=storage_class,
       total=total, raw=raw,
-      progress=progress, parallel=parallel
+      progress=progress, parallel=parallel, return_recording=return_recording,
     )
 
   def put_json(
```
```diff
@@ -755,9 +878,11 @@ class CloudFiles:
       return True
     elif prefix[-1] == "/":
       return True
-
-
-
+    try:
+      res = first(self.list(prefix=prefix))
+      return res is not None
+    except NotImplementedError as err:
+      return not CloudFile(self.cloudpath).exists()
 
   def exists(
     self, paths:GetPathType,
```
```diff
@@ -852,8 +977,10 @@ class CloudFiles:
 
   def size(
     self, paths:GetPathType,
-    total:Optional[int] = None,
-
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+    return_sum:bool = False,
+  ) -> Union[Dict[str,int],List[Dict[str,int]],int]:
     """
     Get the size in bytes of one or more files in its stored state.
     """
```
```diff
@@ -874,10 +1001,47 @@ class CloudFiles:
       green=self.green,
     )
 
+    if return_sum:
+      return sum(( sz for sz in results.values() ))
+
     if return_multiple:
       return results
     return first(results.values())
 
+  def subtree_size(self, prefix:GetPathType = "") -> dict[str,int]:
+    """High performance size calculation for directory trees."""
+    prefix, return_multiple = toiter(prefix, is_iter=True)
+    total_bytes = 0
+    total_files = 0
+
+    total = totalfn(prefix, None)
+
+    lock = threading.Lock()
+
+    def size_thunk(prefix):
+      nonlocal total_bytes
+      nonlocal total_files
+      nonlocal lock
+
+      with self._get_connection() as conn:
+        subtree_files, subtree_bytes = conn.subtree_size(prefix)
+      with lock:
+        total_files += subtree_files
+        total_bytes += subtree_bytes
+
+    schedule_jobs(
+      fns=( partial(size_thunk, path) for path in prefix ),
+      concurrency=self.num_threads,
+      progress=self.progress,
+      green=self.green,
+      total=total,
+    )
+
+    return {
+      "N": total_files,
+      "num_bytes": total_bytes,
+    }
+
   @parallelize(desc="Delete")
   def delete(
     self, paths:GetPathType, total:Optional[int] = None,
```
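A sketch of the new subtree_size accounting, which fans one listing job out per prefix and aggregates the counters under a lock; the prefix is a placeholder:

```python
stats = cf.subtree_size("some/prefix")
print(f"{stats['N']} files, {stats['num_bytes']} bytes")
```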
```diff
@@ -919,6 +1083,60 @@ class CloudFiles:
     )
     return len(results)
 
+  def touch(
+    self,
+    paths:GetPathType,
+    progress:Optional[bool] = None,
+    total:Optional[int] = None,
+    nocopy:bool = False,
+  ):
+    """
+    Create a zero byte file if it doesn't exist.
+    """
+    paths = toiter(paths)
+    progress = nvl(progress, self.progress)
+    total = totalfn(paths, total)
+
+    if self.protocol == "file":
+      basepath = self.cloudpath.replace("file://", "")
+      for path in tqdm(paths, disable=(not progress), total=total):
+        pth = path
+        if isinstance(path, dict):
+          pth = path["path"]
+        touch(self.join(basepath, pth))
+      return
+
+    results = self.exists(paths, total=total, progress=progress)
+
+    dne = [
+      (fname, b'')
+      for fname, exists in results.items()
+      if not exists
+    ]
+
+    self.puts(dne, progress=progress)
+
+    # def thunk_copy(path):
+    #   with self._get_connection() as conn:
+    #     conn.copy_file(path, self._path.bucket, self.join(self._path.path, path))
+    #   return 1
+
+    # if not nocopy:
+    #   already_exists = (
+    #     fname
+    #     for fname, exists in results.items()
+    #     if exists
+    #   )
+
+    #   results = schedule_jobs(
+    #     fns=( partial(thunk_copy, path) for path in already_exists ),
+    #     progress=progress,
+    #     total=(total - len(dne)),
+    #     concurrency=self.num_threads,
+    #     green=self.green,
+    #     count_return=True,
+    #   )
+
   def list(
     self, prefix:str = "", flat:bool = False
   ) -> Generator[str,None,None]:
```
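A sketch of the new touch method; on file:// paths it touches directly, and elsewhere it uploads zero-byte objects for paths that do not yet exist:

```python
cf.touch("placeholder.txt")            # one file
cf.touch(["a.txt", "b.txt", "c.txt"])  # several at once
```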
```diff
@@ -953,7 +1171,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from this CloudFiles storage
     to the destination CloudFiles in batches sized
```
```diff
@@ -969,7 +1189,7 @@ class CloudFiles:
     - gs->gs: Uses GCS copy API to minimize data movement
     - s3->s3: Uses boto s3 copy API to minimize data movement
 
-
+    cf_dest: another CloudFiles instance or cloudpath
     paths: if None transfer all files from src, else if
       an iterable, transfer only these files.
 
```
```diff
@@ -987,6 +1207,11 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_dest, str):
       cf_dest = CloudFiles(
```
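A sketch of a resumable https->file transfer per the docstring above; the URL and local directory are placeholders:

```python
src = CloudFiles("https://some.host.edu/dir/")

# Downloads land in .part files and are renamed on completion,
# so an interrupted transfer can be re-run and resumed.
tm = src.transfer_to("file://./local_copy", resumable=True)
```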
```diff
@@ -997,7 +1222,8 @@ class CloudFiles:
     return cf_dest.transfer_from(
       self, paths, block_size,
       reencode, content_type,
-      allow_missing,
+      allow_missing,
+      progress, resumable,
     )
 
   def transfer_from(
```
```diff
@@ -1008,7 +1234,9 @@ class CloudFiles:
     reencode:Optional[str] = None,
     content_type:Optional[str] = None,
     allow_missing:bool = False,
-
+    progress:Optional[bool] = None,
+    resumable:bool = False,
+  ) -> TransmissionMonitor:
     """
     Transfer all files from the source CloudFiles storage
     to this CloudFiles in batches sized in the
```
```diff
@@ -1042,6 +1270,10 @@ class CloudFiles:
       as '' (None), 'gzip', 'br', 'zstd'
     content_type: if provided, set the Content-Type header
       on the upload. This is necessary for e.g. file->cloud
+    resumable: for remote->file downloads, download to a .part
+      file and rename it when the download completes. If the
+      download does not complete, it can be resumed. Only
+      supported for https->file currently.
     """
     if isinstance(cf_src, str):
       cf_src = CloudFiles(
```
```diff
@@ -1054,22 +1286,40 @@ class CloudFiles:
 
     total = totalfn(paths, None)
 
-
+    disable = progress
+    if disable is None:
+      disable = self.progress
+    if disable is None:
+      disable = False
+    else:
+      disable = not disable
+
+    with tqdm(desc="Transferring", total=total, disable=disable) as pbar:
       if (
         cf_src.protocol == "file"
         and self.protocol == "file"
         and reencode is None
       ):
-        self.__transfer_file_to_file(
+        return self.__transfer_file_to_file(
           cf_src, self, paths, total,
           pbar, block_size, allow_missing
         )
+      elif (
+        cf_src.protocol != "file"
+        and self.protocol == "file"
+        and reencode is None
+      ):
+        return self.__transfer_remote_to_file(
+          cf_src, self, paths, total,
+          pbar, block_size, content_type,
+          allow_missing, resumable,
+        )
       elif (
         cf_src.protocol == "file"
         and self.protocol != "file"
         and reencode is None
       ):
-        self.__transfer_file_to_remote(
+        return self.__transfer_file_to_remote(
           cf_src, self, paths, total,
           pbar, block_size, content_type,
           allow_missing,
```
```diff
@@ -1085,13 +1335,13 @@ class CloudFiles:
         )
         and reencode is None
       ):
-        self.__transfer_cloud_internal(
+        return self.__transfer_cloud_internal(
           cf_src, self, paths,
           total, pbar, block_size,
           allow_missing,
         )
       else:
-        self.__transfer_general(
+        return self.__transfer_general(
           cf_src, self, paths, total,
           pbar, block_size,
           reencode, content_type,
```
|
|
|
1103
1353
|
total, pbar, block_size,
|
|
1104
1354
|
reencode, content_type,
|
|
1105
1355
|
allow_missing
|
|
1106
|
-
):
|
|
1356
|
+
) -> TransmissionMonitor:
|
|
1107
1357
|
"""
|
|
1108
1358
|
Downloads the file into RAM, transforms
|
|
1109
1359
|
the data, and uploads it. This is the slowest and
|
|
```diff
@@ -1112,6 +1362,7 @@ class CloudFiles:
     pair of endpoints as well as transcoding compression
     formats.
     """
+    upload_tms = []
     for block_paths in sip(paths, block_size):
       for path in block_paths:
         if isinstance(path, dict):
```
```diff
@@ -1135,26 +1386,32 @@ class CloudFiles:
           item["path"] = item["tags"]["dest_path"]
           del item["tags"]["dest_path"]
           yield item
-      self.puts(
+      (ct, batch_tm) = self.puts(
         renameiter(),
         raw=True,
         progress=False,
         compress=reencode,
         content_type=content_type,
+        return_recording=True,
       )
       pbar.update(len(block_paths))
+      upload_tms.append(batch_tm)
+
+    return TransmissionMonitor.merge(upload_tms)
 
   def __transfer_file_to_file(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
-  ):
+  ) -> TransmissionMonitor:
     """
     shutil.copyfile, starting in Python 3.8, uses
     special OS kernel functions to accelerate file copies
     """
+    tm = TransmissionMonitor(IOEnum.TX)
     srcdir = cf_src.cloudpath.replace("file://", "")
     destdir = mkdir(cf_dest.cloudpath.replace("file://", ""))
     for path in paths:
+      start_time = time.monotonic()
       if isinstance(path, dict):
         src = os.path.join(srcdir, path["path"])
         dest = os.path.join(destdir, path["dest_path"])
```
```diff
@@ -1168,6 +1425,15 @@ class CloudFiles:
       if dest_ext_compress != dest_ext:
         dest += dest_ext_compress
 
+      num_bytes_tx = 0
+      try:
+        if src:
+          num_bytes_tx = os.path.getsize(src)
+      except FileNotFoundError:
+        pass
+
+      flight_id = tm.start_io(num_bytes_tx, start_time)
+
       try:
         shutil.copyfile(src, dest) # avoids user space
       except FileNotFoundError:
```
```diff
@@ -1175,10 +1441,55 @@ class CloudFiles:
         with open(dest, "wb") as f:
           f.write(b'')
       else:
+        tm.end_error(flight_id)
         raise
+      finally:
+        tm.end_io(flight_id, num_bytes_tx)
 
       pbar.update(1)
 
+    return tm
+
+  def __transfer_remote_to_file(
+    self, cf_src, cf_dest, paths,
+    total, pbar, block_size, content_type,
+    allow_missing, resumable,
+  ) -> TransmissionMonitor:
+
+    tm = TransmissionMonitor(IOEnum.RX)
+
+    def thunk_save(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
+      with cf_src._get_connection() as conn:
+        if isinstance(key, dict):
+          dest_key = key.get("dest_path", key["path"])
+          src_key = key["path"]
+        else:
+          src_key = key
+          dest_key = key
+
+        dest_key = os.path.join(cf_dest._path.path, dest_key)
+        (found, num_bytes_rx) = conn.save_file(src_key, dest_key, resumable=resumable)
+
+      tm.end_io(flight_id, num_bytes_rx)
+
+      if found == False and not allow_missing:
+        tm.end_error(flight_id)
+        raise FileNotFoundError(src_key)
+
+      return int(found)
+
+    schedule_jobs(
+      fns=( partial(thunk_save, path) for path in paths ),
+      progress=pbar,
+      concurrency=self.num_threads,
+      total=totalfn(paths, total),
+      green=self.green,
+      count_return=True,
+    )
+    return tm
+
   def __transfer_file_to_remote(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, content_type,
```
```diff
@@ -1189,6 +1500,7 @@ class CloudFiles:
     so that GCS and S3 can do low-memory chunked multi-part
     uploads if necessary.
     """
+    tms = []
     srcdir = cf_src.cloudpath.replace("file://", "")
     for block_paths in sip(paths, block_size):
       to_upload = []
```
```diff
@@ -1211,18 +1523,30 @@ class CloudFiles:
         else:
           raise
 
+        if dest_path == '':
+          dest_path = src_path
+
        to_upload.append({
          "path": dest_path,
          "content": handle,
          "compress": encoding,
        })
-
+      (ct, batch_tm) = cf_dest.puts(
+        to_upload,
+        raw=True,
+        progress=False,
+        content_type=content_type,
+        return_recording=True,
+      )
      for item in to_upload:
        handle = item["content"]
        if hasattr(handle, "close"):
          handle.close()
+      tms.append(batch_tm)
      pbar.update(len(block_paths))
 
+    return TransmissionMonitor.merge(tms)
+
   def __transfer_cloud_internal(
     self, cf_src, cf_dest, paths,
     total, pbar, block_size, allow_missing
```
```diff
@@ -1235,7 +1559,11 @@ class CloudFiles:
     of the cloud, this is much slower and more expensive
     than necessary.
     """
+    tm = TransmissionMonitor(IOEnum.TX)
+
     def thunk_copy(key):
+      nonlocal tm
+      flight_id = tm.start_io(1)
       with cf_src._get_connection() as conn:
         if isinstance(key, dict):
           dest_key = key.get("dest_path", key["path"])
```
```diff
@@ -1245,14 +1573,17 @@ class CloudFiles:
         dest_key = key
 
       dest_key = posixpath.join(cf_dest._path.path, dest_key)
-      found = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+      (found, num_bytes_tx) = conn.copy_file(src_key, cf_dest._path.bucket, dest_key)
+
+      tm.end_io(flight_id, num_bytes_tx)
 
       if found == False and not allow_missing:
+        tm.end_error(flight_id)
         raise FileNotFoundError(src_key)
 
       return int(found)
 
-
+    schedule_jobs(
       fns=( partial(thunk_copy, path) for path in paths ),
       progress=pbar,
       concurrency=self.num_threads,
```
```diff
@@ -1260,7 +1591,100 @@ class CloudFiles:
       green=self.green,
       count_return=True,
     )
-    return
+    return tm
+
+  def move(self, src:str, dest:str):
+    """Move (rename) src to dest.
+
+    src and dest do not have to be on the same filesystem.
+    """
+    epath = paths.extract(dest)
+    full_cloudpath = paths.asprotocolpath(epath)
+    dest_cloudpath = paths.dirname(full_cloudpath)
+    base_dest = paths.basename(full_cloudpath)
+
+    return self.moves(dest_cloudpath, [
+      (src, base_dest)
+    ], block_size=1, progress=False)
+
+  def moves(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str, str]]],
+    block_size:int = 64,
+    total:Optional[int] = None,
+    progress:Optional[bool] = None,
+  ):
+    """
+    Move (rename) files.
+
+    pairs: [ (src, dest), (src, dest), ... ]
+    """
+    if isinstance(cf_dest, str):
+      cf_dest = CloudFiles(
+        cf_dest, progress=False,
+        green=self.green, num_threads=self.num_threads,
+      )
+
+    total = totalfn(paths, total)
+
+    disable = not (self.progress if progress is None else progress)
+
+    if self.protocol == "file" and cf_dest.protocol == "file":
+      self.__moves_file_to_file(
+        cf_dest, paths, total,
+        disable, block_size
+      )
+      return
+
+    pbar = tqdm(total=total, disable=disable, desc="Moving")
+
+    with pbar:
+      for subpairs in sip(paths, block_size):
+        subpairs = [
+          ((pair, pair) if isinstance(pair, str) else pair)
+          for pair in subpairs
+        ]
+
+        self.transfer_to(cf_dest, paths=(
+          {
+            "path": src,
+            "dest_path": dest,
+          }
+          for src, dest in subpairs
+        ), progress=False)
+        self.delete(( src for src, dest in subpairs ), progress=False)
+        pbar.update(len(subpairs))
+
+  def __moves_file_to_file(
+    self,
+    cf_dest:Any,
+    paths:Union[Sequence[str], Sequence[Tuple[str,str]]],
+    total:Optional[int],
+    disable:bool,
+    block_size:int,
+  ):
+    for pair in tqdm(paths, total=total, disable=disable, desc="Moving"):
+      if isinstance(pair, str):
+        src = pair
+        dest = pair
+      else:
+        (src, dest) = pair
+
+      src = self.join(self.cloudpath, src).replace("file://", "")
+      dest = cf_dest.join(cf_dest.cloudpath, dest).replace("file://", "")
+
+      if os.path.isdir(dest):
+        dest = cf_dest.join(dest, os.path.basename(src))
+      else:
+        mkdir(os.path.dirname(dest))
+
+      src, encoding = FileInterface.get_encoded_file_path(src)
+      _, dest_ext = os.path.splitext(dest)
+      dest_ext_compress = FileInterface.get_extension(encoding)
+      if dest_ext_compress != dest_ext:
+        dest += dest_ext_compress
+      shutil.move(src, dest)
 
   def join(self, *paths:str) -> str:
     """
```
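A sketch of the new move and moves methods; move delegates to moves with a single (src, dest) pair, while a bare string in moves keeps the same name at the destination. Paths and bucket names are placeholders:

```python
# Rename a single file; src and dest may be on different filesystems.
cf.move("old_name.txt", "gs://bucket/other_dir/new_name.txt")

# Batch moves: (src, dest) pairs, or bare strings to keep the name.
cf.moves("gs://bucket/other_dir/", [
  ("a.txt", "a_renamed.txt"),
  "b.txt",
])
```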
```diff
@@ -1277,6 +1701,22 @@ class CloudFiles:
       return os.path.join(*paths)
     return posixpath.join(*paths)
 
+  @property
+  def sep(self) -> str:
+    if self._path.protocol == "file":
+      return os.sep
+    return posixpath.sep
+
+  def dirname(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.dirname(path)
+    return posixpath.dirname(path)
+
+  def basename(self, path:str) -> str:
+    if self._path.protocol == "file":
+      return os.path.basename(path)
+    return posixpath.basename(path)
+
   def __getitem__(self, key) -> Union[dict,bytes,List[dict]]:
     if isinstance(key, tuple) and len(key) == 2 and isinstance(key[1], slice) and isinstance(key[0], str):
       return self.get({ 'path': key[0], 'start': key[1].start, 'end': key[1].stop })
```
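A sketch of the new protocol-aware path helpers, which dispatch to os.path for file:// and to posixpath otherwise:

```python
cf = CloudFiles("gs://bucket/dir/")
cf.sep                    # "/" for cloud protocols, os.sep for file://
cf.dirname("a/b/c.txt")   # "a/b"
cf.basename("a/b/c.txt")  # "c.txt"
```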
```diff
@@ -1307,11 +1747,17 @@
 
 class CloudFile:
   def __init__(
-    self,
+    self,
+    path:str,
+    cache_meta:bool = False,
     secrets:SecretsType = None,
     composite_upload_threshold:int = int(1e8),
     locking:bool = True,
     lock_dir:Optional[str] = None,
+    endpoint:Optional[str] = None,
+    no_sign_request:bool = False,
+    request_payer:Optional[str] = None,
+    use_https:bool = False,
   ):
     path = paths.normalize(path)
     self.cf = CloudFiles(
```
```diff
@@ -1320,6 +1766,10 @@ class CloudFile:
       composite_upload_threshold=composite_upload_threshold,
       locking=locking,
       lock_dir=lock_dir,
+      use_https=use_https,
+      endpoint=endpoint,
+      request_payer=request_payer,
+      no_sign_request=no_sign_request,
     )
     self.filename = paths.basename(path)
```
```diff
@@ -1327,6 +1777,10 @@ class CloudFile:
     self._size:Optional[int] = None
     self._head = None
 
+  @property
+  def sep(self) -> str:
+    return self.cf.sep
+
   @property
   def protocol(self):
     return self.cf.protocol
```
```diff
@@ -1440,6 +1894,22 @@ class CloudFile:
       reencode=reencode,
     )
 
+  def join(self, *args):
+    return self.cf.join(*args)
+
+  def dirname(self, *args):
+    return self.cf.dirname(*args)
+
+  def basename(self, *args):
+    return self.cf.basename(*args)
+
+  def touch(self):
+    return self.cf.touch(self.filename)
+
+  def move(self, dest):
+    """Move (rename) this file to dest."""
+    return self.cf.move(self.filename, dest)
+
   def __len__(self):
     return self.size()
```
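A sketch of the single-file conveniences added to CloudFile, each delegating to the underlying CloudFiles instance; the paths are placeholders:

```python
from cloudfiles import CloudFile

f = CloudFile("gs://bucket/dir/a.bin")
f.touch()                            # create a zero byte file if missing
f.move("gs://bucket/archive/a.bin")  # rename via CloudFiles.move
```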