cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/AUTHORS +1 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/METADATA +101 -21
- cloud_files-6.0.0.dist-info/RECORD +27 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/WHEEL +1 -1
- cloud_files-6.0.0.dist-info/pbr.json +1 -0
- cloudfiles/cloudfiles.py +548 -78
- cloudfiles/compression.py +8 -3
- cloudfiles/exceptions.py +4 -0
- cloudfiles/gcs.py +7 -3
- cloudfiles/interfaces.py +462 -69
- cloudfiles/lib.py +12 -2
- cloudfiles/monitoring.py +724 -0
- cloudfiles/paths.py +61 -5
- cloudfiles/resumable_tools.py +50 -15
- cloudfiles/scheduler.py +6 -1
- cloudfiles/secrets.py +16 -12
- cloudfiles/test.py +28 -0
- cloudfiles_cli/cloudfiles_cli.py +349 -41
- cloud_files-4.27.0.dist-info/RECORD +0 -26
- cloud_files-4.27.0.dist-info/pbr.json +0 -1
- cloudfiles/buckets.py +0 -10
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/LICENSE +0 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/entry_points.txt +0 -0
- {cloud_files-4.27.0.dist-info → cloud_files-6.0.0.dist-info}/top_level.txt +0 -0
cloudfiles/interfaces.py  CHANGED

@@ -4,14 +4,13 @@ from collections import defaultdict, namedtuple
 from datetime import datetime
 from io import BytesIO
 import json
-import os
+import os
 import posixpath
 import re
 
 import boto3
 import botocore
 import gevent.monkey
-from glob import glob
 import google.cloud.exceptions
 from google.cloud.storage import Batch, Client
 import requests
@@ -22,8 +21,8 @@ import fasteners
 
 from .compression import COMPRESSION_TYPES
 from .connectionpools import S3ConnectionPool, GCloudBucketPool, MemoryPool, MEMORY_DATA
-from .exceptions import MD5IntegrityError, CompressionError
-from .lib import mkdir, sip, md5, validate_s3_multipart_etag
+from .exceptions import MD5IntegrityError, CompressionError, AuthorizationError
+from .lib import mkdir, sip, md5, encode_crc32c_b64, validate_s3_multipart_etag
 from .secrets import (
   http_credentials,
   cave_credentials,
@@ -49,6 +48,7 @@ MEM_POOL = None
 
 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }
 
 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -304,6 +304,22 @@ class FileInterface(StorageInterface):
 
     return self.io_with_lock(do_size, path, exclusive=False)
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    total_bytes = 0
+    total_files = 0
+
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+    for root, dirs, files in os.walk(subdir):
+      for f in files:
+        path = os.path.join(root, f)
+        total_files += 1
+        total_bytes += os.path.getsize(path)
+
+    return (total_files, total_bytes)
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
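
The new subtree_size API (added to each interface in this release) returns a (total_files, total_bytes) tuple for everything stored beneath a prefix. A standalone sketch of the filesystem variant above, usable outside the class:

    import os

    def subtree_size(subdir):
      # Mirrors FileInterface.subtree_size: walk the tree and
      # accumulate a (file count, byte count) pair.
      total_files, total_bytes = 0, 0
      for root, dirs, files in os.walk(subdir):
        for f in files:
          total_files += 1
          total_bytes += os.path.getsize(os.path.join(root, f))
      return (total_files, total_bytes)
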
@@ -339,7 +355,7 @@ class FileInterface(StorageInterface):
     """
 
     layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix)
+    path = os.path.join(layer_path, prefix)
 
     filenames = []
 
@@ -348,17 +364,33 @@
       remove += os.path.sep
 
     if flat:
-
-
+      if os.path.isdir(path):
+        list_path = path
+        list_prefix = ''
+        prepend_prefix = prefix
+        if prepend_prefix and prepend_prefix[-1] != os.path.sep:
+          prepend_prefix += os.path.sep
+      else:
+        list_path = os.path.dirname(path)
+        list_prefix = os.path.basename(prefix)
+        prepend_prefix = os.path.dirname(prefix)
+        if prepend_prefix != '':
+          prepend_prefix += os.path.sep
+
+      for fobj in os.scandir(list_path):
+        if list_prefix != '' and not fobj.name.startswith(list_prefix):
          continue
-
-
+
+        if fobj.is_dir():
+          filenames.append(f"{prepend_prefix}{fobj.name}{os.path.sep}")
+        else:
+          filenames.append(f"{prepend_prefix}{fobj.name}")
     else:
       subdir = os.path.join(layer_path, os.path.dirname(prefix))
       for root, dirs, files in os.walk(subdir):
-        files =
-        files =
-        files =
+        files = ( os.path.join(root, f) for f in files )
+        files = ( f.removeprefix(remove) for f in files )
+        files = ( f for f in files if f[:len(prefix)] == prefix )
 
         for filename in files:
          filenames.append(filename)
@@ -452,8 +484,60 @@ class MemoryInterface(StorageInterface):
       result = result[slice(start, end)]
     return (result, encoding, None, None)
 
+  def save_file(self, src, dest, resumable) -> tuple[bool,int]:
+    key = self.get_path_to_file(src)
+    with EXT_TEST_SEQUENCE_LOCK:
+      exts = list(EXT_TEST_SEQUENCE)
+    exts = [ x[0] for x in exts ]
+
+    path = key
+    true_ext = ''
+    for ext in exts:
+      pathext = key + ext
+      if pathext in self._data:
+        path = pathext
+        true_ext = ext
+        break
+
+    filepath = os.path.join(dest, os.path.basename(path))
+
+    mkdir(os.path.dirname(dest))
+    try:
+      with open(dest + true_ext, "wb") as f:
+        f.write(self._data[path])
+    except KeyError:
+      return (False, 0)
+
+    return (True, len(self._data[path]))
+
   def head(self, file_path):
-
+    path = self.get_path_to_file(file_path)
+
+    data = None
+    encoding = ''
+
+    with EXT_TEST_SEQUENCE_LOCK:
+      for ext, enc in EXT_TEST_SEQUENCE:
+        pathext = path + ext
+        if pathext in self._data:
+          data = self._data[pathext]
+          encoding = enc
+          break
+
+    return {
+      "Cache-Control": None,
+      "Content-Length": len(data),
+      "Content-Type": None,
+      "ETag": None,
+      "Last-Modified": None,
+      "Content-Md5": None,
+      "Content-Encoding": encoding,
+      "Content-Disposition": None,
+      "Content-Language": None,
+      "Storage-Class": None,
+      "Request-Charged": None,
+      "Parts-Count": None,
+    }
 
   def size(self, file_path):
     path = self.get_path_to_file(file_path)
@@ -474,6 +558,14 @@ class MemoryInterface(StorageInterface):
 
     return None
 
+  def copy_file(self, src_path, dest_bucket, dest_key) -> tuple[bool,int]:
+    key = self.get_path_to_file(src_path)
+    with MEM_BUCKET_POOL_LOCK:
+      pool = MEM_POOL[MemoryPoolParams(dest_bucket)]
+    dest_bucket = pool.get_connection(None, None)
+    dest_bucket[dest_key] = self._data[key]
+    return (True, len(self._data[key]))
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     return path in self._data or any(( (path + ext in self._data) for ext in COMPRESSION_EXTENSIONS ))
@@ -505,18 +597,28 @@ class MemoryInterface(StorageInterface):
 
     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")
 
     remove = layer_path
     if len(remove) and remove[-1] != '/':
       remove += '/'
 
-    filenames =
-    filenames =
+    filenames = ( f.removeprefix(remove) for f in self._data )
+    filenames = ( f for f in filenames if f[:len(prefix)] == prefix )
 
     if flat:
-
+      tmp = []
+      for f in filenames:
+        elems = f.removeprefix(prefix).split('/')
+        if len(elems) > 1 and elems[0] == '':
+          elems.pop(0)
+          elems[0] = f'/{elems[0]}'
+
+        if len(elems) > 1:
+          tmp.append(f"{prefix}{elems[0]}/")
+        else:
+          tmp.append(f"{prefix}{elems[0]}")
+      filenames = tmp
 
     def stripext(fname):
       (base, ext) = os.path.splitext(fname)
@@ -529,6 +631,23 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+
+    total_bytes = 0
+    total_files = 0
+    for filename, binary in self._data.items():
+      f_prefix = filename.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+        total_files += 1
+
+    return (total_files, total_bytes)
+
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -576,7 +695,7 @@ class GoogleCloudStorageInterface(StorageInterface):
       blob.upload_from_string(content, content_type)
 
   @retry
-  def copy_file(self, src_path, dest_bucket, dest_key):
+  def copy_file(self, src_path, dest_bucket, dest_key) -> tuple[bool,int]:
     key = self.get_path_to_file(src_path)
     source_blob = self._bucket.blob( key )
     with GCS_BUCKET_POOL_LOCK:
@@ -584,13 +703,13 @@ class GoogleCloudStorageInterface(StorageInterface):
       dest_bucket = pool.get_connection(self._secrets, None)
 
     try:
-      self._bucket.copy_blob(
+      blob = self._bucket.copy_blob(
         source_blob, dest_bucket, dest_key
       )
     except google.api_core.exceptions.NotFound:
-      return False
+      return (False, 0)
 
-    return True
+    return (True, blob.size)
 
   @retry_if_not(google.cloud.exceptions.NotFound)
   def get_file(self, file_path, start=None, end=None, part_size=None):
@@ -616,6 +735,28 @@ class GoogleCloudStorageInterface(StorageInterface):
 
     return (content, blob.content_encoding, hash_value, hash_type)
 
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool, int]:
+    key = self.get_path_to_file(src)
+    blob = self._bucket.blob(key)
+    try:
+      mkdir(os.path.dirname(dest))
+      blob.download_to_filename(
+        filename=dest,
+        raw_download=True,
+        checksum=None
+      )
+    except google.cloud.exceptions.NotFound:
+      return (False, 0)
+
+    num_bytes = os.path.getsize(dest)
+
+    ext = FileInterface.get_extension(blob.content_encoding)
+    if not dest.endswith(ext):
+      os.rename(dest, dest + ext)
+
+    return (True, num_bytes)
+
   @retry_if_not(google.cloud.exceptions.NotFound)
   def head(self, file_path):
     key = self.get_path_to_file(file_path)
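
In the new GoogleCloudStorageInterface.save_file, raw_download=True with checksum=None asks google-cloud-storage for the object's bytes exactly as stored (still compressed, no client-side checksum transcoding); the compression extension is then restored by renaming. A minimal sketch of that call pattern (bucket and object names are placeholders, default credentials assumed):

    from google.cloud.storage import Client

    client = Client()  # uses application default credentials
    blob = client.bucket("example-bucket").blob("path/to/key")  # placeholder names
    # Fetch the stored bytes verbatim: no decompression, no checksum validation.
    blob.download_to_filename("local.out", raw_download=True, checksum=None)
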
@@ -690,6 +831,7 @@ class GoogleCloudStorageInterface(StorageInterface):
     except google.cloud.exceptions.NotFound:
       pass
 
+
   @retry
   def list_files(self, prefix, flat=False):
     """
@@ -703,14 +845,50 @@ class GoogleCloudStorageInterface(StorageInterface):
     path = posixpath.join(layer_path, prefix)
 
     delimiter = '/' if flat else None
-
-
-
-
-
-
-
-
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
+    )
+
+    for page in blobs.pages:
+      if page.prefixes:
+        yield from (
+          item.removeprefix(path)
+          for item in page.prefixes
+        )
+
+      for blob in page:
+        filename = blob.name.removeprefix(layer_path)
+        if not filename:
+          continue
+        elif not flat and filename[-1] != '/':
+          yield filename
+        elif flat and '/' not in blob.name.removeprefix(path):
+          yield filename
+
+
+  @retry
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+
+    total_bytes = 0
+    total_files = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+        total_files += 1
+
+    return (total_files, total_bytes)
 
   def release_connection(self):
     global GC_POOL
@@ -759,6 +937,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers
 
@@ -766,6 +946,9 @@
     headers = self.head(file_path)
     return int(headers["Content-Length"])
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    raise NotImplementedError()
+
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -776,24 +959,60 @@
       end = int(end - 1) if end is not None else ''
       headers["Range"] = f"bytes={start}-{end}"
 
-
-
-
-
-
-
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)
 
     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None
 
-    return (
+    return (content, content_encoding, None, None)
+
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool, int]:
+    key = self.get_path_to_file(src)
+
+    headers = self.head(src)
+    content_encoding = headers.get('Content-Encoding', None)
+
+    try:
+      ext = FileInterface.get_extension(content_encoding)
+    except ValueError:
+      ext = ""
+
+    fulldest = dest + ext
+
+    partname = fulldest
+    if resumable:
+      partname += ".part"
+
+    downloaded_size = 0
+    if resumable and os.path.exists(partname):
+      downloaded_size = os.path.getsize(partname)
+
+    streamed_bytes = 0
+
+    range_headers = { "Range": f"bytes={downloaded_size}-" }
+    with self.session.get(key, headers=range_headers, stream=True) as resp:
+      if resp.status_code not in [200, 206]:
+        resp.raise_for_status()
+        return (False, 0)
+
+      with open(partname, 'ab') as f:
+        for chunk in resp.iter_content(chunk_size=int(10e6)):
+          f.write(chunk)
+          streamed_bytes += len(chunk)
+
+    if resumable:
+      os.rename(partname, fulldest)
+
+    return (True, streamed_bytes)
 
   @retry
   def exists(self, file_path):
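
HttpInterface.save_file gets resumability from a .part file plus an HTTP Range request that starts at however many bytes are already on disk. The same strategy as a self-contained sketch (function name and chunk size are illustrative; assumes the server honors Range requests):

    import os
    import requests

    def resumable_download(url, dest):
      part = dest + ".part"
      offset = os.path.getsize(part) if os.path.exists(part) else 0
      headers = { "Range": f"bytes={offset}-" }  # resume where the .part left off
      with requests.get(url, headers=headers, stream=True) as resp:
        resp.raise_for_status()
        with open(part, "ab") as f:
          for chunk in resp.iter_content(chunk_size=int(10e6)):
            f.write(chunk)
      os.rename(part, dest)  # only a completed download gets its final name
      return os.path.getsize(dest)
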
@@ -813,29 +1032,48 @@
     )
     if prefix and prefix[0] == '/':
       prefix = prefix[1:]
-    if prefix and prefix[-1] != '/':
-      prefix += '/'
 
     headers = self.default_headers()
 
-    @
+    @retry_if_not(AuthorizationError)
     def request(token):
       nonlocal headers
+      params = {}
+      if prefix:
+        params["prefix"] = prefix
+      if token is not None:
+        params["pageToken"] = token
+      if flat:
+        params["delimiter"] = '/'
+
       results = self.session.get(
         f"https://storage.googleapis.com/storage/v1/b/{bucket}/o",
-        params=
+        params=params,
         headers=headers,
       )
+      if results.status_code in [401,403]:
+        raise AuthorizationError(f"http {results.status_code}")
+
       results.raise_for_status()
       results.close()
       return results.json()
 
+    strip = posixpath.dirname(prefix)
+    if strip and strip[-1] != '/':
+      strip += '/'
+
     token = None
     while True:
       results = request(token)
 
-
-      yield
+      if 'prefixes' in results:
+        yield from (
+          item.removeprefix(strip)
+          for item in results["prefixes"]
+        )
+
+      for res in results.get("items", []):
+        yield res["name"].removeprefix(strip)
 
       token = results.get("nextPageToken", None)
       if token is None:
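
The rewritten request() pages through the GCS JSON API, treating "prefixes" as subdirectories when a delimiter is set, and raises AuthorizationError on 401/403 so the retry decorator fails fast instead of retrying a hopeless request. One page of that listing call, sketched with bare requests (anonymous access to a public bucket assumed):

    import requests

    def list_gcs_page(bucket, prefix="", flat=False, token=None):
      params = {}
      if prefix:
        params["prefix"] = prefix
      if flat:
        params["delimiter"] = "/"  # groups child paths into "prefixes"
      if token is not None:
        params["pageToken"] = token
      resp = requests.get(
        f"https://storage.googleapis.com/storage/v1/b/{bucket}/o",
        params=params,
      )
      resp.raise_for_status()
      return resp.json()  # keys: "items", "prefixes", "nextPageToken"
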
@@ -887,13 +1125,15 @@
   def list_files(self, prefix, flat=False):
     if self._path.host == "https://storage.googleapis.com":
       yield from self._list_files_google(prefix, flat)
-
+      return
+
     url = posixpath.join(self._path.host, self._path.path, prefix)
     resp = requests.head(url)
 
     server = resp.headers.get("Server", "").lower()
     if 'apache' in server:
       yield from self._list_files_apache(prefix, flat)
+      return
     else:
       raise NotImplementedError()
 
@@ -963,7 +1203,7 @@ class S3Interface(StorageInterface):
     elif compress in ("xz", "lzma"):
       attrs['ContentEncoding'] = 'xz'
     elif compress in ("bzip2", "bz2"):
-      attrs['ContentEncoding'] = '
+      attrs['ContentEncoding'] = 'bzip2'
     elif compress:
       raise ValueError("Compression type {} not supported.".format(compress))
 
@@ -972,10 +1212,17 @@ class S3Interface(StorageInterface):
     if storage_class:
       attrs['StorageClass'] = storage_class
 
-    multipart =
+    multipart = False
+    is_file_handle = hasattr(content, "read") and hasattr(content, "seek")
+
+    if is_file_handle:
+      content_length = os.fstat(content.fileno()).st_size
+    else:
+      content_length = len(content)
 
-    if not multipart and
-
+    if not multipart and content_length > int(self.composite_upload_threshold):
+      if not is_file_handle:
+        content = BytesIO(content)
       multipart = True
 
     # gevent monkey patching has a bad interaction with s3's use
@@ -985,32 +1232,55 @@ class S3Interface(StorageInterface):
       multipart = False
       content = content.read()
 
+    # WMS 2025-07-05:
+    # Currently, boto3 does not properly support streaming smaller files.
+    # It uses an S3 API that requires a checksum up-front, but streaming
+    # checksums can only be provided at the end.
+    # https://github.com/boto/boto3/issues/3738
+    # https://github.com/boto/boto3/issues/4392
+    # https://docs.aws.amazon.com/sdkref/latest/guide/feature-dataintegrity.html
+    if not multipart and is_file_handle and content_length < int(self.composite_upload_threshold):
+      content = content.read()
+
     if multipart:
       self._conn.upload_fileobj(content, self._path.bucket, key, ExtraArgs=attrs)
     else:
+      if isinstance(content, str):
+        content = content.encode('utf8')
+
       attrs['Bucket'] = self._path.bucket
       attrs['Body'] = content
       attrs['Key'] = key
-      attrs[
+      attrs["ChecksumCRC32C"] = encode_crc32c_b64(content).decode('utf8')
       self._conn.put_object(**attrs)
 
   @retry
-  def copy_file(self, src_path, dest_bucket_name, dest_key):
+  def copy_file(self, src_path, dest_bucket_name, dest_key) -> tuple[bool,int]:
     key = self.get_path_to_file(src_path)
-
+    s3client = self._get_bucket(dest_bucket_name)
     copy_source = {
       'Bucket': self._path.bucket,
      'Key': key,
    }
    try:
-
+      response = s3client.copy_object(
+        CopySource=copy_source,
+        Bucket=dest_bucket_name,
+        Key=dest_key,
+        MetadataDirective='COPY' # Ensure metadata like Content-Encoding is copied
+      )
     except botocore.exceptions.ClientError as err:
       if err.response['Error']['Code'] in ('NoSuchKey', '404'):
-        return False
+        return (False, 0)
       else:
         raise
 
-
+    try:
+      num_bytes = int(response["ResponseMetadata"]["HTTPHeaders"]["content-length"])
+    except KeyError:
+      num_bytes = 0
+
+    return (True, num_bytes)
 
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
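
put_object now sends a ChecksumCRC32C attribute built by the new encode_crc32c_b64 helper imported from .lib. S3 expects the base64 encoding of the big-endian 4-byte CRC32C digest. A plausible sketch of the helper (the google-crc32c backend is an assumption; the real implementation lives in cloudfiles/lib.py):

    import base64
    import struct

    import google_crc32c  # assumed backend; any CRC32C implementation works

    def encode_crc32c_b64(data):
      # Base64 of the big-endian CRC32C digest, the format S3's
      # ChecksumCRC32C field expects.
      checksum = google_crc32c.value(data)  # unsigned 32-bit integer
      return base64.b64encode(struct.pack(">I", checksum))
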
@@ -1038,6 +1308,11 @@
     if 'ContentEncoding' in resp:
       encoding = resp['ContentEncoding']
 
+    encoding = ",".join([
+      enc for enc in encoding.split(",")
+      if enc != "aws-chunked"
+    ])
+
     # s3 etags return hex digests but we need the base64 encoding
     # to make uniform comparisons.
     # example s3 etag: "31ee76261d87fed8cb9d4c465c48158c"
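
Objects uploaded with chunked transfer can report a composite value such as aws-chunked,gzip in ContentEncoding; the filter above keeps only the real content encoding. For example:

    encoding = "aws-chunked,gzip"
    encoding = ",".join([
      enc for enc in encoding.split(",")
      if enc != "aws-chunked"
    ])
    assert encoding == "gzip"
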
@@ -1065,6 +1340,44 @@
       else:
         raise
 
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool,int]:
+    key = self.get_path_to_file(src)
+    kwargs = self._additional_attrs.copy()
+
+    resp = self.head(src)
+
+    if resp is None:
+      return (False, 0)
+
+    mkdir(os.path.dirname(dest))
+
+    encoding = resp.get("Content-Encoding", "") or ""
+    encoding = ",".join([
+      enc for enc in encoding.split(",")
+      if enc != "aws-chunked"
+    ])
+    ext = FileInterface.get_extension(encoding)
+
+    if not dest.endswith(ext):
+      dest += ext
+
+    try:
+      self._conn.download_file(
+        Bucket=self._path.bucket,
+        Key=key,
+        Filename=dest,
+        **kwargs
+      )
+    except botocore.exceptions.ClientError as err:
+      if err.response['Error']['Code'] in ('NoSuchKey', '404'):
+        return (False, 0)
+      else:
+        raise
+
+    num_bytes = os.path.getsize(dest)
+    return (True, num_bytes)
+
   @retry
   def head(self, file_path):
     try:
@@ -1073,6 +1386,11 @@
         Key=self.get_path_to_file(file_path),
         **self._additional_attrs,
       )
+
+      encoding = response.get("ContentEncoding", None)
+      if encoding == '':
+        encoding = None
+
       return {
         "Cache-Control": response.get("CacheControl", None),
         "Content-Length": response.get("ContentLength", None),
@@ -1080,7 +1398,7 @@
         "ETag": response.get("ETag", None),
         "Last-Modified": response.get("LastModified", None),
         "Content-Md5": response["ResponseMetadata"]["HTTPHeaders"].get("content-md5", None),
-        "Content-Encoding":
+        "Content-Encoding": encoding,
         "Content-Disposition": response.get("ContentDisposition", None),
         "Content-Language": response.get("ContentLanguage", None),
         "Storage-Class": response.get("StorageClass", None),
@@ -1171,7 +1489,7 @@
     path = posixpath.join(layer_path, prefix)
 
     @retry
-    def s3lst(continuation_token=None):
+    def s3lst(path, continuation_token=None):
       kwargs = {
         'Bucket': self._path.bucket,
         'Prefix': path,
@@ -1185,31 +1503,89 @@
 
       return self._conn.list_objects_v2(**kwargs)
 
-    resp = s3lst()
+    resp = s3lst(path)
+    # the case where the prefix is something like "build", but "build" is a subdirectory
+    # so requery with "build/" to get the proper behavior
+    if (
+      flat
+      and path
+      and path[-1] != '/'
+      and 'Contents' not in resp
+      and len(resp.get("CommonPrefixes", [])) == 1
+    ):
+      path += '/'
+      resp = s3lst(path)
 
     def iterate(resp):
+      if 'CommonPrefixes' in resp.keys():
+        yield from [
+          item["Prefix"].removeprefix(layer_path)
+          for item in resp['CommonPrefixes']
+        ]
+
       if 'Contents' not in resp.keys():
         resp['Contents'] = []
 
       for item in resp['Contents']:
         key = item['Key']
-        filename = key.
+        filename = key.removeprefix(layer_path)
        if filename == '':
          continue
        elif not flat and filename[-1] != '/':
          yield filename
-        elif flat and '/' not in key.
+        elif flat and '/' not in key.removeprefix(path):
          yield filename
 
     for filename in iterate(resp):
       yield filename
 
     while resp['IsTruncated'] and resp['NextContinuationToken']:
-      resp = s3lst(resp['NextContinuationToken'])
+      resp = s3lst(path, resp['NextContinuationToken'])
 
       for filename in iterate(resp):
         yield filename
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+
+      return self._conn.list_objects_v2(**kwargs)
+
+    resp = s3lst(path)
+
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+
+    total_bytes = 0
+    total_files = 0
+    for num_bytes in iterate(resp):
+      total_files += 1
+      total_bytes += num_bytes
+
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+
+      for num_bytes in iterate(resp):
+        total_files += 1
+        total_bytes += num_bytes
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'
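
S3Interface.subtree_size sums the Size field across list_objects_v2 pages by managing continuation tokens manually. The same accounting via boto3's built-in paginator, as a standalone sketch (bucket and prefix are placeholders):

    import boto3

    def s3_subtree_size(bucket, prefix=""):
      s3 = boto3.client("s3")
      total_files, total_bytes = 0, 0
      paginator = s3.get_paginator("list_objects_v2")
      for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for item in page.get("Contents", []):
          total_files += 1
          total_bytes += item.get("Size", 0)
      return (total_files, total_bytes)
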
@@ -1224,8 +1600,25 @@ class CaveInterface(HttpInterface):
   is, don't worry about it.
   see: https://github.com/CAVEconnectome
   """
-  def
-
+  def __init__(self, path, secrets=None, **kwargs):
+    super().__init__(path, secrets=secrets, **kwargs)
+
+    secrets = kwargs.get('secrets', None)
+    if secrets is None:
+      secrets = {}
+
+    self._token = secrets.get('token', None)
+    if self._token is None:
+      server = self._path.host.replace("https://", "", 1)
+      server = server.replace("http://", "", 1)
+      self._token = cave_credentials(server)
+      if self._token is not None:
+        self._token = self._token.get('token', None)
+
+  def default_headers(self) -> dict:
+    if self._token is None:
+      return {}
+
     return {
-      "Authorization": f"Bearer {
-    }
+      "Authorization": f"Bearer {self._token}",
+    }
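
CaveInterface now resolves its bearer token once at construction, from secrets['token'] if supplied and otherwise from the stored CAVE credentials for the server, and default_headers attaches it to every request. A hypothetical illustration of the resulting request (server URL and token are placeholders):

    import requests

    token = "my-cave-token"  # from secrets or cave_credentials(server)
    headers = { "Authorization": f"Bearer {token}" }
    resp = requests.get("https://cave.example.org/some/file", headers=headers)  # placeholder URL
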