cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cloudfiles/interfaces.py CHANGED
@@ -4,14 +4,13 @@ from collections import defaultdict, namedtuple
 from datetime import datetime
 from io import BytesIO
 import json
-import os.path
+import os
 import posixpath
 import re
 
 import boto3
 import botocore
 import gevent.monkey
-from glob import glob
 import google.cloud.exceptions
 from google.cloud.storage import Batch, Client
 import requests
@@ -22,8 +21,8 @@ import fasteners
 
 from .compression import COMPRESSION_TYPES
 from .connectionpools import S3ConnectionPool, GCloudBucketPool, MemoryPool, MEMORY_DATA
-from .exceptions import MD5IntegrityError, CompressionError
-from .lib import mkdir, sip, md5, validate_s3_multipart_etag
+from .exceptions import MD5IntegrityError, CompressionError, AuthorizationError
+from .lib import mkdir, sip, md5, encode_crc32c_b64, validate_s3_multipart_etag
 from .secrets import (
   http_credentials,
   cave_credentials,
@@ -49,6 +48,7 @@ MEM_POOL = None
 
 S3_ACLS = {
   "tigerdata": "private",
+  "nokura": "public-read",
 }
 
 S3ConnectionPoolParams = namedtuple('S3ConnectionPoolParams', 'service bucket_name request_payer')
@@ -304,6 +304,22 @@ class FileInterface(StorageInterface):
 
     return self.io_with_lock(do_size, path, exclusive=False)
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    total_bytes = 0
+    total_files = 0
+
+    subdir = self.get_path_to_file("")
+    if prefix:
+      subdir = os.path.join(subdir, os.path.dirname(prefix))
+
+    for root, dirs, files in os.walk(subdir):
+      for f in files:
+        path = os.path.join(root, f)
+        total_files += 1
+        total_bytes += os.path.getsize(path)
+
+    return (total_files, total_bytes)
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     def do_exists():
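
Note: each backend's new subtree_size returns a (total_files, total_bytes) tuple for everything under a prefix. A minimal standalone sketch of the file-backend logic above (the directory path is hypothetical):

    import os

    def subtree_size(subdir: str) -> tuple[int, int]:
      total_files, total_bytes = 0, 0
      for root, dirs, files in os.walk(subdir):
        for f in files:
          total_files += 1
          total_bytes += os.path.getsize(os.path.join(root, f))
      return (total_files, total_bytes)

    num_files, num_bytes = subtree_size("/tmp/data")  # hypothetical directory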
@@ -339,7 +355,7 @@ class FileInterface(StorageInterface):
     """
 
     layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    path = os.path.join(layer_path, prefix)
 
     filenames = []
 
@@ -348,17 +364,33 @@ class FileInterface(StorageInterface):
       remove += os.path.sep
 
     if flat:
-      for file_path in glob(path):
-        if not os.path.isfile(file_path):
+      if os.path.isdir(path):
+        list_path = path
+        list_prefix = ''
+        prepend_prefix = prefix
+        if prepend_prefix and prepend_prefix[-1] != os.path.sep:
+          prepend_prefix += os.path.sep
+      else:
+        list_path = os.path.dirname(path)
+        list_prefix = os.path.basename(prefix)
+        prepend_prefix = os.path.dirname(prefix)
+        if prepend_prefix != '':
+          prepend_prefix += os.path.sep
+
+      for fobj in os.scandir(list_path):
+        if list_prefix != '' and not fobj.name.startswith(list_prefix):
          continue
-        filename = file_path.replace(remove, '')
-        filenames.append(filename)
+
+        if fobj.is_dir():
+          filenames.append(f"{prepend_prefix}{fobj.name}{os.path.sep}")
+        else:
+          filenames.append(f"{prepend_prefix}{fobj.name}")
     else:
       subdir = os.path.join(layer_path, os.path.dirname(prefix))
       for root, dirs, files in os.walk(subdir):
-        files = [ os.path.join(root, f) for f in files ]
-        files = [ f.replace(remove, '') for f in files ]
-        files = [ f for f in files if f[:len(prefix)] == prefix ]
+        files = ( os.path.join(root, f) for f in files )
+        files = ( f.removeprefix(remove) for f in files )
+        files = ( f for f in files if f[:len(prefix)] == prefix )
 
         for filename in files:
           filenames.append(filename)
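
Note: the scandir-based flat listing now reports immediate subdirectories with a trailing separator instead of silently dropping them. A runnable sketch of just that convention:

    import os, tempfile

    root = tempfile.mkdtemp()
    os.makedirs(os.path.join(root, "build"))
    open(os.path.join(root, "info"), "w").close()

    # files by name, directories with a trailing os.path.sep,
    # mirroring the flat branch above
    entries = [
      e.name + os.path.sep if e.is_dir() else e.name
      for e in os.scandir(root)
    ]
    print(sorted(entries))  # ['build/', 'info'] on POSIX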
@@ -452,8 +484,60 @@ class MemoryInterface(StorageInterface):
       result = result[slice(start, end)]
     return (result, encoding, None, None)
 
+  def save_file(self, src, dest, resumable) -> tuple[bool,int]:
+    key = self.get_path_to_file(src)
+    with EXT_TEST_SEQUENCE_LOCK:
+      exts = list(EXT_TEST_SEQUENCE)
+      exts = [ x[0] for x in exts ]
+
+    path = key
+    true_ext = ''
+    for ext in exts:
+      pathext = key + ext
+      if pathext in self._data:
+        path = pathext
+        true_ext = ext
+        break
+
+    filepath = os.path.join(dest, os.path.basename(path))
+
+    mkdir(os.path.dirname(dest))
+    try:
+      with open(dest + true_ext, "wb") as f:
+        f.write(self._data[path])
+    except KeyError:
+      return (False, 0)
+
+    return (True, len(self._data[path]))
+
   def head(self, file_path):
-    raise NotImplementedError()
+    path = self.get_path_to_file(file_path)
+
+    data = None
+    encoding = ''
+
+    with EXT_TEST_SEQUENCE_LOCK:
+      for ext, enc in EXT_TEST_SEQUENCE:
+        pathext = path + ext
+        if pathext in self._data:
+          data = self._data[pathext]
+          encoding = enc
+          break
+
+    return {
+      "Cache-Control": None,
+      "Content-Length": len(data),
+      "Content-Type": None,
+      "ETag": None,
+      "Last-Modified": None,
+      "Content-Md5": None,
+      "Content-Encoding": encoding,
+      "Content-Disposition": None,
+      "Content-Language": None,
+      "Storage-Class": None,
+      "Request-Charged": None,
+      "Parts-Count": None,
+    }
 
   def size(self, file_path):
     path = self.get_path_to_file(file_path)
@@ -474,6 +558,14 @@ class MemoryInterface(StorageInterface):
 
     return None
 
+  def copy_file(self, src_path, dest_bucket, dest_key) -> tuple[bool,int]:
+    key = self.get_path_to_file(src_path)
+    with MEM_BUCKET_POOL_LOCK:
+      pool = MEM_POOL[MemoryPoolParams(dest_bucket)]
+      dest_bucket = pool.get_connection(None, None)
+    dest_bucket[dest_key] = self._data[key]
+    return (True, len(self._data[key]))
+
   def exists(self, file_path):
     path = self.get_path_to_file(file_path)
     return path in self._data or any(( (path + ext in self._data) for ext in COMPRESSION_EXTENSIONS ))
@@ -505,18 +597,28 @@ class MemoryInterface(StorageInterface):
 
     Returns: iterator
     """
-    layer_path = self.get_path_to_file("")
-    path = os.path.join(layer_path, prefix) + '*'
+    layer_path = self.get_path_to_file("")
 
     remove = layer_path
     if len(remove) and remove[-1] != '/':
       remove += '/'
 
-    filenames = [ f.replace(remove, '') for f in self._data ]
-    filenames = [ f for f in filenames if f[:len(prefix)] == prefix ]
+    filenames = ( f.removeprefix(remove) for f in self._data )
+    filenames = ( f for f in filenames if f[:len(prefix)] == prefix )
 
     if flat:
-      filenames = [ f for f in filenames if '/' not in f.replace(prefix, '') ]
+      tmp = []
+      for f in filenames:
+        elems = f.removeprefix(prefix).split('/')
+        if len(elems) > 1 and elems[0] == '':
+          elems.pop(0)
+          elems[0] = f'/{elems[0]}'
+
+        if len(elems) > 1:
+          tmp.append(f"{prefix}{elems[0]}/")
+        else:
+          tmp.append(f"{prefix}{elems[0]}")
+      filenames = tmp
 
     def stripext(fname):
       (base, ext) = os.path.splitext(fname)
@@ -529,6 +631,23 @@ class MemoryInterface(StorageInterface):
     filenames.sort()
     return iter(filenames)
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+
+    remove = layer_path
+    if len(remove) and remove[-1] != '/':
+      remove += '/'
+
+    total_bytes = 0
+    total_files = 0
+    for filename, binary in self._data.items():
+      f_prefix = filename.removeprefix(remove)[:len(prefix)]
+      if f_prefix == prefix:
+        total_bytes += len(binary)
+        total_files += 1
+
+    return (total_files, total_bytes)
+
 class GoogleCloudStorageInterface(StorageInterface):
   exists_batch_size = Batch._MAX_BATCH_SIZE
   delete_batch_size = Batch._MAX_BATCH_SIZE
@@ -576,7 +695,7 @@ class GoogleCloudStorageInterface(StorageInterface):
     blob.upload_from_string(content, content_type)
 
   @retry
-  def copy_file(self, src_path, dest_bucket, dest_key):
+  def copy_file(self, src_path, dest_bucket, dest_key) -> tuple[bool,int]:
     key = self.get_path_to_file(src_path)
     source_blob = self._bucket.blob( key )
     with GCS_BUCKET_POOL_LOCK:
@@ -584,13 +703,13 @@ class GoogleCloudStorageInterface(StorageInterface):
       dest_bucket = pool.get_connection(self._secrets, None)
 
     try:
-      self._bucket.copy_blob(
+      blob = self._bucket.copy_blob(
         source_blob, dest_bucket, dest_key
       )
     except google.api_core.exceptions.NotFound:
-      return False
+      return (False, 0)
 
-    return True
+    return (True, blob.size)
 
   @retry_if_not(google.cloud.exceptions.NotFound)
   def get_file(self, file_path, start=None, end=None, part_size=None):
@@ -616,6 +735,28 @@ class GoogleCloudStorageInterface(StorageInterface):
 
     return (content, blob.content_encoding, hash_value, hash_type)
 
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool, int]:
+    key = self.get_path_to_file(src)
+    blob = self._bucket.blob(key)
+    try:
+      mkdir(os.path.dirname(dest))
+      blob.download_to_filename(
+        filename=dest,
+        raw_download=True,
+        checksum=None
+      )
+    except google.cloud.exceptions.NotFound:
+      return (False, 0)
+
+    num_bytes = os.path.getsize(dest)
+
+    ext = FileInterface.get_extension(blob.content_encoding)
+    if not dest.endswith(ext):
+      os.rename(dest, dest + ext)
+
+    return (True, num_bytes)
+
   @retry_if_not(google.cloud.exceptions.NotFound)
   def head(self, file_path):
     key = self.get_path_to_file(file_path)
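
Note: save_file writes the blob bytes verbatim (raw_download=True, checksum disabled) and then appends the file extension implied by the blob's Content-Encoding, so a gzip-encoded object lands on disk as dest + ".gz". A sketch of that renaming rule, assuming a mapping like the one FileInterface.get_extension presumably implements:

    # assumed encoding-to-extension mapping, for illustration only
    ENCODING_EXT = { "": "", "gzip": ".gz", "br": ".br", "zstd": ".zstd", "xz": ".xz" }

    def final_filename(dest: str, content_encoding: str) -> str:
      ext = ENCODING_EXT.get(content_encoding or "", "")
      return dest if dest.endswith(ext) else dest + ext

    print(final_filename("/tmp/skeleton", "gzip"))  # /tmp/skeleton.gz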
@@ -690,6 +831,7 @@ class GoogleCloudStorageInterface(StorageInterface):
     except google.cloud.exceptions.NotFound:
       pass
 
+
   @retry
   def list_files(self, prefix, flat=False):
     """
@@ -703,14 +845,50 @@ class GoogleCloudStorageInterface(StorageInterface):
     path = posixpath.join(layer_path, prefix)
 
     delimiter = '/' if flat else None
-    for blob in self._bucket.list_blobs(prefix=path, delimiter=delimiter):
-      filename = blob.name.replace(layer_path, '')
-      if not filename:
-        continue
-      elif not flat and filename[-1] != '/':
-        yield filename
-      elif flat and '/' not in blob.name.replace(path, ''):
-        yield filename
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      delimiter=delimiter,
+      page_size=2500,
+      fields="items(name),nextPageToken",
+    )
+
+    for page in blobs.pages:
+      if page.prefixes:
+        yield from (
+          item.removeprefix(path)
+          for item in page.prefixes
+        )
+
+      for blob in page:
+        filename = blob.name.removeprefix(layer_path)
+        if not filename:
+          continue
+        elif not flat and filename[-1] != '/':
+          yield filename
+        elif flat and '/' not in blob.name.removeprefix(path):
+          yield filename
+
+
+  @retry
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    blobs = self._bucket.list_blobs(
+      prefix=path,
+      page_size=5000,
+      fields="items(name,size),nextPageToken",
+    )
+
+    total_bytes = 0
+    total_files = 0
+    for page in blobs.pages:
+      for blob in page:
+        total_bytes += blob.size
+        total_files += 1
+
+    return (total_files, total_bytes)
 
   def release_connection(self):
     global GC_POOL
@@ -759,6 +937,8 @@ class HttpInterface(StorageInterface):
     key = self.get_path_to_file(file_path)
     headers = self.default_headers()
     with self.session.head(key, headers=headers) as resp:
+      if resp.status_code in (404, 403):
+        return None
       resp.raise_for_status()
       return resp.headers
 
@@ -766,6 +946,9 @@ class HttpInterface(StorageInterface):
     headers = self.head(file_path)
     return int(headers["Content-Length"])
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    raise NotImplementedError()
+
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
     key = self.get_path_to_file(file_path)
@@ -776,24 +959,60 @@ class HttpInterface(StorageInterface):
       end = int(end - 1) if end is not None else ''
       headers["Range"] = f"bytes={start}-{end}"
 
-    resp = self.session.get(key, headers=headers)
-
-    if resp.status_code in (404, 403):
-      return (None, None, None, None)
-    resp.close()
-    resp.raise_for_status()
+    with self.session.get(key, headers=headers, stream=True) as resp:
+      if resp.status_code in (404, 403):
+        return (None, None, None, None)
+      resp.raise_for_status()
+      resp.raw.decode_content = False
+      content = resp.raw.read()
+      content_encoding = resp.headers.get('Content-Encoding', None)
 
     # Don't check MD5 for http because the etag can come in many
     # forms from either GCS, S3 or another service entirely. We
     # probably won't figure out how to decode it right.
     # etag = resp.headers.get('etag', None)
-    content_encoding = resp.headers.get('Content-Encoding', None)
-
-    # requests automatically decodes these
-    if content_encoding in (None, '', 'gzip', 'deflate', 'br'):
-      content_encoding = None
 
-    return (resp.content, content_encoding, None, None)
+    return (content, content_encoding, None, None)
+
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool, int]:
+    key = self.get_path_to_file(src)
+
+    headers = self.head(src)
+    content_encoding = headers.get('Content-Encoding', None)
+
+    try:
+      ext = FileInterface.get_extension(content_encoding)
+    except ValueError:
+      ext = ""
+
+    fulldest = dest + ext
+
+    partname = fulldest
+    if resumable:
+      partname += ".part"
+
+    downloaded_size = 0
+    if resumable and os.path.exists(partname):
+      downloaded_size = os.path.getsize(partname)
+
+    streamed_bytes = 0
+
+    range_headers = { "Range": f"bytes={downloaded_size}-" }
+    with self.session.get(key, headers=range_headers, stream=True) as resp:
+      if resp.status_code not in [200, 206]:
+        resp.raise_for_status()
+        return (False, 0)
+
+      with open(partname, 'ab') as f:
+        for chunk in resp.iter_content(chunk_size=int(10e6)):
+          f.write(chunk)
+          streamed_bytes += len(chunk)
+
+    if resumable:
+      os.rename(partname, fulldest)
+
+    return (True, streamed_bytes)
 
   @retry
   def exists(self, file_path):
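
Note: the HTTP save_file resumes interrupted downloads by sizing the existing .part file and requesting only the remaining bytes via a Range header. The same pattern in isolation (URL and paths hypothetical; assumes the server honors Range requests):

    import os
    import requests

    def resume_download(url: str, dest: str) -> int:
      partname = dest + ".part"
      offset = os.path.getsize(partname) if os.path.exists(partname) else 0
      headers = { "Range": f"bytes={offset}-" }  # request the remainder only
      with requests.get(url, headers=headers, stream=True) as resp:
        resp.raise_for_status()  # expect 206 Partial Content when resuming
        with open(partname, "ab") as f:
          for chunk in resp.iter_content(chunk_size=int(10e6)):
            f.write(chunk)
      os.rename(partname, dest)
      return os.path.getsize(dest)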
@@ -813,29 +1032,48 @@ class HttpInterface(StorageInterface):
     )
     if prefix and prefix[0] == '/':
       prefix = prefix[1:]
-    if prefix and prefix[-1] != '/':
-      prefix += '/'
 
     headers = self.default_headers()
 
-    @retry
+    @retry_if_not(AuthorizationError)
     def request(token):
       nonlocal headers
+      params = {}
+      if prefix:
+        params["prefix"] = prefix
+      if token is not None:
+        params["pageToken"] = token
+      if flat:
+        params["delimiter"] = '/'
+
       results = self.session.get(
         f"https://storage.googleapis.com/storage/v1/b/{bucket}/o",
-        params={ "prefix": prefix, "pageToken": token },
+        params=params,
         headers=headers,
       )
+      if results.status_code in [401,403]:
+        raise AuthorizationError(f"http {results.status_code}")
+
       results.raise_for_status()
       results.close()
       return results.json()
 
+    strip = posixpath.dirname(prefix)
+    if strip and strip[-1] != '/':
+      strip += '/'
+
     token = None
     while True:
       results = request(token)
 
-      for res in results["items"]:
-        yield res["name"].replace(prefix, "", 1)
+      if 'prefixes' in results:
+        yield from (
+          item.removeprefix(strip)
+          for item in results["prefixes"]
+        )
+
+      for res in results.get("items", []):
+        yield res["name"].removeprefix(strip)
 
       token = results.get("nextPageToken", None)
       if token is None:
@@ -887,13 +1125,15 @@ class HttpInterface(StorageInterface):
   def list_files(self, prefix, flat=False):
     if self._path.host == "https://storage.googleapis.com":
       yield from self._list_files_google(prefix, flat)
-
+      return
+
     url = posixpath.join(self._path.host, self._path.path, prefix)
     resp = requests.head(url)
 
     server = resp.headers.get("Server", "").lower()
     if 'apache' in server:
       yield from self._list_files_apache(prefix, flat)
+      return
     else:
       raise NotImplementedError()
 
@@ -963,7 +1203,7 @@ class S3Interface(StorageInterface):
     elif compress in ("xz", "lzma"):
       attrs['ContentEncoding'] = 'xz'
     elif compress in ("bzip2", "bz2"):
-      attrs['ContentEncoding'] = 'bz2'
+      attrs['ContentEncoding'] = 'bzip2'
     elif compress:
       raise ValueError("Compression type {} not supported.".format(compress))
 
@@ -972,10 +1212,17 @@ class S3Interface(StorageInterface):
     if storage_class:
       attrs['StorageClass'] = storage_class
 
-    multipart = hasattr(content, "read") and hasattr(content, "seek")
+    multipart = False
+    is_file_handle = hasattr(content, "read") and hasattr(content, "seek")
+
+    if is_file_handle:
+      content_length = os.fstat(content.fileno()).st_size
+    else:
+      content_length = len(content)
 
-    if not multipart and len(content) > int(self.composite_upload_threshold):
-      content = BytesIO(content)
+    if not multipart and content_length > int(self.composite_upload_threshold):
+      if not is_file_handle:
+        content = BytesIO(content)
       multipart = True
 
     # gevent monkey patching has a bad interaction with s3's use
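
Note: upload sizing now handles both bytes-like payloads and open file handles without reading the latter into memory. A small sketch; os.fstat only works for handles backed by a real file descriptor (a BytesIO raises io.UnsupportedOperation from fileno()):

    import os

    def payload_length(content) -> int:
      if hasattr(content, "read") and hasattr(content, "seek"):
        # a real file handle: stat the descriptor instead of reading it
        return os.fstat(content.fileno()).st_size
      return len(content)

    print(payload_length(b"hello"))  # 5
    with open("/etc/hostname", "rb") as f:  # hypothetical file
      print(payload_length(f))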
@@ -985,32 +1232,55 @@ class S3Interface(StorageInterface):
       multipart = False
       content = content.read()
 
+    # WMS 2025-07-05:
+    # Currently, boto3 does not properly support streaming smaller files.
+    # It uses an S3 API that requires a checksum up-front, but streaming
+    # checksums can only be provided at the end.
+    # https://github.com/boto/boto3/issues/3738
+    # https://github.com/boto/boto3/issues/4392
+    # https://docs.aws.amazon.com/sdkref/latest/guide/feature-dataintegrity.html
+    if not multipart and is_file_handle and content_length < int(self.composite_upload_threshold):
+      content = content.read()
+
     if multipart:
       self._conn.upload_fileobj(content, self._path.bucket, key, ExtraArgs=attrs)
     else:
+      if isinstance(content, str):
+        content = content.encode('utf8')
+
       attrs['Bucket'] = self._path.bucket
       attrs['Body'] = content
       attrs['Key'] = key
-      attrs['ContentMD5'] = md5(content)
+      attrs["ChecksumCRC32C"] = encode_crc32c_b64(content).decode('utf8')
       self._conn.put_object(**attrs)
 
   @retry
-  def copy_file(self, src_path, dest_bucket_name, dest_key):
+  def copy_file(self, src_path, dest_bucket_name, dest_key) -> tuple[bool,int]:
     key = self.get_path_to_file(src_path)
-    dest_bucket = self._get_bucket(dest_bucket_name)
+    s3client = self._get_bucket(dest_bucket_name)
     copy_source = {
       'Bucket': self._path.bucket,
       'Key': key,
     }
     try:
-      dest_bucket.copy(CopySource=copy_source, Bucket=dest_bucket_name, Key=dest_key)
+      response = s3client.copy_object(
+        CopySource=copy_source,
+        Bucket=dest_bucket_name,
+        Key=dest_key,
+        MetadataDirective='COPY' # Ensure metadata like Content-Encoding is copied
+      )
     except botocore.exceptions.ClientError as err:
       if err.response['Error']['Code'] in ('NoSuchKey', '404'):
-        return False
+        return (False, 0)
       else:
         raise
 
-    return True
+    try:
+      num_bytes = int(response["ResponseMetadata"]["HTTPHeaders"]["content-length"])
+    except KeyError:
+      num_bytes = 0
+
+    return (True, num_bytes)
 
   @retry
   def get_file(self, file_path, start=None, end=None, part_size=None):
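
Note: put_object now sends a ChecksumCRC32C instead of ContentMD5, in line with AWS's newer data-integrity checks; S3 expects the big-endian CRC32C of the payload, base64-encoded. A sketch of what encode_crc32c_b64 plausibly computes, assuming the google-crc32c package (the helper actually shipped in .lib may be implemented differently):

    import base64
    import struct

    import google_crc32c  # assumed dependency

    def encode_crc32c_b64(data: bytes) -> bytes:
      checksum = google_crc32c.value(data)  # 32-bit CRC32C (Castagnoli)
      return base64.b64encode(struct.pack(">I", checksum))

    print(encode_crc32c_b64(b"hello").decode("utf8"))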
@@ -1038,6 +1308,11 @@ class S3Interface(StorageInterface):
     if 'ContentEncoding' in resp:
       encoding = resp['ContentEncoding']
 
+    encoding = ",".join([
+      enc for enc in encoding.split(",")
+      if enc != "aws-chunked"
+    ])
+
     # s3 etags return hex digests but we need the base64 encoding
     # to make uniform comparisons.
     # example s3 etag: "31ee76261d87fed8cb9d4c465c48158c"
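
Note: objects uploaded by newer AWS SDKs can report a Content-Encoding such as "aws-chunked,gzip"; the transfer encoding is stripped so only real content codings remain. For example:

    encoding = "aws-chunked,gzip"
    encoding = ",".join([
      enc for enc in encoding.split(",")
      if enc != "aws-chunked"
    ])
    print(encoding)  # gzip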
@@ -1065,6 +1340,44 @@ class S3Interface(StorageInterface):
       else:
         raise
 
+  @retry
+  def save_file(self, src, dest, resumable) -> tuple[bool,int]:
+    key = self.get_path_to_file(src)
+    kwargs = self._additional_attrs.copy()
+
+    resp = self.head(src)
+
+    if resp is None:
+      return (False, 0)
+
+    mkdir(os.path.dirname(dest))
+
+    encoding = resp.get("Content-Encoding", "") or ""
+    encoding = ",".join([
+      enc for enc in encoding.split(",")
+      if enc != "aws-chunked"
+    ])
+    ext = FileInterface.get_extension(encoding)
+
+    if not dest.endswith(ext):
+      dest += ext
+
+    try:
+      self._conn.download_file(
+        Bucket=self._path.bucket,
+        Key=key,
+        Filename=dest,
+        **kwargs
+      )
+    except botocore.exceptions.ClientError as err:
+      if err.response['Error']['Code'] in ('NoSuchKey', '404'):
+        return (False, 0)
+      else:
+        raise
+
+    num_bytes = os.path.getsize(dest)
+    return (True, num_bytes)
+
   @retry
   def head(self, file_path):
     try:
@@ -1073,6 +1386,11 @@ class S3Interface(StorageInterface):
         Key=self.get_path_to_file(file_path),
         **self._additional_attrs,
       )
+
+      encoding = response.get("ContentEncoding", None)
+      if encoding == '':
+        encoding = None
+
       return {
         "Cache-Control": response.get("CacheControl", None),
         "Content-Length": response.get("ContentLength", None),
@@ -1080,7 +1398,7 @@ class S3Interface(StorageInterface):
         "ETag": response.get("ETag", None),
         "Last-Modified": response.get("LastModified", None),
         "Content-Md5": response["ResponseMetadata"]["HTTPHeaders"].get("content-md5", None),
-        "Content-Encoding": response.get("ContentEncoding", None),
+        "Content-Encoding": encoding,
         "Content-Disposition": response.get("ContentDisposition", None),
         "Content-Language": response.get("ContentLanguage", None),
         "Storage-Class": response.get("StorageClass", None),
@@ -1171,7 +1489,7 @@ class S3Interface(StorageInterface):
     path = posixpath.join(layer_path, prefix)
 
     @retry
-    def s3lst(continuation_token=None):
+    def s3lst(path, continuation_token=None):
      kwargs = {
        'Bucket': self._path.bucket,
        'Prefix': path,
@@ -1185,31 +1503,89 @@ class S3Interface(StorageInterface):
 
       return self._conn.list_objects_v2(**kwargs)
 
-    resp = s3lst()
+    resp = s3lst(path)
+    # the case where the prefix is something like "build", but "build" is a subdirectory
+    # so requery with "build/" to get the proper behavior
+    if (
+      flat
+      and path
+      and path[-1] != '/'
+      and 'Contents' not in resp
+      and len(resp.get("CommonPrefixes", [])) == 1
+    ):
+      path += '/'
+      resp = s3lst(path)
 
     def iterate(resp):
+      if 'CommonPrefixes' in resp.keys():
+        yield from [
+          item["Prefix"].removeprefix(layer_path)
+          for item in resp['CommonPrefixes']
+        ]
+
       if 'Contents' not in resp.keys():
         resp['Contents'] = []
 
       for item in resp['Contents']:
         key = item['Key']
-        filename = key.replace(layer_path, '')
+        filename = key.removeprefix(layer_path)
        if filename == '':
          continue
        elif not flat and filename[-1] != '/':
          yield filename
-        elif flat and '/' not in key.replace(path, ''):
+        elif flat and '/' not in key.removeprefix(path):
          yield filename
 
     for filename in iterate(resp):
       yield filename
 
     while resp['IsTruncated'] and resp['NextContinuationToken']:
-      resp = s3lst(resp['NextContinuationToken'])
+      resp = s3lst(path, resp['NextContinuationToken'])
 
       for filename in iterate(resp):
         yield filename
 
+  def subtree_size(self, prefix:str = "") -> tuple[int,int]:
+    layer_path = self.get_path_to_file("")
+    path = posixpath.join(layer_path, prefix)
+
+    @retry
+    def s3lst(path, continuation_token=None):
+      kwargs = {
+        'Bucket': self._path.bucket,
+        'Prefix': path,
+        **self._additional_attrs
+      }
+
+      if continuation_token:
+        kwargs['ContinuationToken'] = continuation_token
+
+      return self._conn.list_objects_v2(**kwargs)
+
+    resp = s3lst(path)
+
+    def iterate(resp):
+      if 'Contents' not in resp.keys():
+        resp['Contents'] = []
+
+      for item in resp['Contents']:
+        yield item.get('Size', 0)
+
+    total_bytes = 0
+    total_files = 0
+    for num_bytes in iterate(resp):
+      total_files += 1
+      total_bytes += num_bytes
+
+    while resp['IsTruncated'] and resp['NextContinuationToken']:
+      resp = s3lst(path, resp['NextContinuationToken'])
+
+      for num_bytes in iterate(resp):
+        total_files += 1
+        total_bytes += num_bytes
+
+    return (total_files, total_bytes)
+
   def release_connection(self):
     global S3_POOL
     service = self._path.alias or 's3'
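
Note: with a delimiter, list_objects_v2 reports immediate "subdirectories" as CommonPrefixes rather than Contents. A flat query for a bare prefix like "build" can therefore return a single CommonPrefix and no Contents, which is what triggers the requery with "build/" above. A sketch of that response shape (bucket name hypothetical):

    import boto3

    s3 = boto3.client("s3")
    resp = s3.list_objects_v2(
      Bucket="example-bucket",  # hypothetical
      Prefix="build",
      Delimiter="/",
    )
    # with keys like "build/a.json": no 'Contents', and
    # resp["CommonPrefixes"] == [{"Prefix": "build/"}]
    print(resp.get("CommonPrefixes", []), resp.get("Contents", []))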
@@ -1224,8 +1600,25 @@ class CaveInterface(HttpInterface):
   is, don't worry about it.
   see: https://github.com/CAVEconnectome
   """
-  def default_headers(self):
-    cred = cave_credentials()
+  def __init__(self, path, secrets=None, **kwargs):
+    super().__init__(path, secrets=secrets, **kwargs)
+
+    # secrets arrives as a named parameter, not via kwargs
+    if secrets is None:
+      secrets = {}
+
+    self._token = secrets.get('token', None)
+    if self._token is None:
+      server = self._path.host.replace("https://", "", 1)
+      server = server.replace("http://", "", 1)
+      self._token = cave_credentials(server)
+      if self._token is not None:
+        self._token = self._token.get('token', None)
+
+  def default_headers(self) -> dict:
+    if self._token is None:
+      return {}
+
     return {
-      "Authorization": f"Bearer {cred['token']}",
-    }
+      "Authorization": f"Bearer {self._token}",
+    }
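
Note: CaveInterface now resolves its bearer token once at construction, preferring an explicit secrets token and falling back to cave_credentials(server) for the host; requests without a token simply omit the Authorization header. The header logic in isolation:

    def default_headers(token):
      if token is None:
        return {}
      return { "Authorization": f"Bearer {token}" }

    print(default_headers(None))      # {}
    print(default_headers("abc123"))  # hypothetical token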