pyPreservica 2.7.2__py3-none-any.whl → 3.3.4__py3-none-any.whl

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
pyPreservica/uploadAPI.py CHANGED
@@ -13,7 +13,7 @@ import shutil
13
13
  import tempfile
14
14
  import uuid
15
15
  import xml
16
- from datetime import datetime, timedelta
16
+ from datetime import datetime, timedelta, timezone
17
17
  from time import sleep
18
18
  from xml.dom import minidom
19
19
  from xml.etree import ElementTree
@@ -22,11 +22,12 @@ from xml.etree.ElementTree import Element, SubElement
22
22
  import boto3
23
23
  import s3transfer.tasks
24
24
  import s3transfer.upload
25
-
25
+ from botocore.session import get_session
26
26
  from boto3.s3.transfer import TransferConfig, S3Transfer
27
27
  from botocore.config import Config
28
28
  from botocore.credentials import RefreshableCredentials
29
- from botocore.exceptions import ClientError
29
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
30
+ from dateutil.tz import tzlocal
30
31
  from s3transfer import S3UploadFailedError
31
32
  from tqdm import tqdm
32
33
 
@@ -37,7 +38,7 @@ logger = logging.getLogger(__name__)
37
38
 
38
39
  MB = 1024 * 1024
39
40
  GB = 1024 ** 3
40
- transfer_config = TransferConfig(multipart_threshold=int((1 * GB) / 16))
41
+ transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
41
42
 
42
43
  CONTENT_FOLDER = "content"
43
44
  PRESERVATION_CONTENT_FOLDER = "p1"
@@ -80,7 +81,8 @@ class PutObjectTask(s3transfer.tasks.Task):
80
81
  class CompleteMultipartUploadTask(s3transfer.tasks.Task):
81
82
  # Copied from s3transfer/tasks.py, changed to return a result.
82
83
  def _main(self, client, bucket, key, upload_id, parts, extra_args):
83
- return client.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id, MultipartUpload={"Parts": parts},
84
+ return client.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id,
85
+ MultipartUpload={"Parts": parts},
84
86
  **extra_args, )
85
87
 
86
88
 
@@ -99,8 +101,8 @@ def prettify(elem):
99
101
  def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
100
102
  if xip is None:
101
103
  xip = Element('xip:XIP')
104
+ xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
102
105
  assert xip is not None
103
- xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
104
106
  io = SubElement(xip, 'xip:InformationObject')
105
107
  ref = SubElement(io, 'xip:Ref')
106
108
 
@@ -224,7 +226,8 @@ def __make_representation_multiple_co__(xip, rep_name, rep_type, rep_files, io_r
224
226
  return refs_dict
225
227
 
226
228
 
227
- def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Title", export_folder=None, additional_namespaces=None):
229
+ def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Title", export_folder=None,
230
+ additional_namespaces=None):
228
231
  """
229
232
  Create a custom CMIS transform to display metadata within UA.
230
233
 
@@ -242,7 +245,8 @@ def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Titl
242
245
 
243
246
  namespaces = {"version": "2.0", "xmlns:xsl": "http://www.w3.org/1999/XSL/Transform",
244
247
  "xmlns:fn": "http://www.w3.org/2005/xpath-functions", "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
245
- "xmlns:csv": xml_namespace, "xmlns": "http://www.tessella.com/sdb/cmis/metadata", "exclude-result-prefixes": "csv"}
248
+ "xmlns:csv": xml_namespace, "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
249
+ "exclude-result-prefixes": "csv"}
246
250
 
247
251
  if additional_namespaces is not None:
248
252
  for prefix, uri in additional_namespaces.items():
@@ -311,7 +315,8 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
311
315
  headers.add(xml_tag)
312
316
  break
313
317
 
314
- namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema", "attributeFormDefault": "unqualified", "elementFormDefault": "qualified",
318
+ namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema", "attributeFormDefault": "unqualified",
319
+ "elementFormDefault": "qualified",
315
320
  "targetNamespace": xml_namespace}
316
321
 
317
322
  if additional_namespaces is not None:
@@ -333,7 +338,8 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
333
338
  prefix, sep, tag = header.partition(":")
334
339
  try:
335
340
  namespace = additional_namespaces[prefix]
336
- xml.etree.ElementTree.SubElement(xml_sequence, "xs:element", {"ref": header, "xmlns:" + prefix: namespace})
341
+ xml.etree.ElementTree.SubElement(xml_sequence, "xs:element",
342
+ {"ref": header, "xmlns:" + prefix: namespace})
337
343
  except KeyError:
338
344
  xml.etree.ElementTree.SubElement(xml_sequence, "xs:element", {"type": "xs:string", "name": header})
339
345
  else:
@@ -350,7 +356,8 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
350
356
  return xsd_file
351
357
 
352
358
 
353
- def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Title", export_folder=None, additional_namespaces=None):
359
+ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Title", export_folder=None,
360
+ additional_namespaces=None):
354
361
  """
355
362
  Create a custom Preservica search index based on the columns in a csv file
356
363
 
@@ -400,7 +407,8 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
400
407
  return search_xml
401
408
 
402
409
 
403
- def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename", export_folder=None, additional_namespaces=None):
410
+ def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename", export_folder=None,
411
+ additional_namespaces=None):
404
412
  """
405
413
  Export the rows of a CSV file as XML metadata documents which can be added to Preservica assets
406
414
 
@@ -451,7 +459,8 @@ def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename
451
459
  yield name
452
460
 
453
461
 
454
- def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None, parent_folder=None, compress=True,
462
+ def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None, parent_folder=None,
463
+ compress=True,
455
464
  **kwargs):
456
465
  # some basic validation
457
466
  if export_folder is None:
@@ -473,7 +482,7 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
473
482
  content_type = kwargs.get('CustomType', "")
474
483
 
475
484
  if not compress:
476
- shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")
485
+ shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None, description="UnCompressed ZIP file")
477
486
 
478
487
  has_preservation_files = bool((preservation_files_dict is not None) and (len(preservation_files_dict) > 0))
479
488
  has_access_files = bool((access_files_dict is not None) and (len(access_files_dict) > 0))
@@ -501,8 +510,10 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
501
510
  if has_preservation_files:
502
511
  for representation_name in preservation_files_dict.keys():
503
512
  preservation_files_list = preservation_files_dict[representation_name]
504
- preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name, rep_type="Preservation",
505
- rep_files=preservation_files_list, io_ref=io_ref)
513
+ preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name,
514
+ rep_type="Preservation",
515
+ rep_files=preservation_files_list,
516
+ io_ref=io_ref)
506
517
  preservation_representation_refs_dict[representation_name] = preservation_refs_dict
507
518
 
508
519
  if has_access_files:
@@ -519,13 +530,16 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
519
530
  default_content_objects_title = os.path.splitext(os.path.basename(filename))[0]
520
531
 
521
532
  preservation_content_title = kwargs.get('Preservation_Content_Title', default_content_objects_title)
522
- preservation_content_description = kwargs.get('Preservation_Content_Description', default_content_objects_title)
533
+ preservation_content_description = kwargs.get('Preservation_Content_Description',
534
+ default_content_objects_title)
523
535
 
524
536
  if isinstance(preservation_content_title, dict):
525
- preservation_content_title = preservation_content_title.get("filename", default_content_objects_title)
537
+ preservation_content_title = preservation_content_title.get("filename",
538
+ default_content_objects_title)
526
539
 
527
540
  if isinstance(preservation_content_description, dict):
528
- preservation_content_description = preservation_content_description.get("filename", default_content_objects_title)
541
+ preservation_content_description = preservation_content_description.get("filename",
542
+ default_content_objects_title)
529
543
 
530
544
  __make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag,
531
545
  preservation_content_description, content_type)
@@ -545,7 +559,8 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
545
559
  if isinstance(access_content_description, dict):
546
560
  access_content_description = access_content_title.get("filename", default_content_objects_title)
547
561
 
548
- __make_content_objects__(xip, access_content_title, content_ref, io_ref, security_tag, access_content_description,
562
+ __make_content_objects__(xip, access_content_title, content_ref, io_ref, security_tag,
563
+ access_content_description,
549
564
  content_type)
550
565
 
551
566
  if has_preservation_files:
@@ -598,12 +613,12 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
598
613
  for identifier_key, identifier_value in identifier_map.items():
599
614
  if identifier_key:
600
615
  if identifier_value:
601
- identifier = SubElement(xip, 'Identifier')
602
- id_type = SubElement(identifier, "Type")
616
+ identifier = SubElement(xip, 'xip:Identifier')
617
+ id_type = SubElement(identifier, "xip:Type")
603
618
  id_type.text = identifier_key
604
- id_value = SubElement(identifier, "Value")
619
+ id_value = SubElement(identifier, "xip:Value")
605
620
  id_value.text = identifier_value
606
- id_io = SubElement(identifier, "Entity")
621
+ id_io = SubElement(identifier, "xip:Entity")
607
622
  id_io.text = io_ref
608
623
 
609
624
  if 'Asset_Metadata' in kwargs:
@@ -613,22 +628,22 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
613
628
  if metadata_path:
614
629
  if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
615
630
  descriptive_metadata = xml.etree.ElementTree.parse(source=metadata_path)
616
- metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
617
- metadata_ref = SubElement(metadata, 'Ref')
631
+ metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
632
+ metadata_ref = SubElement(metadata, 'xip:Ref')
618
633
  metadata_ref.text = str(uuid.uuid4())
619
- entity = SubElement(metadata, 'Entity')
634
+ entity = SubElement(metadata, 'xip:Entity')
620
635
  entity.text = io_ref
621
- content = SubElement(metadata, 'Content')
636
+ content = SubElement(metadata, 'xip:Content')
622
637
  content.append(descriptive_metadata.getroot())
623
638
  elif isinstance(metadata_path, str):
624
639
  try:
625
640
  descriptive_metadata = xml.etree.ElementTree.fromstring(metadata_path)
626
- metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
627
- metadata_ref = SubElement(metadata, 'Ref')
641
+ metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
642
+ metadata_ref = SubElement(metadata, 'xip:Ref')
628
643
  metadata_ref.text = str(uuid.uuid4())
629
- entity = SubElement(metadata, 'Entity')
644
+ entity = SubElement(metadata, 'xip:Entity')
630
645
  entity.text = io_ref
631
- content = SubElement(metadata, 'Content')
646
+ content = SubElement(metadata, 'xip:Content')
632
647
  content.append(descriptive_metadata)
633
648
  except RuntimeError:
634
649
  logging.info(f"Could not parse asset metadata in namespace {metadata_ns}")
@@ -712,71 +727,72 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
712
727
  os.mkdir(os.path.join(inner_folder, CONTENT_FOLDER))
713
728
 
714
729
  asset_map = dict()
715
- xip = Element('XIP')
730
+ xip = Element('xip:XIP')
731
+ xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
716
732
  for file in asset_file_list:
717
733
  default_asset_title = os.path.splitext(os.path.basename(file))[0]
718
734
  xip, io_ref = __create_io__(xip, file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
719
735
  asset_map[file] = io_ref
720
- representation = SubElement(xip, 'Representation')
721
- io_link = SubElement(representation, 'InformationObject')
736
+ representation = SubElement(xip, 'xip:Representation')
737
+ io_link = SubElement(representation, 'xip:InformationObject')
722
738
  io_link.text = io_ref
723
- access_name = SubElement(representation, 'Name')
739
+ access_name = SubElement(representation, 'xip:Name')
724
740
  access_name.text = "Preservation"
725
- access_type = SubElement(representation, 'Type')
741
+ access_type = SubElement(representation, 'xip:Type')
726
742
  access_type.text = "Preservation"
727
- content_objects = SubElement(representation, 'ContentObjects')
728
- content_object = SubElement(content_objects, 'ContentObject')
743
+ content_objects = SubElement(representation, 'xip:ContentObjects')
744
+ content_object = SubElement(content_objects, 'xip:ContentObject')
729
745
  content_object_ref = str(uuid.uuid4())
730
746
  content_object.text = content_object_ref
731
747
 
732
748
  default_content_objects_title = os.path.splitext(os.path.basename(file))[0]
733
- content_object = SubElement(xip, 'ContentObject')
734
- ref_element = SubElement(content_object, "Ref")
749
+ content_object = SubElement(xip, 'xip:ContentObject')
750
+ ref_element = SubElement(content_object, "xip:Ref")
735
751
  ref_element.text = content_object_ref
736
- title = SubElement(content_object, "Title")
752
+ title = SubElement(content_object, "xip:Title")
737
753
  title.text = default_content_objects_title
738
- description = SubElement(content_object, "Description")
754
+ description = SubElement(content_object, "xip:Description")
739
755
  description.text = default_content_objects_title
740
- security_tag_element = SubElement(content_object, "SecurityTag")
756
+ security_tag_element = SubElement(content_object, "xip:SecurityTag")
741
757
  security_tag_element.text = security_tag
742
- custom_type = SubElement(content_object, "CustomType")
758
+ custom_type = SubElement(content_object, "xip:CustomType")
743
759
  custom_type.text = content_type
744
- parent = SubElement(content_object, "Parent")
760
+ parent = SubElement(content_object, "xip:Parent")
745
761
  parent.text = io_ref
746
762
 
747
- generation = SubElement(xip, 'Generation', {"original": "true", "active": "true"})
748
- content_object = SubElement(generation, "ContentObject")
763
+ generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
764
+ content_object = SubElement(generation, "xip:ContentObject")
749
765
  content_object.text = content_object_ref
750
- label = SubElement(generation, "Label")
766
+ label = SubElement(generation, "xip:Label")
751
767
  label.text = os.path.splitext(os.path.basename(file))[0]
752
- effective_date = SubElement(generation, "EffectiveDate")
768
+ effective_date = SubElement(generation, "xip:EffectiveDate")
753
769
  effective_date.text = datetime.now().isoformat()
754
- bitstreams = SubElement(generation, "Bitstreams")
755
- bitstream = SubElement(bitstreams, "Bitstream")
770
+ bitstreams = SubElement(generation, "xip:Bitstreams")
771
+ bitstream = SubElement(bitstreams, "xip:Bitstream")
756
772
  bitstream.text = os.path.basename(file)
757
- SubElement(generation, "Formats")
758
- SubElement(generation, "Properties")
773
+ SubElement(generation, "xip:Formats")
774
+ SubElement(generation, "xip:Properties")
759
775
 
760
- bitstream = SubElement(xip, 'Bitstream')
761
- filename_element = SubElement(bitstream, "Filename")
776
+ bitstream = SubElement(xip, 'xip:Bitstream')
777
+ filename_element = SubElement(bitstream, "xip:Filename")
762
778
  filename_element.text = os.path.basename(file)
763
- filesize = SubElement(bitstream, "FileSize")
779
+ filesize = SubElement(bitstream, "xip:FileSize")
764
780
  file_stats = os.stat(file)
765
781
  filesize.text = str(file_stats.st_size)
766
- physical_location = SubElement(bitstream, "PhysicalLocation")
767
- fixities = SubElement(bitstream, "Fixities")
782
+ physical_location = SubElement(bitstream, "xip:PhysicalLocation")
783
+ fixities = SubElement(bitstream, "xip:Fixities")
768
784
  fixity_result = fixity_callback(filename_element.text, file)
769
785
  if type(fixity_result) == tuple:
770
- fixity = SubElement(fixities, "Fixity")
771
- fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
772
- fixity_value = SubElement(fixity, "FixityValue")
786
+ fixity = SubElement(fixities, "xip:Fixity")
787
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
788
+ fixity_value = SubElement(fixity, "xip:FixityValue")
773
789
  fixity_algorithm_ref.text = fixity_result[0]
774
790
  fixity_value.text = fixity_result[1]
775
791
  elif type(fixity_result) == dict:
776
792
  for key, val in fixity_result.items():
777
- fixity = SubElement(fixities, "Fixity")
778
- fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
779
- fixity_value = SubElement(fixity, "FixityValue")
793
+ fixity = SubElement(fixities, "xip:Fixity")
794
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
795
+ fixity_value = SubElement(fixity, "xip:FixityValue")
780
796
  fixity_algorithm_ref.text = key
781
797
  fixity_value.text = val
782
798
  else:
@@ -790,12 +806,12 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
790
806
  for identifier_key, identifier_value in identifier_map_values.items():
791
807
  if identifier_key:
792
808
  if identifier_value:
793
- identifier = SubElement(xip, 'Identifier')
794
- id_type = SubElement(identifier, "Type")
809
+ identifier = SubElement(xip, 'xip:Identifier')
810
+ id_type = SubElement(identifier, "xip:Type")
795
811
  id_type.text = identifier_key
796
- id_value = SubElement(identifier, "Value")
812
+ id_value = SubElement(identifier, "xip:Value")
797
813
  id_value.text = identifier_value
798
- id_io = SubElement(identifier, "Entity")
814
+ id_io = SubElement(identifier, "xip:Entity")
799
815
  id_io.text = io_ref
800
816
 
801
817
  src_file = file
@@ -815,7 +831,8 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
815
831
  return top_level_folder + ".zip"
816
832
 
817
833
 
818
- def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None, parent_folder=None, compress=True,
834
+ def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None, parent_folder=None,
835
+ compress=True,
819
836
  **kwargs):
820
837
  """
821
838
 
@@ -894,27 +911,34 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
894
911
  if has_preservation_files:
895
912
  if default_asset_title is None:
896
913
  default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
897
-
898
914
  # create the asset
899
- xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
915
+ if io_ref is None:
916
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
900
917
 
901
918
  if has_access_files:
902
919
  if default_asset_title is None:
903
920
  default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
904
-
905
921
  if io_ref is None:
906
922
  xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
907
923
 
924
+ if io_ref is None:
925
+ default_asset_title = kwargs.get('Title', None)
926
+ if default_asset_title is None:
927
+ default_asset_title = "New Asset"
928
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
929
+
908
930
  if has_preservation_files:
909
931
  # add the content objects
910
932
  representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
911
- preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name, rep_type="Preservation",
933
+ preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name,
934
+ rep_type="Preservation",
912
935
  rep_files=preservation_files_list, io_ref=io_ref)
913
936
 
914
937
  if has_access_files:
915
938
  # add the content objects
916
939
  access_name = kwargs.get('Access_Representation_Name', "Access")
917
- access_refs_dict = __make_representation_multiple_co__(xip, rep_name=access_name, rep_type="Access", rep_files=access_files_list,
940
+ access_refs_dict = __make_representation_multiple_co__(xip, rep_name=access_name, rep_type="Access",
941
+ rep_files=access_files_list,
918
942
  io_ref=io_ref)
919
943
 
920
944
  if has_preservation_files:
@@ -922,7 +946,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
922
946
  for content_ref, filename in preservation_refs_dict.items():
923
947
  default_content_objects_title = os.path.splitext(os.path.basename(filename))[0]
924
948
  preservation_content_title = kwargs.get('Preservation_Content_Title', default_content_objects_title)
925
- preservation_content_description = kwargs.get('Preservation_Content_Description', default_content_objects_title)
949
+ preservation_content_description = kwargs.get('Preservation_Content_Description',
950
+ default_content_objects_title)
926
951
 
927
952
  if isinstance(preservation_content_title, dict):
928
953
  preservation_content_title = preservation_content_title[filename]
@@ -930,7 +955,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
930
955
  if isinstance(preservation_content_description, dict):
931
956
  preservation_content_description = preservation_content_description[filename]
932
957
 
933
- __make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag, preservation_content_description,
958
+ __make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag,
959
+ preservation_content_description,
934
960
  content_type)
935
961
 
936
962
  if has_access_files:
@@ -947,7 +973,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
947
973
  if isinstance(access_content_description, dict):
948
974
  access_content_title = access_content_title[filename]
949
975
 
950
- __make_content_objects__(xip, access_content_title, content_ref, io_ref, security_tag, access_content_description, content_type)
976
+ __make_content_objects__(xip, access_content_title, content_ref, io_ref, security_tag,
977
+ access_content_description, content_type)
951
978
 
952
979
  if has_preservation_files:
953
980
 
@@ -955,7 +982,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
955
982
 
956
983
  for content_ref, filename in preservation_refs_dict.items():
957
984
  preservation_file_name = os.path.basename(filename)
958
- __make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label, PRESERVATION_CONTENT_FOLDER)
985
+ __make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label,
986
+ PRESERVATION_CONTENT_FOLDER)
959
987
 
960
988
  if has_access_files:
961
989
 
@@ -1070,7 +1098,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
1070
1098
  return top_level_folder + ".zip"
1071
1099
 
1072
1100
 
1073
- def simple_asset_package(preservation_file=None, access_file=None, export_folder=None, parent_folder=None, compress=True, **kwargs):
1101
+ def simple_asset_package(preservation_file=None, access_file=None, export_folder=None, parent_folder=None,
1102
+ compress=True, **kwargs):
1074
1103
  """
1075
1104
  Create a Preservica package containing a single Asset from a single preservation file
1076
1105
  and an optional access file.
@@ -1130,378 +1159,14 @@ def _unpad(s):
1130
1159
 
1131
1160
  class UploadAPI(AuthenticatedAPI):
1132
1161
 
1133
- def ingest_tweet(self, twitter_user=None, tweet_id: int = 0, twitter_consumer_key=None, twitter_secret_key=None, folder=None,
1134
- callback=None, **kwargs):
1135
-
1136
- """
1137
- Ingest tweets from a twitter stream by twitter username
1138
-
1139
- :param tweet_id:
1140
- :param str twitter_user: Twitter Username
1141
- :param str twitter_consumer_key: Optional asset title
1142
- :param str twitter_secret_key: Optional asset description
1143
- :param str folder: Folder to ingest into
1144
- :param callback callback: Optional upload progress callback
1145
- :raises RuntimeError:
1146
-
1147
-
1148
- """
1149
-
1150
- def get_image(m, has_video_element):
1151
- media_url_https_ = m["media_url_https"]
1152
- if media_url_https_:
1153
- req = requests.get(media_url_https_)
1154
- if req.status_code == requests.codes.ok:
1155
- if has_video_element:
1156
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
1157
- else:
1158
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
1159
- image_name_document_ = open(image_name_, "wb")
1160
- image_name_document_.write(req.content)
1161
- image_name_document_.close()
1162
- return image_name_
1163
-
1164
- def get_video(m):
1165
- video_info_ = m["video_info"]
1166
- variants_ = video_info_["variants"]
1167
- for v_ in variants_:
1168
- video_url_ = v_["url"]
1169
- req = requests.get(video_url_)
1170
- if req.status_code == requests.codes.ok:
1171
- video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
1172
- video_name_document_ = open(video_name_, "wb")
1173
- video_name_document_.write(req.content)
1174
- video_name_document_.close()
1175
- return video_name_, True
1176
-
1177
- entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server, tenant=self.tenant)
1178
- if hasattr(folder, "reference"):
1179
- folder = entity_client.folder(folder.reference)
1180
- else:
1181
- folder = entity_client.folder(folder)
1182
- try:
1183
- import tweepy
1184
- except ImportError:
1185
- logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1186
- raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1187
- config = configparser.ConfigParser()
1188
- config.read('credentials.properties')
1189
- if twitter_consumer_key is None:
1190
- twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
1191
- if twitter_consumer_key is None:
1192
- try:
1193
- twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
1194
- except KeyError:
1195
- logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1196
- "environment variables or credentials.properties file")
1197
- raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1198
- "environment variables or credentials.properties file")
1199
- if twitter_secret_key is None:
1200
- twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
1201
- if twitter_secret_key is None:
1202
- try:
1203
- twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
1204
- except KeyError:
1205
- logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
1206
- "environment variables or credentials.properties file")
1207
- raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
1208
- "environment variables or credentials.properties file")
1209
-
1210
- api = None
1211
- try:
1212
- auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
1213
- api = tweepy.API(auth, wait_on_rate_limit=True)
1214
- except TweepError:
1215
- logger.error("No valid Twitter API keys. Could not authenticate")
1216
- raise RuntimeError("No valid Twitter API keys. Could not authenticate")
1217
- if api is not None:
1218
- logger.debug(api)
1219
- tweet = api.get_status(tweet_id, tweet_mode="extended", include_entities=True)
1220
- created_at = tweet.created_at
1221
- id_str = tweet.id_str
1222
- author = tweet.author.name
1223
- tweet_entities = tweet.entities
1224
- hashtags = dict()
1225
- if 'hashtags' in tweet_entities:
1226
- hashtags = tweet.entities['hashtags']
1227
- entities = entity_client.identifier("tweet_id", id_str.strip())
1228
- if len(entities) > 0:
1229
- logger.warning("Tweet already exists, skipping....")
1230
- return
1231
- logger.info(f"Processing tweet {id_str} ...")
1232
- tid = tweet.id
1233
- content_objects = list()
1234
- full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
1235
- text = tweet.full_text
1236
- full_text = full_tweet.full_text
1237
- file_name = f"{{{id_str}}}_[{twitter_user}].json"
1238
- json_doc = json.dumps(full_tweet._json)
1239
- json_file = open(file_name, "wt", encoding="utf-8")
1240
- json_file.write(json_doc)
1241
- json_file.close()
1242
- content_objects.append(file_name)
1243
- if hasattr(full_tweet, "extended_entities"):
1244
- extended_entities = full_tweet.extended_entities
1245
- if "media" in extended_entities:
1246
- media = extended_entities["media"]
1247
- for med in media:
1248
- media_id_str = med["id_str"]
1249
- has_video = False
1250
- if "video_info" in med:
1251
- co, has_video = get_video(med)
1252
- content_objects.append(co)
1253
- if has_video:
1254
- co = get_image(med, has_video)
1255
- content_objects.append(co)
1256
- continue
1257
- if "media_url_https" in med:
1258
- co = get_image(med, has_video)
1259
- content_objects.append(co)
1260
- identifiers = dict()
1261
- asset_metadata = dict()
1262
- identifiers["tweet_id"] = id_str
1263
-
1264
- user = full_tweet._json['user']
1265
-
1266
- if full_tweet._json.get('retweeted_status'):
1267
- retweeted_status = full_tweet._json['retweeted_status']
1268
- if retweeted_status.get("extended_entities"):
1269
- extended_entities = retweeted_status["extended_entities"]
1270
- if "media" in extended_entities:
1271
- media = extended_entities["media"]
1272
- for med in media:
1273
- media_id_str = med["id_str"]
1274
- has_video = False
1275
- if "video_info" in med:
1276
- co, has_video = get_video(med)
1277
- content_objects.append(co)
1278
- continue
1279
- if "media_url_https" in med:
1280
- co = get_image(med, has_video)
1281
- content_objects.append(co)
1282
-
1283
- xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
1284
- xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
1285
- xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
1286
- xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
1287
- xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
1288
- for h in hashtags:
1289
- xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
1290
-
1291
- xml.etree.ElementTree.SubElement(xml_object, "name").text = author
1292
- xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
1293
- xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
1294
-
1295
- xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
1296
-
1297
- metadata_document = open("metadata.xml", "wt", encoding="utf-8")
1298
- metadata_document.write(xml_request.decode("utf-8"))
1299
- metadata_document.close()
1300
-
1301
- asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
1302
-
1303
- security_tag = kwargs.get("SecurityTag", "open")
1304
- asset_title = kwargs.get("Title", text)
1305
- asset_description = kwargs.get("Description", full_text)
1306
-
1307
- p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder, Title=asset_title,
1308
- Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
1309
- Asset_Metadata=asset_metadata, SecurityTag=security_tag)
1310
- self.upload_zip_package(p, folder=folder, callback=callback)
1311
- for ob in content_objects:
1312
- os.remove(ob)
1313
- os.remove("metadata.xml")
1314
-
1315
- def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None, twitter_secret_key=None, folder=None,
1316
- callback=None, **kwargs):
1317
-
1318
- """
1319
- Ingest tweets from a twitter stream by twitter username
1320
-
1321
- :param str twitter_user: Twitter Username
1322
- :param int num_tweets: The number of tweets from the stream
1323
- :param str twitter_consumer_key: Optional asset title
1324
- :param str twitter_secret_key: Optional asset description
1325
- :param str folder: Folder to ingest into
1326
- :param callback callback: Optional upload progress callback
1327
- :raises RuntimeError:
1328
1162
 
1329
1163
 
1330
- """
1331
-
1332
- def get_image(m, has_video_element):
1333
- media_url_https_ = m["media_url_https"]
1334
- if media_url_https_:
1335
- req = requests.get(media_url_https_)
1336
- if req.status_code == requests.codes.ok:
1337
- if has_video_element:
1338
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
1339
- else:
1340
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
1341
- image_name_document_ = open(image_name_, "wb")
1342
- image_name_document_.write(req.content)
1343
- image_name_document_.close()
1344
- return image_name_
1345
-
1346
- def get_video(m):
1347
- video_info_ = m["video_info"]
1348
- variants_ = video_info_["variants"]
1349
- for v_ in variants_:
1350
- if v_['content_type'] == 'video/mp4':
1351
- video_url_ = v_["url"]
1352
- with requests.get(video_url_, stream=True) as req:
1353
- video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
1354
- with open(video_name_, 'wb') as video_name_document_:
1355
- for chunk in req.iter_content(chunk_size=1024):
1356
- video_name_document_.write(chunk)
1357
- video_name_document_.flush()
1358
- return video_name_, True
1359
-
1360
- entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server, tenant=self.tenant)
1361
- if hasattr(folder, "reference"):
1362
- folder = entity_client.folder(folder.reference)
1363
- else:
1364
- folder = entity_client.folder(folder)
1365
- try:
1366
- import tweepy
1367
- except ImportError:
1368
- logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1369
- raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1370
- config = configparser.ConfigParser()
1371
- config.read('credentials.properties')
1372
- if twitter_consumer_key is None:
1373
- twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
1374
- if twitter_consumer_key is None:
1375
- try:
1376
- twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
1377
- except KeyError:
1378
- logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1379
- "environment variables or credentials.properties file")
1380
- raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1381
- "environment variables or credentials.properties file")
1382
- if twitter_secret_key is None:
1383
- twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
1384
- if twitter_secret_key is None:
1385
- try:
1386
- twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
1387
- except KeyError:
1388
- logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
1389
- "environment variables or credentials.properties file")
1390
- raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
1391
- "environment variables or credentials.properties file")
1392
-
1393
- api = None
1394
- try:
1395
- auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
1396
- api = tweepy.API(auth, wait_on_rate_limit=True)
1397
- except TweepError:
1398
- logger.error("No valid Twitter API keys. Could not authenticate")
1399
- raise RuntimeError("No valid Twitter API keys. Could not authenticate")
1400
- if api is not None:
1401
- logger.debug(api)
1402
- for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
1403
- created_at = tweet.created_at
1404
- id_str = tweet.id_str
1405
- author = tweet.author.name
1406
- tweet_entities = tweet.entities
1407
- hashtags = dict()
1408
- if 'hashtags' in tweet_entities:
1409
- hashtags = tweet.entities['hashtags']
1410
- entities = entity_client.identifier("tweet_id", id_str.strip())
1411
- if len(entities) > 0:
1412
- logger.warning("Tweet already exists, skipping....")
1413
- continue
1414
- logger.info(f"Processing tweet {id_str} ...")
1415
- tid = tweet.id
1416
- content_objects = list()
1417
- full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
1418
- text = tweet.text
1419
- logger.debug(text)
1420
- full_text = full_tweet.full_text
1421
- file_name = f"{{{id_str}}}_[{twitter_user}].json"
1422
- json_doc = json.dumps(full_tweet._json)
1423
- json_file = open(file_name, "wt", encoding="utf-8")
1424
- json_file.write(json_doc)
1425
- json_file.close()
1426
- content_objects.append(file_name)
1427
- if hasattr(full_tweet, "extended_entities"):
1428
- extended_entities = full_tweet.extended_entities
1429
- if "media" in extended_entities:
1430
- media = extended_entities["media"]
1431
- for med in media:
1432
- media_id_str = med["id_str"]
1433
- has_video = False
1434
- if "video_info" in med:
1435
- co, has_video = get_video(med)
1436
- content_objects.append(co)
1437
- if has_video:
1438
- co = get_image(med, has_video)
1439
- content_objects.append(co)
1440
- continue
1441
- if "media_url_https" in med:
1442
- co = get_image(med, has_video)
1443
- content_objects.append(co)
1444
- identifiers = {}
1445
- asset_metadata = {}
1446
- identifiers["tweet_id"] = id_str
1447
-
1448
- user = full_tweet._json['user']
1449
-
1450
- if full_tweet._json.get('retweeted_status'):
1451
- retweeted_status = full_tweet._json['retweeted_status']
1452
- if retweeted_status.get("extended_entities"):
1453
- extended_entities = retweeted_status["extended_entities"]
1454
- if "media" in extended_entities:
1455
- media = extended_entities["media"]
1456
- for med in media:
1457
- media_id_str = med["id_str"]
1458
- has_video = False
1459
- if "video_info" in med:
1460
- co, has_video = get_video(med)
1461
- content_objects.append(co)
1462
- continue
1463
- if "media_url_https" in med:
1464
- co = get_image(med, has_video)
1465
- content_objects.append(co)
1466
-
1467
- xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
1468
- xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
1469
- xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
1470
- xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
1471
- xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
1472
- for h in hashtags:
1473
- xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
1474
-
1475
- xml.etree.ElementTree.SubElement(xml_object, "name").text = author
1476
- xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
1477
- xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
1478
-
1479
- xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
1480
-
1481
- metadata_document = open("metadata.xml", "wt", encoding="utf-8")
1482
- metadata_document.write(xml_request.decode("utf-8"))
1483
- metadata_document.close()
1484
-
1485
- asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
1486
-
1487
- security_tag = kwargs.get("SecurityTag", "open")
1488
- asset_title = kwargs.get("Title", text)
1489
- asset_description = kwargs.get("Description", full_text)
1490
-
1491
- p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder, Title=asset_title,
1492
- Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
1493
- Asset_Metadata=asset_metadata, SecurityTag=security_tag)
1494
- self.upload_zip_package(p, folder=folder, callback=callback)
1495
- for ob in content_objects:
1496
- os.remove(ob)
1497
- os.remove("metadata.xml")
1498
- sleep(2)
1499
1164
 
1500
1165
  def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
1501
1166
  """
1502
1167
  Ingest a web video such as YouTube etc based on the URL
1503
1168
 
1504
- :param str url: URL to the youtube video
1169
+ :param str url: URL to the YouTube video
1505
1170
  :param Folder parent_folder: The folder to ingest the video into
1506
1171
  :param str Title: Optional asset title
1507
1172
  :param str Description: Optional asset description
@@ -1572,7 +1237,8 @@ class UploadAPI(AuthenticatedAPI):
1572
1237
  duration = meta.get('duration')
1573
1238
 
1574
1239
  package = simple_asset_package(preservation_file=f"{vid_id}.mp4", parent_folder=parent_folder, Title=title,
1575
- Description=description, Identifiers=identifier_map, Asset_Metadata=descriptive_metadata,
1240
+ Description=description, Identifiers=identifier_map,
1241
+ Asset_Metadata=descriptive_metadata,
1576
1242
  Preservation_Content_Title=title, SecurityTag=security_tag)
1577
1243
 
1578
1244
  self.upload_zip_package(path_to_zip_package=package, folder=parent_folder, callback=callback)
@@ -1593,10 +1259,57 @@ class UploadAPI(AuthenticatedAPI):
1593
1259
  self.token = self.__token__()
1594
1260
  return self.upload_credentials(location_id)
1595
1261
  else:
1596
- exception = HTTPException(location_id, request.status_code, request.url, "upload_credentials", request.content.decode('utf-8'))
1262
+ exception = HTTPException(location_id, request.status_code, request.url, "upload_credentials",
1263
+ request.content.decode('utf-8'))
1597
1264
  logger.error(exception)
1598
1265
  raise exception
1599
1266
 
1267
def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
    """
    Clean up objects in an upload bucket which are older than older_than_days.

    Every upload location returned by ``upload_locations()`` whose container name
    equals ``bucket_name`` is scanned, and each object whose last-modified time is
    more than ``older_than_days`` days in the past is deleted.

    :param str bucket_name: Name of the S3 bucket or Azure container to clean
    :param int older_than_days: Age in days above which an object is deleted
    """
    # An object is expired when it was last modified before this instant.
    # Comparing against a cutoff (rather than the absolute day difference)
    # keeps objects carrying a future timestamp, e.g. through clock skew.
    cutoff = datetime.now(timezone.utc) - timedelta(days=older_than_days)

    for location in self.upload_locations():
        if location['containerName'] != bucket_name:
            continue

        credentials = self.upload_credentials(location['apiId'])

        if location['type'] == 'AWS':
            session = boto3.Session(aws_access_key_id=credentials['key'],
                                    aws_secret_access_key=credentials['secret'],
                                    aws_session_token=credentials['sessionToken'])
            s3_client = session.client("s3")
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name):
                # A page of an empty listing carries no 'Contents' entry.
                for item in page.get('Contents', []):
                    if item['LastModified'] < cutoff:
                        logger.debug(f"Deleting expired object {item['Key']}")
                        s3_client.delete_object(Bucket=bucket_name, Key=item['Key'])
        else:
            # Imported only on the non-AWS path so that cleaning an S3 bucket
            # does not require the Azure SDK to be installed.
            from azure.storage.blob import ContainerClient

            account_key = credentials['key']
            sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
            container = ContainerClient.from_container_url(container_url=sas_url,
                                                           credential=credentials['sessionToken'])
            for blob in container.list_blobs():
                if blob.last_modified < cutoff:
                    logger.debug(f"Deleting expired object {blob.name}")
                    container.delete_blob(blob.name)
1306
+
1307
+
1308
+
1309
+
1310
+
1311
+
1312
+
1600
1313
  def upload_locations(self):
1601
1314
  """
1602
1315
  Upload locations are configured on the Sources page as 'SIP Upload'.
@@ -1612,7 +1325,8 @@ class UploadAPI(AuthenticatedAPI):
1612
1325
  self.token = self.__token__()
1613
1326
  return self.upload_locations()
1614
1327
  else:
1615
- exception = HTTPException("", request.status_code, request.url, "upload_locations", request.content.decode('utf-8'))
1328
+ exception = HTTPException("", request.status_code, request.url, "upload_locations",
1329
+ request.content.decode('utf-8'))
1616
1330
  logger.error(exception)
1617
1331
  raise exception
1618
1332
 
@@ -1624,35 +1338,60 @@ class UploadAPI(AuthenticatedAPI):
1624
1338
  """
1625
1339
  return self.upload_locations()
1626
1340
 
1627
- def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback: bool = False, security_tag: str = "open",
1341
+ def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback: bool = False,
1342
+ security_tag: str = "open",
1628
1343
  delete_after_upload: bool = True, max_MB_ingested: int = -1):
1629
1344
 
1345
+ from pyPreservica import EntityAPI
1346
+
1347
def entity_value(client: EntityAPI, identifier: str) -> "Optional[Entity]":
    """
    Look up an entity by its external identifier of type "code".

    The lookup is repeated for as long as it raises HTTPException, sleeping
    between attempts. The delay starts at 5 seconds and doubles after every
    failure up to a fixed ceiling, so a long outage does not lead to
    multi-hour sleeps.

    :param EntityAPI client: Entity client used for the identifier lookup
    :param str identifier: Value of the "code" identifier to search for
    :return: One entity carrying the identifier, or None if there is none
    """
    max_back_off: int = 300
    back_off: int = 5
    while True:
        try:
            entities = client.identifier("code", identifier)
        except HTTPException as e:
            # Record the failure instead of swallowing it silently, then retry.
            logger.warning(f"Identifier lookup for {identifier} failed, retrying in {back_off} seconds: {e}")
            sleep(back_off)
            back_off = min(back_off * 2, max_back_off)
        else:
            if len(entities) > 0:
                return entities.pop()
            return None

def entity_exists(client: EntityAPI, identifier: str) -> bool:
    """
    Report whether any entity carries the given "code" identifier.

    Shares the retrying lookup of entity_value, so HTTPException raised by
    the lookup is retried rather than propagated.

    :param EntityAPI client: Entity client used for the identifier lookup
    :param str identifier: Value of the "code" identifier to search for
    :return: True when at least one entity carries the identifier
    """
    return entity_value(client, identifier) is not None
1369
+
1630
1370
  def get_parent(client, identifier, parent_reference):
1631
- id = str(os.path.dirname(identifier))
1632
- if not id:
1633
- id = identifier
1634
- entities = client.identifier("code", id)
1635
- if len(entities) > 0:
1636
- folder = entities.pop()
1371
+ dirname_id: str = str(os.path.dirname(identifier))
1372
+ if not dirname_id:
1373
+ dirname_id = identifier
1374
+ folder = entity_value(client, dirname_id)
1375
+ if folder is not None:
1637
1376
  folder = client.folder(folder.reference)
1638
1377
  return folder.reference
1639
1378
  else:
1640
1379
  return parent_reference
1641
1380
 
1642
1381
  def get_folder(client, name, tag, parent_reference, identifier):
1643
- entities = client.identifier("code", identifier)
1644
- if len(entities) == 0:
1382
+ folder = entity_value(client, identifier)
1383
+ if folder is None:
1645
1384
  logger.info(f"Creating new folder with name {name}")
1646
1385
  folder = client.create_folder(name, name, tag, parent_reference)
1647
1386
  client.add_identifier(folder, "code", identifier)
1648
1387
  else:
1649
1388
  logger.info(f"Found existing folder with name {name}")
1650
- folder = entities.pop()
1651
1389
  return folder
1652
1390
 
1653
- from pyPreservica import EntityAPI
1654
- entity_client = EntityAPI(username=self.username, password=self.password, server=self.server, tenant=self.tenant,
1655
- two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret, protocol=self.protocol)
1391
+ entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
1392
+ tenant=self.tenant,
1393
+ two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
1394
+ protocol=self.protocol)
1656
1395
 
1657
1396
  if preservica_parent:
1658
1397
  parent = entity_client.folder(preservica_parent)
@@ -1678,7 +1417,7 @@ class UploadAPI(AuthenticatedAPI):
1678
1417
  files.remove(file)
1679
1418
  continue
1680
1419
  asset_code = os.path.join(code, file)
1681
- if len(entity_client.identifier("code", asset_code)) == 0:
1420
+ if not entity_exists(entity_client, asset_code):
1682
1421
  bytes_ingested = bytes_ingested + os.stat(full_path).st_size
1683
1422
  logger.info(f"Adding new file: {file} to package ready for upload")
1684
1423
  file_identifiers = {"code": asset_code}
@@ -1696,8 +1435,14 @@ class UploadAPI(AuthenticatedAPI):
1696
1435
  else:
1697
1436
  progress_display = None
1698
1437
 
1699
- self.upload_zip_package_to_S3(path_to_zip_package=package, bucket_name=bucket_name, callback=progress_display,
1438
+ if bucket_name is None:
1439
+ self.upload_zip_package(path_to_zip_package=package, callback=progress_display,
1440
+ delete_after_upload=delete_after_upload)
1441
+ else:
1442
+ self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
1443
+ show_progress=bool(progress_display is not None),
1700
1444
  delete_after_upload=delete_after_upload)
1445
+
1701
1446
  logger.info(f"Uploaded " + "{:.1f}".format(bytes_ingested / (1024 * 1024)) + " MB")
1702
1447
 
1703
1448
  if max_MB_ingested > 0:
@@ -1705,7 +1450,8 @@ class UploadAPI(AuthenticatedAPI):
1705
1450
  logger.info(f"Reached Max Upload Limit")
1706
1451
  break
1707
1452
 
1708
- def upload_zip_to_Source(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False, show_progress=False):
1453
+ def upload_zip_to_Source(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
1454
+ show_progress=False):
1709
1455
 
1710
1456
  """
1711
1457
  Uploads a zip file package to either an Azure container or S3 bucket
@@ -1726,13 +1472,17 @@ class UploadAPI(AuthenticatedAPI):
1726
1472
  callback = None
1727
1473
  if show_progress:
1728
1474
  callback = UploadProgressConsoleCallback(path_to_zip_package)
1729
- self.upload_zip_package_to_S3(path_to_zip_package=path_to_zip_package, bucket_name=container_name, folder=folder,
1475
+ self.upload_zip_package_to_S3(path_to_zip_package=path_to_zip_package, bucket_name=container_name,
1476
+ folder=folder,
1730
1477
  callback=callback, delete_after_upload=delete_after_upload)
1731
1478
  else:
1732
- self.upload_zip_package_to_Azure(path_to_zip_package=path_to_zip_package, container_name=container_name, folder=folder,
1733
- delete_after_upload=delete_after_upload, show_progress=show_progress)
1479
+ self.upload_zip_package_to_Azure(path_to_zip_package=path_to_zip_package,
1480
+ container_name=container_name, folder=folder,
1481
+ delete_after_upload=delete_after_upload,
1482
+ show_progress=show_progress)
1734
1483
 
1735
- def upload_zip_package_to_Azure(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False, show_progress=False):
1484
+ def upload_zip_package_to_Azure(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
1485
+ show_progress=False):
1736
1486
 
1737
1487
  """
1738
1488
  Uploads a zip file package to an Azure container connected to a Preservica Cloud System
@@ -1745,7 +1495,8 @@ class UploadAPI(AuthenticatedAPI):
1745
1495
  """
1746
1496
 
1747
1497
  if (self.major_version < 7) and (self.minor_version < 5):
1748
- raise RuntimeError("This call [upload_zip_package_to_Azure] is only available against v6.5 systems and above")
1498
+ raise RuntimeError(
1499
+ "This call [upload_zip_package_to_Azure] is only available against v6.5 systems and above")
1749
1500
 
1750
1501
  from azure.storage.blob import ContainerClient
1751
1502
 
@@ -1773,11 +1524,13 @@ class UploadAPI(AuthenticatedAPI):
1773
1524
 
1774
1525
  if show_progress:
1775
1526
  with tqdm.wrapattr(open(path_to_zip_package, 'rb'), "read", total=len_bytes) as data:
1776
- blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata, length=len_bytes)
1527
+ blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
1528
+ length=len_bytes)
1777
1529
  properties = blob_client.get_blob_properties()
1778
1530
  else:
1779
1531
  with open(path_to_zip_package, "rb") as data:
1780
- blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata, length=len_bytes)
1532
+ blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
1533
+ length=len_bytes)
1781
1534
  properties = blob_client.get_blob_properties()
1782
1535
 
1783
1536
  if delete_after_upload:
@@ -1785,7 +1538,8 @@ class UploadAPI(AuthenticatedAPI):
1785
1538
 
1786
1539
  return properties
1787
1540
 
1788
- def upload_zip_package_to_S3(self, path_to_zip_package, bucket_name, folder=None, callback=None, delete_after_upload=False):
1541
+ def upload_zip_package_to_S3(self, path_to_zip_package, bucket_name, folder=None, callback=None,
1542
+ delete_after_upload=False):
1789
1543
 
1790
1544
  """
1791
1545
  Uploads a zip file package to an S3 bucket connected to a Preservica Cloud System
@@ -1814,7 +1568,8 @@ class UploadAPI(AuthenticatedAPI):
1814
1568
  session_token = credentials['sessionToken']
1815
1569
  endpoint = credentials['endpoint']
1816
1570
 
1817
- session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key, aws_session_token=session_token)
1571
+ session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key,
1572
+ aws_session_token=session_token)
1818
1573
  s3 = session.resource(service_name="s3")
1819
1574
 
1820
1575
  logger.debug(f"S3 Session: {s3}")
@@ -1833,7 +1588,8 @@ class UploadAPI(AuthenticatedAPI):
1833
1588
 
1834
1589
  metadata_map = {'Metadata': metadata}
1835
1590
 
1836
- s3_object.upload_file(path_to_zip_package, Callback=callback, ExtraArgs=metadata_map, Config=transfer_config)
1591
+ s3_object.upload_file(path_to_zip_package, Callback=callback, ExtraArgs=metadata_map,
1592
+ Config=transfer_config)
1837
1593
 
1838
1594
  if delete_after_upload:
1839
1595
  os.remove(path_to_zip_package)
@@ -1859,8 +1615,42 @@ class UploadAPI(AuthenticatedAPI):
1859
1615
  endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
1860
1616
  self.token = self.__token__()
1861
1617
 
1862
- s3_client = boto3.client('s3', endpoint_url=endpoint, aws_access_key_id=self.token, aws_secret_access_key="NOT_USED",
1863
- config=Config(s3={'addressing_style': 'path'}))
1618
+
1619
+ retries= {
1620
+ 'max_attempts': 5,
1621
+ 'mode': 'adaptive'
1622
+ }
1623
+
1624
+ def new_credentials():
1625
+ cred_metadata: dict = {}
1626
+ cred_metadata['access_key'] = self.__token__()
1627
+ cred_metadata['secret_key'] = "NOT_USED"
1628
+ cred_metadata['token'] = ""
1629
+ cred_metadata["expiry_time"] = (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat()
1630
+ logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
1631
+ return cred_metadata
1632
+
1633
+ session = get_session()
1634
+
1635
+ session_credentials = RefreshableCredentials.create_from_metadata(
1636
+ metadata=new_credentials(),
1637
+ refresh_using=new_credentials,
1638
+ advisory_timeout = 4 * 60,
1639
+ mandatory_timeout = 12 * 60,
1640
+ method = 'Preservica'
1641
+ )
1642
+
1643
+ autorefresh_session = boto3.Session(botocore_session=session)
1644
+
1645
+ session._credentials = session_credentials
1646
+
1647
+ config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
1648
+ request_checksum_calculation="WHEN_REQUIRED",
1649
+ response_checksum_validation="WHEN_REQUIRED",
1650
+ retries=retries, tcp_keepalive=True)
1651
+
1652
+
1653
+ s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)
1864
1654
 
1865
1655
  metadata = {}
1866
1656
  if folder is not None:
@@ -1873,20 +1663,48 @@ class UploadAPI(AuthenticatedAPI):
1873
1663
  try:
1874
1664
  key_id = str(uuid.uuid4()) + ".zip"
1875
1665
 
1666
+
1667
+ # how big is the package
1668
+ package_size = os.path.getsize(path_to_zip_package)
1669
+ if package_size > 1 * GB:
1670
+ transfer_config.multipart_chunksize = 16 * MB ## Min 64 Chunks
1671
+ if package_size > 8 * GB:
1672
+ transfer_config.multipart_chunksize = 32 * MB ## Min 256 Chunks
1673
+ if package_size > 24 * GB:
1674
+ transfer_config.multipart_chunksize = 48 * MB ## Min 512 Chunks
1675
+ if package_size > 48 * GB:
1676
+ transfer_config.multipart_chunksize = 64 * MB
1677
+
1678
+ logger.info("Using Multipart Chunk Size: " + str(transfer_config.multipart_chunksize))
1679
+
1876
1680
  transfer = S3Transfer(client=s3_client, config=transfer_config)
1877
1681
 
1878
1682
  transfer.PutObjectTask = PutObjectTask
1879
1683
  transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
1880
1684
  transfer.upload_file = upload_file
1881
1685
 
1882
- response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket, key=key_id, extra_args=metadata,
1686
+
1687
+ response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
1688
+ key=key_id,
1689
+ extra_args=metadata,
1883
1690
  callback=callback)
1884
1691
 
1692
+
1885
1693
  if delete_after_upload:
1886
1694
  os.remove(path_to_zip_package)
1887
1695
 
1888
1696
  return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']
1889
1697
 
1890
- except ClientError as e:
1891
- logger.error(e)
1892
- raise e
1698
+ except (NoCredentialsError, PartialCredentialsError) as ex:
1699
+ logger.error(ex)
1700
+ raise ex
1701
+
1702
+ except ClientError as ex:
1703
+ logger.error(ex)
1704
+ raise ex
1705
+
1706
+
1707
+
1708
+
1709
+
1710
+