pyPreservica 0.9.9__py3-none-any.whl → 3.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyPreservica/uploadAPI.py CHANGED
@@ -1,30 +1,94 @@
1
+ """
2
+ pyPreservica UploadAPI module definition
3
+
4
+ A client library for the Preservica Repository Upload API
5
+
6
+ author: James Carr
7
+ licence: Apache License 2.0
8
+
9
+ """
10
+
1
11
  import csv
2
- import json
3
12
  import shutil
4
13
  import tempfile
5
14
  import uuid
6
15
  import xml
16
+ from datetime import datetime, timedelta, timezone
7
17
  from time import sleep
8
-
9
- import boto3
10
- from datetime import datetime
11
18
  from xml.dom import minidom
12
19
  from xml.etree import ElementTree
13
20
  from xml.etree.ElementTree import Element, SubElement
14
- from boto3.s3.transfer import TransferConfig
21
+
22
+ import boto3
23
+ import s3transfer.tasks
24
+ import s3transfer.upload
25
+ from botocore.session import get_session
26
+ from boto3.s3.transfer import TransferConfig, S3Transfer
15
27
  from botocore.config import Config
16
- from botocore.exceptions import ClientError
28
+ from botocore.credentials import RefreshableCredentials
29
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
30
+ from dateutil.tz import tzlocal
31
+ from s3transfer import S3UploadFailedError
32
+ from tqdm import tqdm
17
33
 
18
34
  from pyPreservica.common import *
19
35
  from pyPreservica.common import _make_stored_zipfile
20
36
 
21
37
  logger = logging.getLogger(__name__)
22
38
 
39
+ MB = 1024 * 1024
23
40
  GB = 1024 ** 3
24
- transfer_config = TransferConfig(multipart_threshold=int((1 * GB) / 8))
41
+ transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
42
+
43
+ CONTENT_FOLDER = "content"
44
+ PRESERVATION_CONTENT_FOLDER = "p1"
45
+ ACCESS_CONTENT_FOLDER = "a1"
46
+
47
+
48
def upload_file(self, filename, bucket, key, callback=None, extra_args=None):
    """Upload a local file to an S3 object and hand back the transfer result.

    Unlike a fire-and-forget upload, the value produced by the transfer
    future is returned to the caller.

    :param self: object providing ``_get_subscribers`` and ``_manager``
        (presumably a boto3 ``S3Transfer`` instance -- confirm where this
        function is bound)
    :param str filename: path of the file to upload
    :param str bucket: name of the destination bucket
    :param str key: key of the destination object
    :param callback: passed to ``self._get_subscribers`` to build the subscriber list
    :param extra_args: forwarded unchanged to ``self._manager.upload``
    :return: whatever ``future.result()`` yields for the upload
    :raises ValueError: if ``filename`` is not a string
    :raises S3UploadFailedError: if the transfer raised a ``ClientError``
    """
    if not isinstance(filename, str):
        raise ValueError('Filename must be a string')

    listeners = self._get_subscribers(callback)
    transfer_future = self._manager.upload(filename, bucket, key, extra_args, listeners)
    try:
        outcome = transfer_future.result()
    except ClientError as err:
        # Backwards compatibility layer: any client error raised by the
        # transfer (not just failed upload_part calls) is surfaced as an
        # S3UploadFailedError.
        destination = '/'.join([bucket, key])
        raise S3UploadFailedError("Failed to upload %s to %s: %s" % (filename, destination, err))
    return outcome
71
+
72
+
73
class PutObjectTask(s3transfer.tasks.Task):
    """Task that performs a single ``put_object`` call and returns its response.

    Copied from s3transfer/upload.py, changed to return the result of
    client.put_object.
    """

    def _main(self, client, fileobj, bucket, key, extra_args):
        """Send ``fileobj`` as the body of ``bucket``/``key`` and return the client response.

        :param client: client object exposing ``put_object``
        :param fileobj: context manager yielding the request body
        :param bucket: destination bucket name
        :param key: destination object key
        :param dict extra_args: additional keyword arguments for ``put_object``
        :return: the value returned by ``client.put_object``
        """
        with fileobj as payload:
            return client.put_object(Bucket=bucket, Key=key, Body=payload, **extra_args)
79
+
25
80
 
81
class CompleteMultipartUploadTask(s3transfer.tasks.Task):
    """Task that completes a multipart upload and returns the client response.

    Copied from s3transfer/tasks.py, changed to return a result.
    """

    def _main(self, client, bucket, key, upload_id, parts, extra_args):
        """Call ``complete_multipart_upload`` for the given upload and return its result.

        :param client: client object exposing ``complete_multipart_upload``
        :param bucket: bucket the upload targets
        :param key: object key the upload targets
        :param upload_id: identifier of the multipart upload to complete
        :param parts: value sent as the ``Parts`` entry of ``MultipartUpload``
        :param dict extra_args: additional keyword arguments for the client call
        :return: the value returned by ``client.complete_multipart_upload``
        """
        completed_parts = {"Parts": parts}
        response = client.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id,
                                                    MultipartUpload=completed_parts,
                                                    **extra_args)
        return response
26
87
 
27
88
 
89
+ s3transfer.upload.PutObjectTask = PutObjectTask
90
+ s3transfer.upload.CompleteMultipartUploadTask = CompleteMultipartUploadTask
91
+
28
92
 
29
93
  def prettify(elem):
30
94
  """Return a pretty-printed XML string for the Element.
@@ -34,11 +98,13 @@ def prettify(elem):
34
98
  return re_parsed.toprettyxml(indent=" ")
35
99
 
36
100
 
37
- def __create_io__(file_name=None, parent_folder=None, **kwargs):
38
- xip = Element('XIP')
39
- xip.set('xmlns', 'http://preservica.com/XIP/v6.0')
40
- io = SubElement(xip, 'InformationObject')
41
- ref = SubElement(io, 'Ref')
101
+ def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
102
+ if xip is None:
103
+ xip = Element('xip:XIP')
104
+ xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
105
+ assert xip is not None
106
+ io = SubElement(xip, 'xip:InformationObject')
107
+ ref = SubElement(io, 'xip:Ref')
42
108
 
43
109
  if 'IO_Identifier_callback' in kwargs:
44
110
  ident_callback = kwargs.get('IO_Identifier_callback')
@@ -46,15 +112,15 @@ def __create_io__(file_name=None, parent_folder=None, **kwargs):
46
112
  else:
47
113
  ref.text = str(uuid.uuid4())
48
114
 
49
- title = SubElement(io, 'Title')
115
+ title = SubElement(io, 'xip:Title')
50
116
  title.text = kwargs.get('Title', file_name)
51
- description = SubElement(io, 'Description')
117
+ description = SubElement(io, 'xip:Description')
52
118
  description.text = kwargs.get('Description', file_name)
53
- security = SubElement(io, 'SecurityTag')
119
+ security = SubElement(io, 'xip:SecurityTag')
54
120
  security.text = kwargs.get('SecurityTag', "open")
55
- custom_type = SubElement(io, 'CustomType')
121
+ custom_type = SubElement(io, 'xip:CustomType')
56
122
  custom_type.text = kwargs.get('CustomType', "")
57
- parent = SubElement(io, 'Parent')
123
+ parent = SubElement(io, 'xip:Parent')
58
124
 
59
125
  if hasattr(parent_folder, "reference"):
60
126
  parent.text = parent_folder.reference
@@ -65,83 +131,95 @@ def __create_io__(file_name=None, parent_folder=None, **kwargs):
65
131
 
66
132
 
67
133
  def __make_representation__(xip, rep_name, rep_type, io_ref):
68
- representation = SubElement(xip, 'Representation')
69
- io_link = SubElement(representation, 'InformationObject')
134
+ representation = SubElement(xip, 'xip:Representation')
135
+ io_link = SubElement(representation, 'xip:InformationObject')
70
136
  io_link.text = io_ref
71
- access_name = SubElement(representation, 'Name')
137
+ access_name = SubElement(representation, 'xip:Name')
72
138
  access_name.text = rep_name
73
- access_type = SubElement(representation, 'Type')
139
+ access_type = SubElement(representation, 'xip:Type')
74
140
  access_type.text = rep_type
75
- content_objects = SubElement(representation, 'ContentObjects')
76
- content_object = SubElement(content_objects, 'ContentObject')
141
+ content_objects = SubElement(representation, 'xip:ContentObjects')
142
+ content_object = SubElement(content_objects, 'xip:ContentObject')
77
143
  content_object_ref = str(uuid.uuid4())
78
144
  content_object.text = content_object_ref
79
145
  return content_object_ref
80
146
 
81
147
 
82
148
  def __make_content_objects__(xip, content_title, co_ref, io_ref, tag, content_description, content_type):
83
- content_object = SubElement(xip, 'ContentObject')
84
- ref_element = SubElement(content_object, "Ref")
149
+ content_object = SubElement(xip, 'xip:ContentObject')
150
+ ref_element = SubElement(content_object, "xip:Ref")
85
151
  ref_element.text = co_ref
86
- title = SubElement(content_object, "Title")
152
+ title = SubElement(content_object, "xip:Title")
87
153
  title.text = content_title
88
- description = SubElement(content_object, "Description")
154
+ description = SubElement(content_object, "xip:Description")
89
155
  description.text = content_description
90
- security_tag = SubElement(content_object, "SecurityTag")
156
+ security_tag = SubElement(content_object, "xip:SecurityTag")
91
157
  security_tag.text = tag
92
- custom_type = SubElement(content_object, "CustomType")
158
+ custom_type = SubElement(content_object, "xip:CustomType")
93
159
  custom_type.text = content_type
94
- parent = SubElement(content_object, "Parent")
160
+ parent = SubElement(content_object, "xip:Parent")
95
161
  parent.text = io_ref
96
162
 
97
163
 
98
- def __make_generation__(xip, filename, co_ref, generation_label):
99
- generation = SubElement(xip, 'Generation', {"original": "true", "active": "true"})
100
- content_object = SubElement(generation, "ContentObject")
164
+ def __make_generation__(xip, filename, co_ref, generation_label, location=None):
165
+ generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
166
+ content_object = SubElement(generation, "xip:ContentObject")
101
167
  content_object.text = co_ref
102
- label = SubElement(generation, "Label")
168
+ label = SubElement(generation, "xip:Label")
103
169
  if generation_label:
104
170
  label.text = generation_label
105
171
  else:
106
172
  label.text = os.path.splitext(filename)[0]
107
- effective_date = SubElement(generation, "EffectiveDate")
173
+ effective_date = SubElement(generation, "xip:EffectiveDate")
108
174
  effective_date.text = datetime.now().isoformat()
109
- bitstreams = SubElement(generation, "Bitstreams")
110
- bitstream = SubElement(bitstreams, "Bitstream")
111
- bitstream.text = filename
112
- SubElement(generation, "Formats")
113
- SubElement(generation, "Properties")
175
+ bitstreams = SubElement(generation, "xip:Bitstreams")
176
+ bitstream = SubElement(bitstreams, "xip:Bitstream")
177
+ bitstream.text = f"{location}/{filename}"
178
+ SubElement(generation, "xip:Formats")
179
+ SubElement(generation, "xip:Properties")
114
180
 
115
181
 
116
- def __make_bitstream__(xip, file_name, full_path, callback):
117
- bitstream = SubElement(xip, 'Bitstream')
118
- filename_element = SubElement(bitstream, "Filename")
182
+ def __make_bitstream__(xip, file_name, full_path, callback, location=None):
183
+ bitstream = SubElement(xip, 'xip:Bitstream')
184
+ filename_element = SubElement(bitstream, "xip:Filename")
119
185
  filename_element.text = file_name
120
- filesize = SubElement(bitstream, "FileSize")
186
+ filesize = SubElement(bitstream, "xip:FileSize")
121
187
  file_stats = os.stat(full_path)
122
188
  filesize.text = str(file_stats.st_size)
123
- physical_location = SubElement(bitstream, "PhysicalLocation")
124
- fixities = SubElement(bitstream, "Fixities")
125
- fixity = SubElement(fixities, "Fixity")
126
- fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
127
- fixity_value = SubElement(fixity, "FixityValue")
128
- fixity = callback(file_name, full_path)
129
- fixity_algorithm_ref.text = fixity[0]
130
- fixity_value.text = fixity[1]
189
+ physical_location = SubElement(bitstream, "xip:PhysicalLocation")
190
+ physical_location.text = location
191
+ fixities = SubElement(bitstream, "xip:Fixities")
192
+ fixity_result = callback(file_name, full_path)
193
+ if type(fixity_result) == tuple:
194
+ fixity = SubElement(fixities, "xip:Fixity")
195
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
196
+ fixity_value = SubElement(fixity, "xip:FixityValue")
197
+ fixity_algorithm_ref.text = fixity_result[0]
198
+ fixity_value.text = fixity_result[1]
199
+ elif type(fixity_result) == dict:
200
+ for key, val in fixity_result.items():
201
+ fixity = SubElement(fixities, "xip:Fixity")
202
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
203
+ fixity_value = SubElement(fixity, "xip:FixityValue")
204
+ fixity_algorithm_ref.text = key
205
+ fixity_value.text = val
206
+ else:
207
+ logger.error("Could Not Find Fixity Value")
208
+ raise RuntimeError("Could Not Find Fixity Value")
131
209
 
132
210
 
133
211
  def __make_representation_multiple_co__(xip, rep_name, rep_type, rep_files, io_ref):
134
- representation = SubElement(xip, 'Representation')
135
- io_link = SubElement(representation, 'InformationObject')
212
+ representation = SubElement(xip, 'xip:Representation')
213
+ io_link = SubElement(representation, 'xip:InformationObject')
136
214
  io_link.text = io_ref
137
- access_name = SubElement(representation, 'Name')
215
+ access_name = SubElement(representation, 'xip:Name')
138
216
  access_name.text = rep_name
139
- access_type = SubElement(representation, 'Type')
217
+ access_type = SubElement(representation, 'xip:Type')
140
218
  access_type.text = rep_type
141
- content_objects = SubElement(representation, 'ContentObjects')
219
+ content_objects = SubElement(representation, 'xip:ContentObjects')
142
220
  refs_dict = {}
143
221
  for f in rep_files:
144
- content_object = SubElement(content_objects, 'ContentObject')
222
+ content_object = SubElement(content_objects, 'xip:ContentObject')
145
223
  content_object_ref = str(uuid.uuid4())
146
224
  content_object.text = content_object_ref
147
225
  refs_dict[content_object_ref] = f
@@ -165,12 +243,9 @@ def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Titl
165
243
  headers.add(xml_tag)
166
244
  break
167
245
 
168
- namespaces = {"version": "2.0",
169
- "xmlns:xsl": "http://www.w3.org/1999/XSL/Transform",
170
- "xmlns:fn": "http://www.w3.org/2005/xpath-functions",
171
- "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
172
- "xmlns:csv": xml_namespace,
173
- "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
246
+ namespaces = {"version": "2.0", "xmlns:xsl": "http://www.w3.org/1999/XSL/Transform",
247
+ "xmlns:fn": "http://www.w3.org/2005/xpath-functions", "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
248
+ "xmlns:csv": xml_namespace, "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
174
249
  "exclude-result-prefixes": "csv"}
175
250
 
176
251
  if additional_namespaces is not None:
@@ -240,14 +315,13 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
240
315
  headers.add(xml_tag)
241
316
  break
242
317
 
243
- namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema",
244
- "attributeFormDefault": "unqualified",
318
+ namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema", "attributeFormDefault": "unqualified",
245
319
  "elementFormDefault": "qualified",
246
320
  "targetNamespace": xml_namespace}
247
321
 
248
322
  if additional_namespaces is not None:
249
323
  for prefix, uri in additional_namespaces.items():
250
- namespaces["xmlns:" + prefix.trim()] = uri.trim()
324
+ namespaces["xmlns:" + prefix.strip()] = uri.strip()
251
325
 
252
326
  xml_schema = xml.etree.ElementTree.Element("xs:schema", namespaces)
253
327
 
@@ -303,12 +377,12 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
303
377
 
304
378
  short_name = "csv"
305
379
 
306
- xml_schemaName = xml.etree.ElementTree.SubElement(xml_index, "schemaName")
307
- xml_schemaName.text = title
308
- xml_schemaUri = xml.etree.ElementTree.SubElement(xml_index, "schemaUri")
309
- xml_schemaUri.text = xml_namespace
310
- xml_shortName = xml.etree.ElementTree.SubElement(xml_index, "shortName")
311
- xml_shortName.text = short_name
380
+ xml_schema_name = xml.etree.ElementTree.SubElement(xml_index, "schemaName")
381
+ xml_schema_name.text = title
382
+ xml_schema_uri = xml.etree.ElementTree.SubElement(xml_index, "schemaUri")
383
+ xml_schema_uri.text = xml_namespace
384
+ xml_short_name = xml.etree.ElementTree.SubElement(xml_index, "shortName")
385
+ xml_short_name.text = short_name
312
386
 
313
387
  for header in headers:
314
388
  if ":" in header:
@@ -316,9 +390,7 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
316
390
  else:
317
391
  xpath_expression = f"//{short_name}:{root_element}/{short_name}:{header}"
318
392
 
319
- attr = {"indexName": header, "displayName": header,
320
- "xpath": xpath_expression,
321
- "indexType": "STRING_DEFAULT"}
393
+ attr = {"indexName": header, "displayName": header, "xpath": xpath_expression, "indexType": "STRING_DEFAULT"}
322
394
  xml_term = xml.etree.ElementTree.SubElement(xml_index, "term", attr)
323
395
 
324
396
  if additional_namespaces is not None:
@@ -338,7 +410,14 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
338
410
  def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename", export_folder=None,
339
411
  additional_namespaces=None):
340
412
  """
341
- Export the rows of a CSV file as XML metadata documents which can be added to Preservica assets
413
+ Export the rows of a CSV file as XML metadata documents which can be added to Preservica assets
414
+
415
+ :param str csv_file: Path to the csv file
416
+ :param str xml_namespace: The XML namespace for the created XML documents
417
+ :param str root_element: The root element for the XML documents
418
+ :param str file_name_column: The CSV column which should be used to name the xml files
419
+ :param str export_folder: The path to the export folder
420
+ :param dict additional_namespaces: A map of prefix, uris to use as additional namespaces
342
421
 
343
422
  """
344
423
  headers = list()
@@ -380,9 +459,409 @@ def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename
380
459
  yield name
381
460
 
382
461
 
383
- def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None,
384
- parent_folder=None, compress=True, **kwargs):
462
def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None, parent_folder=None,
                          compress=True,
                          **kwargs):
    """
    Build a zipped XIP package for a single asset with any number of named representations.

    :param dict preservation_files_dict: Map of representation name to list of file paths (Preservation)
    :param dict access_files_dict: Map of representation name to list of file paths (Access)
    :param str export_folder: Existing folder the package is written to, defaults to the system temp folder
    :param parent_folder: Parent folder of the new asset, forwarded to ``__create_io__``
    :param bool compress: When False the archive is created with the "szip" format registered here
    :param kwargs: Optional settings read by this function: ``SecurityTag``, ``CustomType``,
        ``Preservation_Content_Title``, ``Preservation_Content_Description``, ``Access_Content_Title``,
        ``Access_Content_Description``, ``Preservation_Generation_Label``, ``Access_Generation_Label``,
        ``Preservation_files_fixity_callback``, ``Access_files_fixity_callback``, ``Identifiers``
        (map of identifier type to value) and ``Asset_Metadata`` (map of schema URI to an XML file
        path or an XML string). All kwargs are also forwarded to ``__create_io__``.
    :return: Path of the created archive (``<export_folder>/<asset ref>.zip``), or None when no files were given
    :raises RuntimeError: If the export folder does not exist or no parent folder was given

    """
    # some basic validation
    if export_folder is None:
        export_folder = tempfile.gettempdir()
    if not os.path.isdir(export_folder):
        logger.error("Export Folder Does Not Exist")
        raise RuntimeError(export_folder, "Export Folder Does Not Exist")
    if parent_folder is None:
        logger.error("You must specify a parent folder for the package asset")
        raise RuntimeError("You must specify a parent folder for the package asset")

    security_tag = kwargs.get('SecurityTag', "open")
    content_type = kwargs.get('CustomType', "")

    if not compress:
        shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None,
                                       description="UnCompressed ZIP file")

    # (representation type, {representation name: [file paths]}); Preservation is always written first.
    file_groups = (("Preservation", preservation_files_dict or {}),
                   ("Access", access_files_dict or {}))

    # The asset title defaults to the stem of the first file of the first representation.
    first_files = next((files for _, representations in file_groups for files in representations.values()), None)
    if first_files is None:
        # Nothing to package: without files no XIP document is ever created.
        return None

    default_asset_title = os.path.splitext(os.path.basename(first_files[0]))[0]

    # create the asset
    xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)

    # representations: remember, per type, {representation name: {content object ref: file path}}
    refs_by_kind = []
    for kind, representations in file_groups:
        refs_by_representation = {}
        for representation_name, files in representations.items():
            refs_by_representation[representation_name] = __make_representation_multiple_co__(
                xip, rep_name=representation_name, rep_type=kind, rep_files=files, io_ref=io_ref)
        refs_by_kind.append((kind, refs_by_representation))

    # content objects
    for kind, refs_by_representation in refs_by_kind:
        for refs in refs_by_representation.values():
            for content_ref, filename in refs.items():
                default_content_objects_title = os.path.splitext(os.path.basename(filename))[0]

                content_title = kwargs.get(f'{kind}_Content_Title', default_content_objects_title)
                content_description = kwargs.get(f'{kind}_Content_Description', default_content_objects_title)

                # NOTE(review): the dict lookups use the literal key "filename", not the current
                # file's name; kept as-is -- confirm whether a per-file lookup was intended.
                if isinstance(content_title, dict):
                    content_title = content_title.get("filename", default_content_objects_title)
                # The description is resolved from its own dict (not from the already resolved title).
                if isinstance(content_description, dict):
                    content_description = content_description.get("filename", default_content_objects_title)

                __make_content_objects__(xip, content_title, content_ref, io_ref, security_tag,
                                         content_description, content_type)

    # generations
    for kind, refs_by_representation in refs_by_kind:
        generation_label = kwargs.get(f'{kind}_Generation_Label', "")
        for representation_name, refs in refs_by_representation.items():
            location = sanitize(representation_name)
            for content_ref, filename in refs.items():
                __make_generation__(xip, os.path.basename(filename), content_ref, generation_label, location)

    # bitstreams
    for kind, refs_by_representation in refs_by_kind:
        if not refs_by_representation:
            continue
        callback_key = f'{kind}_files_fixity_callback'
        callback = kwargs[callback_key] if callback_key in kwargs else Sha1FixityCallBack()
        for representation_name, refs in refs_by_representation.items():
            location = sanitize(representation_name)
            for filename in refs.values():
                __make_bitstream__(xip, os.path.basename(filename), filename, callback, location)

    # identifiers
    for identifier_key, identifier_value in kwargs.get('Identifiers', {}).items():
        if identifier_key and identifier_value:
            identifier = SubElement(xip, 'xip:Identifier')
            SubElement(identifier, "xip:Type").text = identifier_key
            SubElement(identifier, "xip:Value").text = identifier_value
            SubElement(identifier, "xip:Entity").text = io_ref

    # descriptive metadata, either a path to an XML file or an XML document held in a string
    for metadata_ns, metadata_source in kwargs.get('Asset_Metadata', {}).items():
        if not (metadata_ns and metadata_source):
            continue
        if os.path.exists(metadata_source) and os.path.isfile(metadata_source):
            metadata_root = xml.etree.ElementTree.parse(source=metadata_source).getroot()
        elif isinstance(metadata_source, str):
            try:
                metadata_root = xml.etree.ElementTree.fromstring(metadata_source)
            except xml.etree.ElementTree.ParseError:
                # fromstring reports malformed XML with ParseError; skip this entry.
                logger.info(f"Could not parse asset metadata in namespace {metadata_ns}")
                continue
        else:
            continue
        metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
        SubElement(metadata, 'xip:Ref').text = str(uuid.uuid4())
        SubElement(metadata, 'xip:Entity').text = io_ref
        SubElement(metadata, 'xip:Content').append(metadata_root)

    # write the package folder structure: <ref>/<ref>/metadata.xml and <ref>/<ref>/content/<location>/<files>
    top_level_folder = os.path.join(export_folder, io_ref)
    os.mkdir(top_level_folder)
    inner_folder = os.path.join(top_level_folder, io_ref)
    os.mkdir(inner_folder)
    content_folder = os.path.join(inner_folder, CONTENT_FOLDER)
    os.mkdir(content_folder)

    metadata_path = os.path.join(inner_folder, "metadata.xml")
    with open(metadata_path, "wt", encoding='utf-8') as metadata_file:
        metadata_file.write(prettify(xip))

    for kind, refs_by_representation in refs_by_kind:
        for representation_name, refs in refs_by_representation.items():
            representation_folder = os.path.join(content_folder, sanitize(representation_name))
            Path(representation_folder).mkdir(parents=True, exist_ok=True)
            for filename in refs.values():
                shutil.copyfile(filename, os.path.join(representation_folder, os.path.basename(filename)))

    if compress:
        shutil.make_archive(top_level_folder, 'zip', top_level_folder)
    else:
        shutil.make_archive(top_level_folder, 'szip', top_level_folder)
    # the unpacked working folder is no longer needed once the archive exists
    shutil.rmtree(top_level_folder)
    return top_level_folder + ".zip"
685
+
686
+
687
+ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=None, compress=True, **kwargs):
688
+ """
689
+ Create a package containing multiple assets, all the assets are ingested into the same parent folder provided
690
+ by the parent_folder argument.
691
+
692
+ :param asset_file_list: List of files. One asset per file
693
+ :param export_folder: Location where the package is written to
694
+ :param parent_folder: The folder the assets will be ingested into
695
+ :param compress: Bool, compress the package
696
+ :param kwargs:
697
+ :return:
385
698
  """
699
+
700
+ # some basic validation
701
+ if export_folder is None:
702
+ export_folder = tempfile.gettempdir()
703
+ if not os.path.isdir(export_folder):
704
+ logger.error("Export Folder Does Not Exist")
705
+ raise RuntimeError(export_folder, "Export Folder Does Not Exist")
706
+ if parent_folder is None:
707
+ logger.error("You must specify a parent folder for the package asset")
708
+ raise RuntimeError("You must specify a parent folder for the package asset")
709
+
710
+ security_tag = kwargs.get('SecurityTag', "open")
711
+ content_type = kwargs.get('CustomType', "")
712
+
713
+ if not compress:
714
+ shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")
715
+
716
+ if 'Preservation_files_fixity_callback' in kwargs:
717
+ fixity_callback = kwargs.get('Preservation_files_fixity_callback')
718
+ else:
719
+ fixity_callback = Sha1FixityCallBack()
720
+
721
+ package_id = str(uuid.uuid4())
722
+ export_folder = export_folder
723
+ top_level_folder = os.path.join(export_folder, package_id)
724
+ os.mkdir(top_level_folder)
725
+ inner_folder = os.path.join(top_level_folder, package_id)
726
+ os.mkdir(inner_folder)
727
+ os.mkdir(os.path.join(inner_folder, CONTENT_FOLDER))
728
+
729
+ asset_map = dict()
730
+ xip = Element('xip:XIP')
731
+ xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
732
+ for file in asset_file_list:
733
+ default_asset_title = os.path.splitext(os.path.basename(file))[0]
734
+ xip, io_ref = __create_io__(xip, file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
735
+ asset_map[file] = io_ref
736
+ representation = SubElement(xip, 'xip:Representation')
737
+ io_link = SubElement(representation, 'xip:InformationObject')
738
+ io_link.text = io_ref
739
+ access_name = SubElement(representation, 'xip:Name')
740
+ access_name.text = "Preservation"
741
+ access_type = SubElement(representation, 'xip:Type')
742
+ access_type.text = "Preservation"
743
+ content_objects = SubElement(representation, 'xip:ContentObjects')
744
+ content_object = SubElement(content_objects, 'xip:ContentObject')
745
+ content_object_ref = str(uuid.uuid4())
746
+ content_object.text = content_object_ref
747
+
748
+ default_content_objects_title = os.path.splitext(os.path.basename(file))[0]
749
+ content_object = SubElement(xip, 'xip:ContentObject')
750
+ ref_element = SubElement(content_object, "xip:Ref")
751
+ ref_element.text = content_object_ref
752
+ title = SubElement(content_object, "xip:Title")
753
+ title.text = default_content_objects_title
754
+ description = SubElement(content_object, "xip:Description")
755
+ description.text = default_content_objects_title
756
+ security_tag_element = SubElement(content_object, "xip:SecurityTag")
757
+ security_tag_element.text = security_tag
758
+ custom_type = SubElement(content_object, "xip:CustomType")
759
+ custom_type.text = content_type
760
+ parent = SubElement(content_object, "xip:Parent")
761
+ parent.text = io_ref
762
+
763
+ generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
764
+ content_object = SubElement(generation, "xip:ContentObject")
765
+ content_object.text = content_object_ref
766
+ label = SubElement(generation, "xip:Label")
767
+ label.text = os.path.splitext(os.path.basename(file))[0]
768
+ effective_date = SubElement(generation, "xip:EffectiveDate")
769
+ effective_date.text = datetime.now().isoformat()
770
+ bitstreams = SubElement(generation, "xip:Bitstreams")
771
+ bitstream = SubElement(bitstreams, "xip:Bitstream")
772
+ bitstream.text = os.path.basename(file)
773
+ SubElement(generation, "xip:Formats")
774
+ SubElement(generation, "xip:Properties")
775
+
776
+ bitstream = SubElement(xip, 'xip:Bitstream')
777
+ filename_element = SubElement(bitstream, "xip:Filename")
778
+ filename_element.text = os.path.basename(file)
779
+ filesize = SubElement(bitstream, "xip:FileSize")
780
+ file_stats = os.stat(file)
781
+ filesize.text = str(file_stats.st_size)
782
+ physical_location = SubElement(bitstream, "xip:PhysicalLocation")
783
+ fixities = SubElement(bitstream, "xip:Fixities")
784
+ fixity_result = fixity_callback(filename_element.text, file)
785
+ if type(fixity_result) == tuple:
786
+ fixity = SubElement(fixities, "xip:Fixity")
787
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
788
+ fixity_value = SubElement(fixity, "xip:FixityValue")
789
+ fixity_algorithm_ref.text = fixity_result[0]
790
+ fixity_value.text = fixity_result[1]
791
+ elif type(fixity_result) == dict:
792
+ for key, val in fixity_result.items():
793
+ fixity = SubElement(fixities, "xip:Fixity")
794
+ fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
795
+ fixity_value = SubElement(fixity, "xip:FixityValue")
796
+ fixity_algorithm_ref.text = key
797
+ fixity_value.text = val
798
+ else:
799
+ logger.error("Could Not Find Fixity Value")
800
+ raise RuntimeError("Could Not Find Fixity Value")
801
+
802
+ if 'Identifiers' in kwargs:
803
+ identifier_map = kwargs.get('Identifiers')
804
+ if str(file) in identifier_map:
805
+ identifier_map_values = identifier_map[str(file)]
806
+ for identifier_key, identifier_value in identifier_map_values.items():
807
+ if identifier_key:
808
+ if identifier_value:
809
+ identifier = SubElement(xip, 'xip:Identifier')
810
+ id_type = SubElement(identifier, "xip:Type")
811
+ id_type.text = identifier_key
812
+ id_value = SubElement(identifier, "xip:Value")
813
+ id_value.text = identifier_value
814
+ id_io = SubElement(identifier, "xip:Entity")
815
+ id_io.text = io_ref
816
+
817
+ src_file = file
818
+ dst_file = os.path.join(os.path.join(inner_folder, CONTENT_FOLDER), os.path.basename(file))
819
+ shutil.copyfile(src_file, dst_file)
820
+
821
+ if xip is not None:
822
+ metadata_path = os.path.join(inner_folder, "metadata.xml")
823
+ metadata = open(metadata_path, "wt", encoding='utf-8')
824
+ metadata.write(prettify(xip))
825
+ metadata.close()
826
+ if compress:
827
+ shutil.make_archive(top_level_folder, 'zip', top_level_folder)
828
+ else:
829
+ shutil.make_archive(top_level_folder, 'szip', top_level_folder)
830
+ shutil.rmtree(top_level_folder)
831
+ return top_level_folder + ".zip"
832
+
833
+
834
+ def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None, parent_folder=None,
835
+ compress=True,
836
+ **kwargs):
837
+ """
838
+
839
+ Create a Preservica package containing a single Asset from multiple preservation files
840
+ and optional access files.
841
+ The Asset contains multiple Content Objects within each representation.
842
+
843
+ If only the preservation files are provided the asset has one representation
844
+
845
+
846
+ :param list preservation_files_list: Paths to the preservation files
847
+ :param list access_files_list: Paths to the access files
848
+ :param str export_folder: The package location folder
849
+ :param Folder parent_folder: The folder to ingest the asset into
850
+ :param bool compress: Compress the ZIP file
851
+ :param str Title: Asset Title
852
+ :param str Description: Asset Description
853
+ :param str SecurityTag: Asset SecurityTag
854
+ :param str CustomType: Asset CustomType
855
+ :param str Preservation_Content_Title: Title of the Preservation Representation Content Object
856
+ :param str Preservation_Content_Description: Description of the Preservation Representation Content Object
857
+ :param str Access_Content_Title: Title of the Access Representation Content Object
858
+ :param str Access_Content_Description: Description of the Access Representation Content Object
859
+ :param dict Asset_Metadata: Dictionary of Asset metadata documents
860
+ :param dict Identifiers: Dictionary of Asset 3rd party identifiers
861
+
862
+
863
+
864
+
386
865
  optional kwargs map
387
866
  'Title' Asset Title
388
867
  'Description' Asset Description
@@ -399,7 +878,11 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
399
878
  'Preservation_files_fixity_callback' Callback to allow external generated fixity values
400
879
  'Access_files_fixity_callback' Callback to allow external generated fixity values
401
880
  'IO_Identifier_callback' Callback to allow external generated Asset identifier
881
+ 'Preservation_Representation_Name' Name of the Preservation Representation
882
+ 'Access_Representation_Name' Name of the Access Representation
402
883
  """
884
+ xml.etree.ElementTree.register_namespace("xip", "http://preservica.com/XIP/v6.0")
885
+
403
886
  # some basic validation
404
887
  if export_folder is None:
405
888
  export_folder = tempfile.gettempdir()
@@ -413,8 +896,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
413
896
  io_ref = None
414
897
  xip = None
415
898
  default_asset_title = None
416
- preservation_refs_dict = dict()
417
- access_refs_dict = dict()
899
+ preservation_refs_dict = {}
900
+ access_refs_dict = {}
418
901
 
419
902
  security_tag = kwargs.get('SecurityTag', "open")
420
903
  content_type = kwargs.get('CustomType', "")
@@ -428,25 +911,35 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
428
911
  if has_preservation_files:
429
912
  if default_asset_title is None:
430
913
  default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
431
-
432
914
  # create the asset
433
- xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
915
+ if io_ref is None:
916
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
434
917
 
435
918
  if has_access_files:
436
919
  if default_asset_title is None:
437
920
  default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
438
-
439
921
  if io_ref is None:
440
922
  xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
441
923
 
924
+ if io_ref is None:
925
+ default_asset_title = kwargs.get('Title', None)
926
+ if default_asset_title is None:
927
+ default_asset_title = "New Asset"
928
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
929
+
442
930
  if has_preservation_files:
443
931
  # add the content objects
444
- preservation_refs_dict = __make_representation_multiple_co__(xip, "Preservation", "Preservation",
445
- preservation_files_list, io_ref)
932
+ representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
933
+ preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name,
934
+ rep_type="Preservation",
935
+ rep_files=preservation_files_list, io_ref=io_ref)
446
936
 
447
937
  if has_access_files:
448
938
  # add the content objects
449
- access_refs_dict = __make_representation_multiple_co__(xip, "Access", "Access", access_files_list, io_ref)
939
+ access_name = kwargs.get('Access_Representation_Name', "Access")
940
+ access_refs_dict = __make_representation_multiple_co__(xip, rep_name=access_name, rep_type="Access",
941
+ rep_files=access_files_list,
942
+ io_ref=io_ref)
450
943
 
451
944
  if has_preservation_files:
452
945
 
@@ -463,7 +956,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
463
956
  preservation_content_description = preservation_content_description[filename]
464
957
 
465
958
  __make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag,
466
- preservation_content_description, content_type)
959
+ preservation_content_description,
960
+ content_type)
467
961
 
468
962
  if has_access_files:
469
963
 
@@ -488,7 +982,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
488
982
 
489
983
  for content_ref, filename in preservation_refs_dict.items():
490
984
  preservation_file_name = os.path.basename(filename)
491
- __make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label)
985
+ __make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label,
986
+ PRESERVATION_CONTENT_FOLDER)
492
987
 
493
988
  if has_access_files:
494
989
 
@@ -496,7 +991,7 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
496
991
 
497
992
  for content_ref, filename in access_refs_dict.items():
498
993
  access_file_name = os.path.basename(filename)
499
- __make_generation__(xip, access_file_name, content_ref, access_generation_label)
994
+ __make_generation__(xip, access_file_name, content_ref, access_generation_label, ACCESS_CONTENT_FOLDER)
500
995
 
501
996
  if has_preservation_files:
502
997
 
@@ -507,7 +1002,7 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
507
1002
 
508
1003
  for content_ref, filename in preservation_refs_dict.items():
509
1004
  preservation_file_name = os.path.basename(filename)
510
- __make_bitstream__(xip, preservation_file_name, filename, callback)
1005
+ __make_bitstream__(xip, preservation_file_name, filename, callback, PRESERVATION_CONTENT_FOLDER)
511
1006
 
512
1007
  if has_access_files:
513
1008
 
@@ -518,35 +1013,58 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
518
1013
 
519
1014
  for content_ref, filename in access_refs_dict.items():
520
1015
  access_file_name = os.path.basename(filename)
521
- __make_bitstream__(xip, access_file_name, filename, callback)
1016
+ __make_bitstream__(xip, access_file_name, filename, callback, ACCESS_CONTENT_FOLDER)
522
1017
 
523
1018
  if 'Identifiers' in kwargs:
524
1019
  identifier_map = kwargs.get('Identifiers')
525
1020
  for identifier_key, identifier_value in identifier_map.items():
526
1021
  if identifier_key:
527
1022
  if identifier_value:
528
- identifier = SubElement(xip, 'Identifier')
529
- id_type = SubElement(identifier, "Type")
1023
+ identifier = SubElement(xip, 'xip:Identifier')
1024
+ id_type = SubElement(identifier, "xip:Type")
530
1025
  id_type.text = identifier_key
531
- id_value = SubElement(identifier, "Value")
1026
+ id_value = SubElement(identifier, "xip:Value")
532
1027
  id_value.text = identifier_value
533
- id_io = SubElement(identifier, "Entity")
1028
+ id_io = SubElement(identifier, "xip:Entity")
534
1029
  id_io.text = io_ref
535
1030
 
536
1031
  if 'Asset_Metadata' in kwargs:
537
1032
  metadata_map = kwargs.get('Asset_Metadata')
538
1033
  for metadata_ns, metadata_path in metadata_map.items():
539
1034
  if metadata_ns:
540
- if metadata_path:
1035
+ if metadata_path and isinstance(metadata_path, str):
541
1036
  if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
542
1037
  descriptive_metadata = xml.etree.ElementTree.parse(source=metadata_path)
543
- metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
544
- metadata_ref = SubElement(metadata, 'Ref')
1038
+ metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
1039
+ metadata_ref = SubElement(metadata, 'xip:Ref')
545
1040
  metadata_ref.text = str(uuid.uuid4())
546
- entity = SubElement(metadata, 'Entity')
1041
+ entity = SubElement(metadata, 'xip:Entity')
547
1042
  entity.text = io_ref
548
- content = SubElement(metadata, 'Content')
1043
+ content = SubElement(metadata, 'xip:Content')
549
1044
  content.append(descriptive_metadata.getroot())
1045
+ elif isinstance(metadata_path, str):
1046
+ try:
1047
+ descriptive_metadata = xml.etree.ElementTree.fromstring(metadata_path)
1048
+ metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
1049
+ metadata_ref = SubElement(metadata, 'xip:Ref')
1050
+ metadata_ref.text = str(uuid.uuid4())
1051
+ entity = SubElement(metadata, 'xip:Entity')
1052
+ entity.text = io_ref
1053
+ content = SubElement(metadata, 'xip:Content')
1054
+ content.append(descriptive_metadata)
1055
+ except RuntimeError:
1056
+ logging.info(f"Could not parse asset metadata in namespace {metadata_ns}")
1057
+ if metadata_path and isinstance(metadata_path, list):
1058
+ for path in metadata_path:
1059
+ if os.path.exists(path) and os.path.isfile(path):
1060
+ descriptive_metadata = xml.etree.ElementTree.parse(source=path)
1061
+ metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
1062
+ metadata_ref = SubElement(metadata, 'xip:Ref')
1063
+ metadata_ref.text = str(uuid.uuid4())
1064
+ entity = SubElement(metadata, 'xip:Entity')
1065
+ entity.text = io_ref
1066
+ content = SubElement(metadata, 'xip:Content')
1067
+ content.append(descriptive_metadata.getroot())
550
1068
 
551
1069
  if xip is not None:
552
1070
  export_folder = export_folder
@@ -554,18 +1072,23 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
554
1072
  os.mkdir(top_level_folder)
555
1073
  inner_folder = os.path.join(top_level_folder, io_ref)
556
1074
  os.mkdir(inner_folder)
557
- os.mkdir(os.path.join(inner_folder, "content"))
1075
+ content_folder = os.path.join(inner_folder, CONTENT_FOLDER)
1076
+ os.mkdir(content_folder)
1077
+ preservation_content_folder = os.path.join(content_folder, PRESERVATION_CONTENT_FOLDER)
1078
+ os.mkdir(preservation_content_folder)
1079
+ access_content_folder = os.path.join(content_folder, ACCESS_CONTENT_FOLDER)
1080
+ os.mkdir(access_content_folder)
558
1081
  metadata_path = os.path.join(inner_folder, "metadata.xml")
559
1082
  metadata = open(metadata_path, "wt", encoding='utf-8')
560
1083
  metadata.write(prettify(xip))
561
1084
  metadata.close()
562
1085
  for content_ref, filename in preservation_refs_dict.items():
563
1086
  src_file = filename
564
- dst_file = os.path.join(os.path.join(inner_folder, "content"), os.path.basename(filename))
1087
+ dst_file = os.path.join(preservation_content_folder, os.path.basename(filename))
565
1088
  shutil.copyfile(src_file, dst_file)
566
1089
  for content_ref, filename in access_refs_dict.items():
567
1090
  src_file = filename
568
- dst_file = os.path.join(os.path.join(inner_folder, "content"), os.path.basename(filename))
1091
+ dst_file = os.path.join(access_content_folder, os.path.basename(filename))
569
1092
  shutil.copyfile(src_file, dst_file)
570
1093
  if compress:
571
1094
  shutil.make_archive(top_level_folder, 'zip', top_level_folder)
@@ -578,21 +1101,29 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
578
1101
  def simple_asset_package(preservation_file=None, access_file=None, export_folder=None, parent_folder=None,
579
1102
  compress=True, **kwargs):
580
1103
  """
581
- optional kwargs map
582
- 'Title' Asset Title
583
- 'Description' Asset Description
584
- 'SecurityTag' Asset Security Tag
585
- 'CustomType' Asset Type
586
- 'Preservation_Content_Title' Content Object Title of the Preservation Object
587
- 'Preservation_Content_Description' Content Object Description of the Preservation Object
588
- 'Access_Content_Title' Content Object Title of the Access Object
589
- 'Access_Content_Description' Content Object Description of the Access Object
590
- 'Preservation_Generation_Label' Generation Label for the Preservation Object
591
- 'Access_Generation_Label' Generation Label for the Access Object
592
- 'Asset_Metadata' Map of metadata schema/documents to add to asset
593
- 'Identifiers' Map of asset identifiers
594
- 'Preservation_files_fixity_callback' Callback to allow external generated fixity values
595
- 'Access_files_fixity_callback' Callback to allow external generated fixity values
1104
+ Create a Preservica package containing a single Asset from a single preservation file
1105
+ and an optional access file.
1106
+ The Asset contains one Content Object for each representation.
1107
+
1108
+ If only the preservation file is provided the asset has one representation
1109
+
1110
+
1111
+ :param str preservation_file: Path to the preservation file
1112
+ :param str access_file: Path to the access file
1113
+ :param str export_folder: The package location folder
1114
+ :param Folder parent_folder: The folder to ingest the asset into
1115
+ :param bool compress: Compress the ZIP file
1116
+ :param str Title: Asset Title
1117
+ :param str Description: Asset Description
1118
+ :param str SecurityTag: Asset SecurityTag
1119
+ :param str CustomType: Asset CustomType
1120
+ :param str Preservation_Content_Title: Title of the Preservation Representation Content Object
1121
+ :param str Preservation_Content_Description: Description of the Preservation Representation Content Object
1122
+ :param str Access_Content_Title: Title of the Access Representation Content Object
1123
+ :param str Access_Content_Description: Description of the Access Representation Content Object
1124
+ :param dict Asset_Metadata: Dictionary of Asset metadata documents
1125
+ :param dict Identifiers: Dictionary of Asset 3rd party identifiers
1126
+
596
1127
  """
597
1128
 
598
1129
  # some basic validation
@@ -618,178 +1149,35 @@ def simple_asset_package(preservation_file=None, access_file=None, export_folder
618
1149
  export_folder=export_folder, parent_folder=parent_folder, compress=compress, **kwargs)
619
1150
 
620
1151
 
1152
def upload_config():
    """
    Return the module-level ``transfer_config`` object.

    :return: The shared boto3 ``TransferConfig`` instance defined at the top of this module
    """
    return transfer_config
1154
+
1155
+
1156
+ def _unpad(s):
1157
+ return s[:-ord(s[len(s) - 1:])]
1158
+
1159
+
621
1160
  class UploadAPI(AuthenticatedAPI):
622
1161
 
623
- def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None,
624
- twitter_secret_key=None, folder=None, callback=None, **kwargs):
625
1162
 
626
- def get_image(m, has_video_element):
627
- media_url_https_ = m["media_url_https"]
628
- if media_url_https_:
629
- req = requests.get(media_url_https_)
630
- if req.status_code == requests.codes.ok:
631
- if has_video_element:
632
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
633
- else:
634
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
635
- image_name_document_ = open(image_name_, "wb")
636
- image_name_document_.write(req.content)
637
- image_name_document_.close()
638
- return image_name_
639
-
640
- def get_video(m):
641
- video_info_ = m["video_info"]
642
- variants_ = video_info_["variants"]
643
- for v_ in variants_:
644
- video_url_ = v_["url"]
645
- req = requests.get(video_url_)
646
- if req.status_code == requests.codes.ok:
647
- video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
648
- video_name_document_ = open(video_name_, "wb")
649
- video_name_document_.write(req.content)
650
- video_name_document_.close()
651
- return video_name_, True
652
-
653
- entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
654
- tenant=self.tenant)
655
- if hasattr(folder, "reference"):
656
- folder = entity_client.folder(folder.reference)
657
- else:
658
- folder = entity_client.folder(folder)
659
- try:
660
- import tweepy
661
- from tweepy import TweepError
662
- except ImportError:
663
- logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
664
- raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
665
- config = configparser.ConfigParser()
666
- config.read('credentials.properties')
667
- if twitter_consumer_key is None:
668
- twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
669
- if twitter_consumer_key is None:
670
- try:
671
- twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
672
- except KeyError:
673
- logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
674
- "environment variables or credentials.properties file")
675
- raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
676
- "environment variables or credentials.properties file")
677
- if twitter_secret_key is None:
678
- twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
679
- if twitter_secret_key is None:
680
- try:
681
- twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
682
- except KeyError:
683
- logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
684
- "environment variables or credentials.properties file")
685
- raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
686
- "environment variables or credentials.properties file")
687
-
688
- api = None
689
- try:
690
- auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
691
- api = tweepy.API(auth, wait_on_rate_limit=True)
692
- except TweepError:
693
- logger.error("No valid Twitter API keys. Could not authenticate")
694
- raise RuntimeError("No valid Twitter API keys. Could not authenticate")
695
- if api is not None:
696
- logger.debug(api)
697
- for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
698
- created_at = tweet.created_at
699
- id_str = tweet.id_str
700
- author = tweet.author.name
701
- tweet_entities = tweet.entities
702
- hashtags = dict()
703
- if 'hashtags' in tweet_entities:
704
- hashtags = tweet.entities['hashtags']
705
- entities = entity_client.identifier("tweet_id", id_str.strip())
706
- if len(entities) > 0:
707
- logger.warning("Tweet already exists, skipping....")
708
- continue
709
- logger.info(f"Processing tweet {id_str} ...")
710
- tid = tweet.id
711
- content_objects = list()
712
- full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
713
- text = tweet.text
714
- full_text = full_tweet.full_text
715
- file_name = f"{{{id_str}}}_[{twitter_user}].json"
716
- json_doc = json.dumps(full_tweet._json)
717
- json_file = open(file_name, "wt", encoding="utf-8")
718
- json_file.write(json_doc)
719
- json_file.close()
720
- content_objects.append(file_name)
721
- if hasattr(full_tweet, "extended_entities"):
722
- extended_entities = full_tweet.extended_entities
723
- if "media" in extended_entities:
724
- media = extended_entities["media"]
725
- for med in media:
726
- media_id_str = med["id_str"]
727
- has_video = False
728
- if "video_info" in med:
729
- co, has_video = get_video(med)
730
- content_objects.append(co)
731
- continue
732
- if "media_url_https" in med:
733
- co = get_image(med, has_video)
734
- content_objects.append(co)
735
- identifiers = dict()
736
- asset_metadata = dict()
737
- identifiers["tweet_id"] = id_str
738
-
739
- user = full_tweet._json['user']
740
-
741
- if full_tweet._json.get('retweeted_status'):
742
- retweeted_status = full_tweet._json['retweeted_status']
743
- if retweeted_status.get("extended_entities"):
744
- extended_entities = retweeted_status["extended_entities"]
745
- if "media" in extended_entities:
746
- media = extended_entities["media"]
747
- for med in media:
748
- media_id_str = med["id_str"]
749
- has_video = False
750
- if "video_info" in med:
751
- co, has_video = get_video(med)
752
- content_objects.append(co)
753
- continue
754
- if "media_url_https" in med:
755
- co = get_image(med, has_video)
756
- content_objects.append(co)
757
-
758
- xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
759
- xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
760
- xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
761
- xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
762
- xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
763
- for h in hashtags:
764
- xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
765
-
766
- xml.etree.ElementTree.SubElement(xml_object, "name").text = author
767
- xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
768
- xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
769
-
770
- xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
771
-
772
- metadata_document = open("metadata.xml", "wt", encoding="utf-8")
773
- metadata_document.write(xml_request.decode("utf-8"))
774
- metadata_document.close()
775
-
776
- asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
777
-
778
- security_tag = kwargs.get("SecurityTag", "open")
779
- asset_title = kwargs.get("Title", text)
780
- asset_description = kwargs.get("Description", full_text)
781
-
782
- p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
783
- Title=asset_title, Description=asset_description, CustomType="Tweet",
784
- Identifiers=identifiers, Asset_Metadata=asset_metadata,
785
- SecurityTag=security_tag)
786
- self.upload_zip_package(p, folder=folder, callback=callback)
787
- for ob in content_objects:
788
- os.remove(ob)
789
- os.remove("metadata.xml")
790
- sleep(2)
1163
+
791
1164
 
792
1165
  def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
1166
+ """
1167
+ Ingest a web video such as YouTube etc based on the URL
1168
+
1169
+ :param str url: URL to the YouTube video
1170
+ :param Folder parent_folder: The folder to ingest the video into
1171
+ :param str Title: Optional asset title
1172
+ :param str Description: Optional asset description
1173
+ :param str SecurityTag: Optional asset security tag
1174
+ :param dict Identifiers: Optional asset 3rd party identifiers
1175
+ :param dict Asset_Metadata: Optional asset additional descriptive metadata
1176
+ :param callback callback: Optional upload progress callback
1177
+ :raises RuntimeError:
1178
+
1179
+
1180
+ """
793
1181
  try:
794
1182
  import youtube_dl
795
1183
  except ImportError:
@@ -802,10 +1190,7 @@ class UploadAPI(AuthenticatedAPI):
802
1190
  if d['status'] == 'finished':
803
1191
  logger.info('Download Complete. Uploading to Preservica ...')
804
1192
 
805
- ydl_opts = {
806
- 'outtmpl': '%(id)s.mp4',
807
- 'progress_hooks': [my_hook],
808
- }
1193
+ ydl_opts = {'outtmpl': '%(id)s.mp4', 'progress_hooks': [my_hook], }
809
1194
 
810
1195
  # if True:
811
1196
  # ydl_opts['writesubtitles'] = True
@@ -858,54 +1243,468 @@ class UploadAPI(AuthenticatedAPI):
858
1243
 
859
1244
  self.upload_zip_package(path_to_zip_package=package, folder=parent_folder, callback=callback)
860
1245
 
861
- def upload_zip_package(self, path_to_zip_package, folder=None, callback=None, delete_after_upload=False):
862
- bucket = f'{self.tenant.lower()}.package.upload'
863
- endpoint = f'https://{self.server}/api/s3/buckets'
1246
def upload_credentials(self, location_id: str):
    """
    Retrieve temporary upload credentials (Amazon STS, or Azure SAS) for an upload location.

    :param str location_id: The API id of the upload location (the 'apiId' value of a location record)
    :return: The decoded JSON credentials document returned by the server
    :rtype: dict
    :raises HTTPException: If the server answers with a status other than 200 or 401
    """
    headers = {HEADER_TOKEN: self.token}
    # Honour the configured protocol, as upload_zip_package() does, instead of assuming https.
    url = f'{self.protocol}://{self.server}/api/location/upload/{location_id}/upload-credentials'
    request = self.session.get(url, headers=headers)

    if request.status_code == requests.codes.ok:
        return json.loads(request.content.decode('utf-8'))

    if request.status_code == requests.codes.unauthorized:
        # The access token was rejected: fetch a new one and retry the call.
        self.token = self.__token__()
        return self.upload_credentials(location_id)

    exception = HTTPException(location_id, request.status_code, request.url, "upload_credentials",
                              request.content.decode('utf-8'))
    logger.error(exception)
    raise exception
1266
+
1267
def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
    """
    Delete objects from an upload bucket/container which are older than older_than_days.

    Every upload location whose 'containerName' equals bucket_name is cleaned.
    Locations of type 'AWS' are cleaned through boto3; any other type is treated
    as an Azure blob container.

    :param str bucket_name: Name of the upload bucket or container to clean
    :param int older_than_days: Objects last modified more than this many days ago are deleted
    """
    max_age = timedelta(days=older_than_days)

    for location in self.upload_locations():
        if location['containerName'] != bucket_name:
            continue

        credentials = self.upload_credentials(location['apiId'])
        now = datetime.now(timezone.utc)

        if location['type'] == 'AWS':
            session = boto3.Session(aws_access_key_id=credentials['key'],
                                    aws_secret_access_key=credentials['secret'],
                                    aws_session_token=credentials['sessionToken'])
            s3_client = session.client("s3")
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name):
                # An empty bucket yields a page without a 'Contents' entry.
                for item in page.get('Contents', []):
                    if now - item['LastModified'] > max_age:
                        logger.debug(f"Deleting expired object {item['Key']}")
                        s3_client.delete_object(Bucket=bucket_name, Key=item['Key'])
        else:
            # Imported only when needed so that cleaning S3 buckets does not
            # require the Azure SDK to be installed.
            from azure.storage.blob import ContainerClient

            account_key = credentials['key']
            session_token = credentials['sessionToken']
            sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
            container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
            for blob in container.list_blobs():
                if now - blob.last_modified > max_age:
                    logger.debug(f"Deleting expired object {blob.name}")
                    container.delete_blob(blob.name)
1306
+
1307
+
1308
+
1309
+
1310
+
1311
+
1312
+
1313
def upload_locations(self):
    """
    Fetch the upload locations, which are configured on the Sources page as 'SIP Upload'.

    :return: The 'locations' entry of the JSON response; other methods of this class
             iterate it as a sequence of records with keys such as 'containerName',
             'type' and 'apiId'
    :raises HTTPException: If the server answers with a status other than 200 or 401
    """
    headers = {HEADER_TOKEN: self.token}
    # Honour the configured protocol, as upload_zip_package() does, instead of assuming https.
    url = f'{self.protocol}://{self.server}/api/location/upload'
    request = self.session.get(url, headers=headers)

    if request.status_code == requests.codes.ok:
        return json.loads(request.content.decode('utf-8'))['locations']

    if request.status_code == requests.codes.unauthorized:
        # The access token was rejected: fetch a new one and retry the call.
        self.token = self.__token__()
        return self.upload_locations()

    exception = HTTPException("", request.status_code, request.url, "upload_locations",
                              request.content.decode('utf-8'))
    logger.error(exception)
    raise exception
1332
+
1333
def upload_buckets(self):
    """
    List the upload buckets available to this user.

    This is an alias which delegates directly to upload_locations().

    :return: Whatever upload_locations() returns
    """
    locations = self.upload_locations()
    return locations
1340
+
1341
def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback: bool = False,
                     security_tag: str = "open",
                     delete_after_upload: bool = True, max_MB_ingested: int = -1):
    """
    Walk a local directory tree and upload the files which are not yet in Preservica.

    Every directory is mapped to a folder and every file is added to a package built
    with multi_asset_package(). Folders and files carry a "code" identifier holding
    their path relative to the parent of filesystem_path; that identifier is used to
    detect content which already exists so it is not uploaded again.

    :param str filesystem_path: Root directory to crawl
    :param str bucket_name: Upload container for the packages; when None the packages
                            are sent with upload_zip_package() instead
    :param str preservica_parent: Reference of the folder to create content under;
                                  when falsy no parent reference is used
    :param bool callback: Show upload progress on the console
    :param str security_tag: Security tag applied to new folders and packages
    :param bool delete_after_upload: Delete each local package once it has been uploaded
    :param int max_MB_ingested: Stop crawling once more than this many MB have been
                                packaged; values <= 0 disable the limit
    """

    from pyPreservica import EntityAPI

    # Upper bound for the retry delay; without it the delay doubles indefinitely.
    max_back_off = 300

    def find_entities(client: EntityAPI, identifier: str):
        """Look up entities by "code" identifier, retrying with back-off on HTTP errors."""
        back_off = 5
        while True:
            try:
                return client.identifier("code", identifier)
            except HTTPException as e:
                logger.warning(f"Identifier lookup failed ({e}), retrying in {back_off} seconds")
                sleep(back_off)
                back_off = min(back_off * 2, max_back_off)

    def entity_value(client: EntityAPI, identifier: str):
        """Return one entity carrying the identifier, or None if there is none."""
        entities = find_entities(client, identifier)
        return entities.pop() if len(entities) > 0 else None

    def entity_exists(client: EntityAPI, identifier: str) -> bool:
        """Return True if any entity carries the identifier."""
        return len(find_entities(client, identifier)) > 0

    def get_parent(client, identifier, parent_reference):
        """Return the reference of the folder for the identifier's directory, else parent_reference."""
        dirname_id: str = str(os.path.dirname(identifier))
        if not dirname_id:
            dirname_id = identifier
        folder = entity_value(client, dirname_id)
        if folder is None:
            return parent_reference
        return client.folder(folder.reference).reference

    def get_folder(client, name, tag, parent_reference, identifier):
        """Return the folder carrying the identifier, creating and tagging it if required."""
        folder = entity_value(client, identifier)
        if folder is None:
            logger.info(f"Creating new folder with name {name}")
            folder = client.create_folder(name, name, tag, parent_reference)
            client.add_identifier(folder, "code", identifier)
        else:
            logger.info(f"Found existing folder with name {name}")
        return folder

    entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
                              tenant=self.tenant,
                              two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
                              protocol=self.protocol)

    if preservica_parent:
        parent = entity_client.folder(preservica_parent)
        logger.info(f"Folders will be created inside Preservica collection {parent.title}")
        parent_ref = parent.reference
    else:
        parent_ref = None

    bytes_ingested = 0

    folder_path = os.path.normpath(filesystem_path)

    for dirname, subdirs, files in os.walk(folder_path):
        base = os.path.basename(dirname)
        code = os.path.relpath(dirname, Path(folder_path).parent)
        p = get_parent(entity_client, code, parent_ref)
        f = get_folder(entity_client, base, security_tag, p, code)

        identifiers = dict()
        new_files = []
        for file in files:
            full_path = os.path.join(dirname, file)
            if os.path.islink(full_path):
                logger.info(f"Skipping link {file}")
                continue
            asset_code = os.path.join(code, file)
            if entity_exists(entity_client, asset_code):
                logger.info(f"Skipping file {file} already exists in repository")
                continue
            bytes_ingested = bytes_ingested + os.stat(full_path).st_size
            logger.info(f"Adding new file: {file} to package ready for upload")
            identifiers[full_path] = {"code": asset_code}
            new_files.append(full_path)

        if not new_files:
            continue

        package = multi_asset_package(asset_file_list=new_files, parent_folder=f, SecurityTag=security_tag,
                                      Identifiers=identifiers)

        if bucket_name is None:
            progress_display = UploadProgressConsoleCallback(package) if callback else None
            self.upload_zip_package(path_to_zip_package=package, callback=progress_display,
                                    delete_after_upload=delete_after_upload)
        else:
            self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
                                      show_progress=bool(callback),
                                      delete_after_upload=delete_after_upload)

        logger.info("Uploaded " + "{:.1f}".format(bytes_ingested / (1024 * 1024)) + " MB")

        if max_MB_ingested > 0 and bytes_ingested > (1024 * 1024 * max_MB_ingested):
            logger.info("Reached Max Upload Limit")
            break
1452
+
1453
def upload_zip_to_Source(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
                         show_progress=False):
    """
    Upload a zip file package to either an Azure container or an S3 bucket,
    depending on the type of the upload location named container_name.

    The package is uploaded once, to the first upload location whose
    'containerName' matches. If no location matches, a warning is logged and
    nothing is uploaded.

    :param str path_to_zip_package: Path to the package
    :param str container_name: Container connected to the ingest workflow
    :param Folder folder: The folder to ingest the package into
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :param bool show_progress: Show upload progress bar
    :return: The value returned by the S3 or Azure upload method, or None if no location matched
    """

    for location in self.upload_locations():
        if location['containerName'] != container_name:
            continue

        if location['type'] == 'AWS':
            callback = UploadProgressConsoleCallback(path_to_zip_package) if show_progress else None
            return self.upload_zip_package_to_S3(path_to_zip_package=path_to_zip_package,
                                                 bucket_name=container_name, folder=folder,
                                                 callback=callback, delete_after_upload=delete_after_upload)

        return self.upload_zip_package_to_Azure(path_to_zip_package=path_to_zip_package,
                                                container_name=container_name, folder=folder,
                                                delete_after_upload=delete_after_upload,
                                                show_progress=show_progress)

    # Reaching this point means the package was NOT uploaded; make that visible.
    logger.warning(f"No upload location found for container {container_name}, package was not uploaded")
    return None
1483
+
1484
def upload_zip_package_to_Azure(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
                                show_progress=False):
    """
    Upload a zip file package to an Azure container connected to a Preservica Cloud System.

    The package is uploaded to the first upload location whose 'containerName'
    matches container_name.

    :param str path_to_zip_package: Path to the package
    :param str container_name: Container connected to the ingest workflow
    :param Folder folder: The folder to ingest the package into (a Folder object or its reference string)
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :param bool show_progress: Show upload progress bar
    :return: The properties of the uploaded blob, or None if no upload location matched
    :raises RuntimeError: If the Preservica system is older than v6.5
    """

    # Compare (major, minor) as a pair: testing the two numbers independently
    # would accept versions such as 5.9.
    if (self.major_version, self.minor_version) < (6, 5):
        raise RuntimeError(
            "This call [upload_zip_package_to_Azure] is only available against v6.5 systems and above")

    from azure.storage.blob import ContainerClient

    properties = None

    for location in self.upload_locations():
        if location['containerName'] != container_name:
            continue

        credentials = self.upload_credentials(location['apiId'])
        account_key = credentials['key']
        session_token = credentials['sessionToken']

        sas_url = f"https://{account_key}.blob.core.windows.net/{container_name}"
        container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)

        upload_key = str(uuid.uuid4())
        metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': container_name, 'status': 'ready'}

        if hasattr(folder, "reference"):
            metadata['collectionreference'] = folder.reference
        elif isinstance(folder, str):
            metadata['collectionreference'] = folder

        len_bytes = Path(path_to_zip_package).stat().st_size

        # Own the file handle here so it is always closed, including on the
        # progress path, before the package is optionally deleted below.
        with open(path_to_zip_package, "rb") as package:
            if show_progress:
                with tqdm.wrapattr(package, "read", total=len_bytes) as data:
                    blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
                                                        length=len_bytes)
            else:
                blob_client = container.upload_blob(name=upload_key, data=package, metadata=metadata,
                                                    length=len_bytes)
        properties = blob_client.get_blob_properties()

        if delete_after_upload:
            os.remove(path_to_zip_package)

        # The package has been sent (and possibly deleted); do not upload it twice.
        break

    return properties
1540
+
1541
def upload_zip_package_to_S3(self, path_to_zip_package, bucket_name, folder=None, callback=None,
                             delete_after_upload=False):
    """
    Upload a zip file package to an S3 bucket connected to a Preservica Cloud System.

    The package is uploaded to the first upload location whose 'containerName'
    matches bucket_name, using temporary credentials from upload_credentials().

    :param str path_to_zip_package: Path to the package
    :param str bucket_name: Bucket connected to an ingest workflow
    :param Folder folder: The folder to ingest the package into (a Folder object or its reference string)
    :param Callable callback: Optional callback to allow the callee to monitor the upload progress
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :raises RuntimeError: If the Preservica system is older than v6.5
    """

    # Compare (major, minor) as a pair: testing the two numbers independently
    # would accept versions such as 5.9.
    if (self.major_version, self.minor_version) < (6, 5):
        raise RuntimeError("This call [upload_zip_package_to_S3] is only available against v6.5 systems and above")

    logger.debug("Finding Upload Locations")
    self.token = self.__token__()

    for location in self.upload_locations():
        if location['containerName'] != bucket_name:
            continue

        logger.debug(f"Found Upload Location {location['containerName']}")
        logger.debug(f"Fetching Upload Credentials for {location['containerName']}")
        credentials = self.upload_credentials(location['apiId'])

        session = boto3.Session(aws_access_key_id=credentials['key'],
                                aws_secret_access_key=credentials['secret'],
                                aws_session_token=credentials['sessionToken'])
        s3 = session.resource(service_name="s3")

        logger.debug(f"S3 Session: {s3}")

        upload_key = str(uuid.uuid4())
        s3_object = s3.Object(bucket_name, upload_key)
        metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': bucket_name, 'status': 'ready'}

        if hasattr(folder, "reference"):
            metadata['collectionreference'] = folder.reference
        elif isinstance(folder, str):
            metadata['collectionreference'] = folder

        metadata['size'] = str(Path(path_to_zip_package).stat().st_size)
        metadata['createdby'] = self.username

        s3_object.upload_file(path_to_zip_package, Callback=callback, ExtraArgs={'Metadata': metadata},
                              Config=transfer_config)

        if delete_after_upload:
            os.remove(path_to_zip_package)

        # The package has been sent (and possibly deleted); do not upload it twice.
        break
883
- except ClientError as e:
884
- raise e
885
1596
 
886
- def upload_zip_package_progress_token(self, path_to_zip_package, folder=None, delete_after_upload=False):
1597
def upload_zip_package(self, path_to_zip_package, folder=None, callback=None, delete_after_upload=False):
    """
    Upload a zip file package directly to Preservica and start an ingest workflow.

    The package is sent through the Preservica S3-compatible endpoint, using the
    access token as the access key. The token is refreshed automatically during
    long uploads.

    :param str path_to_zip_package: Path to the package
    :param Folder folder: The folder to ingest the package into (a Folder object or its reference string)
    :param Callable callback: Optional callback to allow the callee to monitor the upload progress
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed

    :return: preservica-progress-token to allow the workflow progress to be monitored,
             or None if path_to_zip_package is not an existing file
    :rtype: str

    :raises NoCredentialsError: If no credentials are available for the upload
    :raises PartialCredentialsError: If the credentials are incomplete
    :raises ClientError: If the upload request is rejected
    """
    import copy

    bucket = f'{self.tenant.lower()}.package.upload'
    endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
    self.token = self.__token__()

    retries = {'max_attempts': 5, 'mode': 'adaptive'}

    def new_credentials():
        """Build a credentials record around a freshly issued access token."""
        cred_metadata: dict = {
            'access_key': self.__token__(),
            'secret_key': "NOT_USED",
            'token': "",
            'expiry_time': (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat(),
        }
        logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
        return cred_metadata

    session = get_session()

    session_credentials = RefreshableCredentials.create_from_metadata(
        metadata=new_credentials(),
        refresh_using=new_credentials,
        advisory_timeout=4 * 60,
        mandatory_timeout=12 * 60,
        method='Preservica'
    )

    autorefresh_session = boto3.Session(botocore_session=session)

    session._credentials = session_credentials

    config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
                    request_checksum_calculation="WHEN_REQUIRED",
                    response_checksum_validation="WHEN_REQUIRED",
                    retries=retries, tcp_keepalive=True)

    s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)

    metadata = {}
    if folder is not None:
        if hasattr(folder, "reference"):
            metadata = {'Metadata': {'structuralobjectreference': folder.reference}}
        elif isinstance(folder, str):
            metadata = {'Metadata': {'structuralobjectreference': folder}}

    if not (os.path.exists(path_to_zip_package) and os.path.isfile(path_to_zip_package)):
        logger.error(f"Package {path_to_zip_package} is not an existing file, nothing was uploaded")
        return None

    try:
        key_id = str(uuid.uuid4()) + ".zip"

        # Work on a private copy of the shared module-level configuration so the
        # chunk size chosen for this package does not leak into later uploads.
        upload_config = copy.copy(transfer_config)

        # Larger packages use larger parts to keep the number of parts down.
        package_size = os.path.getsize(path_to_zip_package)
        if package_size > 48 * GB:
            upload_config.multipart_chunksize = 64 * MB
        elif package_size > 24 * GB:
            upload_config.multipart_chunksize = 48 * MB
        elif package_size > 8 * GB:
            upload_config.multipart_chunksize = 32 * MB
        elif package_size > 1 * GB:
            upload_config.multipart_chunksize = 16 * MB

        logger.info("Using Multipart Chunk Size: " + str(upload_config.multipart_chunksize))

        transfer = S3Transfer(client=s3_client, config=upload_config)

        # Swap in this module's task classes and upload function (defined
        # elsewhere in this file) in place of the stock S3Transfer ones.
        transfer.PutObjectTask = PutObjectTask
        transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
        transfer.upload_file = upload_file

        # upload_file is a plain function stored on the instance, so the
        # transfer object has to be passed explicitly as self.
        response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
                                        key=key_id,
                                        extra_args=metadata,
                                        callback=callback)

        if delete_after_upload:
            os.remove(path_to_zip_package)

        return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']

    except (NoCredentialsError, PartialCredentialsError, ClientError) as ex:
        logger.error(ex)
        raise ex
1705
+
1706
+
1707
+
1708
+
1709
+
1710
+