pyPreservica 2.0.3__py3-none-any.whl → 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of pyPreservica might be problematic; see the registry listing for details.
- pyPreservica/__init__.py +19 -7
- pyPreservica/adminAPI.py +43 -33
- pyPreservica/authorityAPI.py +9 -9
- pyPreservica/common.py +198 -54
- pyPreservica/contentAPI.py +199 -18
- pyPreservica/entityAPI.py +944 -250
- pyPreservica/mdformsAPI.py +572 -0
- pyPreservica/monitorAPI.py +3 -3
- pyPreservica/parAPI.py +7 -40
- pyPreservica/retentionAPI.py +58 -26
- pyPreservica/settingsAPI.py +295 -0
- pyPreservica/uploadAPI.py +426 -609
- pyPreservica/webHooksAPI.py +3 -1
- pyPreservica/workflowAPI.py +21 -37
- {pyPreservica-2.0.3.dist-info → pypreservica-3.3.3.dist-info}/METADATA +93 -84
- pypreservica-3.3.3.dist-info/RECORD +20 -0
- {pyPreservica-2.0.3.dist-info → pypreservica-3.3.3.dist-info}/WHEEL +1 -1
- pyPreservica/vocabularyAPI.py +0 -141
- pyPreservica-2.0.3.dist-info/RECORD +0 -19
- {pyPreservica-2.0.3.dist-info → pypreservica-3.3.3.dist-info/licenses}/LICENSE.txt +0 -0
- {pyPreservica-2.0.3.dist-info → pypreservica-3.3.3.dist-info}/top_level.txt +0 -0
pyPreservica/uploadAPI.py
CHANGED
@@ -13,7 +13,7 @@ import shutil
 import tempfile
 import uuid
 import xml
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from time import sleep
 from xml.dom import minidom
 from xml.etree import ElementTree
@@ -22,10 +22,12 @@ from xml.etree.ElementTree import Element, SubElement
 import boto3
 import s3transfer.tasks
 import s3transfer.upload
-
+from botocore.session import get_session
 from boto3.s3.transfer import TransferConfig, S3Transfer
 from botocore.config import Config
-from botocore.
+from botocore.credentials import RefreshableCredentials
+from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
+from dateutil.tz import tzlocal
 from s3transfer import S3UploadFailedError
 from tqdm import tqdm
 
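The new botocore imports (get_session, RefreshableCredentials) point at support for temporary upload credentials that are refreshed before they expire, so long-running transfers do not fail mid-upload. A minimal sketch of that pattern, assuming a hypothetical fetch_credentials() helper that returns fresh temporary keys (pyPreservica's actual refresh hook may differ):

    from datetime import datetime, timedelta, timezone

    import boto3
    from botocore.credentials import RefreshableCredentials
    from botocore.session import get_session

    def fetch_credentials():
        # Hypothetical helper: a real client would call the Preservica
        # upload API here; this stub fabricates a one-hour window.
        expiry = datetime.now(timezone.utc) + timedelta(hours=1)
        return {
            "access_key": "AKIA-PLACEHOLDER",
            "secret_key": "secret-placeholder",
            "token": "session-token-placeholder",
            "expiry_time": expiry.isoformat(),
        }

    # botocore re-invokes fetch_credentials as the expiry time approaches
    credentials = RefreshableCredentials.create_from_metadata(
        metadata=fetch_credentials(),
        refresh_using=fetch_credentials,
        method="custom-refresh",
    )
    botocore_session = get_session()
    botocore_session._credentials = credentials
    s3 = boto3.Session(botocore_session=botocore_session).client("s3")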
@@ -36,7 +38,7 @@ logger = logging.getLogger(__name__)
 
 MB = 1024 * 1024
 GB = 1024 ** 3
-transfer_config = TransferConfig(multipart_threshold=int(
+transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
 
 CONTENT_FOLDER = "content"
 PRESERVATION_CONTENT_FOLDER = "p1"
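The multipart threshold, left truncated in the old source line, is now pinned at 32 MB: objects at or above the threshold are uploaded as S3 multipart transfers, smaller ones as a single PUT. A short illustration (bucket and file names are placeholders):

    import boto3
    from boto3.s3.transfer import TransferConfig

    MB = 1024 * 1024
    transfer_config = TransferConfig(multipart_threshold=int(32 * MB))

    s3 = boto3.client("s3")
    # A file >= 32 MB is split into parts and uploaded concurrently;
    # anything smaller goes up in one PutObject call.
    s3.upload_file("example.bin", "my-upload-bucket", "example.bin", Config=transfer_config)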
@@ -57,8 +59,7 @@ def upload_file(self, filename, bucket, key, callback=None, extra_args=None):
         raise ValueError('Filename must be a string')
 
     subscribers = self._get_subscribers(callback)
-    future = self._manager.upload(
-        filename, bucket, key, extra_args, subscribers)
+    future = self._manager.upload(filename, bucket, key, extra_args, subscribers)
     try:
         return future.result()
     # If a client error was raised, add the backwards compatibility layer
@@ -66,9 +67,7 @@ def upload_file(self, filename, bucket, key, callback=None, extra_args=None):
     # ever thrown for upload_parts but now can be thrown for any related
     # client error.
     except ClientError as e:
-        raise S3UploadFailedError(
-            "Failed to upload %s to %s: %s" % (
-                filename, '/'.join([bucket, key]), e))
+        raise S3UploadFailedError("Failed to upload %s to %s: %s" % (filename, '/'.join([bucket, key]), e))
 
 
 class PutObjectTask(s3transfer.tasks.Task):
@@ -82,13 +81,9 @@ class PutObjectTask(s3transfer.tasks.Task):
 class CompleteMultipartUploadTask(s3transfer.tasks.Task):
     # Copied from s3transfer/tasks.py, changed to return a result.
     def _main(self, client, bucket, key, upload_id, parts, extra_args):
-        return client.complete_multipart_upload(
-
-
-            UploadId=upload_id,
-            MultipartUpload={"Parts": parts},
-            **extra_args,
-        )
+        return client.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id,
+                                                MultipartUpload={"Parts": parts},
+                                                **extra_args, )
 
 
 s3transfer.upload.PutObjectTask = PutObjectTask
@@ -105,11 +100,11 @@ def prettify(elem):
 
 def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
     if xip is None:
-        xip = Element('XIP')
+        xip = Element('xip:XIP')
+        xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
     assert xip is not None
-    xip
-
-    ref = SubElement(io, 'Ref')
+    io = SubElement(xip, 'xip:InformationObject')
+    ref = SubElement(io, 'xip:Ref')
 
     if 'IO_Identifier_callback' in kwargs:
         ident_callback = kwargs.get('IO_Identifier_callback')
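Every XIP element is now created with a literal xip: prefix in its tag name, and the root declares xmlns:xip by hand. ElementTree treats prefixed tag names as opaque strings, so the declaration on the root element is what makes the serialised output valid XML in the Preservica v6.0 namespace. A small sketch of the effect:

    from xml.etree.ElementTree import Element, SubElement, tostring

    xip = Element('xip:XIP')
    xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
    io = SubElement(xip, 'xip:InformationObject')
    SubElement(io, 'xip:Ref').text = 'a-placeholder-uuid'

    print(tostring(xip, encoding='unicode'))
    # <xip:XIP xmlns:xip="http://preservica.com/XIP/v6.0"><xip:InformationObject>
    # <xip:Ref>a-placeholder-uuid</xip:Ref></xip:InformationObject></xip:XIP>
    # (emitted on a single line; wrapped here for readability)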
@@ -117,15 +112,15 @@ def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
     else:
         ref.text = str(uuid.uuid4())
 
-    title = SubElement(io, 'Title')
+    title = SubElement(io, 'xip:Title')
     title.text = kwargs.get('Title', file_name)
-    description = SubElement(io, 'Description')
+    description = SubElement(io, 'xip:Description')
     description.text = kwargs.get('Description', file_name)
-    security = SubElement(io, 'SecurityTag')
+    security = SubElement(io, 'xip:SecurityTag')
     security.text = kwargs.get('SecurityTag', "open")
-    custom_type = SubElement(io, 'CustomType')
+    custom_type = SubElement(io, 'xip:CustomType')
     custom_type.text = kwargs.get('CustomType', "")
-    parent = SubElement(io, 'Parent')
+    parent = SubElement(io, 'xip:Parent')
 
     if hasattr(parent_folder, "reference"):
         parent.text = parent_folder.reference
@@ -136,76 +131,76 @@ def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
 
 
 def __make_representation__(xip, rep_name, rep_type, io_ref):
-    representation = SubElement(xip, 'Representation')
-    io_link = SubElement(representation, 'InformationObject')
+    representation = SubElement(xip, 'xip:Representation')
+    io_link = SubElement(representation, 'xip:InformationObject')
     io_link.text = io_ref
-    access_name = SubElement(representation, 'Name')
+    access_name = SubElement(representation, 'xip:Name')
     access_name.text = rep_name
-    access_type = SubElement(representation, 'Type')
+    access_type = SubElement(representation, 'xip:Type')
     access_type.text = rep_type
-    content_objects = SubElement(representation, 'ContentObjects')
-    content_object = SubElement(content_objects, 'ContentObject')
+    content_objects = SubElement(representation, 'xip:ContentObjects')
+    content_object = SubElement(content_objects, 'xip:ContentObject')
     content_object_ref = str(uuid.uuid4())
     content_object.text = content_object_ref
     return content_object_ref
 
 
 def __make_content_objects__(xip, content_title, co_ref, io_ref, tag, content_description, content_type):
-    content_object = SubElement(xip, 'ContentObject')
-    ref_element = SubElement(content_object, "Ref")
+    content_object = SubElement(xip, 'xip:ContentObject')
+    ref_element = SubElement(content_object, "xip:Ref")
     ref_element.text = co_ref
-    title = SubElement(content_object, "Title")
+    title = SubElement(content_object, "xip:Title")
     title.text = content_title
-    description = SubElement(content_object, "Description")
+    description = SubElement(content_object, "xip:Description")
     description.text = content_description
-    security_tag = SubElement(content_object, "SecurityTag")
+    security_tag = SubElement(content_object, "xip:SecurityTag")
     security_tag.text = tag
-    custom_type = SubElement(content_object, "CustomType")
+    custom_type = SubElement(content_object, "xip:CustomType")
     custom_type.text = content_type
-    parent = SubElement(content_object, "Parent")
+    parent = SubElement(content_object, "xip:Parent")
     parent.text = io_ref
 
 
 def __make_generation__(xip, filename, co_ref, generation_label, location=None):
-    generation = SubElement(xip, 'Generation', {"original": "true", "active": "true"})
-    content_object = SubElement(generation, "ContentObject")
+    generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
+    content_object = SubElement(generation, "xip:ContentObject")
     content_object.text = co_ref
-    label = SubElement(generation, "Label")
+    label = SubElement(generation, "xip:Label")
     if generation_label:
         label.text = generation_label
     else:
         label.text = os.path.splitext(filename)[0]
-    effective_date = SubElement(generation, "EffectiveDate")
+    effective_date = SubElement(generation, "xip:EffectiveDate")
     effective_date.text = datetime.now().isoformat()
-    bitstreams = SubElement(generation, "Bitstreams")
-    bitstream = SubElement(bitstreams, "Bitstream")
+    bitstreams = SubElement(generation, "xip:Bitstreams")
+    bitstream = SubElement(bitstreams, "xip:Bitstream")
     bitstream.text = f"{location}/{filename}"
-    SubElement(generation, "Formats")
-    SubElement(generation, "Properties")
+    SubElement(generation, "xip:Formats")
+    SubElement(generation, "xip:Properties")
 
 
 def __make_bitstream__(xip, file_name, full_path, callback, location=None):
-    bitstream = SubElement(xip, 'Bitstream')
-    filename_element = SubElement(bitstream, "Filename")
+    bitstream = SubElement(xip, 'xip:Bitstream')
+    filename_element = SubElement(bitstream, "xip:Filename")
     filename_element.text = file_name
-    filesize = SubElement(bitstream, "FileSize")
+    filesize = SubElement(bitstream, "xip:FileSize")
     file_stats = os.stat(full_path)
     filesize.text = str(file_stats.st_size)
-    physical_location = SubElement(bitstream, "PhysicalLocation")
+    physical_location = SubElement(bitstream, "xip:PhysicalLocation")
     physical_location.text = location
-    fixities = SubElement(bitstream, "Fixities")
+    fixities = SubElement(bitstream, "xip:Fixities")
     fixity_result = callback(file_name, full_path)
     if type(fixity_result) == tuple:
-        fixity = SubElement(fixities, "Fixity")
-        fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
-        fixity_value = SubElement(fixity, "FixityValue")
+        fixity = SubElement(fixities, "xip:Fixity")
+        fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
+        fixity_value = SubElement(fixity, "xip:FixityValue")
        fixity_algorithm_ref.text = fixity_result[0]
         fixity_value.text = fixity_result[1]
     elif type(fixity_result) == dict:
         for key, val in fixity_result.items():
-            fixity = SubElement(fixities, "Fixity")
-            fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
-            fixity_value = SubElement(fixity, "FixityValue")
+            fixity = SubElement(fixities, "xip:Fixity")
+            fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
+            fixity_value = SubElement(fixity, "xip:FixityValue")
             fixity_algorithm_ref.text = key
             fixity_value.text = val
     else:
@@ -214,17 +209,17 @@ def __make_bitstream__(xip, file_name, full_path, callback, location=None):
 
 
 def __make_representation_multiple_co__(xip, rep_name, rep_type, rep_files, io_ref):
-    representation = SubElement(xip, 'Representation')
-    io_link = SubElement(representation, 'InformationObject')
+    representation = SubElement(xip, 'xip:Representation')
+    io_link = SubElement(representation, 'xip:InformationObject')
     io_link.text = io_ref
-    access_name = SubElement(representation, 'Name')
+    access_name = SubElement(representation, 'xip:Name')
     access_name.text = rep_name
-    access_type = SubElement(representation, 'Type')
+    access_type = SubElement(representation, 'xip:Type')
     access_type.text = rep_type
-    content_objects = SubElement(representation, 'ContentObjects')
+    content_objects = SubElement(representation, 'xip:ContentObjects')
     refs_dict = {}
     for f in rep_files:
-        content_object = SubElement(content_objects, 'ContentObject')
+        content_object = SubElement(content_objects, 'xip:ContentObject')
         content_object_ref = str(uuid.uuid4())
         content_object.text = content_object_ref
         refs_dict[content_object_ref] = f
@@ -248,12 +243,9 @@ def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Titl
             headers.add(xml_tag)
             break
 
-    namespaces = {"version": "2.0",
-                  "xmlns:
-                  "xmlns:
-                  "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
-                  "xmlns:csv": xml_namespace,
-                  "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
+    namespaces = {"version": "2.0", "xmlns:xsl": "http://www.w3.org/1999/XSL/Transform",
+                  "xmlns:fn": "http://www.w3.org/2005/xpath-functions", "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
+                  "xmlns:csv": xml_namespace, "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
                   "exclude-result-prefixes": "csv"}
 
     if additional_namespaces is not None:
@@ -323,8 +315,7 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
             headers.add(xml_tag)
             break
 
-    namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema",
-                  "attributeFormDefault": "unqualified",
+    namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema", "attributeFormDefault": "unqualified",
                   "elementFormDefault": "qualified",
                   "targetNamespace": xml_namespace}
 
@@ -399,9 +390,7 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
         else:
             xpath_expression = f"//{short_name}:{root_element}/{short_name}:{header}"
 
-        attr = {"indexName": header, "displayName": header,
-                "xpath": xpath_expression,
-                "indexType": "STRING_DEFAULT"}
+        attr = {"indexName": header, "displayName": header, "xpath": xpath_expression, "indexType": "STRING_DEFAULT"}
         xml_term = xml.etree.ElementTree.SubElement(xml_index, "term", attr)
 
     if additional_namespaces is not None:
@@ -470,8 +459,9 @@ def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename
             yield name
 
 
-def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None,
-
+def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None, parent_folder=None,
+                          compress=True,
+                          **kwargs):
     # some basic validation
     if export_folder is None:
         export_folder = tempfile.gettempdir()
@@ -492,7 +482,7 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
     content_type = kwargs.get('CustomType', "")
 
     if not compress:
-        shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")
+        shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None, description="UnCompressed ZIP file")
 
     has_preservation_files = bool((preservation_files_dict is not None) and (len(preservation_files_dict) > 0))
     has_access_files = bool((access_files_dict is not None) and (len(access_files_dict) > 0))
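The register_archive_format call now uses keyword arguments, which makes the previously positional None explicit as extra_args. The registered "szip" format writes an uncompressed (stored) ZIP, so already-compressed content is not recompressed when compress=False. The body of _make_stored_zipfile is not shown in this diff; a plausible stand-in looks like this (the implementation below is an assumption, not the package's code):

    import os
    import shutil
    import zipfile

    def make_stored_zipfile(base_name, base_dir, **kwargs):
        # Hypothetical archiver: identical to a normal ZIP build, but
        # with compression forced to ZIP_STORED (no deflate).
        zip_filename = base_name + ".zip"
        with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_STORED) as zf:
            for dirpath, _dirnames, filenames in os.walk(base_dir):
                for name in filenames:
                    path = os.path.join(dirpath, name)
                    zf.write(path, os.path.relpath(path, base_dir))
        return zip_filename

    shutil.register_archive_format(name="szip", function=make_stored_zipfile,
                                   extra_args=None, description="UnCompressed ZIP file")
    package = shutil.make_archive("my_package", "szip", root_dir="sip_folder")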
@@ -570,7 +560,8 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
             access_content_description = access_content_title.get("filename", default_content_objects_title)
 
             __make_content_objects__(xip, access_content_title, content_ref, io_ref, security_tag,
-                                     access_content_description,
+                                     access_content_description,
+                                     content_type)
 
     if has_preservation_files:
         for representation_name in preservation_representation_refs_dict.keys():
@@ -622,12 +613,12 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
     for identifier_key, identifier_value in identifier_map.items():
         if identifier_key:
             if identifier_value:
-                identifier = SubElement(xip, 'Identifier')
-                id_type = SubElement(identifier, "Type")
+                identifier = SubElement(xip, 'xip:Identifier')
+                id_type = SubElement(identifier, "xip:Type")
                 id_type.text = identifier_key
-                id_value = SubElement(identifier, "Value")
+                id_value = SubElement(identifier, "xip:Value")
                 id_value.text = identifier_value
-                id_io = SubElement(identifier, "Entity")
+                id_io = SubElement(identifier, "xip:Entity")
                 id_io.text = io_ref
 
     if 'Asset_Metadata' in kwargs:
@@ -637,22 +628,22 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
             if metadata_path:
                 if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
                     descriptive_metadata = xml.etree.ElementTree.parse(source=metadata_path)
-                    metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
-                    metadata_ref = SubElement(metadata, 'Ref')
+                    metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
+                    metadata_ref = SubElement(metadata, 'xip:Ref')
                     metadata_ref.text = str(uuid.uuid4())
-                    entity = SubElement(metadata, 'Entity')
+                    entity = SubElement(metadata, 'xip:Entity')
                     entity.text = io_ref
-                    content = SubElement(metadata, 'Content')
+                    content = SubElement(metadata, 'xip:Content')
                     content.append(descriptive_metadata.getroot())
                 elif isinstance(metadata_path, str):
                     try:
                         descriptive_metadata = xml.etree.ElementTree.fromstring(metadata_path)
-                        metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
-                        metadata_ref = SubElement(metadata, 'Ref')
+                        metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
+                        metadata_ref = SubElement(metadata, 'xip:Ref')
                         metadata_ref.text = str(uuid.uuid4())
-                        entity = SubElement(metadata, 'Entity')
+                        entity = SubElement(metadata, 'xip:Entity')
                         entity.text = io_ref
-                        content = SubElement(metadata, 'Content')
+                        content = SubElement(metadata, 'xip:Content')
                         content.append(descriptive_metadata)
                     except RuntimeError:
                         logging.info(f"Could not parse asset metadata in namespace {metadata_ns}")
@@ -736,71 +727,72 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
     os.mkdir(os.path.join(inner_folder, CONTENT_FOLDER))
 
     asset_map = dict()
-    xip = Element('XIP')
+    xip = Element('xip:XIP')
+    xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
     for file in asset_file_list:
         default_asset_title = os.path.splitext(os.path.basename(file))[0]
         xip, io_ref = __create_io__(xip, file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
         asset_map[file] = io_ref
-        representation = SubElement(xip, 'Representation')
-        io_link = SubElement(representation, 'InformationObject')
+        representation = SubElement(xip, 'xip:Representation')
+        io_link = SubElement(representation, 'xip:InformationObject')
         io_link.text = io_ref
-        access_name = SubElement(representation, 'Name')
+        access_name = SubElement(representation, 'xip:Name')
         access_name.text = "Preservation"
-        access_type = SubElement(representation, 'Type')
+        access_type = SubElement(representation, 'xip:Type')
         access_type.text = "Preservation"
-        content_objects = SubElement(representation, 'ContentObjects')
-        content_object = SubElement(content_objects, 'ContentObject')
+        content_objects = SubElement(representation, 'xip:ContentObjects')
+        content_object = SubElement(content_objects, 'xip:ContentObject')
         content_object_ref = str(uuid.uuid4())
         content_object.text = content_object_ref
 
         default_content_objects_title = os.path.splitext(os.path.basename(file))[0]
-        content_object = SubElement(xip, 'ContentObject')
-        ref_element = SubElement(content_object, "Ref")
+        content_object = SubElement(xip, 'xip:ContentObject')
+        ref_element = SubElement(content_object, "xip:Ref")
         ref_element.text = content_object_ref
-        title = SubElement(content_object, "Title")
+        title = SubElement(content_object, "xip:Title")
         title.text = default_content_objects_title
-        description = SubElement(content_object, "Description")
+        description = SubElement(content_object, "xip:Description")
         description.text = default_content_objects_title
-        security_tag_element = SubElement(content_object, "SecurityTag")
+        security_tag_element = SubElement(content_object, "xip:SecurityTag")
         security_tag_element.text = security_tag
-        custom_type = SubElement(content_object, "CustomType")
+        custom_type = SubElement(content_object, "xip:CustomType")
         custom_type.text = content_type
-        parent = SubElement(content_object, "Parent")
+        parent = SubElement(content_object, "xip:Parent")
         parent.text = io_ref
 
-        generation = SubElement(xip, 'Generation', {"original": "true", "active": "true"})
-        content_object = SubElement(generation, "ContentObject")
+        generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
+        content_object = SubElement(generation, "xip:ContentObject")
         content_object.text = content_object_ref
-        label = SubElement(generation, "Label")
+        label = SubElement(generation, "xip:Label")
         label.text = os.path.splitext(os.path.basename(file))[0]
-        effective_date = SubElement(generation, "EffectiveDate")
+        effective_date = SubElement(generation, "xip:EffectiveDate")
         effective_date.text = datetime.now().isoformat()
-        bitstreams = SubElement(generation, "Bitstreams")
-        bitstream = SubElement(bitstreams, "Bitstream")
+        bitstreams = SubElement(generation, "xip:Bitstreams")
+        bitstream = SubElement(bitstreams, "xip:Bitstream")
         bitstream.text = os.path.basename(file)
-        SubElement(generation, "Formats")
-        SubElement(generation, "Properties")
+        SubElement(generation, "xip:Formats")
+        SubElement(generation, "xip:Properties")
 
-        bitstream = SubElement(xip, 'Bitstream')
-        filename_element = SubElement(bitstream, "Filename")
+        bitstream = SubElement(xip, 'xip:Bitstream')
+        filename_element = SubElement(bitstream, "xip:Filename")
         filename_element.text = os.path.basename(file)
-        filesize = SubElement(bitstream, "FileSize")
+        filesize = SubElement(bitstream, "xip:FileSize")
         file_stats = os.stat(file)
         filesize.text = str(file_stats.st_size)
-        physical_location = SubElement(bitstream, "PhysicalLocation")
-        fixities = SubElement(bitstream, "Fixities")
+        physical_location = SubElement(bitstream, "xip:PhysicalLocation")
+        fixities = SubElement(bitstream, "xip:Fixities")
         fixity_result = fixity_callback(filename_element.text, file)
         if type(fixity_result) == tuple:
-            fixity = SubElement(fixities, "Fixity")
-            fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
-            fixity_value = SubElement(fixity, "FixityValue")
+            fixity = SubElement(fixities, "xip:Fixity")
+            fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
+            fixity_value = SubElement(fixity, "xip:FixityValue")
            fixity_algorithm_ref.text = fixity_result[0]
             fixity_value.text = fixity_result[1]
         elif type(fixity_result) == dict:
             for key, val in fixity_result.items():
-                fixity = SubElement(fixities, "Fixity")
-                fixity_algorithm_ref = SubElement(fixity, "FixityAlgorithmRef")
-                fixity_value = SubElement(fixity, "FixityValue")
+                fixity = SubElement(fixities, "xip:Fixity")
+                fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
+                fixity_value = SubElement(fixity, "xip:FixityValue")
                 fixity_algorithm_ref.text = key
                 fixity_value.text = val
         else:
@@ -814,12 +806,12 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
         for identifier_key, identifier_value in identifier_map_values.items():
             if identifier_key:
                 if identifier_value:
-                    identifier = SubElement(xip, 'Identifier')
-                    id_type = SubElement(identifier, "Type")
+                    identifier = SubElement(xip, 'xip:Identifier')
+                    id_type = SubElement(identifier, "xip:Type")
                     id_type.text = identifier_key
-                    id_value = SubElement(identifier, "Value")
+                    id_value = SubElement(identifier, "xip:Value")
                     id_value.text = identifier_value
-                    id_io = SubElement(identifier, "Entity")
+                    id_io = SubElement(identifier, "xip:Entity")
                     id_io.text = io_ref
 
         src_file = file
@@ -839,8 +831,9 @@ def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=
     return top_level_folder + ".zip"
 
 
-def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None,
-
+def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None, parent_folder=None,
+                          compress=True,
+                          **kwargs):
     """
 
     Create a Preservica package containing a single Asset from a multiple preservation files
@@ -888,6 +881,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
     'Preservation_Representation_Name' Name of the Preservation Representation
     'Access_Representation_Name' Name of the Access Representation
     """
+    xml.etree.ElementTree.register_namespace("xip", "http://preservica.com/XIP/v6.0")
+
     # some basic validation
     if export_folder is None:
         export_folder = tempfile.gettempdir()
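register_namespace maps the XIP v6.0 URI to the xip prefix for the whole process, so any metadata that ElementTree parses in Clark notation is re-serialised with a readable xip: prefix rather than an auto-generated ns0: one. For example:

    import xml.etree.ElementTree as ET

    ET.register_namespace("xip", "http://preservica.com/XIP/v6.0")

    title = ET.Element("{http://preservica.com/XIP/v6.0}Title")
    title.text = "My Asset"
    print(ET.tostring(title, encoding="unicode"))
    # <xip:Title xmlns:xip="http://preservica.com/XIP/v6.0">My Asset</xip:Title>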
@@ -916,17 +911,22 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
     if has_preservation_files:
         if default_asset_title is None:
             default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
-
         # create the asset
-
+        if io_ref is None:
+            xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
 
     if has_access_files:
         if default_asset_title is None:
             default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
-
         if io_ref is None:
             xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
 
+    if io_ref is None:
+        default_asset_title = kwargs.get('Title', None)
+        if default_asset_title is None:
+            default_asset_title = "New Asset"
+        xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
+
     if has_preservation_files:
         # add the content objects
         representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
@@ -938,7 +938,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
         # add the content objects
         access_name = kwargs.get('Access_Representation_Name', "Access")
         access_refs_dict = __make_representation_multiple_co__(xip, rep_name=access_name, rep_type="Access",
-                                                               rep_files=access_files_list,
+                                                               rep_files=access_files_list,
+                                                               io_ref=io_ref)
 
     if has_preservation_files:
 
@@ -955,7 +956,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
             preservation_content_description = preservation_content_description[filename]
 
             __make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag,
-                                     preservation_content_description,
+                                     preservation_content_description,
+                                     content_type)
 
     if has_access_files:
 
@@ -1018,40 +1020,51 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
     for identifier_key, identifier_value in identifier_map.items():
         if identifier_key:
             if identifier_value:
-                identifier = SubElement(xip, 'Identifier')
-                id_type = SubElement(identifier, "Type")
+                identifier = SubElement(xip, 'xip:Identifier')
+                id_type = SubElement(identifier, "xip:Type")
                 id_type.text = identifier_key
-                id_value = SubElement(identifier, "Value")
+                id_value = SubElement(identifier, "xip:Value")
                 id_value.text = identifier_value
-                id_io = SubElement(identifier, "Entity")
+                id_io = SubElement(identifier, "xip:Entity")
                 id_io.text = io_ref
 
     if 'Asset_Metadata' in kwargs:
         metadata_map = kwargs.get('Asset_Metadata')
         for metadata_ns, metadata_path in metadata_map.items():
             if metadata_ns:
-                if metadata_path:
+                if metadata_path and isinstance(metadata_path, str):
                     if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
                         descriptive_metadata = xml.etree.ElementTree.parse(source=metadata_path)
-                        metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
-                        metadata_ref = SubElement(metadata, 'Ref')
+                        metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
+                        metadata_ref = SubElement(metadata, 'xip:Ref')
                         metadata_ref.text = str(uuid.uuid4())
-                        entity = SubElement(metadata, 'Entity')
+                        entity = SubElement(metadata, 'xip:Entity')
                         entity.text = io_ref
-                        content = SubElement(metadata, 'Content')
+                        content = SubElement(metadata, 'xip:Content')
                         content.append(descriptive_metadata.getroot())
                     elif isinstance(metadata_path, str):
                         try:
                             descriptive_metadata = xml.etree.ElementTree.fromstring(metadata_path)
-                            metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
-                            metadata_ref = SubElement(metadata, 'Ref')
+                            metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
+                            metadata_ref = SubElement(metadata, 'xip:Ref')
                             metadata_ref.text = str(uuid.uuid4())
-                            entity = SubElement(metadata, 'Entity')
+                            entity = SubElement(metadata, 'xip:Entity')
                             entity.text = io_ref
-                            content = SubElement(metadata, 'Content')
+                            content = SubElement(metadata, 'xip:Content')
                             content.append(descriptive_metadata)
                         except RuntimeError:
                             logging.info(f"Could not parse asset metadata in namespace {metadata_ns}")
+                if metadata_path and isinstance(metadata_path, list):
+                    for path in metadata_path:
+                        if os.path.exists(path) and os.path.isfile(path):
+                            descriptive_metadata = xml.etree.ElementTree.parse(source=path)
+                            metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
+                            metadata_ref = SubElement(metadata, 'xip:Ref')
+                            metadata_ref.text = str(uuid.uuid4())
+                            entity = SubElement(metadata, 'xip:Entity')
+                            entity.text = io_ref
+                            content = SubElement(metadata, 'xip:Content')
+                            content.append(descriptive_metadata.getroot())
 
     if xip is not None:
         export_folder = export_folder
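With the reworked Asset_Metadata handling, a dictionary value may now be a path to an XML file, a raw XML string, or, new in this release, a list of file paths, each attached as its own descriptive-metadata fragment. A hypothetical call (folder, file names, and namespace are illustrative only):

    from pyPreservica import complex_asset_package

    folder = None  # or a Folder object fetched via EntityAPI
    package_path = complex_asset_package(
        preservation_files_list=["master.tiff"],
        parent_folder=folder,
        Title="Example Asset",
        Asset_Metadata={
            # one namespace, several metadata documents (the new list form)
            "http://purl.org/dc/elements/1.1/": ["dc_record1.xml", "dc_record2.xml"],
        },
    )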
@@ -1146,382 +1159,14 @@ def _unpad(s):
 
 class UploadAPI(AuthenticatedAPI):
 
-    def ingest_tweet(self, twitter_user=None, tweet_id: int = 0, twitter_consumer_key=None,
-                     twitter_secret_key=None, folder=None, callback=None, **kwargs):
-
-        """
-        Ingest tweets from a twitter stream by twitter username
-
-        :param tweet_id:
-        :param str twitter_user: Twitter Username
-        :param str twitter_consumer_key: Optional asset title
-        :param str twitter_secret_key: Optional asset description
-        :param str folder: Folder to ingest into
-        :param callback callback: Optional upload progress callback
-        :raises RuntimeError:
-
-
-        """
-
-        def get_image(m, has_video_element):
-            media_url_https_ = m["media_url_https"]
-            if media_url_https_:
-                req = requests.get(media_url_https_)
-                if req.status_code == requests.codes.ok:
-                    if has_video_element:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
-                    else:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
-                    image_name_document_ = open(image_name_, "wb")
-                    image_name_document_.write(req.content)
-                    image_name_document_.close()
-                    return image_name_
-
-        def get_video(m):
-            video_info_ = m["video_info"]
-            variants_ = video_info_["variants"]
-            for v_ in variants_:
-                video_url_ = v_["url"]
-                req = requests.get(video_url_)
-                if req.status_code == requests.codes.ok:
-                    video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
-                    video_name_document_ = open(video_name_, "wb")
-                    video_name_document_.write(req.content)
-                    video_name_document_.close()
-                    return video_name_, True
-
-        entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
-                                               tenant=self.tenant)
-        if hasattr(folder, "reference"):
-            folder = entity_client.folder(folder.reference)
-        else:
-            folder = entity_client.folder(folder)
-        try:
-            import tweepy
-        except ImportError:
-            logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-            raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-        config = configparser.ConfigParser()
-        config.read('credentials.properties')
-        if twitter_consumer_key is None:
-            twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
-            if twitter_consumer_key is None:
-                try:
-                    twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-        if twitter_secret_key is None:
-            twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
-            if twitter_secret_key is None:
-                try:
-                    twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-
-        api = None
-        try:
-            auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
-            api = tweepy.API(auth, wait_on_rate_limit=True)
-        except TweepError:
-            logger.error("No valid Twitter API keys. Could not authenticate")
-            raise RuntimeError("No valid Twitter API keys. Could not authenticate")
-        if api is not None:
-            logger.debug(api)
-            tweet = api.get_status(tweet_id, tweet_mode="extended", include_entities=True)
-            created_at = tweet.created_at
-            id_str = tweet.id_str
-            author = tweet.author.name
-            tweet_entities = tweet.entities
-            hashtags = dict()
-            if 'hashtags' in tweet_entities:
-                hashtags = tweet.entities['hashtags']
-            entities = entity_client.identifier("tweet_id", id_str.strip())
-            if len(entities) > 0:
-                logger.warning("Tweet already exists, skipping....")
-                return
-            logger.info(f"Processing tweet {id_str} ...")
-            tid = tweet.id
-            content_objects = list()
-            full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
-            text = tweet.full_text
-            full_text = full_tweet.full_text
-            file_name = f"{{{id_str}}}_[{twitter_user}].json"
-            json_doc = json.dumps(full_tweet._json)
-            json_file = open(file_name, "wt", encoding="utf-8")
-            json_file.write(json_doc)
-            json_file.close()
-            content_objects.append(file_name)
-            if hasattr(full_tweet, "extended_entities"):
-                extended_entities = full_tweet.extended_entities
-                if "media" in extended_entities:
-                    media = extended_entities["media"]
-                    for med in media:
-                        media_id_str = med["id_str"]
-                        has_video = False
-                        if "video_info" in med:
-                            co, has_video = get_video(med)
-                            content_objects.append(co)
-                            if has_video:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-                            continue
-                        if "media_url_https" in med:
-                            co = get_image(med, has_video)
-                            content_objects.append(co)
-            identifiers = dict()
-            asset_metadata = dict()
-            identifiers["tweet_id"] = id_str
-
-            user = full_tweet._json['user']
-
-            if full_tweet._json.get('retweeted_status'):
-                retweeted_status = full_tweet._json['retweeted_status']
-                if retweeted_status.get("extended_entities"):
-                    extended_entities = retweeted_status["extended_entities"]
-                    if "media" in extended_entities:
-                        media = extended_entities["media"]
-                        for med in media:
-                            media_id_str = med["id_str"]
-                            has_video = False
-                            if "video_info" in med:
-                                co, has_video = get_video(med)
-                                content_objects.append(co)
-                                continue
-                            if "media_url_https" in med:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-
-            xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
-            xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
-            xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
-            xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
-            xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
-            for h in hashtags:
-                xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
-
-            xml.etree.ElementTree.SubElement(xml_object, "name").text = author
-            xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
-            xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
-
-            xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
-
-            metadata_document = open("metadata.xml", "wt", encoding="utf-8")
-            metadata_document.write(xml_request.decode("utf-8"))
-            metadata_document.close()
-
-            asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
-
-            security_tag = kwargs.get("SecurityTag", "open")
-            asset_title = kwargs.get("Title", text)
-            asset_description = kwargs.get("Description", full_text)
-
-            p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
-                                      Title=asset_title, Description=asset_description, CustomType="Tweet",
-                                      Identifiers=identifiers, Asset_Metadata=asset_metadata,
-                                      SecurityTag=security_tag)
-            self.upload_zip_package(p, folder=folder, callback=callback)
-            for ob in content_objects:
-                os.remove(ob)
-            os.remove("metadata.xml")
-
-    def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None,
-                            twitter_secret_key=None, folder=None, callback=None, **kwargs):
-
-        """
-        Ingest tweets from a twitter stream by twitter username
-
-        :param str twitter_user: Twitter Username
-        :param int num_tweets: The number of tweets from the stream
-        :param str twitter_consumer_key: Optional asset title
-        :param str twitter_secret_key: Optional asset description
-        :param str folder: Folder to ingest into
-        :param callback callback: Optional upload progress callback
-        :raises RuntimeError:
 
 
-        """
-
-        def get_image(m, has_video_element):
-            media_url_https_ = m["media_url_https"]
-            if media_url_https_:
-                req = requests.get(media_url_https_)
-                if req.status_code == requests.codes.ok:
-                    if has_video_element:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
-                    else:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
-                    image_name_document_ = open(image_name_, "wb")
-                    image_name_document_.write(req.content)
-                    image_name_document_.close()
-                    return image_name_
-
-        def get_video(m):
-            video_info_ = m["video_info"]
-            variants_ = video_info_["variants"]
-            for v_ in variants_:
-                if v_['content_type'] == 'video/mp4':
-                    video_url_ = v_["url"]
-                    with requests.get(video_url_, stream=True) as req:
-                        video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
-                        with open(video_name_, 'wb') as video_name_document_:
-                            for chunk in req.iter_content(chunk_size=1024):
-                                video_name_document_.write(chunk)
-                                video_name_document_.flush()
-                    return video_name_, True
-
-        entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
-                                               tenant=self.tenant)
-        if hasattr(folder, "reference"):
-            folder = entity_client.folder(folder.reference)
-        else:
-            folder = entity_client.folder(folder)
-        try:
-            import tweepy
-        except ImportError:
-            logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-            raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-        config = configparser.ConfigParser()
-        config.read('credentials.properties')
-        if twitter_consumer_key is None:
-            twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
-            if twitter_consumer_key is None:
-                try:
-                    twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-        if twitter_secret_key is None:
-            twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
-            if twitter_secret_key is None:
-                try:
-                    twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-
-        api = None
-        try:
-            auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
-            api = tweepy.API(auth, wait_on_rate_limit=True)
-        except TweepError:
-            logger.error("No valid Twitter API keys. Could not authenticate")
-            raise RuntimeError("No valid Twitter API keys. Could not authenticate")
-        if api is not None:
-            logger.debug(api)
-            for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
-                created_at = tweet.created_at
-                id_str = tweet.id_str
-                author = tweet.author.name
-                tweet_entities = tweet.entities
-                hashtags = dict()
-                if 'hashtags' in tweet_entities:
-                    hashtags = tweet.entities['hashtags']
-                entities = entity_client.identifier("tweet_id", id_str.strip())
-                if len(entities) > 0:
-                    logger.warning("Tweet already exists, skipping....")
-                    continue
-                logger.info(f"Processing tweet {id_str} ...")
-                tid = tweet.id
-                content_objects = list()
-                full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
-                text = tweet.text
-                logger.debug(text)
-                full_text = full_tweet.full_text
-                file_name = f"{{{id_str}}}_[{twitter_user}].json"
-                json_doc = json.dumps(full_tweet._json)
-                json_file = open(file_name, "wt", encoding="utf-8")
-                json_file.write(json_doc)
-                json_file.close()
-                content_objects.append(file_name)
-                if hasattr(full_tweet, "extended_entities"):
-                    extended_entities = full_tweet.extended_entities
-                    if "media" in extended_entities:
-                        media = extended_entities["media"]
-                        for med in media:
-                            media_id_str = med["id_str"]
-                            has_video = False
-                            if "video_info" in med:
-                                co, has_video = get_video(med)
-                                content_objects.append(co)
-                                if has_video:
-                                    co = get_image(med, has_video)
-                                    content_objects.append(co)
-                                continue
-                            if "media_url_https" in med:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-                identifiers = {}
-                asset_metadata = {}
-                identifiers["tweet_id"] = id_str
-
-                user = full_tweet._json['user']
-
-                if full_tweet._json.get('retweeted_status'):
-                    retweeted_status = full_tweet._json['retweeted_status']
-                    if retweeted_status.get("extended_entities"):
-                        extended_entities = retweeted_status["extended_entities"]
-                        if "media" in extended_entities:
-                            media = extended_entities["media"]
-                            for med in media:
-                                media_id_str = med["id_str"]
-                                has_video = False
-                                if "video_info" in med:
-                                    co, has_video = get_video(med)
-                                    content_objects.append(co)
-                                    continue
-                                if "media_url_https" in med:
-                                    co = get_image(med, has_video)
-                                    content_objects.append(co)
-
-                xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
-                xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
-                xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
-                xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
-                xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
-                for h in hashtags:
-                    xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
-
-                xml.etree.ElementTree.SubElement(xml_object, "name").text = author
-                xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
-                xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
-
-                xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
-
-                metadata_document = open("metadata.xml", "wt", encoding="utf-8")
-                metadata_document.write(xml_request.decode("utf-8"))
-                metadata_document.close()
-
-                asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
-
-                security_tag = kwargs.get("SecurityTag", "open")
-                asset_title = kwargs.get("Title", text)
-                asset_description = kwargs.get("Description", full_text)
-
-                p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
-                                          Title=asset_title, Description=asset_description, CustomType="Tweet",
-                                          Identifiers=identifiers, Asset_Metadata=asset_metadata,
-                                          SecurityTag=security_tag)
-                self.upload_zip_package(p, folder=folder, callback=callback)
-                for ob in content_objects:
-                    os.remove(ob)
-                os.remove("metadata.xml")
-                sleep(2)
 
     def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
         """
         Ingest a web video such as YouTube etc based on the URL
 
-        :param str url: URL to the
+        :param str url: URL to the YouTube video
         :param Folder parent_folder: The folder to ingest the video into
         :param str Title: Optional asset title
         :param str Description: Optional asset description
@@ -1545,10 +1190,7 @@ class UploadAPI(AuthenticatedAPI):
             if d['status'] == 'finished':
                 logger.info('Download Complete. Uploading to Preservica ...')
 
-        ydl_opts = {
-            'outtmpl': '%(id)s.mp4',
-            'progress_hooks': [my_hook],
-        }
+        ydl_opts = {'outtmpl': '%(id)s.mp4', 'progress_hooks': [my_hook], }
 
         # if True:
         #     ydl_opts['writesubtitles'] = True
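The ydl_opts rewrite is purely cosmetic; the dictionary still names the output template and registers the progress hook. In the youtube-dl/yt-dlp style of API these options are consumed roughly like this (a sketch, not the method's actual body; which downloader the module loads is an assumption):

    import yt_dlp  # youtube_dl exposes the same interface

    def my_hook(d):
        if d['status'] == 'finished':
            print('Download Complete. Uploading to Preservica ...')

    ydl_opts = {'outtmpl': '%(id)s.mp4', 'progress_hooks': [my_hook], }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=example"])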
@@ -1622,6 +1264,52 @@ class UploadAPI(AuthenticatedAPI):
             logger.error(exception)
             raise exception
 
+    def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
+        """
+        Clean up objects in an upload bucket which are older than older_than_days.
+
+        """
+        from azure.storage.blob import ContainerClient
+
+        for location in self.upload_locations():
+            if location['containerName'] == bucket_name:
+
+                if location['type'] != 'AWS':
+                    credentials = self.upload_credentials(location['apiId'])
+                    account_key = credentials['key']
+                    session_token = credentials['sessionToken']
+                    sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
+                    container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
+                    now = datetime.now(timezone.utc)
+                    for blob in container.list_blobs():
+                        if abs((blob.last_modified - now).days) > older_than_days:
+                            logger.debug(f"Deleting expired object {blob.name}")
+                            container.delete_blob(blob.name)
+
+                if location['type'] == 'AWS':
+                    credentials = self.upload_credentials(location['apiId'])
+                    access_key = credentials['key']
+                    secret_key = credentials['secret']
+                    session_token = credentials['sessionToken']
+                    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key,
+                                            aws_session_token=session_token)
+                    s3_client = session.client("s3")
+                    paginator = s3_client.get_paginator('list_objects_v2')
+                    now = datetime.now(timezone.utc)
+                    for page in paginator.paginate(Bucket=bucket_name):
+                        if 'Contents' in page:
+                            for key in page['Contents']:
+                                last_modified = key['LastModified']
+                                if abs((last_modified - now).days) > older_than_days:
+                                    logger.debug(f"Deleting expired object {key['Key']}")
+                                    s3_client.delete_object(Bucket=bucket_name, Key=key['Key'])
+
+
+
+
+
+
+
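For context, a usage sketch of the new clean_upload_bucket method (not part of the package diff); the bucket name below is a placeholder for a location configured as 'SIP Upload':

from pyPreservica import UploadAPI

client = UploadAPI()
# remove transfer objects that have sat in the upload bucket for over 30 days
client.clean_upload_bucket("com.example.upload", older_than_days=30)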
     def upload_locations(self):
         """
         Upload locations are configured on the Sources page as 'SIP Upload'.
@@ -1650,36 +1338,60 @@ class UploadAPI(AuthenticatedAPI):
         """
         return self.upload_locations()
 
-    def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback=
+    def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback: bool = False,
                          security_tag: str = "open",
-                         delete_after_upload=True, max_MB_ingested: int = -1):
+                         delete_after_upload: bool = True, max_MB_ingested: int = -1):
+
+        from pyPreservica import EntityAPI
+
+        def entity_value(client: EntityAPI, identifier: str) -> Entity:
+            back_off: int = 5
+            while True:
+                try:
+                    entities = client.identifier("code", identifier)
+                    if bool(len(entities) > 0):
+                        return entities.pop()
+                    else:
+                        return None
+                except HTTPException as e:
+                    sleep(back_off)
+                    back_off = back_off * 2
+
+        def entity_exists(client: EntityAPI, identifier: str) -> bool:
+            back_off: int = 5
+            while True:
+                try:
+                    entities = client.identifier("code", identifier)
+                    return bool(len(entities) > 0)
+                except HTTPException as e:
+                    sleep(back_off)
+                    back_off = back_off * 2
 
         def get_parent(client, identifier, parent_reference):
-
-            if not
-
-
-            if
-                folder = entities.pop()
+            dirname_id: str = str(os.path.dirname(identifier))
+            if not dirname_id:
+                dirname_id = identifier
+            folder = entity_value(client, dirname_id)
+            if folder is not None:
                 folder = client.folder(folder.reference)
                 return folder.reference
             else:
                 return parent_reference
 
         def get_folder(client, name, tag, parent_reference, identifier):
-
-            if
+            folder = entity_value(client, identifier)
+            if folder is None:
                 logger.info(f"Creating new folder with name {name}")
                 folder = client.create_folder(name, name, tag, parent_reference)
                 client.add_identifier(folder, "code", identifier)
             else:
                 logger.info(f"Found existing folder with name {name}")
-                folder = entities.pop()
             return folder
 
-        from pyPreservica import EntityAPI
         entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
-                                  tenant=self.tenant
+                                  tenant=self.tenant,
+                                  two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
+                                  protocol=self.protocol)
 
         if preservica_parent:
             parent = entity_client.folder(preservica_parent)
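For context, a sketch of driving the crawl (not part of the package diff); the paths, bucket name, and parent reference are placeholders. Per the upload branch later in this method, passing bucket_name=None makes the crawl upload packages directly rather than via a Source bucket:

from pyPreservica import UploadAPI

client = UploadAPI()
client.crawl_filesystem(filesystem_path="/data/accessions",
                        bucket_name="com.example.upload",
                        preservica_parent="724a271f-0000-0000-0000-000000000000",
                        callback=True,
                        security_tag="open",
                        delete_after_upload=True,
                        max_MB_ingested=500)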
@@ -1705,7 +1417,7 @@ class UploadAPI(AuthenticatedAPI):
                    files.remove(file)
                    continue
                asset_code = os.path.join(code, file)
-                if
+                if not entity_exists(entity_client, asset_code):
                    bytes_ingested = bytes_ingested + os.stat(full_path).st_size
                    logger.info(f"Adding new file: {file} to package ready for upload")
                    file_identifiers = {"code": asset_code}
@@ -1718,8 +1430,19 @@ class UploadAPI(AuthenticatedAPI):
                full_path_list = [os.path.join(dirname, file) for file in files]
                package = multi_asset_package(asset_file_list=full_path_list, parent_folder=f, SecurityTag=security_tag,
                                              Identifiers=identifiers)
-
-
+                if callback:
+                    progress_display = UploadProgressConsoleCallback(package)
+                else:
+                    progress_display = None
+
+                if bucket_name is None:
+                    self.upload_zip_package(path_to_zip_package=package, callback=progress_display,
+                                            delete_after_upload=delete_after_upload)
+                else:
+                    self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
+                                              show_progress=bool(progress_display is not None),
+                                              delete_after_upload=delete_after_upload)
+
                logger.info(f"Uploaded " + "{:.1f}".format(bytes_ingested / (1024 * 1024)) + " MB")
 
                if max_MB_ingested > 0:
@@ -1727,6 +1450,37 @@ class UploadAPI(AuthenticatedAPI):
                    logger.info(f"Reached Max Upload Limit")
                    break
 
+    def upload_zip_to_Source(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
+                             show_progress=False):
+
+        """
+        Uploads a zip file package to either an Azure container or S3 bucket
+        depending on the Preservica system deployment
+
+        :param str path_to_zip_package: Path to the package
+        :param str container_name: container connected to the ingest workflow
+        :param Folder folder: The folder to ingest the package into
+        :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
+        :param bool show_progress: Show upload progress bar
+
+        """
+
+        locations = self.upload_locations()
+        for location in locations:
+            if location['containerName'] == container_name:
+                if location['type'] == 'AWS':
+                    callback = None
+                    if show_progress:
+                        callback = UploadProgressConsoleCallback(path_to_zip_package)
+                    self.upload_zip_package_to_S3(path_to_zip_package=path_to_zip_package, bucket_name=container_name,
+                                                  folder=folder,
+                                                  callback=callback, delete_after_upload=delete_after_upload)
+                else:
+                    self.upload_zip_package_to_Azure(path_to_zip_package=path_to_zip_package,
+                                                     container_name=container_name, folder=folder,
+                                                     delete_after_upload=delete_after_upload,
+                                                     show_progress=show_progress)
+
     def upload_zip_package_to_Azure(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
                                     show_progress=False):
 
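For context, a usage sketch of upload_zip_to_Source (not part of the package diff); the same call works whether the configured location is backed by S3 or Azure, and the package path and container name are placeholders:

from pyPreservica import UploadAPI

client = UploadAPI()
client.upload_zip_to_Source(path_to_zip_package="package.zip",
                            container_name="com.example.upload",
                            folder="724a271f-0000-0000-0000-000000000000",
                            show_progress=True)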
@@ -1746,97 +1500,99 @@ class UploadAPI(AuthenticatedAPI):
 
        from azure.storage.blob import ContainerClient
 
-
-
-
-
-
-
-        session_token = credentials['sessionToken']
+        locations = self.upload_locations()
+        for location in locations:
+            if location['containerName'] == container_name:
+                credentials = self.upload_credentials(location['apiId'])
+                account_key = credentials['key']
+                session_token = credentials['sessionToken']
 
-
-
+                sas_url = f"https://{account_key}.blob.core.windows.net/{container_name}"
+                container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
 
-
-
-        'status': 'ready'}
+                upload_key = str(uuid.uuid4())
+                metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': container_name, 'status': 'ready'}
 
-
-
-
-
+                if hasattr(folder, "reference"):
+                    metadata['collectionreference'] = folder.reference
+                elif isinstance(folder, str):
+                    metadata['collectionreference'] = folder
 
-
+                properties = None
 
-
+                len_bytes = Path(path_to_zip_package).stat().st_size
 
-
-
-
-
-
-
-
-
-
-
+                if show_progress:
+                    with tqdm.wrapattr(open(path_to_zip_package, 'rb'), "read", total=len_bytes) as data:
+                        blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
+                                                            length=len_bytes)
+                        properties = blob_client.get_blob_properties()
+                else:
+                    with open(path_to_zip_package, "rb") as data:
+                        blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
+                                                            length=len_bytes)
+                        properties = blob_client.get_blob_properties()
 
-
-
+                if delete_after_upload:
+                    os.remove(path_to_zip_package)
 
-
+                return properties
 
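For context (not part of the package diff): the Azure branch gets its progress bar from tqdm.wrapattr, which instruments the file handle's read() calls so the bar advances as the SDK streams the blob. A standalone sketch of the same pattern:

import os
from tqdm import tqdm

def copy_with_progress(src_path: str, dst_path: str, chunk_size: int = 1024 * 1024):
    total = os.path.getsize(src_path)
    # every src.read() call advances the bar, whoever the consumer is
    with tqdm.wrapattr(open(src_path, "rb"), "read", total=total) as src, \
            open(dst_path, "wb") as dst:
        while block := src.read(chunk_size):
            dst.write(block)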
     def upload_zip_package_to_S3(self, path_to_zip_package, bucket_name, folder=None, callback=None,
                                  delete_after_upload=False):
 
        """
-
+        Uploads a zip file package to an S3 bucket connected to a Preservica Cloud System
 
-
-
-
-
-
+        :param str path_to_zip_package: Path to the package
+        :param str bucket_name: Bucket connected to an ingest workflow
+        :param Folder folder: The folder to ingest the package into
+        :param Callable callback: Optional callback to allow the callee to monitor the upload progress
+        :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
 
-
+        """
 
        if (self.major_version < 7) and (self.minor_version < 5):
            raise RuntimeError("This call [upload_zip_package_to_S3] is only available against v6.5 systems and above")
 
-
-
-
-
-
-
-
-
-
+        logger.debug("Finding Upload Locations")
+        self.token = self.__token__()
+        locations = self.upload_locations()
+        for location in locations:
+            if location['containerName'] == bucket_name:
+                logger.debug(f"Found Upload Location {location['containerName']}")
+                logger.debug(f"Fetching Upload Credentials for {location['containerName']}")
+                credentials = self.upload_credentials(location['apiId'])
+                access_key = credentials['key']
+                secret_key = credentials['secret']
+                session_token = credentials['sessionToken']
+                endpoint = credentials['endpoint']
 
-
-
-
+                session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key,
+                                        aws_session_token=session_token)
+                s3 = session.resource(service_name="s3")
+
+                logger.debug(f"S3 Session: {s3}")
 
-
-
-
-        'status': 'ready'}
+                upload_key = str(uuid.uuid4())
+                s3_object = s3.Object(bucket_name, upload_key)
+                metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': bucket_name, 'status': 'ready'}
 
-
-
-
-
+                if hasattr(folder, "reference"):
+                    metadata['collectionreference'] = folder.reference
+                elif isinstance(folder, str):
+                    metadata['collectionreference'] = folder
 
-
-
+                metadata['size'] = str(Path(path_to_zip_package).stat().st_size)
+                metadata['createdby'] = self.username
 
-
+                metadata_map = {'Metadata': metadata}
 
-
-
+                s3_object.upload_file(path_to_zip_package, Callback=callback, ExtraArgs=metadata_map,
+                                      Config=transfer_config)
 
-
-
+                if delete_after_upload:
+                    os.remove(path_to_zip_package)
 
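For context, a usage sketch of upload_zip_package_to_S3 (not part of the package diff), assuming UploadProgressConsoleCallback is exported at package level as the diff's own calls suggest; bucket and package names are placeholders:

from pyPreservica import UploadAPI, UploadProgressConsoleCallback

client = UploadAPI()
client.upload_zip_package_to_S3(path_to_zip_package="package.zip",
                                bucket_name="com.example.upload",
                                folder="724a271f-0000-0000-0000-000000000000",
                                callback=UploadProgressConsoleCallback("package.zip"),
                                delete_after_upload=True)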
     def upload_zip_package(self, path_to_zip_package, folder=None, callback=None, delete_after_upload=False):
         """
@@ -1859,9 +1615,42 @@ class UploadAPI(AuthenticatedAPI):
        endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
        self.token = self.__token__()
 
-
-
-
+
+        retries = {
+            'max_attempts': 5,
+            'mode': 'adaptive'
+        }
+
+        def new_credentials():
+            cred_metadata: dict = {}
+            cred_metadata['access_key'] = self.__token__()
+            cred_metadata['secret_key'] = "NOT_USED"
+            cred_metadata['token'] = ""
+            cred_metadata["expiry_time"] = (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat()
+            logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
+            return cred_metadata
+
+        session = get_session()
+
+        session_credentials = RefreshableCredentials.create_from_metadata(
+            metadata=new_credentials(),
+            refresh_using=new_credentials,
+            advisory_timeout=4 * 60,
+            mandatory_timeout=12 * 60,
+            method='Preservica'
+        )
+
+        autorefresh_session = boto3.Session(botocore_session=session)
+
+        session._credentials = session_credentials
+
+        config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
+                        request_checksum_calculation="WHEN_REQUIRED",
+                        response_checksum_validation="WHEN_REQUIRED",
+                        retries=retries, tcp_keepalive=True)
+
+
+        s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)
 
        metadata = {}
        if folder is not None:
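For context (not part of the package diff): the block above keeps long uploads alive by re-minting the Preservica access token through botocore's RefreshableCredentials; the bearer token is sent as the access key, the 12-minute expiry bounds each token's life, and botocore re-mints once fewer than advisory_timeout seconds remain. A minimal standalone sketch of the same pattern, with mint_token as a hypothetical stand-in for self.__token__(); note it relies on the private _credentials attribute, exactly as the code above does:

from datetime import datetime, timedelta

import boto3
from botocore.credentials import RefreshableCredentials
from botocore.session import get_session
from dateutil.tz import tzlocal

def refreshing_session(mint_token) -> boto3.Session:
    def fresh_metadata() -> dict:
        return {
            'access_key': mint_token(),   # bearer token doubles as the access key
            'secret_key': 'NOT_USED',
            'token': '',
            'expiry_time': (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat(),
        }

    botocore_session = get_session()
    botocore_session._credentials = RefreshableCredentials.create_from_metadata(
        metadata=fresh_metadata(), refresh_using=fresh_metadata, method='custom')
    return boto3.Session(botocore_session=botocore_session)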
@@ -1874,20 +1663,48 @@ class UploadAPI(AuthenticatedAPI):
        try:
            key_id = str(uuid.uuid4()) + ".zip"
 
+
+            # how big is the package
+            package_size = os.path.getsize(path_to_zip_package)
+            if package_size > 1 * GB:
+                transfer_config.multipart_chunksize = 16 * MB  ## Min 64 Chunks
+            if package_size > 8 * GB:
+                transfer_config.multipart_chunksize = 32 * MB  ## Min 256 Chunks
+            if package_size > 24 * GB:
+                transfer_config.multipart_chunksize = 48 * MB  ## Min 512 Chunks
+            if package_size > 48 * GB:
+                transfer_config.multipart_chunksize = 64 * MB
+
+            logger.info("Using Multipart Chunk Size: " + str(transfer_config.multipart_chunksize))
+
            transfer = S3Transfer(client=s3_client, config=transfer_config)
 
            transfer.PutObjectTask = PutObjectTask
            transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
            transfer.upload_file = upload_file
 
-
-
+
+            response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
+                                            key=key_id,
+                                            extra_args=metadata,
+                                            callback=callback)
+
 
            if delete_after_upload:
                os.remove(path_to_zip_package)
 
            return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']
 
-        except
-            logger.error(
-            raise
+        except (NoCredentialsError, PartialCredentialsError) as ex:
+            logger.error(ex)
+            raise ex
+
+        except ClientError as ex:
+            logger.error(ex)
+            raise ex
+
+
+
+
+
+
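For context (not part of the package diff): the chunk-size ladder above keeps every upload's part count far below S3's 10,000-part multipart limit, matching the inline "Min ... Chunks" comments. A quick arithmetic check at each band's threshold:

MB = 1024 * 1024
GB = 1024 ** 3

for size, chunk in [(1 * GB, 16 * MB), (8 * GB, 32 * MB), (24 * GB, 48 * MB), (48 * GB, 64 * MB)]:
    print(f"{size // GB} GB at {chunk // MB} MB chunks -> {size // chunk} parts")
# 1 GB -> 64 parts, 8 GB -> 256, 24 GB -> 512, 48 GB -> 768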