pyPreservica 0.9.9__py3-none-any.whl → 3.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyPreservica/__init__.py +26 -8
- pyPreservica/adminAPI.py +877 -0
- pyPreservica/authorityAPI.py +229 -0
- pyPreservica/common.py +553 -94
- pyPreservica/contentAPI.py +331 -65
- pyPreservica/entityAPI.py +1805 -446
- pyPreservica/mdformsAPI.py +572 -0
- pyPreservica/monitorAPI.py +153 -0
- pyPreservica/opex.py +98 -0
- pyPreservica/parAPI.py +226 -0
- pyPreservica/retentionAPI.py +155 -44
- pyPreservica/settingsAPI.py +295 -0
- pyPreservica/uploadAPI.py +1120 -321
- pyPreservica/webHooksAPI.py +211 -0
- pyPreservica/workflowAPI.py +99 -47
- {pyPreservica-0.9.9.dist-info → pypreservica-3.3.4.dist-info}/METADATA +93 -66
- pypreservica-3.3.4.dist-info/RECORD +20 -0
- {pyPreservica-0.9.9.dist-info → pypreservica-3.3.4.dist-info}/WHEEL +5 -5
- pyPreservica-0.9.9.dist-info/RECORD +0 -12
- {pyPreservica-0.9.9.dist-info → pypreservica-3.3.4.dist-info/licenses}/LICENSE.txt +0 -0
- {pyPreservica-0.9.9.dist-info → pypreservica-3.3.4.dist-info}/top_level.txt +0 -0
pyPreservica/uploadAPI.py
CHANGED
|
@@ -1,30 +1,94 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pyPreservica UploadAPI module definition
|
|
3
|
+
|
|
4
|
+
A client library for the Preservica Repository Upload API
|
|
5
|
+
|
|
6
|
+
author: James Carr
|
|
7
|
+
licence: Apache License 2.0
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
|
|
1
11
|
import csv
|
|
2
|
-
import json
|
|
3
12
|
import shutil
|
|
4
13
|
import tempfile
|
|
5
14
|
import uuid
|
|
6
15
|
import xml
|
|
16
|
+
from datetime import datetime, timedelta, timezone
|
|
7
17
|
from time import sleep
|
|
8
|
-
|
|
9
|
-
import boto3
|
|
10
|
-
from datetime import datetime
|
|
11
18
|
from xml.dom import minidom
|
|
12
19
|
from xml.etree import ElementTree
|
|
13
20
|
from xml.etree.ElementTree import Element, SubElement
|
|
14
|
-
|
|
21
|
+
|
|
22
|
+
import boto3
|
|
23
|
+
import s3transfer.tasks
|
|
24
|
+
import s3transfer.upload
|
|
25
|
+
from botocore.session import get_session
|
|
26
|
+
from boto3.s3.transfer import TransferConfig, S3Transfer
|
|
15
27
|
from botocore.config import Config
|
|
16
|
-
from botocore.
|
|
28
|
+
from botocore.credentials import RefreshableCredentials
|
|
29
|
+
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
|
|
30
|
+
from dateutil.tz import tzlocal
|
|
31
|
+
from s3transfer import S3UploadFailedError
|
|
32
|
+
from tqdm import tqdm
|
|
17
33
|
|
|
18
34
|
from pyPreservica.common import *
|
|
19
35
|
from pyPreservica.common import _make_stored_zipfile
|
|
20
36
|
|
|
21
37
|
logger = logging.getLogger(__name__)
|
|
22
38
|
|
|
39
|
+
MB = 1024 * 1024
|
|
23
40
|
GB = 1024 ** 3
|
|
24
|
-
transfer_config = TransferConfig(multipart_threshold=int(
|
|
41
|
+
transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
|
|
42
|
+
|
|
43
|
+
CONTENT_FOLDER = "content"
|
|
44
|
+
PRESERVATION_CONTENT_FOLDER = "p1"
|
|
45
|
+
ACCESS_CONTENT_FOLDER = "a1"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def upload_file(self, filename, bucket, key, callback=None, extra_args=None):
    """Upload a local file to an S3 object and return the transfer result.

    Mirrors ``S3Transfer.upload_file()``, except that the value produced by
    the transfer future is handed back to the caller.

    .. seealso::
        :py:meth:`S3.Client.upload_file`
        :py:meth:`S3.Client.upload_fileobj`

    :param str filename: path of the file to upload
    :param str bucket: name of the destination bucket
    :param str key: key of the destination object
    :param callback: optional progress callback, turned into subscribers
    :param dict extra_args: optional extra arguments passed to the transfer manager
    :return: whatever ``future.result()`` yields for the upload
    :raises ValueError: if ``filename`` is not a string
    :raises S3UploadFailedError: if the transfer fails with a ``ClientError``
    """
    if not isinstance(filename, str):
        raise ValueError('Filename must be a string')

    listeners = self._get_subscribers(callback)
    transfer = self._manager.upload(filename, bucket, key, extra_args, listeners)
    try:
        outcome = transfer.result()
    except ClientError as err:
        # Backwards compatibility layer: any client error raised during the
        # transfer (not only from upload_parts) surfaces as S3UploadFailedError.
        destination = '/'.join([bucket, key])
        raise S3UploadFailedError("Failed to upload %s to %s: %s" % (filename, destination, err))
    return outcome
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class PutObjectTask(s3transfer.tasks.Task):
    """Task that uploads a whole object in one ``put_object`` request."""

    # Adapted from s3transfer/upload.py; the only difference is that the
    # response of client.put_object is returned instead of being discarded.
    def _main(self, client, fileobj, bucket, key, extra_args):
        """
        :param client: client whose ``put_object`` is called
        :param fileobj: context manager yielding the request body
        :param bucket: name of the destination bucket
        :param key: key of the destination object
        :param dict extra_args: additional keyword arguments for ``put_object``
        :return: the ``put_object`` response
        """
        with fileobj as payload:
            return client.put_object(Bucket=bucket, Key=key, Body=payload, **extra_args)
|
|
79
|
+
|
|
25
80
|
|
|
81
|
+
class CompleteMultipartUploadTask(s3transfer.tasks.Task):
    """Task that finalises a multipart upload and returns the response."""

    # Adapted from s3transfer/tasks.py; changed so the client response is returned.
    def _main(self, client, bucket, key, upload_id, parts, extra_args):
        """
        :param client: client whose ``complete_multipart_upload`` is called
        :param bucket: name of the destination bucket
        :param key: key of the destination object
        :param upload_id: id of the multipart upload to complete
        :param parts: parts listing sent as ``MultipartUpload["Parts"]``
        :param dict extra_args: additional keyword arguments for the request
        :return: the ``complete_multipart_upload`` response
        """
        request = dict(Bucket=bucket, Key=key, UploadId=upload_id, MultipartUpload={"Parts": parts})
        return client.complete_multipart_upload(**request, **extra_args)
|
|
26
87
|
|
|
27
88
|
|
|
89
|
+
s3transfer.upload.PutObjectTask = PutObjectTask
|
|
90
|
+
s3transfer.upload.CompleteMultipartUploadTask = CompleteMultipartUploadTask
|
|
91
|
+
|
|
28
92
|
|
|
29
93
|
def prettify(elem):
|
|
30
94
|
"""Return a pretty-printed XML string for the Element.
|
|
@@ -34,11 +98,13 @@ def prettify(elem):
|
|
|
34
98
|
return re_parsed.toprettyxml(indent=" ")
|
|
35
99
|
|
|
36
100
|
|
|
37
|
-
def __create_io__(file_name=None, parent_folder=None, **kwargs):
|
|
38
|
-
xip
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
101
|
+
def __create_io__(xip=None, file_name=None, parent_folder=None, **kwargs):
|
|
102
|
+
if xip is None:
|
|
103
|
+
xip = Element('xip:XIP')
|
|
104
|
+
xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
|
|
105
|
+
assert xip is not None
|
|
106
|
+
io = SubElement(xip, 'xip:InformationObject')
|
|
107
|
+
ref = SubElement(io, 'xip:Ref')
|
|
42
108
|
|
|
43
109
|
if 'IO_Identifier_callback' in kwargs:
|
|
44
110
|
ident_callback = kwargs.get('IO_Identifier_callback')
|
|
@@ -46,15 +112,15 @@ def __create_io__(file_name=None, parent_folder=None, **kwargs):
|
|
|
46
112
|
else:
|
|
47
113
|
ref.text = str(uuid.uuid4())
|
|
48
114
|
|
|
49
|
-
title = SubElement(io, 'Title')
|
|
115
|
+
title = SubElement(io, 'xip:Title')
|
|
50
116
|
title.text = kwargs.get('Title', file_name)
|
|
51
|
-
description = SubElement(io, 'Description')
|
|
117
|
+
description = SubElement(io, 'xip:Description')
|
|
52
118
|
description.text = kwargs.get('Description', file_name)
|
|
53
|
-
security = SubElement(io, 'SecurityTag')
|
|
119
|
+
security = SubElement(io, 'xip:SecurityTag')
|
|
54
120
|
security.text = kwargs.get('SecurityTag', "open")
|
|
55
|
-
custom_type = SubElement(io, 'CustomType')
|
|
121
|
+
custom_type = SubElement(io, 'xip:CustomType')
|
|
56
122
|
custom_type.text = kwargs.get('CustomType', "")
|
|
57
|
-
parent = SubElement(io, 'Parent')
|
|
123
|
+
parent = SubElement(io, 'xip:Parent')
|
|
58
124
|
|
|
59
125
|
if hasattr(parent_folder, "reference"):
|
|
60
126
|
parent.text = parent_folder.reference
|
|
@@ -65,83 +131,95 @@ def __create_io__(file_name=None, parent_folder=None, **kwargs):
|
|
|
65
131
|
|
|
66
132
|
|
|
67
133
|
def __make_representation__(xip, rep_name, rep_type, io_ref):
    """
    Append an ``xip:Representation`` holding a single content object to the XIP document.

    :param xip: parent element the representation is appended to
    :param rep_name: text of the representation's ``xip:Name``
    :param rep_type: text of the representation's ``xip:Type``
    :param io_ref: reference written to ``xip:InformationObject``
    :return: the newly generated content object reference (a UUID string)
    """
    rep_node = SubElement(xip, 'xip:Representation')
    simple_children = (
        ('xip:InformationObject', io_ref),
        ('xip:Name', rep_name),
        ('xip:Type', rep_type),
    )
    for tag, text in simple_children:
        SubElement(rep_node, tag).text = text

    # The representation lists exactly one content object, identified by a fresh UUID.
    holder = SubElement(rep_node, 'xip:ContentObjects')
    new_ref = str(uuid.uuid4())
    SubElement(holder, 'xip:ContentObject').text = new_ref
    return new_ref
|
|
80
146
|
|
|
81
147
|
|
|
82
148
|
def __make_content_objects__(xip, content_title, co_ref, io_ref, tag, content_description, content_type):
    """
    Append an ``xip:ContentObject`` entity to the XIP document.

    :param xip: parent element the content object is appended to
    :param content_title: text of ``xip:Title``
    :param co_ref: text of ``xip:Ref``
    :param io_ref: text of ``xip:Parent``
    :param tag: text of ``xip:SecurityTag``
    :param content_description: text of ``xip:Description``
    :param content_type: text of ``xip:CustomType``
    """
    co_node = SubElement(xip, 'xip:ContentObject')
    # Children are emitted in this fixed order.
    ordered_children = (
        ("xip:Ref", co_ref),
        ("xip:Title", content_title),
        ("xip:Description", content_description),
        ("xip:SecurityTag", tag),
        ("xip:CustomType", content_type),
        ("xip:Parent", io_ref),
    )
    for child_tag, child_text in ordered_children:
        SubElement(co_node, child_tag).text = child_text
|
|
96
162
|
|
|
97
163
|
|
|
98
|
-
def __make_generation__(xip, filename, co_ref, generation_label, location=None):
    """
    Append an ``xip:Generation`` (original and active) for a content object to the XIP document.

    :param xip: parent element the generation is appended to
    :param str filename: name of the bitstream file
    :param co_ref: reference of the content object the generation belongs to
    :param generation_label: label of the generation; when empty the file name without
                             its extension is used
    :param location: folder of the bitstream relative to the content folder; when None
                     the bitstream is referenced by its bare file name
    """
    generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
    content_object = SubElement(generation, "xip:ContentObject")
    content_object.text = co_ref
    label = SubElement(generation, "xip:Label")
    label.text = generation_label if generation_label else os.path.splitext(filename)[0]
    effective_date = SubElement(generation, "xip:EffectiveDate")
    effective_date.text = datetime.now().isoformat()
    bitstreams = SubElement(generation, "xip:Bitstreams")
    bitstream = SubElement(bitstreams, "xip:Bitstream")
    # Without this guard the default location=None would be formatted into
    # the literal path "None/<filename>".
    bitstream.text = filename if location is None else f"{location}/{filename}"
    SubElement(generation, "xip:Formats")
    SubElement(generation, "xip:Properties")
|
|
114
180
|
|
|
115
181
|
|
|
116
|
-
def __make_bitstream__(xip, file_name, full_path, callback, location=None):
    """
    Append an ``xip:Bitstream`` describing a file, including its fixity values, to the XIP document.

    :param xip: parent element the bitstream is appended to
    :param str file_name: text of ``xip:Filename``; also passed to the callback
    :param str full_path: path of the file on disk, used for its size and passed to the callback
    :param callback: callable ``callback(file_name, full_path)`` returning either an
                     ``(algorithm, value)`` tuple or a dict mapping algorithm to value
    :param location: text of ``xip:PhysicalLocation``
    :raises RuntimeError: if the callback returns neither a tuple nor a dict
    """
    bitstream = SubElement(xip, 'xip:Bitstream')
    filename_element = SubElement(bitstream, "xip:Filename")
    filename_element.text = file_name
    filesize = SubElement(bitstream, "xip:FileSize")
    filesize.text = str(os.stat(full_path).st_size)
    physical_location = SubElement(bitstream, "xip:PhysicalLocation")
    physical_location.text = location
    fixities = SubElement(bitstream, "xip:Fixities")

    fixity_result = callback(file_name, full_path)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses such as namedtuple or OrderedDict results.
    if isinstance(fixity_result, tuple):
        fixity_pairs = [(fixity_result[0], fixity_result[1])]
    elif isinstance(fixity_result, dict):
        fixity_pairs = list(fixity_result.items())
    else:
        logger.error("Could Not Find Fixity Value")
        raise RuntimeError("Could Not Find Fixity Value")

    for algorithm, value in fixity_pairs:
        fixity = SubElement(fixities, "xip:Fixity")
        fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
        fixity_value = SubElement(fixity, "xip:FixityValue")
        fixity_algorithm_ref.text = algorithm
        fixity_value.text = value
|
|
131
209
|
|
|
132
210
|
|
|
133
211
|
def __make_representation_multiple_co__(xip, rep_name, rep_type, rep_files, io_ref):
|
|
134
|
-
representation = SubElement(xip, 'Representation')
|
|
135
|
-
io_link = SubElement(representation, 'InformationObject')
|
|
212
|
+
representation = SubElement(xip, 'xip:Representation')
|
|
213
|
+
io_link = SubElement(representation, 'xip:InformationObject')
|
|
136
214
|
io_link.text = io_ref
|
|
137
|
-
access_name = SubElement(representation, 'Name')
|
|
215
|
+
access_name = SubElement(representation, 'xip:Name')
|
|
138
216
|
access_name.text = rep_name
|
|
139
|
-
access_type = SubElement(representation, 'Type')
|
|
217
|
+
access_type = SubElement(representation, 'xip:Type')
|
|
140
218
|
access_type.text = rep_type
|
|
141
|
-
content_objects = SubElement(representation, 'ContentObjects')
|
|
219
|
+
content_objects = SubElement(representation, 'xip:ContentObjects')
|
|
142
220
|
refs_dict = {}
|
|
143
221
|
for f in rep_files:
|
|
144
|
-
content_object = SubElement(content_objects, 'ContentObject')
|
|
222
|
+
content_object = SubElement(content_objects, 'xip:ContentObject')
|
|
145
223
|
content_object_ref = str(uuid.uuid4())
|
|
146
224
|
content_object.text = content_object_ref
|
|
147
225
|
refs_dict[content_object_ref] = f
|
|
@@ -165,12 +243,9 @@ def cvs_to_cmis_xslt(csv_file, xml_namespace, root_element, title="Metadata Titl
|
|
|
165
243
|
headers.add(xml_tag)
|
|
166
244
|
break
|
|
167
245
|
|
|
168
|
-
namespaces = {"version": "2.0",
|
|
169
|
-
"xmlns:
|
|
170
|
-
"xmlns:
|
|
171
|
-
"xmlns:xs": "http://www.w3.org/2001/XMLSchema",
|
|
172
|
-
"xmlns:csv": xml_namespace,
|
|
173
|
-
"xmlns": "http://www.tessella.com/sdb/cmis/metadata",
|
|
246
|
+
namespaces = {"version": "2.0", "xmlns:xsl": "http://www.w3.org/1999/XSL/Transform",
|
|
247
|
+
"xmlns:fn": "http://www.w3.org/2005/xpath-functions", "xmlns:xs": "http://www.w3.org/2001/XMLSchema",
|
|
248
|
+
"xmlns:csv": xml_namespace, "xmlns": "http://www.tessella.com/sdb/cmis/metadata",
|
|
174
249
|
"exclude-result-prefixes": "csv"}
|
|
175
250
|
|
|
176
251
|
if additional_namespaces is not None:
|
|
@@ -240,14 +315,13 @@ def cvs_to_xsd(csv_file, xml_namespace, root_element, export_folder=None, additi
|
|
|
240
315
|
headers.add(xml_tag)
|
|
241
316
|
break
|
|
242
317
|
|
|
243
|
-
namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema",
|
|
244
|
-
"attributeFormDefault": "unqualified",
|
|
318
|
+
namespaces = {"xmlns:xs": "http://www.w3.org/2001/XMLSchema", "attributeFormDefault": "unqualified",
|
|
245
319
|
"elementFormDefault": "qualified",
|
|
246
320
|
"targetNamespace": xml_namespace}
|
|
247
321
|
|
|
248
322
|
if additional_namespaces is not None:
|
|
249
323
|
for prefix, uri in additional_namespaces.items():
|
|
250
|
-
namespaces["xmlns:" + prefix.
|
|
324
|
+
namespaces["xmlns:" + prefix.strip()] = uri.strip()
|
|
251
325
|
|
|
252
326
|
xml_schema = xml.etree.ElementTree.Element("xs:schema", namespaces)
|
|
253
327
|
|
|
@@ -303,12 +377,12 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
|
|
|
303
377
|
|
|
304
378
|
short_name = "csv"
|
|
305
379
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
380
|
+
xml_schema_name = xml.etree.ElementTree.SubElement(xml_index, "schemaName")
|
|
381
|
+
xml_schema_name.text = title
|
|
382
|
+
xml_schema_uri = xml.etree.ElementTree.SubElement(xml_index, "schemaUri")
|
|
383
|
+
xml_schema_uri.text = xml_namespace
|
|
384
|
+
xml_short_name = xml.etree.ElementTree.SubElement(xml_index, "shortName")
|
|
385
|
+
xml_short_name.text = short_name
|
|
312
386
|
|
|
313
387
|
for header in headers:
|
|
314
388
|
if ":" in header:
|
|
@@ -316,9 +390,7 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
|
|
|
316
390
|
else:
|
|
317
391
|
xpath_expression = f"//{short_name}:{root_element}/{short_name}:{header}"
|
|
318
392
|
|
|
319
|
-
attr = {"indexName": header, "displayName": header,
|
|
320
|
-
"xpath": xpath_expression,
|
|
321
|
-
"indexType": "STRING_DEFAULT"}
|
|
393
|
+
attr = {"indexName": header, "displayName": header, "xpath": xpath_expression, "indexType": "STRING_DEFAULT"}
|
|
322
394
|
xml_term = xml.etree.ElementTree.SubElement(xml_index, "term", attr)
|
|
323
395
|
|
|
324
396
|
if additional_namespaces is not None:
|
|
@@ -338,7 +410,14 @@ def csv_to_search_xml(csv_file, xml_namespace, root_element, title="Metadata Tit
|
|
|
338
410
|
def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename", export_folder=None,
|
|
339
411
|
additional_namespaces=None):
|
|
340
412
|
"""
|
|
341
|
-
|
|
413
|
+
Export the rows of a CSV file as XML metadata documents which can be added to Preservica assets
|
|
414
|
+
|
|
415
|
+
:param str csv_file: Path to the csv file
|
|
416
|
+
:param str xml_namespace: The XML namespace for the created XML documents
|
|
417
|
+
:param str root_element: The root element for the XML documents
|
|
418
|
+
:param str file_name_column: The CSV column which should be used to name the xml files
|
|
419
|
+
:param str export_folder: The path to the export folder
|
|
420
|
+
:param dict additional_namespaces: A map of prefix, uris to use as additional namespaces
|
|
342
421
|
|
|
343
422
|
"""
|
|
344
423
|
headers = list()
|
|
@@ -380,9 +459,409 @@ def cvs_to_xml(csv_file, xml_namespace, root_element, file_name_column="filename
|
|
|
380
459
|
yield name
|
|
381
460
|
|
|
382
461
|
|
|
383
|
-
def
|
|
384
|
-
|
|
462
|
+
def generic_asset_package(preservation_files_dict=None, access_files_dict=None, export_folder=None, parent_folder=None,
                          compress=True,
                          **kwargs):
    """
    Create a package containing a single asset with any number of named Preservation and Access
    representations.

    Every key of ``preservation_files_dict`` / ``access_files_dict`` is used as a representation name and
    its value is the list of file paths that make up that representation. The files of a representation
    are copied into ``content/<sanitize(representation name)>`` inside the package.

    :param dict preservation_files_dict: representation name -> list of preservation file paths
    :param dict access_files_dict: representation name -> list of access file paths
    :param str export_folder: folder the package is written to, defaults to the system temp folder
    :param parent_folder: the folder the asset will be ingested into
    :param bool compress: use the 'zip' archive format when True, otherwise the 'szip' format
    :param kwargs: optional settings read here: SecurityTag, CustomType, Preservation_Content_Title,
                   Preservation_Content_Description, Access_Content_Title, Access_Content_Description,
                   Preservation_Generation_Label, Access_Generation_Label,
                   Preservation_files_fixity_callback, Access_files_fixity_callback, Identifiers,
                   Asset_Metadata. All kwargs are also forwarded to ``__create_io__``.
    :return: path of the created package (``<export_folder>/<asset ref>.zip``), or None when neither
             preservation nor access files were supplied
    :raises RuntimeError: if the export folder does not exist or no parent folder is given
    """
    # some basic validation
    if export_folder is None:
        export_folder = tempfile.gettempdir()
    if not os.path.isdir(export_folder):
        logger.error("Export Folder Does Not Exist")
        raise RuntimeError(export_folder, "Export Folder Does Not Exist")
    if parent_folder is None:
        logger.error("You must specify a parent folder for the package asset")
        raise RuntimeError("You must specify a parent folder for the package asset")

    security_tag = kwargs.get('SecurityTag', "open")
    content_type = kwargs.get('CustomType', "")

    if not compress:
        shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None,
                                       description="UnCompressed ZIP file")

    has_preservation_files = bool(preservation_files_dict)
    has_access_files = bool(access_files_dict)
    if not (has_preservation_files or has_access_files):
        # Nothing to package: no asset is created, so there is no XIP document
        # to attach identifiers or metadata to.
        return None

    # The asset title defaults to the name (without extension) of the first file of the first
    # representation, preferring the preservation files.
    first_files_dict = preservation_files_dict if has_preservation_files else access_files_dict
    first_files_list = next(iter(first_files_dict.values()))
    default_asset_title = os.path.splitext(os.path.basename(first_files_list[0]))[0]

    # create the asset
    xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)

    # Representations: preservation first, then access. Each maps content object ref -> file path.
    preservation_representation_refs_dict = {}
    access_representation_refs_dict = {}
    representation_sources = (
        (preservation_files_dict if has_preservation_files else {}, "Preservation",
         preservation_representation_refs_dict),
        (access_files_dict if has_access_files else {}, "Access", access_representation_refs_dict),
    )
    for files_dict, rep_type, representation_refs in representation_sources:
        for representation_name, rep_files in files_dict.items():
            representation_refs[representation_name] = __make_representation_multiple_co__(
                xip, rep_name=representation_name, rep_type=rep_type, rep_files=rep_files, io_ref=io_ref)

    # Content objects
    content_object_options = (
        (preservation_representation_refs_dict, 'Preservation_Content_Title', 'Preservation_Content_Description'),
        (access_representation_refs_dict, 'Access_Content_Title', 'Access_Content_Description'),
    )
    for representation_refs, title_option, description_option in content_object_options:
        for refs_dict in representation_refs.values():
            for content_ref, filename in refs_dict.items():
                default_content_objects_title = os.path.splitext(os.path.basename(filename))[0]
                content_title = _content_object_text(kwargs, title_option, default_content_objects_title)
                content_description = _content_object_text(kwargs, description_option,
                                                           default_content_objects_title)
                __make_content_objects__(xip, content_title, content_ref, io_ref, security_tag,
                                         content_description, content_type)

    # Generations
    generation_options = (
        (preservation_representation_refs_dict, 'Preservation_Generation_Label'),
        (access_representation_refs_dict, 'Access_Generation_Label'),
    )
    for representation_refs, label_option in generation_options:
        generation_label = kwargs.get(label_option, "")
        for representation_name, refs_dict in representation_refs.items():
            location = sanitize(representation_name)
            for content_ref, filename in refs_dict.items():
                __make_generation__(xip, os.path.basename(filename), content_ref, generation_label, location)

    # Bitstreams
    bitstream_options = (
        (preservation_representation_refs_dict, 'Preservation_files_fixity_callback'),
        (access_representation_refs_dict, 'Access_files_fixity_callback'),
    )
    for representation_refs, callback_option in bitstream_options:
        if not representation_refs:
            continue
        if callback_option in kwargs:
            callback = kwargs.get(callback_option)
        else:
            callback = Sha1FixityCallBack()
        for representation_name, refs_dict in representation_refs.items():
            location = sanitize(representation_name)
            for content_ref, filename in refs_dict.items():
                __make_bitstream__(xip, os.path.basename(filename), filename, callback, location)

    if 'Identifiers' in kwargs:
        identifier_map = kwargs.get('Identifiers')
        for identifier_key, identifier_value in identifier_map.items():
            if identifier_key and identifier_value:
                identifier = SubElement(xip, 'xip:Identifier')
                id_type = SubElement(identifier, "xip:Type")
                id_type.text = identifier_key
                id_value = SubElement(identifier, "xip:Value")
                id_value.text = identifier_value
                id_io = SubElement(identifier, "xip:Entity")
                id_io.text = io_ref

    if 'Asset_Metadata' in kwargs:
        metadata_map = kwargs.get('Asset_Metadata')
        for metadata_ns, metadata_path in metadata_map.items():
            if not (metadata_ns and metadata_path):
                continue
            if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
                metadata_root = xml.etree.ElementTree.parse(source=metadata_path).getroot()
            elif isinstance(metadata_path, str):
                # Not a file on disk: treat the value as an XML document held in a string.
                try:
                    metadata_root = xml.etree.ElementTree.fromstring(metadata_path)
                except (xml.etree.ElementTree.ParseError, RuntimeError):
                    # fromstring() signals malformed XML with ParseError.
                    logger.info(f"Could not parse asset metadata in namespace {metadata_ns}")
                    continue
            else:
                continue
            _append_asset_metadata(xip, io_ref, metadata_ns, metadata_root)

    # Write the package folder structure: <ref>/<ref>/metadata.xml and <ref>/<ref>/content/<location>/...
    top_level_folder = os.path.join(export_folder, io_ref)
    os.mkdir(top_level_folder)
    inner_folder = os.path.join(top_level_folder, io_ref)
    os.mkdir(inner_folder)
    content_folder = os.path.join(inner_folder, CONTENT_FOLDER)
    os.mkdir(content_folder)
    with open(os.path.join(inner_folder, "metadata.xml"), "wt", encoding='utf-8') as metadata:
        metadata.write(prettify(xip))

    for representation_refs in (preservation_representation_refs_dict, access_representation_refs_dict):
        for representation_name, refs_dict in representation_refs.items():
            location_folder = os.path.join(content_folder, sanitize(representation_name))
            os.makedirs(location_folder, exist_ok=True)
            for filename in refs_dict.values():
                shutil.copyfile(filename, os.path.join(location_folder, os.path.basename(filename)))

    shutil.make_archive(top_level_folder, 'zip' if compress else 'szip', top_level_folder)
    shutil.rmtree(top_level_folder)
    return top_level_folder + ".zip"


def _content_object_text(options, option_name, default):
    """Return the content object title/description stored under option_name, or default."""
    value = options.get(option_name, default)
    if isinstance(value, dict):
        # NOTE(review): the dict form is looked up with the literal key "filename",
        # not with the name of the file being processed - confirm this is intended.
        value = value.get("filename", default)
    return value


def _append_asset_metadata(xip, io_ref, schema_uri, metadata_root):
    """Append an xip:Metadata element wrapping metadata_root and linked to the asset io_ref."""
    metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': schema_uri})
    metadata_ref = SubElement(metadata, 'xip:Ref')
    metadata_ref.text = str(uuid.uuid4())
    entity = SubElement(metadata, 'xip:Entity')
    entity.text = io_ref
    content = SubElement(metadata, 'xip:Content')
    content.append(metadata_root)
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def multi_asset_package(asset_file_list=None, export_folder=None, parent_folder=None, compress=True, **kwargs):
    """
    Create a package containing multiple assets, all the assets are ingested into the same parent folder provided
    by the parent_folder argument.

    For every file in ``asset_file_list`` the XIP document receives one information object
    (created through ``__create_io__``), one "Preservation" representation, one content object,
    one generation and one bitstream. The file itself is copied into the package content folder
    under its base name.

    :param asset_file_list: List of files. One asset per file
    :param export_folder: Location where the package is written to, defaults to the system temp folder
    :param parent_folder: The folder the assets will be ingested into
    :param compress: Bool, compress the package
    :param kwargs: Optional settings; all of them are also forwarded to ``__create_io__``:
                   'SecurityTag' security tag written on each content object (default "open"),
                   'CustomType' custom type written on each content object (default ""),
                   'Preservation_files_fixity_callback' callable invoked as ``callback(file_name, path)``
                   returning either an ``(algorithm, value)`` tuple or a dict of algorithm -> value,
                   'Identifiers' dict keyed by ``str(file)`` whose values are dicts of
                   identifier type -> identifier value
    :return: Path of the created package, i.e. the working folder path with ".zip" appended
    :raises RuntimeError: If the export folder does not exist, no parent folder or file list is given,
                          or the fixity callback returns neither a tuple nor a dict
    """

    # some basic validation
    if export_folder is None:
        export_folder = tempfile.gettempdir()
    if not os.path.isdir(export_folder):
        logger.error("Export Folder Does Not Exist")
        raise RuntimeError(export_folder, "Export Folder Does Not Exist")
    if parent_folder is None:
        logger.error("You must specify a parent folder for the package asset")
        raise RuntimeError("You must specify a parent folder for the package asset")
    if asset_file_list is None:
        # Fail before anything is written to disk instead of raising a TypeError
        # half way through the build.
        logger.error("You must specify a list of asset files for the package")
        raise RuntimeError("You must specify a list of asset files for the package")

    security_tag = kwargs.get('SecurityTag', "open")
    content_type = kwargs.get('CustomType', "")

    if not compress:
        shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")

    if 'Preservation_files_fixity_callback' in kwargs:
        fixity_callback = kwargs.get('Preservation_files_fixity_callback')
    else:
        fixity_callback = Sha1FixityCallBack()

    # A missing or None 'Identifiers' entry simply means no identifiers are added.
    identifier_map = kwargs.get('Identifiers') or {}

    package_id = str(uuid.uuid4())
    top_level_folder = os.path.join(export_folder, package_id)
    os.mkdir(top_level_folder)
    try:
        inner_folder = os.path.join(top_level_folder, package_id)
        os.mkdir(inner_folder)
        content_folder = os.path.join(inner_folder, CONTENT_FOLDER)
        os.mkdir(content_folder)

        xip = Element('xip:XIP')
        xip.set('xmlns:xip', 'http://preservica.com/XIP/v6.0')
        for file in asset_file_list:
            file_name = os.path.basename(file)
            default_title = os.path.splitext(file_name)[0]

            xip, io_ref = __create_io__(xip, file_name=default_title, parent_folder=parent_folder, **kwargs)

            # Representation linking the information object to its single content object
            representation = SubElement(xip, 'xip:Representation')
            SubElement(representation, 'xip:InformationObject').text = io_ref
            SubElement(representation, 'xip:Name').text = "Preservation"
            SubElement(representation, 'xip:Type').text = "Preservation"
            content_objects = SubElement(representation, 'xip:ContentObjects')
            content_object_ref = str(uuid.uuid4())
            SubElement(content_objects, 'xip:ContentObject').text = content_object_ref

            # Content object
            content_object = SubElement(xip, 'xip:ContentObject')
            SubElement(content_object, "xip:Ref").text = content_object_ref
            SubElement(content_object, "xip:Title").text = default_title
            SubElement(content_object, "xip:Description").text = default_title
            SubElement(content_object, "xip:SecurityTag").text = security_tag
            SubElement(content_object, "xip:CustomType").text = content_type
            SubElement(content_object, "xip:Parent").text = io_ref

            # Generation
            generation = SubElement(xip, 'xip:Generation', {"original": "true", "active": "true"})
            SubElement(generation, "xip:ContentObject").text = content_object_ref
            SubElement(generation, "xip:Label").text = default_title
            SubElement(generation, "xip:EffectiveDate").text = datetime.now().isoformat()
            bitstreams = SubElement(generation, "xip:Bitstreams")
            SubElement(bitstreams, "xip:Bitstream").text = file_name
            SubElement(generation, "xip:Formats")
            SubElement(generation, "xip:Properties")

            # Bitstream
            bitstream = SubElement(xip, 'xip:Bitstream')
            SubElement(bitstream, "xip:Filename").text = file_name
            filesize = SubElement(bitstream, "xip:FileSize")
            filesize.text = str(os.stat(file).st_size)
            # The element is emitted empty; it only has to be present in the document.
            SubElement(bitstream, "xip:PhysicalLocation")
            fixities = SubElement(bitstream, "xip:Fixities")

            fixity_result = fixity_callback(file_name, file)
            if isinstance(fixity_result, tuple):
                fixity_pairs = [(fixity_result[0], fixity_result[1])]
            elif isinstance(fixity_result, dict):
                fixity_pairs = list(fixity_result.items())
            else:
                logger.error("Could Not Find Fixity Value")
                raise RuntimeError("Could Not Find Fixity Value")
            for algorithm, value in fixity_pairs:
                fixity = SubElement(fixities, "xip:Fixity")
                fixity_algorithm_ref = SubElement(fixity, "xip:FixityAlgorithmRef")
                fixity_value = SubElement(fixity, "xip:FixityValue")
                fixity_algorithm_ref.text = algorithm
                fixity_value.text = value

            # Optional third party identifiers, looked up by the file path string
            if str(file) in identifier_map:
                for identifier_key, identifier_value in identifier_map[str(file)].items():
                    if identifier_key and identifier_value:
                        identifier = SubElement(xip, 'xip:Identifier')
                        SubElement(identifier, "xip:Type").text = identifier_key
                        SubElement(identifier, "xip:Value").text = identifier_value
                        SubElement(identifier, "xip:Entity").text = io_ref

            shutil.copyfile(file, os.path.join(content_folder, file_name))

        metadata_path = os.path.join(inner_folder, "metadata.xml")
        with open(metadata_path, "wt", encoding='utf-8') as metadata:
            metadata.write(prettify(xip))

        shutil.make_archive(top_level_folder, 'zip' if compress else 'szip', top_level_folder)
    except Exception:
        # Do not leave a half built working folder behind when the build fails.
        shutil.rmtree(top_level_folder, ignore_errors=True)
        raise

    shutil.rmtree(top_level_folder)
    return top_level_folder + ".zip"
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def complex_asset_package(preservation_files_list=None, access_files_list=None, export_folder=None, parent_folder=None,
|
|
835
|
+
compress=True,
|
|
836
|
+
**kwargs):
|
|
837
|
+
"""
|
|
838
|
+
|
|
839
|
+
Create a Preservica package containing a single Asset from multiple preservation files
|
|
840
|
+
and optional access files.
|
|
841
|
+
The Asset contains multiple Content Objects within each representation.
|
|
842
|
+
|
|
843
|
+
If only the preservation files are provided the asset has one representation
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
:param list preservation_files_list: Paths to the preservation files
|
|
847
|
+
:param list access_files_list: Paths to the access files
|
|
848
|
+
:param str export_folder: The package location folder
|
|
849
|
+
:param Folder parent_folder: The folder to ingest the asset into
|
|
850
|
+
:param bool compress: Compress the ZIP file
|
|
851
|
+
:param str Title: Asset Title
|
|
852
|
+
:param str Description: Asset Description
|
|
853
|
+
:param str SecurityTag: Asset SecurityTag
|
|
854
|
+
:param str CustomType: Asset CustomType
|
|
855
|
+
:param str Preservation_Content_Title: Title of the Preservation Representation Content Object
|
|
856
|
+
:param str Preservation_Content_Description: Description of the Preservation Representation Content Object
|
|
857
|
+
:param str Access_Content_Title: Title of the Access Representation Content Object
|
|
858
|
+
:param str Access_Content_Description: Description of the Access Representation Content Object
|
|
859
|
+
:param dict Asset_Metadata: Dictionary of Asset metadata documents
|
|
860
|
+
:param dict Identifiers: Dictionary of Asset 3rd party identifiers
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
|
|
386
865
|
optional kwargs map
|
|
387
866
|
'Title' Asset Title
|
|
388
867
|
'Description' Asset Description
|
|
@@ -399,7 +878,11 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
399
878
|
'Preservation_files_fixity_callback' Callback to allow external generated fixity values
|
|
400
879
|
'Access_files_fixity_callback' Callback to allow external generated fixity values
|
|
401
880
|
'IO_Identifier_callback' Callback to allow external generated Asset identifier
|
|
881
|
+
'Preservation_Representation_Name' Name of the Preservation Representation
|
|
882
|
+
'Access_Representation_Name' Name of the Access Representation
|
|
402
883
|
"""
|
|
884
|
+
xml.etree.ElementTree.register_namespace("xip", "http://preservica.com/XIP/v6.0")
|
|
885
|
+
|
|
403
886
|
# some basic validation
|
|
404
887
|
if export_folder is None:
|
|
405
888
|
export_folder = tempfile.gettempdir()
|
|
@@ -413,8 +896,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
413
896
|
io_ref = None
|
|
414
897
|
xip = None
|
|
415
898
|
default_asset_title = None
|
|
416
|
-
preservation_refs_dict =
|
|
417
|
-
access_refs_dict =
|
|
899
|
+
preservation_refs_dict = {}
|
|
900
|
+
access_refs_dict = {}
|
|
418
901
|
|
|
419
902
|
security_tag = kwargs.get('SecurityTag', "open")
|
|
420
903
|
content_type = kwargs.get('CustomType', "")
|
|
@@ -428,25 +911,35 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
428
911
|
if has_preservation_files:
|
|
429
912
|
if default_asset_title is None:
|
|
430
913
|
default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
|
|
431
|
-
|
|
432
914
|
# create the asset
|
|
433
|
-
|
|
915
|
+
if io_ref is None:
|
|
916
|
+
xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
|
|
434
917
|
|
|
435
918
|
if has_access_files:
|
|
436
919
|
if default_asset_title is None:
|
|
437
920
|
default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
|
|
438
|
-
|
|
439
921
|
if io_ref is None:
|
|
440
922
|
xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
|
|
441
923
|
|
|
924
|
+
if io_ref is None:
|
|
925
|
+
default_asset_title = kwargs.get('Title', None)
|
|
926
|
+
if default_asset_title is None:
|
|
927
|
+
default_asset_title = "New Asset"
|
|
928
|
+
xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
|
|
929
|
+
|
|
442
930
|
if has_preservation_files:
|
|
443
931
|
# add the content objects
|
|
444
|
-
|
|
445
|
-
|
|
932
|
+
representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
|
|
933
|
+
preservation_refs_dict = __make_representation_multiple_co__(xip, rep_name=representation_name,
|
|
934
|
+
rep_type="Preservation",
|
|
935
|
+
rep_files=preservation_files_list, io_ref=io_ref)
|
|
446
936
|
|
|
447
937
|
if has_access_files:
|
|
448
938
|
# add the content objects
|
|
449
|
-
|
|
939
|
+
access_name = kwargs.get('Access_Representation_Name', "Access")
|
|
940
|
+
access_refs_dict = __make_representation_multiple_co__(xip, rep_name=access_name, rep_type="Access",
|
|
941
|
+
rep_files=access_files_list,
|
|
942
|
+
io_ref=io_ref)
|
|
450
943
|
|
|
451
944
|
if has_preservation_files:
|
|
452
945
|
|
|
@@ -463,7 +956,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
463
956
|
preservation_content_description = preservation_content_description[filename]
|
|
464
957
|
|
|
465
958
|
__make_content_objects__(xip, preservation_content_title, content_ref, io_ref, security_tag,
|
|
466
|
-
preservation_content_description,
|
|
959
|
+
preservation_content_description,
|
|
960
|
+
content_type)
|
|
467
961
|
|
|
468
962
|
if has_access_files:
|
|
469
963
|
|
|
@@ -488,7 +982,8 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
488
982
|
|
|
489
983
|
for content_ref, filename in preservation_refs_dict.items():
|
|
490
984
|
preservation_file_name = os.path.basename(filename)
|
|
491
|
-
__make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label
|
|
985
|
+
__make_generation__(xip, preservation_file_name, content_ref, preservation_generation_label,
|
|
986
|
+
PRESERVATION_CONTENT_FOLDER)
|
|
492
987
|
|
|
493
988
|
if has_access_files:
|
|
494
989
|
|
|
@@ -496,7 +991,7 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
496
991
|
|
|
497
992
|
for content_ref, filename in access_refs_dict.items():
|
|
498
993
|
access_file_name = os.path.basename(filename)
|
|
499
|
-
__make_generation__(xip, access_file_name, content_ref, access_generation_label)
|
|
994
|
+
__make_generation__(xip, access_file_name, content_ref, access_generation_label, ACCESS_CONTENT_FOLDER)
|
|
500
995
|
|
|
501
996
|
if has_preservation_files:
|
|
502
997
|
|
|
@@ -507,7 +1002,7 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
507
1002
|
|
|
508
1003
|
for content_ref, filename in preservation_refs_dict.items():
|
|
509
1004
|
preservation_file_name = os.path.basename(filename)
|
|
510
|
-
__make_bitstream__(xip, preservation_file_name, filename, callback)
|
|
1005
|
+
__make_bitstream__(xip, preservation_file_name, filename, callback, PRESERVATION_CONTENT_FOLDER)
|
|
511
1006
|
|
|
512
1007
|
if has_access_files:
|
|
513
1008
|
|
|
@@ -518,35 +1013,58 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
518
1013
|
|
|
519
1014
|
for content_ref, filename in access_refs_dict.items():
|
|
520
1015
|
access_file_name = os.path.basename(filename)
|
|
521
|
-
__make_bitstream__(xip, access_file_name, filename, callback)
|
|
1016
|
+
__make_bitstream__(xip, access_file_name, filename, callback, ACCESS_CONTENT_FOLDER)
|
|
522
1017
|
|
|
523
1018
|
if 'Identifiers' in kwargs:
|
|
524
1019
|
identifier_map = kwargs.get('Identifiers')
|
|
525
1020
|
for identifier_key, identifier_value in identifier_map.items():
|
|
526
1021
|
if identifier_key:
|
|
527
1022
|
if identifier_value:
|
|
528
|
-
identifier = SubElement(xip, 'Identifier')
|
|
529
|
-
id_type = SubElement(identifier, "Type")
|
|
1023
|
+
identifier = SubElement(xip, 'xip:Identifier')
|
|
1024
|
+
id_type = SubElement(identifier, "xip:Type")
|
|
530
1025
|
id_type.text = identifier_key
|
|
531
|
-
id_value = SubElement(identifier, "Value")
|
|
1026
|
+
id_value = SubElement(identifier, "xip:Value")
|
|
532
1027
|
id_value.text = identifier_value
|
|
533
|
-
id_io = SubElement(identifier, "Entity")
|
|
1028
|
+
id_io = SubElement(identifier, "xip:Entity")
|
|
534
1029
|
id_io.text = io_ref
|
|
535
1030
|
|
|
536
1031
|
if 'Asset_Metadata' in kwargs:
|
|
537
1032
|
metadata_map = kwargs.get('Asset_Metadata')
|
|
538
1033
|
for metadata_ns, metadata_path in metadata_map.items():
|
|
539
1034
|
if metadata_ns:
|
|
540
|
-
if metadata_path:
|
|
1035
|
+
if metadata_path and isinstance(metadata_path, str):
|
|
541
1036
|
if os.path.exists(metadata_path) and os.path.isfile(metadata_path):
|
|
542
1037
|
descriptive_metadata = xml.etree.ElementTree.parse(source=metadata_path)
|
|
543
|
-
metadata = SubElement(xip, 'Metadata', {'schemaUri': metadata_ns})
|
|
544
|
-
metadata_ref = SubElement(metadata, 'Ref')
|
|
1038
|
+
metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
|
|
1039
|
+
metadata_ref = SubElement(metadata, 'xip:Ref')
|
|
545
1040
|
metadata_ref.text = str(uuid.uuid4())
|
|
546
|
-
entity = SubElement(metadata, 'Entity')
|
|
1041
|
+
entity = SubElement(metadata, 'xip:Entity')
|
|
547
1042
|
entity.text = io_ref
|
|
548
|
-
content = SubElement(metadata, 'Content')
|
|
1043
|
+
content = SubElement(metadata, 'xip:Content')
|
|
549
1044
|
content.append(descriptive_metadata.getroot())
|
|
1045
|
+
elif isinstance(metadata_path, str):
|
|
1046
|
+
try:
|
|
1047
|
+
descriptive_metadata = xml.etree.ElementTree.fromstring(metadata_path)
|
|
1048
|
+
metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
|
|
1049
|
+
metadata_ref = SubElement(metadata, 'xip:Ref')
|
|
1050
|
+
metadata_ref.text = str(uuid.uuid4())
|
|
1051
|
+
entity = SubElement(metadata, 'xip:Entity')
|
|
1052
|
+
entity.text = io_ref
|
|
1053
|
+
content = SubElement(metadata, 'xip:Content')
|
|
1054
|
+
content.append(descriptive_metadata)
|
|
1055
|
+
except RuntimeError:
|
|
1056
|
+
logging.info(f"Could not parse asset metadata in namespace {metadata_ns}")
|
|
1057
|
+
if metadata_path and isinstance(metadata_path, list):
|
|
1058
|
+
for path in metadata_path:
|
|
1059
|
+
if os.path.exists(path) and os.path.isfile(path):
|
|
1060
|
+
descriptive_metadata = xml.etree.ElementTree.parse(source=path)
|
|
1061
|
+
metadata = SubElement(xip, 'xip:Metadata', {'schemaUri': metadata_ns})
|
|
1062
|
+
metadata_ref = SubElement(metadata, 'xip:Ref')
|
|
1063
|
+
metadata_ref.text = str(uuid.uuid4())
|
|
1064
|
+
entity = SubElement(metadata, 'xip:Entity')
|
|
1065
|
+
entity.text = io_ref
|
|
1066
|
+
content = SubElement(metadata, 'xip:Content')
|
|
1067
|
+
content.append(descriptive_metadata.getroot())
|
|
550
1068
|
|
|
551
1069
|
if xip is not None:
|
|
552
1070
|
export_folder = export_folder
|
|
@@ -554,18 +1072,23 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
554
1072
|
os.mkdir(top_level_folder)
|
|
555
1073
|
inner_folder = os.path.join(top_level_folder, io_ref)
|
|
556
1074
|
os.mkdir(inner_folder)
|
|
557
|
-
os.
|
|
1075
|
+
content_folder = os.path.join(inner_folder, CONTENT_FOLDER)
|
|
1076
|
+
os.mkdir(content_folder)
|
|
1077
|
+
preservation_content_folder = os.path.join(content_folder, PRESERVATION_CONTENT_FOLDER)
|
|
1078
|
+
os.mkdir(preservation_content_folder)
|
|
1079
|
+
access_content_folder = os.path.join(content_folder, ACCESS_CONTENT_FOLDER)
|
|
1080
|
+
os.mkdir(access_content_folder)
|
|
558
1081
|
metadata_path = os.path.join(inner_folder, "metadata.xml")
|
|
559
1082
|
metadata = open(metadata_path, "wt", encoding='utf-8')
|
|
560
1083
|
metadata.write(prettify(xip))
|
|
561
1084
|
metadata.close()
|
|
562
1085
|
for content_ref, filename in preservation_refs_dict.items():
|
|
563
1086
|
src_file = filename
|
|
564
|
-
dst_file = os.path.join(
|
|
1087
|
+
dst_file = os.path.join(preservation_content_folder, os.path.basename(filename))
|
|
565
1088
|
shutil.copyfile(src_file, dst_file)
|
|
566
1089
|
for content_ref, filename in access_refs_dict.items():
|
|
567
1090
|
src_file = filename
|
|
568
|
-
dst_file = os.path.join(
|
|
1091
|
+
dst_file = os.path.join(access_content_folder, os.path.basename(filename))
|
|
569
1092
|
shutil.copyfile(src_file, dst_file)
|
|
570
1093
|
if compress:
|
|
571
1094
|
shutil.make_archive(top_level_folder, 'zip', top_level_folder)
|
|
@@ -578,21 +1101,29 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
|
|
|
578
1101
|
def simple_asset_package(preservation_file=None, access_file=None, export_folder=None, parent_folder=None,
|
|
579
1102
|
compress=True, **kwargs):
|
|
580
1103
|
"""
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
1104
|
+
Create a Preservica package containing a single Asset from a single preservation file
|
|
1105
|
+
and an optional access file.
|
|
1106
|
+
The Asset contains one Content Object for each representation.
|
|
1107
|
+
|
|
1108
|
+
If only the preservation file is provided the asset has one representation
|
|
1109
|
+
|
|
1110
|
+
|
|
1111
|
+
:param str preservation_file: Path to the preservation file
|
|
1112
|
+
:param str access_file: Path to the access file
|
|
1113
|
+
:param str export_folder: The package location folder
|
|
1114
|
+
:param Folder parent_folder: The folder to ingest the asset into
|
|
1115
|
+
:param bool compress: Compress the ZIP file
|
|
1116
|
+
:param str Title: Asset Title
|
|
1117
|
+
:param str Description: Asset Description
|
|
1118
|
+
:param str SecurityTag: Asset SecurityTag
|
|
1119
|
+
:param str CustomType: Asset CustomType
|
|
1120
|
+
:param str Preservation_Content_Title: Title of the Preservation Representation Content Object
|
|
1121
|
+
:param str Preservation_Content_Description: Description of the Preservation Representation Content Object
|
|
1122
|
+
:param str Access_Content_Title: Title of the Access Representation Content Object
|
|
1123
|
+
:param str Access_Content_Description: Description of the Access Representation Content Object
|
|
1124
|
+
:param dict Asset_Metadata: Dictionary of Asset metadata documents
|
|
1125
|
+
:param dict Identifiers: Dictionary of Asset 3rd party identifiers
|
|
1126
|
+
|
|
596
1127
|
"""
|
|
597
1128
|
|
|
598
1129
|
# some basic validation
|
|
@@ -618,178 +1149,35 @@ def simple_asset_package(preservation_file=None, access_file=None, export_folder
|
|
|
618
1149
|
export_folder=export_folder, parent_folder=parent_folder, compress=compress, **kwargs)
|
|
619
1150
|
|
|
620
1151
|
|
|
1152
|
+
def upload_config():
    """
    Return the object currently bound to the module level name ``transfer_config``.

    :return: The ``transfer_config`` object
    """
    current_config = transfer_config
    return current_config
|
|
1154
|
+
|
|
1155
|
+
|
|
1156
|
+
def _unpad(s):
|
|
1157
|
+
return s[:-ord(s[len(s) - 1:])]
|
|
1158
|
+
|
|
1159
|
+
|
|
621
1160
|
class UploadAPI(AuthenticatedAPI):
|
|
622
1161
|
|
|
623
|
-
def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None,
|
|
624
|
-
twitter_secret_key=None, folder=None, callback=None, **kwargs):
|
|
625
1162
|
|
|
626
|
-
|
|
627
|
-
media_url_https_ = m["media_url_https"]
|
|
628
|
-
if media_url_https_:
|
|
629
|
-
req = requests.get(media_url_https_)
|
|
630
|
-
if req.status_code == requests.codes.ok:
|
|
631
|
-
if has_video_element:
|
|
632
|
-
image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
|
|
633
|
-
else:
|
|
634
|
-
image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
|
|
635
|
-
image_name_document_ = open(image_name_, "wb")
|
|
636
|
-
image_name_document_.write(req.content)
|
|
637
|
-
image_name_document_.close()
|
|
638
|
-
return image_name_
|
|
639
|
-
|
|
640
|
-
def get_video(m):
|
|
641
|
-
video_info_ = m["video_info"]
|
|
642
|
-
variants_ = video_info_["variants"]
|
|
643
|
-
for v_ in variants_:
|
|
644
|
-
video_url_ = v_["url"]
|
|
645
|
-
req = requests.get(video_url_)
|
|
646
|
-
if req.status_code == requests.codes.ok:
|
|
647
|
-
video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
|
|
648
|
-
video_name_document_ = open(video_name_, "wb")
|
|
649
|
-
video_name_document_.write(req.content)
|
|
650
|
-
video_name_document_.close()
|
|
651
|
-
return video_name_, True
|
|
652
|
-
|
|
653
|
-
entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
|
|
654
|
-
tenant=self.tenant)
|
|
655
|
-
if hasattr(folder, "reference"):
|
|
656
|
-
folder = entity_client.folder(folder.reference)
|
|
657
|
-
else:
|
|
658
|
-
folder = entity_client.folder(folder)
|
|
659
|
-
try:
|
|
660
|
-
import tweepy
|
|
661
|
-
from tweepy import TweepError
|
|
662
|
-
except ImportError:
|
|
663
|
-
logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
|
|
664
|
-
raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
|
|
665
|
-
config = configparser.ConfigParser()
|
|
666
|
-
config.read('credentials.properties')
|
|
667
|
-
if twitter_consumer_key is None:
|
|
668
|
-
twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
|
|
669
|
-
if twitter_consumer_key is None:
|
|
670
|
-
try:
|
|
671
|
-
twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
|
|
672
|
-
except KeyError:
|
|
673
|
-
logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
|
|
674
|
-
"environment variables or credentials.properties file")
|
|
675
|
-
raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
|
|
676
|
-
"environment variables or credentials.properties file")
|
|
677
|
-
if twitter_secret_key is None:
|
|
678
|
-
twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
|
|
679
|
-
if twitter_secret_key is None:
|
|
680
|
-
try:
|
|
681
|
-
twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
|
|
682
|
-
except KeyError:
|
|
683
|
-
logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
|
|
684
|
-
"environment variables or credentials.properties file")
|
|
685
|
-
raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
|
|
686
|
-
"environment variables or credentials.properties file")
|
|
687
|
-
|
|
688
|
-
api = None
|
|
689
|
-
try:
|
|
690
|
-
auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
|
|
691
|
-
api = tweepy.API(auth, wait_on_rate_limit=True)
|
|
692
|
-
except TweepError:
|
|
693
|
-
logger.error("No valid Twitter API keys. Could not authenticate")
|
|
694
|
-
raise RuntimeError("No valid Twitter API keys. Could not authenticate")
|
|
695
|
-
if api is not None:
|
|
696
|
-
logger.debug(api)
|
|
697
|
-
for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
|
|
698
|
-
created_at = tweet.created_at
|
|
699
|
-
id_str = tweet.id_str
|
|
700
|
-
author = tweet.author.name
|
|
701
|
-
tweet_entities = tweet.entities
|
|
702
|
-
hashtags = dict()
|
|
703
|
-
if 'hashtags' in tweet_entities:
|
|
704
|
-
hashtags = tweet.entities['hashtags']
|
|
705
|
-
entities = entity_client.identifier("tweet_id", id_str.strip())
|
|
706
|
-
if len(entities) > 0:
|
|
707
|
-
logger.warning("Tweet already exists, skipping....")
|
|
708
|
-
continue
|
|
709
|
-
logger.info(f"Processing tweet {id_str} ...")
|
|
710
|
-
tid = tweet.id
|
|
711
|
-
content_objects = list()
|
|
712
|
-
full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
|
|
713
|
-
text = tweet.text
|
|
714
|
-
full_text = full_tweet.full_text
|
|
715
|
-
file_name = f"{{{id_str}}}_[{twitter_user}].json"
|
|
716
|
-
json_doc = json.dumps(full_tweet._json)
|
|
717
|
-
json_file = open(file_name, "wt", encoding="utf-8")
|
|
718
|
-
json_file.write(json_doc)
|
|
719
|
-
json_file.close()
|
|
720
|
-
content_objects.append(file_name)
|
|
721
|
-
if hasattr(full_tweet, "extended_entities"):
|
|
722
|
-
extended_entities = full_tweet.extended_entities
|
|
723
|
-
if "media" in extended_entities:
|
|
724
|
-
media = extended_entities["media"]
|
|
725
|
-
for med in media:
|
|
726
|
-
media_id_str = med["id_str"]
|
|
727
|
-
has_video = False
|
|
728
|
-
if "video_info" in med:
|
|
729
|
-
co, has_video = get_video(med)
|
|
730
|
-
content_objects.append(co)
|
|
731
|
-
continue
|
|
732
|
-
if "media_url_https" in med:
|
|
733
|
-
co = get_image(med, has_video)
|
|
734
|
-
content_objects.append(co)
|
|
735
|
-
identifiers = dict()
|
|
736
|
-
asset_metadata = dict()
|
|
737
|
-
identifiers["tweet_id"] = id_str
|
|
738
|
-
|
|
739
|
-
user = full_tweet._json['user']
|
|
740
|
-
|
|
741
|
-
if full_tweet._json.get('retweeted_status'):
|
|
742
|
-
retweeted_status = full_tweet._json['retweeted_status']
|
|
743
|
-
if retweeted_status.get("extended_entities"):
|
|
744
|
-
extended_entities = retweeted_status["extended_entities"]
|
|
745
|
-
if "media" in extended_entities:
|
|
746
|
-
media = extended_entities["media"]
|
|
747
|
-
for med in media:
|
|
748
|
-
media_id_str = med["id_str"]
|
|
749
|
-
has_video = False
|
|
750
|
-
if "video_info" in med:
|
|
751
|
-
co, has_video = get_video(med)
|
|
752
|
-
content_objects.append(co)
|
|
753
|
-
continue
|
|
754
|
-
if "media_url_https" in med:
|
|
755
|
-
co = get_image(med, has_video)
|
|
756
|
-
content_objects.append(co)
|
|
757
|
-
|
|
758
|
-
xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
|
|
759
|
-
xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
|
|
760
|
-
xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
|
|
761
|
-
xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
|
|
762
|
-
xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
|
|
763
|
-
for h in hashtags:
|
|
764
|
-
xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
|
|
765
|
-
|
|
766
|
-
xml.etree.ElementTree.SubElement(xml_object, "name").text = author
|
|
767
|
-
xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
|
|
768
|
-
xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
|
|
769
|
-
|
|
770
|
-
xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
|
|
771
|
-
|
|
772
|
-
metadata_document = open("metadata.xml", "wt", encoding="utf-8")
|
|
773
|
-
metadata_document.write(xml_request.decode("utf-8"))
|
|
774
|
-
metadata_document.close()
|
|
775
|
-
|
|
776
|
-
asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
|
|
777
|
-
|
|
778
|
-
security_tag = kwargs.get("SecurityTag", "open")
|
|
779
|
-
asset_title = kwargs.get("Title", text)
|
|
780
|
-
asset_description = kwargs.get("Description", full_text)
|
|
781
|
-
|
|
782
|
-
p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
|
|
783
|
-
Title=asset_title, Description=asset_description, CustomType="Tweet",
|
|
784
|
-
Identifiers=identifiers, Asset_Metadata=asset_metadata,
|
|
785
|
-
SecurityTag=security_tag)
|
|
786
|
-
self.upload_zip_package(p, folder=folder, callback=callback)
|
|
787
|
-
for ob in content_objects:
|
|
788
|
-
os.remove(ob)
|
|
789
|
-
os.remove("metadata.xml")
|
|
790
|
-
sleep(2)
|
|
1163
|
+
|
|
791
1164
|
|
|
792
1165
|
def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
|
|
1166
|
+
"""
|
|
1167
|
+
Ingest a web video such as YouTube etc based on the URL
|
|
1168
|
+
|
|
1169
|
+
:param str url: URL to the YouTube video
|
|
1170
|
+
:param Folder parent_folder: The folder to ingest the video into
|
|
1171
|
+
:param str Title: Optional asset title
|
|
1172
|
+
:param str Description: Optional asset description
|
|
1173
|
+
:param str SecurityTag: Optional asset security tag
|
|
1174
|
+
:param dict Identifiers: Optional asset 3rd party identifiers
|
|
1175
|
+
:param dict Asset_Metadata: Optional asset additional descriptive metadata
|
|
1176
|
+
:param callback callback: Optional upload progress callback
|
|
1177
|
+
:raises RuntimeError:
|
|
1178
|
+
|
|
1179
|
+
|
|
1180
|
+
"""
|
|
793
1181
|
try:
|
|
794
1182
|
import youtube_dl
|
|
795
1183
|
except ImportError:
|
|
@@ -802,10 +1190,7 @@ class UploadAPI(AuthenticatedAPI):
|
|
|
802
1190
|
if d['status'] == 'finished':
|
|
803
1191
|
logger.info('Download Complete. Uploading to Preservica ...')
|
|
804
1192
|
|
|
805
|
-
ydl_opts = {
|
|
806
|
-
'outtmpl': '%(id)s.mp4',
|
|
807
|
-
'progress_hooks': [my_hook],
|
|
808
|
-
}
|
|
1193
|
+
ydl_opts = {'outtmpl': '%(id)s.mp4', 'progress_hooks': [my_hook], }
|
|
809
1194
|
|
|
810
1195
|
# if True:
|
|
811
1196
|
# ydl_opts['writesubtitles'] = True
|
|
@@ -858,54 +1243,468 @@ class UploadAPI(AuthenticatedAPI):
|
|
|
858
1243
|
|
|
859
1244
|
self.upload_zip_package(path_to_zip_package=package, folder=parent_folder, callback=callback)
|
|
860
1245
|
|
|
861
|
-
def
|
|
862
|
-
|
|
863
|
-
|
|
1246
|
+
def upload_credentials(self, location_id: str):
    """
    Retrieve temporary upload credentials (Amazon STS, or Azure SAS) for an upload location.

    When the server answers 401 the access token is refreshed and the request is repeated.

    :param str location_id: The id of the upload location (callers pass a location's ``apiId`` value)
    :return: The decoded JSON body of the response
    :rtype: dict
    :raises HTTPException: For any response status other than 200 or 401
    """
    headers = {HEADER_TOKEN: self.token}
    # Build the URL from the configured protocol, as the other endpoints of this
    # class do, instead of assuming https.
    url = f'{self.protocol}://{self.server}/api/location/upload/{location_id}/upload-credentials'
    request = self.session.get(url, headers=headers)

    if request.status_code == requests.codes.ok:
        return json.loads(request.content.decode('utf-8'))

    if request.status_code == requests.codes.unauthorized:
        # The token has expired: fetch a new one and retry the same call.
        self.token = self.__token__()
        return self.upload_credentials(location_id)

    exception = HTTPException(location_id, request.status_code, request.url, "upload_credentials",
                              request.content.decode('utf-8'))
    logger.error(exception)
    raise exception
|
|
1266
|
+
|
|
1267
|
+
def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
    """
    Delete objects in an upload bucket/container which are older than ``older_than_days``.

    Every upload location whose ``containerName`` equals ``bucket_name`` is processed.
    Locations of type ``AWS`` are cleaned through boto3, all other locations through
    the Azure blob SDK.

    :param str bucket_name: Name of the upload bucket or container to clean
    :param int older_than_days: Objects last modified more than this many days ago are deleted
    """
    # Anything last modified before this instant is considered expired. Comparing against a
    # cutoff (rather than the absolute day difference) never matches objects dated in the future.
    cutoff = datetime.now(timezone.utc) - timedelta(days=older_than_days)

    for location in self.upload_locations():
        if location['containerName'] != bucket_name:
            continue

        credentials = self.upload_credentials(location['apiId'])

        if location['type'] == 'AWS':
            session = boto3.Session(aws_access_key_id=credentials['key'],
                                    aws_secret_access_key=credentials['secret'],
                                    aws_session_token=credentials['sessionToken'])
            s3_client = session.client("s3")
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name):
                # An empty bucket yields a page without a 'Contents' entry.
                for key in page.get('Contents', []):
                    if key['LastModified'] < cutoff:
                        logger.debug(f"Deleting expired object {key['Key']}")
                        s3_client.delete_object(Bucket=bucket_name, Key=key['Key'])
        else:
            # Imported here so that the Azure SDK is only required when an Azure location is cleaned.
            from azure.storage.blob import ContainerClient

            account_key = credentials['key']
            session_token = credentials['sessionToken']
            sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
            container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
            for blob in container.list_blobs():
                if blob.last_modified < cutoff:
                    logger.debug(f"Deleting expired object {blob.name}")
                    container.delete_blob(blob.name)
|
|
1306
|
+
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
|
|
1313
|
+
def upload_locations(self):
    """
    Fetch the upload locations available to this user.

    Upload locations are configured on the Sources page as 'SIP Upload'.
    When the server answers 401 the access token is refreshed and the request is repeated.

    :return: The ``locations`` entry of the JSON response; other methods of this class read
             the ``containerName``, ``type`` and ``apiId`` keys of each element
    :raises HTTPException: For any response status other than 200 or 401
    """
    headers = {HEADER_TOKEN: self.token}
    # Build the URL from the configured protocol, as the other endpoints of this
    # class do, instead of assuming https.
    url = f'{self.protocol}://{self.server}/api/location/upload'
    request = self.session.get(url, headers=headers)

    if request.status_code == requests.codes.ok:
        return json.loads(request.content.decode('utf-8'))['locations']

    if request.status_code == requests.codes.unauthorized:
        # The token has expired: fetch a new one and retry the same call.
        self.token = self.__token__()
        return self.upload_locations()

    exception = HTTPException("", request.status_code, request.url, "upload_locations",
                              request.content.decode('utf-8'))
    logger.error(exception)
    raise exception
|
|
1332
|
+
|
|
1333
|
+
def upload_buckets(self):
    """
    Get the available upload buckets.

    This is an alias of :py:meth:`upload_locations`.

    :return: Whatever ``upload_locations()`` returns, unchanged
    """
    available_locations = self.upload_locations()
    return available_locations
|
|
1340
|
+
|
|
1341
|
+
def crawl_filesystem(self, filesystem_path, bucket_name, preservica_parent, callback: bool = False,
                     security_tag: str = "open",
                     delete_after_upload: bool = True, max_MB_ingested: int = -1):
    """
    Walk a local directory tree and upload the files which are not yet in the repository.

    Every directory is mapped to a folder which carries a "code" identifier holding the
    directory path relative to the parent of ``filesystem_path``; missing folders are created.
    Files are identified the same way. Symbolic links and files whose "code" identifier
    already exists are skipped. The remaining files of a directory are packaged together
    and uploaded.

    :param str filesystem_path: Root directory to crawl
    :param str bucket_name: Upload container name; when None the package is uploaded
                            with ``upload_zip_package`` instead of ``upload_zip_to_Source``
    :param str preservica_parent: Reference of the folder to create the top level folder in,
                                  or a false value for no parent
    :param bool callback: Show upload progress on the console
    :param str security_tag: Security tag for new folders and assets
    :param bool delete_after_upload: Delete the local package after it has been uploaded
    :param int max_MB_ingested: Stop crawling once more than this many MB have been packaged;
                                values <= 0 disable the limit
    """
    from pyPreservica import EntityAPI

    # Upper bound for the delay between identifier lookup retries, in seconds.
    max_back_off = 300

    def entity_value(client: EntityAPI, identifier: str):
        """Return one entity carrying the "code" identifier, or None if there is none."""
        back_off = 5
        while True:
            try:
                entities = client.identifier("code", identifier)
            except HTTPException as e:
                # Keep retrying as before, but make the failure visible and
                # stop the delay from doubling without limit.
                logger.warning(f"Lookup of identifier {identifier} failed: {e}. Retrying in {back_off} seconds")
                sleep(back_off)
                back_off = min(back_off * 2, max_back_off)
                continue
            return entities.pop() if len(entities) > 0 else None

    def entity_exists(client: EntityAPI, identifier: str) -> bool:
        """Return True if an entity carrying the "code" identifier exists."""
        return entity_value(client, identifier) is not None

    def get_parent(client, identifier, parent_reference):
        """Return the reference of the folder mapped to the parent directory, else parent_reference."""
        dirname_id = str(os.path.dirname(identifier))
        if not dirname_id:
            dirname_id = identifier
        folder = entity_value(client, dirname_id)
        if folder is None:
            return parent_reference
        return client.folder(folder.reference).reference

    def get_folder(client, name, tag, parent_reference, identifier):
        """Return the folder mapped to identifier, creating and tagging it when missing."""
        folder = entity_value(client, identifier)
        if folder is None:
            logger.info(f"Creating new folder with name {name}")
            folder = client.create_folder(name, name, tag, parent_reference)
            client.add_identifier(folder, "code", identifier)
        else:
            logger.info(f"Found existing folder with name {name}")
        return folder

    entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
                              tenant=self.tenant,
                              two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
                              protocol=self.protocol)

    parent_ref = None
    if preservica_parent:
        parent = entity_client.folder(preservica_parent)
        logger.info(f"Folders will be created inside Preservica collection {parent.title}")
        parent_ref = parent.reference

    bytes_ingested = 0
    folder_path = os.path.normpath(filesystem_path)

    for dirname, subdirs, files in os.walk(folder_path):
        base = os.path.basename(dirname)
        # Identifier of this directory: its path relative to the parent of the crawl root.
        code = os.path.relpath(dirname, Path(folder_path).parent)
        parent_reference = get_parent(entity_client, code, parent_ref)
        folder = get_folder(entity_client, base, security_tag, parent_reference, code)

        identifiers = {}
        upload_paths = []
        for file in files:
            full_path = os.path.join(dirname, file)
            if os.path.islink(full_path):
                logger.info(f"Skipping link {file}")
                continue
            asset_code = os.path.join(code, file)
            if entity_exists(entity_client, asset_code):
                logger.info(f"Skipping file {file} already exists in repository")
                continue
            bytes_ingested += os.stat(full_path).st_size
            logger.info(f"Adding new file: {file} to package ready for upload")
            identifiers[full_path] = {"code": asset_code}
            upload_paths.append(full_path)

        if not upload_paths:
            continue

        package = multi_asset_package(asset_file_list=upload_paths, parent_folder=folder,
                                      SecurityTag=security_tag, Identifiers=identifiers)

        if bucket_name is None:
            progress_display = UploadProgressConsoleCallback(package) if callback else None
            self.upload_zip_package(path_to_zip_package=package, callback=progress_display,
                                    delete_after_upload=delete_after_upload)
        else:
            self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
                                      show_progress=bool(callback),
                                      delete_after_upload=delete_after_upload)

        logger.info("Uploaded {:.1f} MB".format(bytes_ingested / (1024 * 1024)))

        if 0 < max_MB_ingested and bytes_ingested > (1024 * 1024 * max_MB_ingested):
            logger.info("Reached Max Upload Limit")
            break
|
|
1452
|
+
|
|
1453
|
+
def upload_zip_to_Source(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
                         show_progress=False):
    """
    Upload a zip file package to either an Azure container or an S3 bucket,
    depending on the type of the upload location named ``container_name``.

    The first upload location whose ``containerName`` matches is used. If no location
    matches, nothing is uploaded and an error is logged.

    :param str path_to_zip_package: Path to the package
    :param str container_name: container connected to the ingest workflow
    :param Folder folder: The folder to ingest the package into
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :param bool show_progress: Show upload progress bar
    """
    for location in self.upload_locations():
        if location['containerName'] != container_name:
            continue

        if location['type'] == 'AWS':
            callback = UploadProgressConsoleCallback(path_to_zip_package) if show_progress else None
            self.upload_zip_package_to_S3(path_to_zip_package=path_to_zip_package, bucket_name=container_name,
                                          folder=folder,
                                          callback=callback, delete_after_upload=delete_after_upload)
        else:
            self.upload_zip_package_to_Azure(path_to_zip_package=path_to_zip_package,
                                             container_name=container_name, folder=folder,
                                             delete_after_upload=delete_after_upload,
                                             show_progress=show_progress)
        # One upload per call: a second matching location must not upload the package
        # again (it may already have been deleted by delete_after_upload).
        return

    # Make the otherwise silent no-op visible to the caller.
    logger.error(f"No upload location found with container name {container_name}, package was not uploaded")
|
|
1483
|
+
|
|
1484
|
+
def upload_zip_package_to_Azure(self, path_to_zip_package, container_name, folder=None, delete_after_upload=False,
                                show_progress=False):
    """
    Upload a zip file package to an Azure container connected to a Preservica Cloud System.

    The package is stored under a random UUID blob name in the first upload location
    whose ``containerName`` matches ``container_name``.

    :param str path_to_zip_package: Path to the package
    :param str container_name: container connected to the ingest workflow
    :param Folder folder: The folder to ingest the package into (a Folder object or a reference string)
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :param bool show_progress: Show upload progress bar
    :return: The properties of the uploaded blob, or None if no upload location matches
    :raises RuntimeError: If the server version is below 6.5
    """
    # Compare (major, minor) as a pair: testing both parts independently lets
    # versions such as 5.9 slip through.
    if (self.major_version, self.minor_version) < (6, 5):
        raise RuntimeError(
            "This call [upload_zip_package_to_Azure] is only available against v6.5 systems and above")

    from azure.storage.blob import ContainerClient

    for location in self.upload_locations():
        if location['containerName'] != container_name:
            continue

        credentials = self.upload_credentials(location['apiId'])
        account_key = credentials['key']
        session_token = credentials['sessionToken']

        sas_url = f"https://{account_key}.blob.core.windows.net/{container_name}"
        container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)

        upload_key = str(uuid.uuid4())
        metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': container_name, 'status': 'ready'}

        if hasattr(folder, "reference"):
            metadata['collectionreference'] = folder.reference
        elif isinstance(folder, str):
            metadata['collectionreference'] = folder

        len_bytes = Path(path_to_zip_package).stat().st_size

        # Open the file in its own context manager so that it is closed before the
        # package is deleted, whether or not the progress wrapper is used.
        with open(path_to_zip_package, "rb") as stream:
            if show_progress:
                with tqdm.wrapattr(stream, "read", total=len_bytes) as data:
                    blob_client = container.upload_blob(name=upload_key, data=data, metadata=metadata,
                                                        length=len_bytes)
            else:
                blob_client = container.upload_blob(name=upload_key, data=stream, metadata=metadata,
                                                    length=len_bytes)
        properties = blob_client.get_blob_properties()

        if delete_after_upload:
            os.remove(path_to_zip_package)

        return properties

    return None
|
|
1540
|
+
|
|
1541
|
+
def upload_zip_package_to_S3(self, path_to_zip_package, bucket_name, folder=None, callback=None,
                             delete_after_upload=False):
    """
    Upload a zip file package to an S3 bucket connected to a Preservica Cloud System.

    The package is stored under a random UUID key in every upload location whose
    ``containerName`` matches ``bucket_name``, using temporary credentials fetched
    for that location. Nothing is uploaded if no location matches.

    :param str path_to_zip_package: Path to the package
    :param str bucket_name: Bucket connected to an ingest workflow
    :param Folder folder: The folder to ingest the package into (a Folder object or a reference string)
    :param Callable callback: Optional callback to allow the callee to monitor the upload progress
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed
    :raises RuntimeError: If the server version is below 6.5
    """
    # Compare (major, minor) as a pair: testing both parts independently lets
    # versions such as 5.9 slip through.
    if (self.major_version, self.minor_version) < (6, 5):
        raise RuntimeError("This call [upload_zip_package_to_S3] is only available against v6.5 systems and above")

    logger.debug("Finding Upload Locations")
    self.token = self.__token__()

    for location in self.upload_locations():
        if location['containerName'] != bucket_name:
            continue

        logger.debug(f"Found Upload Location {location['containerName']}")
        logger.debug(f"Fetching Upload Credentials for {location['containerName']}")
        credentials = self.upload_credentials(location['apiId'])

        session = boto3.Session(aws_access_key_id=credentials['key'],
                                aws_secret_access_key=credentials['secret'],
                                aws_session_token=credentials['sessionToken'])
        s3 = session.resource(service_name="s3")

        logger.debug(f"S3 Session: {s3}")

        upload_key = str(uuid.uuid4())
        s3_object = s3.Object(bucket_name, upload_key)

        # Object metadata stored with the package.
        metadata = {'key': upload_key, 'name': upload_key + ".zip", 'bucket': bucket_name, 'status': 'ready'}

        if hasattr(folder, "reference"):
            metadata['collectionreference'] = folder.reference
        elif isinstance(folder, str):
            metadata['collectionreference'] = folder

        metadata['size'] = str(Path(path_to_zip_package).stat().st_size)
        metadata['createdby'] = self.username

        s3_object.upload_file(path_to_zip_package, Callback=callback, ExtraArgs={'Metadata': metadata},
                              Config=transfer_config)

        if delete_after_upload:
            os.remove(path_to_zip_package)
|
|
883
|
-
except ClientError as e:
|
|
884
|
-
raise e
|
|
885
1596
|
|
|
886
|
-
def
|
|
1597
|
+
def upload_zip_package(self, path_to_zip_package, folder=None, callback=None, delete_after_upload=False):
    """
    Upload a zip file package directly to Preservica and start an ingest workflow.

    The package is sent through the server's S3 compatible endpoint using the access
    token as the access key; the token is refreshed automatically during long uploads.
    The multipart chunk size is raised for packages larger than 1, 8, 24 and 48 GB.

    :param str path_to_zip_package: Path to the package
    :param Folder folder: The folder to ingest the package into (a Folder object or a reference string)
    :param Callable callback: Optional callback to allow the callee to monitor the upload progress
    :param bool delete_after_upload: Delete the local copy of the package after the upload has completed

    :return: preservica-progress-token to allow the workflow progress to be monitored,
             or None if ``path_to_zip_package`` is not an existing file
    :rtype: str

    :raises NoCredentialsError, PartialCredentialsError, ClientError: logged and re-raised
    """
    bucket = f'{self.tenant.lower()}.package.upload'
    endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
    self.token = self.__token__()

    retries = {'max_attempts': 5, 'mode': 'adaptive'}

    def new_credentials():
        """Build botocore credential metadata from a newly fetched access token."""
        logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
        return {
            'access_key': self.__token__(),
            'secret_key': "NOT_USED",
            'token': "",
            'expiry_time': (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat(),
        }

    session = get_session()

    session_credentials = RefreshableCredentials.create_from_metadata(
        metadata=new_credentials(),
        refresh_using=new_credentials,
        advisory_timeout=4 * 60,
        mandatory_timeout=12 * 60,
        method='Preservica'
    )

    autorefresh_session = boto3.Session(botocore_session=session)

    # Install the self-refreshing credentials on the underlying botocore session.
    session._credentials = session_credentials

    config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
                    request_checksum_calculation="WHEN_REQUIRED",
                    response_checksum_validation="WHEN_REQUIRED",
                    retries=retries, tcp_keepalive=True)

    s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)

    metadata = {}
    if folder is not None:
        if hasattr(folder, "reference"):
            metadata = {'Metadata': {'structuralobjectreference': folder.reference}}
        elif isinstance(folder, str):
            metadata = {'Metadata': {'structuralobjectreference': folder}}

    if not (os.path.exists(path_to_zip_package) and os.path.isfile(path_to_zip_package)):
        return None

    # transfer_config is shared by every upload in this module: remember its chunk size
    # so that the adjustment made for this package does not leak into later uploads.
    default_chunksize = transfer_config.multipart_chunksize
    try:
        key_id = str(uuid.uuid4()) + ".zip"

        # Larger packages use larger chunks; the last matching threshold wins.
        package_size = os.path.getsize(path_to_zip_package)
        for threshold, chunksize in ((1 * GB, 16 * MB),    # Min 64 Chunks
                                     (8 * GB, 32 * MB),    # Min 256 Chunks
                                     (24 * GB, 48 * MB),   # Min 512 Chunks
                                     (48 * GB, 64 * MB)):
            if package_size > threshold:
                transfer_config.multipart_chunksize = chunksize

        logger.info("Using Multipart Chunk Size: " + str(transfer_config.multipart_chunksize))

        transfer = S3Transfer(client=s3_client, config=transfer_config)

        # Attach the replacement task classes and upload function to this transfer object.
        transfer.PutObjectTask = PutObjectTask
        transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
        transfer.upload_file = upload_file

        # upload_file is a plain function stored on the instance, so self is passed explicitly.
        response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
                                        key=key_id,
                                        extra_args=metadata,
                                        callback=callback)

        if delete_after_upload:
            os.remove(path_to_zip_package)

        return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']

    except (NoCredentialsError, PartialCredentialsError, ClientError) as ex:
        logger.error(ex)
        raise

    finally:
        transfer_config.multipart_chunksize = default_chunksize
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
|
|
1708
|
+
|
|
1709
|
+
|
|
1710
|
+
|