pyPreservica 2.9.3__py3-none-any.whl → 3.3.3__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- pyPreservica/__init__.py +15 -3
- pyPreservica/adminAPI.py +29 -22
- pyPreservica/authorityAPI.py +6 -7
- pyPreservica/common.py +85 -14
- pyPreservica/contentAPI.py +56 -5
- pyPreservica/entityAPI.py +652 -215
- pyPreservica/mdformsAPI.py +87 -6
- pyPreservica/monitorAPI.py +2 -2
- pyPreservica/parAPI.py +1 -37
- pyPreservica/retentionAPI.py +5 -4
- pyPreservica/settingsAPI.py +295 -0
- pyPreservica/uploadAPI.py +163 -398
- pyPreservica/webHooksAPI.py +1 -1
- pyPreservica/workflowAPI.py +8 -8
- {pyPreservica-2.9.3.dist-info → pypreservica-3.3.3.dist-info}/METADATA +18 -5
- pypreservica-3.3.3.dist-info/RECORD +20 -0
- {pyPreservica-2.9.3.dist-info → pypreservica-3.3.3.dist-info}/WHEEL +1 -1
- pyPreservica-2.9.3.dist-info/RECORD +0 -19
- {pyPreservica-2.9.3.dist-info → pypreservica-3.3.3.dist-info/licenses}/LICENSE.txt +0 -0
- {pyPreservica-2.9.3.dist-info → pypreservica-3.3.3.dist-info}/top_level.txt +0 -0
pyPreservica/uploadAPI.py
CHANGED
@@ -13,7 +13,7 @@ import shutil
 import tempfile
 import uuid
 import xml
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from time import sleep
 from xml.dom import minidom
 from xml.etree import ElementTree
@@ -22,11 +22,12 @@ from xml.etree.ElementTree import Element, SubElement
 import boto3
 import s3transfer.tasks
 import s3transfer.upload
-
+from botocore.session import get_session
 from boto3.s3.transfer import TransferConfig, S3Transfer
 from botocore.config import Config
 from botocore.credentials import RefreshableCredentials
-from botocore.exceptions import ClientError
+from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
+from dateutil.tz import tzlocal
 from s3transfer import S3UploadFailedError
 from tqdm import tqdm
 
@@ -37,7 +38,7 @@ logger = logging.getLogger(__name__)
 
 MB = 1024 * 1024
 GB = 1024 ** 3
-transfer_config = TransferConfig(multipart_threshold=int(
+transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
 
 CONTENT_FOLDER = "content"
 PRESERVATION_CONTENT_FOLDER = "p1"
@@ -481,7 +482,7 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
     content_type = kwargs.get('CustomType', "")
 
     if not compress:
-        shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")
+        shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None, description="UnCompressed ZIP file")
 
     has_preservation_files = bool((preservation_files_dict is not None) and (len(preservation_files_dict) > 0))
     has_access_files = bool((access_files_dict is not None) and (len(access_files_dict) > 0))
@@ -910,17 +911,22 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
     if has_preservation_files:
         if default_asset_title is None:
             default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
-
         # create the asset
-
+        if io_ref is None:
+            xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
 
     if has_access_files:
         if default_asset_title is None:
            default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
-
        if io_ref is None:
            xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
 
+    if io_ref is None:
+        default_asset_title = kwargs.get('Title', None)
+        if default_asset_title is None:
+            default_asset_title = "New Asset"
+        xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
+
     if has_preservation_files:
         # add the content objects
         representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
@@ -1153,383 +1159,14 @@ def _unpad(s):
 
 class UploadAPI(AuthenticatedAPI):
 
-    def ingest_tweet(self, twitter_user=None, tweet_id: int = 0, twitter_consumer_key=None, twitter_secret_key=None,
-                     folder=None,
-                     callback=None, **kwargs):
-
-        """
-        Ingest tweets from a twitter stream by twitter username
-
-        :param tweet_id:
-        :param str twitter_user: Twitter Username
-        :param str twitter_consumer_key: Optional asset title
-        :param str twitter_secret_key: Optional asset description
-        :param str folder: Folder to ingest into
-        :param callback callback: Optional upload progress callback
-        :raises RuntimeError:
 
 
-        """
-
-        def get_image(m, has_video_element):
-            media_url_https_ = m["media_url_https"]
-            if media_url_https_:
-                req = requests.get(media_url_https_)
-                if req.status_code == requests.codes.ok:
-                    if has_video_element:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
-                    else:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
-                    image_name_document_ = open(image_name_, "wb")
-                    image_name_document_.write(req.content)
-                    image_name_document_.close()
-                    return image_name_
-
-        def get_video(m):
-            video_info_ = m["video_info"]
-            variants_ = video_info_["variants"]
-            for v_ in variants_:
-                video_url_ = v_["url"]
-                req = requests.get(video_url_)
-                if req.status_code == requests.codes.ok:
-                    video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
-                    video_name_document_ = open(video_name_, "wb")
-                    video_name_document_.write(req.content)
-                    video_name_document_.close()
-                    return video_name_, True
-
-        entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
-                                               tenant=self.tenant)
-        if hasattr(folder, "reference"):
-            folder = entity_client.folder(folder.reference)
-        else:
-            folder = entity_client.folder(folder)
-        try:
-            import tweepy
-        except ImportError:
-            logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-            raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-        config = configparser.ConfigParser()
-        config.read('credentials.properties')
-        if twitter_consumer_key is None:
-            twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
-            if twitter_consumer_key is None:
-                try:
-                    twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-        if twitter_secret_key is None:
-            twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
-            if twitter_secret_key is None:
-                try:
-                    twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-
-        api = None
-        try:
-            auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
-            api = tweepy.API(auth, wait_on_rate_limit=True)
-        except TweepError:
-            logger.error("No valid Twitter API keys. Could not authenticate")
-            raise RuntimeError("No valid Twitter API keys. Could not authenticate")
-        if api is not None:
-            logger.debug(api)
-            tweet = api.get_status(tweet_id, tweet_mode="extended", include_entities=True)
-            created_at = tweet.created_at
-            id_str = tweet.id_str
-            author = tweet.author.name
-            tweet_entities = tweet.entities
-            hashtags = dict()
-            if 'hashtags' in tweet_entities:
-                hashtags = tweet.entities['hashtags']
-            entities = entity_client.identifier("tweet_id", id_str.strip())
-            if len(entities) > 0:
-                logger.warning("Tweet already exists, skipping....")
-                return
-            logger.info(f"Processing tweet {id_str} ...")
-            tid = tweet.id
-            content_objects = list()
-            full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
-            text = tweet.full_text
-            full_text = full_tweet.full_text
-            file_name = f"{{{id_str}}}_[{twitter_user}].json"
-            json_doc = json.dumps(full_tweet._json)
-            json_file = open(file_name, "wt", encoding="utf-8")
-            json_file.write(json_doc)
-            json_file.close()
-            content_objects.append(file_name)
-            if hasattr(full_tweet, "extended_entities"):
-                extended_entities = full_tweet.extended_entities
-                if "media" in extended_entities:
-                    media = extended_entities["media"]
-                    for med in media:
-                        media_id_str = med["id_str"]
-                        has_video = False
-                        if "video_info" in med:
-                            co, has_video = get_video(med)
-                            content_objects.append(co)
-                            if has_video:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-                            continue
-                        if "media_url_https" in med:
-                            co = get_image(med, has_video)
-                            content_objects.append(co)
-            identifiers = dict()
-            asset_metadata = dict()
-            identifiers["tweet_id"] = id_str
-
-            user = full_tweet._json['user']
-
-            if full_tweet._json.get('retweeted_status'):
-                retweeted_status = full_tweet._json['retweeted_status']
-                if retweeted_status.get("extended_entities"):
-                    extended_entities = retweeted_status["extended_entities"]
-                    if "media" in extended_entities:
-                        media = extended_entities["media"]
-                        for med in media:
-                            media_id_str = med["id_str"]
-                            has_video = False
-                            if "video_info" in med:
-                                co, has_video = get_video(med)
-                                content_objects.append(co)
-                                continue
-                            if "media_url_https" in med:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-
-            xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
-            xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
-            xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
-            xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
-            xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
-            for h in hashtags:
-                xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
-
-            xml.etree.ElementTree.SubElement(xml_object, "name").text = author
-            xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
-            xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
-
-            xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
-
-            metadata_document = open("metadata.xml", "wt", encoding="utf-8")
-            metadata_document.write(xml_request.decode("utf-8"))
-            metadata_document.close()
-
-            asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
-
-            security_tag = kwargs.get("SecurityTag", "open")
-            asset_title = kwargs.get("Title", text)
-            asset_description = kwargs.get("Description", full_text)
-
-            p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder, Title=asset_title,
-                                      Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
-                                      Asset_Metadata=asset_metadata, SecurityTag=security_tag)
-            self.upload_zip_package(p, folder=folder, callback=callback)
-            for ob in content_objects:
-                os.remove(ob)
-            os.remove("metadata.xml")
-
-    def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None,
-                            twitter_secret_key=None, folder=None,
-                            callback=None, **kwargs):
-
-        """
-        Ingest tweets from a twitter stream by twitter username
-
-        :param str twitter_user: Twitter Username
-        :param int num_tweets: The number of tweets from the stream
-        :param str twitter_consumer_key: Optional asset title
-        :param str twitter_secret_key: Optional asset description
-        :param str folder: Folder to ingest into
-        :param callback callback: Optional upload progress callback
-        :raises RuntimeError:
-
-
-        """
-
-        def get_image(m, has_video_element):
-            media_url_https_ = m["media_url_https"]
-            if media_url_https_:
-                req = requests.get(media_url_https_)
-                if req.status_code == requests.codes.ok:
-                    if has_video_element:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
-                    else:
-                        image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
-                    image_name_document_ = open(image_name_, "wb")
-                    image_name_document_.write(req.content)
-                    image_name_document_.close()
-                    return image_name_
-
-        def get_video(m):
-            video_info_ = m["video_info"]
-            variants_ = video_info_["variants"]
-            for v_ in variants_:
-                if v_['content_type'] == 'video/mp4':
-                    video_url_ = v_["url"]
-                    with requests.get(video_url_, stream=True) as req:
-                        video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
-                        with open(video_name_, 'wb') as video_name_document_:
-                            for chunk in req.iter_content(chunk_size=1024):
-                                video_name_document_.write(chunk)
-                                video_name_document_.flush()
-                    return video_name_, True
-
-        entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
-                                               tenant=self.tenant)
-        if hasattr(folder, "reference"):
-            folder = entity_client.folder(folder.reference)
-        else:
-            folder = entity_client.folder(folder)
-        try:
-            import tweepy
-        except ImportError:
-            logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-            raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
-        config = configparser.ConfigParser()
-        config.read('credentials.properties')
-        if twitter_consumer_key is None:
-            twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
-            if twitter_consumer_key is None:
-                try:
-                    twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-        if twitter_secret_key is None:
-            twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
-            if twitter_secret_key is None:
-                try:
-                    twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
-                except KeyError:
-                    logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                 "environment variables or credentials.properties file")
-                    raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
-                                       "environment variables or credentials.properties file")
-
-        api = None
-        try:
-            auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
-            api = tweepy.API(auth, wait_on_rate_limit=True)
-        except RuntimeError:
-            logger.error("No valid Twitter API keys. Could not authenticate")
-            raise RuntimeError("No valid Twitter API keys. Could not authenticate")
-        if api is not None:
-            logger.debug(api)
-            for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
-                created_at = tweet.created_at
-                id_str = tweet.id_str
-                author = tweet.author.name
-                tweet_entities = tweet.entities
-                hashtags = dict()
-                if 'hashtags' in tweet_entities:
-                    hashtags = tweet.entities['hashtags']
-                entities = entity_client.identifier("tweet_id", id_str.strip())
-                if len(entities) > 0:
-                    logger.warning("Tweet already exists, skipping....")
-                    continue
-                logger.info(f"Processing tweet {id_str} ...")
-                tid = tweet.id
-                content_objects = list()
-                full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
-                text = tweet.text
-                logger.debug(text)
-                full_text = full_tweet.full_text
-                file_name = f"{{{id_str}}}_[{twitter_user}].json"
-                json_doc = json.dumps(full_tweet._json)
-                json_file = open(file_name, "wt", encoding="utf-8")
-                json_file.write(json_doc)
-                json_file.close()
-                content_objects.append(file_name)
-                if hasattr(full_tweet, "extended_entities"):
-                    extended_entities = full_tweet.extended_entities
-                    if "media" in extended_entities:
-                        media = extended_entities["media"]
-                        for med in media:
-                            media_id_str = med["id_str"]
-                            has_video = False
-                            if "video_info" in med:
-                                co, has_video = get_video(med)
-                                content_objects.append(co)
-                                if has_video:
-                                    co = get_image(med, has_video)
-                                    content_objects.append(co)
-                                continue
-                            if "media_url_https" in med:
-                                co = get_image(med, has_video)
-                                content_objects.append(co)
-                identifiers = {}
-                asset_metadata = {}
-                identifiers["tweet_id"] = id_str
-
-                user = full_tweet._json['user']
-
-                if full_tweet._json.get('retweeted_status'):
-                    retweeted_status = full_tweet._json['retweeted_status']
-                    if retweeted_status.get("extended_entities"):
-                        extended_entities = retweeted_status["extended_entities"]
-                        if "media" in extended_entities:
-                            media = extended_entities["media"]
-                            for med in media:
-                                media_id_str = med["id_str"]
-                                has_video = False
-                                if "video_info" in med:
-                                    co, has_video = get_video(med)
-                                    content_objects.append(co)
-                                    continue
-                                if "media_url_https" in med:
-                                    co = get_image(med, has_video)
-                                    content_objects.append(co)
-
-                xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
-                xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
-                xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
-                xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
-                xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
-                for h in hashtags:
-                    xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
-
-                xml.etree.ElementTree.SubElement(xml_object, "name").text = author
-                xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
-                xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
-
-                xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
-
-                metadata_document = open("metadata.xml", "wt", encoding="utf-8")
-                metadata_document.write(xml_request.decode("utf-8"))
-                metadata_document.close()
-
-                asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
-
-                security_tag = kwargs.get("SecurityTag", "open")
-                asset_title = kwargs.get("Title", text)
-                asset_description = kwargs.get("Description", full_text)
-
-                p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
-                                          Title=asset_title,
-                                          Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
-                                          Asset_Metadata=asset_metadata, SecurityTag=security_tag)
-                self.upload_zip_package(p, folder=folder, callback=callback)
-                for ob in content_objects:
-                    os.remove(ob)
-                os.remove("metadata.xml")
-                sleep(2)
 
     def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
         """
         Ingest a web video such as YouTube etc based on the URL
 
-        :param str url: URL to the
+        :param str url: URL to the YouTube video
         :param Folder parent_folder: The folder to ingest the video into
         :param str Title: Optional asset title
         :param str Description: Optional asset description
@@ -1627,6 +1264,52 @@ class UploadAPI(AuthenticatedAPI):
             logger.error(exception)
             raise exception
 
+    def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
+        """
+        Clean up objects in an upload bucket which are older than older_than_days.
+
+        """
+        from azure.storage.blob import ContainerClient
+
+        for location in self.upload_locations():
+            if location['containerName'] == bucket_name:
+
+                if location['type'] != 'AWS':
+                    credentials = self.upload_credentials(location['apiId'])
+                    account_key = credentials['key']
+                    session_token = credentials['sessionToken']
+                    sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
+                    container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
+                    now = datetime.now(timezone.utc)
+                    for blob in container.list_blobs():
+                        if abs((blob.last_modified - now).days) > older_than_days:
+                            logger.debug(f"Deleting expired object {blob.name}")
+                            container.delete_blob(blob.name)
+
+                if location['type'] == 'AWS':
+                    credentials = self.upload_credentials(location['apiId'])
+                    access_key = credentials['key']
+                    secret_key = credentials['secret']
+                    session_token = credentials['sessionToken']
+                    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key,
+                                            aws_session_token=session_token)
+                    s3_client = session.client("s3")
+                    paginator = s3_client.get_paginator('list_objects_v2')
+                    now = datetime.now(timezone.utc)
+                    for page in paginator.paginate(Bucket=bucket_name):
+                        if 'Contents' in page:
+                            for key in page['Contents']:
+                                last_modified = key['LastModified']
+                                if abs((last_modified - now).days) > older_than_days:
+                                    logger.debug(f"Deleting expired object {key['Key']}")
+                                    s3_client.delete_object(Bucket=bucket_name, Key=key['Key'])
+
+
+
+
+
+
+
     def upload_locations(self):
         """
         Upload locations are configured on the Sources page as 'SIP Upload'.
@@ -1659,30 +1342,52 @@ class UploadAPI(AuthenticatedAPI):
                               security_tag: str = "open",
                               delete_after_upload: bool = True, max_MB_ingested: int = -1):
 
+        from pyPreservica import EntityAPI
+
+        def entity_value(client: EntityAPI, identifier: str) -> Entity:
+            back_off: int = 5
+            while True:
+                try:
+                    entities = client.identifier("code", identifier)
+                    if bool(len(entities) > 0):
+                        return entities.pop()
+                    else:
+                        return None
+                except HTTPException as e:
+                    sleep(back_off)
+                    back_off = back_off * 2
+
+        def entity_exists(client: EntityAPI, identifier: str) -> bool:
+            back_off: int = 5
+            while True:
+                try:
+                    entities = client.identifier("code", identifier)
+                    return bool(len(entities) > 0)
+                except HTTPException as e:
+                    sleep(back_off)
+                    back_off = back_off * 2
+
         def get_parent(client, identifier, parent_reference):
-
-            if not
-
-
-            if
-                folder = entities.pop()
+            dirname_id: str = str(os.path.dirname(identifier))
+            if not dirname_id:
+                dirname_id = identifier
+            folder = entity_value(client, dirname_id)
+            if folder is not None:
                 folder = client.folder(folder.reference)
                 return folder.reference
             else:
                 return parent_reference
 
         def get_folder(client, name, tag, parent_reference, identifier):
-
-            if
+            folder = entity_value(client, identifier)
+            if folder is None:
                 logger.info(f"Creating new folder with name {name}")
                 folder = client.create_folder(name, name, tag, parent_reference)
                 client.add_identifier(folder, "code", identifier)
             else:
                 logger.info(f"Found existing folder with name {name}")
-                folder = entities.pop()
             return folder
 
-        from pyPreservica import EntityAPI
         entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
                                   tenant=self.tenant,
                                   two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
@@ -1712,7 +1417,7 @@ class UploadAPI(AuthenticatedAPI):
                         files.remove(file)
                         continue
                     asset_code = os.path.join(code, file)
-                    if
+                    if not entity_exists(entity_client, asset_code):
                         bytes_ingested = bytes_ingested + os.stat(full_path).st_size
                         logger.info(f"Adding new file: {file} to package ready for upload")
                         file_identifiers = {"code": asset_code}
@@ -1735,8 +1440,8 @@ class UploadAPI(AuthenticatedAPI):
                                               delete_after_upload=delete_after_upload)
                 else:
                     self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
-
-
+                                              show_progress=bool(progress_display is not None),
+                                              delete_after_upload=delete_after_upload)
 
         logger.info(f"Uploaded " + "{:.1f}".format(bytes_ingested / (1024 * 1024)) + " MB")
 
@@ -1910,9 +1615,42 @@ class UploadAPI(AuthenticatedAPI):
         endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
         self.token = self.__token__()
 
-
-
-
+
+        retries= {
+            'max_attempts': 5,
+            'mode': 'adaptive'
+        }
+
+        def new_credentials():
+            cred_metadata: dict = {}
+            cred_metadata['access_key'] = self.__token__()
+            cred_metadata['secret_key'] = "NOT_USED"
+            cred_metadata['token'] = ""
+            cred_metadata["expiry_time"] = (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat()
+            logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
+            return cred_metadata
+
+        session = get_session()
+
+        session_credentials = RefreshableCredentials.create_from_metadata(
+            metadata=new_credentials(),
+            refresh_using=new_credentials,
+            advisory_timeout = 4 * 60,
+            mandatory_timeout = 12 * 60,
+            method = 'Preservica'
+        )
+
+        autorefresh_session = boto3.Session(botocore_session=session)
+
+        session._credentials = session_credentials
+
+        config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
+                        request_checksum_calculation="WHEN_REQUIRED",
+                        response_checksum_validation="WHEN_REQUIRED",
+                        retries=retries, tcp_keepalive=True)
+
+
+        s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)
 
         metadata = {}
         if folder is not None:
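
Note: this hunk is the heart of the 3.x upload changes. Rather than signing requests with a static token, the S3 client is built on botocore RefreshableCredentials, whose refresh callback fetches a fresh Preservica access token (carried in the AWS access-key slot) before the 12-minute expiry, so long multipart uploads survive token rotation. The pattern, condensed into a standalone sketch (get_token stands in for self.__token__):

    # Sketch of the auto-refreshing session built in the diff.
    from datetime import datetime, timedelta

    import boto3
    from botocore.credentials import RefreshableCredentials
    from botocore.session import get_session
    from dateutil.tz import tzlocal

    def make_autorefresh_session(get_token):
        def new_credentials():
            return {
                "access_key": get_token(),  # Preservica token in the key slot
                "secret_key": "NOT_USED",
                "token": "",
                "expiry_time": (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat(),
            }

        botocore_session = get_session()
        botocore_session._credentials = RefreshableCredentials.create_from_metadata(
            metadata=new_credentials(),
            refresh_using=new_credentials,
            method="Preservica",
        )
        return boto3.Session(botocore_session=botocore_session)
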
@@ -1925,21 +1663,48 @@ class UploadAPI(AuthenticatedAPI):
         try:
             key_id = str(uuid.uuid4()) + ".zip"
 
+
+            # how big is the package
+            package_size = os.path.getsize(path_to_zip_package)
+            if package_size > 1 * GB:
+                transfer_config.multipart_chunksize = 16 * MB  ## Min 64 Chunks
+            if package_size > 8 * GB:
+                transfer_config.multipart_chunksize = 32 * MB  ## Min 256 Chunks
+            if package_size > 24 * GB:
+                transfer_config.multipart_chunksize = 48 * MB  ## Min 512 Chunks
+            if package_size > 48 * GB:
+                transfer_config.multipart_chunksize = 64 * MB
+
+            logger.info("Using Multipart Chunk Size: " + str(transfer_config.multipart_chunksize))
+
             transfer = S3Transfer(client=s3_client, config=transfer_config)
 
             transfer.PutObjectTask = PutObjectTask
             transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
             transfer.upload_file = upload_file
 
-
+
+            response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
+                                            key=key_id,
                                             extra_args=metadata,
                                             callback=callback)
 
+
             if delete_after_upload:
                 os.remove(path_to_zip_package)
 
             return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']
 
-        except
-            logger.error(
-            raise
+        except (NoCredentialsError, PartialCredentialsError) as ex:
+            logger.error(ex)
+            raise ex
+
+        except ClientError as ex:
+            logger.error(ex)
+            raise ex
+
+
+
+
+
+
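
Note: the chunk-size ladder keeps multipart part counts comfortably below S3's 10,000-part ceiling while using small parts for modest packages. Checking the arithmetic against the thresholds above:

    # Sketch: part counts implied by the ladder (module default chunk is 8 MB).
    MB = 1024 * 1024
    GB = 1024 ** 3

    def chunk_size(package_size):
        size = 8 * MB
        if package_size > 1 * GB:
            size = 16 * MB
        if package_size > 8 * GB:
            size = 32 * MB
        if package_size > 24 * GB:
            size = 48 * MB
        if package_size > 48 * GB:
            size = 64 * MB
        return size

    for gb in (2, 10, 30, 60):
        size = gb * GB
        print(f"{gb} GB -> {size // chunk_size(size)} parts")
    # 2 GB -> 128 parts, 10 GB -> 320, 30 GB -> 640, 60 GB -> 960
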