pyPreservica 2.9.3__py3-none-any.whl → 3.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of pyPreservica might be problematic; see the advisory linked on the original page for more details.

pyPreservica/uploadAPI.py CHANGED
@@ -13,7 +13,7 @@ import shutil
13
13
  import tempfile
14
14
  import uuid
15
15
  import xml
16
- from datetime import datetime, timedelta
16
+ from datetime import datetime, timedelta, timezone
17
17
  from time import sleep
18
18
  from xml.dom import minidom
19
19
  from xml.etree import ElementTree
@@ -22,11 +22,12 @@ from xml.etree.ElementTree import Element, SubElement
22
22
  import boto3
23
23
  import s3transfer.tasks
24
24
  import s3transfer.upload
25
-
25
+ from botocore.session import get_session
26
26
  from boto3.s3.transfer import TransferConfig, S3Transfer
27
27
  from botocore.config import Config
28
28
  from botocore.credentials import RefreshableCredentials
29
- from botocore.exceptions import ClientError
29
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
30
+ from dateutil.tz import tzlocal
30
31
  from s3transfer import S3UploadFailedError
31
32
  from tqdm import tqdm
32
33
 
@@ -37,7 +38,7 @@ logger = logging.getLogger(__name__)
37
38
 
38
39
  MB = 1024 * 1024
39
40
  GB = 1024 ** 3
40
- transfer_config = TransferConfig(multipart_threshold=int((1 * GB) / 16))
41
+ transfer_config = TransferConfig(multipart_threshold=int(32 * MB))
41
42
 
42
43
  CONTENT_FOLDER = "content"
43
44
  PRESERVATION_CONTENT_FOLDER = "p1"
@@ -481,7 +482,7 @@ def generic_asset_package(preservation_files_dict=None, access_files_dict=None,
481
482
  content_type = kwargs.get('CustomType', "")
482
483
 
483
484
  if not compress:
484
- shutil.register_archive_format("szip", _make_stored_zipfile, None, "UnCompressed ZIP file")
485
+ shutil.register_archive_format(name="szip", function=_make_stored_zipfile, extra_args=None, description="UnCompressed ZIP file")
485
486
 
486
487
  has_preservation_files = bool((preservation_files_dict is not None) and (len(preservation_files_dict) > 0))
487
488
  has_access_files = bool((access_files_dict is not None) and (len(access_files_dict) > 0))
@@ -910,17 +911,22 @@ def complex_asset_package(preservation_files_list=None, access_files_list=None,
910
911
  if has_preservation_files:
911
912
  if default_asset_title is None:
912
913
  default_asset_title = os.path.splitext(os.path.basename(preservation_files_list[0]))[0]
913
-
914
914
  # create the asset
915
- xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
915
+ if io_ref is None:
916
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
916
917
 
917
918
  if has_access_files:
918
919
  if default_asset_title is None:
919
920
  default_asset_title = os.path.splitext(os.path.basename(access_files_list[0]))[0]
920
-
921
921
  if io_ref is None:
922
922
  xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
923
923
 
924
+ if io_ref is None:
925
+ default_asset_title = kwargs.get('Title', None)
926
+ if default_asset_title is None:
927
+ default_asset_title = "New Asset"
928
+ xip, io_ref = __create_io__(file_name=default_asset_title, parent_folder=parent_folder, **kwargs)
929
+
924
930
  if has_preservation_files:
925
931
  # add the content objects
926
932
  representation_name = kwargs.get('Preservation_Representation_Name', "Preservation")
@@ -1153,383 +1159,14 @@ def _unpad(s):
1153
1159
 
1154
1160
  class UploadAPI(AuthenticatedAPI):
1155
1161
 
1156
- def ingest_tweet(self, twitter_user=None, tweet_id: int = 0, twitter_consumer_key=None, twitter_secret_key=None,
1157
- folder=None,
1158
- callback=None, **kwargs):
1159
-
1160
- """
1161
- Ingest tweets from a twitter stream by twitter username
1162
-
1163
- :param tweet_id:
1164
- :param str twitter_user: Twitter Username
1165
- :param str twitter_consumer_key: Optional asset title
1166
- :param str twitter_secret_key: Optional asset description
1167
- :param str folder: Folder to ingest into
1168
- :param callback callback: Optional upload progress callback
1169
- :raises RuntimeError:
1170
1162
 
1171
1163
 
1172
- """
1173
-
1174
- def get_image(m, has_video_element):
1175
- media_url_https_ = m["media_url_https"]
1176
- if media_url_https_:
1177
- req = requests.get(media_url_https_)
1178
- if req.status_code == requests.codes.ok:
1179
- if has_video_element:
1180
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
1181
- else:
1182
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
1183
- image_name_document_ = open(image_name_, "wb")
1184
- image_name_document_.write(req.content)
1185
- image_name_document_.close()
1186
- return image_name_
1187
-
1188
- def get_video(m):
1189
- video_info_ = m["video_info"]
1190
- variants_ = video_info_["variants"]
1191
- for v_ in variants_:
1192
- video_url_ = v_["url"]
1193
- req = requests.get(video_url_)
1194
- if req.status_code == requests.codes.ok:
1195
- video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
1196
- video_name_document_ = open(video_name_, "wb")
1197
- video_name_document_.write(req.content)
1198
- video_name_document_.close()
1199
- return video_name_, True
1200
-
1201
- entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
1202
- tenant=self.tenant)
1203
- if hasattr(folder, "reference"):
1204
- folder = entity_client.folder(folder.reference)
1205
- else:
1206
- folder = entity_client.folder(folder)
1207
- try:
1208
- import tweepy
1209
- except ImportError:
1210
- logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1211
- raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1212
- config = configparser.ConfigParser()
1213
- config.read('credentials.properties')
1214
- if twitter_consumer_key is None:
1215
- twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
1216
- if twitter_consumer_key is None:
1217
- try:
1218
- twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
1219
- except KeyError:
1220
- logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1221
- "environment variables or credentials.properties file")
1222
- raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1223
- "environment variables or credentials.properties file")
1224
- if twitter_secret_key is None:
1225
- twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
1226
- if twitter_secret_key is None:
1227
- try:
1228
- twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
1229
- except KeyError:
1230
- logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
1231
- "environment variables or credentials.properties file")
1232
- raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
1233
- "environment variables or credentials.properties file")
1234
-
1235
- api = None
1236
- try:
1237
- auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
1238
- api = tweepy.API(auth, wait_on_rate_limit=True)
1239
- except TweepError:
1240
- logger.error("No valid Twitter API keys. Could not authenticate")
1241
- raise RuntimeError("No valid Twitter API keys. Could not authenticate")
1242
- if api is not None:
1243
- logger.debug(api)
1244
- tweet = api.get_status(tweet_id, tweet_mode="extended", include_entities=True)
1245
- created_at = tweet.created_at
1246
- id_str = tweet.id_str
1247
- author = tweet.author.name
1248
- tweet_entities = tweet.entities
1249
- hashtags = dict()
1250
- if 'hashtags' in tweet_entities:
1251
- hashtags = tweet.entities['hashtags']
1252
- entities = entity_client.identifier("tweet_id", id_str.strip())
1253
- if len(entities) > 0:
1254
- logger.warning("Tweet already exists, skipping....")
1255
- return
1256
- logger.info(f"Processing tweet {id_str} ...")
1257
- tid = tweet.id
1258
- content_objects = list()
1259
- full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
1260
- text = tweet.full_text
1261
- full_text = full_tweet.full_text
1262
- file_name = f"{{{id_str}}}_[{twitter_user}].json"
1263
- json_doc = json.dumps(full_tweet._json)
1264
- json_file = open(file_name, "wt", encoding="utf-8")
1265
- json_file.write(json_doc)
1266
- json_file.close()
1267
- content_objects.append(file_name)
1268
- if hasattr(full_tweet, "extended_entities"):
1269
- extended_entities = full_tweet.extended_entities
1270
- if "media" in extended_entities:
1271
- media = extended_entities["media"]
1272
- for med in media:
1273
- media_id_str = med["id_str"]
1274
- has_video = False
1275
- if "video_info" in med:
1276
- co, has_video = get_video(med)
1277
- content_objects.append(co)
1278
- if has_video:
1279
- co = get_image(med, has_video)
1280
- content_objects.append(co)
1281
- continue
1282
- if "media_url_https" in med:
1283
- co = get_image(med, has_video)
1284
- content_objects.append(co)
1285
- identifiers = dict()
1286
- asset_metadata = dict()
1287
- identifiers["tweet_id"] = id_str
1288
-
1289
- user = full_tweet._json['user']
1290
-
1291
- if full_tweet._json.get('retweeted_status'):
1292
- retweeted_status = full_tweet._json['retweeted_status']
1293
- if retweeted_status.get("extended_entities"):
1294
- extended_entities = retweeted_status["extended_entities"]
1295
- if "media" in extended_entities:
1296
- media = extended_entities["media"]
1297
- for med in media:
1298
- media_id_str = med["id_str"]
1299
- has_video = False
1300
- if "video_info" in med:
1301
- co, has_video = get_video(med)
1302
- content_objects.append(co)
1303
- continue
1304
- if "media_url_https" in med:
1305
- co = get_image(med, has_video)
1306
- content_objects.append(co)
1307
-
1308
- xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
1309
- xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
1310
- xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
1311
- xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
1312
- xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
1313
- for h in hashtags:
1314
- xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
1315
-
1316
- xml.etree.ElementTree.SubElement(xml_object, "name").text = author
1317
- xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
1318
- xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
1319
-
1320
- xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
1321
-
1322
- metadata_document = open("metadata.xml", "wt", encoding="utf-8")
1323
- metadata_document.write(xml_request.decode("utf-8"))
1324
- metadata_document.close()
1325
-
1326
- asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
1327
-
1328
- security_tag = kwargs.get("SecurityTag", "open")
1329
- asset_title = kwargs.get("Title", text)
1330
- asset_description = kwargs.get("Description", full_text)
1331
-
1332
- p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder, Title=asset_title,
1333
- Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
1334
- Asset_Metadata=asset_metadata, SecurityTag=security_tag)
1335
- self.upload_zip_package(p, folder=folder, callback=callback)
1336
- for ob in content_objects:
1337
- os.remove(ob)
1338
- os.remove("metadata.xml")
1339
-
1340
- def ingest_twitter_feed(self, twitter_user=None, num_tweets: int = 25, twitter_consumer_key=None,
1341
- twitter_secret_key=None, folder=None,
1342
- callback=None, **kwargs):
1343
-
1344
- """
1345
- Ingest tweets from a twitter stream by twitter username
1346
-
1347
- :param str twitter_user: Twitter Username
1348
- :param int num_tweets: The number of tweets from the stream
1349
- :param str twitter_consumer_key: Optional asset title
1350
- :param str twitter_secret_key: Optional asset description
1351
- :param str folder: Folder to ingest into
1352
- :param callback callback: Optional upload progress callback
1353
- :raises RuntimeError:
1354
-
1355
-
1356
- """
1357
-
1358
- def get_image(m, has_video_element):
1359
- media_url_https_ = m["media_url_https"]
1360
- if media_url_https_:
1361
- req = requests.get(media_url_https_)
1362
- if req.status_code == requests.codes.ok:
1363
- if has_video_element:
1364
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}]_thumb.jpg"
1365
- else:
1366
- image_name_ = f"{{{media_id_str}}}_[{twitter_user}].jpg"
1367
- image_name_document_ = open(image_name_, "wb")
1368
- image_name_document_.write(req.content)
1369
- image_name_document_.close()
1370
- return image_name_
1371
-
1372
- def get_video(m):
1373
- video_info_ = m["video_info"]
1374
- variants_ = video_info_["variants"]
1375
- for v_ in variants_:
1376
- if v_['content_type'] == 'video/mp4':
1377
- video_url_ = v_["url"]
1378
- with requests.get(video_url_, stream=True) as req:
1379
- video_name_ = f"{{{media_id_str}}}_[{twitter_user}].mp4"
1380
- with open(video_name_, 'wb') as video_name_document_:
1381
- for chunk in req.iter_content(chunk_size=1024):
1382
- video_name_document_.write(chunk)
1383
- video_name_document_.flush()
1384
- return video_name_, True
1385
-
1386
- entity_client = pyPreservica.EntityAPI(username=self.username, password=self.password, server=self.server,
1387
- tenant=self.tenant)
1388
- if hasattr(folder, "reference"):
1389
- folder = entity_client.folder(folder.reference)
1390
- else:
1391
- folder = entity_client.folder(folder)
1392
- try:
1393
- import tweepy
1394
- except ImportError:
1395
- logger.error("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1396
- raise RuntimeError("Package tweepy is required for twitter harvesting. pip install --upgrade tweepy")
1397
- config = configparser.ConfigParser()
1398
- config.read('credentials.properties')
1399
- if twitter_consumer_key is None:
1400
- twitter_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
1401
- if twitter_consumer_key is None:
1402
- try:
1403
- twitter_consumer_key = config['credentials']['TWITTER_CONSUMER_KEY']
1404
- except KeyError:
1405
- logger.error("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1406
- "environment variables or credentials.properties file")
1407
- raise RuntimeError("No valid TWITTER_CONSUMER_KEY found in method arguments, "
1408
- "environment variables or credentials.properties file")
1409
- if twitter_secret_key is None:
1410
- twitter_secret_key = os.environ.get('TWITTER_SECRET_KEY')
1411
- if twitter_secret_key is None:
1412
- try:
1413
- twitter_secret_key = config['credentials']['TWITTER_SECRET_KEY']
1414
- except KeyError:
1415
- logger.error("No valid TWITTER_SECRET_KEY found in method arguments, "
1416
- "environment variables or credentials.properties file")
1417
- raise RuntimeError("No valid TWITTER_SECRET_KEY found in method arguments, "
1418
- "environment variables or credentials.properties file")
1419
-
1420
- api = None
1421
- try:
1422
- auth = tweepy.AppAuthHandler(twitter_consumer_key, twitter_secret_key)
1423
- api = tweepy.API(auth, wait_on_rate_limit=True)
1424
- except RuntimeError:
1425
- logger.error("No valid Twitter API keys. Could not authenticate")
1426
- raise RuntimeError("No valid Twitter API keys. Could not authenticate")
1427
- if api is not None:
1428
- logger.debug(api)
1429
- for tweet in tweepy.Cursor(api.user_timeline, id=twitter_user).items(int(num_tweets)):
1430
- created_at = tweet.created_at
1431
- id_str = tweet.id_str
1432
- author = tweet.author.name
1433
- tweet_entities = tweet.entities
1434
- hashtags = dict()
1435
- if 'hashtags' in tweet_entities:
1436
- hashtags = tweet.entities['hashtags']
1437
- entities = entity_client.identifier("tweet_id", id_str.strip())
1438
- if len(entities) > 0:
1439
- logger.warning("Tweet already exists, skipping....")
1440
- continue
1441
- logger.info(f"Processing tweet {id_str} ...")
1442
- tid = tweet.id
1443
- content_objects = list()
1444
- full_tweet = api.get_status(tid, tweet_mode="extended", include_entities=True)
1445
- text = tweet.text
1446
- logger.debug(text)
1447
- full_text = full_tweet.full_text
1448
- file_name = f"{{{id_str}}}_[{twitter_user}].json"
1449
- json_doc = json.dumps(full_tweet._json)
1450
- json_file = open(file_name, "wt", encoding="utf-8")
1451
- json_file.write(json_doc)
1452
- json_file.close()
1453
- content_objects.append(file_name)
1454
- if hasattr(full_tweet, "extended_entities"):
1455
- extended_entities = full_tweet.extended_entities
1456
- if "media" in extended_entities:
1457
- media = extended_entities["media"]
1458
- for med in media:
1459
- media_id_str = med["id_str"]
1460
- has_video = False
1461
- if "video_info" in med:
1462
- co, has_video = get_video(med)
1463
- content_objects.append(co)
1464
- if has_video:
1465
- co = get_image(med, has_video)
1466
- content_objects.append(co)
1467
- continue
1468
- if "media_url_https" in med:
1469
- co = get_image(med, has_video)
1470
- content_objects.append(co)
1471
- identifiers = {}
1472
- asset_metadata = {}
1473
- identifiers["tweet_id"] = id_str
1474
-
1475
- user = full_tweet._json['user']
1476
-
1477
- if full_tweet._json.get('retweeted_status'):
1478
- retweeted_status = full_tweet._json['retweeted_status']
1479
- if retweeted_status.get("extended_entities"):
1480
- extended_entities = retweeted_status["extended_entities"]
1481
- if "media" in extended_entities:
1482
- media = extended_entities["media"]
1483
- for med in media:
1484
- media_id_str = med["id_str"]
1485
- has_video = False
1486
- if "video_info" in med:
1487
- co, has_video = get_video(med)
1488
- content_objects.append(co)
1489
- continue
1490
- if "media_url_https" in med:
1491
- co = get_image(med, has_video)
1492
- content_objects.append(co)
1493
-
1494
- xml_object = xml.etree.ElementTree.Element('tweet', {"xmlns": "http://www.preservica.com/tweets/v1"})
1495
- xml.etree.ElementTree.SubElement(xml_object, "id").text = id_str
1496
- xml.etree.ElementTree.SubElement(xml_object, "full_text").text = full_text
1497
- xml.etree.ElementTree.SubElement(xml_object, "created_at").text = str(created_at)
1498
- xml.etree.ElementTree.SubElement(xml_object, "screen_name_sender").text = user.get('screen_name')
1499
- for h in hashtags:
1500
- xml.etree.ElementTree.SubElement(xml_object, "hashtag").text = str(h['text'])
1501
-
1502
- xml.etree.ElementTree.SubElement(xml_object, "name").text = author
1503
- xml.etree.ElementTree.SubElement(xml_object, "retweet").text = str(full_tweet._json['retweet_count'])
1504
- xml.etree.ElementTree.SubElement(xml_object, "likes").text = str(full_tweet._json['favorite_count'])
1505
-
1506
- xml_request = xml.etree.ElementTree.tostring(xml_object, encoding='utf-8')
1507
-
1508
- metadata_document = open("metadata.xml", "wt", encoding="utf-8")
1509
- metadata_document.write(xml_request.decode("utf-8"))
1510
- metadata_document.close()
1511
-
1512
- asset_metadata["http://www.preservica.com/tweets/v1"] = "metadata.xml"
1513
-
1514
- security_tag = kwargs.get("SecurityTag", "open")
1515
- asset_title = kwargs.get("Title", text)
1516
- asset_description = kwargs.get("Description", full_text)
1517
-
1518
- p = complex_asset_package(preservation_files_list=content_objects, parent_folder=folder,
1519
- Title=asset_title,
1520
- Description=asset_description, CustomType="Tweet", Identifiers=identifiers,
1521
- Asset_Metadata=asset_metadata, SecurityTag=security_tag)
1522
- self.upload_zip_package(p, folder=folder, callback=callback)
1523
- for ob in content_objects:
1524
- os.remove(ob)
1525
- os.remove("metadata.xml")
1526
- sleep(2)
1527
1164
 
1528
1165
  def ingest_web_video(self, url=None, parent_folder=None, **kwargs):
1529
1166
  """
1530
1167
  Ingest a web video such as YouTube etc based on the URL
1531
1168
 
1532
- :param str url: URL to the youtube video
1169
+ :param str url: URL to the YouTube video
1533
1170
  :param Folder parent_folder: The folder to ingest the video into
1534
1171
  :param str Title: Optional asset title
1535
1172
  :param str Description: Optional asset description
@@ -1627,6 +1264,52 @@ class UploadAPI(AuthenticatedAPI):
1627
1264
  logger.error(exception)
1628
1265
  raise exception
1629
1266
 
1267
+ def clean_upload_bucket(self, bucket_name: str, older_than_days: int = 90):
1268
+ """
1269
+ Clean up objects in an upload bucket which are older than older_than_days.
1270
+
1271
+ """
1272
+ from azure.storage.blob import ContainerClient
1273
+
1274
+ for location in self.upload_locations():
1275
+ if location['containerName'] == bucket_name:
1276
+
1277
+ if location['type'] != 'AWS':
1278
+ credentials = self.upload_credentials(location['apiId'])
1279
+ account_key = credentials['key']
1280
+ session_token = credentials['sessionToken']
1281
+ sas_url = f"https://{account_key}.blob.core.windows.net/{bucket_name}"
1282
+ container = ContainerClient.from_container_url(container_url=sas_url, credential=session_token)
1283
+ now = datetime.now(timezone.utc)
1284
+ for blob in container.list_blobs():
1285
+ if abs((blob.last_modified - now).days) > older_than_days:
1286
+ logger.debug(f"Deleting expired object {blob.name}")
1287
+ container.delete_blob(blob.name)
1288
+
1289
+ if location['type'] == 'AWS':
1290
+ credentials = self.upload_credentials(location['apiId'])
1291
+ access_key = credentials['key']
1292
+ secret_key = credentials['secret']
1293
+ session_token = credentials['sessionToken']
1294
+ session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key,
1295
+ aws_session_token=session_token)
1296
+ s3_client = session.client("s3")
1297
+ paginator = s3_client.get_paginator('list_objects_v2')
1298
+ now = datetime.now(timezone.utc)
1299
+ for page in paginator.paginate(Bucket=bucket_name):
1300
+ if 'Contents' in page:
1301
+ for key in page['Contents']:
1302
+ last_modified = key['LastModified']
1303
+ if abs((last_modified - now).days) > older_than_days:
1304
+ logger.debug(f"Deleting expired object {key['Key']}")
1305
+ s3_client.delete_object(Bucket=bucket_name, Key=key['Key'])
1306
+
1307
+
1308
+
1309
+
1310
+
1311
+
1312
+
1630
1313
  def upload_locations(self):
1631
1314
  """
1632
1315
  Upload locations are configured on the Sources page as 'SIP Upload'.
@@ -1659,30 +1342,52 @@ class UploadAPI(AuthenticatedAPI):
1659
1342
  security_tag: str = "open",
1660
1343
  delete_after_upload: bool = True, max_MB_ingested: int = -1):
1661
1344
 
1345
+ from pyPreservica import EntityAPI
1346
+
1347
+ def entity_value(client: EntityAPI, identifier: str) -> Entity:
1348
+ back_off: int = 5
1349
+ while True:
1350
+ try:
1351
+ entities = client.identifier("code", identifier)
1352
+ if bool(len(entities) > 0):
1353
+ return entities.pop()
1354
+ else:
1355
+ return None
1356
+ except HTTPException as e:
1357
+ sleep(back_off)
1358
+ back_off = back_off * 2
1359
+
1360
+ def entity_exists(client: EntityAPI, identifier: str) -> bool:
1361
+ back_off: int = 5
1362
+ while True:
1363
+ try:
1364
+ entities = client.identifier("code", identifier)
1365
+ return bool(len(entities) > 0)
1366
+ except HTTPException as e:
1367
+ sleep(back_off)
1368
+ back_off = back_off * 2
1369
+
1662
1370
  def get_parent(client, identifier, parent_reference):
1663
- id = str(os.path.dirname(identifier))
1664
- if not id:
1665
- id = identifier
1666
- entities = client.identifier("code", id)
1667
- if len(entities) > 0:
1668
- folder = entities.pop()
1371
+ dirname_id: str = str(os.path.dirname(identifier))
1372
+ if not dirname_id:
1373
+ dirname_id = identifier
1374
+ folder = entity_value(client, dirname_id)
1375
+ if folder is not None:
1669
1376
  folder = client.folder(folder.reference)
1670
1377
  return folder.reference
1671
1378
  else:
1672
1379
  return parent_reference
1673
1380
 
1674
1381
  def get_folder(client, name, tag, parent_reference, identifier):
1675
- entities = client.identifier("code", identifier)
1676
- if len(entities) == 0:
1382
+ folder = entity_value(client, identifier)
1383
+ if folder is None:
1677
1384
  logger.info(f"Creating new folder with name {name}")
1678
1385
  folder = client.create_folder(name, name, tag, parent_reference)
1679
1386
  client.add_identifier(folder, "code", identifier)
1680
1387
  else:
1681
1388
  logger.info(f"Found existing folder with name {name}")
1682
- folder = entities.pop()
1683
1389
  return folder
1684
1390
 
1685
- from pyPreservica import EntityAPI
1686
1391
  entity_client = EntityAPI(username=self.username, password=self.password, server=self.server,
1687
1392
  tenant=self.tenant,
1688
1393
  two_fa_secret_key=self.two_fa_secret_key, use_shared_secret=self.shared_secret,
@@ -1712,7 +1417,7 @@ class UploadAPI(AuthenticatedAPI):
1712
1417
  files.remove(file)
1713
1418
  continue
1714
1419
  asset_code = os.path.join(code, file)
1715
- if len(entity_client.identifier("code", asset_code)) == 0:
1420
+ if not entity_exists(entity_client, asset_code):
1716
1421
  bytes_ingested = bytes_ingested + os.stat(full_path).st_size
1717
1422
  logger.info(f"Adding new file: {file} to package ready for upload")
1718
1423
  file_identifiers = {"code": asset_code}
@@ -1735,8 +1440,8 @@ class UploadAPI(AuthenticatedAPI):
1735
1440
  delete_after_upload=delete_after_upload)
1736
1441
  else:
1737
1442
  self.upload_zip_to_Source(path_to_zip_package=package, container_name=bucket_name,
1738
- show_progress= bool(progress_display is not None),
1739
- delete_after_upload=delete_after_upload)
1443
+ show_progress=bool(progress_display is not None),
1444
+ delete_after_upload=delete_after_upload)
1740
1445
 
1741
1446
  logger.info(f"Uploaded " + "{:.1f}".format(bytes_ingested / (1024 * 1024)) + " MB")
1742
1447
 
@@ -1910,9 +1615,42 @@ class UploadAPI(AuthenticatedAPI):
1910
1615
  endpoint = f'{self.protocol}://{self.server}/api/s3/buckets'
1911
1616
  self.token = self.__token__()
1912
1617
 
1913
- s3_client = boto3.client('s3', endpoint_url=endpoint, aws_access_key_id=self.token,
1914
- aws_secret_access_key="NOT_USED",
1915
- config=Config(s3={'addressing_style': 'path'}))
1618
+
1619
+ retries= {
1620
+ 'max_attempts': 5,
1621
+ 'mode': 'adaptive'
1622
+ }
1623
+
1624
+ def new_credentials():
1625
+ cred_metadata: dict = {}
1626
+ cred_metadata['access_key'] = self.__token__()
1627
+ cred_metadata['secret_key'] = "NOT_USED"
1628
+ cred_metadata['token'] = ""
1629
+ cred_metadata["expiry_time"] = (datetime.now(tzlocal()) + timedelta(minutes=12)).isoformat()
1630
+ logger.info("Refreshing credentials at: " + str(datetime.now(tzlocal())))
1631
+ return cred_metadata
1632
+
1633
+ session = get_session()
1634
+
1635
+ session_credentials = RefreshableCredentials.create_from_metadata(
1636
+ metadata=new_credentials(),
1637
+ refresh_using=new_credentials,
1638
+ advisory_timeout = 4 * 60,
1639
+ mandatory_timeout = 12 * 60,
1640
+ method = 'Preservica'
1641
+ )
1642
+
1643
+ autorefresh_session = boto3.Session(botocore_session=session)
1644
+
1645
+ session._credentials = session_credentials
1646
+
1647
+ config = Config(s3={'addressing_style': 'path'}, read_timeout=120, connect_timeout=120,
1648
+ request_checksum_calculation="WHEN_REQUIRED",
1649
+ response_checksum_validation="WHEN_REQUIRED",
1650
+ retries=retries, tcp_keepalive=True)
1651
+
1652
+
1653
+ s3_client = autorefresh_session.client('s3', endpoint_url=endpoint, config=config)
1916
1654
 
1917
1655
  metadata = {}
1918
1656
  if folder is not None:
@@ -1925,21 +1663,48 @@ class UploadAPI(AuthenticatedAPI):
1925
1663
  try:
1926
1664
  key_id = str(uuid.uuid4()) + ".zip"
1927
1665
 
1666
+
1667
+ # how big is the package
1668
+ package_size = os.path.getsize(path_to_zip_package)
1669
+ if package_size > 1 * GB:
1670
+ transfer_config.multipart_chunksize = 16 * MB ## Min 64 Chunks
1671
+ if package_size > 8 * GB:
1672
+ transfer_config.multipart_chunksize = 32 * MB ## Min 256 Chunks
1673
+ if package_size > 24 * GB:
1674
+ transfer_config.multipart_chunksize = 48 * MB ## Min 512 Chunks
1675
+ if package_size > 48 * GB:
1676
+ transfer_config.multipart_chunksize = 64 * MB
1677
+
1678
+ logger.info("Using Multipart Chunk Size: " + str(transfer_config.multipart_chunksize))
1679
+
1928
1680
  transfer = S3Transfer(client=s3_client, config=transfer_config)
1929
1681
 
1930
1682
  transfer.PutObjectTask = PutObjectTask
1931
1683
  transfer.CompleteMultipartUploadTask = CompleteMultipartUploadTask
1932
1684
  transfer.upload_file = upload_file
1933
1685
 
1934
- response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket, key=key_id,
1686
+
1687
+ response = transfer.upload_file(self=transfer, filename=path_to_zip_package, bucket=bucket,
1688
+ key=key_id,
1935
1689
  extra_args=metadata,
1936
1690
  callback=callback)
1937
1691
 
1692
+
1938
1693
  if delete_after_upload:
1939
1694
  os.remove(path_to_zip_package)
1940
1695
 
1941
1696
  return response['ResponseMetadata']['HTTPHeaders']['preservica-progress-token']
1942
1697
 
1943
- except ClientError as e:
1944
- logger.error(e)
1945
- raise e
1698
+ except (NoCredentialsError, PartialCredentialsError) as ex:
1699
+ logger.error(ex)
1700
+ raise ex
1701
+
1702
+ except ClientError as ex:
1703
+ logger.error(ex)
1704
+ raise ex
1705
+
1706
+
1707
+
1708
+
1709
+
1710
+