bdrc-util2 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,110 @@
1
+ """
2
+ SQLAlchemy version of clean up current.
3
+ Too risky, but it's at least a skeleton of an API for dip_activity
4
+ """
5
+
6
+ # Base classes for drs tables
7
+ import pprint
8
+
9
+ from BdrcDbLib.DbOrm.DrsContextBase import DrsDbContextBase
10
+ from BdrcDbLib.DbOrm.models.drs import Works
11
+
12
+ # Import sqlalchemy stuff
13
+ # TODO: Trim
14
+
15
+
16
+ from sqlalchemy.orm import declarative_base, relationship, backref, Mapped, mapped_column
17
+ from sqlalchemy import Column, DateTime, text, func, select
18
+ from sqlalchemy import ForeignKey, String, Integer
19
+ from sqlalchemy.dialects.mysql import (
20
+ BIGINT,
21
+ # BINARY,
22
+ # BIT,
23
+ # BLOB,
24
+ # BOOLEAN,
25
+ # CHAR,
26
+ # DATE,
27
+ # DATETIME,
28
+ # DECIMAL,
29
+ # DECIMAL,
30
+ # DOUBLE,
31
+ # ENUM,
32
+ # FLOAT,
33
+ INTEGER,
34
+ # LONGBLOB,
35
+ LONGTEXT,
36
+ # MEDIUMBLOB,
37
+ # MEDIUMINT,
38
+ # MEDIUMTEXT,
39
+ # NCHAR,
40
+ # NUMERIC,
41
+ # NVARCHAR,
42
+ # REAL,
43
+ # SET,
44
+ # SMALLINT,
45
+ # TEXT,
46
+ # TIME,
47
+ TIMESTAMP,
48
+ # TINYBLOB,
49
+ # TINYINT,
50
+ # TINYTEXT,
51
+ # VARBINARY,
52
+ # VARCHAR,
53
+ # YEAR,
54
+ )
55
+
56
+ from BdrcDbLib.DbOrm.models.drs import *
57
+
58
+ Base = declarative_base()
59
+
60
+
61
class DipActivityTypes(Base):
    """Declarative mapping for the ``dip_activity_types`` table."""
    __tablename__ = 'dip_activity_types'

    # Both attributes are flagged primary_key, so together they form the
    # mapper's identity key. The Python attribute names differ from the
    # database column names, which are given through ``name=``.
    id: Mapped[int] = mapped_column(name="iddip_activity_types", primary_key=True)
    label: Mapped[str] = mapped_column(name="dip_activity_types_label", primary_key=True)
65
+
66
+
67
class dac(TimestampMixin, Base):
    """
    Declarative mapping for ``dip_activity_current``.

    Two tables share this structure.
    """
    __tablename__ = "dip_activity_current"

    dip_activity_type_id = Column(Integer, ForeignKey(DipActivityTypes.id))
    dip_activity_start = Column(TIMESTAMP, nullable=True)
    dip_activity_finish = Column(TIMESTAMP, nullable=True)
    dip_activity_result_code = Column(INTEGER)
    dip_source_path = Column(String(255))
    dip_dest_path = Column(String(255))
    work_id = Column(INTEGER, ForeignKey(Works.workId))
    dip_external_id = Column(String(45), primary_key=True)
    dip_comment = Column(LONGTEXT)

    # Navigation to the rows referenced by the two foreign keys above.
    work = relationship(Works)
    activity = relationship(DipActivityTypes)

    def __repr__(self):
        # Same padding widths as a single f-string with :2, :10 and :15 specs.
        external_id = format(self.dip_external_id, "2")
        work_name = format(self.work.WorkName, "10")
        activity_label = format(self.activity.label, "15")
        return f"{external_id} - w:{work_name} a: {activity_label} fin:{self.dip_activity_finish}, dest:{self.dip_dest_path}"
87
+
88
+
89
+ #
90
+ # class dac(DipActivityBase):
91
+ # __tablename__ = 'dip_activity_current'
92
+
93
+
94
# Script body: list (activity type, work) groups that have more than one row
# in dip_activity_current, printing both the generated SQL and the rows.
with DrsDbContextBase() as ctx:
    # currents = ctx.session.query(dac).all()
    sq = (
        select(dac.dip_activity_finish, Works.WorkName, dac.dip_dest_path, DipActivityTypes.label,
               func.count(dac.work_id))
        .join(Works)
        .join(DipActivityTypes)
        .group_by(dac.dip_activity_type_id)
        .group_by(dac.work_id)
        .having(func.count(dac.work_id) > 1)
    )
    # .group_by(dac.dip_dest_path)

    # Printing the statement renders its SQL text for inspection.
    print(sq)
    querystmt = ctx.session.execute(sq)

    ares = querystmt.all()

    pprint.pprint(ares)
@@ -0,0 +1,553 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Replacement for scripts/glacier/DIP-pump-uploadWorkToGlacier.py
4
+
5
+ 2024 update:
6
+ archive-ops-187 wants to only upload all media for any image group that was updated in a given sync process (determined
7
+ by the input record's dip_external_id field).
8
+ At the same time, this program should have the flexibility to do a complete sync of all image groups and all other contents
9
+ of the work. The prior architecture of depending on structure in the 'comments' field is replaced by using an inventory of a sync
10
+ whose path is retrieved from the database.
11
+
12
+ The inventory field
13
+
14
+ As before, the structure is retained:
15
+ - each image group is inverted and uploaded into its own bag.zip
16
+ - everything else is assembled into its own bag.zip, which is named after the work_rid
17
+
18
+ ** Special credentials note: This module uses the [default] section of the host's AWS Credential file.
19
+ ** In the docker implementation, this is in a non-standard place (you don't add secrets to the image)
20
+ ** Docker applications are responsible for setting the AWS_SHARED_CREDENTIALS_FILE environment variable to point to the
21
+ ** docker container's AWS file. This can be done in the code, or in the docker-compose.yml file
22
+ ** (see ao-workflow/airflow-docker)
23
+ """
24
+ # import sys
25
+ # print(sys.path)
26
+ # import os
27
+ # print(os.environ['PYTHONPATH'])
28
+
29
+ import datetime
30
+ import logging
31
+ import re
32
+ from collections import namedtuple
33
+ from pathlib import Path
34
+ import sys
35
+ from tempfile import TemporaryDirectory
36
+ import traceback
37
+ from typing import Optional
38
+
39
+ from s3pathlib import S3Path
40
+
41
+ try:
42
+ import archive_ops.api as AoApi
43
+ except ModuleNotFoundError:
44
+ # For local debugging, add parent directory to sys.path
45
+ sys.path.append(str(Path(__file__).parent.parent))
46
+ import archive_ops.api as AoApi
47
+ from archive_ops.DeepArchiveParser import DeepArchiveParser, DeepArchiveArgs
48
+ from archive_ops.DipLog import DipLog
49
+ from archive_ops.InvertWork import get_igs_for_invert, get_media_splits, invert_image_group_media
50
+ from util_lib.AOLogger import AOLogger
51
+ from util_lib.utils import *
52
+
53
# First argument given to AoApi.get_archive_location when building the S3 parent key.
GLACIER_KEY_ROOT: str = 'Archive'
# Suffix appended to every uploaded bag's object name.
BAGZIP_SUFFIX: str = '.bag.zip'
# NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed.
INVENTORY_IMAGE_GROUP_PARENTS: list[str] = ['archive', 'images']


# ----------------- globals -------------------
# Both are populated by setup(); every function that logs reads them.
# noinspection PyTypeChecker
_log: Optional[AOLogger] = None
# noinspection PyTypeChecker
dip_logger: Optional[DipLog] = None

# This describes the fields in the results of GetDIPActivityCandidates.sql as
# written by get_works_for_activity
# Potential headers are:
# WorkName,path,create_time,update_time,dip_activity_type_id,
# dip_activity_start,dip_activity_finish,dip_activity_result_code,
# dip_source_path,dip_dest_path,work_id,dip_external_id,dip_comment
#
# Only the columns named in HEADERS are extracted by read_csv; the input CSV
# may carry the others, in any order.
HEADERS = ['WorkName', 'path', 'dip_comment', 'dip_external_id']
InputDirectiveRow = namedtuple('Todo', HEADERS)  # Create a named tuple type
72
+
73
+
74
+ # ----------------- /globals -------------------
75
+
76
def do_deep_archive_incremental(work_rid: str, archive_path: Path, inventory_path, bucket: str,
                                in_daemon: bool = False) -> int:
    """
    Upload only what one sync touched, as listed in that sync's inventory file.

    Every image group named in the inventory is inverted and uploaded in its
    own bag zip. The inventory entries that belong to no image group are
    bagged together, in a bag named after the inventory file.

    :param work_rid: tag for work
    :param archive_path: root of media sources
    :param inventory_path: abspath of inventory of files to deep archive
    :param bucket: S3 bucket to upload to
    :param in_daemon: True if running in a daemon or on docker
    :return: 0 if successful, throws exception if not
    """
    work_tag: dict = AOS3WorkTag(work_rid).extra_args_tag

    # The loose-file bag takes the inventory file's own name.
    archive_inventory_name: str = inventory_path.name

    # Split the inventory into image group names and the remaining entries.
    image_groups, non_image_groups = get_image_groups_from_inventory(inventory_path)

    # Only the image-group half of the split is used here: for an incremental
    # run the "everything else" set is the inventory's own list of files.
    media_with_image_group, _ = get_media_splits(archive_path, image_groups)

    # Arguments common to every bag produced by this run.
    shared_args = dict(archive_path=str(archive_path), work_rid=work_rid, work_tag=work_tag,
                       is_complete=False, bucket=bucket, in_daemon=in_daemon)

    # One inverted bag per image group.
    for ig, media_dirs in media_with_image_group.items():
        do_bag_upload(invert_context=(ig, media_dirs), **shared_args)

    # TODO: Copy the loose files into one parent, run do_bag_upload on that.
    # Problem to solve - do_bag_upload needs a distinct file name for an incremental bag
    do_bag_upload(as_is=non_image_groups, bag_file_name=archive_inventory_name, **shared_args)
    return 0
113
+
114
+
115
def do_deep_archive_complete(work_rid: str, archive_path: Path, image_groups: list[str], bucket: str,
                             in_daemon: bool = False) -> int:
    """
    Upload a whole work: one inverted bag per image group, plus one bag for
    everything that is not in an image group.

    :param work_rid: tag for work
    :param archive_path: data source
    :param image_groups: list of image groups to process
    :param bucket: S3 bucket to upload to
    :param in_daemon: True if running in a daemon or on docker
    :return: 0 if successful; exceptions raised by do_bag_upload propagate
    """
    # Get s3 archive home, under the key. See archive-ops/scripts/glacier/DIP-pump-uploadWorkToGlacier.sh
    work_tag: dict = AOS3WorkTag(work_rid).extra_args_tag

    # Divide the work into two sets: the image groups, which are inverted,
    # and the directories that are copied as is.
    media_with_image_group, non_image_groups = get_media_splits(archive_path, image_groups)

    # do_bag_upload declares archive_path as str and forwards it to the DIP
    # log, so convert here, exactly as do_deep_archive_incremental does.
    archive_path_str: str = str(archive_path)

    for ig, media_dirs in media_with_image_group.items():
        do_bag_upload(archive_path=archive_path_str, work_rid=work_rid, work_tag=work_tag,
                      invert_context=(ig, media_dirs), bucket=bucket, in_daemon=in_daemon)

    # Skip the "everything else" bag when there is nothing to put in it.
    if non_image_groups:
        do_bag_upload(archive_path=archive_path_str, work_rid=work_rid, work_tag=work_tag,
                      as_is=non_image_groups, bucket=bucket, in_daemon=in_daemon)

    return 0
144
+
145
+
146
def get_inventory_path(dip_external_id, db_conf: str) -> Optional[Path]:
    """
    Look up the inventory file recorded for one sync.

    :param dip_external_id: key matched against SyncInventory.dip_external_id
    :param db_conf: database configuration handed to DrsDbContextBase
    :return: the stored inventory path, or None when no row matches
    :raises sqlalchemy.exc.SQLAlchemyError: logged, then re-raised
    """
    from BdrcDbLib.DbOrm.DrsContextBase import DrsDbContextBase
    from archive_ops.models.drsmodel import SyncInventory
    from sqlalchemy import select
    from sqlalchemy.exc import SQLAlchemyError

    # TODO: Use connection from args
    try:
        with DrsDbContextBase(db_conf) as drs:
            # scalar_one_or_none raises if more than one row matches; let it.
            inventory_row = drs.get_session().execute(
                select(SyncInventory).where(SyncInventory.dip_external_id == dip_external_id)
            ).scalar_one_or_none()
            return Path(inventory_row.inventory_path) if inventory_row else None
    except SQLAlchemyError as e:
        _log.exception(e)
        raise e
169
+
170
+
171
def get_image_groups_from_inventory(inventory_path: Path) -> tuple[list[str], list[str]]:
    """
    Get the image groups and the non-image group lists from the inventory file.

    Each non-blank line of the inventory is a '/'-separated path. The third
    path component is a candidate image group name; it is accepted when it
    looks like ``<Work>-Isomething`` or ``<Work>-4digits``.

    :param inventory_path: file holding one path per line
    :return: (image group names in order of first appearance,
              inventory lines that contain none of those names)
    """
    with open(inventory_path, 'r') as inventory:
        # Blank lines (e.g. a trailing newline) carry no path and are dropped.
        lines = [stripped for stripped in (raw.strip() for raw in inventory) if stripped]

    # Paths with fewer than three components cannot name an image group, so
    # they are skipped here instead of raising IndexError. dict.fromkeys
    # de-duplicates while keeping a deterministic order.
    candidates = dict.fromkeys(
        parts[2] for parts in (line.split('/') for line in lines) if len(parts) > 2
    )

    # filter in Only subdirs that begin with Workrid-Isomething or Workrid-4digits
    ig_pattern = re.compile(r"W\w+-(\d{4}|I\w+)")
    image_groups: list[str] = [name for name in candidates if ig_pattern.fullmatch(name)]

    # Now get everything that does not have any image group in its path - meta
    non_ig_elements: list[str] = [
        line for line in lines if not any(ig in line for ig in image_groups)
    ]

    return image_groups, non_ig_elements
189
+
190
+
191
+ # write a function that takes a file path and copies it to a new location, preserving the directories in the input path
192
def copy_file_to_new_location(input_parent: Path, input_file_path: Path, new_location: Path):
    """
    Copy one file under a new root, re-creating its relative directories there.

    Ex copy_file_to_new_location(Path('a/b'), Path('c/d/e/f.txt'), Path('x/y/z'))
    copies a/b/c/d/e/f.txt to x/y/z/c/d/e/f.txt

    :param input_parent: directory that input_file_path is relative to
    :param input_file_path: file to copy, relative to input_parent
    :param new_location: destination root; missing directories are created
    """
    from shutil import copy as copy_file

    source: Path = input_parent / input_file_path
    # The file's own relative directory chain is reproduced below the new root.
    target_dir: Path = new_location / input_file_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    copy_file(source, target_dir)
205
+
206
+
207
def do_bag_upload(archive_path: str,
                  work_rid: str,
                  work_tag: dict,
                  bucket: str,
                  invert_context: Optional[tuple] = None,
                  as_is: Optional[list[str]] = None,
                  is_complete: bool = True,
                  bag_file_name: Optional[str] = None,
                  in_daemon: bool = False) -> None:
    """
    create a bag.zip for an image group or a list of files, and upload it to the archive

    The content is staged in a temporary directory, bagged, and every file the
    bagger produced is uploaded under the work's archive location in `bucket`.
    A DIP log entry is opened before the work starts and closed afterwards
    with result code 0 on success and 1 on failure.

    :param archive_path: Parent of the source
    :param work_rid: Work name
    :param work_tag: tagging arguments merged into each upload's ExtraArgs
    :param bucket: S3 bucket to upload to
    :param invert_context: (image group name, media directories) to invert;
        when given, it takes precedence over as_is
    :param as_is: directories (is_complete=True) or individual files
        (is_complete=False) to place in the bag unchanged
    :param is_complete: True if bagging a complete work, False if an incremental sync
    :param bag_file_name: name of the bag file to create; defaults to work_rid
    :param in_daemon: True if running in a daemon or on docker
    :return: None
    :raises Exception: any failure is logged, reported through complain(), and re-raised
    """
    # Imported here so the os.walk / os.path calls below do not depend on a
    # wildcard import happening to provide the name.
    import os
    import shutil
    from bag.bag_ops import bag

    if invert_context:
        dest_name, media_with_image_group = invert_context
    else:
        dest_name = bag_file_name if bag_file_name else work_rid

    s3_parent: S3Path = S3Path(bucket, AoApi.get_archive_location(GLACIER_KEY_ROOT, work_rid))
    exit_e: Optional[Exception] = None
    # Make a temporary path for the output:
    with TemporaryDirectory() as out_buffer:
        dip_id: str = ""
        failed_item_message: str = ""
        had_fail: bool = False
        # Bound before the try so the failure message can always format it,
        # even when the failure happens before the staging path is computed.
        dest_path: Optional[Path] = None
        ig_dest_path: S3Path = s3_parent / f"{dest_name}{BAGZIP_SUFFIX}"
        try:
            dip_id = open_log_dip(work_rid, archive_path, ig_dest_path.arn)
            out_path = Path(out_buffer)
            dest_path = out_path / dest_name
            dest_path.mkdir(parents=True, exist_ok=True)
            bag_path: Path = out_path / "bag" / dest_name
            bag_path.mkdir(parents=True, exist_ok=True)
            bp_str = str(bag_path)

            # Invert the image group into the temp directory's work_folder
            if invert_context:
                invert_image_group_media(dest_path, media_with_image_group, dest_name)
            elif is_complete:
                # as_is is a list of directories
                for dir_name in as_is:
                    complete_sub_path = dest_path / children_of(dir_name, work_rid)
                    complete_sub_path.mkdir(parents=True, exist_ok=True)
                    shutil.copytree(dir_name, complete_sub_path, dirs_exist_ok=True)
            else:
                # as_is is a list of files, resolved against the archive path's parent
                archive_parent: Path = Path(archive_path).parent
                for file_name in as_is:
                    copy_file_to_new_location(archive_parent, Path(file_name), dest_path)
            bag(str(dest_path), bp_str, False, in_daemon, False)

            # Upload the inversion(s) to the archive. In this workflow, there should only be one
            for root, dirs, files in os.walk(bag_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Handle subdirs by removing the top of tree
                    s3_object_name = file_path.replace(bp_str, "", 1).lstrip(os.sep)
                    s3_target: S3Path = s3_parent / s3_object_name
                    upload_file_to_s3_with_storage_class(file_path, s3_target.bucket, s3_target.key, 'STANDARD_IA',
                                                         work_tag)
        except Exception as e:
            # Record the failure so the DIP log entry is closed with rc 1, not 0.
            had_fail = True
            failed_item_message = f"failed deep_archive {work_rid=} {dest_path=} {e=}"
            _log.exception(e)
            exit_e = e
        finally:
            # Only close a log entry that was actually opened.
            if dip_id:
                update_log_dip(dip_id, 1 if had_fail else 0, failed_item_message)
    if exit_e:
        complain(f"{work_rid=}, {failed_item_message=}", 1, "do_deep_archive_complete")
        raise exit_e
291
+
292
+
293
def children_of(anchor: str, a_path: str) -> Path:
    """
    Return the part of a path that follows a given path component.

    :param anchor: the full path to cut
    :param a_path: the component to cut after (first occurrence is used)
    :return: the components of `anchor` after `a_path`, as a relative Path
    :raises ValueError: when `a_path` is not a component of `anchor`
    """
    components = Path(anchor).parts
    # Everything up to and including the first matching component is dropped.
    first_child = components.index(a_path) + 1
    return Path(*components[first_child:])
305
+
306
+
307
def upload_file_to_s3_with_storage_class(file_name, bucket, key=None, storage_class='STANDARD',
                                         tag_set: Optional[dict] = None):
    """Upload a file to an S3 bucket with a specific storage class

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param key: S3 object name. If not specified then file_name is used
    :param storage_class: Storage class to use for the object
    :param tag_set: optional mapping merged into the upload's ExtraArgs
        (callers pass AOS3WorkTag(...).extra_args_tag)
    :return: True if file was uploaded
    :raises botocore.exceptions.ClientError: logged, reported through
        complain(), then re-raised
    """
    import boto3
    from botocore.exceptions import ClientError

    # If S3 object_name was not specified, use file_name
    if key is None:
        key = file_name

    # handy: Can set tagging here:
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object_tagging
    extra_args: dict = {'StorageClass': storage_class}
    if tag_set:
        extra_args.update(tag_set)

    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args)
    except ClientError as e:
        # NOTE(review): boto3's upload_file is documented to surface transfer
        # failures as S3UploadFailedError; confirm whether that should be
        # reported here too. Such errors currently propagate to the caller.
        _log.exception(e)
        complain(f"S3 upload {file_name=}, {bucket=}", 1, "upload_file_to_S3")
        raise
    return True
342
+
343
+
344
def setup(args: DeepArchiveArgs):
    """
    Create the module-level logger and DIP log on first use.

    Either resource that already exists is left untouched, so calling this
    more than once is harmless.

    :param args: parsed arguments; log_root, log_level and drsDbConfig are
        read, and drsDbConfig may be rewritten (see below)
    :return: sets logger and dip_log context
    :raises ValueError: when drsDbConfig is not of the form section:configFileName
    """
    import os
    global _log
    global dip_logger

    if not _log:
        os.makedirs(args.log_root, exist_ok=True)
        _log = AOLogger("Deep_Archive", args.log_level, Path(args.log_root), extra_quiet_loggers=['bagit'])

    if dip_logger:
        return

    # Need different path under docker
    db_cfg: [str] = args.drsDbConfig.split(':')

    # Still need to check, in case not run through argparse (i.e. db_config manually populated)
    if len(db_cfg) < 2:
        raise ValueError(f"Invalid db config {args.drsDbConfig} requires section:configFileName")

    # Adjust for running under docker - should be part of DbApps
    if os.path.exists('/run/secrets'):
        db_cfg[1] = '/run/secrets/db_apps'
    args.drsDbConfig = ':'.join(db_cfg)
    dip_logger = DipLog(args.drsDbConfig)
370
+
371
+
372
def displayed_comment(displayed, display_length: int = 108, illus_len: int = 20) -> str:
    """
    Abbreviate a long string to its first and last characters joined by "...".

    :param displayed: text to show
    :param display_length: texts longer than this are abbreviated
    :param illus_len: number of characters kept from each end
    :return: `displayed` unchanged when it fits, otherwise head...tail
    """
    if len(displayed) <= display_length:
        return displayed
    head, tail = displayed[:illus_len], displayed[-illus_len:]
    return f"{head}...{tail}"
382
+
383
+
384
def update_log_dip(dip_log_id: str,
                   rc: int,
                   comment: str,
                   src_path: Optional[str] = None,
                   dest_path: Optional[str] = None,
                   ):
    """
    Closes a dip log entry

    :param dip_log_id: Key to locate entry
    :param rc: activity return code for log
    :param comment: goes into database, with the log file name appended
    :param src_path: source path (shouldn't usually update)
    :param dest_path: output path
    :return: whatever dip_logger.set_dip returns
    """
    # The application log gets an abbreviated comment; the database gets all of it.
    log_comment = displayed_comment(comment)
    _log.info(f'closing :{dip_log_id=}\trc:{rc=} {log_comment=}')

    closing_fields = dict(
        # These are table PKs - you can't update them
        activity_name=None,
        begin_t=None,
        work_name=None,
        # this is the identifying key
        dip_id=dip_log_id,
        # Tell the truth now
        end_t=datetime.datetime.now(),
        # The rest of these are optional
        s_path=src_path,
        d_path=dest_path,
        ac_result=rc,
        comment=f"{comment} log file {_log.log_file_name}",
        inventory=None,
    )
    return dip_logger.set_dip(**closing_fields)
417
+
418
+
419
def open_log_dip(work_rid: str, src_path: str, aws_object_path: Optional[str] = None) -> str:
    """
    Opens a dip log entry

    :param work_rid: work the activity is recorded against
    :param src_path: source path stored in the entry
    :param aws_object_path: destination path stored in the entry
    :return: dip_log_id
    """
    _log.info(f'opening :{work_rid=}\t{src_path=}\t{aws_object_path=}')

    # set_dip has no optional args, so every field is spelled out; the ones
    # that are only known when the activity finishes stay None.
    opening_fields = dict(
        activity_name='DEEP_ARCHIVE',
        begin_t=datetime.datetime.now(),
        end_t=None,
        s_path=src_path,
        d_path=aws_object_path,
        dip_id=None,
        work_name=work_rid,
        ac_result=None,
        comment=None,
        inventory=None,
    )
    return dip_logger.set_dip(**opening_fields)
442
+
443
+
444
+ # send a message to an AWS SNS topic
445
def send_sns(subject: str, message_str):
    """
    Publish a message to the SNS topic named by AO_AWS_SNS_TOPIC_ARN.

    When that environment variable is unset or empty nothing is published.
    Either way the message is written to the application log, prefixed with
    "[sns]" or "[log]" to show which happened.

    :param subject: SNS subject line
    :param message_str: message body
    :return: None
    """
    import os
    topic: str = os.getenv('AO_AWS_SNS_TOPIC_ARN')
    channel = "[log]"
    if topic:
        # Usually configured for default
        import boto3
        boto3.client('sns').publish(TopicArn=topic, Message=message_str,
                                    Subject=subject)
        channel = "[sns]"
    _log.info(f'{channel} {subject}, {message_str}')
458
+
459
+
460
def complain(object_tag: str, rc: int, operation_tag: str, detail: str = None):
    """
    Send the standard "Glacier Upload Failure Report" notification.

    :param object_tag: identifies what could not be uploaded
    :param rc: exit code reported in the message
    :param operation_tag: name of the operation that failed
    :param detail: optional extra text appended to the exit-code line
    """
    if detail:
        d4_fstring = f"with {detail=}"
    else:
        d4_fstring = ''
    sns_fails_message_string = f"""
    The following work could not be uploaded to Glacier:
    {object_tag}
    .
    {operation_tag} returned with exit code {rc} {d4_fstring}.

    See log file {_log.log_file_name} for details.
    """

    send_sns("Glacier Upload Failure Report", sns_fails_message_string)
472
+
473
+
474
def get_header_indices(header: [str], columns: [str]) -> ():
    """
    Locate each wanted column in a header row.

    :param header: the header row
    :param columns: column names to find
    :return: tuple of positions in `header`, in the order of `columns`
    :raises ValueError: when a wanted column is not in the header
    """
    return tuple(header.index(wanted) for wanted in columns)
476
+
477
+
478
def read_csv(csv_path: Path) -> [InputDirectiveRow]:
    """
    Map a csv file into a list of named tuples.

    The first row is the header; only the columns listed in HEADERS are
    extracted, wherever they appear. Note if the dip_comment field is used,
    the caller is responsible for escaping commas.

    :param csv_path: file to read
    :return: one InputDirectiveRow per data row; empty when the file is empty
    :raises ValueError: when the header lacks one of the HEADERS columns
    """
    import csv
    records: [InputDirectiveRow] = []
    # newline='' lets the csv module do its own newline handling, which it
    # needs in order to read quoted fields containing line breaks correctly.
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        try:
            # The first row names the columns; it is consumed, not returned.
            header = next(reader)

            h_map = get_header_indices(header, HEADERS)
            records = [InputDirectiveRow(*(row[i] for i in h_map)) for row in reader]
            # noinspection PyTypeChecker
            _log.debug(records)
        except StopIteration:
            # next() found no header row at all.
            _log.info(f"Empty file {csv_path}")

    return records
499
+
500
+
501
def deep_archive_shell():
    """
    Command line interface

    Reads the input csv and deep-archives each listed work, either completely
    or incrementally. An incremental request falls back to a complete run for
    any single record whose sync inventory cannot be found. The first failure
    is logged, recorded in the DIP log, reported through complain(), and
    re-raised, which stops the run.

    :return: None
    """
    da_parser: DeepArchiveParser = DeepArchiveParser(usage="%(prog)s -i input_file",
                                                     description="Uploads a series of inverted zip files to backup "
                                                                 "bucket", )

    args: DeepArchiveArgs = da_parser.parsedArgs

    setup(args)
    _log.info(f"Arguments: {str(args)}")

    records: [InputDirectiveRow] = read_csv(args.input_file)
    for record in records:
        inventory_path: Optional[Path] = None
        try:
            # The mode is decided per record. The parsed arguments are left
            # untouched so that one record without an inventory does not force
            # every later record into a complete run.
            run_complete: bool = bool(args.complete)
            if args.incremental:
                # Get the inventory path from the database. If none, do a complete deep archive
                inventory_path = get_inventory_path(record.dip_external_id, args.drsDbConfig)
                if not inventory_path or not inventory_path.exists():
                    run_complete = True
                    _log.warn(
                        f"No inventory found for {record.WorkName} archive record: {record.dip_external_id[:6]}... Running complete")

            if run_complete:
                image_group_list = get_igs_for_invert(record.WorkName)
                do_deep_archive_complete(record.WorkName, Path(record.path), image_group_list, args.bucket,
                                         args.in_daemon)
            else:  # incremental, with a usable inventory
                do_deep_archive_incremental(record.WorkName, Path(record.path), inventory_path, args.bucket,
                                            args.in_daemon)

            _log.info(f"Processing {record}")
        except Exception as e:
            # Record the failure in its own DIP log entry, then notify and stop.
            dip_id = open_log_dip(record.WorkName, record.path)
            error_string: str = f"Failed to process {record=} {dip_id=} Exception: " + f"{traceback.format_exc() if _log.py_logger.isEnabledFor(logging.DEBUG) else e}"
            _log.error(error_string)
            update_log_dip(dip_id, 1, error_string)
            complain(record.WorkName, 1, "deep_archive_shell", error_string)
            raise e
550
+
551
+
552
# Run the command line interface only when executed as a script, not on import.
if __name__ == '__main__':
    deep_archive_shell()