bdrc-util2 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- archive_ops/CleanCurrent.py +110 -0
- archive_ops/DeepArchive.py +553 -0
- archive_ops/DeepArchiveParser.py +75 -0
- archive_ops/DeepArchiveSingleZip.py +21 -0
- archive_ops/DipLog.py +121 -0
- archive_ops/DipLogParser.py +138 -0
- archive_ops/GetReadyWorksForStates.py +128 -0
- archive_ops/InvertWork.py +255 -0
- archive_ops/InvertWorkException.py +9 -0
- archive_ops/Resolvers.py +11 -0
- archive_ops/SaveWorkFacts.py +97 -0
- archive_ops/TestGetReadyWorksForStates.py +160 -0
- archive_ops/__init__.py +0 -0
- archive_ops/api.py +62 -0
- archive_ops/locators.py +111 -0
- archive_ops/models/__init__.py +1 -0
- archive_ops/models/drsmodel.py +755 -0
- archive_ops/models/drsmodelSqlA20.py +1029 -0
- archive_ops/shell_ws.py +77 -0
- bdrc_util2-1.1.0.dist-info/METADATA +380 -0
- bdrc_util2-1.1.0.dist-info/RECORD +30 -0
- bdrc_util2-1.1.0.dist-info/WHEEL +5 -0
- bdrc_util2-1.1.0.dist-info/entry_points.txt +11 -0
- bdrc_util2-1.1.0.dist-info/licenses/LICENSE +21 -0
- bdrc_util2-1.1.0.dist-info/top_level.txt +2 -0
- util_lib/AOLogger.py +161 -0
- util_lib/GetFromBUDA.py +204 -0
- util_lib/__init__.py +1 -0
- util_lib/utils.py +202 -0
- util_lib/version.py +12 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQLAlchemy version of clean up current.
|
|
3
|
+
Too risky, but it's at least a skeleton of an API for dip_activity
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# Base classes for drs tables
|
|
7
|
+
import pprint
|
|
8
|
+
|
|
9
|
+
from BdrcDbLib.DbOrm.DrsContextBase import DrsDbContextBase
|
|
10
|
+
from BdrcDbLib.DbOrm.models.drs import Works
|
|
11
|
+
|
|
12
|
+
# Import sqlalchemy stuff
|
|
13
|
+
# TODO: Trim
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from sqlalchemy.orm import declarative_base, relationship, backref, Mapped, mapped_column
|
|
17
|
+
from sqlalchemy import Column, DateTime, text, func, select
|
|
18
|
+
from sqlalchemy import ForeignKey, String, Integer
|
|
19
|
+
from sqlalchemy.dialects.mysql import (
|
|
20
|
+
BIGINT,
|
|
21
|
+
# BINARY,
|
|
22
|
+
# BIT,
|
|
23
|
+
# BLOB,
|
|
24
|
+
# BOOLEAN,
|
|
25
|
+
# CHAR,
|
|
26
|
+
# DATE,
|
|
27
|
+
# DATETIME,
|
|
28
|
+
# DECIMAL,
|
|
29
|
+
# DECIMAL,
|
|
30
|
+
# DOUBLE,
|
|
31
|
+
# ENUM,
|
|
32
|
+
# FLOAT,
|
|
33
|
+
INTEGER,
|
|
34
|
+
# LONGBLOB,
|
|
35
|
+
LONGTEXT,
|
|
36
|
+
# MEDIUMBLOB,
|
|
37
|
+
# MEDIUMINT,
|
|
38
|
+
# MEDIUMTEXT,
|
|
39
|
+
# NCHAR,
|
|
40
|
+
# NUMERIC,
|
|
41
|
+
# NVARCHAR,
|
|
42
|
+
# REAL,
|
|
43
|
+
# SET,
|
|
44
|
+
# SMALLINT,
|
|
45
|
+
# TEXT,
|
|
46
|
+
# TIME,
|
|
47
|
+
TIMESTAMP,
|
|
48
|
+
# TINYBLOB,
|
|
49
|
+
# TINYINT,
|
|
50
|
+
# TINYTEXT,
|
|
51
|
+
# VARBINARY,
|
|
52
|
+
# VARCHAR,
|
|
53
|
+
# YEAR,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
from BdrcDbLib.DbOrm.models.drs import *
|
|
57
|
+
|
|
58
|
+
Base = declarative_base()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DipActivityTypes(Base):
    """ORM mapping for the ``dip_activity_types`` table."""
    __tablename__ = 'dip_activity_types'

    # Both columns are flagged primary_key=True, so the mapped key is composite.
    id: Mapped[int] = mapped_column(name="iddip_activity_types", primary_key=True)
    label: Mapped[str] = mapped_column(name="dip_activity_types_label", primary_key=True)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class dac(TimestampMixin, Base):
    """
    ORM mapping for the ``dip_activity_current`` table.

    Two tables share this structure
    """
    __tablename__ = "dip_activity_current"

    dip_activity_type_id = Column(Integer, ForeignKey(DipActivityTypes.id))
    dip_activity_start = Column(TIMESTAMP, nullable=True)
    dip_activity_finish = Column(TIMESTAMP, nullable=True)
    dip_activity_result_code = Column(INTEGER)
    dip_source_path = Column(String(255))
    dip_dest_path = Column(String(255))
    work_id = Column(INTEGER, ForeignKey(Works.workId))
    dip_external_id = Column(String(45), primary_key=True)
    dip_comment = Column(LONGTEXT)

    work = relationship(Works)
    activity = relationship(DipActivityTypes)

    def __repr__(self):
        """
        Return a one-line summary of the record.

        A ``__repr__`` must not raise: the related work/activity may be unset
        and any displayed field may be None, neither of which accepts a width
        format spec, so those values fall back to an empty string.
        """
        external_id = self.dip_external_id or ""
        work_name = (self.work.WorkName if self.work is not None else None) or ""
        activity_label = (self.activity.label if self.activity is not None else None) or ""
        return (f"{external_id:2} - w:{work_name:10} a: {activity_label:15} "
                f"fin:{self.dip_activity_finish}, dest:{self.dip_dest_path}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
#
|
|
90
|
+
# class dac(DipActivityBase):
|
|
91
|
+
# __tablename__ = 'dip_activity_current'
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Report rows grouped by (activity type, work) that occur more than once
# in dip_activity_current.
with DrsDbContextBase() as ctx:
    # currents = ctx.session.query(dac).all()
    sq = (
        select(
            dac.dip_activity_finish,
            Works.WorkName,
            dac.dip_dest_path,
            DipActivityTypes.label,
            func.count(dac.work_id),
        )
        .join(Works)
        .join(DipActivityTypes)
        .group_by(dac.dip_activity_type_id)
        .group_by(dac.work_id)
        .having(func.count(dac.work_id) > 1)
    )
    # .group_by(dac.dip_dest_path)

    # Show the generated statement before running it
    print(sq)
    querystmt = ctx.session.execute(sq)

    ares = querystmt.all()

    pprint.pprint(ares)
|
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Replacement for scripts/glacier/DIP-pump-uploadWorkToGlacier.py
|
|
4
|
+
|
|
5
|
+
2024 update:
|
|
6
|
+
archive-ops-187 wants to only upload all media for any image group that was updated in a given sync process (determined
|
|
7
|
+
by the input record's dip_external_id field).
|
|
8
|
+
At the same time, this program should have the flexibility to do a complete sync of all image groups and all other contents
|
|
9
|
+
of the work. The prior architecture of depending on structure in the 'comments' field is replaced by using an inventory of a sync
|
|
10
|
+
whose path is retrieved from the database.
|
|
11
|
+
|
|
12
|
+
The inventory field
|
|
13
|
+
|
|
14
|
+
As before, the structure is retained:
|
|
15
|
+
- each image group is inverted and uploaded into its own bag.zip
|
|
16
|
+
- everything else is assembled into its own bag.zip, which is named after the work_rid
|
|
17
|
+
|
|
18
|
+
** Special credentials note: This module uses the [default] section of the host's AWS Credential file.
|
|
19
|
+
** In the docker implementation, this is in a non-standard place (you don't add secrets to the image)
|
|
20
|
+
** Docker applications are responsible for setting the AWS_SHARED_CREDENTIALS_FILE environment variable to point to the
|
|
21
|
+
** docker container's AWS file. This can be done in the code, or in the docker-compose.yml file
|
|
22
|
+
** (see ao-workflow/airflow-docker)
|
|
23
|
+
"""
|
|
24
|
+
# import sys
|
|
25
|
+
# print(sys.path)
|
|
26
|
+
# import os
|
|
27
|
+
# print(os.environ['PYTHONPATH'])
|
|
28
|
+
|
|
29
|
+
import datetime
|
|
30
|
+
import logging
|
|
31
|
+
import re
|
|
32
|
+
from collections import namedtuple
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
import sys
|
|
35
|
+
from tempfile import TemporaryDirectory
|
|
36
|
+
import traceback
|
|
37
|
+
from typing import Optional
|
|
38
|
+
|
|
39
|
+
from s3pathlib import S3Path
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
import archive_ops.api as AoApi
|
|
43
|
+
except ModuleNotFoundError:
|
|
44
|
+
# For local debugging, add parent directory to sys.path
|
|
45
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
46
|
+
import archive_ops.api as AoApi
|
|
47
|
+
from archive_ops.DeepArchiveParser import DeepArchiveParser, DeepArchiveArgs
|
|
48
|
+
from archive_ops.DipLog import DipLog
|
|
49
|
+
from archive_ops.InvertWork import get_igs_for_invert, get_media_splits, invert_image_group_media
|
|
50
|
+
from util_lib.AOLogger import AOLogger
|
|
51
|
+
from util_lib.utils import *
|
|
52
|
+
|
|
53
|
+
GLACIER_KEY_ROOT: str = 'Archive'
|
|
54
|
+
BAGZIP_SUFFIX: str = '.bag.zip'
|
|
55
|
+
INVENTORY_IMAGE_GROUP_PARENTS: list[str] = ['archive', 'images']
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ----------------- globals -------------------
|
|
59
|
+
# noinspection PyTypeChecker
|
|
60
|
+
_log: Optional[AOLogger] = None
|
|
61
|
+
# noinspection PyTypeChecker
|
|
62
|
+
dip_logger: Optional[DipLog] = None
|
|
63
|
+
|
|
64
|
+
# This describes the fields in the results of GetDIPActivityCandidates.sql as
|
|
65
|
+
# written by get_works_for_activity
|
|
66
|
+
# Potential headers are:
|
|
67
|
+
# WorkName,path,create_time,update_time,dip_activity_type_id,
|
|
68
|
+
# dip_activity_start,dip_activity_finish,dip_activity_result_code,
|
|
69
|
+
# dip_source_path,dip_dest_path,work_id,dip_external_id,dip_comment
|
|
70
|
+
HEADERS = ['WorkName', 'path', 'dip_comment', 'dip_external_id']
|
|
71
|
+
InputDirectiveRow = namedtuple('Todo', HEADERS) # Create a named tuple type
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ----------------- /globals -------------------
|
|
75
|
+
|
|
76
|
+
def do_deep_archive_incremental(work_rid: str, archive_path: Path, inventory_path, bucket: str,
                                in_daemon: bool = False) -> int:
    """
    creates bag zips for all media in any image group mentioned in the manifest whose path
    is referenced in the sync_inventory table by the dip_external_id given.

    Each image group found in the inventory is inverted and uploaded in its own bag.
    The remaining inventory entries (if any) are uploaded together in one bag
    named after the inventory file.

    :param work_rid: tag for work
    :param archive_path: root of media sources
    :param inventory_path: abspath of inventory of files to deep archive
    :param bucket: S3 bucket to upload to
    :param in_daemon: True if running in a daemon or on docker
    :return: 0 if successful, throws exception if not
    """
    work_tag: dict = AOS3WorkTag(work_rid).extra_args_tag

    # The name of the bag zip is the same as the inventory file
    archive_inventory_name: str = inventory_path.name

    # Get the files grouped by image groups, non-image groups from the inventory
    image_groups, non_image_groups = get_image_groups_from_inventory(inventory_path)

    # Get the media splits. For incremental upload, we're disregarding the rest,
    # and using the list of files in non_image_groups, which is the specific content
    # that was sync'd this sync
    media_with_image_group, _ = get_media_splits(archive_path, image_groups)

    # Invert and upload the image groups
    for ig, media_dirs in media_with_image_group.items():
        do_bag_upload(archive_path=str(archive_path), work_rid=work_rid, work_tag=work_tag,
                      invert_context=(ig, media_dirs),
                      is_complete=False, bucket=bucket, in_daemon=in_daemon)

    # Upload the non-image group media
    # TODO: Copy the loose files into one parent, run do_bag_upload on that.
    # Problem to solve - do_bag_upload needs a distinct file name for an incremental bag

    # Same guard as do_deep_archive_complete: don't bag and upload an empty set
    if non_image_groups:
        do_bag_upload(archive_path=str(archive_path), work_rid=work_rid, work_tag=work_tag,
                      as_is=non_image_groups,
                      is_complete=False, bag_file_name=archive_inventory_name, bucket=bucket,
                      in_daemon=in_daemon)
    return 0
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def do_deep_archive_complete(work_rid: str, archive_path: Path, image_groups: list[str], bucket: str,
                             in_daemon: bool = False) -> int:
    """
    Splits a work archive into one bag per image group plus one bag for
    everything else, and uploads each bag.

    :param work_rid: tag for work
    :param archive_path: data source
    :param image_groups: list of image groups to process
    :param bucket: S3 bucket to upload to
    :param in_daemon: True if running in a daemon or on docker
    :return: 0 if successful; exceptions from do_bag_upload propagate
    """
    # Get s3 archive home, under the key. See archive-ops/scripts/glacier/DIP-pump-uploadWorkToGlacier.sh

    work_tag: dict = AOS3WorkTag(work_rid).extra_args_tag

    # do_bag_upload declares archive_path as str (and the incremental path passes
    # a str), so convert once here rather than handing it a Path.
    archive_path_str: str = str(archive_path)

    # Different algorithm. We need to divide the work into two sets: 1 for the image groups which
    # are to be inverted, the other into a set of directories that will be copied as is.
    #
    media_with_image_group, non_image_groups = get_media_splits(archive_path, image_groups)

    for ig, media_dirs in media_with_image_group.items():
        do_bag_upload(archive_path=archive_path_str, work_rid=work_rid, work_tag=work_tag,
                      invert_context=(ig, media_dirs),
                      bucket=bucket, in_daemon=in_daemon)

    if non_image_groups:
        do_bag_upload(archive_path=archive_path_str, work_rid=work_rid, work_tag=work_tag,
                      as_is=non_image_groups,
                      bucket=bucket, in_daemon=in_daemon)

    return 0
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_inventory_path(dip_external_id, db_conf: str) -> Optional[Path]:
    """
    Look up the sync inventory file recorded for a dip activity.

    :param dip_external_id: value matched against SyncInventory.dip_external_id
    :param db_conf: database configuration passed to DrsDbContextBase
    :return: Path built from the row's inventory_path, or None when no row matches
    :raises SQLAlchemyError: logged, then re-raised
    """
    from BdrcDbLib.DbOrm.DrsContextBase import DrsDbContextBase
    from archive_ops.models.drsmodel import SyncInventory
    from sqlalchemy import select
    from sqlalchemy.exc import SQLAlchemyError

    # TODO: Use connection from args
    try:
        with DrsDbContextBase(db_conf) as drs:
            db_session = drs.get_session()
            inventory_query = select(SyncInventory).where(SyncInventory.dip_external_id == dip_external_id)
            # Let it raise, if error
            inventory_row = db_session.execute(inventory_query).scalar_one_or_none()
            return Path(inventory_row.inventory_path) if inventory_row else None
    except SQLAlchemyError as e:
        _log.exception(e)
        raise e
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def get_image_groups_from_inventory(inventory_path: Path) -> tuple[list[str], list[str]]:
    """
    Get the image groups and the non-image group lists from the inventory file.

    Each non-blank inventory line is treated as a '/'-separated path. The third
    path component is an image group candidate; it is kept when it looks like
    ``Workrid-Isomething`` or ``Workrid-4digits``.

    :param inventory_path: inventory file, one path per line
    :return: (sorted image group names, inventory lines that contain no image group name)
    """
    with open(inventory_path, 'r') as inventory:
        # Blank lines carry no path - drop them so they are neither parsed nor copied
        lines = [entry for entry in (raw.strip() for raw in inventory) if entry]

    # filter in Only subdirs that begin with Workrid-Isomething or Workrid-4digits
    ig_pattern = re.compile(r"W\w+-(\d{4}|I\w+)")

    # Lines with fewer than three components (e.g. a file directly under the work)
    # cannot name an image group; they are not an error.
    candidates = set()
    for line in lines:
        parts = line.split('/')
        if len(parts) > 2:
            candidates.add(parts[2])

    # Sorted so callers get a stable processing order
    image_groups: list[str] = sorted(c for c in candidates if ig_pattern.fullmatch(c))

    # Now get everything that does not have any image group in its path - meta
    non_ig_elements: list[str] = [line for line in lines if not any(ig in line for ig in image_groups)]

    return image_groups, non_ig_elements
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Copies a file to a new location, preserving the directories in the input path
|
|
192
|
+
def copy_file_to_new_location(input_parent: Path, input_file_path: Path, new_location: Path):
    """
    Copy one file under new_location, keeping the file's relative directories.

    Ex copy_file_to_new_location(Path('a/b/c'), Path('d/e/f.txt'), Path('x/y/z'))
    copies a/b/c/d/e/f.txt to x/y/z/d/e/f.txt

    :param input_parent: directory that input_file_path is relative to
    :param input_file_path: file to copy, relative to input_parent
    :param new_location: destination directory
    """
    import shutil

    source_file: Path = input_parent / input_file_path
    # Re-create the file's relative directory chain beneath the destination
    destination_dir: Path = new_location / input_file_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(source_file, destination_dir)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def do_bag_upload(archive_path: str,
                  work_rid: str,
                  work_tag: dict,
                  bucket: str,
                  invert_context: Optional[tuple] = None,
                  as_is: Optional[list] = None,
                  is_complete: bool = True,
                  bag_file_name: Optional[str] = None,
                  in_daemon: bool = False) -> None:
    """
    create a bag.zip for an image group or a list of files, and upload it to the archive

    Exactly one of invert_context / as_is is expected to carry the content.
    A dip log entry is opened before the work starts and closed with the
    outcome (0 on success, 1 on failure) when it ends.

    :param archive_path: Parent of the source
    :param work_rid: Work name
    :param work_tag: extra upload arguments applied to every uploaded object
    :param bucket: S3 bucket to upload to
    :param invert_context: (image group name, media directories) to invert into the bag
    :param as_is: directories (complete) or separate files (incremental) to copy into the bag
    :param is_complete: True if bagging a complete work, False if an incremental sync
    :param bag_file_name: name of the bag file to create; defaults to work_rid
    :param in_daemon: True if running in a daemon or on docker
    :return: None
    :raises Exception: whatever failed is logged, reported through complain(), and re-raised
    """
    # Imported locally so the block does not rely on a star import supplying os
    import os
    import shutil
    from bag.bag_ops import bag

    media_with_image_group = None
    if invert_context:
        dest_name, media_with_image_group = invert_context
    else:
        dest_name = bag_file_name if bag_file_name else work_rid

    s3_parent: S3Path = S3Path(bucket, AoApi.get_archive_location(GLACIER_KEY_ROOT, work_rid))
    # Make a temporary path for the output:
    exit_e: Optional[Exception] = None
    with TemporaryDirectory() as out_buffer:
        dip_id: str = ""
        failed_item_message: str = ""
        had_fail: bool = False
        # Bound before the try so the failure message can always reference it
        dest_path: Optional[Path] = None
        ig_dest_path: S3Path = s3_parent / f"{dest_name}{BAGZIP_SUFFIX}"
        try:
            dip_id = open_log_dip(work_rid, archive_path, ig_dest_path.arn)
            out_path = Path(out_buffer)
            dest_path = out_path / dest_name
            dest_path.mkdir(parents=True, exist_ok=True)
            bag_path: Path = out_path / "bag" / dest_name
            bag_path.mkdir(parents=True, exist_ok=True)
            bp_str = str(bag_path)

            # Invert the image group into the temp directory's work_folder
            if invert_context:
                invert_image_group_media(dest_path, media_with_image_group, dest_name)
            elif is_complete:
                # as_is is a list of directories
                for dir_name in as_is:
                    complete_sub_path = dest_path / children_of(dir_name, work_rid)
                    complete_sub_path.mkdir(parents=True, exist_ok=True)
                    shutil.copytree(dir_name, complete_sub_path, dirs_exist_ok=True)
            else:
                # as_is is a list of files
                archive_parent: Path = Path(archive_path).parent
                for file_name in as_is:
                    copy_file_to_new_location(archive_parent, Path(file_name), dest_path)
            bag(str(dest_path), bp_str, False, in_daemon, False)

            # Upload the inversion(s) to the archive. In this workflow, there should only be one
            for root, dirs, files in os.walk(bag_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    #
                    # Handle subdirs by removing the top of tree
                    s3_object_name = file_path.replace(bp_str, "", 1).lstrip(os.sep)
                    s3_target: S3Path = s3_parent / s3_object_name
                    upload_file_to_s3_with_storage_class(file_path, s3_target.bucket, s3_target.key,
                                                         'STANDARD_IA',
                                                         work_tag)
        except Exception as e:
            # Record the failure so the dip log is closed with a non-zero result
            had_fail = True
            failed_item_message = f"failed deep_archive {work_rid=} {dest_path=} {e=}"
            _log.exception(e)
            exit_e = e
        finally:
            if dip_id:
                update_log_dip(dip_id, 1 if had_fail else 0, failed_item_message)
        if exit_e:
            complain(f"{work_rid=}, {failed_item_message=}", 1, "do_deep_archive_complete")
            raise exit_e
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def children_of(anchor: str, a_path: str) -> Path:
    """
    Return the part of a path that follows a given component.

    Ex children_of('/mnt/A/W1/images/W1-I1', 'W1') -> images/W1-I1

    :param anchor: the full path to split
    :param a_path: the path component to look for (first occurrence is used)
    :return: relative Path of everything after that component
    :raises ValueError: when a_path is not a component of anchor
    """
    components = Path(anchor).parts
    first_child = components.index(a_path) + 1
    return Path().joinpath(*components[first_child:])
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def upload_file_to_s3_with_storage_class(file_name, bucket, key=None, storage_class='STANDARD',
                                         tag_set: Optional[dict] = None):
    """Upload a file to an S3 bucket with a specific storage class

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param key: S3 object name. If not specified then file_name is used
    :param storage_class: Storage class to use for the object
    :param tag_set: optional mapping of additional upload arguments, merged into ExtraArgs
    :return: True if file was uploaded
    :raises ClientError, S3UploadFailedError: logged, reported through complain(), and re-raised
    """
    import boto3
    from boto3.exceptions import S3UploadFailedError
    from botocore.exceptions import ClientError

    # If S3 object_name was not specified, use file_name
    if key is None:
        key = file_name

    # handy: Can set tagging here:
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object_tagging
    extra_args: dict = {'StorageClass': storage_class}
    if tag_set:
        extra_args.update(tag_set)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args)
    except (ClientError, S3UploadFailedError) as e:
        # upload_file reports a failed transfer as S3UploadFailedError, so catch both
        _log.exception(e)
        complain(f"S3 upload {file_name=}, {bucket=}", 1, "upload_file_to_S3")
        raise
    return True
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def setup(args: DeepArchiveArgs):
    """
    Open resources: create the module logger and the dip log context if they
    do not exist yet.

    Note: args.drsDbConfig is rewritten in place when the dip logger is created.

    :param args: parsed arguments; log_root, log_level and drsDbConfig are read
    :return: sets logger and dip_log context
    :raises ValueError: when drsDbConfig is not of the form section:configFileName
    """
    import os
    global _log, dip_logger

    if not _log:
        os.makedirs(args.log_root, exist_ok=True)
        _log = AOLogger("Deep_Archive", args.log_level, Path(args.log_root), extra_quiet_loggers=['bagit'])

    if dip_logger:
        return

    # Need different path under docker
    section_and_file = args.drsDbConfig.split(':')

    # Still need to check, in case not run through argparse (i.e. db_config manually populated)
    if len(section_and_file) < 2:
        raise ValueError(f"Invalid db config {args.drsDbConfig} requires section:configFileName")

    # Adjust for running under docker - should be part of DbApps
    if os.path.exists('/run/secrets'):
        section_and_file[1] = '/run/secrets/db_apps'
    args.drsDbConfig = ':'.join(section_and_file)
    dip_logger = DipLog(args.drsDbConfig)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def displayed_comment(displayed, display_length: int = 108, illus_len: int = 20) -> str:
    """
    Shorten a long text for display.

    Text no longer than display_length is returned unchanged. Longer text is
    reduced to its first illus_len characters, "...", and its last illus_len
    characters.

    :param displayed: text to show
    :param display_length: invocation threshold
    :param illus_len: number of characters kept at each end
    :return: the original or the shortened text
    """
    if len(displayed) <= display_length:
        return displayed
    head, tail = displayed[:illus_len], displayed[-illus_len:]
    return f"{head}...{tail}"
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def update_log_dip(dip_log_id: str,
                   rc: int,
                   comment: str,
                   src_path: Optional[str] = None,
                   dest_path: Optional[str] = None,
                   ):
    """
    Closes a dip log entry, stamping it with the current time as its end.

    :param dip_log_id: Key to locate entry
    :param rc: activity return code for log
    :param comment: goes into database, with the log file name appended
    :param src_path: source path (shouldn't usually update)
    :param dest_path: output path
    :return: whatever dip_logger.set_dip returns
    """
    log_comment = displayed_comment(comment)
    _log.info(f'closing :{dip_log_id=}\trc:{rc=} {log_comment=}')

    closing_fields = dict(
        # These are table PKs - you can't update them
        activity_name=None,
        begin_t=None,
        work_name=None,
        # this is the identifying key
        dip_id=dip_log_id,
        # Tell the truth now
        end_t=datetime.datetime.now(),
        # The rest of these are optional
        s_path=src_path,
        d_path=dest_path,
        ac_result=rc,
        comment=f"{comment} log file {_log.log_file_name}",
        inventory=None,
    )
    return dip_logger.set_dip(**closing_fields)
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def open_log_dip(work_rid: str, src_path: str, aws_object_path: Optional[str] = None) -> str:
    """
    Opens a 'DEEP_ARCHIVE' dip log entry that starts now.

    :param work_rid: work name recorded on the entry
    :param src_path: source path recorded on the entry
    :param aws_object_path: destination path recorded on the entry
    :return: dip_log_id
    """
    _log.info(f'opening :{work_rid=}\t{src_path=}\t{aws_object_path=}')

    # set_dip has no optional args:
    opening_fields = dict(
        activity_name='DEEP_ARCHIVE',
        begin_t=datetime.datetime.now(),
        end_t=None,
        s_path=src_path,
        d_path=aws_object_path,
        dip_id=None,
        work_name=work_rid,
        ac_result=None,
        comment=None,
        inventory=None,
    )
    return dip_logger.set_dip(**opening_fields)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
# send a message to an AWS SNS topic
|
|
445
|
+
def send_sns(subject: str, message_str):
    """
    Publish a message to the SNS topic named by the AO_AWS_SNS_TOPIC_ARN
    environment variable, and log it. When the variable is not set, the
    message is only logged.

    :param subject: message subject
    :param message_str: message body
    :return: None
    """
    import os

    topic: str = os.getenv('AO_AWS_SNS_TOPIC_ARN')
    if topic:
        # Usually configured for default
        import boto3
        boto3.client('sns').publish(TopicArn=topic, Message=message_str, Subject=subject)
        channel = "[sns]"
    else:
        channel = "[log]"
    _log.info(f'{channel} {subject}, {message_str}')
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def complain(object_tag: str, rc: int, operation_tag: str, detail: str = None):
    """
    Send a "Glacier Upload Failure Report" through send_sns.

    :param object_tag: identifies what could not be uploaded
    :param rc: exit code to report
    :param operation_tag: name of the operation that failed
    :param detail: optional extra text appended to the report
    """
    if detail:
        detail_clause = f"with {detail=}"
    else:
        detail_clause = ''
    report_body = f"""
The following work could not be uploaded to Glacier:
{object_tag}
.
{operation_tag} returned with exit code {rc} {detail_clause}.

See log file {_log.log_file_name} for details.
"""

    send_sns("Glacier Upload Failure Report", report_body)
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def get_header_indices(header: [str], columns: [str]) -> ():
    """
    Locate each wanted column in a header row.

    :param header: column names as they appear in the file
    :param columns: column names to find
    :return: tuple of positions in header, in the order of columns
    :raises ValueError: when a wanted column is not in header
    """
    return tuple(header.index(wanted) for wanted in columns)
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def read_csv(csv_path: Path) -> list[InputDirectiveRow]:
    """
    Map a csv file into a list of named tuples.
    Note if the dip_comment field is used, the caller is responsible
    for escaping commas

    Only the columns named in HEADERS are kept; they are located by name in
    the file's first row, so extra columns and column order do not matter.

    :param csv_path: csv file whose first row is a header
    :return: one InputDirectiveRow per data row; empty list for an empty file
    :raises ValueError: when the header lacks one of the HEADERS columns
    """
    import csv
    records: list[InputDirectiveRow] = []
    # newline='' lets the csv module do its own line-ending handling, which it
    # needs to read quoted fields that contain embedded newlines correctly.
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        try:
            header = next(reader)  # The first row is the header
        except StopIteration:
            _log.info(f"Empty file {csv_path}")
            return records

        h_map = get_header_indices(header, HEADERS)
        records = [InputDirectiveRow(*(row[i] for i in h_map)) for row in reader]
        # noinspection PyTypeChecker
        _log.debug(records)

    return records
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def deep_archive_shell():
    """
    Command line interface

    Reads the input csv and deep-archives each record. A record is archived
    incrementally when that was requested and its sync inventory can be found;
    otherwise the complete work is archived. The first failing record is
    logged, reported, and re-raised.

    :return: None
    """
    da_parser: DeepArchiveParser = DeepArchiveParser(usage="%(prog)s -i input_file",
                                                     description="Uploads a series of inverted zip files to backup "
                                                                 "bucket", )

    args: DeepArchiveArgs = da_parser.parsedArgs

    setup(args)
    _log.info(f"Arguments: {str(args)}")

    records: list = read_csv(args.input_file)
    for record in records:
        inventory_path: Optional[Path] = None
        # Decide the mode per record. The parsed arguments are left untouched so
        # that one record without an inventory does not force every later record
        # into a complete archive.
        run_complete: bool = bool(args.complete)
        try:
            if args.incremental:
                # Get the inventory path from the database. If none, do a complete deep archive
                inventory_path = get_inventory_path(record.dip_external_id, args.drsDbConfig)
                # but if no inventory, do complete
                if not inventory_path or not inventory_path.exists():
                    run_complete = True
                    _log.warn(
                        f"No inventory found for {record.WorkName} archive record: {record.dip_external_id[:6]}... Running complete")

            if run_complete:
                image_group_list = get_igs_for_invert(record.WorkName)
                do_deep_archive_complete(record.WorkName, Path(record.path), image_group_list, args.bucket,
                                         args.in_daemon)
            else:  # incremental, with an inventory on hand
                do_deep_archive_incremental(record.WorkName, Path(record.path), inventory_path, args.bucket,
                                            args.in_daemon)
            # image_group_list = get_igs_for_invert(record.WorkName, record.path, record.dip_external_id)

            # if there was a comment, we're only doing the image groups that were designated in the comment.
            # Otherwise, segment the work into:
            # - imagegroup + media tuples to be inverted and zipped separately
            # - everything else
            _log.info(f"Processing {record}")
        except Exception as e:
            # Record the failure in its own dip log entry before giving up
            dip_id = open_log_dip(record.WorkName, record.path)
            error_string: str = f"Failed to process {record=} {dip_id=} Exception: " + f"{traceback.format_exc() if _log.py_logger.isEnabledFor(logging.DEBUG) else e}"
            _log.error(error_string)
            update_log_dip(dip_id, 1, error_string)
            complain(record.WorkName, 1, "deep_archive_shell", error_string)
            raise
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
if __name__ == '__main__':
|
|
553
|
+
deep_archive_shell()
|