opex-manifest-generator 1.3.3__py3-none-any.whl → 1.3.4__py3-none-any.whl

@@ -7,15 +7,30 @@ author: Christopher Prince
  license: Apache License 2.0"
  """
 
- import lxml.etree as ET
+ from lxml import etree as ET
  import pandas as pd
- import os, time, datetime
+ import os, configparser, logging, zipfile
+ from typing import Optional
  from auto_reference_generator import ReferenceGenerator
- from auto_reference_generator.common import export_list_txt, export_xl, export_csv, export_json, export_ods, export_xml,define_output_file
+ from auto_reference_generator.common import export_list_txt, \
+ export_xl, \
+ export_csv, \
+ export_json, \
+ export_ods, \
+ export_xml, \
+ define_output_file
  from pandas.api.types import is_datetime64_any_dtype
  from opex_manifest_generator.hash import HashGenerator
- from opex_manifest_generator.common import *
- import configparser
+ from opex_manifest_generator.common import zip_opex,\
+ remove_tree,\
+ win_256_check,\
+ filter_win_hidden,\
+ check_nan,\
+ check_opex,\
+ write_opex
+ from datetime import datetime
+
+ logger = logging.getLogger(__name__)
 
  class OpexManifestGenerator():
  """
@@ -53,7 +68,7 @@ class OpexManifestGenerator():
  output_path: str = os.getcwd(),
  meta_dir_flag: bool = True,
  metadata_dir: str = os.path.join(os.path.dirname(os.path.realpath(__file__)), "metadata"),
- metadata_flag: str = 'none',
+ metadata_flag: Optional[str] = None,
  autoref_flag: str = None,
  prefix: str = None,
  suffix: str = None,
@@ -65,19 +80,21 @@ class OpexManifestGenerator():
  pax_fixity: bool = False,
  fixity_export_flag: bool = True,
  empty_flag: bool = False,
+ empty_export_flag: bool = True,
  removal_flag: bool = False,
+ removal_export_flag: bool = True,
  clear_opex_flag: bool = False,
  export_flag: bool = False,
  input: str = None,
  zip_flag: bool = False,
+ zip_file_removal: bool = False,
  hidden_flag: bool = False,
  output_format: str = "xlsx",
- print_xmls_flag: bool = False,
  options_file: str = os.path.join(os.path.dirname(__file__),'options','options.properties'),
  keywords: list = None,
  keywords_mode: str = "initialise",
  keywords_retain_order: bool = False,
- keywords_case_sensitive: bool = False,
+ keywords_case_sensitivity: bool = False,
  keywords_abbreviation_number: int = 3,
  sort_key = lambda x: (os.path.isfile(x), str.casefold(x)),
  delimiter = "/",
@@ -86,7 +103,7 @@ class OpexManifestGenerator():
  self.root = os.path.abspath(root)
  # Base Parameters
  self.opexns = "http://www.openpreservationexchange.org/opex/v1.2"
- self.start_time = datetime.datetime.now()
+ self.start_time = datetime.now()
  self.list_path = []
  self.list_fixity = []
 
@@ -99,17 +116,20 @@ class OpexManifestGenerator():
  self.meta_dir_flag = meta_dir_flag
  self.hidden_flag = hidden_flag
  self.zip_flag = zip_flag
+ self.zip_file_removal = zip_file_removal
+
  self.empty_flag = empty_flag
+ self.empty_export_flag = empty_export_flag
 
  # Parameters for Input Option
  self.input = input
  self.removal_flag = removal_flag
  if self.removal_flag:
  self.removal_list = []
+ self.removal_export_flag = removal_export_flag
  self.export_flag = export_flag
  self.metadata_flag = metadata_flag
  self.metadata_dir = metadata_dir
- self.print_xmls_flag = print_xmls_flag
 
  # Parameters for Auto Reference
  self.autoref_flag = autoref_flag
@@ -123,7 +143,7 @@ class OpexManifestGenerator():
  self.keywords_list = keywords
  self.keywords_mode = keywords_mode
  self.keywords_retain_order = keywords_retain_order
- self.keywords_case_sensitive = keywords_case_sensitive
+ self.keywords_case_sensitivity = keywords_case_sensitivity
  self.keywords_abbreviation_number = keywords_abbreviation_number
  self.sort_key = sort_key
  self.delimiter = delimiter
@@ -141,146 +161,198 @@ class OpexManifestGenerator():
 
  def parse_config(self, options_file: str = os.path.join('options','options.properties')) -> None:
  config = configparser.ConfigParser()
- config.read(options_file, encoding='utf-8')
- global INDEX_FIELD
- INDEX_FIELD = config['options']['INDEX_FIELD']
- global TITLE_FIELD
- TITLE_FIELD = config['options']['TITLE_FIELD']
- global DESCRIPTION_FIELD
- DESCRIPTION_FIELD = config['options']['DESCRIPTION_FIELD']
- global SECURITY_FIELD
- SECURITY_FIELD = config['options']['SECURITY_FIELD']
- global IDENTIFIER_FIELD
- IDENTIFIER_FIELD = config['options']['IDENTIFIER_FIELD']
- global IDENTIFIER_DEFAULT
- IDENTIFIER_DEFAULT = config['options']['IDENTIFIER_DEFAULT']
- global REMOVAL_FIELD
- REMOVAL_FIELD = config['options']['REMOVAL_FIELD']
- global IGNORE_FIELD
- IGNORE_FIELD = config['options']['IGNORE_FIELD']
- global SOURCEID_FIELD
- SOURCEID_FIELD = config['options']['SOURCEID_FIELD']
- global HASH_FIELD
- HASH_FIELD = config['options']['HASH_FIELD']
- global ALGORITHM_FIELD
- ALGORITHM_FIELD = config['options']['ALGORITHM_FIELD']
- global ARCREF_FIELD
- ARCREF_FIELD = config['options']['ARCREF_FIELD']
- global ACCREF_CODE
- ACCREF_CODE = config['options']['ACCREF_CODE']
- global ACCREF_FIELD
- ACCREF_FIELD = config['options']['ACCREF_FIELD']
+ read_config = config.read(options_file, encoding='utf-8')
+ if not read_config:
+ logger.warning(f"Options file not found or not readable: {options_file}. Using defaults.")
 
- global FIXITY_SUFFIX
- FIXITY_SUFFIX = config['options']['FIXITY_SUFFIX']
- global REMOVALS_SUFFIX
- REMOVALS_SUFFIX = config['options']['REMOVALS_SUFFIX']
- global METAFOLDER
- METAFOLDER = config['options']['METAFOLDER']
- global GENERIC_DEFAULT_SECURITY
- GENERIC_DEFAULT_SECURITY = config['options']['GENERIC_DEFAULT_SECURITY']
+ section = config['options'] if 'options' in config else {}
 
+ self.INDEX_FIELD = section.get('INDEX_FIELD', "FullName")
+ self.TITLE_FIELD = section.get('TITLE_FIELD', "Title")
+ self.DESCRIPTION_FIELD = section.get('DESCRIPTION_FIELD', "Description")
+ self.SECURITY_FIELD = section.get('SECURITY_FIELD', "Security")
+ self.IDENTIFIER_FIELD = section.get('IDENTIFIER_FIELD', "Identifier")
+ self.IDENTIFIER_DEFAULT = section.get('IDENTIFIER_DEFAULT', "code")
+ self.REMOVAL_FIELD = section.get('REMOVAL_FIELD', "Removals")
+ self.IGNORE_FIELD = section.get('IGNORE_FIELD', "Ignore")
+ self.SOURCEID_FIELD = section.get('SOURCEID_FIELD', "SourceID")
+ self.HASH_FIELD = section.get('HASH_FIELD', "Hash")
+ self.ALGORITHM_FIELD = section.get('ALGORITHM_FIELD', "Algorithm")
+ self.ARCREF_FIELD = section.get('ARCREF_FIELD', "Archive_Reference")
+ self.ACCREF_CODE = section.get('ACCREF_CODE', "Accession_Reference")
+ self.ACCREF_FIELD = section.get('ACCREF_FIELD', "accref")
+ self.FIXITY_SUFFIX = section.get('FIXITY_SUFFIX', "_Fixity")
+ self.REMOVALS_SUFFIX = section.get('REMOVALS_SUFFIX', "_Removals")
+ self.METAFOLDER = section.get('METAFOLDER', "meta")
+ self.GENERIC_DEFAULT_SECURITY = section.get('GENERIC_DEFAULT_SECURITY', "open")
+ logger.debug(f'Configuration set to: {dict(section.items())}')
 
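Note on the new parse_config: config.read returns the list of files it successfully parsed, so an empty result signals a missing or unreadable options.properties, and every field then falls back to a hard-coded default via section.get. A minimal, self-contained sketch of the pattern (key name and default copied from the hunk above; the standalone script is illustrative only):

    import configparser
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    config = configparser.ConfigParser()
    read_config = config.read('options/options.properties', encoding='utf-8')  # [] if nothing was read
    if not read_config:
        logger.warning('Options file not found or not readable. Using defaults.')
    # Both a ConfigParser section and the {} fallback support .get() with a default:
    section = config['options'] if 'options' in config else {}
    index_field = section.get('INDEX_FIELD', 'FullName')
    logger.debug('INDEX_FIELD resolved to %s', index_field)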
  def print_descriptive_xmls(self) -> None:
- for file in os.scandir(self.metadata_dir):
- path = os.path.join(self.metadata_dir, file.name)
- print(path)
- xml_file = ET.parse(path)
- root_element = ET.QName(xml_file.find('.'))
- root_element_ln = root_element.localname
- for elem in xml_file.findall(".//"):
- if elem.getchildren():
- pass
- else:
- elem_path = xml_file.getelementpath(elem)
- elem = ET.QName(elem)
- elem_lnpath = elem_path.replace(f"{{{elem.namespace}}}", root_element_ln + ":")
- print(elem_lnpath)
+ try:
+ for file in os.scandir(self.metadata_dir):
+ path = os.path.join(self.metadata_dir, file.name)
+ print(path)
+ xml_file = ET.parse(path)
+ root_element = ET.QName(xml_file.find('.'))
+ root_element_ln = root_element.localname
+ for elem in xml_file.findall(".//"):
+ if elem.getchildren():
+ pass
+ else:
+ elem_path = xml_file.getelementpath(elem)
+ elem = ET.QName(elem)
+ elem_lnpath = elem_path.replace(f"{{{elem.namespace}}}", root_element_ln + ":")
+ print(elem_lnpath)
+ except Exception as e:
+ logger.exception(f'Failed to print descriptive metadata files, ensure the path is correct: {e}')
+ raise
 
+ def convert_descriptive_xmls(self) -> None:
+ try:
+ for file in os.scandir(self.metadata_dir):
+ path = os.path.join(self.metadata_dir, file.name)
+ xml_file = ET.parse(path)
+ root_element = ET.QName(xml_file.find('.'))
+ root_element_ln = root_element.localname
+ column_list = []
+ for elem in xml_file.findall(".//"):
+ if elem.getchildren():
+ pass
+ else:
+ elem_path = xml_file.getelementpath(elem)
+ elem = ET.QName(elem)
+ elem_lnpath = elem_path.replace(f"{{{elem.namespace}}}", root_element_ln + ":")
+ column_list.append(elem_lnpath)
+ df = pd.DataFrame(columns=column_list,index=None)
+ if self.output_format == 'xlsx':
+ export_xl(df,file.name.replace('.xml','.xlsx'))
+ elif self.output_format == 'ods':
+ export_ods(df,file.name.replace('.xml','.ods'))
+ elif self.output_format == 'csv':
+ export_csv(df,file.name.replace('.xml','.csv'))
+ elif self.output_format == 'json':
+ export_json(df,file.name.replace('.xml','.json'))
+ else:
+ export_xl(df, file.name.replace('.xml','.xlsx'))
+ except Exception as e:
+ logger.exception(f'Failed to convert descriptive metadata files, ensure the path is correct: {e}')
+ raise
+
  def set_input_flags(self) -> None:
- if TITLE_FIELD in self.column_headers:
+ if self.TITLE_FIELD in self.column_headers:
  self.title_flag = True
- if DESCRIPTION_FIELD in self.column_headers:
+ if self.DESCRIPTION_FIELD in self.column_headers:
  self.description_flag = True
- if SECURITY_FIELD in self.column_headers:
+ if self.SECURITY_FIELD in self.column_headers:
  self.security_flag = True
- if SOURCEID_FIELD in self.column_headers:
+ if self.SOURCEID_FIELD in self.column_headers:
  self.sourceid_flag = True
- if IGNORE_FIELD in self.column_headers:
+ if self.IGNORE_FIELD in self.column_headers:
  self.ignore_flag = True
- if HASH_FIELD in self.column_headers and ALGORITHM_FIELD in self.column_headers:
+ if self.HASH_FIELD in self.column_headers and self.ALGORITHM_FIELD in self.column_headers:
  self.hash_from_spread = True
- print("Hash detected in Spreadsheet; taking hashes from spreadsheet")
- time.sleep(3)
+ logger.info("Hash detected in Spreadsheet; taking hashes from spreadsheet")
+ logger.debug("Flags set")
 
  def init_df(self) -> None:
- if self.autoref_flag:
- ac = ReferenceGenerator(self.root,
- output_path = self.output_path,
- prefix = self.prefix,
- accprefix = self.acc_prefix,
- suffix = self.suffix,
- suffix_option = self.suffix_option,
- start_ref = self.start_ref,
- empty_flag = self.empty_flag,
- accession_flag=self.accession_mode,
- keywords = self.keywords_list,
- keywords_mode = self.keywords_mode,
- keywords_retain_order = self.keywords_retain_order,
- keywords_abbreviation_number = self.keywords_abbreviation_number,
- keywords_case_sensitive = self.keywords_case_sensitive,
- delimiter = self.delimiter,
- sort_key = self.sort_key,
- options_file = os.path.join(os.path.dirname(__file__),'options','options.properties')
- )
- self.df = ac.init_dataframe()
- if self.autoref_flag in {"accession", "a", "accession-generic", "ag"}:
- self.df = self.df.drop(ARCREF_FIELD, axis=1)
- self.column_headers = self.df.columns.values.tolist()
- self.set_input_flags()
- if self.export_flag:
- output_path = define_output_file(self.output_path, self.root, METAFOLDER, meta_dir_flag = self.meta_dir_flag, output_format = self.output_format)
- if self.output_format == "xlsx":
- export_xl(self.df, output_path)
- elif self.output_format == "csv":
- export_csv(self.df, output_path)
- elif self.output_format == "json":
- export_json(self.df.to_dict(orient='records'), output_path)
- elif self.output_format == "ods":
- export_ods(self.df, output_path)
- elif self.output_format == "xml":
- export_xml(self.df, output_path)
- elif self.input:
- if self.input.endswith('xlsx'):
- self.df = pd.read_excel(self.input)
- elif self.input.endswith('csv'):
- self.df = pd.read_csv(self.input)
- elif self.input.endswith('json'):
- self.df = pd.read_json(self.input)
- elif self.input.endswith('ods'):
- self.df = pd.read_excel(self.input, engine='odf')
- elif self.input.endswith('xml'):
- self.df = pd.read_xml(self.input)
- self.column_headers = self.df.columns.values.tolist()
- self.set_input_flags()
- else:
- self.df = None
- self.column_headers = None
-
+ try:
+ if self.autoref_flag:
+ ar = ReferenceGenerator(self.root,
+ output_path = self.output_path,
+ prefix = self.prefix,
+ accprefix = self.acc_prefix,
+ suffix = self.suffix,
+ suffix_options = self.suffix_option,
+ start_ref = self.start_ref,
+ empty_flag = self.empty_flag,
+ accession_flag=self.accession_mode,
+ keywords = self.keywords_list,
+ keywords_mode = self.keywords_mode,
+ keywords_retain_order = self.keywords_retain_order,
+ keywords_abbreviation_number = self.keywords_abbreviation_number,
+ keywords_case_sensitivity = self.keywords_case_sensitivity,
+ delimiter = self.delimiter,
+ sort_key = self.sort_key)
+ self.df = ar.init_dataframe()
+ if self.autoref_flag in {"accession", "a", "accession-generic", "ag"}:
+ self.df = self.df.drop(self.ARCREF_FIELD, axis=1)
+ self.column_headers = self.df.columns.values.tolist()
+ self.set_input_flags()
+ if self.export_flag:
+ output_path = define_output_file(self.output_path, self.root, self.METAFOLDER, meta_dir_flag = self.meta_dir_flag, output_format = self.output_format)
+ if self.output_format == "xlsx":
+ export_xl(self.df, output_path)
+ elif self.output_format == "csv":
+ export_csv(self.df, output_path)
+ elif self.output_format == "json":
+ export_json(self.df.to_dict(orient='records'), output_path)
+ elif self.output_format == "ods":
+ export_ods(self.df, output_path)
+ elif self.output_format == "xml":
+ export_xml(self.df, output_path)
+ logger.debug(f'Auto Reference Dataframe initialised with columns: {self.column_headers}')
+ return True
+ elif self.input:
+ if self.input.endswith(('.xlsx','.xls','.xlsm')):
+ self.df = pd.read_excel(self.input)
+ elif self.input.endswith('.csv'):
+ self.df = pd.read_csv(self.input)
+ elif self.input.endswith('.json'):
+ self.df = pd.read_json(self.input)
+ elif self.input.endswith('.ods'):
+ self.df = pd.read_excel(self.input, engine='odf')
+ elif self.input.endswith('.xml'):
+ self.df = pd.read_xml(self.input)
+ self.column_headers = self.df.columns.values.tolist()
+ self.set_input_flags()
+ logger.debug(f'Input Dataframe initialised with columns: {self.column_headers}')
+ return True
+ else:
+ logger.warning('No Auto Reference or Input file specified, proceeding without Dataframe')
+ self.df = None
+ self.column_headers = None
+ return False
+ except Exception as e:
+ logger.exception(f'Failed to initialise Dataframe: {e}')
+ raise
+
  def clear_opex(self) -> None:
- walk = list(os.walk(self.root))
- for dir, _, files in walk[::-1]:
- for file in files:
- file_path = win_256_check(os.path.join(dir, file))
- if str(file_path).endswith('.opex'):
- os.remove(file_path)
- print(f'Cleared Opex: {file_path}')
+ try:
+ walk = list(os.walk(self.root))
+ for dir, _, files in walk[::-1]:
+ for file in files:
+ file_path = win_256_check(os.path.join(dir, file))
+ if str(file_path).endswith('.opex'):
+ os.remove(file_path)
+ logger.info(f'Cleared Opex: {file_path}')
+ except Exception as e:
+ logger.exception(f'Error clearing Opex files: {e}')
+ raise
 
  def index_df_lookup(self, path: str) -> pd.Index:
- idx = self.df.loc[self.df[INDEX_FIELD ==path], INDEX_FIELD].index
- return idx
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
+ try:
+ idx = self.df.loc[self.df[self.INDEX_FIELD] == path, self.INDEX_FIELD].index
+ return idx
+ except KeyError as e:
+ logger.exception(f'Key Error in Index Lookup: {e}' \
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in Index Lookup: {e}. Proceeding...' \
+ '\nIt is likely you have removed or added a file/folder to the directory' \
+ '\nafter generating your input spreadsheet. An opex will still be generated but information may be missing.' \
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
+ except Exception as e:
+ logger.exception(f'Error looking up Index from Dataframe: {e}')
+ raise
 
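The old index_df_lookup compared the column-name constant itself to the path (INDEX_FIELD == path evaluates to a single bool, so df[...] raises a KeyError); the new version builds a row-wise boolean mask. A small self-contained sketch of the corrected pattern, using a hypothetical frame:

    import pandas as pd

    df = pd.DataFrame({'FullName': ['/data/a.txt', '/data/b.txt']})
    path = '/data/b.txt'
    # Row-wise comparison yields a boolean Series; .loc selects the matching rows.
    idx = df.loc[df['FullName'] == path, 'FullName'].index
    print(list(idx))   # [1]
    print(idx.empty)   # False; an empty Index means the path is absent from the spreadsheet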
  def xip_df_lookup(self, idx: pd.Index) -> tuple:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
  title = None
  description = None
@@ -289,139 +361,247 @@ class OpexManifestGenerator():
  pass
  else:
  if self.title_flag:
- title = check_nan(self.df.loc[idx,TITLE_FIELD].item())
+ title = check_nan(self.df.loc[idx,self.TITLE_FIELD].item())
  if self.description_flag:
- description = check_nan(self.df.loc[idx,DESCRIPTION_FIELD].item())
+ description = check_nan(self.df.loc[idx,self.DESCRIPTION_FIELD].item())
  if self.security_flag:
- security = check_nan(self.df.loc[idx,SECURITY_FIELD].item())
+ security = check_nan(self.df.loc[idx,self.SECURITY_FIELD].item())
  return title,description,security
+ except KeyError as e:
+ logger.exception(f'Key Error in XIP Lookup: {e}'
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in XIP Lookup: {e}. Proceeding...'
+ '\nIt is likely you have removed or added a file/folder to the directory'
+ '\nafter generating your input spreadsheet. An opex will still be generated, but information may be missing.'
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error Looking up XIP Metadata')
- print(e)
+ logger.exception(f'Error looking up XIP from Dataframe: {e}')
+ raise
 
  def removal_df_lookup(self, idx: pd.Index) -> bool:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
  if idx.empty:
  return False
  else:
- remove = check_nan(self.df.loc[idx,REMOVAL_FIELD].item())
+ remove = check_nan(self.df.loc[idx,self.REMOVAL_FIELD].item())
  if remove is not None:
  return True
  else:
  return False
+ except KeyError as e:
+ logger.exception(f'Key Error in Removal Lookup: {e}'
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in Removal Lookup: {e}. Proceeding...'
+ '\nIt is likely you have removed or added a file/folder to the directory'
+ '\nafter generating your input spreadsheet. An opex will still be generated, but information may be missing.'
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error looking up Removals')
- print(e)
+ logger.exception(f'Error looking up Removals from Dataframe: {e}')
+ raise
 
  def ignore_df_lookup(self, idx: pd.Index) -> bool:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
  if idx.empty:
  return False
  else:
- ignore = check_nan(self.df.loc[idx,IGNORE_FIELD].item())
+ ignore = check_nan(self.df.loc[idx,self.IGNORE_FIELD].item())
  return bool(ignore)
+ except KeyError as e:
+ logger.exception(f'Key Error in Ignore Lookup: {e}'
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in Ignore Lookup: {e}. Proceeding...'
+ '\nIt is likely you have removed or added a file/folder to the directory'
+ '\nafter generating your input spreadsheet. An opex will still be generated but information may be missing.'
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error looking up Ignore')
- print(e)
+ logger.exception(f'Error looking up Ignore from Dataframe: {e}')
+ return False
 
  def sourceid_df_lookup(self, xml_element: ET.SubElement, idx: pd.Index) -> None:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
  if idx.empty:
  pass
  else:
- sourceid = check_nan(self.df.loc[idx,SOURCEID_FIELD].item())
+ sourceid = check_nan(self.df.loc[idx,self.SOURCEID_FIELD].item())
  if sourceid:
  source_xml = ET.SubElement(xml_element,f"{{{self.opexns}}}SourceID")
  source_xml.text = str(sourceid)
+ except KeyError as e:
+ logger.exception(f'Key Error in SourceID Lookup: {e}'
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in SourceID Lookup: {e}. Proceeding...'
+ '\nIt is likely you have removed or added a file/folder to the directory'
+ '\nafter generating your input spreadsheet. An opex will still be generated but information may be missing.'
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error looking up SourceID')
- print(e)
+ logger.exception(f'Error looking up SourceID from Dataframe: {e}')
+ raise
 
  def hash_df_lookup(self, xml_fixities: ET.SubElement, idx: pd.Index) -> None:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
+ hash_value = None
+ algo_value = None
+ file_path = None
+
  if idx.empty:
- pass
+ return
  else:
- for algorithm_type in self.algorithm:
- self.fixity = ET.SubElement(xml_fixities,f"{{{self.opexns}}}Fixity")
- self.hash = self.df.loc[idx,HASH_FIELD].item()
- self.algorithm = self.df.loc[idx,ALGORITHM_FIELD].item()
- self.fixity.set('type', algorithm_type)
- self.fixity.set('value',self.hash)
+ # prefer the algorithm specified in the spreadsheet for this row
+ if not self.column_headers or (self.HASH_FIELD not in self.column_headers and self.ALGORITHM_FIELD not in self.column_headers):
+ return
+ hash_value = check_nan(self.df.loc[idx, self.HASH_FIELD].item())
+ algo_value = check_nan(self.df.loc[idx, self.ALGORITHM_FIELD].item())
+ file_path = check_nan(self.df.loc[idx, self.INDEX_FIELD].item()) if self.INDEX_FIELD in self.column_headers else None
+
+ if algo_value is not None:
+ self.fixity = ET.SubElement(xml_fixities, f"{{{self.opexns}}}Fixity")
+ self.fixity.set('type', algo_value)
+ self.fixity.set('value', str(hash_value))
+ logger.debug(f'Using Algorithm from Spreadsheet: {algo_value} with Hash: {hash_value}')
+
+ else:
+ if file_path is not None:
+ # fallback to configured algorithms
+ logger.debug('No Algorithm specified in Spreadsheet for this entry; falling back to configured algorithms.')
+ if file_path.endswith('.pax.zip') or file_path.endswith('.pax'):
+ self.list_fixity.extend(self.generate_pax_zip_opex_fixity(file_path, self.algorithm))
+ else:
+ self.list_fixity.extend(self.generate_opex_fixity(file_path, self.algorithm))
+ except KeyError as e:
+ logger.exception(f'Key Error in Hash Lookup: {e}'
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in Hash Lookup: {e}. Proceeding...'
+ '\nIt is likely you have removed or added a file/folder to the directory'
+ '\nafter generating your input spreadsheet. An opex will still be generated but information may be missing.'
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error looking up Hash')
- print(e)
+ logger.exception(f'Error looking up Hash from Dataframe: {e}')
+ raise
 
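In outline, the reworked hash_df_lookup trusts the spreadsheet row whenever its Algorithm cell is populated and only falls back to hashing the file itself otherwise. A hedged, self-contained sketch of that decision (check_nan below is a stand-in for the package helper of the same name; the frame is hypothetical):

    import pandas as pd

    def check_nan(value):
        # stand-in for opex_manifest_generator.common.check_nan: NaN -> None
        return None if pd.isna(value) else value

    df = pd.DataFrame({'Hash': ['9a0364b9e99bb480dd25e1f0284c8555'], 'Algorithm': ['MD5']})
    idx = df.index[:1]
    algo_value = check_nan(df.loc[idx, 'Algorithm'].item())
    hash_value = check_nan(df.loc[idx, 'Hash'].item())
    if algo_value is not None:
        print(f'emit <Fixity type="{algo_value}" value="{hash_value}"/> from the spreadsheet')
    else:
        print('no Algorithm in this row: hash the file with the configured algorithms')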
  def ident_df_lookup(self, idx: pd.Index, default_key: str = None) -> None:
+ if getattr(self, 'df', None) is None:
+ logger.error('Dataframe not initialised, cannot perform lookup')
+ raise RuntimeError('Dataframe not initialised, cannot perform lookup')
  try:
  if idx.empty:
  pass
  else:
  for header in self.column_headers:
  ident = None
- if any(s in header for s in {IDENTIFIER_FIELD,ARCREF_FIELD,ACCREF_FIELD}):
- if f'{IDENTIFIER_FIELD}:' in header:
+ if any(s in header for s in {self.IDENTIFIER_FIELD,self.ARCREF_FIELD,self.ACCREF_FIELD}):
+ if f'{self.IDENTIFIER_FIELD}:' in header:
  key_name = str(header).split(':',1)[-1]
- elif IDENTIFIER_FIELD in header:
- key_name = IDENTIFIER_DEFAULT
- elif ARCREF_FIELD in header:
- key_name = IDENTIFIER_DEFAULT
- elif ACCREF_FIELD in header:
- key_name = ACCREF_CODE
+ elif self.IDENTIFIER_FIELD in header:
+ key_name = self.IDENTIFIER_DEFAULT
+ elif self.ARCREF_FIELD in header:
+ key_name = self.IDENTIFIER_DEFAULT
+ elif self.ACCREF_FIELD in header:
+ key_name = self.ACCREF_CODE
  else:
- key_name = IDENTIFIER_DEFAULT
+ key_name = self.IDENTIFIER_DEFAULT
  ident = check_nan(self.df.loc[idx,header].item())
  if ident:
  self.identifier = ET.SubElement(self.identifiers, f"{{{self.opexns}}}Identifier")
  self.identifier.set("type", key_name)
  self.identifier.text = str(ident)
+ logger.debug(f'Adding Identifier: {header}: {ident}')
+ except KeyError as e:
+ logger.exception(f'Key Error in Identifier Lookup: {e}' \
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error in Identifier Lookup: {e}. Proceeding...' \
+ '\nIt is likely you have removed or added a file/folder to the directory' \
+ '\nafter generating your input spreadsheet. An opex will still be generated but xml information may be missing.' \
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
  except Exception as e:
- print('Error looking up Identifiers')
- print(e)
+ logger.exception(f'Error looking up Identifiers: {e}')
+ raise
 
  def init_generate_descriptive_metadata(self) -> None:
- self.xml_files = []
- for file in os.scandir(self.metadata_dir):
- if file.name.endswith('xml'):
- """
- Generates info on the elements of the XML Files placed in the Metadata directory.
- Composed as a list of dictionaries.
- """
- path = os.path.join(self.metadata_dir, file)
- xml_file = ET.parse(path)
- root_element = ET.QName(xml_file.find('.'))
- root_element_ln = root_element.localname
- #root_element_ns = root_element.namespace
- elements_list = []
- for elem in xml_file.findall('.//'):
- elem_path = xml_file.getelementpath(elem)
- elem = ET.QName(elem)
- elem_ln = elem.localname
- elem_ns = elem.namespace
- elem_lnpath = elem_path.replace(f"{{{elem_ns}}}", root_element_ln + ":")
- elements_list.append({"Name": root_element_ln + ":" + elem_ln, "Namespace": elem_ns, "Path": elem_lnpath})
-
- """
- Compares the column headers in the Spreadsheet against the headers. Filters out non-matching data.
- """
+ try:
+ self.xml_files = []
+ for file in os.scandir(self.metadata_dir):
  list_xml = []
- for elem_dict in elements_list:
- if elem_dict.get('Name') in self.column_headers or elem_dict.get('Path') in self.column_headers:
- list_xml.append({"Name": elem_dict.get('Name'), "Namespace": elem_dict.get('Namespace'), "Path": elem_dict.get('Path')})
- if len(list_xml) > 0:
- self.xml_files.append({'data': list_xml, 'localname': root_element_ln, 'xmlfile': path})
+ if file.name.endswith('xml'):
+ """
+ Generates info on the elements of the XML Files placed in the Metadata directory.
+ Composed as a list of dictionaries.
+ """
+ path = os.path.join(self.metadata_dir, file.name)
+ try:
+ xml_file = ET.parse(path)
+ except ET.XMLSyntaxError as e:
+ logger.exception(f'XML Syntax Error parsing file {file.name}: {e}')
+ raise
+ except FileNotFoundError as e:
+ logger.exception(f'XML file not found {file.name}: {e}')
+ raise
+ root_element = ET.QName(xml_file.find('.'))
+ root_element_ln = root_element.localname
+ #root_element_ns = root_element.namespace
+ elements_list = []
+ for elem in xml_file.findall('.//'):
+ elem_path = xml_file.getelementpath(elem)
+ elem = ET.QName(elem)
+ elem_ln = elem.localname
+ elem_ns = elem.namespace
+ elem_lnpath = elem_path.replace(f"{{{elem_ns}}}", root_element_ln + ":")
+ elements_list.append({"Name": root_element_ln + ":" + elem_ln, "Namespace": elem_ns, "Path": elem_lnpath})
 
+ """
+ Compares the column headers in the Spreadsheet against the headers. Filters out non-matching data.
+ """
+ try:
+ for elem_dict in elements_list:
+ if elem_dict.get('Name') in self.column_headers or elem_dict.get('Path') in self.column_headers:
+ list_xml.append({"Name": elem_dict.get('Name'), "Namespace": elem_dict.get('Namespace'), "Path": elem_dict.get('Path')})
+ except Exception as e:
+ logger.exception(f'Failed comparing Column headers in XML: {e}')
+ raise
+ if len(list_xml) != 0:
+ self.xml_files.append({'data': list_xml, 'localname': root_element_ln, 'xmlfile': path})
+ logger.debug(f'XML file: {file.name} with matching columns added for descriptive metadata.')
+ else:
+ logger.warning(f'No matching columns found in XML file: {file.name}, skipping.')
+ return self.xml_files
+ except FileNotFoundError as e:
+ logger.exception(f'Metadata directory not found: {e}')
+ raise
+ except Exception as e:
+ logger.exception(f'Failed to initialise XML Metadata: {e}')
+ raise
  def generate_descriptive_metadata(self, xml_desc_elem: ET.Element, idx: pd.Index) -> None:
  """
  Composes the data into an xml file.
  """
- for xml_file in self.xml_files:
- list_xml = xml_file.get('data')
- localname = xml_file.get('localname')
- if len(list_xml) == 0:
- pass
- else:
- if idx.empty:
+ try:
+ for xml_file in self.xml_files:
+ list_xml = xml_file.get('data')
+ localname = xml_file.get('localname')
+ if len(list_xml) == 0 or idx.empty:
  pass
  else:
  xml_new = ET.parse(xml_file.get('xmlfile'))
@@ -429,40 +609,44 @@ class OpexManifestGenerator():
  name = elem_dict.get('Name')
  path = elem_dict.get('Path')
  ns = elem_dict.get('Namespace')
- try:
- if self.metadata_flag in {'e', 'exact'}:
- val_series = self.df.loc[idx,path]
- val = check_nan(val_series.item())
- elif self.metadata_flag in {'f', 'flat'}:
- val_series = self.df.loc[idx,name]
- val = check_nan(val_series.item())
- if val is None:
+ if self.metadata_flag in {'e', 'exact'}:
+ val_series = self.df.loc[idx,path]
+ val = check_nan(val_series.item())
+ elif self.metadata_flag in {'f', 'flat'}:
+ val_series = self.df.loc[idx,name]
+ val = check_nan(val_series.item())
+ if val is None:
+ continue
+ else:
+ if is_datetime64_any_dtype(val_series):
+ val = pd.to_datetime(val)
+ val = datetime.strftime(val, "%Y-%m-%dT%H:%M:%S.000Z")
+ if self.metadata_flag in {'e','exact'}:
+ n = path.replace(localname + ":", f"{{{ns}}}")
+ elem = xml_new.find(f'./{n}')
+ if elem is None:
+ logger.warning(f'XML element not found for path: {n} in {xml_file.get("xmlfile")}')
+ continue
+ elif self.metadata_flag in {'f', 'flat'}:
+ n = name.split(':')[-1]
+ elem = xml_new.find(f'.//{{{ns}}}{n}')
+ if elem is None:
+ logger.warning(f'XML element not found for name: {name} in {xml_file.get("xmlfile")}')
  continue
- else:
- if is_datetime64_any_dtype(val_series):
- val = pd.to_datetime(val)
- val = datetime.datetime.strftime(val, "%Y-%m-%dT%H:%M:%S.000Z")
- if self.metadata_flag in {'e','exact'}:
- n = path.replace(localname + ":", f"{{{ns}}}")
- elem = xml_new.find(f'./{n}')
- elif self.metadata_flag in {'f', 'flat'}:
- n = name.split(':')[-1]
- elem = xml_new.find(f'.//{{{ns}}}{n}')
- elem.text = str(val)
- except KeyError as e:
- print('Key Error: please ensure column header\'s are an exact match...')
- print(f'Missing Column: {e}')
- print('Alternatively use flat mode...')
- time.sleep(3)
- raise SystemExit()
- except IndexError as e:
- print("""Index Error; it is likely you have removed or added a file/folder to the directory \
- after generating the spreadsheet. An opex will still be generated but with no xml metadata. \
- To ensure metadata match up please regenerate the spreadsheet...""")
- print(f'Error: {e}')
- time.sleep(5)
- break
+ elem.text = str(val)
  xml_desc_elem.append(xml_new.find('.'))
+ except KeyError as e:
+ logger.exception(f'Key Error in XML Lookup: {e}' \
+ '\nPlease ensure column headers are an exact match.')
+ raise
+ except IndexError as e:
+ logger.warning(f'Index Error: {e}' \
+ '\nIt is likely you have removed or added a file/folder to the directory' \
+ '\nafter generating your input spreadsheet. An opex will still be generated but with no xml metadata.' \
+ '\nTo ensure metadata matches up please regenerate the spreadsheet.')
+ except Exception as e:
+ logger.exception(f'General Error in XML Lookup: {e}')
+ raise
 
  def generate_opex_properties(self, xmlroot: ET.Element, idx: int, title: str = None,
  description: str = None, security: str = None) -> None:
@@ -479,61 +663,93 @@ class OpexManifestGenerator():
  if self.autoref_flag not in {"generic", "g"} or self.input:
  self.identifiers = ET.SubElement(self.properties, f"{{{self.opexns}}}Identifiers")
  self.ident_df_lookup(idx)
- if self.properties is None:
+ # remove Properties element if no children were added
+ if len(self.properties) == 0:
  xmlroot.remove(self.properties)
 
- def generate_opex_fixity(self, file_path: str) -> None:
- for algorithm_type in self.OMG.algorithm:
+ def generate_opex_fixity(self, file_path: str, algorithm: list | None = None) -> list:
+ """Generate fixities for a file. If algorithm is None, defaults to ['SHA-1']."""
+ algorithm = algorithm or ['SHA-1']
+ list_fixity = []
+ for algorithm_type in algorithm:
  self.fixity = ET.SubElement(self.fixities, f"{{{self.opexns}}}Fixity")
- self.hash = HashGenerator(algorithm = algorithm_type).hash_generator(file_path)
+ hash_value = HashGenerator(algorithm = algorithm_type).hash_generator(file_path)
  self.fixity.set("type", algorithm_type)
- self.fixity.set("value", self.hash)
- self.OMG.list_fixity.append([algorithm_type, self.hash, file_path])
- self.OMG.list_path.append(file_path)
-
- def generate_pax_zip_opex_fixity(self, file_path) -> None:
- for algorithm_type in self.OMG.algorithm:
- z = zipfile.ZipFile(file_path,'r')
- for file in z.filelist:
- self.fixity = ET.SubElement(self.fixities, f"{{{self.opexns}}}Fixity")
- self.hash = HashGenerator(algorithm = algorithm_type).hash_generator_pax_zip(file.filename, z)
- file_replace = file.filename.replace('\\','/')
- self.fixity.set("path", file_replace)
- self.fixity.set("type", algorithm_type)
- self.fixity.set("value", self.hash)
- self.OMG.list_fixity.append([algorithm_type, self.hash, file_path + file.filename])
- self.OMG.list_path.append(file_path)
+ self.fixity.set("value", hash_value)
+ list_fixity.append([algorithm_type, hash_value, file_path])
+ return list_fixity
+
+ def generate_pax_folder_opex_fixity(self, folder_path: str, fixitiesxml: ET._Element, filesxml: ET._Element, algorithm: list | None = None) -> tuple:
+ """Generate fixities for files inside a pax folder. If algorithm is None, defaults to ['SHA-1']."""
+ algorithm = algorithm or ['SHA-1']
+ list_fixity = []
+ list_path = []
+ for dir,_,files in os.walk(folder_path):
+ for filename in files:
+ rel_path = os.path.relpath(dir,folder_path)
+ rel_file = os.path.join(rel_path, filename).replace('\\','/')
+ abs_file = os.path.abspath(os.path.join(dir,filename))
+ list_path.append(abs_file)
+ for algorithm_type in algorithm:
+ self.fixity = ET.SubElement(fixitiesxml, f"{{{self.opexns}}}Fixity")
+ hash_value = HashGenerator(algorithm = algorithm_type).hash_generator(abs_file)
+ self.fixity.set("type", algorithm_type)
+ self.fixity.set("value", hash_value)
+ self.fixity.set("path", rel_file)
+ list_fixity.append([algorithm_type, hash_value, abs_file])
+ file = ET.SubElement(filesxml, f"{{{self.opexns}}}File")
+ file.set("type", "content")
+ file.set("size", str(os.path.getsize(abs_file)))
+ file.text = str(rel_file)
+ return list_fixity, list_path
 
+
+ def generate_pax_zip_opex_fixity(self, file_path: str, algorithm: list | None = None) -> list:
+ """Generate fixities for files inside a pax/zip. If algorithm is None, defaults to ['SHA-1']."""
+ algorithm = algorithm or ['SHA-1']
+ list_fixity = []
+ for algorithm_type in algorithm:
+ with zipfile.ZipFile(file_path, 'r') as z:
+ for file in z.filelist:
+ self.fixity = ET.SubElement(self.fixities, f"{{{self.opexns}}}Fixity")
+ hash_value = HashGenerator(algorithm = algorithm_type).hash_generator_pax_zip(file.filename, z)
+ file_replace = file.filename.replace('\\', '/')
+ self.fixity.set("path", file_replace)
+ self.fixity.set("type", algorithm_type)
+ self.fixity.set("value", hash_value)
+ list_fixity.append([algorithm_type, hash_value, f"{file_path}/{file.filename}"])
+ return list_fixity
+
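The fixity helpers now return the collected [algorithm, hash, path] rows instead of appending to self.OMG state, leaving each caller to decide what to record. A usage sketch of the new contract (the HashGenerator call shape is taken from the hunk above; the file path is hypothetical):

    from opex_manifest_generator.hash import HashGenerator

    list_fixity = []
    for algorithm_type in ['SHA-1']:  # the default named in the docstrings above
        hash_value = HashGenerator(algorithm=algorithm_type).hash_generator('/data/transfer/example.bin')
        list_fixity.append([algorithm_type, hash_value, '/data/transfer/example.bin'])
    # a caller such as OpexFile then extends the shared ledger:
    # self.OMG.list_fixity.extend(list_fixity)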
  def main(self) -> None:
- if self.print_xmls_flag:
- self.print_descriptive_xmls()
- input("Press Key to Close")
- raise SystemExit()
- print(f"Start time: {self.start_time}")
  if self.clear_opex_flag:
  self.clear_opex()
- if self.autoref_flag or self.algorithm or self.input:
+ if self.autoref_flag or self.algorithm or self.input or self.zip_flag or self.export_flag or self.empty_flag or self.removal_flag:
  pass
  else:
- print_running_time(self.start_time)
- print('Cleared OPEXES. No additional arguments passed, so ending program.'); time.sleep(3)
+ logger.info('Cleared Opexes. No additional arguments passed, so ending program.')
  raise SystemExit()
  if self.empty_flag:
- ReferenceGenerator(self.root, self.output_path, meta_dir_flag = self.meta_dir_flag).remove_empty_directories()
+ logger.debug('Removing empty directories as per empty flag.')
+ ReferenceGenerator(self.root, self.output_path, meta_dir_flag = self.meta_dir_flag).remove_empty_directories(self.empty_export_flag)
+ df_flag = False
  if not self.autoref_flag in {"g", "generic"}:
- self.init_df()
+ logger.debug('Auto Reference flag not set to generic, checking for Dataframe requirement.')
+ df_flag = self.init_df()
  self.count = 1
- if not self.metadata_flag in {'none', 'n'}:
+ if self.metadata_flag is not None:
+ if not df_flag:
+ logger.error('Metadata generation requires Auto Reference or Input file to be specified.')
+ raise ValueError('Metadata generation requires Auto Reference or Input file to be specified.')
  self.init_generate_descriptive_metadata()
  OpexDir(self, self.root).generate_opex_dirs(self.root)
  if self.algorithm:
- output_path = define_output_file(self.output_path, self.root, METAFOLDER, self.meta_dir_flag, output_suffix = FIXITY_SUFFIX, output_format = "txt")
+ output_path = define_output_file(self.output_path, self.root, self.METAFOLDER, self.meta_dir_flag, output_suffix = self.FIXITY_SUFFIX, output_format = "txt")
  if self.fixity_export_flag:
  export_list_txt(self.list_fixity, output_path)
  if self.removal_flag:
- output_path = define_output_file(self.output_path, self.root, METADFOLDER, self.meta_dir_flag, output_suffix = REMOVALS_SUFFIX, output_format = "txt")
- export_list_txt(self.removal_list, output_path)
- print_running_time(self.start_time)
+ output_path = define_output_file(self.output_path, self.root, self.METAFOLDER, self.meta_dir_flag, output_suffix = self.REMOVALS_SUFFIX, output_format = "txt")
+ if self.removal_export_flag:
+ export_list_txt(self.removal_list, output_path)
 
  class OpexDir(OpexManifestGenerator):
  def __init__(self, OMG: OpexManifestGenerator, folder_path: str, title: str = None, description: str = None, security: str = None) -> None:
@@ -562,10 +778,12 @@ class OpexDir(OpexManifestGenerator):
  if self.OMG.ignore_flag:
  self.ignore = self.OMG.ignore_df_lookup(index)
  if self.ignore:
+ logger.info(f'Ignoring folder as per ignore flag in spreadsheet: {self.folder_path}')
  return
  if self.OMG.removal_flag:
  self.removal = self.OMG.removal_df_lookup(index)
  if self.removal:
+ logger.info(f'Removing folder as per removal flag in spreadsheet: {self.folder_path}')
  remove_tree(self.folder_path, self.OMG.removal_list)
  return
  self.xmlroot = ET.Element(f"{{{self.opexns}}}OPEXMetadata", nsmap={"opex":self.opexns})
@@ -587,35 +805,27 @@ class OpexDir(OpexManifestGenerator):
  if security is not None:
  self.security = security
  else:
- self.security = GENERIC_DEFAULT_SECURITY
+ self.security = self.GENERIC_DEFAULT_SECURITY
  else:
  self.title = title
  self.description = description
  self.security = security
  if self.OMG.sourceid_flag:
- self.OMG.sourceid_df_lookup(self.transfer, self.folder_path, index)
+ self.OMG.sourceid_df_lookup(self.transfer, index)
+ # Handling Fixities for PAX Folders
  if self.OMG.algorithm and self.OMG.pax_fixity_flag is True and self.folder_path.endswith(".pax"):
  self.fixities = ET.SubElement(self.transfer, f"{{{self.opexns}}}Fixities")
- for dir,_,files in os.walk(folder_path):
- for filename in files:
- rel_path = os.path.relpath(dir,folder_path)
- rel_file = os.path.join(rel_path, filename)
- rel_file = rel_file.replace('\\','/')
- abs_file = os.path.abspath(os.path.join(dir,filename))
- self.generate_opex_fixity(abs_file)
- self.fixity.set("path",rel_file)
- file = ET.SubElement(self.files, f"{{{self.opexns}}}File")
- file.set("type", "content")
- file.set("size", str(os.path.getsize(abs_file)))
- file.text = str(rel_file)
+ tmp_list_fixity,tmp_list_path = self.OMG.generate_pax_folder_opex_fixity(self.folder_path, self.fixities, self.files, self.OMG.algorithm)
+ self.OMG.list_fixity.extend(tmp_list_fixity)
+ self.OMG.list_path.extend(tmp_list_path)
  if self.OMG.autoref_flag or self.OMG.input:
  self.OMG.generate_opex_properties(self.xmlroot, index,
  title = self.title,
  description = self.description,
  security = self.security)
- if not self.OMG.metadata_flag in {'none', 'n'}:
+ if self.OMG.metadata_flag is not None:
  self.xml_descmeta = ET.SubElement(self.xmlroot,f"{{{self.opexns}}}DescriptiveMetadata")
- self.OMG.generate_descriptive_metadata(self.xmlroot, idx = index)
+ self.OMG.generate_descriptive_metadata(self.xml_descmeta, idx = index)
 
  def filter_directories(self, directory: str, sort_key: str = str.casefold) -> list:
  try:
@@ -623,82 +833,85 @@ class OpexDir(OpexManifestGenerator):
  list_directories = sorted([win_256_check(os.path.join(directory, f.name)) for f in os.scandir(directory)
  if not f.name.startswith('.')
  and filter_win_hidden(win_256_check(os.path.join(directory, f.name))) is False
- and f.name != METAFOLDER
+ and not f.name in ('opex_generate.exe','opex_generate.bin')
+ and f.name != self.OMG.METAFOLDER
  and f.name != os.path.basename(__file__)],
  key=sort_key)
  elif self.OMG.hidden_flag is True:
- list_directories = sorted([os.path.join(directory, f.name) for f in os.scandir(directory) \
- if f.name != METAFOLDER
+ list_directories = sorted([win_256_check(os.path.join(directory, f.name)) for f in os.scandir(directory) \
+ if f.name != self.OMG.METAFOLDER
+ and not f.name in ('opex_generate.exe','opex_generate.bin')
  and f.name != os.path.basename(__file__)],
  key=sort_key)
  return list_directories
  except Exception as e:
- print('Failed to Filter')
- print(e)
- raise SystemError()
+ logger.exception(f'Failed to Filter Directories: {e}')
+ raise
 
  def generate_opex_dirs(self, path: str) -> None:
  """"
  This function loops recursively through a given directory.
 
- There are two loops to first generate Opexes for Files;
+ There are two loops to first generate Opexes for Files; Then Generate the Folder Opex Manifests.
  """
- self = OpexDir(self.OMG, path)
- if self.OMG.algorithm and self.OMG.pax_fixity_flag is True and self.folder_path.endswith(".pax"):
- opex_path = os.path.abspath(self.folder_path)
+ current = OpexDir(self.OMG, path)
+ if current.OMG.algorithm and current.OMG.pax_fixity_flag is True and current.folder_path.endswith(".pax"):
+ opex_path = os.path.abspath(current.folder_path)
  else:
- opex_path = os.path.join(os.path.abspath(self.folder_path), os.path.basename(self.folder_path))
+ opex_path = os.path.join(os.path.abspath(current.folder_path), os.path.basename(current.folder_path))
  #First Loop to Generate Folder Manifest Opexes & Individual File Opexes.
- if self.removal is True:
+ if current.removal is True:
  #If removal is True for Folder, then it will be removed - Does not need to descend.
  pass
  else:
- for f_path in self.filter_directories(path):
+ for f_path in current.filter_directories(path):
  if f_path.endswith('.opex'):
  #Ignores OPEX files / directories...
  pass
  elif os.path.isdir(f_path):
- if self.ignore is True or \
- (self.OMG.removal_flag is True and \
- self.OMG.removal_df_lookup(self.OMG.index_df_lookup(f_path)) is True):
+ if current.ignore is True or \
+ (current.OMG.removal_flag is True and \
+ current.OMG.removal_df_lookup(current.OMG.index_df_lookup(f_path)) is True):
  #If Ignore is True, or the Folder below is marked for Removal: Don't add to Opex
  pass
  else:
  #Add Folder to OPEX Manifest (doesn't get written yet...)
- self.folder = ET.SubElement(self.folders, f"{{{self.opexns}}}Folder")
- self.folder.text = str(os.path.basename(f_path))
- if self.OMG.algorithm and self.OMG.pax_fixity_flag is True and self.folder_path.endswith(".pax"):
- #If using fixity, but the folder is a PAX & using PAX Fixity: End descent.
+ current.folder = ET.SubElement(current.folders, f"{{{current.opexns}}}Folder")
+ current.folder.text = str(os.path.basename(f_path))
+ if current.OMG.algorithm and current.OMG.pax_fixity_flag is True and current.folder_path.endswith(".pax"):
+ #If using fixity, but the current folder is a PAX & using PAX Fixity: End descent.
  pass
  else:
  #Recurse Descent.
- self.generate_opex_dirs(f_path)
+ current.generate_opex_dirs(f_path)
  elif os.path.isfile(f_path):
  #Processes OPEXes for individual Files: this gets written.
- OpexFile(self.OMG, f_path)
+ OpexFile(current.OMG, f_path)
  else:
- print('Unknown File Type?')
+ logger.warning(f'Unknown File Type at: {f_path}')
  pass
  #Second Loop to add previously generated Opexes to Folder Manifest.
- if self.removal is True or self.ignore is True:
+ if current.removal is True or current.ignore is True:
+ logger.debug(f'Skipping Opex generation for: {current.folder_path}')
  pass
  else:
  if check_opex(opex_path):
  #Only processing Opexes.
- for f_path in self.filter_directories(path):
+ for f_path in current.filter_directories(path):
  if os.path.isfile(f_path):
- file = ET.SubElement(self.files, f"{{{self.opexns}}}File")
+ file = ET.SubElement(current.files, f"{{{current.opexns}}}File")
  if f_path.endswith('.opex'):
  file.set("type", "metadata")
  else:
  file.set("type", "content")
  file.set("size", str(os.path.getsize(f_path)))
  file.text = str(os.path.basename(f_path))
+ logger.debug(f'Adding File to Opex Manifest: {f_path}')
  #Writes Folder OPEX
- write_opex(opex_path, self.xmlroot)
+ write_opex(opex_path, current.xmlroot)
  else:
  #Avoids Override if exists, lets you continue where left off.
- print(f"Avoiding override, Opex exists at: {opex_path}")
+ logger.info(f"Avoiding override, Opex exists at: {opex_path}")
 
  class OpexFile(OpexManifestGenerator):
  def __init__(self, OMG: OpexManifestGenerator, file_path: str, title: str = None, description: str = None, security: str = None) -> None:
@@ -709,6 +922,7 @@ class OpexFile(OpexManifestGenerator):
  else:
  self.file_path = file_path
  if check_opex(self.file_path):
+ index = None
  if any([self.OMG.input,
  self.OMG.autoref_flag in {"c","catalog","a","accession","b","both","cg","catalog-generic","ag","accession-generic","bg","both-generic"},
  self.OMG.ignore_flag,
@@ -720,8 +934,6 @@ class OpexFile(OpexManifestGenerator):
  index = self.OMG.index_df_lookup(self.file_path)
  elif self.OMG.autoref_flag is None or self.OMG.autoref_flag in {"g","generic"}:
  index = None
- else:
- index = None
  self.ignore = False
  self.removal = False
  if self.OMG.ignore_flag:
@@ -746,25 +958,28 @@ class OpexFile(OpexManifestGenerator):
  if security is not None:
  self.security = security
  else:
- self.security = GENERIC_DEFAULT_SECURITY
+ self.security = self.GENERIC_DEFAULT_SECURITY
  else:
  self.title = title
  self.description = description
  self.security = security
+ opex_path = None
  if self.OMG.algorithm or self.OMG.autoref_flag or self.OMG.input:
  self.xmlroot = ET.Element(f"{{{self.opexns}}}OPEXMetadata", nsmap={"opex":self.opexns})
  self.transfer = ET.SubElement(self.xmlroot, f"{{{self.opexns}}}Transfer")
  if self.OMG.sourceid_flag:
- self.OMG.sourceid_df_lookup(self.transfer, self.file_path)
+ self.OMG.sourceid_df_lookup(self.transfer, index)
  if self.OMG.algorithm:
  self.fixities = ET.SubElement(self.transfer, f"{{{self.opexns}}}Fixities")
  if self.OMG.hash_from_spread:
  self.OMG.hash_df_lookup(self.fixities, index)
  else:
+ self.OMG.list_path.append(self.file_path)
  if self.OMG.pax_fixity_flag is True and (self.file_path.endswith("pax.zip") or self.file_path.endswith(".pax")):
- self.generate_pax_zip_opex_fixity(self.file_path)
+ tmp_list_fixity = self.generate_pax_zip_opex_fixity(self.file_path, self.OMG.algorithm)
  else:
- self.generate_opex_fixity(self.file_path)
+ tmp_list_fixity = self.generate_opex_fixity(self.file_path, self.OMG.algorithm)
+ self.OMG.list_fixity.extend(tmp_list_fixity)
  if self.transfer is None:
  self.xmlroot.remove(self.transfer)
  if self.OMG.autoref_flag or self.OMG.input:
@@ -772,11 +987,18 @@ class OpexFile(OpexManifestGenerator):
  title = self.title,
  description = self.description,
  security = self.security)
- if not self.OMG.metadata_flag in {'none','n'}:
+ if self.OMG.metadata_flag is not None:
  self.xml_descmeta = ET.SubElement(self.xmlroot, f"{{{self.opexns}}}DescriptiveMetadata")
  self.OMG.generate_descriptive_metadata(self.xml_descmeta, index)
  opex_path = write_opex(self.file_path, self.xmlroot)
- if self.OMG.zip_flag:
- zip_opex(self.file_path, opex_path)
+ # Zip cannot be activated unless another flag - which
+ if self.OMG.zip_flag:
+ zip_opex(self.file_path, opex_path)
+ if self.OMG.zip_file_removal:
+ os.remove(self.file_path)
+ if os.path.exists(opex_path):
+ os.remove(opex_path)
+ logger.debug(f'Removed file: {opex_path}')
+ logger.debug(f'Removed file: {self.file_path}')
  else:
- print(f"Avoiding override, Opex exists at: {self.file_path}: ")
+ logger.info(f"Avoiding override, Opex exists at: {self.file_path}")
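Taken together, zip_flag zips each file with its generated .opex, and the new zip_file_removal then deletes both originals. A hedged usage sketch (the import path and the algorithm keyword are assumptions from context; the directory is hypothetical):

    from opex_manifest_generator import OpexManifestGenerator  # import path assumed

    omg = OpexManifestGenerator(
        '/data/transfer',       # hypothetical root directory to manifest
        algorithm=['SHA-1'],    # assumed keyword; self.algorithm is used throughout the module
        zip_flag=True,          # zip each file together with its .opex
        zip_file_removal=True,  # new in 1.3.4: remove the source file and .opex after zipping
    )
    omg.main()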