esgf-qa 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/run_qa.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import argparse
2
2
  import csv
3
3
  import datetime
4
- import difflib
5
4
  import hashlib
6
5
  import json
7
6
  import multiprocessing
@@ -14,45 +13,19 @@ from pathlib import Path
14
13
  from compliance_checker import __version__ as cc_version
15
14
  from compliance_checker.runner import CheckSuite
16
15
 
16
+ from esgf_qa._constants import (
17
+ DRS_path_parent,
18
+ checker_dict,
19
+ checker_dict_ext,
20
+ checker_release_versions,
21
+ )
17
22
  from esgf_qa._version import version
23
+ from esgf_qa.cluster_results import QAResultAggregator
18
24
  from esgf_qa.con_checks import compatibility_checks as comp # noqa
19
25
  from esgf_qa.con_checks import consistency_checks as cons # noqa
20
26
  from esgf_qa.con_checks import continuity_checks as cont # noqa
21
27
  from esgf_qa.con_checks import dataset_coverage_checks, inter_dataset_consistency_checks
22
28
 
23
- checker_dict = {
24
- "cc6": "CORDEX-CMIP6",
25
- "cf": "CF-Conventions",
26
- "mip": "MIP",
27
- "plugin_cmip6": "CMIP6",
28
- # "wcrp-cmip5": "CMIP5",
29
- "wcrp_cmip6": "CMIP6",
30
- # "wcrp_cmip7": "CMIP7-AFT",
31
- # "wcrp_cmip7": "CMIP7",
32
- # "wcrp_cordex": "CORDEX",
33
- "wcrp_cordex_cmip6": "CORDEX-CMIP6",
34
- # "obs4mips": "Obs4MIPs",
35
- # "input4mips": "Input4MIPs",
36
- }
37
- DRS_path_parent = {
38
- "CMIP5": "CMIP5",
39
- "CMIP6": "CMIP6",
40
- "CMIP7": "CMIP7",
41
- "CMIP7-AFT": "CMIP7",
42
- "CORDEX": "CORDEX",
43
- "CORDEX-CMIP6": "CORDEX-CMIP6",
44
- "Obs4MIPs": "Obs4MIPs",
45
- "Input4MIPs": "Input4MIPs",
46
- }
47
- checker_release_versions = {}
48
- checker_dict_ext = {
49
- # "pcons": "ParentConsistency"
50
- "cons": "Consistency",
51
- "cont": "Continuity",
52
- "comp": "Compatibility",
53
- **checker_dict,
54
- }
55
-
56
29
  _timestamp_with_ms = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
57
30
  _timestamp_filename = datetime.datetime.strptime(
58
31
  _timestamp_with_ms, "%Y%m%d-%H%M%S%f"
@@ -62,321 +35,15 @@ _timestamp_pprint = datetime.datetime.strptime(
62
35
  ).strftime("%Y-%m-%d %H:%M")
63
36
 
64
37
 
65
- class QAResultAggregator:
66
- def __init__(self, checker_dict):
67
- """
68
- Initialize the aggregator with an empty summary.
69
- """
70
- self.summary = {
71
- "error": defaultdict(
72
- lambda: defaultdict(lambda: defaultdict(list))
73
- ), # No weight, just function -> error msg
74
- "fail": defaultdict(
75
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
76
- ), # weight -> test -> msg -> dsid -> filenames
77
- }
78
- self.checker_dict = checker_dict
79
-
80
- def update(self, result_dict, dsid, file_name):
81
- """
82
- Update the summary with a single result of a cc-run.
83
- """
84
- for checker in result_dict:
85
- for test in result_dict[checker]:
86
- if test == "errors":
87
- for function_name, error_msg in result_dict[checker][
88
- "errors"
89
- ].items():
90
- self.summary["error"][
91
- f"[{checker_dict[checker]}] " + function_name
92
- ][error_msg][dsid].append(file_name)
93
- else:
94
- score, max_score = result_dict[checker][test]["value"]
95
- weight = result_dict[checker][test].get("weight", 3)
96
- msgs = result_dict[checker][test].get("msgs", [])
97
- if score < max_score: # test outcome: fail
98
- for msg in msgs:
99
- self.summary["fail"][weight][
100
- f"[{checker_dict[checker]}] " + test
101
- ][msg][dsid].append(file_name)
102
-
103
- def update_ds(self, result_dict, dsid):
104
- """
105
- Update the summary with a single result of a esgf-qa run.
106
- """
107
- for checker in result_dict:
108
- for test in result_dict[checker]:
109
- if test == "errors":
110
- for function_name, errdict in result_dict[checker][
111
- "errors"
112
- ].items():
113
- for file_name in errdict["files"]:
114
- self.summary["error"][
115
- f"[{checker_dict_ext[checker]}] " + function_name
116
- ][errdict["msg"]][dsid].append(file_name)
117
- else:
118
- weight = result_dict[checker][test].get("weight", 3)
119
- fails = result_dict[checker][test].get("msgs", {})
120
- for msg, file_names in fails.items():
121
- for file_name in file_names:
122
- self.summary["fail"][weight][
123
- f"[{checker_dict_ext[checker]}] " + test
124
- ][msg][dsid].append(file_name)
125
-
126
- def sort(self):
127
- """
128
- Sort the summary.
129
- """
130
- self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
131
- for key in self.summary["fail"]:
132
- self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
133
-
134
- # Sort errors by function name
135
- for checker in self.summary["error"]:
136
- self.summary["error"][checker] = dict(
137
- sorted(self.summary["error"][checker].items())
138
- )
139
-
140
- @staticmethod
141
- def cluster_messages(messages, threshold):
142
- clusters = []
143
- while messages:
144
- base = messages.pop(0)
145
- cluster = [base]
146
- to_remove = []
147
- for msg in messages:
148
- ratio = difflib.SequenceMatcher(None, base, msg).ratio()
149
- if ratio >= threshold:
150
- cluster.append(msg)
151
- to_remove.append(msg)
152
- for msg in to_remove:
153
- messages.remove(msg)
154
- clusters.append(cluster)
155
- return clusters
156
-
157
- @staticmethod
158
- def generalize_message_group(messages):
159
- if len(messages) == 1:
160
- return messages[0], {}
161
-
162
- # Split messages into tokens
163
- split_messages = [re.findall(r"\w+|\W", m) for m in messages]
164
- transposed = list(zip(*split_messages))
165
- template = []
166
- placeholders = {}
167
- var_index = 0
168
-
169
- for i, tokens in enumerate(transposed):
170
- unique_tokens = set(tokens)
171
- if len(unique_tokens) == 1:
172
- template.append(tokens[0])
173
- else:
174
- var_name = chr(ord("A") + var_index)
175
- template.append(f"{{{var_name}}}")
176
- placeholders[var_name] = tokens[0]
177
- var_index += 1
178
-
179
- # Merge placeholders if possible
180
- template, placeholders = QAResultAggregator.merge_placeholders(
181
- template, placeholders
182
- )
183
-
184
- # Return the generalized message and the placeholders
185
- generalized = "".join(template)
186
- return generalized, placeholders
187
-
188
- @staticmethod
189
- def merge_placeholders(list_of_strings, dictionary, skip=0):
190
- def find_next_two_placeholders(list_of_strings, skip):
191
- placeholders = [
192
- s for s in list_of_strings if s.startswith("{") and s.endswith("}")
193
- ]
194
- if len(placeholders) < 2:
195
- return None, None
196
- return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
197
- placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
198
- )
199
-
200
- def extract_text_between_placeholders(
201
- list_of_strings, placeholder1, placeholder2
202
- ):
203
- idx1 = list_of_strings.index(placeholder1)
204
- idx2 = list_of_strings.index(placeholder2)
205
- return "".join(list_of_strings[idx1 + 1 : idx2])
206
-
207
- def merge_two_placeholders(
208
- placeholder1, placeholder2, text_between, dictionary
209
- ):
210
- new_value = (
211
- dictionary[placeholder1.lstrip("{").rstrip("}")]
212
- + text_between
213
- + dictionary[placeholder2.lstrip("{").rstrip("}")]
214
- )
215
- dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
216
- del dictionary[placeholder2.lstrip("{").rstrip("}")]
217
- return dictionary
218
-
219
- def update_placeholder_names(list_of_strings, dictionary):
220
- old_placeholders = sorted(list(dictionary.keys()))
221
- new_placeholders = [
222
- chr(ord("A") + i) for i in range(0, len(old_placeholders))
223
- ]
224
- new_dictionary = dict(
225
- zip(new_placeholders, [dictionary[val] for val in old_placeholders])
226
- )
227
- for old, new in zip(old_placeholders, new_placeholders):
228
- list_of_strings = [
229
- s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
230
- ]
231
- return list_of_strings, new_dictionary
232
-
233
- def replace_placeholders_with_new_one(
234
- list_of_strings, placeholder1, placeholder2
235
- ):
236
- idx1 = list_of_strings.index(placeholder1)
237
- idx2 = list_of_strings.index(placeholder2)
238
- list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
239
- if idx2 < len(list_of_strings) + 1:
240
- list_of_strings_new += list_of_strings[idx2 + 1 :]
241
- return list_of_strings_new
242
-
243
- if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
244
- return list_of_strings, dictionary
245
-
246
- placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
247
- if placeholder1 is None or placeholder2 is None:
248
- return list_of_strings, dictionary
249
-
250
- text_between = extract_text_between_placeholders(
251
- list_of_strings, placeholder1, placeholder2
252
- )
253
- if len(text_between) < 5:
254
- dictionary = merge_two_placeholders(
255
- placeholder1, placeholder2, text_between, dictionary
256
- )
257
- list_of_strings = replace_placeholders_with_new_one(
258
- list_of_strings, placeholder1, placeholder2
259
- )
260
- list_of_strings, dictionary = update_placeholder_names(
261
- list_of_strings, dictionary
262
- )
263
- return QAResultAggregator.merge_placeholders(
264
- list_of_strings, dictionary, skip
265
- )
266
- else:
267
- return QAResultAggregator.merge_placeholders(
268
- list_of_strings, dictionary, skip + 1
269
- )
270
-
271
- def cluster_summary(self, threshold=0.75):
272
- self.clustered_summary = defaultdict(
273
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
274
- )
275
- for status in self.summary:
276
- if status == "error":
277
- for test_id in self.summary[status]:
278
- messages = list(self.summary[status][test_id].keys())
279
- # Pass a copy of messages to cluster_messages to generate clusters
280
- clusters = QAResultAggregator.cluster_messages(
281
- messages[:], threshold
282
- )
283
-
284
- for cluster in clusters:
285
- generalized, placeholders = (
286
- QAResultAggregator.generalize_message_group(cluster)
287
- )
288
- example_parts = ", ".join(
289
- [
290
- (
291
- f"{k}='{v[0]}'"
292
- if isinstance(v, list)
293
- else f"{k}='{v}'"
294
- )
295
- for k, v in placeholders.items()
296
- ]
297
- )
298
- if example_parts:
299
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
300
- else:
301
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
302
-
303
- # Gather all ds_ids and filenames across the cluster
304
- combined = defaultdict(set)
305
- for message in cluster:
306
- for ds_id, files in self.summary[status][test_id][
307
- message
308
- ].items():
309
- combined[ds_id].update(files)
310
-
311
- # Shorten file lists to one example
312
- formatted = {
313
- ds_id
314
- + " ("
315
- + str(len(files))
316
- + f" file{'s' if len(files) > 1 else ''} affected)": (
317
- [f"e.g. '{next(iter(files))}'"]
318
- if len(files) > 1
319
- else [f"'{next(iter(files))}'"]
320
- )
321
- for ds_id, files in combined.items()
322
- }
323
-
324
- self.clustered_summary[status][test_id][msg_summary] = formatted
325
- elif status == "fail":
326
- for weight in self.summary[status]:
327
- for test_id in self.summary[status][weight]:
328
- messages = list(self.summary[status][weight][test_id].keys())
329
- # Pass a copy of messages to cluster_messages to generate clusters
330
- clusters = QAResultAggregator.cluster_messages(
331
- messages[:], threshold
332
- )
333
-
334
- for cluster in clusters:
335
- generalized, placeholders = (
336
- QAResultAggregator.generalize_message_group(cluster)
337
- )
338
- example_parts = ", ".join(
339
- [
340
- (
341
- f"{k}='{v[0]}'"
342
- if isinstance(v, list)
343
- else f"{k}='{v}'"
344
- )
345
- for k, v in placeholders.items()
346
- ]
347
- )
348
- if example_parts:
349
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
350
- else:
351
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
352
-
353
- # Gather all ds_ids and filenames across the cluster
354
- combined = defaultdict(set)
355
- for message in cluster:
356
- for ds_id, files in self.summary[status][weight][
357
- test_id
358
- ][message].items():
359
- combined[ds_id].update(files)
360
-
361
- # Shorten file lists to one example
362
- formatted = {
363
- ds_id
364
- + " ("
365
- + str(len(files))
366
- + f" file{'s' if len(files) > 1 else ''} affected)": (
367
- [f"e.g. '{next(iter(files))}'"]
368
- if len(files) > 1
369
- else [f"'{next(iter(files))}'"]
370
- )
371
- for ds_id, files in combined.items()
372
- }
373
-
374
- self.clustered_summary[status][weight][test_id][
375
- msg_summary
376
- ] = formatted
377
-
378
-
379
38
  def get_default_result_dir():
39
+ """
40
+ Get the default result directory.
41
+
42
+ Returns
43
+ -------
44
+ str
45
+ Default result directory.
46
+ """
380
47
  global _timestamp
381
48
  global _timestamp_with_ms
382
49
  hash_object = hashlib.md5(_timestamp_with_ms.encode())
@@ -385,7 +52,27 @@ def get_default_result_dir():
385
52
  + f"/esgf-qa-results_{_timestamp_filename}_{hash_object.hexdigest()}"
386
53
  )
387
54
 
55
+
388
56
  def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
57
+ """
58
+ Get the dataset id for a file.
59
+
60
+ Parameters
61
+ ----------
62
+ files_to_check_dict : dict
63
+ Dictionary of files to check.
64
+ dataset_files_map_ext : dict
65
+ Dictionary of dataset files.
66
+ file_path : str
67
+ Path to the file.
68
+ project_id : str
69
+ Project id.
70
+
71
+ Returns
72
+ -------
73
+ str
74
+ Dataset id.
75
+ """
389
76
  dir_id = files_to_check_dict[file_path]["id_dir"].split("/")
390
77
  fn_id = files_to_check_dict[file_path]["id_fn"].split("_")
391
78
  if project_id in dir_id:
@@ -397,7 +84,24 @@ def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
397
84
  dsid += "." + ".".join(fn_id)
398
85
  return dsid
399
86
 
87
+
400
88
  def get_checker_release_versions(checkers, checker_options={}):
89
+ """
90
+ Get the release versions of the checkers.
91
+
92
+ Parameters
93
+ ----------
94
+ checkers : list
95
+ A list of checkers to get the release versions for.
96
+ checker_options : dict, optional
97
+ A dictionary of options for the checkers.
98
+ Example format: {"cf": {"check_dimension_order": True}}
99
+
100
+ Returns
101
+ -------
102
+ None
103
+ Updates the global dictionary ``checker_release_versions``.
104
+ """
401
105
  global checker_release_versions
402
106
  global checker_dict
403
107
  global checker_dict_ext
@@ -419,11 +123,20 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
419
123
  """
420
124
  Run the compliance checker on a file with the specified checkers and options.
421
125
 
422
- Parameters:
423
- file_path (str): Path to the file to be checked.
424
- checkers (list): List of checkers to run.
425
- checker_options (dict): Dictionary of options for each checker.
426
- Example format: {"cf": {"check_dimension_order": True}}
126
+ Parameters
127
+ ----------
128
+ file_path : str
129
+ The path to the file to be checked.
130
+ checkers : list
131
+ A list of checkers to run.
132
+ checker_options : dict, optional
133
+ A dictionary of options for the checkers.
134
+ Example format: {"cf": {"check_dimension_order": True}}
135
+
136
+ Returns
137
+ -------
138
+ dict
139
+ A dictionary containing the results of the compliance checker.
427
140
  """
428
141
  check_suite = CheckSuite(options=checker_options)
429
142
  check_suite.load_all_available_checkers()
@@ -458,6 +171,21 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
458
171
 
459
172
 
460
173
  def track_checked_datasets(checked_datasets_file, checked_datasets):
174
+ """
175
+ Track checked datasets.
176
+
177
+ Parameters
178
+ ----------
179
+ checked_datasets_file : str
180
+ The path to the file to track checked datasets.
181
+ checked_datasets : list
182
+ A list of checked datasets.
183
+
184
+ Returns
185
+ -------
186
+ None
187
+ Writes the checked datasets to the file.
188
+ """
461
189
  with open(checked_datasets_file, "a") as file:
462
190
  writer = csv.writer(file)
463
191
  for dataset_id in checked_datasets:
@@ -472,6 +200,29 @@ def process_file(
472
200
  processed_files,
473
201
  progress_file,
474
202
  ):
203
+ """
204
+ Runs cc checks for a single file.
205
+
206
+ Parameters
207
+ ----------
208
+ file_path : str
209
+ The path to the file to be checked.
210
+ checkers : list
211
+ A list of checkers to run.
212
+ checker_options : dict
213
+ A dictionary of options for the checkers.
214
+ files_to_check_dict : dict
215
+ A special dictionary mapping files to check to datasets.
216
+ processed_files : list
217
+ A list of files that have already been checked.
218
+ progress_file : str
219
+ The path to the progress file.
220
+
221
+ Returns
222
+ -------
223
+ tuple
224
+ A tuple containing the file path and the results of the compliance checker.
225
+ """
475
226
  # Read result from disk if check was run previously
476
227
  result_file = files_to_check_dict[file_path]["result_file"]
477
228
  consistency_file = files_to_check_dict[file_path]["consistency_file"]
@@ -567,6 +318,31 @@ def process_dataset(
567
318
  processed_datasets,
568
319
  progress_file,
569
320
  ):
321
+ """
322
+ Runs esgf_qa checks on a dataset.
323
+
324
+ Parameters
325
+ ----------
326
+ ds : str
327
+ Dataset to process.
328
+ ds_map : dict
329
+ Dictionary mapping dataset IDs to file paths.
330
+ checkers : list
331
+ List of checkers to run.
332
+ checker_options : dict
333
+ Dictionary of checker options.
334
+ files_to_check_dict : dict
335
+ A special dictionary mapping files to check to datasets.
336
+ processed_datasets : set
337
+ Set of processed datasets.
338
+ progress_file : str
339
+ Path to progress file.
340
+
341
+ Returns
342
+ -------
343
+ tuple
344
+ Dataset ID and check results.
345
+ """
570
346
  # Read result from disk if check was run previously
571
347
  result_file = files_to_check_dict[ds_map[ds][0]]["result_file_ds"]
572
348
  if ds in processed_datasets and os.path.isfile(result_file):
@@ -637,10 +413,14 @@ def parse_options(opts):
637
413
  is a colon. Adapted from
638
414
  https://github.com/ioos/compliance-checker/blob/cbb40ed1981c169b74c954f0775d5bd23005ed23/cchecker.py#L23
639
415
 
640
- Parameters:
641
- opts: Iterable of strings with options
416
+ Parameters
417
+ ----------
418
+ opts : Iterable of strings
419
+ Iterable of option strings
642
420
 
643
- Returns:
421
+ Returns
422
+ -------
423
+ dict
644
424
  Dictionary with keys as checker type (i.e. "mip").
645
425
  Each value is a dictionary where keys are checker options and values
646
426
  are checker option values or None if not provided.
@@ -649,21 +429,39 @@ def parse_options(opts):
649
429
  for opt_str in opts:
650
430
  try:
651
431
  checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
652
- checker_val = checker_val[0] if checker_val else None
432
+ checker_val = checker_val[0] if checker_val else True
653
433
  except ValueError:
654
434
  raise ValueError(
655
435
  f"Could not split option '{opt_str}', seems illegally formatted. The required format is: '<checker>:<option_name>[:<option_value>]', eg. 'mip:tables:/path/to/Tables'."
656
436
  )
657
- if checker_type != "mip":
658
- raise ValueError(
659
- f"Currently, only options for 'mip' checker are supported, got '{checker_type}'."
660
- )
661
437
  options_dict[checker_type][checker_opt] = checker_val
662
438
  return options_dict
663
439
 
664
440
 
441
+ def _verify_options_dict(options):
442
+ """
443
+ Helper function to verify that the options dictionary is correctly formatted.
444
+ """
445
+ if not isinstance(options, dict):
446
+ return False
447
+ if options == {}:
448
+ return True
449
+ try:
450
+ for checker_type in options.keys():
451
+ for checker_opt in options[checker_type].keys():
452
+ checker_val = options[checker_type][checker_opt]
453
+ if not isinstance(checker_val, (int, float, str, bool, type(None))):
454
+ return False
455
+ except (AttributeError, KeyError):
456
+ return False
457
+ # Seems to match the required format
458
+ return True
459
+
460
+
665
461
  def main():
666
- # CLI
462
+ """
463
+ CLI entry point.
464
+ """
667
465
  parser = argparse.ArgumentParser(description="Run QA checks")
668
466
  parser.add_argument(
669
467
  "parent_dir",
@@ -729,39 +527,51 @@ def main():
729
527
  # Resume information stored in a json file
730
528
  resume_info_file = Path(result_dir, ".resume_info")
731
529
 
530
+ # Do not allow arguments other than -o/--output_dir, -i/--info and -r/--resume if resuming previous QA run
531
+ if resume:
532
+ allowed_with_resume = {"output_dir", "info", "resume"}
533
+ # Convert Namespace to dict for easier checking
534
+ set_args = {k for k, v in vars(args).items() if v not in (None, False, [], "")}
535
+ invalid_args = set_args - allowed_with_resume
536
+ if invalid_args:
537
+ parser.error(
538
+ f"When using -r/--resume, only -o/--output_dir and -i/--info can be set. Invalid: {', '.join(invalid_args)}"
539
+ )
540
+
732
541
  # Deal with result_dir
733
542
  if not os.path.exists(result_dir):
734
543
  if resume:
735
- resume = False
736
- warnings.warn(
737
- "Resume is set but specified output_directory does not exist. Starting a new QA run..."
544
+ raise FileNotFoundError(
545
+ f"Resume is set but specified output_directory does not exist: '{result_dir}'."
738
546
  )
739
547
  os.mkdir(result_dir)
740
548
  elif os.listdir(result_dir) != []:
549
+ required_files = [progress_file, resume_info_file]
550
+ required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
741
551
  if resume:
742
- required_files = [progress_file, resume_info_file]
743
- required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
744
552
  if not all(os.path.isfile(rfile) for rfile in required_files) or not all(
745
553
  os.path.isdir(rpath) for rpath in required_paths
746
554
  ):
747
555
  raise Exception(
748
- "Resume is set but specified output_directory cannot be identified as output_directory of a previous QA run."
556
+ "Resume is set but specified output_directory cannot be identified as output directory of a previous QA run."
749
557
  )
750
558
  else:
751
- if "progress.txt" in os.listdir(
752
- result_dir
753
- ) and ".resume_info" in os.listdir(result_dir):
559
+ if all(os.path.isfile(rfile) for rfile in required_files) and all(
560
+ os.path.isdir(rpath) for rpath in required_paths
561
+ ):
754
562
  raise Exception(
755
- "Specified output_directory is not empty but can be identified as output_directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
563
+ "Specified output directory is not empty but can be identified as output directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
756
564
  )
757
565
  else:
758
- raise Exception("Specified output_directory is not empty.")
566
+ raise Exception("Specified output directory is not empty.")
759
567
  else:
760
568
  if resume:
761
569
  resume = False
762
- warnings.warn(
763
- "Resume is set but specified output_directory is empty. Starting a new QA run..."
570
+ raise FileNotFoundError(
571
+ f"Resume is set but specified output directory is empty: '{result_dir}'."
764
572
  )
573
+
574
+ # When resuming previous QA run
765
575
  if resume:
766
576
  print(f"Resuming previous QA run in '{result_dir}'")
767
577
  with open(os.path.join(result_dir, ".resume_info")) as f:
@@ -770,58 +580,54 @@ def main():
770
580
  required_keys = ["parent_dir", "info", "tests"]
771
581
  if not all(key in resume_info for key in required_keys):
772
582
  raise Exception(
773
- "Invalid .resume_info file. It should contain the keys 'parent_dir', 'info', and 'tests'."
583
+ f"Invalid .resume_info file in '{result_dir}'. It should contain the keys 'parent_dir', 'info', and 'tests'."
774
584
  )
775
585
  if not (
776
586
  isinstance(resume_info["parent_dir"], str)
777
587
  and isinstance(resume_info["info"], str)
778
588
  and isinstance(resume_info["tests"], list)
589
+ and isinstance(resume_info.get("cl_checker_options", {}), dict)
590
+ and isinstance(
591
+ resume_info.get("include_consistency_checks", False), bool
592
+ )
593
+ and _verify_options_dict(resume_info.get("cl_checker_options", {}))
779
594
  and all(isinstance(test, str) for test in resume_info["tests"])
780
595
  ):
781
596
  raise Exception(
782
- "Invalid .resume_info file. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings."
597
+ f"Invalid .resume_info file in '{result_dir}'. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings. "
598
+ "'cl_checker_options' (optional) should be a nested dictionary of format 'checker:option_name:option_value', and "
599
+ "'include_consistency_checks' (optional) should be a boolean."
783
600
  )
784
601
  except json.JSONDecodeError:
785
602
  raise Exception(
786
- "Invalid .resume_info file. It should be a valid JSON file."
603
+ f"Invalid .resume_info file in '{result_dir}'. It needs to be a valid JSON file."
787
604
  )
788
- if tests and sorted(tests) != resume_info["tests"]:
789
- raise Exception("Cannot resume a previous QA run with different tests.")
790
- else:
791
- tests = resume_info["tests"]
605
+ tests = resume_info["tests"]
606
+ parent_dir = resume_info["parent_dir"]
792
607
  if info and info != resume_info["info"]:
793
608
  warnings.warn(
794
609
  f"<info> argument differs from the originally specified <info> argument ('{resume_info['info']}'). Using the new specification."
795
610
  )
796
- if parent_dir is None:
797
- parent_dir = resume_info["parent_dir"]
798
- if parent_dir and Path(parent_dir) != Path(resume_info["parent_dir"]):
799
- raise Exception(
800
- "Cannot resume a previous QA run with different <parent_dir>."
801
- )
802
- if cl_checker_options and cl_checker_options != resume_info.get(
803
- "checker_options", {}
804
- ):
805
- raise Exception(
806
- "Cannot resume a previous QA run with different <option> arguments."
807
- )
808
- else:
809
- parent_dir = Path(resume_info["parent_dir"])
810
- if "include_consistency_checks" in resume_info:
811
- include_consistency_checks = resume_info["include_consistency_checks"]
611
+ cl_checker_options = resume_info.get("checker_options", {})
612
+ include_consistency_checks = resume_info.get(
613
+ "include_consistency_checks", False
614
+ )
812
615
  else:
813
616
  print(f"Storing check results in '{result_dir}'")
814
617
 
815
618
  # Deal with tests
816
619
  if not tests:
817
- checkers = ["cc6", "cf"]
818
- checkers_versions = {"cc6": "latest", "cf": "1.11"}
620
+ checkers = ["cf"]
621
+ checkers_versions = {"cf": "latest"}
819
622
  checker_options = defaultdict(dict)
820
623
  else:
821
- test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
624
+ # Require versions to be specified:
625
+ # test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
626
+ # Allow versions to be omitted:
627
+ test_regex = re.compile(r"^[a-z0-9_]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")
822
628
  if not all([test_regex.match(test) for test in tests]):
823
629
  raise Exception(
824
- f"Invalid test(s) specified. Please specify tests in the format 'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
630
+ f"Invalid test(s) specified. Please specify tests in the format 'checker_name' or'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
825
631
  )
826
632
  checkers = [test.split(":")[0] for test in tests]
827
633
  if sorted(checkers) != sorted(list(set(checkers))):
@@ -849,11 +655,9 @@ def main():
849
655
  if "eerie" in checkers_versions:
850
656
  checkers_versions["mip"] = "latest"
851
657
  del checkers_versions["eerie"]
852
- if "tables" in cl_checker_options["eerie"]:
853
- cl_checker_options["mip"]["tables"] = cl_checker_options["eerie"][
854
- "tables"
855
- ]
856
- elif "tables" not in cl_checker_options["mip"]:
658
+ if "eerie" in cl_checker_options:
659
+ cl_checker_options["mip"] = cl_checker_options.pop("eerie")
660
+ if "tables" not in cl_checker_options["mip"]:
857
661
  cl_checker_options["mip"][
858
662
  "tables"
859
663
  ] = "/work/bm0021/cmor_tables/eerie_cmor_tables/Tables"
@@ -887,7 +691,7 @@ def main():
887
691
  if cl_checker_options:
888
692
  resume_info["checker_options"] = cl_checker_options
889
693
  with open(os.path.join(result_dir, ".resume_info"), "w") as f:
890
- json.dump(resume_info, f)
694
+ json.dump(resume_info, f, sort_keys=True, indent=4)
891
695
 
892
696
  # If only cf checker is selected, run cc6 time checks only
893
697
  if (
@@ -907,8 +711,9 @@ def main():
907
711
 
908
712
  DRS_parent = "CORDEX-CMIP6"
909
713
  for cname in checkers:
910
- print(cname)
911
- DRS_parent_tmp = DRS_path_parent.get(checker_dict.get(cname.split(":")[0], ""), "")
714
+ DRS_parent_tmp = DRS_path_parent.get(
715
+ checker_dict.get(cname.split(":")[0], ""), ""
716
+ )
912
717
  if DRS_parent_tmp:
913
718
  DRS_parent = DRS_parent_tmp
914
719
  break
@@ -1027,14 +832,14 @@ def main():
1027
832
  dataset_files_map[files_to_check_dict[file_path]["id"]] = [file_path]
1028
833
  checker_options[file_path] = {
1029
834
  "mip": {
1030
- **cl_checker_options["mip"],
835
+ **cl_checker_options.get("mip", {}),
1031
836
  "consistency_output": files_to_check_dict[file_path][
1032
837
  "consistency_file"
1033
838
  ],
1034
839
  "time_checks_only": time_checks_only,
1035
840
  },
1036
841
  "cc6": {
1037
- **cl_checker_options["cc6"],
842
+ **cl_checker_options.get("cc6", {}),
1038
843
  "consistency_output": files_to_check_dict[file_path][
1039
844
  "consistency_file"
1040
845
  ],
@@ -1046,15 +851,32 @@ def main():
1046
851
  "time_checks_only": time_checks_only,
1047
852
  },
1048
853
  "cf:": {
1049
- **cl_checker_options["cf"],
854
+ **cl_checker_options.get("cf", {}),
1050
855
  "enable_appendix_a_checks": True,
1051
856
  },
857
+ "wcrp_cmip6": {
858
+ **cl_checker_options.get("wcrp_cmip6", {}),
859
+ "consistency_output": files_to_check_dict[file_path][
860
+ "consistency_file"
861
+ ],
862
+ },
863
+ "wcrp_cordex_cmip6": {
864
+ **cl_checker_options.get("wcrp_cordex_cmip6", {}),
865
+ "consistency_output": files_to_check_dict[file_path][
866
+ "consistency_file"
867
+ ],
868
+ "tables_dir": result_dir + "/tables",
869
+ "force_table_download": file_path == files_to_check[0]
870
+ and (
871
+ not resume or (resume and os.listdir(result_dir + "/tables") == [])
872
+ ),
873
+ },
1052
874
  }
1053
875
  checker_options[file_path].update(
1054
876
  {
1055
877
  k: v
1056
878
  for k, v in cl_checker_options.items()
1057
- if k not in ["cc6", "cf", "mip"]
879
+ if k not in ["cc6", "cf", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]
1058
880
  }
1059
881
  )
1060
882
 
@@ -1090,7 +912,8 @@ def main():
1090
912
  print()
1091
913
 
1092
914
  # Initialize the summary
1093
- summary = QAResultAggregator(checker_dict=checker_dict_ext)
915
+ summary = QAResultAggregator()
916
+ reference_ds_dict = {}
1094
917
 
1095
918
  # Calculate the number of processes
1096
919
  num_processes = max(multiprocessing.cpu_count() - 4, 1)
@@ -1140,8 +963,14 @@ def main():
1140
963
 
1141
964
  # Skip continuity and consistency checks if no cc6/mip checks were run
1142
965
  # (and thus no consistency output file was created)
1143
- if "cc6:latest" in checkers or "mip:latest" in checkers:
1144
-
966
+ if (
967
+ "cc6:latest" in checkers
968
+ or "mip:latest" in checkers
969
+ or "wcrp_cmip6:1.0" in checkers
970
+ or "wcrp_cmip6:latest" in checkers
971
+ or "wcrp_cordex_cmip6:1.0" in checkers
972
+ or "wcrp_cordex_cmip6:latest" in checkers
973
+ ):
1145
974
  #########################################################
1146
975
  # QA Part 2 - Run all consistency & continuity checks
1147
976
  #########################################################
@@ -1252,7 +1081,7 @@ def main():
1252
1081
  "parent_dir": str(parent_dir),
1253
1082
  }
1254
1083
  # Add reference datasets for inter-dataset consistency checks
1255
- if 'cc6:latest' in checkers or 'mip:latest' in checkers:
1084
+ if reference_ds_dict:
1256
1085
  summary_info["inter_ds_con_checks_ref"] = reference_ds_dict
1257
1086
 
1258
1087
  dsid_common_prefix = os.path.commonprefix(list(dataset_files_map.keys()))