esgf-qa 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/run_qa.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import argparse
2
2
  import csv
3
3
  import datetime
4
- import difflib
5
4
  import hashlib
6
5
  import json
7
6
  import multiprocessing
@@ -13,46 +12,22 @@ from pathlib import Path
13
12
 
14
13
  from compliance_checker import __version__ as cc_version
15
14
  from compliance_checker.runner import CheckSuite
16
-
15
+ from packaging import version as pversion
16
+
17
+ from esgf_qa._constants import (
18
+ checker_dict,
19
+ checker_dict_ext,
20
+ checker_release_versions,
21
+ checker_supporting_consistency_checks,
22
+ supported_project_ids,
23
+ )
17
24
  from esgf_qa._version import version
25
+ from esgf_qa.cluster_results import QAResultAggregator
18
26
  from esgf_qa.con_checks import compatibility_checks as comp # noqa
19
27
  from esgf_qa.con_checks import consistency_checks as cons # noqa
20
28
  from esgf_qa.con_checks import continuity_checks as cont # noqa
21
29
  from esgf_qa.con_checks import dataset_coverage_checks, inter_dataset_consistency_checks
22
30
 
23
- checker_dict = {
24
- "cc6": "CORDEX-CMIP6",
25
- "cf": "CF-Conventions",
26
- "mip": "MIP",
27
- "plugin_cmip6": "CMIP6",
28
- # "wcrp-cmip5": "CMIP5",
29
- "wcrp_cmip6": "CMIP6",
30
- # "wcrp_cmip7": "CMIP7-AFT",
31
- # "wcrp_cmip7": "CMIP7",
32
- # "wcrp_cordex": "CORDEX",
33
- "wcrp_cordex_cmip6": "CORDEX-CMIP6",
34
- # "obs4mips": "Obs4MIPs",
35
- # "input4mips": "Input4MIPs",
36
- }
37
- DRS_path_parent = {
38
- "CMIP5": "CMIP5",
39
- "CMIP6": "CMIP6",
40
- "CMIP7": "CMIP7",
41
- "CMIP7-AFT": "CMIP7",
42
- "CORDEX": "CORDEX",
43
- "CORDEX-CMIP6": "CORDEX-CMIP6",
44
- "Obs4MIPs": "Obs4MIPs",
45
- "Input4MIPs": "Input4MIPs",
46
- }
47
- checker_release_versions = {}
48
- checker_dict_ext = {
49
- # "pcons": "ParentConsistency"
50
- "cons": "Consistency",
51
- "cont": "Continuity",
52
- "comp": "Compatibility",
53
- **checker_dict,
54
- }
55
-
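The checker dictionaries removed above now live in esgf_qa._constants and are imported at the top of the module. A minimal sketch of what that module provides, reconstructed from the removed definitions (the values of supported_project_ids and checker_supporting_consistency_checks are assumptions, since they are not shown in this diff):

    # esgf_qa/_constants.py -- illustrative sketch only
    checker_dict = {
        "cc6": "CORDEX-CMIP6",
        "cf": "CF-Conventions",
        "mip": "MIP",
        "plugin_cmip6": "CMIP6",
        "wcrp_cmip6": "CMIP6",
        "wcrp_cordex_cmip6": "CORDEX-CMIP6",
    }

    # Dataset-level checks layered on top of the compliance-checker plugins.
    checker_dict_ext = {
        "cons": "Consistency",
        "cont": "Continuity",
        "comp": "Compatibility",
        **checker_dict,
    }

    # Filled at runtime by get_checker_release_versions().
    checker_release_versions = {}

    # Assumed values: project ids matched case-insensitively in DRS paths, and
    # checkers that write a consistency output file.
    supported_project_ids = ["cmip6", "cordex-cmip6"]
    checker_supporting_consistency_checks = ["cc6", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]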
56
31
  _timestamp_with_ms = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
57
32
  _timestamp_filename = datetime.datetime.strptime(
58
33
  _timestamp_with_ms, "%Y%m%d-%H%M%S%f"
@@ -62,321 +37,15 @@ _timestamp_pprint = datetime.datetime.strptime(
62
37
  ).strftime("%Y-%m-%d %H:%M")
63
38
 
64
39
 
65
- class QAResultAggregator:
66
- def __init__(self, checker_dict):
67
- """
68
- Initialize the aggregator with an empty summary.
69
- """
70
- self.summary = {
71
- "error": defaultdict(
72
- lambda: defaultdict(lambda: defaultdict(list))
73
- ), # No weight, just function -> error msg
74
- "fail": defaultdict(
75
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
76
- ), # weight -> test -> msg -> dsid -> filenames
77
- }
78
- self.checker_dict = checker_dict
79
-
80
- def update(self, result_dict, dsid, file_name):
81
- """
82
- Update the summary with a single result of a cc-run.
83
- """
84
- for checker in result_dict:
85
- for test in result_dict[checker]:
86
- if test == "errors":
87
- for function_name, error_msg in result_dict[checker][
88
- "errors"
89
- ].items():
90
- self.summary["error"][
91
- f"[{checker_dict[checker]}] " + function_name
92
- ][error_msg][dsid].append(file_name)
93
- else:
94
- score, max_score = result_dict[checker][test]["value"]
95
- weight = result_dict[checker][test].get("weight", 3)
96
- msgs = result_dict[checker][test].get("msgs", [])
97
- if score < max_score: # test outcome: fail
98
- for msg in msgs:
99
- self.summary["fail"][weight][
100
- f"[{checker_dict[checker]}] " + test
101
- ][msg][dsid].append(file_name)
102
-
103
- def update_ds(self, result_dict, dsid):
104
- """
105
- Update the summary with a single result of a esgf-qa run.
106
- """
107
- for checker in result_dict:
108
- for test in result_dict[checker]:
109
- if test == "errors":
110
- for function_name, errdict in result_dict[checker][
111
- "errors"
112
- ].items():
113
- for file_name in errdict["files"]:
114
- self.summary["error"][
115
- f"[{checker_dict_ext[checker]}] " + function_name
116
- ][errdict["msg"]][dsid].append(file_name)
117
- else:
118
- weight = result_dict[checker][test].get("weight", 3)
119
- fails = result_dict[checker][test].get("msgs", {})
120
- for msg, file_names in fails.items():
121
- for file_name in file_names:
122
- self.summary["fail"][weight][
123
- f"[{checker_dict_ext[checker]}] " + test
124
- ][msg][dsid].append(file_name)
125
-
126
- def sort(self):
127
- """
128
- Sort the summary.
129
- """
130
- self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
131
- for key in self.summary["fail"]:
132
- self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
133
-
134
- # Sort errors by function name
135
- for checker in self.summary["error"]:
136
- self.summary["error"][checker] = dict(
137
- sorted(self.summary["error"][checker].items())
138
- )
139
-
140
- @staticmethod
141
- def cluster_messages(messages, threshold):
142
- clusters = []
143
- while messages:
144
- base = messages.pop(0)
145
- cluster = [base]
146
- to_remove = []
147
- for msg in messages:
148
- ratio = difflib.SequenceMatcher(None, base, msg).ratio()
149
- if ratio >= threshold:
150
- cluster.append(msg)
151
- to_remove.append(msg)
152
- for msg in to_remove:
153
- messages.remove(msg)
154
- clusters.append(cluster)
155
- return clusters
156
-
157
- @staticmethod
158
- def generalize_message_group(messages):
159
- if len(messages) == 1:
160
- return messages[0], {}
161
-
162
- # Split messages into tokens
163
- split_messages = [re.findall(r"\w+|\W", m) for m in messages]
164
- transposed = list(zip(*split_messages))
165
- template = []
166
- placeholders = {}
167
- var_index = 0
168
-
169
- for i, tokens in enumerate(transposed):
170
- unique_tokens = set(tokens)
171
- if len(unique_tokens) == 1:
172
- template.append(tokens[0])
173
- else:
174
- var_name = chr(ord("A") + var_index)
175
- template.append(f"{{{var_name}}}")
176
- placeholders[var_name] = tokens[0]
177
- var_index += 1
178
-
179
- # Merge placeholders if possible
180
- template, placeholders = QAResultAggregator.merge_placeholders(
181
- template, placeholders
182
- )
183
-
184
- # Return the generalized message and the placeholders
185
- generalized = "".join(template)
186
- return generalized, placeholders
187
-
188
- @staticmethod
189
- def merge_placeholders(list_of_strings, dictionary, skip=0):
190
- def find_next_two_placeholders(list_of_strings, skip):
191
- placeholders = [
192
- s for s in list_of_strings if s.startswith("{") and s.endswith("}")
193
- ]
194
- if len(placeholders) < 2:
195
- return None, None
196
- return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
197
- placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
198
- )
199
-
200
- def extract_text_between_placeholders(
201
- list_of_strings, placeholder1, placeholder2
202
- ):
203
- idx1 = list_of_strings.index(placeholder1)
204
- idx2 = list_of_strings.index(placeholder2)
205
- return "".join(list_of_strings[idx1 + 1 : idx2])
206
-
207
- def merge_two_placeholders(
208
- placeholder1, placeholder2, text_between, dictionary
209
- ):
210
- new_value = (
211
- dictionary[placeholder1.lstrip("{").rstrip("}")]
212
- + text_between
213
- + dictionary[placeholder2.lstrip("{").rstrip("}")]
214
- )
215
- dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
216
- del dictionary[placeholder2.lstrip("{").rstrip("}")]
217
- return dictionary
218
-
219
- def update_placeholder_names(list_of_strings, dictionary):
220
- old_placeholders = sorted(list(dictionary.keys()))
221
- new_placeholders = [
222
- chr(ord("A") + i) for i in range(0, len(old_placeholders))
223
- ]
224
- new_dictionary = dict(
225
- zip(new_placeholders, [dictionary[val] for val in old_placeholders])
226
- )
227
- for old, new in zip(old_placeholders, new_placeholders):
228
- list_of_strings = [
229
- s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
230
- ]
231
- return list_of_strings, new_dictionary
232
-
233
- def replace_placeholders_with_new_one(
234
- list_of_strings, placeholder1, placeholder2
235
- ):
236
- idx1 = list_of_strings.index(placeholder1)
237
- idx2 = list_of_strings.index(placeholder2)
238
- list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
239
- if idx2 < len(list_of_strings) + 1:
240
- list_of_strings_new += list_of_strings[idx2 + 1 :]
241
- return list_of_strings_new
242
-
243
- if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
244
- return list_of_strings, dictionary
245
-
246
- placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
247
- if placeholder1 is None or placeholder2 is None:
248
- return list_of_strings, dictionary
249
-
250
- text_between = extract_text_between_placeholders(
251
- list_of_strings, placeholder1, placeholder2
252
- )
253
- if len(text_between) < 5:
254
- dictionary = merge_two_placeholders(
255
- placeholder1, placeholder2, text_between, dictionary
256
- )
257
- list_of_strings = replace_placeholders_with_new_one(
258
- list_of_strings, placeholder1, placeholder2
259
- )
260
- list_of_strings, dictionary = update_placeholder_names(
261
- list_of_strings, dictionary
262
- )
263
- return QAResultAggregator.merge_placeholders(
264
- list_of_strings, dictionary, skip
265
- )
266
- else:
267
- return QAResultAggregator.merge_placeholders(
268
- list_of_strings, dictionary, skip + 1
269
- )
270
-
271
- def cluster_summary(self, threshold=0.75):
272
- self.clustered_summary = defaultdict(
273
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
274
- )
275
- for status in self.summary:
276
- if status == "error":
277
- for test_id in self.summary[status]:
278
- messages = list(self.summary[status][test_id].keys())
279
- # Pass a copy of messages to cluster_messages to generate clusters
280
- clusters = QAResultAggregator.cluster_messages(
281
- messages[:], threshold
282
- )
283
-
284
- for cluster in clusters:
285
- generalized, placeholders = (
286
- QAResultAggregator.generalize_message_group(cluster)
287
- )
288
- example_parts = ", ".join(
289
- [
290
- (
291
- f"{k}='{v[0]}'"
292
- if isinstance(v, list)
293
- else f"{k}='{v}'"
294
- )
295
- for k, v in placeholders.items()
296
- ]
297
- )
298
- if example_parts:
299
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
300
- else:
301
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
302
-
303
- # Gather all ds_ids and filenames across the cluster
304
- combined = defaultdict(set)
305
- for message in cluster:
306
- for ds_id, files in self.summary[status][test_id][
307
- message
308
- ].items():
309
- combined[ds_id].update(files)
310
-
311
- # Shorten file lists to one example
312
- formatted = {
313
- ds_id
314
- + " ("
315
- + str(len(files))
316
- + f" file{'s' if len(files) > 1 else ''} affected)": (
317
- [f"e.g. '{next(iter(files))}'"]
318
- if len(files) > 1
319
- else [f"'{next(iter(files))}'"]
320
- )
321
- for ds_id, files in combined.items()
322
- }
323
-
324
- self.clustered_summary[status][test_id][msg_summary] = formatted
325
- elif status == "fail":
326
- for weight in self.summary[status]:
327
- for test_id in self.summary[status][weight]:
328
- messages = list(self.summary[status][weight][test_id].keys())
329
- # Pass a copy of messages to cluster_messages to generate clusters
330
- clusters = QAResultAggregator.cluster_messages(
331
- messages[:], threshold
332
- )
333
-
334
- for cluster in clusters:
335
- generalized, placeholders = (
336
- QAResultAggregator.generalize_message_group(cluster)
337
- )
338
- example_parts = ", ".join(
339
- [
340
- (
341
- f"{k}='{v[0]}'"
342
- if isinstance(v, list)
343
- else f"{k}='{v}'"
344
- )
345
- for k, v in placeholders.items()
346
- ]
347
- )
348
- if example_parts:
349
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
350
- else:
351
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
352
-
353
- # Gather all ds_ids and filenames across the cluster
354
- combined = defaultdict(set)
355
- for message in cluster:
356
- for ds_id, files in self.summary[status][weight][
357
- test_id
358
- ][message].items():
359
- combined[ds_id].update(files)
360
-
361
- # Shorten file lists to one example
362
- formatted = {
363
- ds_id
364
- + " ("
365
- + str(len(files))
366
- + f" file{'s' if len(files) > 1 else ''} affected)": (
367
- [f"e.g. '{next(iter(files))}'"]
368
- if len(files) > 1
369
- else [f"'{next(iter(files))}'"]
370
- )
371
- for ds_id, files in combined.items()
372
- }
373
-
374
- self.clustered_summary[status][weight][test_id][
375
- msg_summary
376
- ] = formatted
377
-
378
-
379
40
  def get_default_result_dir():
41
+ """
42
+ Get the default result directory.
43
+
44
+ Returns
45
+ -------
46
+ str
47
+ Default result directory.
48
+ """
380
49
  global _timestamp
381
50
  global _timestamp_with_ms
382
51
  hash_object = hashlib.md5(_timestamp_with_ms.encode())
@@ -385,19 +54,86 @@ def get_default_result_dir():
385
54
  + f"/esgf-qa-results_{_timestamp_filename}_{hash_object.hexdigest()}"
386
55
  )
387
56
 
388
- def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
57
+
58
+ def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_ids):
59
+ """
60
+ Get the dataset id for a file.
61
+
62
+ Parameters
63
+ ----------
64
+ files_to_check_dict : dict
65
+ Dictionary of files to check.
66
+ dataset_files_map_ext : dict
67
+ Dictionary of dataset files.
68
+ file_path : str
69
+ Path to the file.
70
+ project_ids : list of str
71
+ List of supported project ids.
72
+
73
+ Returns
74
+ -------
75
+ str
76
+ Dataset id.
77
+ """
389
78
  dir_id = files_to_check_dict[file_path]["id_dir"].split("/")
390
79
  fn_id = files_to_check_dict[file_path]["id_fn"].split("_")
391
- if project_id in dir_id:
392
- last_index = len(dir_id) - 1 - dir_id[::-1].index(project_id)
393
- dsid = ".".join(dir_id[last_index:])
394
- else:
395
- dsid = ".".join(dir_id)
80
+ dsid = ".".join(dir_id)
81
+ dir_id_lower = [el.lower() for el in dir_id]
82
+ for project_id in project_ids:
83
+ if project_id in dir_id_lower:
84
+ last_index = len(dir_id_lower) - 1 - dir_id_lower[::-1].index(project_id)
85
+ dsid = ".".join(dir_id[last_index:])
86
+ break
396
87
  if len(dataset_files_map_ext[files_to_check_dict[file_path]["id_dir"]].keys()) > 1:
397
88
  dsid += "." + ".".join(fn_id)
398
89
  return dsid
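For illustration, the project-id lookup above derives the dataset id from the directory part of a DRS path roughly as follows (hypothetical path; the contents of supported_project_ids are assumed):

    dir_id = "work/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Amon/tas/gn/v20190710".split("/")
    supported_project_ids = ["cmip6", "cordex-cmip6"]  # assumed contents of the constant

    dsid = ".".join(dir_id)  # fallback: use the full directory id
    dir_id_lower = [el.lower() for el in dir_id]
    for project_id in supported_project_ids:
        if project_id in dir_id_lower:
            # index of the last (right-most) occurrence of the project id
            last_index = len(dir_id_lower) - 1 - dir_id_lower[::-1].index(project_id)
            dsid = ".".join(dir_id[last_index:])
            break

    print(dsid)
    # CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.tas.gn.v20190710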
399
90
 
91
+
92
+ def get_installed_checker_versions():
93
+ """
94
+ Get all available versions of installed cc-plugins.
95
+
96
+ Returns
97
+ -------
98
+ dict
99
+ A dictionary of {checker_name: [version1, version2, latest], ...}.
100
+ """
101
+ check_suite = CheckSuite()
102
+ check_suite.load_all_available_checkers()
103
+ installed_versions = {}
104
+ for checker in check_suite.checkers:
105
+ try:
106
+ name, version = checker.split(":")
107
+ except ValueError:
108
+ name, version = checker, "latest"
109
+ if version == "latest":
110
+ continue
111
+ if name not in installed_versions:
112
+ installed_versions[name] = []
113
+ installed_versions[name].append(version)
114
+ for name, versions in installed_versions.items():
115
+ installed_versions[name] = sorted(versions, key=pversion.parse) + ["latest"]
116
+
117
+ return installed_versions
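The returned mapping sorts the concrete plugin versions with packaging.version and appends 'latest', for example (hypothetical plugin set):

    from packaging import version as pversion

    # versions collected from CheckSuite entries of the form "name:version"
    installed = {"cf": ["1.7", "1.11", "1.6"], "wcrp_cmip6": ["1.0"]}
    for name, versions in installed.items():
        installed[name] = sorted(versions, key=pversion.parse) + ["latest"]

    print(installed)
    # {'cf': ['1.6', '1.7', '1.11', 'latest'], 'wcrp_cmip6': ['1.0', 'latest']}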
118
+
119
+
400
120
  def get_checker_release_versions(checkers, checker_options={}):
121
+ """
122
+ Get the release versions of the checkers.
123
+
124
+ Parameters
125
+ ----------
126
+ checkers : list
127
+ A list of checkers to get the release versions for.
128
+ checker_options : dict, optional
129
+ A dictionary of options for the checkers.
130
+ Example format: {"cf": {"check_dimension_order": True}}
131
+
132
+ Returns
133
+ -------
134
+ None
135
+ Updates the global dictionary ``checker_release_versions``.
136
+ """
401
137
  global checker_release_versions
402
138
  global checker_dict
403
139
  global checker_dict_ext
@@ -413,17 +149,32 @@ def get_checker_release_versions(checkers, checker_options={}):
413
149
  )
414
150
  elif checker.split(":")[0] in checker_dict_ext:
415
151
  checker_release_versions[checker.split(":")[0]] = version
152
+ else:
153
+ checker_release_versions[checker.split(":")[0]] = (
154
+ check_suite.checkers.get(
155
+ checker, "unknown version"
156
+ )._cc_spec_version
157
+ )
416
158
 
417
159
 
418
160
  def run_compliance_checker(file_path, checkers, checker_options={}):
419
161
  """
420
162
  Run the compliance checker on a file with the specified checkers and options.
421
163
 
422
- Parameters:
423
- file_path (str): Path to the file to be checked.
424
- checkers (list): List of checkers to run.
425
- checker_options (dict): Dictionary of options for each checker.
426
- Example format: {"cf": {"check_dimension_order": True}}
164
+ Parameters
165
+ ----------
166
+ file_path : str
167
+ The path to the file to be checked.
168
+ checkers : list
169
+ A list of checkers to run.
170
+ checker_options : dict, optional
171
+ A dictionary of options for the checkers.
172
+ Example format: {"cf": {"check_dimension_order": True}}
173
+
174
+ Returns
175
+ -------
176
+ dict
177
+ A dictionary containing the results of the compliance checker.
427
178
  """
428
179
  check_suite = CheckSuite(options=checker_options)
429
180
  check_suite.load_all_available_checkers()
@@ -453,11 +204,31 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
453
204
  ds, [checker], include_checks=None, skip_checks=[]
454
205
  )
455
206
  )
207
+ if hasattr(ds, "close"):
208
+ ds.close()
456
209
  return results
457
- return check_suite.run_all(ds, checkers, include_checks=None, skip_checks=[])
210
+ results = check_suite.run_all(ds, checkers, include_checks=None, skip_checks=[])
211
+ if hasattr(ds, "close"):
212
+ ds.close()
213
+ return results
458
214
 
459
215
 
460
216
  def track_checked_datasets(checked_datasets_file, checked_datasets):
217
+ """
218
+ Track checked datasets.
219
+
220
+ Parameters
221
+ ----------
222
+ checked_datasets_file : str
223
+ The path to the file to track checked datasets.
224
+ checked_datasets : list
225
+ A list of checked datasets.
226
+
227
+ Returns
228
+ -------
229
+ None
230
+ Writes the checked datasets to the file.
231
+ """
461
232
  with open(checked_datasets_file, "a") as file:
462
233
  writer = csv.writer(file)
463
234
  for dataset_id in checked_datasets:
@@ -472,6 +243,29 @@ def process_file(
472
243
  processed_files,
473
244
  progress_file,
474
245
  ):
246
+ """
247
+ Run compliance-checker checks on a single file.
248
+
249
+ Parameters
250
+ ----------
251
+ file_path : str
252
+ The path to the file to be checked.
253
+ checkers : list
254
+ A list of checkers to run.
255
+ checker_options : dict
256
+ A dictionary of options for the checkers.
257
+ files_to_check_dict : dict
258
+ A special dictionary mapping files to check to datasets.
259
+ processed_files : list
260
+ A list of files that have already been checked.
261
+ progress_file : str
262
+ The path to the progress file.
263
+
264
+ Returns
265
+ -------
266
+ tuple
267
+ A tuple containing the file path and the results of the compliance checker.
268
+ """
475
269
  # Read result from disk if check was run previously
476
270
  result_file = files_to_check_dict[file_path]["result_file"]
477
271
  consistency_file = files_to_check_dict[file_path]["consistency_file"]
@@ -512,14 +306,6 @@ def process_file(
512
306
  checker = checkerv.split(":")[0]
513
307
  check_results[checker] = dict()
514
308
  check_results[checker]["errors"] = {}
515
- # print()
516
- # print("name",result[checker][0][0].name)
517
- # print("weight", result[checker][0][0].weight)
518
- # print("value", result[checker][0][0].value)
519
- # print("msgs", result[checker][0][0].msgs)
520
- # print("method", result[checker][0][0].check_method)
521
- # print("children", result[checker][0][0].children)
522
- # quit()
523
309
  for check in result[checkerv][0]:
524
310
  check_results[checker][check.name] = {}
525
311
  check_results[checker][check.name]["weight"] = check.weight
@@ -567,6 +353,31 @@ def process_dataset(
567
353
  processed_datasets,
568
354
  progress_file,
569
355
  ):
356
+ """
357
+ Run esgf_qa checks on a dataset.
358
+
359
+ Parameters
360
+ ----------
361
+ ds : str
362
+ Dataset to process.
363
+ ds_map : dict
364
+ Dictionary mapping dataset IDs to file paths.
365
+ checkers : list
366
+ List of checkers to run.
367
+ checker_options : dict
368
+ Dictionary of checker options.
369
+ files_to_check_dict : dict
370
+ A special dictionary mapping files to check to datasets.
371
+ processed_datasets : set
372
+ Set of processed datasets.
373
+ progress_file : str
374
+ Path to progress file.
375
+
376
+ Returns
377
+ -------
378
+ tuple
379
+ Dataset ID and check results.
380
+ """
570
381
  # Read result from disk if check was run previously
571
382
  result_file = files_to_check_dict[ds_map[ds][0]]["result_file_ds"]
572
383
  if ds in processed_datasets and os.path.isfile(result_file):
@@ -637,10 +448,14 @@ def parse_options(opts):
637
448
  is a colon. Adapted from
638
449
  https://github.com/ioos/compliance-checker/blob/cbb40ed1981c169b74c954f0775d5bd23005ed23/cchecker.py#L23
639
450
 
640
- Parameters:
641
- opts: Iterable of strings with options
451
+ Parameters
452
+ ----------
453
+ opts : Iterable of strings
454
+ Iterable of option strings
642
455
 
643
- Returns:
456
+ Returns
457
+ -------
458
+ dict
644
459
  Dictionary with keys as checker type (i.e. "mip").
645
460
  Each value is a dictionary where keys are checker options and values
646
461
  are checker option values or None if not provided.
@@ -649,21 +464,39 @@ def parse_options(opts):
649
464
  for opt_str in opts:
650
465
  try:
651
466
  checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
652
- checker_val = checker_val[0] if checker_val else None
467
+ checker_val = checker_val[0] if checker_val else True
653
468
  except ValueError:
654
469
  raise ValueError(
655
470
  f"Could not split option '{opt_str}', seems illegally formatted. The required format is: '<checker>:<option_name>[:<option_value>]', eg. 'mip:tables:/path/to/Tables'."
656
471
  )
657
- if checker_type != "mip":
658
- raise ValueError(
659
- f"Currently, only options for 'mip' checker are supported, got '{checker_type}'."
660
- )
661
472
  options_dict[checker_type][checker_opt] = checker_val
662
473
  return options_dict
663
474
 
664
475
 
476
+ def _verify_options_dict(options):
477
+ """
478
+ Helper function to verify that the options dictionary is correctly formatted.
479
+ """
480
+ if not isinstance(options, dict):
481
+ return False
482
+ if options == {}:
483
+ return True
484
+ try:
485
+ for checker_type in options.keys():
486
+ for checker_opt in options[checker_type].keys():
487
+ checker_val = options[checker_type][checker_opt]
488
+ if not isinstance(checker_val, (int, float, str, bool, type(None))):
489
+ return False
490
+ except (AttributeError, KeyError):
491
+ return False
492
+ # Seems to match the required format
493
+ return True
494
+
495
+
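Taken together, parse_options() and _verify_options_dict() turn '--option' strings into a nested per-checker dictionary; a minimal re-statement of the parsing step above (note that an omitted option value now becomes True instead of None):

    from collections import defaultdict

    def parse_options_sketch(opts):
        options_dict = defaultdict(dict)
        for opt_str in opts:
            # split on the first two colons: <checker>:<option_name>[:<option_value>]
            checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
            options_dict[checker_type][checker_opt] = checker_val[0] if checker_val else True
        return options_dict

    opts = ["mip:tables:/path/to/Tables", "cf:enable_appendix_a_checks"]
    print(dict(parse_options_sketch(opts)))
    # {'mip': {'tables': '/path/to/Tables'}, 'cf': {'enable_appendix_a_checks': True}}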
665
496
  def main():
666
- # CLI
497
+ """
498
+ CLI entry point.
499
+ """
667
500
  parser = argparse.ArgumentParser(description="Run QA checks")
668
501
  parser.add_argument(
669
502
  "parent_dir",
@@ -683,13 +516,17 @@ def main():
683
516
  "--option",
684
517
  default=[],
685
518
  action="append",
686
- help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. Multiple invocations possible.",
519
+ help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. "
520
+ "Multiple invocations possible.",
687
521
  )
688
522
  parser.add_argument(
689
523
  "-t",
690
524
  "--test",
691
525
  action="append",
692
- help="The test to run ('cc6:latest' or 'cf:<version>', can be specified multiple times, eg.: '-t cc6:latest -t cf:1.8') - default: running 'cc6:latest' and 'cf:1.11'.",
526
+ help="The test(s) to run in format '<checker>[:<version>]', (eg. 'wcrp_cmip7', "
527
+ "'wcrp_cmip6:latest' or 'cf:<version>', can be specified multiple times, "
528
+ "eg.: '-t cc6:latest -t cf:1.11') - default: running 'cf:latest'. "
529
+ "The default version selected for each checker is 'latest'.",
693
530
  )
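The version part of a test specification is now optional; the new test_regex further down in this diff accepts, for example:

    import re

    # same pattern as the new test_regex in main()
    test_regex = re.compile(r"^[a-zA-Z0-9_-]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")

    for test in ["wcrp_cmip6", "cf:1.11", "cc6:latest", "cf:", "cf:v1"]:
        print(test, bool(test_regex.match(test)))
    # wcrp_cmip6 True
    # cf:1.11 True
    # cc6:latest True
    # cf: False
    # cf:v1 False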
694
531
  parser.add_argument(
695
532
  "-i",
@@ -709,6 +546,13 @@ def main():
709
546
  action="store_true",
710
547
  help="Include basic consistency and continuity checks. Default: False.",
711
548
  )
549
+ parser.add_argument(
550
+ "-P",
551
+ "--parallel_processes",
552
+ type=int,
553
+ default=0,
554
+ help="Specify the maximum number of parallel processes. Default: 0 (= number of cores).",
555
+ )
712
556
  args = parser.parse_args()
713
557
 
714
558
  result_dir = os.path.abspath(args.output_dir)
@@ -720,6 +564,7 @@ def main():
720
564
  args.include_consistency_checks if args.include_consistency_checks else False
721
565
  )
722
566
  cl_checker_options = parse_options(args.option)
567
+ parallel_processes = args.parallel_processes
723
568
 
724
569
  # Progress file to track already checked files
725
570
  progress_file = Path(result_dir, "progress.txt")
@@ -729,39 +574,51 @@ def main():
729
574
  # Resume information stored in a json file
730
575
  resume_info_file = Path(result_dir, ".resume_info")
731
576
 
577
+ # Only certain arguments are allowed when resuming a previous QA run
578
+ if resume:
579
+ allowed_with_resume = {"output_dir", "info", "resume", "parallel_processes"}
580
+ # Convert Namespace to dict for easier checking
581
+ set_args = {k for k, v in vars(args).items() if v not in (None, False, [], "")}
582
+ invalid_args = set_args - allowed_with_resume
583
+ if invalid_args:
584
+ parser.error(
585
+ f"When using -r/--resume, the following arguments are not allowed: {', '.join(invalid_args)}"
586
+ )
587
+
732
588
  # Deal with result_dir
733
589
  if not os.path.exists(result_dir):
734
590
  if resume:
735
- resume = False
736
- warnings.warn(
737
- "Resume is set but specified output_directory does not exist. Starting a new QA run..."
591
+ raise FileNotFoundError(
592
+ f"Resume is set but specified output_directory does not exist: '{result_dir}'."
738
593
  )
739
594
  os.mkdir(result_dir)
740
595
  elif os.listdir(result_dir) != []:
596
+ required_files = [progress_file, resume_info_file]
597
+ required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
741
598
  if resume:
742
- required_files = [progress_file, resume_info_file]
743
- required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
744
599
  if not all(os.path.isfile(rfile) for rfile in required_files) or not all(
745
600
  os.path.isdir(rpath) for rpath in required_paths
746
601
  ):
747
602
  raise Exception(
748
- "Resume is set but specified output_directory cannot be identified as output_directory of a previous QA run."
603
+ "Resume is set but specified output_directory cannot be identified as output directory of a previous QA run."
749
604
  )
750
605
  else:
751
- if "progress.txt" in os.listdir(
752
- result_dir
753
- ) and ".resume_info" in os.listdir(result_dir):
606
+ if all(os.path.isfile(rfile) for rfile in required_files) and all(
607
+ os.path.isdir(rpath) for rpath in required_paths
608
+ ):
754
609
  raise Exception(
755
- "Specified output_directory is not empty but can be identified as output_directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
610
+ "Specified output directory is not empty but can be identified as output directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
756
611
  )
757
612
  else:
758
- raise Exception("Specified output_directory is not empty.")
613
+ raise Exception("Specified output directory is not empty.")
759
614
  else:
760
615
  if resume:
761
616
  resume = False
762
- warnings.warn(
763
- "Resume is set but specified output_directory is empty. Starting a new QA run..."
617
+ raise FileNotFoundError(
618
+ f"Resume is set but specified output directory is empty: '{result_dir}'."
764
619
  )
620
+
621
+ # When resuming a previous QA run
765
622
  if resume:
766
623
  print(f"Resuming previous QA run in '{result_dir}'")
767
624
  with open(os.path.join(result_dir, ".resume_info")) as f:
@@ -770,58 +627,55 @@ def main():
770
627
  required_keys = ["parent_dir", "info", "tests"]
771
628
  if not all(key in resume_info for key in required_keys):
772
629
  raise Exception(
773
- "Invalid .resume_info file. It should contain the keys 'parent_dir', 'info', and 'tests'."
630
+ f"Invalid .resume_info file in '{result_dir}'. It should contain the keys 'parent_dir', 'info', and 'tests'."
774
631
  )
775
632
  if not (
776
633
  isinstance(resume_info["parent_dir"], str)
777
634
  and isinstance(resume_info["info"], str)
778
635
  and isinstance(resume_info["tests"], list)
636
+ and isinstance(resume_info.get("cl_checker_options", {}), dict)
637
+ and isinstance(
638
+ resume_info.get("include_consistency_checks", False), bool
639
+ )
640
+ and _verify_options_dict(resume_info.get("cl_checker_options", {}))
779
641
  and all(isinstance(test, str) for test in resume_info["tests"])
780
642
  ):
781
643
  raise Exception(
782
- "Invalid .resume_info file. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings."
644
+ f"Invalid .resume_info file in '{result_dir}'. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings. "
645
+ "'cl_checker_options' (optional) should be a nested dictionary of format 'checker:option_name:option_value', and "
646
+ "'include_consistency_checks' (optional) should be a boolean."
783
647
  )
784
648
  except json.JSONDecodeError:
785
649
  raise Exception(
786
- "Invalid .resume_info file. It should be a valid JSON file."
650
+ f"Invalid .resume_info file in '{result_dir}'. It needs to be a valid JSON file."
787
651
  )
788
- if tests and sorted(tests) != resume_info["tests"]:
789
- raise Exception("Cannot resume a previous QA run with different tests.")
790
- else:
791
- tests = resume_info["tests"]
652
+ tests = resume_info["tests"]
653
+ parent_dir = resume_info["parent_dir"]
792
654
  if info and info != resume_info["info"]:
793
655
  warnings.warn(
794
656
  f"<info> argument differs from the originally specified <info> argument ('{resume_info['info']}'). Using the new specification."
795
657
  )
796
- if parent_dir is None:
797
- parent_dir = resume_info["parent_dir"]
798
- if parent_dir and Path(parent_dir) != Path(resume_info["parent_dir"]):
799
- raise Exception(
800
- "Cannot resume a previous QA run with different <parent_dir>."
801
- )
802
- if cl_checker_options and cl_checker_options != resume_info.get(
803
- "checker_options", {}
804
- ):
805
- raise Exception(
806
- "Cannot resume a previous QA run with different <option> arguments."
807
- )
808
- else:
809
- parent_dir = Path(resume_info["parent_dir"])
810
- if "include_consistency_checks" in resume_info:
811
- include_consistency_checks = resume_info["include_consistency_checks"]
658
+ cl_checker_options = resume_info.get("checker_options", {})
659
+ include_consistency_checks = resume_info.get(
660
+ "include_consistency_checks", False
661
+ )
812
662
  else:
813
663
  print(f"Storing check results in '{result_dir}'")
814
664
 
815
665
  # Deal with tests
816
666
  if not tests:
817
- checkers = ["cc6", "cf"]
818
- checkers_versions = {"cc6": "latest", "cf": "1.11"}
667
+ checkers = ["cf"]
668
+ checkers_versions = {"cf": "latest"}
819
669
  checker_options = defaultdict(dict)
820
670
  else:
821
- test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
671
+ # Require versions to be specified:
672
+ # test_regex = re.compile(r"^[a-zA-Z0-9_-]+:(latest|[0-9]+(\.[0-9]+)*)$")
673
+ # Allow versions to be omitted:
674
+ test_regex = re.compile(r"^[a-zA-Z0-9_-]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")
675
+ # Check format of specified checkers and separate checker, version, options
822
676
  if not all([test_regex.match(test) for test in tests]):
823
677
  raise Exception(
824
- f"Invalid test(s) specified. Please specify tests in the format 'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
678
+ "Invalid test(s) specified. Please specify tests in the format 'checker_name' or'checker_name:version'."
825
679
  )
826
680
  checkers = [test.split(":")[0] for test in tests]
827
681
  if sorted(checkers) != sorted(list(set(checkers))):
@@ -835,6 +689,29 @@ def main():
835
689
  for test in tests
836
690
  }
837
691
  checker_options = defaultdict(dict)
692
+ # Check if specified checkers (or their requested versions) exist / are currently installed
693
+ cc_checker_versions = get_installed_checker_versions()
694
+ invalid_checkers = []
695
+ invalid_checkers_versions = []
696
+ invalid_checkers_errmsg = ""
697
+ for checker_i, checker_iv in checkers_versions.items():
698
+ if checker_i not in cc_checker_versions and checker_i != "eerie":
699
+ invalid_checkers.append(checker_i)
700
+ elif checker_i == "eerie":
701
+ pass
702
+ elif checker_iv not in cc_checker_versions[checker_i] and checker_i not in [
703
+ "cc6",
704
+ "mip",
705
+ ]:
706
+ invalid_checkers_versions.append(checker_i)
707
+ if invalid_checkers:
708
+ invalid_checkers_errmsg = f"ERROR: Invalid test(s) specified. The following checkers are not supported or installed: {', '.join(invalid_checkers)}. "
709
+ for checker_i in invalid_checkers_versions:
710
+ if not invalid_checkers_errmsg:
711
+ invalid_checkers_errmsg = "ERROR: Invalid test(s) specified. "
712
+ invalid_checkers_errmsg += f"For checker {checker_i} only the following versions are currently supported / installed: {', '.join(cc_checker_versions[checker_i])}. "
713
+ if invalid_checkers_errmsg:
714
+ raise ValueError(invalid_checkers_errmsg)
838
715
  if "cc6" in checkers_versions and checkers_versions["cc6"] != "latest":
839
716
  checkers_versions["cc6"] = "latest"
840
717
  warnings.warn("Version of checker 'cc6' must be 'latest'. Using 'latest'.")
@@ -849,11 +726,9 @@ def main():
849
726
  if "eerie" in checkers_versions:
850
727
  checkers_versions["mip"] = "latest"
851
728
  del checkers_versions["eerie"]
852
- if "tables" in cl_checker_options["eerie"]:
853
- cl_checker_options["mip"]["tables"] = cl_checker_options["eerie"][
854
- "tables"
855
- ]
856
- elif "tables" not in cl_checker_options["mip"]:
729
+ if "eerie" in cl_checker_options:
730
+ cl_checker_options["mip"] = cl_checker_options.pop("eerie")
731
+ if "tables" not in cl_checker_options["mip"]:
857
732
  cl_checker_options["mip"][
858
733
  "tables"
859
734
  ] = "/work/bm0021/cmor_tables/eerie_cmor_tables/Tables"
@@ -861,10 +736,6 @@ def main():
861
736
  raise Exception(
862
737
  "ERROR: Cannot run both 'cc6' and 'mip' checkers at the same time."
863
738
  )
864
- if any(test not in checker_dict.keys() for test in checkers_versions):
865
- raise Exception(
866
- f"Invalid test(s) specified. Supported are: {', '.join(checker_dict.keys())}"
867
- )
868
739
 
869
740
  # Combine checkers and versions
870
741
  # (checker_options are hardcoded)
@@ -887,7 +758,7 @@ def main():
887
758
  if cl_checker_options:
888
759
  resume_info["checker_options"] = cl_checker_options
889
760
  with open(os.path.join(result_dir, ".resume_info"), "w") as f:
890
- json.dump(resume_info, f)
761
+ json.dump(resume_info, f, sort_keys=True, indent=4)
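With sort_keys and indent the .resume_info file becomes human-readable. Based on the keys written above, its contents look roughly like this (values are placeholders):

    import json

    resume_info = {
        "parent_dir": "/path/to/data",
        "info": "example QA run",
        "tests": ["cf:latest"],
        "checker_options": {"mip": {"tables": "/path/to/Tables"}},  # only present if options were given
        "include_consistency_checks": False,
    }
    print(json.dumps(resume_info, sort_keys=True, indent=4))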
891
762
 
892
763
  # If only cf checker is selected, run cc6 time checks only
893
764
  if (
@@ -905,14 +776,6 @@ def main():
905
776
  progress_file.touch()
906
777
  dataset_file.touch()
907
778
 
908
- DRS_parent = "CORDEX-CMIP6"
909
- for cname in checkers:
910
- print(cname)
911
- DRS_parent_tmp = DRS_path_parent.get(checker_dict.get(cname.split(":")[0], ""), "")
912
- if DRS_parent_tmp:
913
- DRS_parent = DRS_parent_tmp
914
- break
915
-
916
779
  # Check if progress files exist and read already processed files/datasets
917
780
  processed_files = set()
918
781
  with open(progress_file) as file:
@@ -1011,7 +874,7 @@ def main():
1011
874
  files_to_check = sorted(files_to_check)
1012
875
  for file_path in files_to_check:
1013
876
  files_to_check_dict[file_path]["id"] = get_dsid(
1014
- files_to_check_dict, dataset_files_map_ext, file_path, DRS_parent
877
+ files_to_check_dict, dataset_files_map_ext, file_path, supported_project_ids
1015
878
  )
1016
879
  files_to_check_dict[file_path]["result_file_ds"] = (
1017
880
  result_dir
@@ -1027,14 +890,14 @@ def main():
1027
890
  dataset_files_map[files_to_check_dict[file_path]["id"]] = [file_path]
1028
891
  checker_options[file_path] = {
1029
892
  "mip": {
1030
- **cl_checker_options["mip"],
893
+ **cl_checker_options.get("mip", {}),
1031
894
  "consistency_output": files_to_check_dict[file_path][
1032
895
  "consistency_file"
1033
896
  ],
1034
897
  "time_checks_only": time_checks_only,
1035
898
  },
1036
899
  "cc6": {
1037
- **cl_checker_options["cc6"],
900
+ **cl_checker_options.get("cc6", {}),
1038
901
  "consistency_output": files_to_check_dict[file_path][
1039
902
  "consistency_file"
1040
903
  ],
@@ -1046,15 +909,32 @@ def main():
1046
909
  "time_checks_only": time_checks_only,
1047
910
  },
1048
911
  "cf:": {
1049
- **cl_checker_options["cf"],
912
+ **cl_checker_options.get("cf", {}),
1050
913
  "enable_appendix_a_checks": True,
1051
914
  },
915
+ "wcrp_cmip6": {
916
+ **cl_checker_options.get("wcrp_cmip6", {}),
917
+ "consistency_output": files_to_check_dict[file_path][
918
+ "consistency_file"
919
+ ],
920
+ },
921
+ "wcrp_cordex_cmip6": {
922
+ **cl_checker_options.get("wcrp_cordex_cmip6", {}),
923
+ "consistency_output": files_to_check_dict[file_path][
924
+ "consistency_file"
925
+ ],
926
+ "tables_dir": result_dir + "/tables",
927
+ "force_table_download": file_path == files_to_check[0]
928
+ and (
929
+ not resume or (resume and os.listdir(result_dir + "/tables") == [])
930
+ ),
931
+ },
1052
932
  }
1053
933
  checker_options[file_path].update(
1054
934
  {
1055
935
  k: v
1056
936
  for k, v in cl_checker_options.items()
1057
- if k not in ["cc6", "cf", "mip"]
937
+ if k not in ["cc6", "cf", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]
1058
938
  }
1059
939
  )
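For a single file, the per-checker options assembled above end up looking roughly like the excerpt below (illustrative paths; only the new WCRP entries are shown):

    example_options_for_one_file = {
        "wcrp_cmip6": {
            "consistency_output": "/path/to/results/example.consistency",
        },
        "wcrp_cordex_cmip6": {
            "consistency_output": "/path/to/results/example.consistency",
            "tables_dir": "/path/to/results/tables",
            # tables are only force-downloaded for the first file of a fresh run
            "force_table_download": True,
        },
    }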
1060
940
 
@@ -1062,22 +942,27 @@ def main():
1062
942
  raise Exception("No files found to check.")
1063
943
  else:
1064
944
  print(
1065
- f"Found {len(files_to_check)} files (organized in {len(dataset_files_map)} datasets) to check."
945
+ f"\nFound {len(files_to_check)} files (organized in {len(dataset_files_map)} datasets) to check."
1066
946
  )
1067
947
 
1068
- print()
1069
- print("Files to check:")
1070
- print(json.dumps(files_to_check, indent=4))
1071
- print()
1072
- print("Dataset - Files mapping (extended):")
1073
- print(json.dumps(dataset_files_map_ext, indent=4))
1074
- print()
1075
- print("Dataset - Files mapping:")
1076
- print(json.dumps(dataset_files_map, indent=4))
1077
- print()
1078
- print("Files to check dict:")
1079
- print(json.dumps(files_to_check_dict, indent=4))
1080
- print()
948
+ # Save dictionaries to disk for information
949
+ with open(os.path.join(result_dir, "files_to_check.json"), "w") as f:
950
+ json.dump(files_to_check, f, indent=4)
951
+ with open(os.path.join(result_dir, "files_to_check_dict.json"), "w") as f:
952
+ json.dump(files_to_check_dict, f, indent=4)
953
+ with open(os.path.join(result_dir, "dataset_files_map.json"), "w") as f:
954
+ json.dump(dataset_files_map, f, indent=4)
955
+ with open(os.path.join(result_dir, "dataset_files_map_ext.json"), "w") as f:
956
+ json.dump(dataset_files_map_ext, f, indent=4)
957
+ print(
958
+ "Information on which files have been found and how these are organized into datasets was saved to disk:"
959
+ )
960
+ print(
961
+ f" - {os.path.join(result_dir, 'files_to_check.json')}\n"
962
+ f" - {os.path.join(result_dir, 'files_to_check_dict.json')}\n"
963
+ f" - {os.path.join(result_dir, 'dataset_files_map.json')}\n"
964
+ f" - {os.path.join(result_dir, 'dataset_files_map_ext.json')}"
965
+ )
1081
966
 
1082
967
  #########################################################
1083
968
  # QA Part 1 - Run all compliance-checker checks
@@ -1090,10 +975,13 @@ def main():
1090
975
  print()
1091
976
 
1092
977
  # Initialize the summary
1093
- summary = QAResultAggregator(checker_dict=checker_dict_ext)
978
+ summary = QAResultAggregator()
979
+ reference_ds_dict = {}
1094
980
 
1095
981
  # Calculate the number of processes
1096
982
  num_processes = max(multiprocessing.cpu_count() - 4, 1)
983
+ if parallel_processes > 0:
984
+ num_processes = min(num_processes, parallel_processes)
1097
985
  print(f"Using {num_processes} parallel processes for cc checks.")
1098
986
  print()
1099
987
 
@@ -1140,8 +1028,9 @@ def main():
1140
1028
 
1141
1029
  # Skip continuity and consistency checks if no cc6/mip checks were run
1142
1030
  # (and thus no consistency output file was created)
1143
- if "cc6:latest" in checkers or "mip:latest" in checkers:
1144
-
1031
+ if any(
1032
+ ch.split(":", 1)[0] in checker_supporting_consistency_checks for ch in checkers
1033
+ ):
1145
1034
  #########################################################
1146
1035
  # QA Part 2 - Run all consistency & continuity checks
1147
1036
  #########################################################
@@ -1167,6 +1056,8 @@ def main():
1167
1056
  # Limit the number of processes for consistency checks since a lot
1168
1057
  # of files will be opened at the same time
1169
1058
  num_processes = min(num_processes, 10)
1059
+ if parallel_processes > 0:
1060
+ num_processes = min(num_processes, parallel_processes)
1170
1061
  print(f"Using {num_processes} parallel processes for dataset checks.")
1171
1062
  print()
1172
1063
 
@@ -1217,7 +1108,9 @@ def main():
1217
1108
  else:
1218
1109
  print()
1219
1110
  warnings.warn(
1220
- "Continuity & Consistency checks skipped since no cc6 checks were run."
1111
+ "Continuity & consistency checks skipped since no appropriate checkers were run."
1112
+ " The following checkers support the continuity & consistency checks: "
1113
+ f"{', '.join(checker_supporting_consistency_checks)}"
1221
1114
  )
1222
1115
 
1223
1116
  #########################################################
@@ -1245,14 +1138,14 @@ def main():
1245
1138
  "cc_version": cc_version,
1246
1139
  "checkers": ", ".join(
1247
1140
  [
1248
- f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}"
1141
+ f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}".strip()
1249
1142
  for checker in checkers
1250
1143
  ]
1251
1144
  ),
1252
1145
  "parent_dir": str(parent_dir),
1253
1146
  }
1254
1147
  # Add reference datasets for inter-dataset consistency checks
1255
- if 'cc6:latest' in checkers or 'mip:latest' in checkers:
1148
+ if reference_ds_dict:
1256
1149
  summary_info["inter_ds_con_checks_ref"] = reference_ds_dict
1257
1150
 
1258
1151
  dsid_common_prefix = os.path.commonprefix(list(dataset_files_map.keys()))
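
Putting the new CLI options together, a 0.5.0 run might be invoked as sketched below (this assumes a console script named esgf-qa and omits positional arguments not shown in this diff; checker choices and paths are illustrative):

    # fresh QA run with two checkers, extra checker options and 4 parallel processes
    esgf-qa /path/to/data_root \
        -t wcrp_cmip6 -t cf:1.11 \
        --option mip:tables:/path/to/Tables \
        -P 4 \
        -o /path/to/qa_results

    # resume that run later; only output_dir, info, resume and parallel_processes
    # are accepted together with -r (see the allowed_with_resume check above)
    esgf-qa -r -o /path/to/qa_results -P 4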