esgf-qa 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgf_qa/_constants.py +63 -1
- esgf_qa/_version.py +2 -2
- esgf_qa/cluster_results.py +467 -0
- esgf_qa/con_checks.py +209 -11
- esgf_qa/run_qa.py +356 -463
- {esgf_qa-0.3.0.dist-info → esgf_qa-0.5.0.dist-info}/METADATA +47 -31
- esgf_qa-0.5.0.dist-info/RECORD +19 -0
- {esgf_qa-0.3.0.dist-info → esgf_qa-0.5.0.dist-info}/WHEEL +1 -1
- {esgf_qa-0.3.0.dist-info → esgf_qa-0.5.0.dist-info}/top_level.txt +1 -1
- tests/test_cli.py +271 -0
- tests/test_cluster_results.py +166 -0
- tests/test_con_checks.py +263 -0
- tests/test_qaviewer.py +147 -0
- tests/test_run_dummy_qa.py +191 -0
- tests/test_run_qa.py +181 -0
- docs/esgf-qa_Logo.png +0 -0
- esgf_qa-0.3.0.dist-info/RECORD +0 -13
- {esgf_qa-0.3.0.dist-info → esgf_qa-0.5.0.dist-info}/entry_points.txt +0 -0
- {esgf_qa-0.3.0.dist-info → esgf_qa-0.5.0.dist-info}/licenses/LICENSE +0 -0
esgf_qa/run_qa.py
CHANGED
@@ -1,7 +1,6 @@
 import argparse
 import csv
 import datetime
-import difflib
 import hashlib
 import json
 import multiprocessing
@@ -13,46 +12,22 @@ from pathlib import Path

 from compliance_checker import __version__ as cc_version
 from compliance_checker.runner import CheckSuite
-
+from packaging import version as pversion
+
+from esgf_qa._constants import (
+    checker_dict,
+    checker_dict_ext,
+    checker_release_versions,
+    checker_supporting_consistency_checks,
+    supported_project_ids,
+)
 from esgf_qa._version import version
+from esgf_qa.cluster_results import QAResultAggregator
 from esgf_qa.con_checks import compatibility_checks as comp  # noqa
 from esgf_qa.con_checks import consistency_checks as cons  # noqa
 from esgf_qa.con_checks import continuity_checks as cont  # noqa
 from esgf_qa.con_checks import dataset_coverage_checks, inter_dataset_consistency_checks

-checker_dict = {
-    "cc6": "CORDEX-CMIP6",
-    "cf": "CF-Conventions",
-    "mip": "MIP",
-    "plugin_cmip6": "CMIP6",
-    # "wcrp-cmip5": "CMIP5",
-    "wcrp_cmip6": "CMIP6",
-    # "wcrp_cmip7": "CMIP7-AFT",
-    # "wcrp_cmip7": "CMIP7",
-    # "wcrp_cordex": "CORDEX",
-    "wcrp_cordex_cmip6": "CORDEX-CMIP6",
-    # "obs4mips": "Obs4MIPs",
-    # "input4mips": "Input4MIPs",
-}
-DRS_path_parent = {
-    "CMIP5": "CMIP5",
-    "CMIP6": "CMIP6",
-    "CMIP7": "CMIP7",
-    "CMIP7-AFT": "CMIP7",
-    "CORDEX": "CORDEX",
-    "CORDEX-CMIP6": "CORDEX-CMIP6",
-    "Obs4MIPs": "Obs4MIPs",
-    "Input4MIPs": "Input4MIPs",
-}
-checker_release_versions = {}
-checker_dict_ext = {
-    # "pcons": "ParentConsistency"
-    "cons": "Consistency",
-    "cont": "Continuity",
-    "comp": "Compatibility",
-    **checker_dict,
-}
-
 _timestamp_with_ms = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
 _timestamp_filename = datetime.datetime.strptime(
     _timestamp_with_ms, "%Y%m%d-%H%M%S%f"
@@ -62,321 +37,15 @@ _timestamp_pprint = datetime.datetime.strptime(
 ).strftime("%Y-%m-%d %H:%M")


-class QAResultAggregator:
-    def __init__(self, checker_dict):
-        """
-        Initialize the aggregator with an empty summary.
-        """
-        self.summary = {
-            "error": defaultdict(
-                lambda: defaultdict(lambda: defaultdict(list))
-            ),  # No weight, just function -> error msg
-            "fail": defaultdict(
-                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
-            ),  # weight -> test -> msg -> dsid -> filenames
-        }
-        self.checker_dict = checker_dict
-
-    def update(self, result_dict, dsid, file_name):
-        """
-        Update the summary with a single result of a cc-run.
-        """
-        for checker in result_dict:
-            for test in result_dict[checker]:
-                if test == "errors":
-                    for function_name, error_msg in result_dict[checker][
-                        "errors"
-                    ].items():
-                        self.summary["error"][
-                            f"[{checker_dict[checker]}] " + function_name
-                        ][error_msg][dsid].append(file_name)
-                else:
-                    score, max_score = result_dict[checker][test]["value"]
-                    weight = result_dict[checker][test].get("weight", 3)
-                    msgs = result_dict[checker][test].get("msgs", [])
-                    if score < max_score:  # test outcome: fail
-                        for msg in msgs:
-                            self.summary["fail"][weight][
-                                f"[{checker_dict[checker]}] " + test
-                            ][msg][dsid].append(file_name)
-
-    def update_ds(self, result_dict, dsid):
-        """
-        Update the summary with a single result of a esgf-qa run.
-        """
-        for checker in result_dict:
-            for test in result_dict[checker]:
-                if test == "errors":
-                    for function_name, errdict in result_dict[checker][
-                        "errors"
-                    ].items():
-                        for file_name in errdict["files"]:
-                            self.summary["error"][
-                                f"[{checker_dict_ext[checker]}] " + function_name
-                            ][errdict["msg"]][dsid].append(file_name)
-                else:
-                    weight = result_dict[checker][test].get("weight", 3)
-                    fails = result_dict[checker][test].get("msgs", {})
-                    for msg, file_names in fails.items():
-                        for file_name in file_names:
-                            self.summary["fail"][weight][
-                                f"[{checker_dict_ext[checker]}] " + test
-                            ][msg][dsid].append(file_name)
-
-    def sort(self):
-        """
-        Sort the summary.
-        """
-        self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
-        for key in self.summary["fail"]:
-            self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
-
-        # Sort errors by function name
-        for checker in self.summary["error"]:
-            self.summary["error"][checker] = dict(
-                sorted(self.summary["error"][checker].items())
-            )
-
-    @staticmethod
-    def cluster_messages(messages, threshold):
-        clusters = []
-        while messages:
-            base = messages.pop(0)
-            cluster = [base]
-            to_remove = []
-            for msg in messages:
-                ratio = difflib.SequenceMatcher(None, base, msg).ratio()
-                if ratio >= threshold:
-                    cluster.append(msg)
-                    to_remove.append(msg)
-            for msg in to_remove:
-                messages.remove(msg)
-            clusters.append(cluster)
-        return clusters
-
-    @staticmethod
-    def generalize_message_group(messages):
-        if len(messages) == 1:
-            return messages[0], {}
-
-        # Split messages into tokens
-        split_messages = [re.findall(r"\w+|\W", m) for m in messages]
-        transposed = list(zip(*split_messages))
-        template = []
-        placeholders = {}
-        var_index = 0
-
-        for i, tokens in enumerate(transposed):
-            unique_tokens = set(tokens)
-            if len(unique_tokens) == 1:
-                template.append(tokens[0])
-            else:
-                var_name = chr(ord("A") + var_index)
-                template.append(f"{{{var_name}}}")
-                placeholders[var_name] = tokens[0]
-                var_index += 1
-
-        # Merge placeholders if possible
-        template, placeholders = QAResultAggregator.merge_placeholders(
-            template, placeholders
-        )
-
-        # Return the generalized message and the placeholders
-        generalized = "".join(template)
-        return generalized, placeholders
-
-    @staticmethod
-    def merge_placeholders(list_of_strings, dictionary, skip=0):
-        def find_next_two_placeholders(list_of_strings, skip):
-            placeholders = [
-                s for s in list_of_strings if s.startswith("{") and s.endswith("}")
-            ]
-            if len(placeholders) < 2:
-                return None, None
-            return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
-                placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
-            )
-
-        def extract_text_between_placeholders(
-            list_of_strings, placeholder1, placeholder2
-        ):
-            idx1 = list_of_strings.index(placeholder1)
-            idx2 = list_of_strings.index(placeholder2)
-            return "".join(list_of_strings[idx1 + 1 : idx2])
-
-        def merge_two_placeholders(
-            placeholder1, placeholder2, text_between, dictionary
-        ):
-            new_value = (
-                dictionary[placeholder1.lstrip("{").rstrip("}")]
-                + text_between
-                + dictionary[placeholder2.lstrip("{").rstrip("}")]
-            )
-            dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
-            del dictionary[placeholder2.lstrip("{").rstrip("}")]
-            return dictionary
-
-        def update_placeholder_names(list_of_strings, dictionary):
-            old_placeholders = sorted(list(dictionary.keys()))
-            new_placeholders = [
-                chr(ord("A") + i) for i in range(0, len(old_placeholders))
-            ]
-            new_dictionary = dict(
-                zip(new_placeholders, [dictionary[val] for val in old_placeholders])
-            )
-            for old, new in zip(old_placeholders, new_placeholders):
-                list_of_strings = [
-                    s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
-                ]
-            return list_of_strings, new_dictionary
-
-        def replace_placeholders_with_new_one(
-            list_of_strings, placeholder1, placeholder2
-        ):
-            idx1 = list_of_strings.index(placeholder1)
-            idx2 = list_of_strings.index(placeholder2)
-            list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
-            if idx2 < len(list_of_strings) + 1:
-                list_of_strings_new += list_of_strings[idx2 + 1 :]
-            return list_of_strings_new
-
-        if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
-            return list_of_strings, dictionary
-
-        placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
-        if placeholder1 is None or placeholder2 is None:
-            return list_of_strings, dictionary
-
-        text_between = extract_text_between_placeholders(
-            list_of_strings, placeholder1, placeholder2
-        )
-        if len(text_between) < 5:
-            dictionary = merge_two_placeholders(
-                placeholder1, placeholder2, text_between, dictionary
-            )
-            list_of_strings = replace_placeholders_with_new_one(
-                list_of_strings, placeholder1, placeholder2
-            )
-            list_of_strings, dictionary = update_placeholder_names(
-                list_of_strings, dictionary
-            )
-            return QAResultAggregator.merge_placeholders(
-                list_of_strings, dictionary, skip
-            )
-        else:
-            return QAResultAggregator.merge_placeholders(
-                list_of_strings, dictionary, skip + 1
-            )
-
-    def cluster_summary(self, threshold=0.75):
-        self.clustered_summary = defaultdict(
-            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
-        )
-        for status in self.summary:
-            if status == "error":
-                for test_id in self.summary[status]:
-                    messages = list(self.summary[status][test_id].keys())
-                    # Pass a copy of messages to cluster_messages to generate clusters
-                    clusters = QAResultAggregator.cluster_messages(
-                        messages[:], threshold
-                    )
-
-                    for cluster in clusters:
-                        generalized, placeholders = (
-                            QAResultAggregator.generalize_message_group(cluster)
-                        )
-                        example_parts = ", ".join(
-                            [
-                                (
-                                    f"{k}='{v[0]}'"
-                                    if isinstance(v, list)
-                                    else f"{k}='{v}'"
-                                )
-                                for k, v in placeholders.items()
-                            ]
-                        )
-                        if example_parts:
-                            msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
-                        else:
-                            msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
-
-                        # Gather all ds_ids and filenames across the cluster
-                        combined = defaultdict(set)
-                        for message in cluster:
-                            for ds_id, files in self.summary[status][test_id][
-                                message
-                            ].items():
-                                combined[ds_id].update(files)
-
-                        # Shorten file lists to one example
-                        formatted = {
-                            ds_id
-                            + " ("
-                            + str(len(files))
-                            + f" file{'s' if len(files) > 1 else ''} affected)": (
-                                [f"e.g. '{next(iter(files))}'"]
-                                if len(files) > 1
-                                else [f"'{next(iter(files))}'"]
-                            )
-                            for ds_id, files in combined.items()
-                        }
-
-                        self.clustered_summary[status][test_id][msg_summary] = formatted
-            elif status == "fail":
-                for weight in self.summary[status]:
-                    for test_id in self.summary[status][weight]:
-                        messages = list(self.summary[status][weight][test_id].keys())
-                        # Pass a copy of messages to cluster_messages to generate clusters
-                        clusters = QAResultAggregator.cluster_messages(
-                            messages[:], threshold
-                        )
-
-                        for cluster in clusters:
-                            generalized, placeholders = (
-                                QAResultAggregator.generalize_message_group(cluster)
-                            )
-                            example_parts = ", ".join(
-                                [
-                                    (
-                                        f"{k}='{v[0]}'"
-                                        if isinstance(v, list)
-                                        else f"{k}='{v}'"
-                                    )
-                                    for k, v in placeholders.items()
-                                ]
-                            )
-                            if example_parts:
-                                msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
-                            else:
-                                msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
-
-                            # Gather all ds_ids and filenames across the cluster
-                            combined = defaultdict(set)
-                            for message in cluster:
-                                for ds_id, files in self.summary[status][weight][
-                                    test_id
-                                ][message].items():
-                                    combined[ds_id].update(files)
-
-                            # Shorten file lists to one example
-                            formatted = {
-                                ds_id
-                                + " ("
-                                + str(len(files))
-                                + f" file{'s' if len(files) > 1 else ''} affected)": (
-                                    [f"e.g. '{next(iter(files))}'"]
-                                    if len(files) > 1
-                                    else [f"'{next(iter(files))}'"]
-                                )
-                                for ds_id, files in combined.items()
-                            }
-
-                            self.clustered_summary[status][weight][test_id][
-                                msg_summary
-                            ] = formatted
-
-
 def get_default_result_dir():
+    """
+    Get the default result directory.
+
+    Returns
+    -------
+    str
+        Default result directory.
+    """
     global _timestamp
     global _timestamp_with_ms
     hash_object = hashlib.md5(_timestamp_with_ms.encode())
@@ -385,19 +54,86 @@ def get_default_result_dir():
         + f"/esgf-qa-results_{_timestamp_filename}_{hash_object.hexdigest()}"
     )

-
+
+def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_ids):
+    """
+    Get the dataset id for a file.
+
+    Parameters
+    ----------
+    files_to_check_dict : dict
+        Dictionary of files to check.
+    dataset_files_map_ext : dict
+        Dictionary of dataset files.
+    file_path : str
+        Path to the file.
+    project_ids: list of str
+        List of supported project_ids
+
+    Returns
+    -------
+    str
+        Dataset id.
+    """
     dir_id = files_to_check_dict[file_path]["id_dir"].split("/")
     fn_id = files_to_check_dict[file_path]["id_fn"].split("_")
-
-
-
-
-
+    dsid = ".".join(dir_id)
+    dir_id_lower = [el.lower() for el in dir_id]
+    for project_id in project_ids:
+        if project_id in dir_id_lower:
+            last_index = len(dir_id_lower) - 1 - dir_id_lower[::-1].index(project_id)
+            dsid = ".".join(dir_id[last_index:])
+            break
     if len(dataset_files_map_ext[files_to_check_dict[file_path]["id_dir"]].keys()) > 1:
         dsid += "." + ".".join(fn_id)
     return dsid

+
+def get_installed_checker_versions():
+    """
+    Get all available versions of installed cc-plugins.
+
+    Returns
+    -------
+    dict
+        A dictionary of {checker_name: [version1, version2, latest], ...}.
+    """
+    check_suite = CheckSuite()
+    check_suite.load_all_available_checkers()
+    installed_versions = {}
+    for checker in check_suite.checkers:
+        try:
+            name, version = checker.split(":")
+        except ValueError:
+            name, version = checker, "latest"
+        if version == "latest":
+            continue
+        if name not in installed_versions:
+            installed_versions[name] = []
+        installed_versions[name].append(version)
+    for name, versions in installed_versions.items():
+        installed_versions[name] = sorted(versions, key=pversion.parse) + ["latest"]
+
+    return installed_versions
+
+
 def get_checker_release_versions(checkers, checker_options={}):
+    """
+    Get the release versions of the checkers.
+
+    Parameters
+    ----------
+    checkers : list
+        A list of checkers to get the release versions for.
+    checker_options : dict, optional
+        A dictionary of options for the checkers.
+        Example format: {"cf": {"check_dimension_order": True}}
+
+    Returns
+    -------
+    None
+        Updates the global dictionary ``checker_release_versions``.
+    """
     global checker_release_versions
     global checker_dict
     global checker_dict_ext
@@ -413,17 +149,32 @@ def get_checker_release_versions(checkers, checker_options={}):
             )
         elif checker.split(":")[0] in checker_dict_ext:
             checker_release_versions[checker.split(":")[0]] = version
+        else:
+            checker_release_versions[checker.split(":")[0]] = (
+                check_suite.checkers.get(
+                    checker, "unknown version"
+                )._cc_spec_version
+            )


 def run_compliance_checker(file_path, checkers, checker_options={}):
     """
     Run the compliance checker on a file with the specified checkers and options.

-    Parameters
-
-
-
-
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to be checked.
+    checkers : list
+        A list of checkers to run.
+    checker_options : dict, optional
+        A dictionary of options for the checkers.
+        Example format: {"cf": {"check_dimension_order": True}}
+
+    Returns
+    -------
+    dict
+        A dictionary containing the results of the compliance checker.
     """
     check_suite = CheckSuite(options=checker_options)
     check_suite.load_all_available_checkers()
@@ -453,11 +204,31 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
                     ds, [checker], include_checks=None, skip_checks=[]
                 )
             )
+        if hasattr(ds, "close"):
+            ds.close()
         return results
-
+    results = check_suite.run_all(ds, checkers, include_checks=None, skip_checks=[])
+    if hasattr(ds, "close"):
+        ds.close()
+    return results


 def track_checked_datasets(checked_datasets_file, checked_datasets):
+    """
+    Track checked datasets.
+
+    Parameters
+    ----------
+    checked_datasets_file : str
+        The path to the file to track checked datasets.
+    checked_datasets : list
+        A list of checked datasets.
+
+    Returns
+    -------
+    None
+        Writes the checked datasets to the file.
+    """
     with open(checked_datasets_file, "a") as file:
         writer = csv.writer(file)
         for dataset_id in checked_datasets:
@@ -472,6 +243,29 @@ def process_file(
     processed_files,
     progress_file,
 ):
+    """
+    Runs cc checks for a single file.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to be checked.
+    checkers : list
+        A list of checkers to run.
+    checker_options : dict
+        A dictionary of options for the checkers.
+    files_to_check_dict : dict
+        A special dictionary mapping files to check to datasets.
+    processed_files : list
+        A list of files that have already been checked.
+    progress_file : str
+        The path to the progress file.
+
+    Returns
+    -------
+    tuple
+        A tuple containing the file path and the results of the compliance checker.
+    """
     # Read result from disk if check was run previously
     result_file = files_to_check_dict[file_path]["result_file"]
     consistency_file = files_to_check_dict[file_path]["consistency_file"]
@@ -512,14 +306,6 @@ def process_file(
         checker = checkerv.split(":")[0]
         check_results[checker] = dict()
         check_results[checker]["errors"] = {}
-        # print()
-        # print("name",result[checker][0][0].name)
-        # print("weight", result[checker][0][0].weight)
-        # print("value", result[checker][0][0].value)
-        # print("msgs", result[checker][0][0].msgs)
-        # print("method", result[checker][0][0].check_method)
-        # print("children", result[checker][0][0].children)
-        # quit()
         for check in result[checkerv][0]:
             check_results[checker][check.name] = {}
             check_results[checker][check.name]["weight"] = check.weight
@@ -567,6 +353,31 @@ def process_dataset(
     processed_datasets,
     progress_file,
 ):
+    """
+    Runs esgf_qa checks on a dataset.
+
+    Parameters
+    ----------
+    ds : str
+        Dataset to process.
+    ds_map : dict
+        Dictionary mapping dataset IDs to file paths.
+    checkers : list
+        List of checkers to run.
+    checker_options : dict
+        Dictionary of checker options.
+    files_to_check_dict : dict
+        A special dictionary mapping files to check to datasets.
+    processed_datasets : set
+        Set of processed datasets.
+    progress_file : str
+        Path to progress file.
+
+    Returns
+    -------
+    tuple
+        Dataset ID and check results.
+    """
     # Read result from disk if check was run previously
     result_file = files_to_check_dict[ds_map[ds][0]]["result_file_ds"]
     if ds in processed_datasets and os.path.isfile(result_file):
@@ -637,10 +448,14 @@ def parse_options(opts):
     is a colon. Adapted from
     https://github.com/ioos/compliance-checker/blob/cbb40ed1981c169b74c954f0775d5bd23005ed23/cchecker.py#L23

-    Parameters
-
+    Parameters
+    ----------
+    opts : Iterable of strings
+        Iterable of option strings

-    Returns
+    Returns
+    -------
+    dict
         Dictionary with keys as checker type (i.e. "mip").
         Each value is a dictionary where keys are checker options and values
         are checker option values or None if not provided.
@@ -649,21 +464,39 @@ def parse_options(opts):
     for opt_str in opts:
         try:
             checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
-            checker_val = checker_val[0] if checker_val else
+            checker_val = checker_val[0] if checker_val else True
         except ValueError:
             raise ValueError(
                 f"Could not split option '{opt_str}', seems illegally formatted. The required format is: '<checker>:<option_name>[:<option_value>]', eg. 'mip:tables:/path/to/Tables'."
             )
-        if checker_type != "mip":
-            raise ValueError(
-                f"Currently, only options for 'mip' checker are supported, got '{checker_type}'."
-            )
         options_dict[checker_type][checker_opt] = checker_val
     return options_dict


+def _verify_options_dict(options):
+    """
+    Helper function to verify that the options dictionary is correctly formatted.
+    """
+    if not isinstance(options, dict):
+        return False
+    if options == {}:
+        return True
+    try:
+        for checker_type in options.keys():
+            for checker_opt in options[checker_type].keys():
+                checker_val = options[checker_type][checker_opt]
+                if not isinstance(checker_val, (int, float, str, bool, type(None))):
+                    return False
+    except (AttributeError, KeyError):
+        return False
+    # Seems to match the required format
+    return True
+
+
 def main():
-
+    """
+    CLI entry point.
+    """
     parser = argparse.ArgumentParser(description="Run QA checks")
     parser.add_argument(
         "parent_dir",
@@ -683,13 +516,17 @@ def main():
         "--option",
         default=[],
         action="append",
-        help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'.
+        help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. "
+        "Multiple invocations possible.",
     )
     parser.add_argument(
         "-t",
         "--test",
         action="append",
-        help="The test to run
+        help="The test(s) to run in format '<checker>[:<version>]', (eg. 'wcrp_cmip7', "
+        "'wcrp_cmip6:latest' or 'cf:<version>', can be specified multiple times, "
+        "eg.: '-t cc6:latest -t cf:1.11') - default: running 'cf:latest'. "
+        "The default version selected for each checker is 'latest'.",
     )
     parser.add_argument(
         "-i",
@@ -709,6 +546,13 @@ def main():
         action="store_true",
         help="Include basic consistency and continuity checks. Default: False.",
     )
+    parser.add_argument(
+        "-P",
+        "--parallel_processes",
+        type=int,
+        default=0,
+        help="Specify the maximum number of parallel processes. Default: 0 (= number of cores).",
+    )
     args = parser.parse_args()

     result_dir = os.path.abspath(args.output_dir)
@@ -720,6 +564,7 @@ def main():
         args.include_consistency_checks if args.include_consistency_checks else False
     )
     cl_checker_options = parse_options(args.option)
+    parallel_processes = args.parallel_processes

     # Progress file to track already checked files
     progress_file = Path(result_dir, "progress.txt")
@@ -729,39 +574,51 @@ def main():
     # Resume information stored in a json file
     resume_info_file = Path(result_dir, ".resume_info")

+    # Do not allow any but certain arguments if resuming previous QA run
+    if resume:
+        allowed_with_resume = {"output_dir", "info", "resume", "parallel_processes"}
+        # Convert Namespace to dict for easier checking
+        set_args = {k for k, v in vars(args).items() if v not in (None, False, [], "")}
+        invalid_args = set_args - allowed_with_resume
+        if invalid_args:
+            parser.error(
+                f"When using -r/--resume, the following arguments are not allowed: {', '.join(invalid_args)}"
+            )
+
     # Deal with result_dir
     if not os.path.exists(result_dir):
         if resume:
-
-
-                "Resume is set but specified output_directory does not exist. Starting a new QA run..."
+            raise FileNotFoundError(
+                f"Resume is set but specified output_directory does not exist: '{result_dir}'."
             )
         os.mkdir(result_dir)
     elif os.listdir(result_dir) != []:
+        required_files = [progress_file, resume_info_file]
+        required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
         if resume:
-            required_files = [progress_file, resume_info_file]
-            required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
             if not all(os.path.isfile(rfile) for rfile in required_files) or not all(
                 os.path.isdir(rpath) for rpath in required_paths
             ):
                 raise Exception(
-                    "Resume is set but specified output_directory cannot be identified as
+                    "Resume is set but specified output_directory cannot be identified as output directory of a previous QA run."
                 )
         else:
-            if
-
-            )
+            if all(os.path.isfile(rfile) for rfile in required_files) and all(
+                os.path.isdir(rpath) for rpath in required_paths
+            ):
                 raise Exception(
-                    "Specified
+                    "Specified output directory is not empty but can be identified as output directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
                 )
             else:
-                raise Exception("Specified
+                raise Exception("Specified output directory is not empty.")
     else:
         if resume:
             resume = False
-
-                "Resume is set but specified
+            raise FileNotFoundError(
+                f"Resume is set but specified output directory is empty: '{result_dir}'."
             )
+
+    # When resuming previous QA run
     if resume:
         print(f"Resuming previous QA run in '{result_dir}'")
         with open(os.path.join(result_dir, ".resume_info")) as f:
@@ -770,58 +627,55 @@ def main():
                 required_keys = ["parent_dir", "info", "tests"]
                 if not all(key in resume_info for key in required_keys):
                     raise Exception(
-                        "Invalid .resume_info file. It should contain the keys 'parent_dir', 'info', and 'tests'."
+                        f"Invalid .resume_info file in '{result_dir}'. It should contain the keys 'parent_dir', 'info', and 'tests'."
                     )
                 if not (
                     isinstance(resume_info["parent_dir"], str)
                     and isinstance(resume_info["info"], str)
                     and isinstance(resume_info["tests"], list)
+                    and isinstance(resume_info.get("cl_checker_options", {}), dict)
+                    and isinstance(
+                        resume_info.get("include_consistency_checks", False), bool
+                    )
+                    and _verify_options_dict(resume_info.get("cl_checker_options", {}))
                     and all(isinstance(test, str) for test in resume_info["tests"])
                 ):
                     raise Exception(
-                        "Invalid .resume_info file. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings."
+                        f"Invalid .resume_info file in '{result_dir}'. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings. "
+                        "'cl_checker_options' (optional) should be a nested dictionary of format 'checker:option_name:option_value', and "
+                        "'include_consistency_checks' (optional) should be a boolean."
                     )
             except json.JSONDecodeError:
                 raise Exception(
-                    "Invalid .resume_info file. It
+                    f"Invalid .resume_info file in '{result_dir}'. It needs to be a valid JSON file."
                 )
-
-
-            else:
-                tests = resume_info["tests"]
+        tests = resume_info["tests"]
+        parent_dir = resume_info["parent_dir"]
         if info and info != resume_info["info"]:
             warnings.warn(
                 f"<info> argument differs from the originally specified <info> argument ('{resume_info['info']}'). Using the new specification."
             )
-
-
-
-
-                "Cannot resume a previous QA run with different <parent_dir>."
-            )
-        if cl_checker_options and cl_checker_options != resume_info.get(
-            "checker_options", {}
-        ):
-            raise Exception(
-                "Cannot resume a previous QA run with different <option> arguments."
-            )
-        else:
-            parent_dir = Path(resume_info["parent_dir"])
-        if "include_consistency_checks" in resume_info:
-            include_consistency_checks = resume_info["include_consistency_checks"]
+        cl_checker_options = resume_info.get("checker_options", {})
+        include_consistency_checks = resume_info.get(
+            "include_consistency_checks", False
+        )
     else:
         print(f"Storing check results in '{result_dir}'")

     # Deal with tests
     if not tests:
-        checkers = ["
-        checkers_versions = {"
+        checkers = ["cf"]
+        checkers_versions = {"cf": "latest"}
         checker_options = defaultdict(dict)
     else:
-
+        # Require versions to be specified:
+        # test_regex = re.compile(r"^[a-zA-Z0-9_-]+:(latest|[0-9]+(\.[0-9]+)*)$")
+        # Allow versions to be ommitted:
+        test_regex = re.compile(r"^[a-zA-Z0-9_-]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")
+        # Check format of specified checkers and separate checker, version, options
         if not all([test_regex.match(test) for test in tests]):
             raise Exception(
-
+                "Invalid test(s) specified. Please specify tests in the format 'checker_name' or'checker_name:version'."
             )
         checkers = [test.split(":")[0] for test in tests]
         if sorted(checkers) != sorted(list(set(checkers))):
@@ -835,6 +689,29 @@ def main():
             for test in tests
         }
         checker_options = defaultdict(dict)
+        # Check if specified checkers (or their requested versions) exist / are currently installed
+        cc_checker_versions = get_installed_checker_versions()
+        invalid_checkers = []
+        invalid_checkers_versions = []
+        invalid_checkers_errmsg = ""
+        for checker_i, checker_iv in checkers_versions.items():
+            if checker_i not in cc_checker_versions and checker_i != "eerie":
+                invalid_checkers.append(checker_i)
+            elif checker_i == "eerie":
+                pass
+            elif checker_iv not in cc_checker_versions[checker_i] and checker_i not in [
+                "cc6",
+                "mip",
+            ]:
+                invalid_checkers_versions.append(checker_i)
+        if invalid_checkers:
+            invalid_checkers_errmsg = f"ERROR: Invalid test(s) specified. The following checkers are not supported or installed: {', '.join(invalid_checkers)}. "
+        for checker_i in invalid_checkers_versions:
+            if not invalid_checkers_errmsg:
+                invalid_checkers_errmsg = "ERROR: Invalid test(s) specified. "
+            invalid_checkers_errmsg += f"For checker {checker_i} only the following versions are currently supported / installed: {', '.join(cc_checker_versions[checker_i])}. "
+        if invalid_checkers_errmsg:
+            raise ValueError(invalid_checkers_errmsg)
     if "cc6" in checkers_versions and checkers_versions["cc6"] != "latest":
         checkers_versions["cc6"] = "latest"
         warnings.warn("Version of checker 'cc6' must be 'latest'. Using 'latest'.")
@@ -849,11 +726,9 @@ def main():
     if "eerie" in checkers_versions:
         checkers_versions["mip"] = "latest"
         del checkers_versions["eerie"]
-        if "
-            cl_checker_options["mip"]
-
-            ]
-        elif "tables" not in cl_checker_options["mip"]:
+        if "eerie" in cl_checker_options:
+            cl_checker_options["mip"] = cl_checker_options.pop("eerie")
+        if "tables" not in cl_checker_options["mip"]:
             cl_checker_options["mip"][
                 "tables"
             ] = "/work/bm0021/cmor_tables/eerie_cmor_tables/Tables"
@@ -861,10 +736,6 @@ def main():
         raise Exception(
             "ERROR: Cannot run both 'cc6' and 'mip' checkers at the same time."
         )
-    if any(test not in checker_dict.keys() for test in checkers_versions):
-        raise Exception(
-            f"Invalid test(s) specified. Supported are: {', '.join(checker_dict.keys())}"
-        )

     # Combine checkers and versions
     # (checker_options are hardcoded)
@@ -887,7 +758,7 @@ def main():
     if cl_checker_options:
         resume_info["checker_options"] = cl_checker_options
     with open(os.path.join(result_dir, ".resume_info"), "w") as f:
-        json.dump(resume_info, f)
+        json.dump(resume_info, f, sort_keys=True, indent=4)

     # If only cf checker is selected, run cc6 time checks only
     if (
@@ -905,14 +776,6 @@ def main():
     progress_file.touch()
     dataset_file.touch()

-    DRS_parent = "CORDEX-CMIP6"
-    for cname in checkers:
-        print(cname)
-        DRS_parent_tmp = DRS_path_parent.get(checker_dict.get(cname.split(":")[0], ""), "")
-        if DRS_parent_tmp:
-            DRS_parent = DRS_parent_tmp
-            break
-
     # Check if progress files exist and read already processed files/datasets
     processed_files = set()
     with open(progress_file) as file:
@@ -1011,7 +874,7 @@ def main():
     files_to_check = sorted(files_to_check)
     for file_path in files_to_check:
         files_to_check_dict[file_path]["id"] = get_dsid(
-            files_to_check_dict, dataset_files_map_ext, file_path,
+            files_to_check_dict, dataset_files_map_ext, file_path, supported_project_ids
         )
         files_to_check_dict[file_path]["result_file_ds"] = (
             result_dir
@@ -1027,14 +890,14 @@ def main():
         dataset_files_map[files_to_check_dict[file_path]["id"]] = [file_path]
         checker_options[file_path] = {
             "mip": {
-                **cl_checker_options
+                **cl_checker_options.get("mip", {}),
                 "consistency_output": files_to_check_dict[file_path][
                     "consistency_file"
                 ],
                 "time_checks_only": time_checks_only,
             },
             "cc6": {
-                **cl_checker_options
+                **cl_checker_options.get("cc6", {}),
                 "consistency_output": files_to_check_dict[file_path][
                     "consistency_file"
                 ],
@@ -1046,15 +909,32 @@ def main():
                 "time_checks_only": time_checks_only,
             },
             "cf:": {
-                **cl_checker_options
+                **cl_checker_options.get("cf", {}),
                 "enable_appendix_a_checks": True,
             },
+            "wcrp_cmip6": {
+                **cl_checker_options.get("wcrp_cmip6", {}),
+                "consistency_output": files_to_check_dict[file_path][
+                    "consistency_file"
+                ],
+            },
+            "wcrp_cordex_cmip6": {
+                **cl_checker_options.get("wcrp_cordex_cmip6", {}),
+                "consistency_output": files_to_check_dict[file_path][
+                    "consistency_file"
+                ],
+                "tables_dir": result_dir + "/tables",
+                "force_table_download": file_path == files_to_check[0]
+                and (
+                    not resume or (resume and os.listdir(result_dir + "/tables") == [])
+                ),
+            },
         }
         checker_options[file_path].update(
             {
                 k: v
                 for k, v in cl_checker_options.items()
-                if k not in ["cc6", "cf", "mip"]
+                if k not in ["cc6", "cf", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]
             }
         )

@@ -1062,22 +942,27 @@ def main():
         raise Exception("No files found to check.")
     else:
         print(
-            f"
+            f"\nFound {len(files_to_check)} files (organized in {len(dataset_files_map)} datasets) to check."
        )

-
-
-
-
-
-
-
-
-
-    print(
-
-
-    print(
+    # Save dictionaries to disk for information
+    with open(os.path.join(result_dir, "files_to_check.json"), "w") as f:
+        json.dump(files_to_check, f, indent=4)
+    with open(os.path.join(result_dir, "files_to_check_dict.json"), "w") as f:
+        json.dump(files_to_check_dict, f, indent=4)
+    with open(os.path.join(result_dir, "dataset_files_map.json"), "w") as f:
+        json.dump(dataset_files_map, f, indent=4)
+    with open(os.path.join(result_dir, "dataset_files_map_ext.json"), "w") as f:
+        json.dump(dataset_files_map_ext, f, indent=4)
+    print(
+        "Information on which files have been found and how these are organized into datasets was saved to disk:"
+    )
+    print(
+        f" - {os.path.join(result_dir, 'files_to_check.json')}\n"
+        f" - {os.path.join(result_dir, 'files_to_check_dict.json')}\n"
+        f" - {os.path.join(result_dir, 'dataset_files_map.json')}\n"
+        f" - {os.path.join(result_dir, 'dataset_files_map_ext.json')}"
+    )

     #########################################################
     # QA Part 1 - Run all compliance-checker checks
@@ -1090,10 +975,13 @@ def main():
     print()

     # Initialize the summary
-    summary = QAResultAggregator(
+    summary = QAResultAggregator()
+    reference_ds_dict = {}

     # Calculate the number of processes
     num_processes = max(multiprocessing.cpu_count() - 4, 1)
+    if parallel_processes > 0:
+        num_processes = min(num_processes, parallel_processes)
     print(f"Using {num_processes} parallel processes for cc checks.")
     print()

@@ -1140,8 +1028,9 @@ def main():

     # Skip continuity and consistency checks if no cc6/mip checks were run
     # (and thus no consistency output file was created)
-    if
-
+    if any(
+        ch.split(":", 1)[0] in checker_supporting_consistency_checks for ch in checkers
+    ):
         #########################################################
         # QA Part 2 - Run all consistency & continuity checks
         #########################################################
@@ -1167,6 +1056,8 @@ def main():
         # Limit the number of processes for consistency checks since a lot
         # of files will be opened at the same time
         num_processes = min(num_processes, 10)
+        if parallel_processes > 0:
+            num_processes = min(num_processes, parallel_processes)
         print(f"Using {num_processes} parallel processes for dataset checks.")
         print()

@@ -1217,7 +1108,9 @@ def main():
     else:
         print()
         warnings.warn(
-            "Continuity &
+            "Continuity & consistency checks skipped since no appropriate checkers were run."
+            " The following checkers support the continuity & consistency checks: "
+            f"{', '.join(checker_supporting_consistency_checks)}"
         )

     #########################################################
@@ -1245,14 +1138,14 @@ def main():
         "cc_version": cc_version,
         "checkers": ", ".join(
             [
-                f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}"
+                f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}".strip()
                 for checker in checkers
             ]
         ),
         "parent_dir": str(parent_dir),
     }
     # Add reference datasets for inter-dataset consistency checks
-    if
+    if reference_ds_dict:
         summary_info["inter_ds_con_checks_ref"] = reference_ds_dict

     dsid_common_prefix = os.path.commonprefix(list(dataset_files_map.keys()))
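
Note on the new option parsing (illustrative only): the parse_options hunks above show that, as of 0.5.0, an option string '<checker>:<option_name>[:<option_value>]' passed via --option is split at most twice on ':' and a value-less option is stored as True. A minimal standalone sketch of that behaviour follows; the defaultdict(dict) initialisation and the function and example names are assumptions for illustration and are not code taken from the package.

    from collections import defaultdict

    def parse_options_sketch(opts):
        # '<checker>:<option_name>[:<option_value>]' -> {checker: {option_name: value}}
        options_dict = defaultdict(dict)  # assumed initialisation; not shown in the hunks above
        for opt_str in opts:
            checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
            # As of 0.5.0 a value-less option is stored as True
            options_dict[checker_type][checker_opt] = checker_val[0] if checker_val else True
        return dict(options_dict)

    # Roughly what '--option mip:tables:/path/to/Tables --option cf:enable_appendix_a_checks' yields:
    print(parse_options_sketch(["mip:tables:/path/to/Tables", "cf:enable_appendix_a_checks"]))
    # {'mip': {'tables': '/path/to/Tables'}, 'cf': {'enable_appendix_a_checks': True}}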
|