esgf-qa 0.3.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/_constants.py CHANGED
@@ -1,6 +1,63 @@
 from datetime import timedelta

-# Definition of maximum deviations from the given frequency
+# Mapping of checker names to project names for better readability
+checker_dict = {
+    "cc6": "CORDEX-CMIP6",
+    "cf": "CF-Conventions",
+    "mip": "MIP",
+    # "wcrp-cmip5": "CMIP5",
+    "wcrp_cmip6": "CMIP6",
+    # "wcrp_cmip7aft": "CMIP7-AFT",
+    "wcrp_cmip7": "CMIP7",
+    # "wcrp_cordex": "CORDEX",
+    "wcrp_cordex_cmip6": "CORDEX-CMIP6",
+    # "obs4mips": "Obs4MIPs",
+    # "input4mips": "Input4MIPs",
+}
+checker_dict_ext = {
+    # "pcons": "ParentConsistency"
+    "cons": "Consistency",
+    "cont": "Continuity",
+    "comp": "Compatibility",
+    **checker_dict,
+}
+checker_release_versions = {}
+
+# Checkers for which consistency checks should be run
+checker_supporting_consistency_checks = [
+    "wcrp_cmip7",
+    "wcrp_cmip6",
+    "wcrp_cordex_cmip6",
+    "cc6",
+    "mip",
+]
+
+# DRS parent directory names (for identifying project root and building dataset id)
+supported_project_ids = [
+    "cmip7",
+    "cmip6plus",
+    "cmip6",
+    "cmip5",
+    "cordex",
+    "cordex-cmip6",
+    "cordex-fpsconv",
+    "obs4mips",
+    "input4mips",
+    "c3scordex",
+    "c3scmip5",
+    "c3scmip6",
+    "c3s-ipcc-ar6-atlas",
+    "c3satlas",
+    "c3s-cica-atlas",
+    "c3satlas_v1",
+    "c3s-atlas-dataset",
+    "c3satlas_v2",
+    "eerie",
+    "happi",
+    "cosmo-rea",
+]
+
+# Definition of maximum permitted deviations from the given frequency
 deltdic = {}
 deltdic["monmax"] = timedelta(days=31.01).total_seconds()
 deltdic["monmin"] = timedelta(days=27.99).total_seconds()
@@ -29,3 +86,8 @@ deltdic["decmin"] = timedelta(days=3660.01).total_seconds()
 deltdic["cen"] = timedelta(days=36000).total_seconds()
 deltdic["cenmax"] = timedelta(days=35999.99).total_seconds()
 deltdic["cenmin"] = timedelta(days=36600.01).total_seconds()
+# CMIP-style frequencies for "time: point":
+for l_freq in ["subhr", "1hr", "3hr", "6hr", "day", "mon", "yr"]:
+    deltdic[l_freq + "Pt"] = deltdic[l_freq]
+    deltdic[l_freq + "Ptmax"] = deltdic[l_freq + "max"]
+    deltdic[l_freq + "Ptmin"] = deltdic[l_freq + "min"]
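The new loop mirrors each base-frequency tolerance onto its "time: point" counterpart, so e.g. "monPt" resolves to the same bounds as "mon". Below is a minimal sketch of how these tolerances could be consulted when validating time-axis spacing; the helper function is illustrative and not part of the package, and note that for some frequencies (e.g. "cen") the "max"/"min" keys hold the numerically smaller/larger bound, so the sketch normalizes the pair.

    # Illustrative only -- esgf-qa's actual time-axis check may differ.
    from esgf_qa._constants import deltdic

    def spacing_within_tolerance(deltas_seconds, frequency):
        """Return True if all consecutive time deltas (in seconds) fall
        within the permitted band for a CMIP-style frequency string such
        as "mon" or "monPt"."""
        bounds = (deltdic[frequency + "max"], deltdic[frequency + "min"])
        lower, upper = min(bounds), max(bounds)  # normalize either naming order
        return all(lower <= d <= upper for d in deltas_seconds)

    # Monthly data: 30- and 31-day steps pass; a 45-day gap would not.
    print(spacing_within_tolerance([30 * 86400, 31 * 86400], "monPt"))  # True
    print(spacing_within_tolerance([45 * 86400], "mon"))                # False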
esgf_qa/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.3.0'
-__version_tuple__ = version_tuple = (0, 3, 0)
+__version__ = version = '0.5.0'
+__version_tuple__ = version_tuple = (0, 5, 0)

 __commit_id__ = commit_id = None
@@ -0,0 +1,467 @@
+import difflib
+import re
+from collections import defaultdict
+
+from esgf_qa._constants import checker_dict, checker_dict_ext
+
+
+class QAResultAggregator:
+    """
+    Aggregate, organize, and cluster the results of multiple ESGF Quality Assurance (QA)
+    or Compliance Checker (cc) runs.
+
+    This class collects the outcomes of compliance checker (cc) / cc-plugin runs from
+    multiple datasets and files, normalizes them into a consistent internal summary
+    structure, and provides functionality to sort, cluster, and generalize similar
+    messages.
+
+    Attributes
+    ----------
+    summary : dict of defaultdict
+        Nested dictionary structure that stores the aggregated QA results.
+        It contains two top-level keys:
+        - ``"error"`` : maps checker functions to error messages → dataset IDs → file names.
+        - ``"fail"`` : maps test weights → test names → messages → dataset IDs → file names.
+    clustered_summary : dict of defaultdict
+        Summary structure produced after clustering messages using
+        :meth:`cluster_summary`. Keys and nesting mirror ``summary``, but
+        messages are generalized and aggregated/clustered across similar text patterns.
+    checker_dict : dict
+        Mapping of checker identifiers to human-readable names, used
+        for consistent labeling in summaries. Only cc checks.
+    checker_dict_ext : dict
+        Mapping of checker identifiers to human-readable names, used
+        for consistent labeling in summaries. cc checks extended by esgf_qa checks.
+
+    Methods
+    -------
+    update(result_dict, dsid, file_name)
+        Update the summary with a single cc run result (i.e. for one file).
+    update_ds(result_dict, dsid)
+        Update the summary with results from a single inter-dataset or inter-file
+        checker run that comes with esgf-qa.
+    sort()
+        Sort the summary by test weight and test name for consistent output ordering.
+    cluster_messages(messages, threshold)
+        Cluster similar message strings based on edit-distance similarity.
+    generalize_message_group(messages)
+        Derive a generalized message template and placeholder map from a list of
+        similar messages.
+    merge_placeholders(list_of_strings, dictionary, skip=0)
+        Helper to merge adjacent placeholders in message templates where possible.
+    cluster_summary(threshold=0.75)
+        Cluster and generalize all messages in the current summary using a
+        similarity threshold.
+
+    Examples
+    --------
+    >>> agg = QAResultAggregator()
+    >>> result = {
+    ...     "cf": {
+    ...         "test_1": {"value": (0, 1), "msgs": ["Missing attribute 'units'"]},
+    ...     }
+    ... }
+    >>> agg.update(result, dsid="dataset_001", file_name="tas_day.nc")
+    >>> agg.sort()
+    >>> agg.cluster_summary(threshold=0.8)
+    >>> agg.clustered_summary["fail"]
+    {3: {"[CF-Conventions] test_1": {"Missing attribute 'units'": {...}}}}
+    """
+
+    def __init__(self):
+        """
+        Initialize the aggregator with an empty summary.
+        """
+        self.summary = {
+            "error": defaultdict(
+                lambda: defaultdict(lambda: defaultdict(list))
+            ),  # No weight, just function -> error msg
+            "fail": defaultdict(
+                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+            ),  # weight -> test -> msg -> dsid -> filenames
+        }
+        self.checker_dict = checker_dict
+        self.checker_dict_ext = checker_dict_ext
+
+    def update(self, result_dict, dsid, file_name):
+        """
+        Update the summary with a single result of a cc-run.
+
+        Parameters
+        ----------
+        result_dict : dict
+            Dictionary containing the results of a single cc-run.
+        dsid : str
+            Dataset ID.
+        file_name : str
+            File name.
+        """
+        for checker in result_dict:
+            for test in result_dict[checker]:
+                if test == "errors":
+                    for function_name, error_msg in result_dict[checker][
+                        "errors"
+                    ].items():
+                        self.summary["error"][
+                            f"[{checker_dict.get(checker, checker)}] " + function_name
+                        ][error_msg][dsid].append(file_name)
+                else:
+                    score, max_score = result_dict[checker][test]["value"]
+                    weight = result_dict[checker][test].get("weight", 3)
+                    msgs = result_dict[checker][test].get("msgs", [])
+                    if score < max_score:  # test outcome: fail
+                        for msg in msgs:
+                            self.summary["fail"][weight][
+                                f"[{checker_dict.get(checker, checker)}] " + test
+                            ][msg][dsid].append(file_name)
+
+    def update_ds(self, result_dict, dsid):
+        """
+        Update the summary with a single result of an esgf-qa (inter-file/dataset) run.
+
+        Parameters
+        ----------
+        result_dict : dict
+            Dictionary containing the results of a single esgf-qa (inter-file/dataset) run.
+        dsid : str
+            Dataset ID.
+        """
+        for checker in result_dict:
+            for test in result_dict[checker]:
+                if test == "errors":
+                    for function_name, errdict in result_dict[checker][
+                        "errors"
+                    ].items():
+                        for file_name in errdict["files"]:
+                            self.summary["error"][
+                                f"[{checker_dict_ext.get(checker, checker)}] "
+                                + function_name
+                            ][errdict["msg"]][dsid].append(file_name)
+                else:
+                    weight = result_dict[checker][test].get("weight", 3)
+                    fails = result_dict[checker][test].get("msgs", {})
+                    for msg, file_names in fails.items():
+                        for file_name in file_names:
+                            self.summary["fail"][weight][
+                                f"[{checker_dict_ext.get(checker, checker)}] " + test
+                            ][msg][dsid].append(file_name)
+
+    def sort(self):
+        """
+        Sort the summary by test weight and test name for consistent output ordering.
+
+        Modifies the `summary` attribute.
+        """
+        self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
+        for key in self.summary["fail"]:
+            self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
+
+        # Sort errors by function name
+        for checker in self.summary["error"]:
+            self.summary["error"][checker] = dict(
+                sorted(self.summary["error"][checker].items())
+            )
+
+    @staticmethod
+    def cluster_messages(messages, threshold):
+        """
+        Cluster messages based on similarity.
+
+        Parameters
+        ----------
+        messages : list
+            List of messages to cluster.
+        threshold : float
+            Similarity threshold.
+
+        Returns
+        -------
+        list
+            List of clusters.
+        """
+        clusters = []
+        while messages:
+            base = messages.pop(0)
+            cluster = [base]
+            to_remove = []
+            for msg in messages:
+                ratio = difflib.SequenceMatcher(None, base, msg).ratio()
+                if ratio >= threshold:
+                    cluster.append(msg)
+                    to_remove.append(msg)
+            for msg in to_remove:
+                messages.remove(msg)
+            clusters.append(cluster)
+        return clusters
+
+    @staticmethod
+    def generalize_message_group(messages):
+        """
+        Generalize a group of messages.
+
+        Parameters
+        ----------
+        messages : list
+            List of messages to generalize.
+
+        Returns
+        -------
+        str
+            Generalized message.
+        dict
+            Placeholders.
+        """
+        if len(messages) == 1:
+            return messages[0], {}
+
+        # Split messages into tokens
+        split_messages = [re.findall(r"\w+|\W", m) for m in messages]
+        transposed = list(zip(*split_messages))
+        template = []
+        placeholders = {}
+        var_index = 0
+
+        for i, tokens in enumerate(transposed):
+            unique_tokens = set(tokens)
+            if len(unique_tokens) == 1:
+                template.append(tokens[0])
+            else:
+                var_name = chr(ord("A") + var_index)
+                template.append(f"{{{var_name}}}")
+                placeholders[var_name] = tokens[0]
+                var_index += 1
+
+        # Merge placeholders if possible
+        template, placeholders = QAResultAggregator.merge_placeholders(
+            template, placeholders
+        )
+
+        # Return the generalized message and the placeholders
+        generalized = "".join(template)
+        return generalized, placeholders
+
+    @staticmethod
+    def merge_placeholders(list_of_strings, dictionary, skip=0):
+        """
+        Merge adjacent placeholders in message templates where possible.
+
+        Avoids too many placeholders in a clustered message.
+
+        Parameters
+        ----------
+        list_of_strings : list
+            List of strings.
+        dictionary : dict
+            Dictionary of placeholders.
+        skip : int, optional
+            Number of placeholders to skip, by default 0.
+
+        Returns
+        -------
+        list
+            List of strings with placeholders merged.
+        dict
+            Dictionary of placeholders.
+        """
+
+        def find_next_two_placeholders(list_of_strings, skip):
+            placeholders = [
+                s for s in list_of_strings if s.startswith("{") and s.endswith("}")
+            ]
+            if len(placeholders) < 2:
+                return None, None
+            return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
+                placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
+            )
+
+        def extract_text_between_placeholders(
+            list_of_strings, placeholder1, placeholder2
+        ):
+            idx1 = list_of_strings.index(placeholder1)
+            idx2 = list_of_strings.index(placeholder2)
+            return "".join(list_of_strings[idx1 + 1 : idx2])
+
+        def merge_two_placeholders(
+            placeholder1, placeholder2, text_between, dictionary
+        ):
+            new_value = (
+                dictionary[placeholder1.lstrip("{").rstrip("}")]
+                + text_between
+                + dictionary[placeholder2.lstrip("{").rstrip("}")]
+            )
+            dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
+            del dictionary[placeholder2.lstrip("{").rstrip("}")]
+            return dictionary
+
+        def update_placeholder_names(list_of_strings, dictionary):
+            old_placeholders = sorted(list(dictionary.keys()))
+            new_placeholders = [
+                chr(ord("A") + i) for i in range(0, len(old_placeholders))
+            ]
+            new_dictionary = dict(
+                zip(new_placeholders, [dictionary[val] for val in old_placeholders])
+            )
+            for old, new in zip(old_placeholders, new_placeholders):
+                list_of_strings = [
+                    s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
+                ]
+            return list_of_strings, new_dictionary
+
+        def replace_placeholders_with_new_one(
+            list_of_strings, placeholder1, placeholder2
+        ):
+            idx1 = list_of_strings.index(placeholder1)
+            idx2 = list_of_strings.index(placeholder2)
+            list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
+            if idx2 < len(list_of_strings) + 1:
+                list_of_strings_new += list_of_strings[idx2 + 1 :]
+            return list_of_strings_new
+
+        if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
+            return list_of_strings, dictionary
+
+        placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
+        if placeholder1 is None or placeholder2 is None:
+            return list_of_strings, dictionary
+
+        text_between = extract_text_between_placeholders(
+            list_of_strings, placeholder1, placeholder2
+        )
+        if len(text_between) < 5:
+            dictionary = merge_two_placeholders(
+                placeholder1, placeholder2, text_between, dictionary
+            )
+            list_of_strings = replace_placeholders_with_new_one(
+                list_of_strings, placeholder1, placeholder2
+            )
+            list_of_strings, dictionary = update_placeholder_names(
+                list_of_strings, dictionary
+            )
+            return QAResultAggregator.merge_placeholders(
+                list_of_strings, dictionary, skip
+            )
+        else:
+            return QAResultAggregator.merge_placeholders(
+                list_of_strings, dictionary, skip + 1
+            )
+
+    def cluster_summary(self, threshold=0.75):
+        """
+        Cluster messages in the summary into groups of similar messages.
+
+        Drastically reduces the number of messages in the summary for datasets that
+        accumulate large numbers of check-failure messages.
+
+        Parameters
+        ----------
+        threshold : float, optional
+            The threshold for similarity between messages, by default 0.75.
+
+        Returns
+        -------
+        None
+            Modifies the `clustered_summary` attribute.
+        """
+        self.clustered_summary = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
+        )
+        for status in self.summary:
+            if status == "error":
+                for test_id in self.summary[status]:
+                    messages = list(self.summary[status][test_id].keys())
+                    # Pass a copy of messages to cluster_messages to generate clusters
+                    clusters = QAResultAggregator.cluster_messages(
+                        messages[:], threshold
+                    )
+
+                    for cluster in clusters:
+                        generalized, placeholders = (
+                            QAResultAggregator.generalize_message_group(cluster)
+                        )
+                        example_parts = ", ".join(
+                            [
+                                (
+                                    f"{k}='{v[0]}'"
+                                    if isinstance(v, list)
+                                    else f"{k}='{v}'"
+                                )
+                                for k, v in placeholders.items()
+                            ]
+                        )
+                        if example_parts:
+                            msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
+                        else:
+                            msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
+
+                        # Gather all ds_ids and filenames across the cluster
+                        combined = defaultdict(set)
+                        for message in cluster:
+                            for ds_id, files in self.summary[status][test_id][
+                                message
+                            ].items():
+                                combined[ds_id].update(files)
+
+                        # Shorten file lists to one example
+                        formatted = {
+                            ds_id
+                            + " ("
+                            + str(len(files))
+                            + f" file{'s' if len(files) > 1 else ''} affected)": (
+                                [f"e.g. '{next(iter(files))}'"]
+                                if len(files) > 1
+                                else [f"'{next(iter(files))}'"]
+                            )
+                            for ds_id, files in combined.items()
+                        }
+
+                        self.clustered_summary[status][test_id][msg_summary] = formatted
+            elif status == "fail":
+                for weight in self.summary[status]:
+                    for test_id in self.summary[status][weight]:
+                        messages = list(self.summary[status][weight][test_id].keys())
+                        # Pass a copy of messages to cluster_messages to generate clusters
+                        clusters = QAResultAggregator.cluster_messages(
+                            messages[:], threshold
+                        )
+
+                        for cluster in clusters:
+                            generalized, placeholders = (
+                                QAResultAggregator.generalize_message_group(cluster)
+                            )
+                            example_parts = ", ".join(
+                                [
+                                    (
+                                        f"{k}='{v[0]}'"
+                                        if isinstance(v, list)
+                                        else f"{k}='{v}'"
+                                    )
+                                    for k, v in placeholders.items()
+                                ]
+                            )
+                            if example_parts:
+                                msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
+                            else:
+                                msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
+
+                            # Gather all ds_ids and filenames across the cluster
+                            combined = defaultdict(set)
+                            for message in cluster:
+                                for ds_id, files in self.summary[status][weight][
+                                    test_id
+                                ][message].items():
+                                    combined[ds_id].update(files)
+
+                            # Shorten file lists to one example
+                            formatted = {
+                                ds_id
+                                + " ("
+                                + str(len(files))
+                                + f" file{'s' if len(files) > 1 else ''} affected)": (
+                                    [f"e.g. '{next(iter(files))}'"]
+                                    if len(files) > 1
+                                    else [f"'{next(iter(files))}'"]
+                                )
+                                for ds_id, files in combined.items()
+                            }
+
+                            self.clustered_summary[status][weight][test_id][
+                                msg_summary
+                            ] = formatted
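The clustering pipeline added in this release works in three steps: cluster_messages groups near-identical strings by difflib.SequenceMatcher ratio, generalize_message_group collapses each group into a template whose varying tokens become {A}-style placeholders, and merge_placeholders fuses adjacent placeholders separated by fewer than five characters. A small sketch of the expected behavior; the input messages are invented for illustration, and the import path of the new module is not visible in this diff, so QAResultAggregator is assumed to be already imported.

    # Assumes QAResultAggregator has been imported from the new module above;
    # its file path is not shown in this diff.
    msgs = [
        "Missing attribute 'units'",
        "Missing attribute 'title'",
        "Missing attribute 'positive'",
    ]

    # Step 1: group near-identical messages (all three exceed the 0.75 ratio,
    # so they form a single cluster). A copy is passed because the method
    # consumes the list it is given.
    clusters = QAResultAggregator.cluster_messages(msgs[:], threshold=0.75)

    # Step 2: collapse the cluster into a template; the varying token becomes
    # {A}, and the first message's token is kept as the example value.
    template, placeholders = QAResultAggregator.generalize_message_group(clusters[0])
    print(template)      # Missing attribute '{A}'
    print(placeholders)  # {'A': 'units'}

    # cluster_summary() would render this cluster as:
    # "Missing attribute '{A}' (3 occurrences, e.g. A='units')"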