esgf-qa 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/_constants.py CHANGED
@@ -1,6 +1,42 @@
1
1
  from datetime import timedelta
2
2
 
3
- # Definition of maximum deviations from the given frequency
3
+ # Collection of supported checkers
4
+ checker_dict = {
5
+ "cc6": "CORDEX-CMIP6",
6
+ "cf": "CF-Conventions",
7
+ "mip": "MIP",
8
+ "plugin_cmip6": "CMIP6",
9
+ # "wcrp-cmip5": "CMIP5",
10
+ "wcrp_cmip6": "CMIP6",
11
+ # "wcrp_cmip7": "CMIP7-AFT",
12
+ # "wcrp_cmip7": "CMIP7",
13
+ # "wcrp_cordex": "CORDEX",
14
+ "wcrp_cordex_cmip6": "CORDEX-CMIP6",
15
+ # "obs4mips": "Obs4MIPs",
16
+ # "input4mips": "Input4MIPs",
17
+ }
18
+ checker_dict_ext = {
19
+ # "pcons": "ParentConsistency"
20
+ "cons": "Consistency",
21
+ "cont": "Continuity",
22
+ "comp": "Compatibility",
23
+ **checker_dict,
24
+ }
25
+ checker_release_versions = {}
26
+
27
+ # DRS parent directory names
28
+ DRS_path_parent = {
29
+ "CMIP5": "CMIP5",
30
+ "CMIP6": "CMIP6",
31
+ "CMIP7": "CMIP7",
32
+ "CMIP7-AFT": "CMIP7",
33
+ "CORDEX": "CORDEX",
34
+ "CORDEX-CMIP6": "CORDEX-CMIP6",
35
+ "Obs4MIPs": "Obs4MIPs",
36
+ "Input4MIPs": "Input4MIPs",
37
+ }
38
+
39
# Definition of maximum permitted deviations from the given frequency,
# expressed in seconds (further frequencies are added below).
deltdic = {
    "monmax": timedelta(days=31.01).total_seconds(),
    "monmin": timedelta(days=27.99).total_seconds(),
}
@@ -29,3 +65,8 @@ deltdic["decmin"] = timedelta(days=3660.01).total_seconds()
29
65
# Century: nominal length and permitted bounds.
deltdic["cen"] = timedelta(days=36000).total_seconds()
# NOTE(review): "cenmax" (35999.99 days) is SMALLER than "cenmin"
# (36600.01 days), the opposite orientation of the mon* entries above
# (monmax 31.01 > monmin 27.99). The two values look swapped — confirm
# against the code that consumes deltdic before changing them.
deltdic["cenmax"] = timedelta(days=35999.99).total_seconds()
deltdic["cenmin"] = timedelta(days=36600.01).total_seconds()
68
# CMIP-style frequencies for "time: point": every point-sampled frequency
# ("<freq>Pt") inherits the nominal value and the max/min bounds of the
# corresponding interval frequency.
for pt_freq in ("subhr", "1hr", "3hr", "6hr", "day", "mon", "yr"):
    for suffix in ("", "max", "min"):
        deltdic[f"{pt_freq}Pt{suffix}"] = deltdic[pt_freq + suffix]
esgf_qa/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.0'
32
- __version_tuple__ = version_tuple = (0, 3, 0)
31
+ __version__ = version = '0.4.0'
32
+ __version_tuple__ = version_tuple = (0, 4, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -0,0 +1,466 @@
1
+ import difflib
2
+ import re
3
+ from collections import defaultdict
4
+
5
+ from esgf_qa._constants import checker_dict, checker_dict_ext
6
+
7
+
8
class QAResultAggregator:
    """
    Aggregate, organize, and cluster the results of multiple ESGF-Quality Assurance (QA)
    or compliance checker (cc) runs.

    This class collects the outcomes of compliance checker (cc) / cc-plugin runs from
    multiple datasets and files, normalizes them into a consistent internal summary
    structure, and provides functionality to sort, cluster, and generalize similar
    messages.

    Parameters
    ----------
    checkers : dict, optional
        Mapping of checker identifiers to human-readable names, used for
        consistent labeling in summaries (cc checks only). Defaults to the
        package-wide ``esgf_qa._constants.checker_dict``.
    checkers_ext : dict, optional
        Like ``checkers``, but extended by the esgf_qa inter-file/dataset
        checks. Defaults to ``esgf_qa._constants.checker_dict_ext``.

    Attributes
    ----------
    summary : dict of defaultdict
        Nested dictionary structure that stores the aggregated QA results.
        It contains two top-level keys:
        - ``"error"`` : maps checker functions to error messages → dataset IDs → file names.
        - ``"fail"`` : maps test weights → test names → messages → dataset IDs → file names.
    clustered_summary : dict of defaultdict
        Summary structure produced after clustering messages using
        :meth:`cluster_summary`. Keys and nesting mirror ``summary``, but
        messages are generalized and aggregated/clustered across similar text patterns.
    checker_dict : dict
        Mapping of checker identifiers to human-readable names (cc checks only).
    checker_dict_ext : dict
        Mapping of checker identifiers to human-readable names
        (cc checks extended by esgf_qa checks).

    Methods
    -------
    update(result_dict, dsid, file_name)
        Update the summary with a single cc run result (i.e. for one file).
    update_ds(result_dict, dsid)
        Update the summary with results from a single inter-dataset or inter-file
        checker run that comes with esgf-qa.
    sort()
        Sort the summary by test weight and test name for consistent output ordering.
    cluster_messages(messages, threshold)
        Cluster similar message strings based on edit-distance similarity.
    generalize_message_group(messages)
        Derive a generalized message template and placeholder map from a list of
        similar messages.
    merge_placeholders(list_of_strings, dictionary, skip=0)
        Helper to merge adjacent placeholders in message templates where possible.
    cluster_summary(threshold=0.75)
        Cluster and generalize all messages in the current summary using a
        similarity threshold.

    Examples
    --------
    >>> agg = QAResultAggregator()
    >>> result = {
    ...     "cf": {
    ...         "test_1": {"value": (0, 1), "msgs": ["Missing attribute 'units'"]},
    ...     }
    ... }
    >>> agg.update(result, dsid="dataset_001", file_name="tas_day.nc")
    >>> agg.sort()
    >>> agg.cluster_summary(threshold=0.8)
    """

    def __init__(self, checkers=None, checkers_ext=None):
        """
        Initialize the aggregator with an empty summary.

        Parameters
        ----------
        checkers : dict, optional
            Checker-id -> display-name mapping; defaults to the package-wide
            ``checker_dict``.
        checkers_ext : dict, optional
            Extended checker-id -> display-name mapping; defaults to the
            package-wide ``checker_dict_ext``.
        """
        self.summary = {
            # No weight, just function -> error msg -> dsid -> filenames
            "error": defaultdict(lambda: defaultdict(lambda: defaultdict(list))),
            # weight -> test -> msg -> dsid -> filenames
            "fail": defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
            ),
        }
        # Fall back to the package-level mappings for backward compatibility
        # with the previous zero-argument constructor.
        self.checker_dict = checker_dict if checkers is None else checkers
        self.checker_dict_ext = (
            checker_dict_ext if checkers_ext is None else checkers_ext
        )

    def update(self, result_dict, dsid, file_name):
        """
        Update the summary with a single result of a cc-run.

        Parameters
        ----------
        result_dict : dict
            Dictionary containing the results of a single cc-run, keyed by
            checker id. Each checker maps test names to dicts with a
            ``"value"`` (score, max_score) tuple and optional ``"weight"``
            (default 3) and ``"msgs"`` entries; an ``"errors"`` key maps
            function names to error messages.
        dsid : str
            Dataset ID.
        file_name : str
            File name.
        """
        for checker, tests in result_dict.items():
            label = f"[{self.checker_dict[checker]}] "
            for test, outcome in tests.items():
                if test == "errors":
                    # Checker-internal errors carry no score/weight.
                    for function_name, error_msg in outcome.items():
                        self.summary["error"][label + function_name][error_msg][
                            dsid
                        ].append(file_name)
                else:
                    score, max_score = outcome["value"]
                    weight = outcome.get("weight", 3)
                    msgs = outcome.get("msgs", [])
                    if score < max_score:  # test outcome: fail
                        for msg in msgs:
                            self.summary["fail"][weight][label + test][msg][
                                dsid
                            ].append(file_name)

    def update_ds(self, result_dict, dsid):
        """
        Update the summary with a single result of an esgf-qa (inter-file/dataset) run.

        Parameters
        ----------
        result_dict : dict
            Dictionary containing the results of a single esgf-qa
            (inter-file/dataset) run. ``"errors"`` entries map function names
            to ``{"msg": str, "files": [str, ...]}``; other tests carry a
            ``"msgs"`` dict mapping messages to affected file names.
        dsid : str
            Dataset ID.
        """
        for checker, tests in result_dict.items():
            label = f"[{self.checker_dict_ext[checker]}] "
            for test, outcome in tests.items():
                if test == "errors":
                    for function_name, errdict in outcome.items():
                        for file_name in errdict["files"]:
                            self.summary["error"][label + function_name][
                                errdict["msg"]
                            ][dsid].append(file_name)
                else:
                    weight = outcome.get("weight", 3)
                    fails = outcome.get("msgs", {})
                    for msg, file_names in fails.items():
                        for file_name in file_names:
                            self.summary["fail"][weight][label + test][msg][
                                dsid
                            ].append(file_name)

    def sort(self):
        """
        Sort the summary by test weight and test name for consistent output ordering.

        Modifies the `summary` attribute.
        """
        # Highest weight first, then tests alphabetically within each weight.
        self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
        for key in self.summary["fail"]:
            self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))

        # Sort errors by function name
        for checker in self.summary["error"]:
            self.summary["error"][checker] = dict(
                sorted(self.summary["error"][checker].items())
            )

    @staticmethod
    def cluster_messages(messages, threshold):
        """
        Cluster messages based on similarity.

        Note that the input list is consumed (emptied) in the process.

        Parameters
        ----------
        messages : list
            List of messages to cluster.
        threshold : float
            Similarity threshold (``difflib.SequenceMatcher`` ratio in [0, 1]).

        Returns
        -------
        list
            List of clusters (each a list of similar messages).
        """
        clusters = []
        while messages:
            # Greedy single-pass clustering: each remaining message joins the
            # first base message it is similar enough to.
            base = messages.pop(0)
            cluster = [base]
            to_remove = []
            for msg in messages:
                ratio = difflib.SequenceMatcher(None, base, msg).ratio()
                if ratio >= threshold:
                    cluster.append(msg)
                    to_remove.append(msg)
            for msg in to_remove:
                messages.remove(msg)
            clusters.append(cluster)
        return clusters

    @staticmethod
    def generalize_message_group(messages):
        """
        Generalize a group of messages into a single template.

        Token positions that differ between the messages are replaced by
        placeholders ``{A}``, ``{B}``, ... and the first message's token is
        kept as the example value for each placeholder.

        Parameters
        ----------
        messages : list
            List of similar messages to generalize.

        Returns
        -------
        str
            Generalized message.
        dict
            Placeholders (placeholder name -> example token).
        """
        if len(messages) == 1:
            return messages[0], {}

        # Split messages into tokens (words and single non-word characters)
        split_messages = [re.findall(r"\w+|\W", m) for m in messages]
        # NOTE: zip truncates to the shortest token list, so trailing tokens of
        # longer messages are dropped from the template; messages in a cluster
        # are near-identical by construction, which keeps this acceptable.
        transposed = list(zip(*split_messages))
        template = []
        placeholders = {}
        var_index = 0

        for tokens in transposed:
            if len(set(tokens)) == 1:
                template.append(tokens[0])
            else:
                # Placeholder names run A, B, C, ... (no wrap beyond 'Z' —
                # merging below keeps the count small in practice).
                var_name = chr(ord("A") + var_index)
                template.append(f"{{{var_name}}}")
                placeholders[var_name] = tokens[0]
                var_index += 1

        # Merge placeholders if possible
        template, placeholders = QAResultAggregator.merge_placeholders(
            template, placeholders
        )

        # Return the generalized message and the placeholders
        return "".join(template), placeholders

    @staticmethod
    def merge_placeholders(list_of_strings, dictionary, skip=0):
        """
        Merge adjacent placeholders in message templates where possible.

        Two placeholders separated by fewer than 5 characters of fixed text are
        merged into one (the separating text becomes part of the example
        value). Avoids too many placeholders in a clustered message.

        Parameters
        ----------
        list_of_strings : list
            Template token list (placeholders are of the form ``"{A}"``).
        dictionary : dict
            Dictionary of placeholders (name -> example value).
        skip : int, optional
            Number of placeholders to skip, by default 0.

        Returns
        -------
        list
            List of strings with placeholders merged.
        dict
            Dictionary of placeholders.
        """

        def find_next_two_placeholders(list_of_strings, skip):
            # Return the (skip)-th and (skip+1)-th placeholder tokens, or None
            # where not enough placeholders remain.
            placeholders = [
                s for s in list_of_strings if s.startswith("{") and s.endswith("}")
            ]
            if len(placeholders) < 2:
                return None, None
            return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
                placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
            )

        def extract_text_between_placeholders(
            list_of_strings, placeholder1, placeholder2
        ):
            idx1 = list_of_strings.index(placeholder1)
            idx2 = list_of_strings.index(placeholder2)
            return "".join(list_of_strings[idx1 + 1 : idx2])

        def merge_two_placeholders(placeholder1, placeholder2, text_between, dictionary):
            # Fold the second placeholder's value (plus the fixed text between)
            # into the first placeholder, then drop the second.
            key1 = placeholder1.lstrip("{").rstrip("}")
            key2 = placeholder2.lstrip("{").rstrip("}")
            dictionary[key1] = dictionary[key1] + text_between + dictionary[key2]
            del dictionary[key2]
            return dictionary

        def update_placeholder_names(list_of_strings, dictionary):
            # Re-letter the remaining placeholders to a contiguous A, B, C, ...
            old_placeholders = sorted(dictionary.keys())
            new_placeholders = [
                chr(ord("A") + i) for i in range(len(old_placeholders))
            ]
            new_dictionary = dict(
                zip(new_placeholders, [dictionary[val] for val in old_placeholders])
            )
            for old, new in zip(old_placeholders, new_placeholders):
                list_of_strings = [
                    s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
                ]
            return list_of_strings, new_dictionary

        def replace_placeholders_with_new_one(
            list_of_strings, placeholder1, placeholder2
        ):
            # Keep everything up to (and including) the first placeholder and
            # everything after the second; the span in between has been merged.
            idx1 = list_of_strings.index(placeholder1)
            idx2 = list_of_strings.index(placeholder2)
            return (
                list_of_strings[:idx1]
                + [placeholder1]
                + list_of_strings[idx2 + 1 :]
            )

        if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
            return list_of_strings, dictionary

        placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
        if placeholder1 is None or placeholder2 is None:
            return list_of_strings, dictionary

        text_between = extract_text_between_placeholders(
            list_of_strings, placeholder1, placeholder2
        )
        if len(text_between) < 5:
            dictionary = merge_two_placeholders(
                placeholder1, placeholder2, text_between, dictionary
            )
            list_of_strings = replace_placeholders_with_new_one(
                list_of_strings, placeholder1, placeholder2
            )
            list_of_strings, dictionary = update_placeholder_names(
                list_of_strings, dictionary
            )
            return QAResultAggregator.merge_placeholders(
                list_of_strings, dictionary, skip
            )
        else:
            return QAResultAggregator.merge_placeholders(
                list_of_strings, dictionary, skip + 1
            )

    def _cluster_and_format(self, msg_map, threshold):
        """
        Cluster the messages of one test and format the clustered result.

        Parameters
        ----------
        msg_map : dict
            One leaf of ``summary``: message -> dataset ID -> list of file names.
        threshold : float
            Similarity threshold forwarded to :meth:`cluster_messages`.

        Returns
        -------
        dict
            Generalized message summary -> {"<dsid> (N file(s) affected)":
            [single (example) file name]}.
        """
        result = {}
        # cluster_messages consumes its input, so hand it a fresh list.
        for cluster in self.cluster_messages(list(msg_map), threshold):
            generalized, placeholders = self.generalize_message_group(cluster)
            example_parts = ", ".join(
                f"{k}='{v[0]}'" if isinstance(v, list) else f"{k}='{v}'"
                for k, v in placeholders.items()
            )
            if example_parts:
                msg_summary = (
                    f"{generalized} ({len(cluster)} occurrences, "
                    f"e.g. {example_parts})"
                )
            else:
                msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"

            # Gather all ds_ids and filenames across the cluster
            combined = defaultdict(set)
            for message in cluster:
                for ds_id, files in msg_map[message].items():
                    combined[ds_id].update(files)

            # Shorten file lists to one example per dataset
            result[msg_summary] = {
                ds_id
                + " ("
                + str(len(files))
                + f" file{'s' if len(files) > 1 else ''} affected)": (
                    [f"e.g. '{next(iter(files))}'"]
                    if len(files) > 1
                    else [f"'{next(iter(files))}'"]
                )
                for ds_id, files in combined.items()
            }
        return result

    def cluster_summary(self, threshold=0.75):
        """
        Cluster messages in the summary into groups of similar messages.

        Drastically reduces the number of messages in the summary for datasets
        accumulating large numbers of check failure messages.

        Parameters
        ----------
        threshold : float, optional
            The threshold for similarity between messages, by default 0.75.

        Returns
        -------
        None
            Modifies the `clustered_summary` attribute.
        """
        self.clustered_summary = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
        )
        for status in self.summary:
            if status == "error":
                # error: test_id -> msg -> dsid -> files
                for test_id, msg_map in self.summary[status].items():
                    for msg_summary, formatted in self._cluster_and_format(
                        msg_map, threshold
                    ).items():
                        self.clustered_summary[status][test_id][msg_summary] = (
                            formatted
                        )
            elif status == "fail":
                # fail: weight -> test_id -> msg -> dsid -> files
                for weight, tests in self.summary[status].items():
                    for test_id, msg_map in tests.items():
                        for msg_summary, formatted in self._cluster_and_format(
                            msg_map, threshold
                        ).items():
                            self.clustered_summary[status][weight][test_id][
                                msg_summary
                            ] = formatted