esgf-qa 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/con_checks.py ADDED
@@ -0,0 +1,634 @@
1
+ import json
2
+ from collections import ChainMap, OrderedDict, defaultdict
3
+
4
+ import cftime
5
+ import xarray as xr
6
+
7
+ from esgf_qa._constants import deltdic
8
+
9
+
10
+ def level2_factory():
11
+ return defaultdict(list)
12
+
13
+
14
+ def level1_factory():
15
+ return defaultdict(level2_factory)
16
+
17
+
18
+ def level0_factory():
19
+ return defaultdict(level1_factory)
20
+
21
+
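These nested defaultdict factories let the check functions below write results[test]["weight"] and results[test]["msgs"][message].append(file) without creating the intermediate dictionaries first. A minimal sketch of the resulting structure (hypothetical test name, message, and file):

    from collections import defaultdict

    results = defaultdict(level1_factory)
    results["Dimensions"]["weight"] = 3
    results["Dimensions"]["msgs"]["The following dimensions differ: lat"].append("file_2.nc")
    # results now holds {"Dimensions": {"weight": 3, "msgs": {"The following dimensions differ: lat": ["file_2.nc"]}}}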
22
+ def printtimedelta(d):
23
+ """Return timedelta (s) as either min, hours, days, whatever fits best."""
24
+ if d > 86000:
25
+ return f"{d/86400.} days"
26
+ if d > 3500:
27
+ return f"{d/3600.} hours"
28
+ if d > 50:
29
+ return f"{d/60.} minutes"
30
+ else:
31
+ return f"{d} seconds"
32
+
33
+
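A quick sketch of the strings printtimedelta produces for a few gap sizes (illustrative inputs only):

    printtimedelta(30)     # -> '30 seconds'
    printtimedelta(90)     # -> '1.5 minutes'
    printtimedelta(5400)   # -> '1.5 hours'
    printtimedelta(86400)  # -> '1.0 days'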
34
+ def truncate_str(s, max_length=16):
35
+ if max_length <= 15 or len(s) <= max_length:
36
+ return s
37
+
38
+ # Select start and end of string
39
+ words = s.split()
40
+ start = ""
41
+ end = ""
42
+
43
+ for i in range(len(words)):
44
+ if len(" ".join(words[: i + 1])) >= 6:
45
+ start = " ".join(words[: i + 1])
46
+ break
47
+
48
+ for i in range(len(words) - 1, -1, -1):
49
+ if len(" ".join(words[i:])) >= 6:
50
+ end = " ".join(words[i:])
51
+ break
52
+
53
+ # Return truncated string
54
+ if len(start) + len(end) + 3 >= len(s):
55
+ return s
56
+ else:
57
+ return f"{start}...{end}"
58
+
59
+
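truncate_str keeps roughly the first and last words of an overly long string; a usage sketch with hypothetical inputs:

    truncate_str("seaIce ocean atmosphere land", max_length=16)
    # -> 'seaIce...atmosphere land'
    truncate_str("short name", max_length=16)
    # -> 'short name' (already short enough, returned unchanged)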
60
+ def compare_dicts(dict1, dict2, exclude_keys=None):
61
+ if exclude_keys is None:
62
+ exclude_keys = set()
63
+ else:
64
+ exclude_keys = set(exclude_keys)
65
+
66
+ # Get all keys that are in either dictionary, excluding the ones to skip
67
+ all_keys = (set(dict1) | set(dict2)) - exclude_keys
68
+
69
+ # Collect keys with differing values
70
+ differing_keys = [key for key in all_keys if dict1.get(key) != dict2.get(key)]
71
+
72
+ return differing_keys
73
+
74
+
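A usage sketch of compare_dicts with hypothetical attribute dictionaries; only keys whose values differ and that are not excluded are reported:

    compare_dicts(
        {"institution_id": "X", "mip_era": "CMIP6", "history": "a"},
        {"institution_id": "X", "mip_era": "CMIP7", "history": "b"},
        exclude_keys=["history"],
    )
    # -> ['mip_era']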
75
+ def compare_nested_dicts(dict1, dict2, exclude_keys=None):
76
+ diffs = {}
77
+
78
+ all_root_keys = set(dict1) | set(dict2)
79
+
80
+ for root_key in all_root_keys:
81
+ subdict1 = dict1.get(root_key, {})
82
+ subdict2 = dict2.get(root_key, {})
83
+
84
+ if not isinstance(subdict1, dict) or not isinstance(subdict2, dict):
85
+ if subdict1 != subdict2:
86
+ diffs[root_key] = []
87
+ continue
88
+
89
+ diffs_k = compare_dicts(subdict1, subdict2, exclude_keys)
90
+
91
+ if diffs_k:
92
+ diffs[root_key] = diffs_k
93
+
94
+ return diffs
95
+
96
+
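A usage sketch of compare_nested_dicts with hypothetical per-variable attribute dictionaries; for each variable it lists the attribute keys that differ:

    compare_nested_dicts(
        {"tas": {"units": "K", "standard_name": "air_temperature"}},
        {"tas": {"units": "degC", "standard_name": "air_temperature"}},
    )
    # -> {'tas': ['units']}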
97
+ def consistency_checks(ds, ds_map, files_to_check_dict, checker_options):
98
+ results = defaultdict(level1_factory)
99
+ filelist = sorted(ds_map[ds])
100
+ consistency_files = OrderedDict(
101
+ (files_to_check_dict[i]["consistency_file"], i) for i in filelist
102
+ )
103
+
104
+ # Exclude the following global attributes from comparison
105
+ excl_global_attrs = ["creation_date", "history", "tracking_id"]
106
+
107
+ # Exclude the following variable attributes from comparison
108
+ excl_var_attrs = []
109
+
110
+ # Exclude the following coordinates from comparison
111
+ excl_coords = []
112
+
113
+ # Compare each file with reference
114
+ reference_file = list(consistency_files.keys())[0]
115
+ with open(reference_file) as fr:
116
+ reference_data = json.load(fr)
117
+ for file in consistency_files.keys():
118
+ if file == reference_file:
119
+ continue
120
+ with open(file) as fc:
121
+ data = json.load(fc)
122
+
123
+ # Compare required global attributes
124
+ test = "Required global attributes"
125
+ results[test]["weight"] = 3
126
+ diff_keys = compare_dicts(
127
+ reference_data["global_attributes"],
128
+ data["global_attributes"],
129
+ exclude_keys=excl_global_attrs,
130
+ )
131
+ if diff_keys:
132
+ err_msg = "The following global attributes differ: " + ", ".join(
133
+ sorted(diff_keys)
134
+ )
135
+ results[test]["msgs"][err_msg].append(consistency_files[file])
136
+
137
+ # Compare non-required global attributes
138
+ test = "Non-required global attributes"
139
+ results[test]["weight"] = 1
140
+ diff_keys = compare_dicts(
141
+ reference_data["global_attributes_non_required"],
142
+ data["global_attributes_non_required"],
143
+ exclude_keys=excl_global_attrs,
144
+ )
145
+ if diff_keys:
146
+ err_msg = (
147
+ "The following non-required global attributes differ: "
148
+ + ", ".join(sorted(diff_keys))
149
+ )
150
+ results[test]["msgs"][err_msg].append(consistency_files[file])
151
+
152
+ # Compare global attributes dtypes
153
+ test = "Global attributes data types"
154
+ results[test]["weight"] = 3
155
+ diff_keys = compare_dicts(
156
+ reference_data["global_attributes_dtypes"],
157
+ data["global_attributes_dtypes"],
158
+ exclude_keys=[],
159
+ )
160
+ if diff_keys:
161
+ diff_keys = [
162
+ key
163
+ for key in diff_keys
164
+ if key in reference_data["global_attributes_dtypes"]
165
+ and key in data["global_attributes_dtypes"]
166
+ ]
167
+ if diff_keys:
168
+ err_msg = (
169
+ "The following global attributes have inconsistent data types: "
170
+ + ", ".join(sorted(diff_keys))
171
+ )
172
+ results[test]["msgs"][err_msg].append(consistency_files[file])
173
+
174
+ # Compare variable attributes
175
+ test = "Variable attributes"
176
+ results[test]["weight"] = 3
177
+ diff_keys = compare_nested_dicts(
178
+ reference_data["variable_attributes"],
179
+ data["variable_attributes"],
180
+ exclude_keys=excl_var_attrs,
181
+ )
182
+ if diff_keys:
183
+ for key, diff in diff_keys.items():
184
+ if diff:
185
+ err_msg = (
186
+ f"For variable '{key}' the following variable attributes differ: "
187
+ + ", ".join(sorted(diff))
188
+ )
189
+ results[test]["msgs"][err_msg].append(
190
+ consistency_files[file]
191
+ )
192
+ else:
193
+ err_msg = f"Variable '{key}' not present."
194
+ if key not in data["variable_attributes"]:
195
+ results[test]["msgs"][err_msg].append(
196
+ consistency_files[file]
197
+ )
198
+ else:
199
+ results[test]["msgs"][err_msg].append(
200
+ consistency_files[reference_file]
201
+ )
202
+
203
+ # Compare variable attributes data types
204
+ test = "Variable attributes data types"
205
+ results[test]["weight"] = 3
206
+ diff_keys = compare_nested_dicts(
207
+ reference_data["variable_attributes_dtypes"],
208
+ data["variable_attributes_dtypes"],
209
+ exclude_keys=[],
210
+ )
211
+ if diff_keys:
212
+ for key, diff in diff_keys.items():
213
+ if diff:
214
+ err_msg = (
215
+ f"For variable '{key}' the following variable attributes have inconsistent data types: "
216
+ + ", ".join(sorted(diff))
217
+ )
218
+ results[test]["msgs"][err_msg].append(
219
+ consistency_files[file]
220
+ )
221
+
222
+ # Compare dimensions
223
+ test = "Dimensions"
224
+ results[test]["weight"] = 3
225
+ diff_keys = compare_dicts(
226
+ reference_data["dimensions"],
227
+ data["dimensions"],
228
+ exclude_keys=["time"],
229
+ )
230
+ if diff_keys:
231
+ err_msg = "The following dimensions differ: " + ", ".join(
232
+ sorted(diff_keys)
233
+ )
234
+ results[test]["msgs"][err_msg].append(consistency_files[file])
235
+
236
+ # Compare coordinates
237
+ test = "Coordinates"
238
+ results[test]["weight"] = 3
239
+ diff_keys = compare_dicts(
240
+ reference_data["coordinates"],
241
+ data["coordinates"],
242
+ exclude_keys=excl_coords,
243
+ )
244
+ if diff_keys:
245
+ err_msg = "The following coordinates differ: " + ", ".join(
246
+ sorted(diff_keys)
247
+ )
248
+ results[test]["msgs"][err_msg].append(consistency_files[file])
249
+
250
+ return results
251
+
252
+
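consistency_checks and the functions below read one JSON sidecar ("consistency_file") per NetCDF file. Judging from the keys accessed above, json.load is assumed to return roughly the following structure (hypothetical values and type encoding):

    {
        "global_attributes": {"institution_id": "X", "source_id": "Y"},
        "global_attributes_non_required": {"comment": "example"},
        "global_attributes_dtypes": {"institution_id": "str"},
        "variable_attributes": {"tas": {"units": "K"}},
        "variable_attributes_dtypes": {"tas": {"units": "str"}},
        "dimensions": {"time": 1980, "lat": 96, "lon": 192},
        "coordinates": {"lat": [-45.0, 45.0], "height": 2.0},
        "time_info": {
            "time0": 15.5, "timen": 45.0, "bound0": 0.0, "boundn": 59.0,
            "units": "days since 1850-01-01", "calendar": "standard",
            "frequency": "mon",
        },
    }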
253
+ def continuity_checks(ds, ds_map, files_to_check_dict, checker_options):
254
+ results = defaultdict(level1_factory)
255
+ filelist = sorted(ds_map[ds])
256
+ consistency_files = OrderedDict(
257
+ (files_to_check_dict[i]["consistency_file"], i) for i in filelist
258
+ )
259
+
260
+ # Check time and time_bnds continuity
261
+ test = "Time continuity"
262
+ results[test]["weight"] = 3
263
+ timen = None
264
+ boundn = None
265
+ i = 0
266
+ for file in consistency_files.keys():
267
+ with open(file) as fc:
268
+ data = json.load(fc)
269
+ i += 1
270
+ prev_timen = timen
271
+ prev_boundn = boundn
272
+ timen = (
273
+ cftime.num2date(
274
+ data["time_info"]["timen"],
275
+ units=data["time_info"]["units"],
276
+ calendar=data["time_info"]["calendar"],
277
+ )
278
+ if data["time_info"]["timen"]
279
+ else None
280
+ )
281
+ boundn = (
282
+ cftime.num2date(
283
+ data["time_info"]["boundn"],
284
+ units=data["time_info"]["units"],
285
+ calendar=data["time_info"]["calendar"],
286
+ )
287
+ if data["time_info"]["boundn"]
288
+ else None
289
+ )
290
+ if i == 1:
291
+ continue
292
+ time0 = (
293
+ cftime.num2date(
294
+ data["time_info"]["time0"],
295
+ units=data["time_info"]["units"],
296
+ calendar=data["time_info"]["calendar"],
297
+ )
298
+ if data["time_info"]["time0"]
299
+ else None
300
+ )
301
+ bound0 = (
302
+ cftime.num2date(
303
+ data["time_info"]["bound0"],
304
+ units=data["time_info"]["units"],
305
+ calendar=data["time_info"]["calendar"],
306
+ )
307
+ if data["time_info"]["bound0"]
308
+ else None
309
+ )
310
+ freq = data["time_info"]["frequency"]
311
+ if (time0 or timen or bound0 or boundn) and not freq:
312
+ err_msg = "Frequency could not be inferred"
313
+ results[test]["msgs"][err_msg].append(consistency_files[file])
314
+ continue
315
+ elif (time0 or timen or bound0 or boundn) and freq not in deltdic:
316
+ err_msg = f"Unsupported frequency '{freq}'"
317
+ continue
318
+
319
+ if time0 and prev_timen:
320
+ delt = time0 - prev_timen
321
+ delts = delt.total_seconds()
322
+ if delts > deltdic[freq + "max"] or delts < deltdic[freq + "min"]:
323
+ err_msg = f"Gap in time axis (between files) - previous {prev_timen} - current {time0} - delta-t {printtimedelta(delts)}"
324
+ results[test]["msgs"][err_msg].append(consistency_files[file])
325
+
326
+ if bound0 and prev_boundn:
327
+ delt_bnd = bound0 - prev_boundn
328
+ delts_bnd = delt_bnd.total_seconds()
329
+ if delts_bnd < -1:
330
+ err_msg = f"Overlapping time bounds (between files) - previous {prev_boundn} - current {bound0} - delta-t {printtimedelta(delts_bnd)}"
331
+ results[test]["msgs"][err_msg].append(consistency_files[file])
332
+ if delts_bnd > 1:
333
+ err_msg = f"Gap in time bounds (between files) - previous {prev_boundn} - current {bound0} - delta-t {printtimedelta(delts_bnd)}"
334
+ results[test]["msgs"][err_msg].append(consistency_files[file])
335
+
336
+ return results
337
+
338
+
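The gap test above depends on deltdic from esgf_qa._constants. From the lookups deltdic[freq + "min"] / deltdic[freq + "max"] (and the freq not in deltdic guard), it is assumed to map frequency strings, plus their "min"/"max" variants, to allowed time steps in seconds, roughly like this hypothetical excerpt:

    deltdic = {
        "day": 86400, "daymin": 86400, "daymax": 86400,
        "mon": 30 * 86400, "monmin": 28 * 86400, "monmax": 31 * 86400,
    }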
339
+ def compatibility_checks(ds, ds_map, files_to_check_dict, checker_options):
340
+ results = defaultdict(level1_factory)
341
+ filelist = sorted(ds_map[ds])
342
+
343
+ # open_mfdataset - override
344
+ test = "xarray open_mfdataset - override"
345
+ results[test]["weight"] = 3
346
+ try:
347
+ with xr.open_mfdataset(filelist, coords="minimal", compat="override") as ds:
348
+ pass
349
+ except Exception as e:
350
+ results[test]["msgs"][str(e)].extend(filelist)
351
+
352
+ # open_mfdataset - no_conflicts
353
+ test = "xarray open_mfdataset - no_conflicts"
354
+ results[test]["weight"] = 3
355
+ try:
356
+ with xr.open_mfdataset(filelist, coords="minimal", compat="no_conflicts") as ds:
357
+ pass
358
+ except Exception as e:
359
+ results[test]["msgs"][str(e)].extend(filelist)
360
+
361
+ return results
362
+
363
+
364
+ def dataset_coverage_checks(ds_map, files_to_check_dict, checker_options):
365
+ results = defaultdict(level0_factory)
366
+ test = "Time coverage"
367
+
368
+ coverage_start = dict()
369
+ coverage_end = dict()
370
+
371
+ # Extract time coverage for each dataset
372
+ for ds in ds_map.keys():
373
+ fl = sorted(ds_map[ds])
374
+ ts0 = None
375
+ tsn = None
376
+ try:
377
+ if files_to_check_dict[fl[0]]["ts"] != "":
378
+ ts0 = files_to_check_dict[fl[0]]["ts"].split("-")[0][0:4]
379
+ # If time interval of timestamp does not start in January, use following year
380
+ if len(files_to_check_dict[fl[0]]["ts"].split("-")[0]) >= 6:
381
+ if files_to_check_dict[fl[0]]["ts"].split("-")[0][4:6] != "01":
382
+ coverage_start[ds] = int(ts0) + 1
383
+ else:
384
+ coverage_start[ds] = int(ts0)
385
+ else:
+ coverage_start[ds] = int(ts0)
386
+ if files_to_check_dict[fl[-1]]["ts"] != "":
387
+ tsn = files_to_check_dict[fl[-1]]["ts"].split("-")[1][0:4]
388
+ # If time interval of timestamp ends in January, use previous year
389
+ if len(files_to_check_dict[fl[-1]]["ts"].split("-")[1]) >= 6:
390
+ if files_to_check_dict[fl[-1]]["ts"].split("-")[1][4:6] == "01":
391
+ coverage_end[ds] = int(tsn) - 1
392
+ else:
393
+ coverage_end[ds] = int(tsn)
394
+ else:
395
+ coverage_end[ds] = int(tsn)
396
+ if ts0 is None and tsn is None:
397
+ continue
398
+ elif ts0 is None:
399
+ results[ds][test]["weight"] = 1
400
+ results[ds][test]["msgs"][
401
+ "Begin of time coverage cannot be inferred."
402
+ ] = [fl[0]]
403
+ continue
404
+ elif tsn is None:
405
+ results[ds][test]["weight"] = 1
406
+ results[ds][test]["msgs"][
407
+ "End of time coverage cannot be inferred."
408
+ ] = [fl[-1]]
409
+ continue
410
+ except (IndexError, ValueError):
411
+ results[ds][test]["weight"] = 1
412
+ if len(fl) > 1:
413
+ results[ds][test]["msgs"]["Time coverage cannot be inferred."] = [
414
+ fl[0],
415
+ fl[-1],
416
+ ]
417
+ else:
418
+ results[ds][test]["msgs"]["Time coverage cannot be inferred."] = [fl[0]]
419
+ continue
420
+
421
+ # Compare coverage
422
+ if len(coverage_start.keys()) > 1:
423
+ scov = min(coverage_start.values())
424
+ ecov = max(coverage_end.values())
425
+ # Get all ds where coverage_start differs
426
+ for ds in coverage_start.keys():
427
+ fl = sorted(ds_map[ds])
428
+ if coverage_start[ds] != scov:
429
+ results[ds][test]["weight"] = 1
430
+ results[ds][test]["msgs"][
431
+ f"Time series starts at '{coverage_start[ds]}' while other time series start at '{scov}'"
432
+ ] = [fl[0]]
433
+ if ds in coverage_end and coverage_end[ds] != ecov:
434
+ results[ds][test]["weight"] = 1
435
+ results[ds][test]["msgs"][
436
+ f"Time series ends at '{coverage_end[ds]}' while other time series end at '{ecov}'"
437
+ ] = [fl[-1]]
438
+
439
+ return results
440
+
441
+
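The coverage test above parses the filename timestamp stored in files_to_check_dict[...]["ts"], assumed to be a 'YYYYMM-YYYYMM'-style range. A worked sketch with hypothetical values:

    ts_first = "185002-189912"  # starts in February -> coverage_start becomes 1850 + 1 = 1851
    ts_last = "190001-194901"   # ends in January -> coverage_end becomes 1949 - 1 = 1948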
442
+ def inter_dataset_consistency_checks(ds_map, files_to_check_dict, checker_options):
443
+ results = defaultdict(level0_factory)
444
+ filedict = {}
445
+ consistency_data = {}
446
+ for ds in ds_map.keys():
447
+ filedict[ds] = sorted(ds_map[ds])[0]
448
+
449
+ # Exclude the following global attributes from comparison
450
+ excl_global_attrs = [
451
+ "creation_date",
452
+ "history",
453
+ "tracking_id",
454
+ "variable_id",
455
+ "frequency",
456
+ "external_variables",
457
+ "table_id",
458
+ "grid",
459
+ "grid_label",
460
+ "realm",
461
+ "modeling_realm",
462
+ ]
463
+
464
+ # Include the following global attributes in the realm-specific comparison
465
+ incl_global_attrs = ["grid", "grid_label", "realm", "modeling_realm"]
466
+
467
+ # Consistency data
468
+ for ds, dsfile0 in filedict.items():
469
+ consistency_file = files_to_check_dict[dsfile0]["consistency_file"]
470
+ with open(consistency_file) as f:
471
+ data = json.load(f)
472
+ consistency_data[ds] = data
473
+
474
+ # Reference datasets
475
+ ref_ds = dict()
476
+
477
+ # Compare each file with reference
478
+ for ds, data in consistency_data.items():
479
+ # Select first dataset as main reference
480
+ if "Main" not in ref_ds:
481
+ ref_ds["Main"] = ds
482
+ # Also group datasets by realm and grid label
483
+ # for grid / realm specific consistency checks
484
+ realm = ChainMap(
485
+ data["global_attributes"], data["global_attributes_non_required"]
486
+ ).get("realm", None)
487
+ if not realm:
488
+ realm = ChainMap(
489
+ data["global_attributes"], data["global_attributes_non_required"]
490
+ ).get("modeling_realm", None)
491
+ if not realm:
492
+ realm = "Default"
493
+ gridlabel = ChainMap(
494
+ data["global_attributes"], data["global_attributes_non_required"]
495
+ ).get("grid_label", None)
496
+ if not gridlabel:
497
+ gridlabel = ChainMap(
498
+ data["global_attributes"], data["global_attributes_non_required"]
499
+ ).get("grid", None)
500
+ if not gridlabel:
501
+ gridlabel = "Default"
502
+ ref_ds_key = f"{realm}/{gridlabel}"
503
+ if ref_ds_key not in ref_ds:
504
+ ref_ds[ref_ds_key] = ds
505
+ continue
506
+ else:
507
+ reference_data_rg = consistency_data[ref_ds[ref_ds_key]]
508
+ reference_data = consistency_data[ref_ds["Main"]]
509
+
510
+ # Compare required global attributes
511
+ test = "Required global attributes (Inter-Dataset)"
512
+ results[ds][test]["weight"] = 2
513
+ diff_keys = compare_dicts(
514
+ reference_data["global_attributes"],
515
+ data["global_attributes"],
516
+ exclude_keys=excl_global_attrs,
517
+ )
518
+ if diff_keys:
519
+ err_msg = (
520
+ "The following global attributes differ between datasets: "
521
+ + ", ".join(sorted(diff_keys))
522
+ )
523
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
524
+
525
+ # Compare specific global attributes
526
+ test = "Realm-specific global attributes (Inter-Dataset)"
527
+ results[ds][test]["weight"] = 2
528
+ diff_keys = compare_dicts(
529
+ {
530
+ k: ChainMap(
531
+ reference_data_rg["global_attributes"],
532
+ reference_data_rg["global_attributes_non_required"],
533
+ ).get(k, "unset")
534
+ for k in incl_global_attrs
535
+ },
536
+ {
537
+ k: ChainMap(
538
+ data["global_attributes"],
539
+ data["global_attributes_non_required"],
540
+ ).get(k, "unset")
541
+ for k in incl_global_attrs
542
+ },
543
+ exclude_keys=[],
544
+ )
545
+ if diff_keys:
546
+ err_msg = (
547
+ f"The following realm-specific global attributes differ between datasets (realm/grid_label: {truncate_str(ref_ds_key.split('/')[0])}/{truncate_str(ref_ds_key.split('/')[1])}): "
548
+ + ", ".join(sorted(diff_keys))
549
+ )
550
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
551
+
552
+ # Compare non-required global attributes
553
+ test = "Non-required global attributes (Inter-Dataset)"
554
+ results[ds][test]["weight"] = 1
555
+ diff_keys = compare_dicts(
556
+ reference_data["global_attributes_non_required"],
557
+ data["global_attributes_non_required"],
558
+ exclude_keys=excl_global_attrs,
559
+ )
560
+ if diff_keys:
561
+ err_msg = (
562
+ "The following non-required global attributes differ between datasets: "
563
+ + ", ".join(sorted(diff_keys))
564
+ )
565
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
566
+
567
+ # Compare global attributes dtypes
568
+ test = "Global attributes data types (Inter-Dataset)"
569
+ results[ds][test]["weight"] = 2
570
+ diff_keys = compare_dicts(
571
+ reference_data["global_attributes_dtypes"],
572
+ data["global_attributes_dtypes"],
573
+ exclude_keys=[],
574
+ )
575
+ if diff_keys:
576
+ err_msg = (
577
+ "The following global attributes have inconsistent data types between datasets: "
578
+ + ", ".join(sorted(diff_keys))
579
+ )
580
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
581
+
582
+ # Compare dimensions
583
+ test = "Dimensions (Inter-Dataset)"
584
+ results[ds][test]["weight"] = 2
585
+ diff_keys = compare_dicts(
586
+ reference_data_rg["dimensions"],
587
+ data["dimensions"],
588
+ exclude_keys=["time", "depth", "lev"],
589
+ )
590
+ if diff_keys:
591
+ err_msg = (
592
+ "The following dimensions differ between datasets: "
593
+ + ", ".join(sorted(diff_keys))
594
+ )
595
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
596
+
597
+ # Compare coordinates
598
+ test = "Coordinates (Inter-Dataset)"
599
+ results[ds][test]["weight"] = 2
600
+ diff_keys = compare_dicts(
601
+ reference_data_rg["coordinates"],
602
+ data["coordinates"],
603
+ exclude_keys=[
604
+ "depth",
605
+ "depth_bnds",
606
+ "lev",
607
+ "lev_bnds",
608
+ "plev",
609
+ "height",
610
+ ],
611
+ )
612
+ if diff_keys:
613
+ err_msg = (
614
+ "The following coordinates differ between datasets: "
615
+ + ", ".join(sorted(diff_keys))
616
+ )
617
+ results[ds][test]["msgs"][err_msg].append(filedict[ds])
618
+
619
+ # List reference datasets
620
+ print("The following datasets were used as reference:")
621
+ print(f" - General reference: {ref_ds['Main']}")
622
+ reference_datasets = {"general_reference": ref_ds["Main"]}
623
+ for key in sorted(list(ref_ds.keys())):
624
+ if key == "Main":
625
+ continue
626
+ else:
627
+ reference_datasets[key] = ref_ds[key]
628
+ print(
629
+ f" - '{truncate_str(key.split('/')[0])}' / '{truncate_str(key.split('/')[1])}' (realm / grid): {ref_ds[key]}"
630
+ )
631
+
632
+ print()
633
+
634
+ return results, reference_datasets