PyPI - esgf-qa - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

esgf-qa 0.3.0 (py3-none-any.whl) → 0.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

esgf_qa/_constants.py +42 -1
esgf_qa/_version.py +2 -2
esgf_qa/cluster_results.py +466 -0
esgf_qa/con_checks.py +209 -11
esgf_qa/run_qa.py +247 -418
{esgf_qa-0.3.0.dist-info → esgf_qa-0.4.0.dist-info}/METADATA +42 -28
esgf_qa-0.4.0.dist-info/RECORD +19 -0
{esgf_qa-0.3.0.dist-info → esgf_qa-0.4.0.dist-info}/top_level.txt +1 -1
tests/test_cli.py +267 -0
tests/test_cluster_results.py +166 -0
tests/test_con_checks.py +263 -0
tests/test_qaviewer.py +147 -0
tests/test_run_dummy_qa.py +191 -0
tests/test_run_qa.py +181 -0
docs/esgf-qa_Logo.png +0 -0
esgf_qa-0.3.0.dist-info/RECORD +0 -13
{esgf_qa-0.3.0.dist-info → esgf_qa-0.4.0.dist-info}/WHEEL +0 -0
{esgf_qa-0.3.0.dist-info → esgf_qa-0.4.0.dist-info}/entry_points.txt +0 -0
{esgf_qa-0.3.0.dist-info → esgf_qa-0.4.0.dist-info}/licenses/LICENSE +0 -0

esgf_qa/run_qa.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import argparse
 import csv
 import datetime
-import difflib
 import hashlib
 import json
 import multiprocessing
@@ -14,45 +13,19 @@ from pathlib import Path
 from compliance_checker import __version__ as cc_version
 from compliance_checker.runner import CheckSuite
+from esgf_qa._constants import (
+    DRS_path_parent,
+    checker_dict,
+    checker_dict_ext,
+    checker_release_versions,
+)
 from esgf_qa._version import version
+from esgf_qa.cluster_results import QAResultAggregator
 from esgf_qa.con_checks import compatibility_checks as comp  # noqa
 from esgf_qa.con_checks import consistency_checks as cons  # noqa
 from esgf_qa.con_checks import continuity_checks as cont  # noqa
 from esgf_qa.con_checks import dataset_coverage_checks, inter_dataset_consistency_checks
-checker_dict = {
-    "cc6": "CORDEX-CMIP6",
-    "cf": "CF-Conventions",
-    "mip": "MIP",
-    "plugin_cmip6": "CMIP6",
-    # "wcrp-cmip5": "CMIP5",
-    "wcrp_cmip6": "CMIP6",
-    # "wcrp_cmip7": "CMIP7-AFT",
-    # "wcrp_cmip7": "CMIP7",
-    # "wcrp_cordex": "CORDEX",
-    "wcrp_cordex_cmip6": "CORDEX-CMIP6",
-    # "obs4mips": "Obs4MIPs",
-    # "input4mips": "Input4MIPs",
-}
-DRS_path_parent = {
-    "CMIP5": "CMIP5",
-    "CMIP6": "CMIP6",
-    "CMIP7": "CMIP7",
-    "CMIP7-AFT": "CMIP7",
-    "CORDEX": "CORDEX",
-    "CORDEX-CMIP6": "CORDEX-CMIP6",
-    "Obs4MIPs": "Obs4MIPs",
-    "Input4MIPs": "Input4MIPs",
-}
-checker_release_versions = {}
-checker_dict_ext = {
-    # "pcons": "ParentConsistency"
-    "cons": "Consistency",
-    "cont": "Continuity",
-    "comp": "Compatibility",
-    **checker_dict,
-}
 _timestamp_with_ms = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
 _timestamp_filename = datetime.datetime.strptime(
     _timestamp_with_ms, "%Y%m%d-%H%M%S%f"
@@ -62,321 +35,15 @@ _timestamp_pprint = datetime.datetime.strptime(
 ).strftime("%Y-%m-%d %H:%M")
-class QAResultAggregator:
-    def __init__(self, checker_dict):
-        """
-        Initialize the aggregator with an empty summary.
-        """
-        self.summary = {
-            "error": defaultdict(
-                lambda: defaultdict(lambda: defaultdict(list))
-            ),  # No weight, just function -> error msg
-            "fail": defaultdict(
-                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
-            ),  # weight -> test -> msg -> dsid -> filenames
-        }
-        self.checker_dict = checker_dict
-    def update(self, result_dict, dsid, file_name):
-        """
-        Update the summary with a single result of a cc-run.
-        """
-        for checker in result_dict:
-            for test in result_dict[checker]:
-                if test == "errors":
-                    for function_name, error_msg in result_dict[checker][
-                        "errors"
-                    ].items():
-                        self.summary["error"][
-                            f"[{checker_dict[checker]}] " + function_name
-                        ][error_msg][dsid].append(file_name)
-                else:
-                    score, max_score = result_dict[checker][test]["value"]
-                    weight = result_dict[checker][test].get("weight", 3)
-                    msgs = result_dict[checker][test].get("msgs", [])
-                    if score < max_score:  # test outcome: fail
-                        for msg in msgs:
-                            self.summary["fail"][weight][
-                                f"[{checker_dict[checker]}] " + test
-                            ][msg][dsid].append(file_name)
-    def update_ds(self, result_dict, dsid):
-        """
-        Update the summary with a single result of a esgf-qa run.
-        """
-        for checker in result_dict:
-            for test in result_dict[checker]:
-                if test == "errors":
-                    for function_name, errdict in result_dict[checker][
-                        "errors"
-                    ].items():
-                        for file_name in errdict["files"]:
-                            self.summary["error"][
-                                f"[{checker_dict_ext[checker]}] " + function_name
-                            ][errdict["msg"]][dsid].append(file_name)
-                else:
-                    weight = result_dict[checker][test].get("weight", 3)
-                    fails = result_dict[checker][test].get("msgs", {})
-                    for msg, file_names in fails.items():
-                        for file_name in file_names:
-                            self.summary["fail"][weight][
-                                f"[{checker_dict_ext[checker]}] " + test
-                            ][msg][dsid].append(file_name)
-    def sort(self):
-        """
-        Sort the summary.
-        """
-        self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
-        for key in self.summary["fail"]:
-            self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
-        # Sort errors by function name
-        for checker in self.summary["error"]:
-            self.summary["error"][checker] = dict(
-                sorted(self.summary["error"][checker].items())
-            )
-    @staticmethod
-    def cluster_messages(messages, threshold):
-        clusters = []
-        while messages:
-            base = messages.pop(0)
-            cluster = [base]
-            to_remove = []
-            for msg in messages:
-                ratio = difflib.SequenceMatcher(None, base, msg).ratio()
-                if ratio >= threshold:
-                    cluster.append(msg)
-                    to_remove.append(msg)
-            for msg in to_remove:
-                messages.remove(msg)
-            clusters.append(cluster)
-        return clusters
-    @staticmethod
-    def generalize_message_group(messages):
-        if len(messages) == 1:
-            return messages[0], {}
-        # Split messages into tokens
-        split_messages = [re.findall(r"\w+|\W", m) for m in messages]
-        transposed = list(zip(*split_messages))
-        template = []
-        placeholders = {}
-        var_index = 0
-        for i, tokens in enumerate(transposed):
-            unique_tokens = set(tokens)
-            if len(unique_tokens) == 1:
-                template.append(tokens[0])
-            else:
-                var_name = chr(ord("A") + var_index)
-                template.append(f"{{{var_name}}}")
-                placeholders[var_name] = tokens[0]
-                var_index += 1
-        # Merge placeholders if possible
-        template, placeholders = QAResultAggregator.merge_placeholders(
-            template, placeholders
-        )
-        # Return the generalized message and the placeholders
-        generalized = "".join(template)
-        return generalized, placeholders
-    @staticmethod
-    def merge_placeholders(list_of_strings, dictionary, skip=0):
-        def find_next_two_placeholders(list_of_strings, skip):
-            placeholders = [
-                s for s in list_of_strings if s.startswith("{") and s.endswith("}")
-            ]
-            if len(placeholders) < 2:
-                return None, None
-            return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
-                placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
-            )
-        def extract_text_between_placeholders(
-            list_of_strings, placeholder1, placeholder2
-        ):
-            idx1 = list_of_strings.index(placeholder1)
-            idx2 = list_of_strings.index(placeholder2)
-            return "".join(list_of_strings[idx1 + 1 : idx2])
-        def merge_two_placeholders(
-            placeholder1, placeholder2, text_between, dictionary
-        ):
-            new_value = (
-                dictionary[placeholder1.lstrip("{").rstrip("}")]
-                + text_between
-                + dictionary[placeholder2.lstrip("{").rstrip("}")]
-            )
-            dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
-            del dictionary[placeholder2.lstrip("{").rstrip("}")]
-            return dictionary
-        def update_placeholder_names(list_of_strings, dictionary):
-            old_placeholders = sorted(list(dictionary.keys()))
-            new_placeholders = [
-                chr(ord("A") + i) for i in range(0, len(old_placeholders))
-            ]
-            new_dictionary = dict(
-                zip(new_placeholders, [dictionary[val] for val in old_placeholders])
-            )
-            for old, new in zip(old_placeholders, new_placeholders):
-                list_of_strings = [
-                    s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
-                ]
-            return list_of_strings, new_dictionary
-        def replace_placeholders_with_new_one(
-            list_of_strings, placeholder1, placeholder2
-        ):
-            idx1 = list_of_strings.index(placeholder1)
-            idx2 = list_of_strings.index(placeholder2)
-            list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
-            if idx2 < len(list_of_strings) + 1:
-                list_of_strings_new += list_of_strings[idx2 + 1 :]
-            return list_of_strings_new
-        if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
-            return list_of_strings, dictionary
-        placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
-        if placeholder1 is None or placeholder2 is None:
-            return list_of_strings, dictionary
-        text_between = extract_text_between_placeholders(
-            list_of_strings, placeholder1, placeholder2
-        )
-        if len(text_between) < 5:
-            dictionary = merge_two_placeholders(
-                placeholder1, placeholder2, text_between, dictionary
-            )
-            list_of_strings = replace_placeholders_with_new_one(
-                list_of_strings, placeholder1, placeholder2
-            )
-            list_of_strings, dictionary = update_placeholder_names(
-                list_of_strings, dictionary
-            )
-            return QAResultAggregator.merge_placeholders(
-                list_of_strings, dictionary, skip
-            )
-        else:
-            return QAResultAggregator.merge_placeholders(
-                list_of_strings, dictionary, skip + 1
-            )
-    def cluster_summary(self, threshold=0.75):
-        self.clustered_summary = defaultdict(
-            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
-        )
-        for status in self.summary:
-            if status == "error":
-                for test_id in self.summary[status]:
-                    messages = list(self.summary[status][test_id].keys())
-                    # Pass a copy of messages to cluster_messages to generate clusters
-                    clusters = QAResultAggregator.cluster_messages(
-                        messages[:], threshold
-                    )
-                    for cluster in clusters:
-                        generalized, placeholders = (
-                            QAResultAggregator.generalize_message_group(cluster)
-                        )
-                        example_parts = ", ".join(
-                            [
-                                (
-                                    f"{k}='{v[0]}'"
-                                    if isinstance(v, list)
-                                    else f"{k}='{v}'"
-                                )
-                                for k, v in placeholders.items()
-                            ]
-                        )
-                        if example_parts:
-                            msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
-                        else:
-                            msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
-                        # Gather all ds_ids and filenames across the cluster
-                        combined = defaultdict(set)
-                        for message in cluster:
-                            for ds_id, files in self.summary[status][test_id][
-                                message
-                            ].items():
-                                combined[ds_id].update(files)
-                        # Shorten file lists to one example
-                        formatted = {
-                            ds_id
-                            + " ("
-                            + str(len(files))
-                            + f" file{'s' if len(files) > 1 else ''} affected)": (
-                                [f"e.g. '{next(iter(files))}'"]
-                                if len(files) > 1
-                                else [f"'{next(iter(files))}'"]
-                            )
-                            for ds_id, files in combined.items()
-                        }
-                        self.clustered_summary[status][test_id][msg_summary] = formatted
-            elif status == "fail":
-                for weight in self.summary[status]:
-                    for test_id in self.summary[status][weight]:
-                        messages = list(self.summary[status][weight][test_id].keys())
-                        # Pass a copy of messages to cluster_messages to generate clusters
-                        clusters = QAResultAggregator.cluster_messages(
-                            messages[:], threshold
-                        )
-                        for cluster in clusters:
-                            generalized, placeholders = (
-                                QAResultAggregator.generalize_message_group(cluster)
-                            )
-                            example_parts = ", ".join(
-                                [
-                                    (
-                                        f"{k}='{v[0]}'"
-                                        if isinstance(v, list)
-                                        else f"{k}='{v}'"
-                                    )
-                                    for k, v in placeholders.items()
-                                ]
-                            )
-                            if example_parts:
-                                msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
-                            else:
-                                msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
-                            # Gather all ds_ids and filenames across the cluster
-                            combined = defaultdict(set)
-                            for message in cluster:
-                                for ds_id, files in self.summary[status][weight][
-                                    test_id
-                                ][message].items():
-                                    combined[ds_id].update(files)
-                            # Shorten file lists to one example
-                            formatted = {
-                                ds_id
-                                + " ("
-                                + str(len(files))
-                                + f" file{'s' if len(files) > 1 else ''} affected)": (
-                                    [f"e.g. '{next(iter(files))}'"]
-                                    if len(files) > 1
-                                    else [f"'{next(iter(files))}'"]
-                                )
-                                for ds_id, files in combined.items()
-                            }
-                            self.clustered_summary[status][weight][test_id][
-                                msg_summary
-                            ] = formatted
 def get_default_result_dir():
+    """
+    Get the default result directory.
+    Returns
+    -------
+    str
+        Default result directory.
+    """
     global _timestamp
     global _timestamp_with_ms
     hash_object = hashlib.md5(_timestamp_with_ms.encode())
@@ -385,7 +52,27 @@ def get_default_result_dir():
         + f"/esgf-qa-results_{_timestamp_filename}_{hash_object.hexdigest()}"
     )
 def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
+    """
+    Get the dataset id for a file.
+    Parameters
+    ----------
+    files_to_check_dict : dict
+        Dictionary of files to check.
+    dataset_files_map_ext : dict
+        Dictionary of dataset files.
+    file_path : str
+        Path to the file.
+    project_id : str
+        Project id.
+    Returns
+    -------
+    str
+        Dataset id.
+    """
     dir_id = files_to_check_dict[file_path]["id_dir"].split("/")
     fn_id = files_to_check_dict[file_path]["id_fn"].split("_")
     if project_id in dir_id:
@@ -397,7 +84,24 @@ def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
         dsid += "." + ".".join(fn_id)
     return dsid
 def get_checker_release_versions(checkers, checker_options={}):
+    """
+    Get the release versions of the checkers.
+    Parameters
+    ----------
+    checkers : list
+        A list of checkers to get the release versions for.
+    checker_options : dict, optional
+        A dictionary of options for the checkers.
+        Example format: {"cf": {"check_dimension_order": True}}
+    Returns
+    -------
+    None
+        Updates the global dictionary ``checker_release_versions``.
+    """
     global checker_release_versions
     global checker_dict
     global checker_dict_ext
@@ -419,11 +123,20 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
     """
     Run the compliance checker on a file with the specified checkers and options.
-    Parameters:
-        file_path (str): Path to the file to be checked.
-        checkers (list): List of checkers to run.
-        checker_options (dict): Dictionary of options for each checker.
-                                Example format: {"cf": {"check_dimension_order": True}}
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to be checked.
+    checkers : list
+        A list of checkers to run.
+    checker_options : dict, optional
+        A dictionary of options for the checkers.
+        Example format: {"cf": {"check_dimension_order": True}}
+    Returns
+    -------
+    dict
+        A dictionary containing the results of the compliance checker.
     """
     check_suite = CheckSuite(options=checker_options)
     check_suite.load_all_available_checkers()
@@ -458,6 +171,21 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
 def track_checked_datasets(checked_datasets_file, checked_datasets):
+    """
+    Track checked datasets.
+    Parameters
+    ----------
+    checked_datasets_file : str
+        The path to the file to track checked datasets.
+    checked_datasets : list
+        A list of checked datasets.
+    Returns
+    -------
+    None
+        Writes the checked datasets to the file.
+    """
     with open(checked_datasets_file, "a") as file:
         writer = csv.writer(file)
         for dataset_id in checked_datasets:
@@ -472,6 +200,29 @@ def process_file(
     processed_files,
     progress_file,
 ):
+    """
+    Runs cc checks for a single file.
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to be checked.
+    checkers : list
+        A list of checkers to run.
+    checker_options : dict
+        A dictionary of options for the checkers.
+    files_to_check_dict : dict
+        A special dictionary mapping files to check to datasets.
+    processed_files : list
+        A list of files that have already been checked.
+    progress_file : str
+        The path to the progress file.
+    Returns
+    -------
+    tuple
+        A tuple containing the file path and the results of the compliance checker.
+    """
     # Read result from disk if check was run previously
     result_file = files_to_check_dict[file_path]["result_file"]
     consistency_file = files_to_check_dict[file_path]["consistency_file"]
@@ -567,6 +318,31 @@ def process_dataset(
     processed_datasets,
     progress_file,
 ):
+    """
+    Runs esgf_qa checks on a dataset.
+    Parameters
+    ----------
+    ds : str
+        Dataset to process.
+    ds_map : dict
+        Dictionary mapping dataset IDs to file paths.
+    checkers : list
+        List of checkers to run.
+    checker_options : dict
+        Dictionary of checker options.
+    files_to_check_dict : dict
+        A special dictionary mapping files to check to datasets.
+    processed_datasets : set
+        Set of processed datasets.
+    progress_file : str
+        Path to progress file.
+    Returns
+    -------
+    tuple
+        Dataset ID and check results.
+    """
     # Read result from disk if check was run previously
     result_file = files_to_check_dict[ds_map[ds][0]]["result_file_ds"]
     if ds in processed_datasets and os.path.isfile(result_file):
@@ -637,10 +413,14 @@ def parse_options(opts):
     is a colon. Adapted from
     https://github.com/ioos/compliance-checker/blob/cbb40ed1981c169b74c954f0775d5bd23005ed23/cchecker.py#L23
-    Parameters:
-        opts: Iterable of strings with options
+    Parameters
+    ----------
+    opts : Iterable of strings
+        Iterable of option strings
-    Returns:
+    Returns
+    -------
+    dict
         Dictionary with keys as checker type (i.e. "mip").
         Each value is a dictionary where keys are checker options and values
         are checker option values or None if not provided.
@@ -649,21 +429,39 @@ def parse_options(opts):
     for opt_str in opts:
         try:
             checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
-            checker_val = checker_val[0] if checker_val else None
+            checker_val = checker_val[0] if checker_val else True
         except ValueError:
             raise ValueError(
                 f"Could not split option '{opt_str}', seems illegally formatted. The required format is: '<checker>:<option_name>[:<option_value>]', eg. 'mip:tables:/path/to/Tables'."
             )
-        if checker_type != "mip":
-            raise ValueError(
-                f"Currently, only options for 'mip' checker are supported, got '{checker_type}'."
-            )
         options_dict[checker_type][checker_opt] = checker_val
     return options_dict
+def _verify_options_dict(options):
+    """
+    Helper function to verify that the options dictionary is correctly formatted.
+    """
+    if not isinstance(options, dict):
+        return False
+    if options == {}:
+        return True
+    try:
+        for checker_type in options.keys():
+            for checker_opt in options[checker_type].keys():
+                checker_val = options[checker_type][checker_opt]
+                if not isinstance(checker_val, (int, float, str, bool, type(None))):
+                    return False
+    except (AttributeError, KeyError):
+        return False
+    # Seems to match the required format
+    return True
 def main():
-    # CLI
+    """
+    CLI entry point.
+    """
     parser = argparse.ArgumentParser(description="Run QA checks")
     parser.add_argument(
         "parent_dir",
@@ -729,39 +527,51 @@ def main():
     # Resume information stored in a json file
     resume_info_file = Path(result_dir, ".resume_info")
+    # Do not allow arguments other than -o/--output_dir, -i/--info and -r/--resume if resuming previous QA run
+    if resume:
+        allowed_with_resume = {"output_dir", "info", "resume"}
+        # Convert Namespace to dict for easier checking
+        set_args = {k for k, v in vars(args).items() if v not in (None, False, [], "")}
+        invalid_args = set_args - allowed_with_resume
+        if invalid_args:
+            parser.error(
+                f"When using -r/--resume, only -o/--output_dir and -i/--info can be set. Invalid: {', '.join(invalid_args)}"
+            )
     # Deal with result_dir
     if not os.path.exists(result_dir):
         if resume:
-            resume = False
-            warnings.warn(
-                "Resume is set but specified output_directory does not exist. Starting a new QA run..."
+            raise FileNotFoundError(
+                f"Resume is set but specified output_directory does not exist: '{result_dir}'."
             )
         os.mkdir(result_dir)
     elif os.listdir(result_dir) != []:
+        required_files = [progress_file, resume_info_file]
+        required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
         if resume:
-            required_files = [progress_file, resume_info_file]
-            required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
             if not all(os.path.isfile(rfile) for rfile in required_files) or not all(
                 os.path.isdir(rpath) for rpath in required_paths
             ):
                 raise Exception(
-                    "Resume is set but specified output_directory cannot be identified as output_directory of a previous QA run."
+                    "Resume is set but specified output_directory cannot be identified as output directory of a previous QA run."
                 )
         else:
-            if "progress.txt" in os.listdir(
-                result_dir
-            ) and ".resume_info" in os.listdir(result_dir):
+            if all(os.path.isfile(rfile) for rfile in required_files) and all(
+                os.path.isdir(rpath) for rpath in required_paths
+            ):
                 raise Exception(
-                    "Specified output_directory is not empty but can be identified as output_directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
+                    "Specified output directory is not empty but can be identified as output directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
                 )
             else:
-                raise Exception("Specified output_directory is not empty.")
+                raise Exception("Specified output directory is not empty.")
     else:
         if resume:
             resume = False
-            warnings.warn(
-                "Resume is set but specified output_directory is empty. Starting a new QA run..."
+            raise FileNotFoundError(
+                f"Resume is set but specified output directory is empty: '{result_dir}'."
             )
+    # When resuming previous QA run
     if resume:
         print(f"Resuming previous QA run in '{result_dir}'")
         with open(os.path.join(result_dir, ".resume_info")) as f:
@@ -770,58 +580,54 @@ def main():
                 required_keys = ["parent_dir", "info", "tests"]
                 if not all(key in resume_info for key in required_keys):
                     raise Exception(
-                        "Invalid .resume_info file. It should contain the keys 'parent_dir', 'info', and 'tests'."
+                        f"Invalid .resume_info file in '{result_dir}'. It should contain the keys 'parent_dir', 'info', and 'tests'."
                     )
                 if not (
                     isinstance(resume_info["parent_dir"], str)
                     and isinstance(resume_info["info"], str)
                     and isinstance(resume_info["tests"], list)
+                    and isinstance(resume_info.get("cl_checker_options", {}), dict)
+                    and isinstance(
+                        resume_info.get("include_consistency_checks", False), bool
+                    )
+                    and _verify_options_dict(resume_info.get("cl_checker_options", {}))
                     and all(isinstance(test, str) for test in resume_info["tests"])
                 ):
                     raise Exception(
-                        "Invalid .resume_info file. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings."
+                        f"Invalid .resume_info file in '{result_dir}'. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings. "
+                        "'cl_checker_options' (optional) should be a nested dictionary of format 'checker:option_name:option_value', and "
+                        "'include_consistency_checks' (optional) should be a boolean."
                     )
             except json.JSONDecodeError:
                 raise Exception(
-                    "Invalid .resume_info file. It should be a valid JSON file."
+                    f"Invalid .resume_info file in '{result_dir}'. It needs to be a valid JSON file."
                 )
-            if tests and sorted(tests) != resume_info["tests"]:
-                raise Exception("Cannot resume a previous QA run with different tests.")
-            else:
-                tests = resume_info["tests"]
+            tests = resume_info["tests"]
+            parent_dir = resume_info["parent_dir"]
             if info and info != resume_info["info"]:
                 warnings.warn(
                     f"<info> argument differs from the originally specified <info> argument ('{resume_info['info']}'). Using the new specification."
                 )
-            if parent_dir is None:
-                parent_dir = resume_info["parent_dir"]
-            if parent_dir and Path(parent_dir) != Path(resume_info["parent_dir"]):
-                raise Exception(
-                    "Cannot resume a previous QA run with different <parent_dir>."
-                )
-            if cl_checker_options and cl_checker_options != resume_info.get(
-                "checker_options", {}
-            ):
-                raise Exception(
-                    "Cannot resume a previous QA run with different <option> arguments."
-                )
-            else:
-                parent_dir = Path(resume_info["parent_dir"])
-            if "include_consistency_checks" in resume_info:
-                include_consistency_checks = resume_info["include_consistency_checks"]
+            cl_checker_options = resume_info.get("checker_options", {})
+            include_consistency_checks = resume_info.get(
+                "include_consistency_checks", False
+            )
     else:
         print(f"Storing check results in '{result_dir}'")
     # Deal with tests
     if not tests:
-        checkers = ["cc6", "cf"]
-        checkers_versions = {"cc6": "latest", "cf": "1.11"}
+        checkers = ["cf"]
+        checkers_versions = {"cf": "latest"}
         checker_options = defaultdict(dict)
     else:
-        test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
+        # Require versions to be specified:
+        # test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
+        # Allow versions to be omitted:
+        test_regex = re.compile(r"^[a-z0-9_]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")
         if not all([test_regex.match(test) for test in tests]):
             raise Exception(
-                f"Invalid test(s) specified. Please specify tests in the format 'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
+                f"Invalid test(s) specified. Please specify tests in the format 'checker_name' or'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
             )
         checkers = [test.split(":")[0] for test in tests]
         if sorted(checkers) != sorted(list(set(checkers))):
@@ -849,11 +655,9 @@ def main():
         if "eerie" in checkers_versions:
             checkers_versions["mip"] = "latest"
             del checkers_versions["eerie"]
-            if "tables" in cl_checker_options["eerie"]:
-                cl_checker_options["mip"]["tables"] = cl_checker_options["eerie"][
-                    "tables"
-                ]
-            elif "tables" not in cl_checker_options["mip"]:
+            if "eerie" in cl_checker_options:
+                cl_checker_options["mip"] = cl_checker_options.pop("eerie")
+            if "tables" not in cl_checker_options["mip"]:
                 cl_checker_options["mip"][
                     "tables"
                 ] = "/work/bm0021/cmor_tables/eerie_cmor_tables/Tables"
@@ -887,7 +691,7 @@ def main():
     if cl_checker_options:
         resume_info["checker_options"] = cl_checker_options
     with open(os.path.join(result_dir, ".resume_info"), "w") as f:
-        json.dump(resume_info, f)
+        json.dump(resume_info, f, sort_keys=True, indent=4)
     # If only cf checker is selected, run cc6 time checks only
     if (
@@ -907,8 +711,9 @@ def main():
     DRS_parent = "CORDEX-CMIP6"
     for cname in checkers:
-        print(cname)
-        DRS_parent_tmp = DRS_path_parent.get(checker_dict.get(cname.split(":")[0], ""), "")
+        DRS_parent_tmp = DRS_path_parent.get(
+            checker_dict.get(cname.split(":")[0], ""), ""
+        )
         if DRS_parent_tmp:
             DRS_parent = DRS_parent_tmp
             break
@@ -1027,14 +832,14 @@ def main():
             dataset_files_map[files_to_check_dict[file_path]["id"]] = [file_path]
         checker_options[file_path] = {
             "mip": {
-                **cl_checker_options["mip"],
+                **cl_checker_options.get("mip", {}),
                 "consistency_output": files_to_check_dict[file_path][
                     "consistency_file"
                 ],
                 "time_checks_only": time_checks_only,
             },
             "cc6": {
-                **cl_checker_options["cc6"],
+                **cl_checker_options.get("cc6", {}),
                 "consistency_output": files_to_check_dict[file_path][
                     "consistency_file"
                 ],
@@ -1046,15 +851,32 @@ def main():
                 "time_checks_only": time_checks_only,
             },
             "cf:": {
-                **cl_checker_options["cf"],
+                **cl_checker_options.get("cf", {}),
                 "enable_appendix_a_checks": True,
             },
+            "wcrp_cmip6": {
+                **cl_checker_options.get("wcrp_cmip6", {}),
+                "consistency_output": files_to_check_dict[file_path][
+                    "consistency_file"
+                ],
+            },
+            "wcrp_cordex_cmip6": {
+                **cl_checker_options.get("wcrp_cordex_cmip6", {}),
+                "consistency_output": files_to_check_dict[file_path][
+                    "consistency_file"
+                ],
+                "tables_dir": result_dir + "/tables",
+                "force_table_download": file_path == files_to_check[0]
+                and (
+                    not resume or (resume and os.listdir(result_dir + "/tables") == [])
+                ),
+            },
         }
         checker_options[file_path].update(
             {
                 k: v
                 for k, v in cl_checker_options.items()
-                if k not in ["cc6", "cf", "mip"]
+                if k not in ["cc6", "cf", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]
             }
         )
@@ -1090,7 +912,8 @@ def main():
     print()
     # Initialize the summary
-    summary = QAResultAggregator(checker_dict=checker_dict_ext)
+    summary = QAResultAggregator()
+    reference_ds_dict = {}
     # Calculate the number of processes
     num_processes = max(multiprocessing.cpu_count() - 4, 1)
@@ -1140,8 +963,14 @@ def main():
     # Skip continuity and consistency checks if no cc6/mip checks were run
     #   (and thus no consistency output file was created)
-    if "cc6:latest" in checkers or "mip:latest" in checkers:
+    if (
+        "cc6:latest" in checkers
+        or "mip:latest" in checkers
+        or "wcrp_cmip6:1.0" in checkers
+        or "wcrp_cmip6:latest" in checkers
+        or "wcrp_cordex_cmip6:1.0" in checkers
+        or "wcrp_cordex_cmip6:latest" in checkers
+    ):
         #########################################################
         # QA Part 2 - Run all consistency & continuity checks
         #########################################################
@@ -1252,7 +1081,7 @@ def main():
         "parent_dir": str(parent_dir),
     }
     # Add reference datasets for inter-dataset consistency checks
-    if 'cc6:latest' in checkers or 'mip:latest' in checkers:
+    if reference_ds_dict:
         summary_info["inter_ds_con_checks_ref"] = reference_ds_dict
     dsid_common_prefix = os.path.commonprefix(list(dataset_files_map.keys()))

esgf-qa 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

esgf-qa 0.3.0py3-none-any.whl → 0.4.0py3-none-any.whl