esgf-qa 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgf_qa/run_qa.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import argparse
2
2
  import csv
3
3
  import datetime
4
- import difflib
5
4
  import hashlib
6
5
  import json
7
6
  import multiprocessing
@@ -13,46 +12,22 @@ from pathlib import Path
13
12
 
14
13
  from compliance_checker import __version__ as cc_version
15
14
  from compliance_checker.runner import CheckSuite
16
-
15
+ from packaging import version as pversion
16
+
17
+ from esgf_qa._constants import (
18
+ checker_dict,
19
+ checker_dict_ext,
20
+ checker_release_versions,
21
+ checker_supporting_consistency_checks,
22
+ supported_project_ids,
23
+ )
17
24
  from esgf_qa._version import version
25
+ from esgf_qa.cluster_results import QAResultAggregator
18
26
  from esgf_qa.con_checks import compatibility_checks as comp # noqa
19
27
  from esgf_qa.con_checks import consistency_checks as cons # noqa
20
28
  from esgf_qa.con_checks import continuity_checks as cont # noqa
21
29
  from esgf_qa.con_checks import dataset_coverage_checks, inter_dataset_consistency_checks
22
30
 
23
- checker_dict = {
24
- "cc6": "CORDEX-CMIP6",
25
- "cf": "CF-Conventions",
26
- "mip": "MIP",
27
- "plugin_cmip6": "CMIP6",
28
- # "wcrp-cmip5": "CMIP5",
29
- "wcrp_cmip6": "CMIP6",
30
- # "wcrp_cmip7": "CMIP7-AFT",
31
- # "wcrp_cmip7": "CMIP7",
32
- # "wcrp_cordex": "CORDEX",
33
- "wcrp_cordex_cmip6": "CORDEX-CMIP6",
34
- # "obs4mips": "Obs4MIPs",
35
- # "input4mips": "Input4MIPs",
36
- }
37
- DRS_path_parent = {
38
- "CMIP5": "CMIP5",
39
- "CMIP6": "CMIP6",
40
- "CMIP7": "CMIP7",
41
- "CMIP7-AFT": "CMIP7",
42
- "CORDEX": "CORDEX",
43
- "CORDEX-CMIP6": "CORDEX-CMIP6",
44
- "Obs4MIPs": "Obs4MIPs",
45
- "Input4MIPs": "Input4MIPs",
46
- }
47
- checker_release_versions = {}
48
- checker_dict_ext = {
49
- # "pcons": "ParentConsistency"
50
- "cons": "Consistency",
51
- "cont": "Continuity",
52
- "comp": "Compatibility",
53
- **checker_dict,
54
- }
55
-
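The checker dictionaries removed above now live in esgf_qa._constants and are imported at the top of the module. A minimal sketch of what that module provides, reconstructed from the removed definitions (the values of supported_project_ids and checker_supporting_consistency_checks are assumptions, since they are not shown in this diff):

    # esgf_qa/_constants.py -- illustrative sketch only
    checker_dict = {
        "cc6": "CORDEX-CMIP6",
        "cf": "CF-Conventions",
        "mip": "MIP",
        "plugin_cmip6": "CMIP6",
        "wcrp_cmip6": "CMIP6",
        "wcrp_cordex_cmip6": "CORDEX-CMIP6",
    }

    # Dataset-level checks layered on top of the compliance-checker plugins.
    checker_dict_ext = {
        "cons": "Consistency",
        "cont": "Continuity",
        "comp": "Compatibility",
        **checker_dict,
    }

    # Filled at runtime by get_checker_release_versions().
    checker_release_versions = {}

    # Assumed values: project ids matched case-insensitively in DRS paths, and
    # checkers that write a consistency output file.
    supported_project_ids = ["cmip6", "cordex-cmip6"]
    checker_supporting_consistency_checks = ["cc6", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]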
56
31
  _timestamp_with_ms = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
57
32
  _timestamp_filename = datetime.datetime.strptime(
58
33
  _timestamp_with_ms, "%Y%m%d-%H%M%S%f"
@@ -62,321 +37,15 @@ _timestamp_pprint = datetime.datetime.strptime(
62
37
  ).strftime("%Y-%m-%d %H:%M")
63
38
 
64
39
 
65
- class QAResultAggregator:
66
- def __init__(self, checker_dict):
67
- """
68
- Initialize the aggregator with an empty summary.
69
- """
70
- self.summary = {
71
- "error": defaultdict(
72
- lambda: defaultdict(lambda: defaultdict(list))
73
- ), # No weight, just function -> error msg
74
- "fail": defaultdict(
75
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
76
- ), # weight -> test -> msg -> dsid -> filenames
77
- }
78
- self.checker_dict = checker_dict
79
-
80
- def update(self, result_dict, dsid, file_name):
81
- """
82
- Update the summary with a single result of a cc-run.
83
- """
84
- for checker in result_dict:
85
- for test in result_dict[checker]:
86
- if test == "errors":
87
- for function_name, error_msg in result_dict[checker][
88
- "errors"
89
- ].items():
90
- self.summary["error"][
91
- f"[{checker_dict[checker]}] " + function_name
92
- ][error_msg][dsid].append(file_name)
93
- else:
94
- score, max_score = result_dict[checker][test]["value"]
95
- weight = result_dict[checker][test].get("weight", 3)
96
- msgs = result_dict[checker][test].get("msgs", [])
97
- if score < max_score: # test outcome: fail
98
- for msg in msgs:
99
- self.summary["fail"][weight][
100
- f"[{checker_dict[checker]}] " + test
101
- ][msg][dsid].append(file_name)
102
-
103
- def update_ds(self, result_dict, dsid):
104
- """
105
- Update the summary with a single result of a esgf-qa run.
106
- """
107
- for checker in result_dict:
108
- for test in result_dict[checker]:
109
- if test == "errors":
110
- for function_name, errdict in result_dict[checker][
111
- "errors"
112
- ].items():
113
- for file_name in errdict["files"]:
114
- self.summary["error"][
115
- f"[{checker_dict_ext[checker]}] " + function_name
116
- ][errdict["msg"]][dsid].append(file_name)
117
- else:
118
- weight = result_dict[checker][test].get("weight", 3)
119
- fails = result_dict[checker][test].get("msgs", {})
120
- for msg, file_names in fails.items():
121
- for file_name in file_names:
122
- self.summary["fail"][weight][
123
- f"[{checker_dict_ext[checker]}] " + test
124
- ][msg][dsid].append(file_name)
125
-
126
- def sort(self):
127
- """
128
- Sort the summary.
129
- """
130
- self.summary["fail"] = dict(sorted(self.summary["fail"].items(), reverse=True))
131
- for key in self.summary["fail"]:
132
- self.summary["fail"][key] = dict(sorted(self.summary["fail"][key].items()))
133
-
134
- # Sort errors by function name
135
- for checker in self.summary["error"]:
136
- self.summary["error"][checker] = dict(
137
- sorted(self.summary["error"][checker].items())
138
- )
139
-
140
- @staticmethod
141
- def cluster_messages(messages, threshold):
142
- clusters = []
143
- while messages:
144
- base = messages.pop(0)
145
- cluster = [base]
146
- to_remove = []
147
- for msg in messages:
148
- ratio = difflib.SequenceMatcher(None, base, msg).ratio()
149
- if ratio >= threshold:
150
- cluster.append(msg)
151
- to_remove.append(msg)
152
- for msg in to_remove:
153
- messages.remove(msg)
154
- clusters.append(cluster)
155
- return clusters
156
-
157
- @staticmethod
158
- def generalize_message_group(messages):
159
- if len(messages) == 1:
160
- return messages[0], {}
161
-
162
- # Split messages into tokens
163
- split_messages = [re.findall(r"\w+|\W", m) for m in messages]
164
- transposed = list(zip(*split_messages))
165
- template = []
166
- placeholders = {}
167
- var_index = 0
168
-
169
- for i, tokens in enumerate(transposed):
170
- unique_tokens = set(tokens)
171
- if len(unique_tokens) == 1:
172
- template.append(tokens[0])
173
- else:
174
- var_name = chr(ord("A") + var_index)
175
- template.append(f"{{{var_name}}}")
176
- placeholders[var_name] = tokens[0]
177
- var_index += 1
178
-
179
- # Merge placeholders if possible
180
- template, placeholders = QAResultAggregator.merge_placeholders(
181
- template, placeholders
182
- )
183
-
184
- # Return the generalized message and the placeholders
185
- generalized = "".join(template)
186
- return generalized, placeholders
187
-
188
- @staticmethod
189
- def merge_placeholders(list_of_strings, dictionary, skip=0):
190
- def find_next_two_placeholders(list_of_strings, skip):
191
- placeholders = [
192
- s for s in list_of_strings if s.startswith("{") and s.endswith("}")
193
- ]
194
- if len(placeholders) < 2:
195
- return None, None
196
- return placeholders[skip] if len(placeholders) >= skip + 1 else None, (
197
- placeholders[skip + 1] if len(placeholders) >= skip + 2 else None
198
- )
199
-
200
- def extract_text_between_placeholders(
201
- list_of_strings, placeholder1, placeholder2
202
- ):
203
- idx1 = list_of_strings.index(placeholder1)
204
- idx2 = list_of_strings.index(placeholder2)
205
- return "".join(list_of_strings[idx1 + 1 : idx2])
206
-
207
- def merge_two_placeholders(
208
- placeholder1, placeholder2, text_between, dictionary
209
- ):
210
- new_value = (
211
- dictionary[placeholder1.lstrip("{").rstrip("}")]
212
- + text_between
213
- + dictionary[placeholder2.lstrip("{").rstrip("}")]
214
- )
215
- dictionary[placeholder1.lstrip("{").rstrip("}")] = new_value
216
- del dictionary[placeholder2.lstrip("{").rstrip("}")]
217
- return dictionary
218
-
219
- def update_placeholder_names(list_of_strings, dictionary):
220
- old_placeholders = sorted(list(dictionary.keys()))
221
- new_placeholders = [
222
- chr(ord("A") + i) for i in range(0, len(old_placeholders))
223
- ]
224
- new_dictionary = dict(
225
- zip(new_placeholders, [dictionary[val] for val in old_placeholders])
226
- )
227
- for old, new in zip(old_placeholders, new_placeholders):
228
- list_of_strings = [
229
- s.replace("{" + old + "}", "{" + new + "}") for s in list_of_strings
230
- ]
231
- return list_of_strings, new_dictionary
232
-
233
- def replace_placeholders_with_new_one(
234
- list_of_strings, placeholder1, placeholder2
235
- ):
236
- idx1 = list_of_strings.index(placeholder1)
237
- idx2 = list_of_strings.index(placeholder2)
238
- list_of_strings_new = list_of_strings[:idx1] + [placeholder1]
239
- if idx2 < len(list_of_strings) + 1:
240
- list_of_strings_new += list_of_strings[idx2 + 1 :]
241
- return list_of_strings_new
242
-
243
- if not any(s.startswith("{") and s.endswith("}") for s in list_of_strings):
244
- return list_of_strings, dictionary
245
-
246
- placeholder1, placeholder2 = find_next_two_placeholders(list_of_strings, skip)
247
- if placeholder1 is None or placeholder2 is None:
248
- return list_of_strings, dictionary
249
-
250
- text_between = extract_text_between_placeholders(
251
- list_of_strings, placeholder1, placeholder2
252
- )
253
- if len(text_between) < 5:
254
- dictionary = merge_two_placeholders(
255
- placeholder1, placeholder2, text_between, dictionary
256
- )
257
- list_of_strings = replace_placeholders_with_new_one(
258
- list_of_strings, placeholder1, placeholder2
259
- )
260
- list_of_strings, dictionary = update_placeholder_names(
261
- list_of_strings, dictionary
262
- )
263
- return QAResultAggregator.merge_placeholders(
264
- list_of_strings, dictionary, skip
265
- )
266
- else:
267
- return QAResultAggregator.merge_placeholders(
268
- list_of_strings, dictionary, skip + 1
269
- )
270
-
271
- def cluster_summary(self, threshold=0.75):
272
- self.clustered_summary = defaultdict(
273
- lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
274
- )
275
- for status in self.summary:
276
- if status == "error":
277
- for test_id in self.summary[status]:
278
- messages = list(self.summary[status][test_id].keys())
279
- # Pass a copy of messages to cluster_messages to generate clusters
280
- clusters = QAResultAggregator.cluster_messages(
281
- messages[:], threshold
282
- )
283
-
284
- for cluster in clusters:
285
- generalized, placeholders = (
286
- QAResultAggregator.generalize_message_group(cluster)
287
- )
288
- example_parts = ", ".join(
289
- [
290
- (
291
- f"{k}='{v[0]}'"
292
- if isinstance(v, list)
293
- else f"{k}='{v}'"
294
- )
295
- for k, v in placeholders.items()
296
- ]
297
- )
298
- if example_parts:
299
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
300
- else:
301
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
302
-
303
- # Gather all ds_ids and filenames across the cluster
304
- combined = defaultdict(set)
305
- for message in cluster:
306
- for ds_id, files in self.summary[status][test_id][
307
- message
308
- ].items():
309
- combined[ds_id].update(files)
310
-
311
- # Shorten file lists to one example
312
- formatted = {
313
- ds_id
314
- + " ("
315
- + str(len(files))
316
- + f" file{'s' if len(files) > 1 else ''} affected)": (
317
- [f"e.g. '{next(iter(files))}'"]
318
- if len(files) > 1
319
- else [f"'{next(iter(files))}'"]
320
- )
321
- for ds_id, files in combined.items()
322
- }
323
-
324
- self.clustered_summary[status][test_id][msg_summary] = formatted
325
- elif status == "fail":
326
- for weight in self.summary[status]:
327
- for test_id in self.summary[status][weight]:
328
- messages = list(self.summary[status][weight][test_id].keys())
329
- # Pass a copy of messages to cluster_messages to generate clusters
330
- clusters = QAResultAggregator.cluster_messages(
331
- messages[:], threshold
332
- )
333
-
334
- for cluster in clusters:
335
- generalized, placeholders = (
336
- QAResultAggregator.generalize_message_group(cluster)
337
- )
338
- example_parts = ", ".join(
339
- [
340
- (
341
- f"{k}='{v[0]}'"
342
- if isinstance(v, list)
343
- else f"{k}='{v}'"
344
- )
345
- for k, v in placeholders.items()
346
- ]
347
- )
348
- if example_parts:
349
- msg_summary = f"{generalized} ({len(cluster)} occurrences, e.g. {example_parts})"
350
- else:
351
- msg_summary = f"{generalized}{' (' + str(len(cluster)) + ' occurrences)' if len(cluster) > 1 else ''}"
352
-
353
- # Gather all ds_ids and filenames across the cluster
354
- combined = defaultdict(set)
355
- for message in cluster:
356
- for ds_id, files in self.summary[status][weight][
357
- test_id
358
- ][message].items():
359
- combined[ds_id].update(files)
360
-
361
- # Shorten file lists to one example
362
- formatted = {
363
- ds_id
364
- + " ("
365
- + str(len(files))
366
- + f" file{'s' if len(files) > 1 else ''} affected)": (
367
- [f"e.g. '{next(iter(files))}'"]
368
- if len(files) > 1
369
- else [f"'{next(iter(files))}'"]
370
- )
371
- for ds_id, files in combined.items()
372
- }
373
-
374
- self.clustered_summary[status][weight][test_id][
375
- msg_summary
376
- ] = formatted
377
-
378
-
379
40
  def get_default_result_dir():
41
+ """
42
+ Get the default result directory.
43
+
44
+ Returns
45
+ -------
46
+ str
47
+ Default result directory.
48
+ """
380
49
  global _timestamp
381
50
  global _timestamp_with_ms
382
51
  hash_object = hashlib.md5(_timestamp_with_ms.encode())
@@ -385,19 +54,86 @@ def get_default_result_dir():
385
54
  + f"/esgf-qa-results_{_timestamp_filename}_{hash_object.hexdigest()}"
386
55
  )
387
56
 
388
- def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_id):
57
+
58
+ def get_dsid(files_to_check_dict, dataset_files_map_ext, file_path, project_ids):
59
+ """
60
+ Get the dataset id for a file.
61
+
62
+ Parameters
63
+ ----------
64
+ files_to_check_dict : dict
65
+ Dictionary of files to check.
66
+ dataset_files_map_ext : dict
67
+ Dictionary of dataset files.
68
+ file_path : str
69
+ Path to the file.
70
+ project_ids : list of str
71
+ List of supported project ids.
72
+
73
+ Returns
74
+ -------
75
+ str
76
+ Dataset id.
77
+ """
389
78
  dir_id = files_to_check_dict[file_path]["id_dir"].split("/")
390
79
  fn_id = files_to_check_dict[file_path]["id_fn"].split("_")
391
- if project_id in dir_id:
392
- last_index = len(dir_id) - 1 - dir_id[::-1].index(project_id)
393
- dsid = ".".join(dir_id[last_index:])
394
- else:
395
- dsid = ".".join(dir_id)
80
+ dsid = ".".join(dir_id)
81
+ dir_id_lower = [el.lower() for el in dir_id]
82
+ for project_id in project_ids:
83
+ if project_id in dir_id_lower:
84
+ last_index = len(dir_id_lower) - 1 - dir_id_lower[::-1].index(project_id)
85
+ dsid = ".".join(dir_id[last_index:])
86
+ break
396
87
  if len(dataset_files_map_ext[files_to_check_dict[file_path]["id_dir"]].keys()) > 1:
397
88
  dsid += "." + ".".join(fn_id)
398
89
  return dsid
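For illustration, the project-id lookup above derives the dataset id from the directory part of a DRS path roughly as follows (hypothetical path; the contents of supported_project_ids are assumed):

    dir_id = "work/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Amon/tas/gn/v20190710".split("/")
    supported_project_ids = ["cmip6", "cordex-cmip6"]  # assumed contents of the constant

    dsid = ".".join(dir_id)  # fallback: use the full directory id
    dir_id_lower = [el.lower() for el in dir_id]
    for project_id in supported_project_ids:
        if project_id in dir_id_lower:
            # index of the last (right-most) occurrence of the project id
            last_index = len(dir_id_lower) - 1 - dir_id_lower[::-1].index(project_id)
            dsid = ".".join(dir_id[last_index:])
            break

    print(dsid)
    # CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.tas.gn.v20190710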
399
90
 
91
+
92
+ def get_installed_checker_versions():
93
+ """
94
+ Get all available versions of installed cc-plugins.
95
+
96
+ Returns
97
+ -------
98
+ dict
99
+ A dictionary of {checker_name: [version1, version2, latest], ...}.
100
+ """
101
+ check_suite = CheckSuite()
102
+ check_suite.load_all_available_checkers()
103
+ installed_versions = {}
104
+ for checker in check_suite.checkers:
105
+ try:
106
+ name, version = checker.split(":")
107
+ except ValueError:
108
+ name, version = checker, "latest"
109
+ if version == "latest":
110
+ continue
111
+ if name not in installed_versions:
112
+ installed_versions[name] = []
113
+ installed_versions[name].append(version)
114
+ for name, versions in installed_versions.items():
115
+ installed_versions[name] = sorted(versions, key=pversion.parse) + ["latest"]
116
+
117
+ return installed_versions
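The returned mapping sorts the concrete plugin versions with packaging.version and appends 'latest', for example (hypothetical plugin set):

    from packaging import version as pversion

    # versions collected from CheckSuite entries of the form "name:version"
    installed = {"cf": ["1.7", "1.11", "1.6"], "wcrp_cmip6": ["1.0"]}
    for name, versions in installed.items():
        installed[name] = sorted(versions, key=pversion.parse) + ["latest"]

    print(installed)
    # {'cf': ['1.6', '1.7', '1.11', 'latest'], 'wcrp_cmip6': ['1.0', 'latest']}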
118
+
119
+
400
120
  def get_checker_release_versions(checkers, checker_options={}):
121
+ """
122
+ Get the release versions of the checkers.
123
+
124
+ Parameters
125
+ ----------
126
+ checkers : list
127
+ A list of checkers to get the release versions for.
128
+ checker_options : dict, optional
129
+ A dictionary of options for the checkers.
130
+ Example format: {"cf": {"check_dimension_order": True}}
131
+
132
+ Returns
133
+ -------
134
+ None
135
+ Updates the global dictionary ``checker_release_versions``.
136
+ """
401
137
  global checker_release_versions
402
138
  global checker_dict
403
139
  global checker_dict_ext
@@ -413,17 +149,32 @@ def get_checker_release_versions(checkers, checker_options={}):
413
149
  )
414
150
  elif checker.split(":")[0] in checker_dict_ext:
415
151
  checker_release_versions[checker.split(":")[0]] = version
152
+ else:
153
+ checker_release_versions[checker.split(":")[0]] = (
154
+ check_suite.checkers.get(
155
+ checker, "unknown version"
156
+ )._cc_spec_version
157
+ )
416
158
 
417
159
 
418
160
  def run_compliance_checker(file_path, checkers, checker_options={}):
419
161
  """
420
162
  Run the compliance checker on a file with the specified checkers and options.
421
163
 
422
- Parameters:
423
- file_path (str): Path to the file to be checked.
424
- checkers (list): List of checkers to run.
425
- checker_options (dict): Dictionary of options for each checker.
426
- Example format: {"cf": {"check_dimension_order": True}}
164
+ Parameters
165
+ ----------
166
+ file_path : str
167
+ The path to the file to be checked.
168
+ checkers : list
169
+ A list of checkers to run.
170
+ checker_options : dict, optional
171
+ A dictionary of options for the checkers.
172
+ Example format: {"cf": {"check_dimension_order": True}}
173
+
174
+ Returns
175
+ -------
176
+ dict
177
+ A dictionary containing the results of the compliance checker.
427
178
  """
428
179
  check_suite = CheckSuite(options=checker_options)
429
180
  check_suite.load_all_available_checkers()
@@ -453,11 +204,31 @@ def run_compliance_checker(file_path, checkers, checker_options={}):
453
204
  ds, [checker], include_checks=None, skip_checks=[]
454
205
  )
455
206
  )
207
+ if hasattr(ds, "close"):
208
+ ds.close()
456
209
  return results
457
- return check_suite.run_all(ds, checkers, include_checks=None, skip_checks=[])
210
+ results = check_suite.run_all(ds, checkers, include_checks=None, skip_checks=[])
211
+ if hasattr(ds, "close"):
212
+ ds.close()
213
+ return results
458
214
 
459
215
 
460
216
  def track_checked_datasets(checked_datasets_file, checked_datasets):
217
+ """
218
+ Track checked datasets.
219
+
220
+ Parameters
221
+ ----------
222
+ checked_datasets_file : str
223
+ The path to the file to track checked datasets.
224
+ checked_datasets : list
225
+ A list of checked datasets.
226
+
227
+ Returns
228
+ -------
229
+ None
230
+ Writes the checked datasets to the file.
231
+ """
461
232
  with open(checked_datasets_file, "a") as file:
462
233
  writer = csv.writer(file)
463
234
  for dataset_id in checked_datasets:
@@ -472,6 +243,29 @@ def process_file(
472
243
  processed_files,
473
244
  progress_file,
474
245
  ):
246
+ """
247
+ Run compliance-checker checks on a single file.
248
+
249
+ Parameters
250
+ ----------
251
+ file_path : str
252
+ The path to the file to be checked.
253
+ checkers : list
254
+ A list of checkers to run.
255
+ checker_options : dict
256
+ A dictionary of options for the checkers.
257
+ files_to_check_dict : dict
258
+ A special dictionary mapping files to check to datasets.
259
+ processed_files : list
260
+ A list of files that have already been checked.
261
+ progress_file : str
262
+ The path to the progress file.
263
+
264
+ Returns
265
+ -------
266
+ tuple
267
+ A tuple containing the file path and the results of the compliance checker.
268
+ """
475
269
  # Read result from disk if check was run previously
476
270
  result_file = files_to_check_dict[file_path]["result_file"]
477
271
  consistency_file = files_to_check_dict[file_path]["consistency_file"]
@@ -512,14 +306,6 @@ def process_file(
512
306
  checker = checkerv.split(":")[0]
513
307
  check_results[checker] = dict()
514
308
  check_results[checker]["errors"] = {}
515
- # print()
516
- # print("name",result[checker][0][0].name)
517
- # print("weight", result[checker][0][0].weight)
518
- # print("value", result[checker][0][0].value)
519
- # print("msgs", result[checker][0][0].msgs)
520
- # print("method", result[checker][0][0].check_method)
521
- # print("children", result[checker][0][0].children)
522
- # quit()
523
309
  for check in result[checkerv][0]:
524
310
  check_results[checker][check.name] = {}
525
311
  check_results[checker][check.name]["weight"] = check.weight
@@ -567,6 +353,31 @@ def process_dataset(
567
353
  processed_datasets,
568
354
  progress_file,
569
355
  ):
356
+ """
357
+ Run esgf_qa checks on a dataset.
358
+
359
+ Parameters
360
+ ----------
361
+ ds : str
362
+ Dataset to process.
363
+ ds_map : dict
364
+ Dictionary mapping dataset IDs to file paths.
365
+ checkers : list
366
+ List of checkers to run.
367
+ checker_options : dict
368
+ Dictionary of checker options.
369
+ files_to_check_dict : dict
370
+ A special dictionary mapping files to check to datasets.
371
+ processed_datasets : set
372
+ Set of processed datasets.
373
+ progress_file : str
374
+ Path to progress file.
375
+
376
+ Returns
377
+ -------
378
+ tuple
379
+ Dataset ID and check results.
380
+ """
570
381
  # Read result from disk if check was run previously
571
382
  result_file = files_to_check_dict[ds_map[ds][0]]["result_file_ds"]
572
383
  if ds in processed_datasets and os.path.isfile(result_file):
@@ -637,10 +448,14 @@ def parse_options(opts):
637
448
  is a colon. Adapted from
638
449
  https://github.com/ioos/compliance-checker/blob/cbb40ed1981c169b74c954f0775d5bd23005ed23/cchecker.py#L23
639
450
 
640
- Parameters:
641
- opts: Iterable of strings with options
451
+ Parameters
452
+ ----------
453
+ opts : Iterable of strings
454
+ Iterable of option strings
642
455
 
643
- Returns:
456
+ Returns
457
+ -------
458
+ dict
644
459
  Dictionary with keys as checker type (i.e. "mip").
645
460
  Each value is a dictionary where keys are checker options and values
646
461
  are checker option values or None if not provided.
@@ -649,21 +464,39 @@ def parse_options(opts):
649
464
  for opt_str in opts:
650
465
  try:
651
466
  checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
652
- checker_val = checker_val[0] if checker_val else None
467
+ checker_val = checker_val[0] if checker_val else True
653
468
  except ValueError:
654
469
  raise ValueError(
655
470
  f"Could not split option '{opt_str}', seems illegally formatted. The required format is: '<checker>:<option_name>[:<option_value>]', eg. 'mip:tables:/path/to/Tables'."
656
471
  )
657
- if checker_type != "mip":
658
- raise ValueError(
659
- f"Currently, only options for 'mip' checker are supported, got '{checker_type}'."
660
- )
661
472
  options_dict[checker_type][checker_opt] = checker_val
662
473
  return options_dict
663
474
 
664
475
 
476
+ def _verify_options_dict(options):
477
+ """
478
+ Helper function to verify that the options dictionary is correctly formatted.
479
+ """
480
+ if not isinstance(options, dict):
481
+ return False
482
+ if options == {}:
483
+ return True
484
+ try:
485
+ for checker_type in options.keys():
486
+ for checker_opt in options[checker_type].keys():
487
+ checker_val = options[checker_type][checker_opt]
488
+ if not isinstance(checker_val, (int, float, str, bool, type(None))):
489
+ return False
490
+ except (AttributeError, KeyError):
491
+ return False
492
+ # Seems to match the required format
493
+ return True
494
+
495
+
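Taken together, parse_options() and _verify_options_dict() turn '--option' strings into a nested per-checker dictionary; a minimal re-statement of the parsing step above (note that an omitted option value now becomes True instead of None):

    from collections import defaultdict

    def parse_options_sketch(opts):
        options_dict = defaultdict(dict)
        for opt_str in opts:
            # split on the first two colons: <checker>:<option_name>[:<option_value>]
            checker_type, checker_opt, *checker_val = opt_str.split(":", 2)
            options_dict[checker_type][checker_opt] = checker_val[0] if checker_val else True
        return options_dict

    opts = ["mip:tables:/path/to/Tables", "cf:enable_appendix_a_checks"]
    print(dict(parse_options_sketch(opts)))
    # {'mip': {'tables': '/path/to/Tables'}, 'cf': {'enable_appendix_a_checks': True}}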
665
496
  def main():
666
- # CLI
497
+ """
498
+ CLI entry point.
499
+ """
667
500
  parser = argparse.ArgumentParser(description="Run QA checks")
668
501
  parser.add_argument(
669
502
  "parent_dir",
@@ -683,13 +516,17 @@ def main():
683
516
  "--option",
684
517
  default=[],
685
518
  action="append",
686
- help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. Multiple invocations possible.",
519
+ help="Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. "
520
+ "Multiple invocations possible.",
687
521
  )
688
522
  parser.add_argument(
689
523
  "-t",
690
524
  "--test",
691
525
  action="append",
692
- help="The test to run ('cc6:latest' or 'cf:<version>', can be specified multiple times, eg.: '-t cc6:latest -t cf:1.8') - default: running 'cc6:latest' and 'cf:1.11'.",
526
+ help="The test(s) to run in format '<checker>[:<version>]', (eg. 'wcrp_cmip7', "
527
+ "'wcrp_cmip6:latest' or 'cf:<version>', can be specified multiple times, "
528
+ "eg.: '-t cc6:latest -t cf:1.11') - default: running 'cf:latest'. "
529
+ "The default version selected for each checker is 'latest'.",
693
530
  )
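The version part of a test specification is now optional; the new test_regex further down in this diff accepts, for example:

    import re

    # same pattern as the new test_regex in main()
    test_regex = re.compile(r"^[a-zA-Z0-9_-]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")

    for test in ["wcrp_cmip6", "cf:1.11", "cc6:latest", "cf:", "cf:v1"]:
        print(test, bool(test_regex.match(test)))
    # wcrp_cmip6 True
    # cf:1.11 True
    # cc6:latest True
    # cf: False
    # cf:v1 False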
694
531
  parser.add_argument(
695
532
  "-i",
@@ -709,6 +546,13 @@ def main():
709
546
  action="store_true",
710
547
  help="Include basic consistency and continuity checks. Default: False.",
711
548
  )
549
+ parser.add_argument(
550
+ "-P",
551
+ "--parallel_processes",
552
+ type=int,
553
+ default=0,
554
+ help="Specify the maximum number of parallel processes. Default: 0 (= number of cores).",
555
+ )
712
556
  args = parser.parse_args()
713
557
 
714
558
  result_dir = os.path.abspath(args.output_dir)
@@ -720,6 +564,7 @@ def main():
720
564
  args.include_consistency_checks if args.include_consistency_checks else False
721
565
  )
722
566
  cl_checker_options = parse_options(args.option)
567
+ parallel_processes = args.parallel_processes
723
568
 
724
569
  # Progress file to track already checked files
725
570
  progress_file = Path(result_dir, "progress.txt")
@@ -729,39 +574,51 @@ def main():
729
574
  # Resume information stored in a json file
730
575
  resume_info_file = Path(result_dir, ".resume_info")
731
576
 
577
+ # Only certain arguments are allowed when resuming a previous QA run
578
+ if resume:
579
+ allowed_with_resume = {"output_dir", "info", "resume", "parallel_processes"}
580
+ # Convert Namespace to dict for easier checking
581
+ set_args = {k for k, v in vars(args).items() if v not in (None, False, [], "")}
582
+ invalid_args = set_args - allowed_with_resume
583
+ if invalid_args:
584
+ parser.error(
585
+ f"When using -r/--resume, the following arguments are not allowed: {', '.join(invalid_args)}"
586
+ )
587
+
732
588
  # Deal with result_dir
733
589
  if not os.path.exists(result_dir):
734
590
  if resume:
735
- resume = False
736
- warnings.warn(
737
- "Resume is set but specified output_directory does not exist. Starting a new QA run..."
591
+ raise FileNotFoundError(
592
+ f"Resume is set but specified output_directory does not exist: '{result_dir}'."
738
593
  )
739
594
  os.mkdir(result_dir)
740
595
  elif os.listdir(result_dir) != []:
596
+ required_files = [progress_file, resume_info_file]
597
+ required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
741
598
  if resume:
742
- required_files = [progress_file, resume_info_file]
743
- required_paths = [os.path.join(result_dir, p) for p in ["tables"]]
744
599
  if not all(os.path.isfile(rfile) for rfile in required_files) or not all(
745
600
  os.path.isdir(rpath) for rpath in required_paths
746
601
  ):
747
602
  raise Exception(
748
- "Resume is set but specified output_directory cannot be identified as output_directory of a previous QA run."
603
+ "Resume is set but specified output_directory cannot be identified as output directory of a previous QA run."
749
604
  )
750
605
  else:
751
- if "progress.txt" in os.listdir(
752
- result_dir
753
- ) and ".resume_info" in os.listdir(result_dir):
606
+ if all(os.path.isfile(rfile) for rfile in required_files) and all(
607
+ os.path.isdir(rpath) for rpath in required_paths
608
+ ):
754
609
  raise Exception(
755
- "Specified output_directory is not empty but can be identified as output_directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
610
+ "Specified output directory is not empty but can be identified as output directory of a previous QA run. Use'-r' or '--resume' (together with '-o' or '--output_dir') to continue the previous QA run or choose a different output_directory instead."
756
611
  )
757
612
  else:
758
- raise Exception("Specified output_directory is not empty.")
613
+ raise Exception("Specified output directory is not empty.")
759
614
  else:
760
615
  if resume:
761
616
  resume = False
762
- warnings.warn(
763
- "Resume is set but specified output_directory is empty. Starting a new QA run..."
617
+ raise FileNotFoundError(
618
+ f"Resume is set but specified output directory is empty: '{result_dir}'."
764
619
  )
620
+
621
+ # When resuming a previous QA run
765
622
  if resume:
766
623
  print(f"Resuming previous QA run in '{result_dir}'")
767
624
  with open(os.path.join(result_dir, ".resume_info")) as f:
@@ -770,58 +627,55 @@ def main():
770
627
  required_keys = ["parent_dir", "info", "tests"]
771
628
  if not all(key in resume_info for key in required_keys):
772
629
  raise Exception(
773
- "Invalid .resume_info file. It should contain the keys 'parent_dir', 'info', and 'tests'."
630
+ f"Invalid .resume_info file in '{result_dir}'. It should contain the keys 'parent_dir', 'info', and 'tests'."
774
631
  )
775
632
  if not (
776
633
  isinstance(resume_info["parent_dir"], str)
777
634
  and isinstance(resume_info["info"], str)
778
635
  and isinstance(resume_info["tests"], list)
636
+ and isinstance(resume_info.get("cl_checker_options", {}), dict)
637
+ and isinstance(
638
+ resume_info.get("include_consistency_checks", False), bool
639
+ )
640
+ and _verify_options_dict(resume_info.get("cl_checker_options", {}))
779
641
  and all(isinstance(test, str) for test in resume_info["tests"])
780
642
  ):
781
643
  raise Exception(
782
- "Invalid .resume_info file. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings."
644
+ f"Invalid .resume_info file in '{result_dir}'. 'parent_dir' and 'info' should be strings, and 'tests' should be a list of strings. "
645
+ "'cl_checker_options' (optional) should be a nested dictionary of format 'checker:option_name:option_value', and "
646
+ "'include_consistency_checks' (optional) should be a boolean."
783
647
  )
784
648
  except json.JSONDecodeError:
785
649
  raise Exception(
786
- "Invalid .resume_info file. It should be a valid JSON file."
650
+ f"Invalid .resume_info file in '{result_dir}'. It needs to be a valid JSON file."
787
651
  )
788
- if tests and sorted(tests) != resume_info["tests"]:
789
- raise Exception("Cannot resume a previous QA run with different tests.")
790
- else:
791
- tests = resume_info["tests"]
652
+ tests = resume_info["tests"]
653
+ parent_dir = resume_info["parent_dir"]
792
654
  if info and info != resume_info["info"]:
793
655
  warnings.warn(
794
656
  f"<info> argument differs from the originally specified <info> argument ('{resume_info['info']}'). Using the new specification."
795
657
  )
796
- if parent_dir is None:
797
- parent_dir = resume_info["parent_dir"]
798
- if parent_dir and Path(parent_dir) != Path(resume_info["parent_dir"]):
799
- raise Exception(
800
- "Cannot resume a previous QA run with different <parent_dir>."
801
- )
802
- if cl_checker_options and cl_checker_options != resume_info.get(
803
- "checker_options", {}
804
- ):
805
- raise Exception(
806
- "Cannot resume a previous QA run with different <option> arguments."
807
- )
808
- else:
809
- parent_dir = Path(resume_info["parent_dir"])
810
- if "include_consistency_checks" in resume_info:
811
- include_consistency_checks = resume_info["include_consistency_checks"]
658
+ cl_checker_options = resume_info.get("checker_options", {})
659
+ include_consistency_checks = resume_info.get(
660
+ "include_consistency_checks", False
661
+ )
812
662
  else:
813
663
  print(f"Storing check results in '{result_dir}'")
814
664
 
815
665
  # Deal with tests
816
666
  if not tests:
817
- checkers = ["cc6", "cf"]
818
- checkers_versions = {"cc6": "latest", "cf": "1.11"}
667
+ checkers = ["cf"]
668
+ checkers_versions = {"cf": "latest"}
819
669
  checker_options = defaultdict(dict)
820
670
  else:
821
- test_regex = re.compile(r"^[a-z0-9_]+:(latest|[0-9]+(\.[0-9]+)*)$")
671
+ # Require versions to be specified:
672
+ # test_regex = re.compile(r"^[a-zA-Z0-9_-]+:(latest|[0-9]+(\.[0-9]+)*)$")
673
+ # Allow versions to be omitted:
674
+ test_regex = re.compile(r"^[a-zA-Z0-9_-]+(?::(latest|[0-9]+(?:\.[0-9]+)*))?$")
675
+ # Check format of specified checkers and separate checker, version, options
822
676
  if not all([test_regex.match(test) for test in tests]):
823
677
  raise Exception(
824
- f"Invalid test(s) specified. Please specify tests in the format 'checker_name:version'. Currently supported are: {', '.join(list(checker_dict.keys()))}, eerie."
678
+ "Invalid test(s) specified. Please specify tests in the format 'checker_name' or'checker_name:version'."
825
679
  )
826
680
  checkers = [test.split(":")[0] for test in tests]
827
681
  if sorted(checkers) != sorted(list(set(checkers))):
@@ -835,6 +689,29 @@ def main():
835
689
  for test in tests
836
690
  }
837
691
  checker_options = defaultdict(dict)
692
+ # Check if specified checkers (or their requested versions) exist / are currently installed
693
+ cc_checker_versions = get_installed_checker_versions()
694
+ invalid_checkers = []
695
+ invalid_checkers_versions = []
696
+ invalid_checkers_errmsg = ""
697
+ for checker_i, checker_iv in checkers_versions.items():
698
+ if checker_i not in cc_checker_versions and checker_i != "eerie":
699
+ invalid_checkers.append(checker_i)
700
+ elif checker_i == "eerie":
701
+ pass
702
+ elif checker_iv not in cc_checker_versions[checker_i] and checker_i not in [
703
+ "cc6",
704
+ "mip",
705
+ ]:
706
+ invalid_checkers_versions.append(checker_i)
707
+ if invalid_checkers:
708
+ invalid_checkers_errmsg = f"ERROR: Invalid test(s) specified. The following checkers are not supported or installed: {', '.join(invalid_checkers)}. "
709
+ for checker_i in invalid_checkers_versions:
710
+ if not invalid_checkers_errmsg:
711
+ invalid_checkers_errmsg = "ERROR: Invalid test(s) specified. "
712
+ invalid_checkers_errmsg += f"For checker {checker_i} only the following versions are currently supported / installed: {', '.join(cc_checker_versions[checker_i])}. "
713
+ if invalid_checkers_errmsg:
714
+ raise ValueError(invalid_checkers_errmsg)
838
715
  if "cc6" in checkers_versions and checkers_versions["cc6"] != "latest":
839
716
  checkers_versions["cc6"] = "latest"
840
717
  warnings.warn("Version of checker 'cc6' must be 'latest'. Using 'latest'.")
@@ -849,11 +726,9 @@ def main():
849
726
  if "eerie" in checkers_versions:
850
727
  checkers_versions["mip"] = "latest"
851
728
  del checkers_versions["eerie"]
852
- if "tables" in cl_checker_options["eerie"]:
853
- cl_checker_options["mip"]["tables"] = cl_checker_options["eerie"][
854
- "tables"
855
- ]
856
- elif "tables" not in cl_checker_options["mip"]:
729
+ if "eerie" in cl_checker_options:
730
+ cl_checker_options["mip"] = cl_checker_options.pop("eerie")
731
+ if "tables" not in cl_checker_options["mip"]:
857
732
  cl_checker_options["mip"][
858
733
  "tables"
859
734
  ] = "/work/bm0021/cmor_tables/eerie_cmor_tables/Tables"
@@ -861,10 +736,6 @@ def main():
861
736
  raise Exception(
862
737
  "ERROR: Cannot run both 'cc6' and 'mip' checkers at the same time."
863
738
  )
864
- if any(test not in checker_dict.keys() for test in checkers_versions):
865
- raise Exception(
866
- f"Invalid test(s) specified. Supported are: {', '.join(checker_dict.keys())}"
867
- )
868
739
 
869
740
  # Combine checkers and versions
870
741
  # (checker_options are hardcoded)
@@ -887,7 +758,7 @@ def main():
887
758
  if cl_checker_options:
888
759
  resume_info["checker_options"] = cl_checker_options
889
760
  with open(os.path.join(result_dir, ".resume_info"), "w") as f:
890
- json.dump(resume_info, f)
761
+ json.dump(resume_info, f, sort_keys=True, indent=4)
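With sort_keys and indent the .resume_info file becomes human-readable. Based on the keys written above, its contents look roughly like this (values are placeholders):

    import json

    resume_info = {
        "parent_dir": "/path/to/data",
        "info": "example QA run",
        "tests": ["cf:latest"],
        "checker_options": {"mip": {"tables": "/path/to/Tables"}},  # only present if options were given
        "include_consistency_checks": False,
    }
    print(json.dumps(resume_info, sort_keys=True, indent=4))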
891
762
 
892
763
  # If only cf checker is selected, run cc6 time checks only
893
764
  if (
@@ -905,14 +776,6 @@ def main():
905
776
  progress_file.touch()
906
777
  dataset_file.touch()
907
778
 
908
- DRS_parent = "CORDEX-CMIP6"
909
- for cname in checkers:
910
- print(cname)
911
- DRS_parent_tmp = DRS_path_parent.get(checker_dict.get(cname.split(":")[0], ""), "")
912
- if DRS_parent_tmp:
913
- DRS_parent = DRS_parent_tmp
914
- break
915
-
916
779
  # Check if progress files exist and read already processed files/datasets
917
780
  processed_files = set()
918
781
  with open(progress_file) as file:
@@ -1011,7 +874,7 @@ def main():
1011
874
  files_to_check = sorted(files_to_check)
1012
875
  for file_path in files_to_check:
1013
876
  files_to_check_dict[file_path]["id"] = get_dsid(
1014
- files_to_check_dict, dataset_files_map_ext, file_path, DRS_parent
877
+ files_to_check_dict, dataset_files_map_ext, file_path, supported_project_ids
1015
878
  )
1016
879
  files_to_check_dict[file_path]["result_file_ds"] = (
1017
880
  result_dir
@@ -1027,14 +890,14 @@ def main():
1027
890
  dataset_files_map[files_to_check_dict[file_path]["id"]] = [file_path]
1028
891
  checker_options[file_path] = {
1029
892
  "mip": {
1030
- **cl_checker_options["mip"],
893
+ **cl_checker_options.get("mip", {}),
1031
894
  "consistency_output": files_to_check_dict[file_path][
1032
895
  "consistency_file"
1033
896
  ],
1034
897
  "time_checks_only": time_checks_only,
1035
898
  },
1036
899
  "cc6": {
1037
- **cl_checker_options["cc6"],
900
+ **cl_checker_options.get("cc6", {}),
1038
901
  "consistency_output": files_to_check_dict[file_path][
1039
902
  "consistency_file"
1040
903
  ],
@@ -1046,15 +909,32 @@ def main():
1046
909
  "time_checks_only": time_checks_only,
1047
910
  },
1048
911
  "cf:": {
1049
- **cl_checker_options["cf"],
912
+ **cl_checker_options.get("cf", {}),
1050
913
  "enable_appendix_a_checks": True,
1051
914
  },
915
+ "wcrp_cmip6": {
916
+ **cl_checker_options.get("wcrp_cmip6", {}),
917
+ "consistency_output": files_to_check_dict[file_path][
918
+ "consistency_file"
919
+ ],
920
+ },
921
+ "wcrp_cordex_cmip6": {
922
+ **cl_checker_options.get("wcrp_cordex_cmip6", {}),
923
+ "consistency_output": files_to_check_dict[file_path][
924
+ "consistency_file"
925
+ ],
926
+ "tables_dir": result_dir + "/tables",
927
+ "force_table_download": file_path == files_to_check[0]
928
+ and (
929
+ not resume or (resume and os.listdir(result_dir + "/tables") == [])
930
+ ),
931
+ },
1052
932
  }
1053
933
  checker_options[file_path].update(
1054
934
  {
1055
935
  k: v
1056
936
  for k, v in cl_checker_options.items()
1057
- if k not in ["cc6", "cf", "mip"]
937
+ if k not in ["cc6", "cf", "mip", "wcrp_cmip6", "wcrp_cordex_cmip6"]
1058
938
  }
1059
939
  )
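For a single file, the per-checker options assembled above end up looking roughly like the excerpt below (illustrative paths; only the new WCRP entries are shown):

    example_options_for_one_file = {
        "wcrp_cmip6": {
            "consistency_output": "/path/to/results/example.consistency",
        },
        "wcrp_cordex_cmip6": {
            "consistency_output": "/path/to/results/example.consistency",
            "tables_dir": "/path/to/results/tables",
            # tables are only force-downloaded for the first file of a fresh run
            "force_table_download": True,
        },
    }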
1060
940
 
@@ -1062,22 +942,27 @@ def main():
1062
942
  raise Exception("No files found to check.")
1063
943
  else:
1064
944
  print(
1065
- f"Found {len(files_to_check)} files (organized in {len(dataset_files_map)} datasets) to check."
945
+ f"\nFound {len(files_to_check)} files (organized in {len(dataset_files_map)} datasets) to check."
1066
946
  )
1067
947
 
1068
- print()
1069
- print("Files to check:")
1070
- print(json.dumps(files_to_check, indent=4))
1071
- print()
1072
- print("Dataset - Files mapping (extended):")
1073
- print(json.dumps(dataset_files_map_ext, indent=4))
1074
- print()
1075
- print("Dataset - Files mapping:")
1076
- print(json.dumps(dataset_files_map, indent=4))
1077
- print()
1078
- print("Files to check dict:")
1079
- print(json.dumps(files_to_check_dict, indent=4))
1080
- print()
948
+ # Save dictionaries to disk for information
949
+ with open(os.path.join(result_dir, "files_to_check.json"), "w") as f:
950
+ json.dump(files_to_check, f, indent=4)
951
+ with open(os.path.join(result_dir, "files_to_check_dict.json"), "w") as f:
952
+ json.dump(files_to_check_dict, f, indent=4)
953
+ with open(os.path.join(result_dir, "dataset_files_map.json"), "w") as f:
954
+ json.dump(dataset_files_map, f, indent=4)
955
+ with open(os.path.join(result_dir, "dataset_files_map_ext.json"), "w") as f:
956
+ json.dump(dataset_files_map_ext, f, indent=4)
957
+ print(
958
+ "Information on which files have been found and how these are organized into datasets was saved to disk:"
959
+ )
960
+ print(
961
+ f" - {os.path.join(result_dir, 'files_to_check.json')}\n"
962
+ f" - {os.path.join(result_dir, 'files_to_check_dict.json')}\n"
963
+ f" - {os.path.join(result_dir, 'dataset_files_map.json')}\n"
964
+ f" - {os.path.join(result_dir, 'dataset_files_map_ext.json')}"
965
+ )
1081
966
 
1082
967
  #########################################################
1083
968
  # QA Part 1 - Run all compliance-checker checks
@@ -1090,10 +975,13 @@ def main():
1090
975
  print()
1091
976
 
1092
977
  # Initialize the summary
1093
- summary = QAResultAggregator(checker_dict=checker_dict_ext)
978
+ summary = QAResultAggregator()
979
+ reference_ds_dict = {}
1094
980
 
1095
981
  # Calculate the number of processes
1096
982
  num_processes = max(multiprocessing.cpu_count() - 4, 1)
983
+ if parallel_processes > 0:
984
+ num_processes = min(num_processes, parallel_processes)
1097
985
  print(f"Using {num_processes} parallel processes for cc checks.")
1098
986
  print()
1099
987
 
@@ -1140,8 +1028,9 @@ def main():
1140
1028
 
1141
1029
  # Skip continuity and consistency checks if no cc6/mip checks were run
1142
1030
  # (and thus no consistency output file was created)
1143
- if "cc6:latest" in checkers or "mip:latest" in checkers:
1144
-
1031
+ if any(
1032
+ ch.split(":", 1)[0] in checker_supporting_consistency_checks for ch in checkers
1033
+ ):
1145
1034
  #########################################################
1146
1035
  # QA Part 2 - Run all consistency & continuity checks
1147
1036
  #########################################################
@@ -1167,6 +1056,8 @@ def main():
1167
1056
  # Limit the number of processes for consistency checks since a lot
1168
1057
  # of files will be opened at the same time
1169
1058
  num_processes = min(num_processes, 10)
1059
+ if parallel_processes > 0:
1060
+ num_processes = min(num_processes, parallel_processes)
1170
1061
  print(f"Using {num_processes} parallel processes for dataset checks.")
1171
1062
  print()
1172
1063
 
@@ -1217,7 +1108,9 @@ def main():
1217
1108
  else:
1218
1109
  print()
1219
1110
  warnings.warn(
1220
- "Continuity & Consistency checks skipped since no cc6 checks were run."
1111
+ "Continuity & consistency checks skipped since no appropriate checkers were run."
1112
+ " The following checkers support the continuity & consistency checks: "
1113
+ f"{', '.join(checker_supporting_consistency_checks)}"
1221
1114
  )
1222
1115
 
1223
1116
  #########################################################
@@ -1245,14 +1138,14 @@ def main():
1245
1138
  "cc_version": cc_version,
1246
1139
  "checkers": ", ".join(
1247
1140
  [
1248
- f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}"
1141
+ f"{checker_dict.get(checker.split(':')[0], '')} {checker.split(':')[0]}:{checker_release_versions[checker.split(':')[0]]}".strip()
1249
1142
  for checker in checkers
1250
1143
  ]
1251
1144
  ),
1252
1145
  "parent_dir": str(parent_dir),
1253
1146
  }
1254
1147
  # Add reference datasets for inter-dataset consistency checks
1255
- if 'cc6:latest' in checkers or 'mip:latest' in checkers:
1148
+ if reference_ds_dict:
1256
1149
  summary_info["inter_ds_con_checks_ref"] = reference_ds_dict
1257
1150
 
1258
1151
  dsid_common_prefix = os.path.commonprefix(list(dataset_files_map.keys()))
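
Putting the new CLI options together, a 0.5.0 run might be invoked as sketched below (this assumes a console script named esgf-qa and omits positional arguments not shown in this diff; checker choices and paths are illustrative):

    # fresh QA run with two checkers, extra checker options and 4 parallel processes
    esgf-qa /path/to/data_root \
        -t wcrp_cmip6 -t cf:1.11 \
        --option mip:tables:/path/to/Tables \
        -P 4 \
        -o /path/to/qa_results

    # resume that run later; only output_dir, info, resume and parallel_processes
    # are accepted together with -r (see the allowed_with_resume check above)
    esgf-qa -r -o /path/to/qa_results -P 4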