esgf-qa 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: esgf-qa
- Version: 0.3.0
+ Version: 0.4.0
  Summary: QA based on compliance-checker
  Author-email: Martin Schupfner <schupfner@dkrz.de>
  Maintainer-email: Martin Schupfner <schupfner@dkrz.de>
@@ -38,6 +38,7 @@ Requires-Dist: flake8-print; extra == "dev"
  Requires-Dist: pre-commit; extra == "dev"
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-flake8; extra == "dev"
+ Requires-Dist: pytest-asyncio; extra == "dev"
  Requires-Dist: ruff>=0.1.0; extra == "dev"
  Requires-Dist: twine; extra == "dev"
  Requires-Dist: wheel; extra == "dev"
@@ -45,34 +46,36 @@ Dynamic: license-file

  [![PyPI version](https://img.shields.io/pypi/v/esgf-qa.svg)](https://pypi.org/project/esgf-qa/)

- # esgf-qa: Quality Assurance Workflow Based on `compliance-checker` and `cc-plugin-wcrp`
- <img src="docs/esgf-qa_Logo.png" align="left" width="120">
+ # esgf-qa
+ ### Quality Assurance Workflow Based on `compliance-checker` and `cc-plugin-wcrp` (or other cc-plugins)
+ <img src="https://raw.githubusercontent.com/ESGF/esgf-qa/master/docs/esgf-qa_Logo.png" align="left" width="120">

- `esgf-qa` makes use of the frameworks and [CF](https://cfconventions.org/)-compliance checks of the
- [ioos/compliance-checker](https://github.com/ioos/compliance-checker) and extensions coming with
+ `esgf-qa` provides a flexible quality assurance (QA) workflow for evaluating dataset compliance using the
+ [ioos/compliance-checker](https://github.com/ioos/compliance-checker) framework
+ (including [CF](https://cfconventions.org/) compliance checks)
+ and various community plugins (`cc-plugin`s), such as
  [ESGF/cc-plugin-wcrp](https://github.com/ESGF/cc-plugin-wcrp) and
  [euro-cordex/cc-plugin-cc6](https://github.com/euro-cordex/cc-plugin-cc6).

- This tool is designed to run the desired file-based QC tests with
- [ioos/compliance-checker](https://github.com/ioos/compliance-checker) and
- [euro-cordex/cc-plugin-wcrp](https://github.com/euro-cordex/cc-plugin-wcrp),
- to conduct additional dataset-based checks (such as time axis continuity and
- consistency checks) as well as to summarize the test results.
+ The tool executes file-based quality control (QC) tests through the Compliance Checker,
+ and, where applicable, performs additional dataset-level checks to test inter-file time-axis continuity
+ and consistency in variable, coordinate and attribute definitions.
+ Results from both file- and dataset-level checks are aggregated, summarized, and clustered for easier interpretation.

- `esgf-qa` is mainly aimed at a QA workflow testing compliance with various WCRP Project Specifications (see below).
- However, it is generally applicable to test for compliance with the CF conventions through application of the IOOS Compliance Checker,
- and it is easily extendable for any `cc-plugin` and for projects defining CORDEX or CMIP style CMOR-tables.
+ ### Currently supported checkers
+
+ While `esgf-qa` has been primarily developed for workflows assessing compliance with WCRP project data specifications
+ (e.g., CMIP, CORDEX), it can also be used for general CF-compliance testing and easily extended to support any
+ `cc-plugin` and projects following CORDEX- or CMIP-style CMOR table conventions.

  | Standard | Checker Name |
  | -------- | ------------ |
- | [cordex-cmip6-cv](https://github.com/WCRP-CORDEX/cordex-cmip6-cv) | wcrp_cordex_cmip6, cc6 |
- | [cordex-cmip6-cmor-tables](https://github.com/WCRP-CORDEX/cordex-cmip6-cmor-tables) | wcrp_cordex_cmip6, cc6 |
- | [CORDEX-CMIP6 Archive Specifications](https://doi.org/10.5281/zenodo.10961069) | wcrp_cordex_cmip6, cc6 |
- | [CMIP6 DRS](https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf) | wcrp_cmip6 / plugin_cmip6 |
- | [cmip6-cmor-tables](https://github.com/PCMDI/cmip6-cmor-tables) (esgvoc) | wcrp_cmip6 / plugin_cmip6 |
- | [CMIP6 CVs](https://github.com/WCRP-CMIP/CMIP6_CVs) (esgvoc) | wcrp_cmip6 / plugin_cmip6 |
- | [EERIE CMOR Tables & CV](https://github.com/eerie-project/dreq_tools) | eerie |
- | Custom MIP | mip |
+ | [CF Conventions](https://cfconventions.org/) (shipped with [ioos/compliance-checker](https://github.com/ioos/compliance-checker)) | cf |
+ | [WCRP CMIP6](https://pcmdi.llnl.gov/CMIP6/):<br><ul><li>[CMIP6 DRS](https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf)</li><li>[CMIP6 CVs](https://github.com/WCRP-CMIP/CMIP6_CVs) (esgvoc)</li></li><li>[cmip6-cmor-tables](https://github.com/PCMDI/cmip6-cmor-tables) (esgvoc)</li></ul> | wcrp_cmip6 |
+ | [WCRP CORDEX-CMIP6](https://cordex.org/):<br><ul><li>[CORDEX-CMIP6 Archive Specifications](https://doi.org/10.5281/zenodo.10961069)</li><li>[cordex-cmip6-cv](https://github.com/WCRP-CORDEX/cordex-cmip6-cv) (esgvoc)</li><li>[cordex-cmip6-cmor-tables](https://github.com/WCRP-CORDEX/cordex-cmip6-cmor-tables) (esgvoc)</li></ul> | wcrp_cordex_cmip6 |
+ | [WCRP CORDEX-CMIP6](https://cordex.org/):<br><ul><li>[CORDEX-CMIP6 Archive Specifications](https://doi.org/10.5281/zenodo.10961069)</li><li>[cordex-cmip6-cv](https://github.com/WCRP-CORDEX/cordex-cmip6-cv)</li><li>[cordex-cmip6-cmor-tables](https://github.com/WCRP-CORDEX/cordex-cmip6-cmor-tables)</li></ul> | cc6 |
+ | [EERIE](https://eerie-project.eu/):<br>[EERIE CMOR Tables & CV](https://github.com/eerie-project/dreq_tools) | eerie |
+ | Custom MIP (CMOR/MIP tables have to be specified) | mip |

  ## Installation

@@ -109,11 +112,16 @@ esgvoc install

  - Test your installation

- The following command should now also list the `esgf-qc` checks next to all `cc_plugin_cc6` and `compliance_checker` checks:
+ The following command should now also list the `cc-plugin-wcrp` checks next to all `cc_plugin_cc6` and `compliance_checker` checks:
  ```
  cchecker.py -l
  ```

+ The following command should now list the necessary projects with metadata sources for `esgvoc`:
+ ```
+ esgvoc status
+ ```
+
  ## Usage

  ```shell
@@ -125,10 +133,10 @@ $ esgqa [-h] [-o <OUTPUT_DIR>] [-t <TEST>] [-O OPTION] [-i <INFO>] [-r] [-C] <pa
  - options:
  - `-h, --help`: show this help message and exit
  - `-o, --output_dir OUTPUT_DIR`: Directory to store QA results. Needs to be non-existing or empty or from previous QA run. If not specified, will store results in `./cc-qa-check-results/YYYYMMDD-HHmm_<hash>`.
- - `-t, --test TEST`: The test to run ('wcrp_cmip6:latest', 'wcrp_cordex_cmip6':latest' or 'cf:<version>', can be specified multiple times, eg.: '-t wcrp_cmip6:latest -t cf:1.7') - default: running latest CF checks 'cf:latest'.
- - `-O, --option OPTION`: Additional options to be passed to the checkers. Format: '<checker>:<option_name>[:<option_value>]'. Multiple invocations possible.
+ - `-t, --test TEST`: The test to run (`'wcrp_cmip6:latest'`, `'wcrp_cordex_cmip6:latest'` or `'cf:<version>'`, can be specified multiple times, eg.: `'-t wcrp_cmip6:latest -t cf:1.7'`) - default: running latest CF checks `'cf:latest'`.
+ - `-O, --option OPTION`: Additional options to be passed to the checkers. Format: `'<checker>:<option_name>[:<option_value>]'`. Multiple invocations possible.
  - `-i, --info INFO`: Information used to tag the QA results, eg. the simulation id to identify the checked run. Suggested is the original experiment-id you gave the run.
- - `-r, --resume`: Specify to continue a previous QC run. Requires the <output_dir> argument to be set.
+ - `-r, --resume`: Specify to continue a previous QC run. Requires the `<output_dir>` argument to be set.
  - `-C, --include_consistency_checks`: Include basic consistency and continuity checks. When using the `wcrp-*`, `cc6`, `mip` or `eerie` checkers, they are included by default.

  ### Example Usage
@@ -137,7 +145,7 @@ $ esgqa [-h] [-o <OUTPUT_DIR>] [-t <TEST>] [-O OPTION] [-i <INFO>] [-r] [-C] <pa
  $ esgqa -t wcrp_cordex_cmip6:latest -t cf:1.11 -o QA_results/IAEVALL02_2025-10-20 -i "IAEVALL02" ESGF_Buff/IAEVALL02/CORDEX-CMIP6
  ```

- To resume at a later date, eg. if the QA run did not finish in time or more files have been added to the <parent_dir>
+ To resume at a later date, eg. if the QA run did not finish in time or more files have been added to the `<parent_dir>`
  (note, that the last modification date of files is NOT taken into account - once a certain file path has been checked
  it will be marked as checked and checks will only be repeated if runtime errors occured):

@@ -164,8 +172,13 @@ The results will be stored in two `json` files:

  ### Web view
  The clustered results can be viewed using the following website:
- [https://cmiphub.dkrz.de/info/display_qc_results.html](https://cmiphub.dkrz.de/info/display_qc_results.html).
+
+ - DKRZ: [https://cmiphub.dkrz.de/info/display_qc_results.html](https://cmiphub.dkrz.de/info/display_qc_results.html).
+ - IPSL: coming soon
+
  This website runs entirely in the user's browser using JavaScript, without requiring interaction with a web server.
+ You can select one of the recent QA runs conducted at the respective site or select a local QA run result file to be displayed.
+
  Alternatively, you can open the included `display_qc_results.html` file directly in your browser.
  While the web view also supports the full (unclustered) results, it is recommended to not use the web view for files greater than a few MegaBytes.

@@ -188,8 +201,9 @@ in the GitLab Repository [qa-results](https://gitlab.dkrz.de/udag/qa-results). Y

  This project is licensed under the Apache License 2.0, and includes the Inter font, which is licensed under the SIL Open Font License 1.1. See the [LICENSE](./LICENSE) file for more details.

+
  > [!NOTE]
- > **This project was originally developed by [DKRZ](https://www.dkrz.de)** under the name **cc-qa** (see [DKRZ GitLab](https://gitlab.dkrz.de/udag/cc-qa)), with funding from the German Ministry of Research, Technology and Space ([BMFTR](https://www.bmftr.bund.de/en), reference `01LP2326E`).
+ > **This project was originally developed by [DKRZ](https://www.dkrz.de)** under the name **cc-qa** (see [DKRZ GitLab](https://gitlab.dkrz.de/udag/cc-qa)), with funding from the _German Ministry of Research, Technology and Space_ ([BMFTR](https://www.bmftr.bund.de/en), reference `01LP2326E`).
  > It has since been renamed to **esgf-qa** and is now maintained under the **Earth System Grid Federation (ESGF)** organization on GitHub.
  >
  > If you previously used `cc-qa`, please update your installations as described above.
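Pulling the README changes above together, the documented workflow looks roughly as follows. This is only an illustrative sketch, not part of the package diff: the dataset path and the run label `MYRUN01` are placeholders, and installing from PyPI via `pip install esgf-qa` is an assumption here, since the README's installation section is not included in the hunks above.

```shell
# Sketch of the workflow described in the README above (placeholders marked).
pip install esgf-qa    # assumed install route; the package is published on PyPI (see badge)
esgvoc install         # fetch controlled-vocabulary metadata (context line of hunk @@ -109,11 +112,16 @@)
esgvoc status          # new in 0.4.0: list the required projects and their metadata sources
cchecker.py -l         # should list the cc-plugin-wcrp checks next to cc_plugin_cc6 and compliance_checker
# Run the QA; path and label are placeholders:
esgqa -t wcrp_cmip6:latest -t cf:1.7 -o QA_results/MYRUN01 -i "MYRUN01" /path/to/parent_dir
```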
esgf_qa-0.4.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+ esgf_qa/__init__.py,sha256=iLmy2rOkHS_4KZWMD8BgT7R3tLMKeaTCDVf3B4FyYxM,91
+ esgf_qa/_constants.py,sha256=CKWF9DCe3cen0Wp10yuKC49n8yQ8Ge5jPavd-JiJVro,2783
+ esgf_qa/_version.py,sha256=2_0GUP7yBCXRus-qiJKxQD62z172WSs1sQ6DVpPsbmM,704
+ esgf_qa/cluster_results.py,sha256=cy-Qc3SRbrYA6QGU_ROgum1Fxmd4wspPVvsJ6m5mVo0,19047
+ esgf_qa/con_checks.py,sha256=BAqbDcEmDB1kiRBaSaB76mNfKxoHtTNWqJHbtALcpIg,29074
+ esgf_qa/qaviewer.py,sha256=myt9lq47E40sD7KrMjVcAvy8sqocVinBSUYf4nOPD80,8843
+ esgf_qa/run_qa.py,sha256=VXuYWBHCzYN4Cjv80HOlCSyLKgj3tceB7qf0fAkuH6g,41724
+ esgf_qa-0.4.0.dist-info/licenses/LICENSE,sha256=S1WmzAIRoXFV26FENC3SW_XsmvkGtCs-4_gm7PrPYWg,12636
+ tests/test_cli.py,sha256=OcJ1Pq5l5vKnPP96r3_mBWyn3hWFQ7p7Xb7YJp4tAms,10821
+ tests/test_cluster_results.py,sha256=ahwtG6666mP7VdVxHwPy7I8vV9rPxl2VRPdnH8VQk-w,5894
+ tests/test_con_checks.py,sha256=VCj_0jt_fbBqo_VWCrpHMHPs9IWxb5PtJs6Yh1jrxxU,8853
+ tests/test_qaviewer.py,sha256=ZEH7LkPIl3ocV0Xk4D4Zv6VIH9397hB71FtXLeo7NwY,4635
+ tests/test_run_dummy_qa.py,sha256=6pIQkvzP8c-mKynk3n19UvZAhvsPMpnu32YznWFDB2k,6213
+ tests/test_run_qa.py,sha256=DUi7KpgpL80b9pL6XP4uFAw-8b0YqhMcwCixS4z8ZEI,6128
+ esgf_qa-0.4.0.dist-info/METADATA,sha256=anvxnx7EeAxbM5M7zp6MHKguY6wiecGsrlelTTyFmZk,11057
+ esgf_qa-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ esgf_qa-0.4.0.dist-info/entry_points.txt,sha256=ZGMG_3eS7nyUJE6ZJ9v23Thcf-r29ZSZ7e8voBVwbf4,82
+ esgf_qa-0.4.0.dist-info/top_level.txt,sha256=BtbDH91jFtWygUPsLIr1g5CKU7Jmp4K-CU8yzCaONt0,14
+ esgf_qa-0.4.0.dist-info/RECORD,,
@@ -1,2 +1,2 @@
- docs
  esgf_qa
+ tests
tests/test_cli.py ADDED
@@ -0,0 +1,267 @@
+ import json
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ from pathlib import Path
+
+ import numpy as np
+ import pytest
+ import xarray as xr
+
+
+ class TestQACommandLine:
+     """
+     End-to-end pytest test class for esgqa CLI using synthetic CMIP6 and CORDEX-CMIP6 data.
+     """
+
+     @classmethod
+     def setup_class(cls):
+         """
+         Generate lightweight synthetic CMIP6 and CORDEX-CMIP6 test datasets.
+         """
+         cls.test_data_dir = tempfile.mkdtemp(prefix="esgf_qa_testdata_")
+         cls.cmip6_dir = os.path.join(cls.test_data_dir, "cmip6")
+         cls.cordex_dir = os.path.join(cls.test_data_dir, "cordex_cmip6")
+         cls.custom_dir = os.path.join(cls.test_data_dir, "custom")
+         os.makedirs(cls.cmip6_dir, exist_ok=True)
+         os.makedirs(cls.cordex_dir, exist_ok=True)
+
+         # Generate lightweight CMIP6 test data
+         for var in ["tas", "huss"]:
+             base_path = (
+                 Path(cls.cmip6_dir)
+                 / f"MPI-ESM1-2-LR/historical/r1i1p1f1/Amon/{var}/gn/v20210215"
+             )
+             base_path.mkdir(parents=True, exist_ok=True)
+             for start_year in [1850, 1855]:
+                 ntime = 60  # 5 years monthly data
+                 times = np.array(np.arange(ntime), dtype=np.float64)
+                 lats = np.arange(-90, 91, 10)
+                 lons = np.arange(0, 360, 10)
+                 data = np.zeros((len(times), len(lats), len(lons)))
+                 ds = xr.Dataset(
+                     {var: (("time", "lat", "lon"), data)},
+                     coords={"time": times, "lat": lats, "lon": lons},
+                 )
+                 file_name = f"{var}_Amon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_{start_year:04d}01-"
+                 file_name += f"{start_year+4:04d}12.nc"
+                 ds.to_netcdf(base_path / file_name)
+
+         # Generate lightweight CORDEX-CMIP6 test data
+         for var in ["ta600", "tas"]:
+             base_path = (
+                 Path(cls.cordex_dir)
+                 / f"DD/EUR-12/CLMcom-DWD/MPI-ESM1-2-HR/historical/r1i1p1f1/ICON-CLM-202407-1-1/v1-r1/mon/{var}/v20240920"
+             )
+             base_path.mkdir(parents=True, exist_ok=True)
+             for start_year, end_year in [(1950, 1950), (1951, 1960)]:
+                 ntime = (end_year - start_year + 1) * 12
+                 times = np.array(np.arange(ntime), dtype=np.float64)
+                 rlat = np.arange(0, 41, 10)
+                 rlon = np.arange(0, 41, 10)
+                 data = np.zeros((len(times), len(rlat), len(rlon)))
+                 ds = xr.Dataset(
+                     {var: (("time", "rlat", "rlon"), data)},
+                     coords={"time": times, "rlat": rlat, "rlon": rlon},
+                 )
+                 file_name = f"{var}_EUR-12_MPI-ESM1-2-HR_historical_r1i1p1f1_CLMcom-DWD_ICON-CLM-202407-1-1_v1-r1_mon_{start_year:04d}01-{end_year:04d}12.nc"
+                 ds.to_netcdf(base_path / file_name)
+
+         # Generate lightweight custom data
+         for var in ["temp2", "huss"]:
+             base_path = Path(cls.custom_dir) / "model_output"
+             base_path.mkdir(parents=True, exist_ok=True)
+             for start_year in range(1850, 1860):
+                 times = np.arange(0, 12)  # 1 years monthly data
+                 lats = np.arange(-90, 91, 10)
+                 lons = np.arange(0, 360, 10)
+                 data = np.zeros((len(times), len(lats), len(lons)))
+                 ds = xr.Dataset(
+                     {var: (("time", "lat", "lon"), data)},
+                     coords={"time": times, "lat": lats, "lon": lons},
+                 )
+                 file_name = f"{var}_Amon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_{start_year:04d}01-"
+                 file_name += f"{start_year+4:04d}12.nc"
+                 ds.to_netcdf(base_path / file_name)
+
+     @classmethod
+     def teardown_class(cls):
+         """Clean up temporary test data."""
+         shutil.rmtree(cls.test_data_dir)
+
+     def _run_cli(self, args, expect_error=False, expected_err_msg=None):
+         """Run the esgqa CLI and optionally check for errors."""
+         cmd = ["python", "-m", "esgf_qa.run_qa"] + args
+         result = subprocess.run(cmd, capture_output=True, text=True)
+         if expect_error:
+             assert (
+                 result.returncode != 0
+             ), f"Expected error but CLI succeeded:\n{result.stdout}\n{result.stderr}"
+             if expected_err_msg:
+                 combined = result.stdout + "\n" + result.stderr
+                 assert (
+                     expected_err_msg in combined
+                 ), f"Expected error message '{expected_err_msg}' not found.\nOutput:\n{combined}"
+         else:
+             assert (
+                 result.returncode == 0
+             ), f"CLI failed unexpectedly:\n{result.stdout}\n{result.stderr}"
+         return result.stdout, result.stderr
+
+     @pytest.mark.parametrize(
+         "test_args",
+         [
+             ["-t", "cc6:latest", "-o", "OUTPUT", "cmip6"],
+             ["-t", "cc6", "-o", "OUTPUT", "cordex_cmip6"],
+             ["-t", "cc6:latest", "-t", "cf", "-o", "OUTPUT", "cordex_cmip6"],
+             ["-t", "cf:latest", "-o", "OUTPUT", "cmip6"],
+             ["-t", "cf:1.7", "-C", "-o", "OUTPUT", "cmip6"],
+             [
+                 "-t",
+                 "wcrp_cmip6:latest",
+                 "-t",
+                 "cf:1.7",
+                 "-o",
+                 "OUTPUT",
+                 "cmip6",
+                 "-i",
+                 "test_info",
+             ],
+             [
+                 "-t",
+                 "wcrp_cordex_cmip6",
+                 "-t",
+                 "cf:1.7",
+                 "-o",
+                 "OUTPUT",
+                 "cordex_cmip6",
+                 "-i",
+                 "test_info",
+             ],
+         ],
+     )
+     def test_cli_runs_successfully(self, test_args, tmp_path):
+         temp_dir = tempfile.mkdtemp()
+         try:
+             result_dir = tmp_path / "results"
+             args = [
+                 (
+                     os.path.join(self.test_data_dir, "cmip6")
+                     if arg == "cmip6"
+                     else (
+                         os.path.join(self.test_data_dir, "cordex_cmip6")
+                         if arg == "cordex_cmip6"
+                         else arg.replace("OUTPUT", str(result_dir))
+                     )
+                 )
+                 for arg in test_args
+             ]
+             stdout, stderr = self._run_cli(args)
+             output_dir_index = args.index("-o") + 1
+             result_dir = args[output_dir_index]
+             result_files = os.listdir(result_dir)
+             assert any(
+                 f.startswith("qa_result_") and f.endswith(".json") for f in result_files
+             )
+
+             # Check clustered summary if exists
+             clustered_files = [
+                 f for f in result_files if "clustered" in f and f.endswith(".json")
+             ]
+             for cf in clustered_files:
+                 with open(os.path.join(result_dir, cf)) as f:
+                     data = json.load(f)
+                 for key in ["error", "fail", "info"]:
+                     assert key in data
+                 info = data["info"]
+                 for field in [
+                     "id",
+                     "date",
+                     "files",
+                     "datasets",
+                     "cc_version",
+                     "checkers",
+                 ]:
+                     assert field in info
+                 for sev_dict in [data["fail"], data["error"]]:
+                     for _, issues in sev_dict.items():
+                         for issue_name, messages in issues.items():
+                             for msg, files in messages.items():
+                                 assert isinstance(files, list)
+                                 assert (
+                                     len(files) == 1
+                                 ), f"Clustered summary should have one example file for {msg}"
+                                 assert isinstance(files[0], str)
+         finally:
+             shutil.rmtree(temp_dir)
+
+     def test_cli_resume_functionality(self):
+         temp_dir = tempfile.mkdtemp()
+         output_dir = os.path.join(temp_dir, "output")
+         os.makedirs(output_dir, exist_ok=True)
+         resume_file = os.path.join(output_dir, ".resume_info")
+         Path(os.path.join(output_dir, "progress.txt")).touch()
+         Path(os.path.join(output_dir, "progress_datasets.txt")).touch()
+         os.makedirs(os.path.join(output_dir, "tables"), exist_ok=True)
+         with open(resume_file, "w") as f:
+             json.dump(
+                 {
+                     "parent_dir": self.cmip6_dir,
+                     "info": "test_resume",
+                     "tests": ["cf:latest"],
+                 },
+                 f,
+             )
+         stdout, stderr = self._run_cli(["-r", "-o", output_dir])
+         assert "Resuming previous QA run" in stdout
+         shutil.rmtree(temp_dir)
+
+     @pytest.mark.parametrize(
+         "test_args, expected_err_msg",
+         [
+             (
+                 ["-t", "cf:latest", "-o", "some_dir"],
+                 "Missing required argument <parent_dir>",
+             ),
+             (
+                 ["-t", "invalid_checker:latest", "-o", "some_dir", "cmip6"],
+                 "Invalid test(s) specified",
+             ),
+             (
+                 ["-r", "-t", "cf:latest", "-o", "some_dir"],
+                 "When using -r/--resume, only -o/--output_dir and -i/--info can be set",
+             ),
+         ],
+     )
+     def test_cli_fails_on_invalid_arguments(self, test_args, expected_err_msg):
+         temp_dir = tempfile.mkdtemp()
+         try:
+             args = [arg if arg != "cmip6" else self.cmip6_dir for arg in test_args]
+             self._run_cli(args, expect_error=True, expected_err_msg=expected_err_msg)
+         finally:
+             shutil.rmtree(temp_dir)
+
+     def test_cli_produces_valid_json(self):
+         temp_dir = Path(tempfile.mkdtemp())
+         try:
+             output_dir = temp_dir / "output"
+             output_dir.mkdir()
+             self._run_cli(
+                 ["-t", "cf:latest", "-o", str(output_dir), str(self.cmip6_dir)]
+             )
+             json_files = list(output_dir.glob("*.json"))
+             assert len(json_files) == 2
+             with open(json_files[0]) as f:
+                 data = json.load(f)
+             # "info" is the only required field
+             assert "info" in data
+             # "error" and "fail" are optional, others are not allowed
+             assert all([key in ["fail", "info", "error"] for key in data])
+             info = data["info"]
+             for field in ["id", "date", "files", "datasets", "cc_version", "checkers"]:
+                 assert field in info
+             assert isinstance(data.get("error", {}), dict)
+             assert isinstance(data.get("fail", {}), dict)
+         finally:
+             shutil.rmtree(temp_dir)
tests/test_cluster_results.py ADDED
@@ -0,0 +1,166 @@
+ from collections import defaultdict
+
+ import pytest
+
+ import esgf_qa.cluster_results as esgqacr
+ from esgf_qa.cluster_results import QAResultAggregator
+
+
+ @pytest.fixture(autouse=True)
+ def patch_checker_dicts(monkeypatch):
+     """
+     Patch module-level checker_dict and checker_dict_ext
+     to avoid dependency on real ESGF constants.
+     """
+     mock_checker_dict = {"cf": "CF", "cc6": "C-C6"}
+     mock_checker_dict_ext = {"cf": "CF-EXT", "cc6": "C-C6-EXT"}
+     monkeypatch.setattr(esgqacr, "checker_dict", mock_checker_dict)
+     monkeypatch.setattr(esgqacr, "checker_dict_ext", mock_checker_dict_ext)
+     yield
+
+
+ @pytest.fixture
+ def aggregator():
+     """Provide a fresh aggregator instance."""
+     return QAResultAggregator()
+
+
+ def test_initial_summary_structure(aggregator):
+     """Ensure the summary structure initializes correctly."""
+     assert "error" in aggregator.summary
+     assert "fail" in aggregator.summary
+     assert isinstance(aggregator.summary["fail"], defaultdict)
+
+
+ def test_update_adds_fail_entries(aggregator):
+     """Verify that a failed test adds entries to the summary."""
+     result_dict = {
+         "cf": {
+             "check_units": {
+                 "value": (0, 1),
+                 "weight": 2,
+                 "msgs": ["Missing attribute 'units'"],
+             }
+         }
+     }
+
+     aggregator.update(result_dict, dsid="ds1", file_name="file1.nc")
+
+     fail_summary = aggregator.summary["fail"]
+     assert 2 in fail_summary
+     test_name = "[CF] check_units"
+     assert test_name in fail_summary[2]
+     assert "Missing attribute 'units'" in fail_summary[2][test_name]
+     assert "ds1" in fail_summary[2][test_name]["Missing attribute 'units'"]
+     assert "file1.nc" in fail_summary[2][test_name]["Missing attribute 'units'"]["ds1"]
+
+
+ def test_update_adds_error_entries(aggregator):
+     """Verify that an error test adds entries to the summary."""
+     result_dict = {"cf": {"errors": {"test_func": "Some internal error"}}}
+
+     aggregator.update(result_dict, dsid="dsX", file_name="fX.nc")
+
+     error_summary = aggregator.summary["error"]
+     assert "[CF] test_func" in error_summary
+     assert "Some internal error" in error_summary["[CF] test_func"]
+     assert "dsX" in error_summary["[CF] test_func"]["Some internal error"]
+     assert "fX.nc" in error_summary["[CF] test_func"]["Some internal error"]["dsX"]
+
+
+ def test_update_ds_uses_checker_dict_ext(aggregator):
+     """Ensure update_ds uses checker_dict_ext for extended checkers."""
+     result_dict = {
+         "cf": {
+             "errors": {
+                 "check1": {"msg": "Something broke", "files": ["fileA.nc", "fileB.nc"]}
+             },
+             "test2": {"weight": 3, "msgs": {"Bad value": ["fileC.nc"]}},
+         }
+     }
+
+     aggregator.update_ds(result_dict, dsid="dataset_42")
+
+     error_summary = aggregator.summary["error"]
+     fail_summary = aggregator.summary["fail"]
+
+     # Check both sections populated and use extended prefix
+     assert any("[CF-EXT]" in key for key in error_summary.keys())
+     assert any("[CF-EXT]" in key for key in fail_summary[3].keys())
+
+
+ def test_sort_orders_failures_by_weight(aggregator):
+     """Check that sorting produces a descending order by weight."""
+     aggregator.summary["fail"][1]["[CF] test1"] = {}
+     aggregator.summary["fail"][5]["[CF] test5"] = {}
+     aggregator.sort()
+     weights = list(aggregator.summary["fail"].keys())
+     assert weights == sorted(weights, reverse=True)
+
+
+ def test_cluster_messages_basic():
+     """Cluster messages with small differences using threshold."""
+     messages = [
+         "Missing value for var1",
+         "Missing value for var2",
+         "Completely different",
+     ]
+     clusters = QAResultAggregator.cluster_messages(messages[:], threshold=0.8)
+
+     # Expect two clusters: similar ones together
+     assert len(clusters) == 2
+     assert any("var1" in msg or "var2" in msg for msg in clusters[0])
+
+
+ def test_generalize_message_group_single():
+     """If there is one message, return it unchanged."""
+     msg, placeholders = QAResultAggregator.generalize_message_group(["Missing X"])
+     assert msg == "Missing X"
+     assert placeholders == {}
+
+
+ def test_generalize_message_group_multiple():
+     """Generalization should replace differing tokens with placeholders."""
+     msgs = ["Missing variable A", "Missing variable B"]
+     generalized, placeholders = QAResultAggregator.generalize_message_group(msgs)
+     assert "Missing variable" in generalized
+     assert "{" in generalized
+     assert isinstance(placeholders, dict)
+     assert list(placeholders.keys())  # at least one placeholder
+
+
+ def test_merge_placeholders_merges_close():
+     """Test merging adjacent placeholders."""
+     tokens = ["{A}", "-", "{B}"]
+     dictionary = {"A": "foo", "B": "bar"}
+     merged_tokens, merged_dict = QAResultAggregator.merge_placeholders(
+         tokens, dictionary
+     )
+     # The placeholders should merge since only one char between them
+     assert len(merged_dict) <= 1
+     assert "{" in merged_tokens[0]
+
+
+ def test_cluster_summary_produces_clustered_summary(aggregator):
+     """Integration-like test for cluster_summary on simple data."""
+     result_dict = {
+         "cf": {
+             "check_attrs": {
+                 "value": (0, 1),
+                 "weight": 3,
+                 "msgs": ["Missing attr 'long_name'", "Missing attr 'standard_name'"],
+             }
+         }
+     }
+     aggregator.update(result_dict, dsid="ds1", file_name="file1.nc")
+     aggregator.sort()
+     aggregator.cluster_summary(threshold=0.7)
+     clustered = aggregator.clustered_summary["fail"]
+
+     # should contain weight 3 and a generalized message
+     assert 3 in clustered
+     test_name = next(iter(clustered[3].keys()))
+     assert "[CF]" in test_name
+     # at least one generalized message with "Missing attr"
+     found_msg_keys = list(clustered[3]["[CF] check_attrs"].keys())
+     assert any("Missing attr" in k for k in found_msg_keys)
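
The new wheel also ships its test suite as a top-level `tests` package (see the RECORD and the `docs` → `tests` change in `top_level.txt` above) and adds `pytest-asyncio` to the `dev` extra. A minimal sketch for running the two modules shown above, assuming a checkout or unpacked wheel where the `tests/` directory and the test dependencies (`pytest`, `numpy`, `xarray`) are available:

```shell
# Illustrative only; module names are taken from the RECORD listed above.
pip install "esgf-qa[dev]"   # the dev extra now also pulls in pytest-asyncio
pytest tests/test_cli.py tests/test_cluster_results.py
```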