climate_ref-0.5.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (44)
  1. climate_ref/__init__.py +30 -0
  2. climate_ref/_config_helpers.py +214 -0
  3. climate_ref/alembic.ini +114 -0
  4. climate_ref/cli/__init__.py +138 -0
  5. climate_ref/cli/_utils.py +68 -0
  6. climate_ref/cli/config.py +28 -0
  7. climate_ref/cli/datasets.py +205 -0
  8. climate_ref/cli/executions.py +201 -0
  9. climate_ref/cli/providers.py +84 -0
  10. climate_ref/cli/solve.py +23 -0
  11. climate_ref/config.py +475 -0
  12. climate_ref/constants.py +8 -0
  13. climate_ref/database.py +223 -0
  14. climate_ref/dataset_registry/obs4ref_reference.txt +2 -0
  15. climate_ref/dataset_registry/sample_data.txt +60 -0
  16. climate_ref/datasets/__init__.py +40 -0
  17. climate_ref/datasets/base.py +214 -0
  18. climate_ref/datasets/cmip6.py +202 -0
  19. climate_ref/datasets/obs4mips.py +224 -0
  20. climate_ref/datasets/pmp_climatology.py +15 -0
  21. climate_ref/datasets/utils.py +16 -0
  22. climate_ref/executor/__init__.py +274 -0
  23. climate_ref/executor/local.py +89 -0
  24. climate_ref/migrations/README +22 -0
  25. climate_ref/migrations/env.py +139 -0
  26. climate_ref/migrations/script.py.mako +26 -0
  27. climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +292 -0
  28. climate_ref/models/__init__.py +33 -0
  29. climate_ref/models/base.py +42 -0
  30. climate_ref/models/dataset.py +206 -0
  31. climate_ref/models/diagnostic.py +61 -0
  32. climate_ref/models/execution.py +306 -0
  33. climate_ref/models/metric_value.py +195 -0
  34. climate_ref/models/provider.py +39 -0
  35. climate_ref/provider_registry.py +146 -0
  36. climate_ref/py.typed +0 -0
  37. climate_ref/solver.py +395 -0
  38. climate_ref/testing.py +109 -0
  39. climate_ref-0.5.0.dist-info/METADATA +97 -0
  40. climate_ref-0.5.0.dist-info/RECORD +44 -0
  41. climate_ref-0.5.0.dist-info/WHEEL +4 -0
  42. climate_ref-0.5.0.dist-info/entry_points.txt +2 -0
  43. climate_ref-0.5.0.dist-info/licenses/LICENCE +201 -0
  44. climate_ref-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref/cli/datasets.py
@@ -0,0 +1,205 @@
+ """
+ View and ingest input datasets
+ """
+
+ import errno
+ import os
+ import shutil
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+ from loguru import logger
+ from rich.console import Console
+
+ from climate_ref.cli._utils import pretty_print_df
+ from climate_ref.datasets import get_dataset_adapter
+ from climate_ref.models import Dataset
+ from climate_ref.provider_registry import ProviderRegistry
+ from climate_ref.solver import solve_required_executions
+ from climate_ref.testing import fetch_sample_data
+ from climate_ref_core.dataset_registry import dataset_registry_manager, fetch_all_files
+ from climate_ref_core.datasets import SourceDatasetType
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command(name="list")
+ def list_(
+     ctx: typer.Context,
+     source_type: Annotated[
+         SourceDatasetType, typer.Option(help="Type of source dataset")
+     ] = SourceDatasetType.CMIP6.value,  # type: ignore
+     column: Annotated[list[str] | None, typer.Option()] = None,
+     include_files: bool = typer.Option(False, help="Include files in the output"),
+     limit: int = typer.Option(100, help="Limit the number of rows to display"),
+ ) -> None:
+     """
+     List the datasets that have been ingested
+
+     The data catalog is sorted by the date that the dataset was ingested (first = newest).
+     """
+     database = ctx.obj.database
+
+     adapter = get_dataset_adapter(source_type.value)
+     data_catalog = adapter.load_catalog(database, include_files=include_files, limit=limit)
+
+     if column:
+         missing = set(column) - set(data_catalog.columns)
+         if missing:
+
+             def format_(columns: Iterable[str]) -> str:
+                 return ", ".join(f"'{c}'" for c in sorted(columns))
+
+             logger.error(
+                 f"Column{'s' if len(missing) > 1 else ''} "
+                 f"{format_(missing)} not found in data catalog. "
+                 f"Choose from: {format_(data_catalog.columns)}"
+             )
+             raise typer.Exit(code=1)
+         data_catalog = data_catalog[column].sort_values(by=column)
+
+     pretty_print_df(data_catalog, console=console)
+
+
+ @app.command()
+ def list_columns(
+     ctx: typer.Context,
+     source_type: Annotated[
+         SourceDatasetType, typer.Option(help="Type of source dataset")
+     ] = SourceDatasetType.CMIP6.value,  # type: ignore
+     include_files: bool = typer.Option(False, help="Include files in the output"),
+ ) -> None:
+     """
+     List the columns available in the data catalog
+
+     The available columns depend on the source dataset type,
+     and file-level columns are included when --include-files is set.
+     """
+     database = ctx.obj.database
+
+     adapter = get_dataset_adapter(source_type.value)
+     data_catalog = adapter.load_catalog(database, include_files=include_files)
+
+     for column in sorted(data_catalog.columns.to_list()):
+         print(column)
+
+
+ @app.command()
+ def ingest(  # noqa: PLR0913
+     ctx: typer.Context,
+     file_or_directory: Path,
+     source_type: Annotated[SourceDatasetType, typer.Option(help="Type of source dataset")],
+     solve: Annotated[bool, typer.Option(help="Solve for new diagnostic executions after ingestion")] = False,
+     dry_run: Annotated[bool, typer.Option(help="Do not ingest datasets into the database")] = False,
+     n_jobs: Annotated[int | None, typer.Option(help="Number of jobs to run in parallel")] = None,
+     skip_invalid: Annotated[
+         bool, typer.Option(help="Ignore (but log) any datasets that don't pass validation")
+     ] = False,
+ ) -> None:
+     """
+     Ingest a dataset
+
+     This will register a dataset in the database to be used for diagnostics calculations.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+
+     file_or_directory = Path(file_or_directory).expanduser()
+     logger.info(f"ingesting {file_or_directory}")
+
+     kwargs = {}
+
+     if n_jobs is not None:
+         kwargs["n_jobs"] = n_jobs
+
+     adapter = get_dataset_adapter(source_type.value, **kwargs)
+
+     # Create a data catalog from the specified file or directory
+     if not file_or_directory.exists():
+         logger.error(f"File or directory {file_or_directory} does not exist")
+         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_or_directory)
+
+     data_catalog = adapter.find_local_datasets(file_or_directory)
+     data_catalog = adapter.validate_data_catalog(data_catalog, skip_invalid=skip_invalid)
+
+     logger.info(
+         f"Found {len(data_catalog)} files for {len(data_catalog[adapter.slug_column].unique())} datasets"
+     )
+     pretty_print_df(adapter.pretty_subset(data_catalog), console=console)
+
+     for instance_id, data_catalog_dataset in data_catalog.groupby(adapter.slug_column):
+         logger.info(f"Processing dataset {instance_id}")
+
+         if dry_run:
+             dataset = db.session.query(Dataset).filter_by(slug=instance_id, dataset_type=source_type).first()
+             if not dataset:
+                 logger.info(f"Would save dataset {instance_id} to the database")
+                 continue
+         else:
+             with db.session.begin():
+                 adapter.register_dataset(config, db, data_catalog_dataset)
+
+     if solve:
+         solve_required_executions(
+             config=config,
+             db=db,
+             dry_run=dry_run,
+         )
+
+
+ @app.command(name="fetch-sample-data")
+ def _fetch_sample_data(
+     force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
+     symlink: Annotated[
+         bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
+     ] = False,
+ ) -> None:
+     """
+     Fetch the sample data for the given version.
+
+     These data will be written into the test data directory.
+     This operation may fail if the test data directory does not exist,
+     as is the case for non-source-based installations.
+     """
+     fetch_sample_data(force_cleanup=force_cleanup, symlink=symlink)
+
+
+ @app.command(name="fetch-data")
+ def fetch_data(
+     ctx: typer.Context,
+     registry: Annotated[str, typer.Option(help="Name of the data registry to use")],
+     output_directory: Annotated[
+         Path | None, typer.Option(help="Output directory where files will be saved")
+     ] = None,
+     force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
+     symlink: Annotated[
+         bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
+     ] = False,
+ ) -> None:
+     """
+     Fetch REF-specific datasets
+
+     These datasets have been verified to have open licenses
+     and are in the process of being added to Obs4MIPs.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+
+     # Setup the provider registry to register any dataset registries in the configured providers
+     ProviderRegistry.build_from_config(config, db)
+
+     if output_directory and force_cleanup and output_directory.exists():
+         logger.warning(f"Removing existing directory {output_directory}")
+         shutil.rmtree(output_directory)
+
+     try:
+         _registry = dataset_registry_manager[registry]
+     except KeyError:
+         logger.error(f"Registry {registry} not found")
+         logger.error(f"Available registries: {', '.join(dataset_registry_manager.keys())}")
+         raise typer.Exit(code=1)
+
+     fetch_all_files(_registry, registry, output_directory, symlink=symlink)
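These are plain Typer commands, so they can be exercised without the installed console script. A minimal sketch using Typer's test runner (assumptions: the package is importable, and `fetch-sample-data` is chosen here because, unlike the other commands, it does not need the database context carried on `ctx.obj`):

    from typer.testing import CliRunner

    from climate_ref.cli.datasets import app

    runner = CliRunner()
    # Symlink, rather than copy, the sample files into the test data directory
    result = runner.invoke(app, ["fetch-sample-data", "--symlink"])
    print(result.exit_code, result.output)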
climate_ref/cli/executions.py
@@ -0,0 +1,201 @@
+ """
+ View diagnostic executions
+ """
+
+ import pathlib
+ from typing import Annotated
+ from urllib.parse import quote
+
+ import pandas as pd
+ import typer
+ from loguru import logger
+ from rich.console import Console, Group
+ from rich.filesize import decimal
+ from rich.markup import escape
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich.tree import Tree
+
+ from climate_ref.cli._utils import df_to_table, pretty_print_df
+ from climate_ref.config import Config
+ from climate_ref.models import Execution, ExecutionGroup
+ from climate_ref.models.execution import get_execution_group_and_latest
+ from climate_ref_core.executor import EXECUTION_LOG_FILENAME
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command()
+ def list_groups(
+     ctx: typer.Context,
+     column: Annotated[list[str] | None, typer.Option()] = None,
+     limit: int = typer.Option(100, help="Limit the number of rows to display"),
+ ) -> None:
+     """
+     List the diagnostic execution groups that have been identified
+     """
+     session = ctx.obj.database.session
+
+     execution_groups_results = get_execution_group_and_latest(session).limit(limit).all()
+     execution_count = session.query(ExecutionGroup).count()
+
+     results_df = pd.DataFrame(
+         [
+             {
+                 "id": execution_groups.id,
+                 "key": execution_groups.key,
+                 "provider": execution_groups.diagnostic.provider.slug,
+                 "diagnostic": execution_groups.diagnostic.slug,
+                 "dirty": execution_groups.dirty,
+                 "successful": result.successful if result else None,
+                 "created_at": execution_groups.created_at,
+                 "updated_at": execution_groups.updated_at,
+             }
+             for execution_groups, result in execution_groups_results
+         ]
+     )
+
+     if column:
+         if not all(col in results_df.columns for col in column):
+             logger.error(f"Column not found in data catalog: {column}")
+             raise typer.Exit(code=1)
+         results_df = results_df[column]
+
+     pretty_print_df(results_df, console=console)
+     if execution_count > limit:
+         logger.warning(
+             f"Displaying {limit} of {execution_count} results. Use the `--limit` option to display more."
+         )
+
+
+ def walk_directory(directory: pathlib.Path, tree: Tree) -> None:
+     """Recursively build a Tree with directory contents."""
+     # Sort dirs first then by filename
+     paths = sorted(
+         pathlib.Path(directory).iterdir(),
+         key=lambda path: (path.is_file(), path.name.lower()),
+     )
+     for path in paths:
+         # Remove hidden files
+         if path.name.startswith("."):
+             continue
+         if path.is_dir():
+             style = "dim" if path.name.startswith("__") else ""
+             branch = tree.add(
+                 f"[bold magenta]:open_file_folder: [link file://{path}]{escape(path.name)}",
+                 style=style,
+                 guide_style=style,
+             )
+             walk_directory(path, branch)
+         else:
+             text_filename = Text(path.name, "green")
+             text_filename.highlight_regex(r"\..*$", "bold red")
+             text_filename.stylize(f"link file://{path}")
+             file_size = path.stat().st_size
+             text_filename.append(f" ({decimal(file_size)})", "blue")
+             tree.add(text_filename)
+
+
+ def _execution_panel(execution_group: ExecutionGroup) -> Panel:
+     if len(execution_group.executions) == 0:
+         result = None
+     else:
+         result = execution_group.executions[-1]
+
+     panel = Panel(
+         f"Key: [bold]{execution_group.key}[/]\n"
+         f"Diagnostic: [bold]{execution_group.diagnostic.slug}[/]\n"
+         f"Provider: [bold]{execution_group.diagnostic.provider.slug}[/]\n"
+         f"Dirty: [bold]{execution_group.dirty}[/]\n"
+         f"Successful: [bold]{result.successful if result else 'not-started'}[/]\n"
+         f"Created At: [bold]{execution_group.created_at}[/]\n"
+         f"Updated At: [bold]{execution_group.updated_at}[/]\n"
+         f"Number of attempted executions: [bold]{len(execution_group.executions)}[/]",
+         title=f"Execution Details: [bold]{execution_group.id}[/]",
+     )
+     return panel
+
+
+ def _datasets_panel(result: Execution) -> Panel:
+     datasets = result.datasets
+
+     datasets_df = pd.DataFrame(
+         [
+             {"id": dataset.id, "slug": dataset.slug, "dataset_type": dataset.dataset_type}
+             for dataset in datasets
+         ]
+     )
+
+     return Panel(
+         df_to_table(datasets_df),
+         title=f"Datasets hash: {result.dataset_hash}",
+     )
+
+
+ def _results_directory_panel(result_directory: pathlib.Path) -> Panel:
+     if result_directory.exists():
+         tree = Tree(
+             f":open_file_folder: [link file://{result_directory}]{result_directory}",
+             guide_style="bold bright_blue",
+         )
+         walk_directory(result_directory, tree)
+         return Panel(tree, title="File Tree")
+     else:
+         target_directory = f"file://{quote(str(result_directory.parent))}"
+         link_text = escape(str(result_directory))
+
+         return Panel(
+             Group(
+                 Text("Result directory not found.", "bold red"),
+                 # Link to the parent directory otherwise this link will never be resolved
+                 Text.from_markup(f"[bold magenta]:open_file_folder:[link={target_directory}]{link_text}"),
+             ),
+             title="File Tree",
+         )
+
+
+ def _log_panel(result_directory: pathlib.Path) -> Panel | None:
+     log_file = result_directory / EXECUTION_LOG_FILENAME
+
+     if log_file.exists():
+         with open(log_file) as f:
+             log_content = f.read()
+         log_text = Text.from_markup(f"[link file://{log_file}]{log_content}")
+
+         return Panel(
+             log_text,
+             title="Execution Logs",
+         )
+     else:
+         return Panel(
+             Text("Log file not found.", "bold red"),
+             title="Execution Logs",
+         )
+
+
+ @app.command()
+ def inspect(ctx: typer.Context, execution_id: int) -> None:
+     """
+     Inspect a specific execution group by its ID
+     """
+     config: Config = ctx.obj.config
+     session = ctx.obj.database.session
+     execution_group = session.get(ExecutionGroup, execution_id)
+
+     if not execution_group:
+         logger.error(f"Execution not found: {execution_id}")
+         raise typer.Exit(code=1)
+
+     console.print(_execution_panel(execution_group))
+
+     if not execution_group.executions:
+         logger.error(f"No results found for execution: {execution_id}")
+         return
+
+     result: Execution = execution_group.executions[-1]
+     result_directory = config.paths.results / result.output_fragment
+
+     console.print(_datasets_panel(result))
+     console.print(_results_directory_panel(result_directory))
+     console.print(_log_panel(result_directory))
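`walk_directory` depends only on rich, so it can be reused outside the CLI. A small sketch (assuming the package is importable) that renders an arbitrary directory with the same tree styling that `inspect` uses:

    import pathlib

    from rich.console import Console
    from rich.tree import Tree

    from climate_ref.cli.executions import walk_directory

    root = pathlib.Path(".").resolve()
    tree = Tree(f":open_file_folder: [link file://{root}]{root}", guide_style="bold bright_blue")
    walk_directory(root, tree)
    Console().print(tree)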
climate_ref/cli/providers.py
@@ -0,0 +1,84 @@
+ """
+ Manage the REF providers.
+ """
+
+ from typing import Annotated
+
+ import pandas as pd
+ import typer
+ from loguru import logger
+ from rich.console import Console
+
+ from climate_ref.cli._utils import pretty_print_df
+ from climate_ref.provider_registry import ProviderRegistry
+ from climate_ref_core.providers import CondaDiagnosticProvider, DiagnosticProvider
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command(name="list")
+ def list_(ctx: typer.Context) -> None:
+     """
+     Print the available providers.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with db.session.begin():
+         provider_registry = ProviderRegistry.build_from_config(config, db)
+
+     def get_env(provider: DiagnosticProvider) -> str:
+         env = ""
+         if isinstance(provider, CondaDiagnosticProvider):
+             env = f"{provider.env_path}"
+             if not provider.env_path.exists():
+                 env += " (not installed)"
+         return env
+
+     results_df = pd.DataFrame(
+         [
+             {
+                 "provider": provider.slug,
+                 "version": provider.version,
+                 "conda environment": get_env(provider),
+             }
+             for provider in provider_registry.providers
+         ]
+     )
+     pretty_print_df(results_df, console=console)
+
+
+ @app.command()
+ def create_env(
+     ctx: typer.Context,
+     provider: Annotated[
+         str | None,
+         typer.Option(help="Only install the environment for the named provider."),
+     ] = None,
+ ) -> None:
+     """
+     Create a virtual environment containing the provider software.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with db.session.begin():
+         providers = ProviderRegistry.build_from_config(config, db).providers
+
+     if provider is not None:
+         available = ", ".join([f'"{p.slug}"' for p in providers])
+         providers = [p for p in providers if p.slug == provider]
+         if not providers:
+             msg = f'Provider "{provider}" not available. Choose from: {available}'
+             logger.error(msg)
+             raise typer.Exit(code=1)
+
+     for provider_ in providers:
+         txt = f"virtual environment for provider {provider_.slug}"
+         if isinstance(provider_, CondaDiagnosticProvider):
+             logger.info(f"Creating {txt} in {provider_.env_path}")
+             provider_.create_env()
+             logger.info(f"Finished creating {txt}")
+         else:
+             logger.info(f"Skipping creating {txt} because it does not use virtual environments.")
+
+     list_(ctx)
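Both commands share one pattern: build a `ProviderRegistry` from the configuration inside a session transaction, then iterate over `registry.providers`. A sketch of the same pattern outside Typer (`Config.default()` and `Database.from_config()` are hypothetical constructor names used for illustration; the real entry points live in climate_ref/config.py and climate_ref/database.py):

    from climate_ref.config import Config
    from climate_ref.database import Database
    from climate_ref.provider_registry import ProviderRegistry

    config = Config.default()  # hypothetical constructor; see climate_ref/config.py
    db = Database.from_config(config)  # hypothetical constructor; see climate_ref/database.py

    # Same transaction-scoped registry construction used by the CLI commands
    with db.session.begin():
        registry = ProviderRegistry.build_from_config(config, db)

    for provider in registry.providers:
        print(provider.slug, provider.version)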
climate_ref/cli/solve.py
@@ -0,0 +1,23 @@
+ import typer
+
+ from climate_ref.solver import solve_required_executions
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def solve(
+     ctx: typer.Context,
+     dry_run: bool = typer.Option(False, help="Do not execute any diagnostics"),
+     timeout: int = typer.Option(60, help="Timeout in seconds for the solve operation"),
+ ) -> None:
+     """
+     Solve for executions that require recalculation
+
+     This may trigger a number of additional calculations depending on what data has been ingested
+     since the last solve.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with ctx.obj.database.session.begin():
+         solve_required_executions(config=config, db=db, dry_run=dry_run, timeout=timeout)
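This module only defines a bare `app`; the sub-apps in this package are assembled into the top-level CLI elsewhere (see climate_ref/cli/__init__.py in the file list above). A sketch of the usual Typer wiring, with illustrative names rather than the package's actual assembly:

    import typer

    from climate_ref.cli import datasets, executions, providers, solve

    cli = typer.Typer()
    cli.add_typer(datasets.app, name="datasets")
    cli.add_typer(executions.app, name="executions")
    cli.add_typer(providers.app, name="providers")
    # solve defines a single command, so register the function directly
    cli.command(name="solve")(solve.solve)

    if __name__ == "__main__":
        cli()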