climate_ref-0.5.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (44)
  1. climate_ref/__init__.py +30 -0
  2. climate_ref/_config_helpers.py +214 -0
  3. climate_ref/alembic.ini +114 -0
  4. climate_ref/cli/__init__.py +138 -0
  5. climate_ref/cli/_utils.py +68 -0
  6. climate_ref/cli/config.py +28 -0
  7. climate_ref/cli/datasets.py +205 -0
  8. climate_ref/cli/executions.py +201 -0
  9. climate_ref/cli/providers.py +84 -0
  10. climate_ref/cli/solve.py +23 -0
  11. climate_ref/config.py +475 -0
  12. climate_ref/constants.py +8 -0
  13. climate_ref/database.py +223 -0
  14. climate_ref/dataset_registry/obs4ref_reference.txt +2 -0
  15. climate_ref/dataset_registry/sample_data.txt +60 -0
  16. climate_ref/datasets/__init__.py +40 -0
  17. climate_ref/datasets/base.py +214 -0
  18. climate_ref/datasets/cmip6.py +202 -0
  19. climate_ref/datasets/obs4mips.py +224 -0
  20. climate_ref/datasets/pmp_climatology.py +15 -0
  21. climate_ref/datasets/utils.py +16 -0
  22. climate_ref/executor/__init__.py +274 -0
  23. climate_ref/executor/local.py +89 -0
  24. climate_ref/migrations/README +22 -0
  25. climate_ref/migrations/env.py +139 -0
  26. climate_ref/migrations/script.py.mako +26 -0
  27. climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +292 -0
  28. climate_ref/models/__init__.py +33 -0
  29. climate_ref/models/base.py +42 -0
  30. climate_ref/models/dataset.py +206 -0
  31. climate_ref/models/diagnostic.py +61 -0
  32. climate_ref/models/execution.py +306 -0
  33. climate_ref/models/metric_value.py +195 -0
  34. climate_ref/models/provider.py +39 -0
  35. climate_ref/provider_registry.py +146 -0
  36. climate_ref/py.typed +0 -0
  37. climate_ref/solver.py +395 -0
  38. climate_ref/testing.py +109 -0
  39. climate_ref-0.5.0.dist-info/METADATA +97 -0
  40. climate_ref-0.5.0.dist-info/RECORD +44 -0
  41. climate_ref-0.5.0.dist-info/WHEEL +4 -0
  42. climate_ref-0.5.0.dist-info/entry_points.txt +2 -0
  43. climate_ref-0.5.0.dist-info/licenses/LICENCE +201 -0
  44. climate_ref-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref/cli/datasets.py
@@ -0,0 +1,205 @@
+ """
+ View and ingest input datasets
+ """
+
+ import errno
+ import os
+ import shutil
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+ from loguru import logger
+ from rich.console import Console
+
+ from climate_ref.cli._utils import pretty_print_df
+ from climate_ref.datasets import get_dataset_adapter
+ from climate_ref.models import Dataset
+ from climate_ref.provider_registry import ProviderRegistry
+ from climate_ref.solver import solve_required_executions
+ from climate_ref.testing import fetch_sample_data
+ from climate_ref_core.dataset_registry import dataset_registry_manager, fetch_all_files
+ from climate_ref_core.datasets import SourceDatasetType
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command(name="list")
+ def list_(
+     ctx: typer.Context,
+     source_type: Annotated[
+         SourceDatasetType, typer.Option(help="Type of source dataset")
+     ] = SourceDatasetType.CMIP6.value,  # type: ignore
+     column: Annotated[list[str] | None, typer.Option()] = None,
+     include_files: bool = typer.Option(False, help="Include files in the output"),
+     limit: int = typer.Option(100, help="Limit the number of rows to display"),
+ ) -> None:
+     """
+     List the datasets that have been ingested
+
+     The data catalog is sorted by the date that the dataset was ingested (first = newest).
+     """
+     database = ctx.obj.database
+
+     adapter = get_dataset_adapter(source_type.value)
+     data_catalog = adapter.load_catalog(database, include_files=include_files, limit=limit)
+
+     if column:
+         missing = set(column) - set(data_catalog.columns)
+         if missing:
+
+             def format_(columns: Iterable[str]) -> str:
+                 return ", ".join(f"'{c}'" for c in sorted(columns))
+
+             logger.error(
+                 f"Column{'s' if len(missing) > 1 else ''} "
+                 f"{format_(missing)} not found in data catalog. "
+                 f"Choose from: {format_(data_catalog.columns)}"
+             )
+             raise typer.Exit(code=1)
+         data_catalog = data_catalog[column].sort_values(by=column)
+
+     pretty_print_df(data_catalog, console=console)
+
+
+ @app.command()
+ def list_columns(
+     ctx: typer.Context,
+     source_type: Annotated[
+         SourceDatasetType, typer.Option(help="Type of source dataset")
+     ] = SourceDatasetType.CMIP6.value,  # type: ignore
+     include_files: bool = typer.Option(False, help="Include files in the output"),
+ ) -> None:
+     """
+     List the columns available in the data catalog
+
+     The available columns depend on the source dataset type,
+     and file-level columns are included when --include-files is set.
+     """
+     database = ctx.obj.database
+
+     adapter = get_dataset_adapter(source_type.value)
+     data_catalog = adapter.load_catalog(database, include_files=include_files)
+
+     for column in sorted(data_catalog.columns.to_list()):
+         print(column)
+
+
+ @app.command()
+ def ingest(  # noqa: PLR0913
+     ctx: typer.Context,
+     file_or_directory: Path,
+     source_type: Annotated[SourceDatasetType, typer.Option(help="Type of source dataset")],
+     solve: Annotated[bool, typer.Option(help="Solve for new diagnostic executions after ingestion")] = False,
+     dry_run: Annotated[bool, typer.Option(help="Do not ingest datasets into the database")] = False,
+     n_jobs: Annotated[int | None, typer.Option(help="Number of jobs to run in parallel")] = None,
+     skip_invalid: Annotated[
+         bool, typer.Option(help="Ignore (but log) any datasets that don't pass validation")
+     ] = False,
+ ) -> None:
+     """
+     Ingest a dataset
+
+     This will register a dataset in the database to be used for diagnostics calculations.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+
+     file_or_directory = Path(file_or_directory).expanduser()
+     logger.info(f"ingesting {file_or_directory}")
+
+     kwargs = {}
+
+     if n_jobs is not None:
+         kwargs["n_jobs"] = n_jobs
+
+     adapter = get_dataset_adapter(source_type.value, **kwargs)
+
+     # Create a data catalog from the specified file or directory
+     if not file_or_directory.exists():
+         logger.error(f"File or directory {file_or_directory} does not exist")
+         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_or_directory)
+
+     data_catalog = adapter.find_local_datasets(file_or_directory)
+     data_catalog = adapter.validate_data_catalog(data_catalog, skip_invalid=skip_invalid)
+
+     logger.info(
+         f"Found {len(data_catalog)} files for {len(data_catalog[adapter.slug_column].unique())} datasets"
+     )
+     pretty_print_df(adapter.pretty_subset(data_catalog), console=console)
+
+     for instance_id, data_catalog_dataset in data_catalog.groupby(adapter.slug_column):
+         logger.info(f"Processing dataset {instance_id}")
+
+         if dry_run:
+             dataset = db.session.query(Dataset).filter_by(slug=instance_id, dataset_type=source_type).first()
+             if not dataset:
+                 logger.info(f"Would save dataset {instance_id} to the database")
+                 continue
+         else:
+             with db.session.begin():
+                 adapter.register_dataset(config, db, data_catalog_dataset)
+
+     if solve:
+         solve_required_executions(
+             config=config,
+             db=db,
+             dry_run=dry_run,
+         )
+
+
+ @app.command(name="fetch-sample-data")
+ def _fetch_sample_data(
+     force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
+     symlink: Annotated[
+         bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
+     ] = False,
+ ) -> None:
+     """
+     Fetch the sample data for the given version.
+
+     These data will be written into the test data directory.
+     This operation may fail if the test data directory does not exist,
+     as is the case for non-source-based installations.
+     """
+     fetch_sample_data(force_cleanup=force_cleanup, symlink=symlink)
+
+
+ @app.command(name="fetch-data")
+ def fetch_data(
+     ctx: typer.Context,
+     registry: Annotated[str, typer.Option(help="Name of the data registry to use")],
+     output_directory: Annotated[
+         Path | None, typer.Option(help="Output directory where files will be saved")
+     ] = None,
+     force_cleanup: Annotated[bool, typer.Option(help="If True, remove any existing files")] = False,
+     symlink: Annotated[
+         bool, typer.Option(help="If True, symlink files into the output directory, otherwise perform a copy")
+     ] = False,
+ ) -> None:
+     """
+     Fetch REF-specific datasets
+
+     These datasets have been verified to have open licenses
+     and are in the process of being added to Obs4MIPs.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+
+     # Setup the provider registry to register any dataset registries in the configured providers
+     ProviderRegistry.build_from_config(config, db)
+
+     if output_directory and force_cleanup and output_directory.exists():
+         logger.warning(f"Removing existing directory {output_directory}")
+         shutil.rmtree(output_directory)
+
+     try:
+         _registry = dataset_registry_manager[registry]
+     except KeyError:
+         logger.error(f"Registry {registry} not found")
+         logger.error(f"Available registries: {', '.join(dataset_registry_manager.keys())}")
+         raise typer.Exit(code=1)
+
+     fetch_all_files(_registry, registry, output_directory, symlink=symlink)
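These are plain Typer commands, so they can be exercised without the installed console script. A minimal sketch using Typer's test runner (assumptions: the package is importable, and `fetch-sample-data` is chosen here because, unlike the other commands, it does not need the database context carried on `ctx.obj`):

    from typer.testing import CliRunner

    from climate_ref.cli.datasets import app

    runner = CliRunner()
    # Symlink, rather than copy, the sample files into the test data directory
    result = runner.invoke(app, ["fetch-sample-data", "--symlink"])
    print(result.exit_code, result.output)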
climate_ref/cli/executions.py
@@ -0,0 +1,201 @@
+ """
+ View diagnostic executions
+ """
+
+ import pathlib
+ from typing import Annotated
+ from urllib.parse import quote
+
+ import pandas as pd
+ import typer
+ from loguru import logger
+ from rich.console import Console, Group
+ from rich.filesize import decimal
+ from rich.markup import escape
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich.tree import Tree
+
+ from climate_ref.cli._utils import df_to_table, pretty_print_df
+ from climate_ref.config import Config
+ from climate_ref.models import Execution, ExecutionGroup
+ from climate_ref.models.execution import get_execution_group_and_latest
+ from climate_ref_core.executor import EXECUTION_LOG_FILENAME
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command()
+ def list_groups(
+     ctx: typer.Context,
+     column: Annotated[list[str] | None, typer.Option()] = None,
+     limit: int = typer.Option(100, help="Limit the number of rows to display"),
+ ) -> None:
+     """
+     List the diagnostic execution groups that have been identified
+     """
+     session = ctx.obj.database.session
+
+     execution_groups_results = get_execution_group_and_latest(session).limit(limit).all()
+     execution_count = session.query(ExecutionGroup).count()
+
+     results_df = pd.DataFrame(
+         [
+             {
+                 "id": execution_groups.id,
+                 "key": execution_groups.key,
+                 "provider": execution_groups.diagnostic.provider.slug,
+                 "diagnostic": execution_groups.diagnostic.slug,
+                 "dirty": execution_groups.dirty,
+                 "successful": result.successful if result else None,
+                 "created_at": execution_groups.created_at,
+                 "updated_at": execution_groups.updated_at,
+             }
+             for execution_groups, result in execution_groups_results
+         ]
+     )
+
+     if column:
+         if not all(col in results_df.columns for col in column):
+             logger.error(f"Column not found in data catalog: {column}")
+             raise typer.Exit(code=1)
+         results_df = results_df[column]
+
+     pretty_print_df(results_df, console=console)
+     if execution_count > limit:
+         logger.warning(
+             f"Displaying {limit} of {execution_count} results. Use the `--limit` option to display more."
+         )
+
+
+ def walk_directory(directory: pathlib.Path, tree: Tree) -> None:
+     """Recursively build a Tree with directory contents."""
+     # Sort dirs first then by filename
+     paths = sorted(
+         pathlib.Path(directory).iterdir(),
+         key=lambda path: (path.is_file(), path.name.lower()),
+     )
+     for path in paths:
+         # Remove hidden files
+         if path.name.startswith("."):
+             continue
+         if path.is_dir():
+             style = "dim" if path.name.startswith("__") else ""
+             branch = tree.add(
+                 f"[bold magenta]:open_file_folder: [link file://{path}]{escape(path.name)}",
+                 style=style,
+                 guide_style=style,
+             )
+             walk_directory(path, branch)
+         else:
+             text_filename = Text(path.name, "green")
+             text_filename.highlight_regex(r"\..*$", "bold red")
+             text_filename.stylize(f"link file://{path}")
+             file_size = path.stat().st_size
+             text_filename.append(f" ({decimal(file_size)})", "blue")
+             tree.add(text_filename)
+
+
+ def _execution_panel(execution_group: ExecutionGroup) -> Panel:
+     if len(execution_group.executions) == 0:
+         result = None
+     else:
+         result = execution_group.executions[-1]
+
+     panel = Panel(
+         f"Key: [bold]{execution_group.key}[/]\n"
+         f"Diagnostic: [bold]{execution_group.diagnostic.slug}[/]\n"
+         f"Provider: [bold]{execution_group.diagnostic.provider.slug}[/]\n"
+         f"Dirty: [bold]{execution_group.dirty}[/]\n"
+         f"Successful: [bold]{result.successful if result else 'not-started'}[/]\n"
+         f"Created At: [bold]{execution_group.created_at}[/]\n"
+         f"Updated At: [bold]{execution_group.updated_at}[/]\n"
+         f"Number of attempted executions: [bold]{len(execution_group.executions)}[/]",
+         title=f"Execution Details: [bold]{execution_group.id}[/]",
+     )
+     return panel
+
+
+ def _datasets_panel(result: Execution) -> Panel:
+     datasets = result.datasets
+
+     datasets_df = pd.DataFrame(
+         [
+             {"id": dataset.id, "slug": dataset.slug, "dataset_type": dataset.dataset_type}
+             for dataset in datasets
+         ]
+     )
+
+     return Panel(
+         df_to_table(datasets_df),
+         title=f"Datasets hash: {result.dataset_hash}",
+     )
+
+
+ def _results_directory_panel(result_directory: pathlib.Path) -> Panel:
+     if result_directory.exists():
+         tree = Tree(
+             f":open_file_folder: [link file://{result_directory}]{result_directory}",
+             guide_style="bold bright_blue",
+         )
+         walk_directory(result_directory, tree)
+         return Panel(tree, title="File Tree")
+     else:
+         target_directory = f"file://{quote(str(result_directory.parent))}"
+         link_text = escape(str(result_directory))
+
+         return Panel(
+             Group(
+                 Text("Result directory not found.", "bold red"),
+                 # Link to the parent directory otherwise this link will never be resolved
+                 Text.from_markup(f"[bold magenta]:open_file_folder:[link={target_directory}]{link_text}"),
+             ),
+             title="File Tree",
+         )
+
+
+ def _log_panel(result_directory: pathlib.Path) -> Panel | None:
+     log_file = result_directory / EXECUTION_LOG_FILENAME
+
+     if log_file.exists():
+         with open(log_file) as f:
+             log_content = f.read()
+         log_text = Text.from_markup(f"[link file://{log_file}]{log_content}")
+
+         return Panel(
+             log_text,
+             title="Execution Logs",
+         )
+     else:
+         return Panel(
+             Text("Log file not found.", "bold red"),
+             title="Execution Logs",
+         )
+
+
+ @app.command()
+ def inspect(ctx: typer.Context, execution_id: int) -> None:
+     """
+     Inspect a specific execution group by its ID
+     """
+     config: Config = ctx.obj.config
+     session = ctx.obj.database.session
+     execution_group = session.get(ExecutionGroup, execution_id)
+
+     if not execution_group:
+         logger.error(f"Execution not found: {execution_id}")
+         raise typer.Exit(code=1)
+
+     console.print(_execution_panel(execution_group))
+
+     if not execution_group.executions:
+         logger.error(f"No results found for execution: {execution_id}")
+         return
+
+     result: Execution = execution_group.executions[-1]
+     result_directory = config.paths.results / result.output_fragment
+
+     console.print(_datasets_panel(result))
+     console.print(_results_directory_panel(result_directory))
+     console.print(_log_panel(result_directory))
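`walk_directory` depends only on rich, so it can be reused outside the CLI. A small sketch (assuming the package is importable) that renders an arbitrary directory with the same tree styling that `inspect` uses:

    import pathlib

    from rich.console import Console
    from rich.tree import Tree

    from climate_ref.cli.executions import walk_directory

    root = pathlib.Path(".").resolve()
    tree = Tree(f":open_file_folder: [link file://{root}]{root}", guide_style="bold bright_blue")
    walk_directory(root, tree)
    Console().print(tree)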
climate_ref/cli/providers.py
@@ -0,0 +1,84 @@
+ """
+ Manage the REF providers.
+ """
+
+ from typing import Annotated
+
+ import pandas as pd
+ import typer
+ from loguru import logger
+ from rich.console import Console
+
+ from climate_ref.cli._utils import pretty_print_df
+ from climate_ref.provider_registry import ProviderRegistry
+ from climate_ref_core.providers import CondaDiagnosticProvider, DiagnosticProvider
+
+ app = typer.Typer(help=__doc__)
+ console = Console()
+
+
+ @app.command(name="list")
+ def list_(ctx: typer.Context) -> None:
+     """
+     Print the available providers.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with db.session.begin():
+         provider_registry = ProviderRegistry.build_from_config(config, db)
+
+     def get_env(provider: DiagnosticProvider) -> str:
+         env = ""
+         if isinstance(provider, CondaDiagnosticProvider):
+             env = f"{provider.env_path}"
+             if not provider.env_path.exists():
+                 env += " (not installed)"
+         return env
+
+     results_df = pd.DataFrame(
+         [
+             {
+                 "provider": provider.slug,
+                 "version": provider.version,
+                 "conda environment": get_env(provider),
+             }
+             for provider in provider_registry.providers
+         ]
+     )
+     pretty_print_df(results_df, console=console)
+
+
+ @app.command()
+ def create_env(
+     ctx: typer.Context,
+     provider: Annotated[
+         str | None,
+         typer.Option(help="Only install the environment for the named provider."),
+     ] = None,
+ ) -> None:
+     """
+     Create a virtual environment containing the provider software.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with db.session.begin():
+         providers = ProviderRegistry.build_from_config(config, db).providers
+
+     if provider is not None:
+         available = ", ".join([f'"{p.slug}"' for p in providers])
+         providers = [p for p in providers if p.slug == provider]
+         if not providers:
+             msg = f'Provider "{provider}" not available. Choose from: {available}'
+             logger.error(msg)
+             raise typer.Exit(code=1)
+
+     for provider_ in providers:
+         txt = f"virtual environment for provider {provider_.slug}"
+         if isinstance(provider_, CondaDiagnosticProvider):
+             logger.info(f"Creating {txt} in {provider_.env_path}")
+             provider_.create_env()
+             logger.info(f"Finished creating {txt}")
+         else:
+             logger.info(f"Skipping creating {txt} because it does not use virtual environments.")
+
+     list_(ctx)
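Both commands share one pattern: build a `ProviderRegistry` from the configuration inside a session transaction, then iterate over `registry.providers`. A sketch of the same pattern outside Typer (`Config.default()` and `Database.from_config()` are hypothetical constructor names used for illustration; the real entry points live in climate_ref/config.py and climate_ref/database.py):

    from climate_ref.config import Config
    from climate_ref.database import Database
    from climate_ref.provider_registry import ProviderRegistry

    config = Config.default()  # hypothetical constructor; see climate_ref/config.py
    db = Database.from_config(config)  # hypothetical constructor; see climate_ref/database.py

    # Same transaction-scoped registry construction used by the CLI commands
    with db.session.begin():
        registry = ProviderRegistry.build_from_config(config, db)

    for provider in registry.providers:
        print(provider.slug, provider.version)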
climate_ref/cli/solve.py
@@ -0,0 +1,23 @@
+ import typer
+
+ from climate_ref.solver import solve_required_executions
+
+ app = typer.Typer()
+
+
+ @app.command()
+ def solve(
+     ctx: typer.Context,
+     dry_run: bool = typer.Option(False, help="Do not execute any diagnostics"),
+     timeout: int = typer.Option(60, help="Timeout in seconds for the solve operation"),
+ ) -> None:
+     """
+     Solve for executions that require recalculation
+
+     This may trigger a number of additional calculations depending on what data has been ingested
+     since the last solve.
+     """
+     config = ctx.obj.config
+     db = ctx.obj.database
+     with ctx.obj.database.session.begin():
+         solve_required_executions(config=config, db=db, dry_run=dry_run, timeout=timeout)
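This module only defines a bare `app`; the sub-apps in this package are assembled into the top-level CLI elsewhere (see climate_ref/cli/__init__.py in the file list above). A sketch of the usual Typer wiring, with illustrative names rather than the package's actual assembly:

    import typer

    from climate_ref.cli import datasets, executions, providers, solve

    cli = typer.Typer()
    cli.add_typer(datasets.app, name="datasets")
    cli.add_typer(executions.app, name="executions")
    cli.add_typer(providers.app, name="providers")
    # solve defines a single command, so register the function directly
    cli.command(name="solve")(solve.solve)

    if __name__ == "__main__":
        cli()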