mlcast-dataset-validator 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

Files changed (28)
  1. mlcast_dataset_validator/checks/coords/names.py +1 -2
  2. mlcast_dataset_validator/checks/coords/spatial.py +1 -2
  3. mlcast_dataset_validator/checks/coords/temporal.py +1 -2
  4. mlcast_dataset_validator/checks/data_vars/chunking.py +1 -2
  5. mlcast_dataset_validator/checks/data_vars/compression.py +1 -2
  6. mlcast_dataset_validator/checks/data_vars/data_structure.py +1 -2
  7. mlcast_dataset_validator/checks/data_vars/data_variable.py +1 -2
  8. mlcast_dataset_validator/checks/data_vars/georeferencing.py +1 -2
  9. mlcast_dataset_validator/checks/data_vars/naming.py +1 -2
  10. mlcast_dataset_validator/checks/global_attributes/conditional.py +1 -2
  11. mlcast_dataset_validator/checks/global_attributes/licensing.py +1 -2
  12. mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py +1 -1
  13. mlcast_dataset_validator/checks/global_attributes/zarr_format.py +4 -2
  14. mlcast_dataset_validator/checks/tool_compatibility/cartopy.py +1 -2
  15. mlcast_dataset_validator/checks/tool_compatibility/gdal.py +1 -2
  16. mlcast_dataset_validator/specs/cli.py +35 -4
  17. mlcast_dataset_validator/specs/{base.py → reporting.py} +73 -3
  18. mlcast_dataset_validator/specs/source_data/radar_precipitation.py +48 -47
  19. mlcast_dataset_validator/utils/logging_decorator.py +3 -17
  20. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/METADATA +3 -1
  21. mlcast_dataset_validator-0.2.0.dist-info/RECORD +36 -0
  22. mlcast_dataset_validator-0.1.0.dist-info/RECORD +0 -36
  23. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/WHEEL +0 -0
  24. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/entry_points.txt +0 -0
  25. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/licenses/LICENSE +0 -0
  26. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/licenses/LICENSE-APACHE +0 -0
  27. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/licenses/LICENSE-BSD +0 -0
  28. {mlcast_dataset_validator-0.1.0.dist-info → mlcast_dataset_validator-0.2.0.dist-info}/top_level.txt +0 -0
--- a/mlcast_dataset_validator/checks/coords/names.py
+++ b/mlcast_dataset_validator/checks/coords/names.py
@@ -2,8 +2,7 @@ from typing import Dict, List, Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.1"
--- a/mlcast_dataset_validator/checks/coords/spatial.py
+++ b/mlcast_dataset_validator/checks/coords/spatial.py
@@ -1,7 +1,6 @@
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.2"
--- a/mlcast_dataset_validator/checks/coords/temporal.py
+++ b/mlcast_dataset_validator/checks/coords/temporal.py
@@ -5,8 +5,7 @@ import numpy as np
 import pandas as pd
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.3"
--- a/mlcast_dataset_validator/checks/data_vars/chunking.py
+++ b/mlcast_dataset_validator/checks/data_vars/chunking.py
@@ -1,7 +1,6 @@
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from ..data_vars_filter import iter_data_vars
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/checks/data_vars/compression.py
+++ b/mlcast_dataset_validator/checks/data_vars/compression.py
@@ -2,8 +2,7 @@ from typing import Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.2"
--- a/mlcast_dataset_validator/checks/data_vars/data_structure.py
+++ b/mlcast_dataset_validator/checks/data_vars/data_structure.py
@@ -2,8 +2,7 @@ from typing import Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.3"
--- a/mlcast_dataset_validator/checks/data_vars/data_variable.py
+++ b/mlcast_dataset_validator/checks/data_vars/data_variable.py
@@ -2,8 +2,7 @@ from typing import Dict, Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.4"
--- a/mlcast_dataset_validator/checks/data_vars/georeferencing.py
+++ b/mlcast_dataset_validator/checks/data_vars/georeferencing.py
@@ -2,8 +2,7 @@ from typing import Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.5"
--- a/mlcast_dataset_validator/checks/data_vars/naming.py
+++ b/mlcast_dataset_validator/checks/data_vars/naming.py
@@ -4,8 +4,7 @@ from typing import Dict, Iterable
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from ..data_vars_filter import iter_data_vars
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/checks/global_attributes/conditional.py
+++ b/mlcast_dataset_validator/checks/global_attributes/conditional.py
@@ -2,8 +2,7 @@ from typing import Sequence
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from ..coords.temporal import analyze_dataset_timesteps
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/checks/global_attributes/licensing.py
+++ b/mlcast_dataset_validator/checks/global_attributes/licensing.py
@@ -4,8 +4,7 @@ from typing import Iterable
 import xarray as xr
 from license_expression import ExpressionError, get_spdx_licensing
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.2"
--- a/mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py
+++ b/mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py
@@ -12,7 +12,7 @@ from packaging.version import parse as parse_version
 from parse import compile as parse_compile
 from requests import RequestException
 
-from ...specs.base import ValidationReport
+from ...specs.reporting import ValidationReport
 from ...utils.logging_decorator import log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/checks/global_attributes/zarr_format.py
+++ b/mlcast_dataset_validator/checks/global_attributes/zarr_format.py
@@ -3,8 +3,7 @@ from typing import Sequence
 import fsspec
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from . import SECTION_ID as PARENT_SECTION_ID
 
 SECTION_ID = f"{PARENT_SECTION_ID}.3"
@@ -53,6 +52,9 @@ def check_zarr_format(
     """Check Zarr format requirements."""
     report = ValidationReport()
 
+    if storage_options is None:
+        storage_options = ds.encoding.get("storage_options")
+
     zarr_format = getattr(ds, "zarr_format", 2)  # Default to Zarr v2
     if zarr_format in allowed_versions:
         report.add(
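The new fallback lets callers omit storage_options: when the CLI stashes them on the dataset's encoding (see specs/cli.py below), the check recovers them on its own. A minimal sketch of that handshake, with a hypothetical remote store path:

import xarray as xr

from mlcast_dataset_validator.checks.global_attributes.zarr_format import check_zarr_format

# Hypothetical remote store; fsspec-style credentials.
storage_options = {"anon": True}
ds = xr.open_zarr("s3://example-bucket/radar.zarr", storage_options=storage_options)
ds.encoding.setdefault("storage_options", storage_options)  # what the 0.2.0 CLI does

# No storage_options argument needed: the check falls back to ds.encoding.
report = check_zarr_format(ds, allowed_versions=[2, 3], require_consolidated_if_v2=True)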
--- a/mlcast_dataset_validator/checks/tool_compatibility/cartopy.py
+++ b/mlcast_dataset_validator/checks/tool_compatibility/cartopy.py
@@ -10,8 +10,7 @@ from typing import Optional
 import numpy as np
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from ..data_vars_filter import select_data_var
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/checks/tool_compatibility/gdal.py
+++ b/mlcast_dataset_validator/checks/tool_compatibility/gdal.py
@@ -11,8 +11,7 @@ from typing import Dict, Optional
 
 import xarray as xr
 
-from ...specs.base import ValidationReport
-from ...utils.logging_decorator import log_function_call
+from ...specs.reporting import ValidationReport, log_function_call
 from ..data_vars_filter import select_data_var
 from . import SECTION_ID as PARENT_SECTION_ID
 
--- a/mlcast_dataset_validator/specs/cli.py
+++ b/mlcast_dataset_validator/specs/cli.py
@@ -8,9 +8,11 @@ import pkgutil
 import sys
 from typing import Dict, List, Sequence
 
+import xarray as xr
 from loguru import logger
 
 from .. import __version__
+from .reporting import skip_all_checks
 
 SPEC_PACKAGE = "mlcast_dataset_validator.specs"
 
@@ -96,6 +98,14 @@ def build_parser(catalog: Dict[str, List[str]]) -> argparse.ArgumentParser:
         action="store_true",
         help="List available data_stage/product combinations and exit.",
     )
+    parser.add_argument(
+        "--print-spec-markdown",
+        action="store_true",
+        help=(
+            "Print the selected specification as Markdown without running validation checks. "
+            "Requires data_stage and product, dataset_path is optional."
+        ),
+    )
     parser.epilog = "Available combinations:\n" + _format_catalog(catalog)
     return parser
 
@@ -134,7 +144,12 @@ def main(argv: Sequence[str] | None = None) -> int:
             print(f" - {product}")
         return 0
 
-    if not (args.data_stage and args.product and args.dataset_path):
+    if args.print_spec_markdown:
+        if not (args.data_stage and args.product):
+            parser.error(
+                "--print-spec-markdown requires a data_stage and product to select the specification."
+            )
+    elif not (args.data_stage and args.product and args.dataset_path):
         parser.print_help()
         return 1
 
@@ -164,9 +179,25 @@ def main(argv: Sequence[str] | None = None) -> int:
         f"{__version__})"
     )
 
-    report = module.validate_dataset(
-        args.dataset_path, storage_options=storage_options or None
-    )
+    if args.print_spec_markdown:
+        with skip_all_checks():
+            _, spec_text = module.validate_dataset(None)
+        if not spec_text:
+            raise SystemExit(
+                "Specification text could not be retrieved from validate_dataset()."
+            )
+        print(spec_text)
+        return 0
+    else:
+        ds = xr.open_zarr(args.dataset_path, storage_options=storage_options or None)
+        if storage_options:
+            ds.encoding.setdefault("storage_options", storage_options)
+        logger.info(
+            f"Opened dataset at {args.dataset_path}"
+            + (f" with storage options {storage_options}" if storage_options else "")
+        )
+
+        report, _ = module.validate_dataset(ds=ds)
     report.console_print()
 
     return 1 if report.has_fails() else 0
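Since main() accepts an argv sequence, the new flag can be exercised without the console script; the positional order below (data_stage, product, dataset_path) is assumed from the args handling above:

from mlcast_dataset_validator.specs.cli import main

# Print the radar_precipitation spec as Markdown; no dataset required.
exit_code = main(["source_data", "radar_precipitation", "--print-spec-markdown"])

# Regular validation still needs a dataset path (hypothetical local store).
exit_code = main(["source_data", "radar_precipitation", "data/radar.zarr"])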
--- a/mlcast_dataset_validator/specs/base.py
+++ b/mlcast_dataset_validator/specs/reporting.py
@@ -1,11 +1,43 @@
 from __future__ import annotations
 
+from contextlib import ExitStack, contextmanager
 from dataclasses import dataclass, field
-from typing import List, Optional
+from functools import wraps
+from typing import Callable, Dict, List, Optional, Tuple
+from unittest import mock
 
+from loguru import logger
 from rich.console import Console
 from rich.table import Table
 
+# -------------------------
+# Logging decorator and registry
+# -------------------------
+CHECK_REGISTRY: Dict[Tuple[str, str], Callable] = {}
+
+
+def log_function_call(func):
+    """Decorator to log function calls with their arguments.
+
+    The original callable is stored in ``CHECK_REGISTRY`` so that it can be
+    monkey patched (e.g., when printing specs without running validations).
+    """
+
+    if (func.__module__, func.__name__) not in CHECK_REGISTRY:
+        CHECK_REGISTRY[(func.__module__, func.__name__)] = func
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        logger.info(f"Applying {func.__name__} with {kwargs}")
+        func_from_registry = CHECK_REGISTRY[(func.__module__, func.__name__)]
+        report = func_from_registry(*args, **kwargs)
+        for result in report.results:
+            result.module = func.__module__
+            result.function = func.__name__
+        return report
+
+    return wrapper
+
 
 # -------------------------
 # Data structures
@@ -89,14 +121,17 @@ class ValidationReport:
         out.results = [*self.results, *other.results]
         return out
 
-    def console_print(self) -> None:
+    def console_print(self, *, file=None) -> None:
         """
         Print all results in the validation report as a table using the rich library.
 
+        Args:
+            file: Optional file-like object to write to (defaults to stdout).
+
         Returns:
             None
         """
-        console = Console()
+        console = Console(file=file)
         table = Table(title="Validation Report")
 
         table.add_column("Section", style="bold")
@@ -144,3 +179,38 @@ class ValidationReport:
             bool: True if there is at least one WARNING result, False otherwise.
         """
         return any(r.status == "WARNING" for r in self.results)
+
+
+@contextmanager
+def skip_all_checks():
+    """Context manager to bypass check functions and dataset loading.
+
+    Assumes check functions are decorated with ``log_function_call`` which
+    records them in ``CHECK_REGISTRY``. Each registered check is monkey patched
+    to a stub returning an empty ``ValidationReport`` and dataset loading is
+    bypassed.
+    """
+
+    def _stubbed_check(*_args, **_kwargs):
+        return ValidationReport()
+
+    with ExitStack() as stack:
+        # Avoid calling into real xarray when rendering specs.
+        try:
+            stack.enter_context(mock.patch("xarray.open_zarr", lambda *a, **kw: None))
+            import xarray as xr
+
+            stack.enter_context(
+                mock.patch.object(xr, "open_zarr", lambda *a, **kw: None)
+            )
+        except ModuleNotFoundError:
+            # xarray not available in this environment; skip patching dataset load.
+            pass
+
+        stack.enter_context(
+            mock.patch.dict(
+                CHECK_REGISTRY,
+                {key: _stubbed_check for key in list(CHECK_REGISTRY.keys())},
+            )
+        )
+        yield
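The registry is what makes the stubbing work: the wrapper resolves the callable through CHECK_REGISTRY on every call rather than closing over the original function, so patching the registry swaps the body under every already-decorated check. A small sketch with a hypothetical check:

from mlcast_dataset_validator.specs.reporting import (
    CHECK_REGISTRY,
    ValidationReport,
    log_function_call,
    skip_all_checks,
)


@log_function_call
def check_example(ds):  # hypothetical check, for illustration only
    report = ValidationReport()
    # ... a real check would inspect `ds` and add results here ...
    return report


# Registration happens at decoration time, keyed by (module, function name).
assert (check_example.__module__, "check_example") in CHECK_REGISTRY

with skip_all_checks():
    # Inside the context the registry entry is a stub, so `ds` is never
    # touched and an empty report comes back.
    report = check_example(ds=None)
assert not report.results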
--- a/mlcast_dataset_validator/specs/source_data/radar_precipitation.py
+++ b/mlcast_dataset_validator/specs/source_data/radar_precipitation.py
@@ -7,7 +7,7 @@ against this specification. The specification text is written inline with the
 calls to checking operations that match the specification requirements.
 """
 
-from typing import Optional
+import textwrap
 
 import xarray as xr
 from loguru import logger
@@ -26,7 +26,7 @@ from ...checks.global_attributes.mlcast_metadata import check_mlcast_metadata
 from ...checks.global_attributes.zarr_format import check_zarr_format
 from ...checks.tool_compatibility.cartopy import check_cartopy_compatibility
 from ...checks.tool_compatibility.gdal import check_gdal_compatibility
-from ..base import ValidationReport
+from ..reporting import ValidationReport
 
 VERSION = "0.2.0"
 IDENTIFIER = __spec__.name.split(".")[-1]
@@ -35,12 +35,16 @@ IDENTIFIER = __spec__.name.split(".")[-1]
 # -------------------------
 # Core public API
 # -------------------------
-def validate_dataset(
-    path: str, storage_options: Optional[dict] = None
-) -> ValidationReport:
+def validate_dataset(ds: xr.Dataset) -> ValidationReport:
     """Validate a radar precipitation dataset against the MLCast specification."""
     report = ValidationReport()
-    spec_text = """
+    spec_text = f"""
+    ---
+    data_stage: source_data
+    product: {IDENTIFIER}
+    version: {VERSION}
+    ---
+
     ## 1. Introduction
 
     This document specifies the requirements for 2D radar precipitation and
@@ -59,9 +63,6 @@ def validate_dataset(
     (see inline comments below for rest of specification)
     """
 
-    # Load dataset
-    ds = xr.open_zarr(path, storage_options=storage_options)
-    logger.info(f"Opened dataset at {path}")
     logger.info(str(ds))
 
    spec_text += """
@@ -71,8 +72,8 @@ def validate_dataset(
     spec_text += """
     ### 3.1 Coordinate Variables
 
-    > "The dataset MUST expose CF-compliant coordinates: latitude/longitude and projected x/y."
-    > "Coordinate metadata MUST provide `standard_name`/`axis`/`units` per CF (with a valid `time` coordinate as well)."
+    - The dataset MUST expose CF-compliant coordinates: latitude/longitude and projected x/y.
+    - Coordinate metadata MUST provide `standard_name`/`axis`/`units` per CF (with a valid `time` coordinate as well).
     """
     report += check_coordinate_names(
         ds,
@@ -84,9 +85,9 @@ def validate_dataset(
     spec_text += """
     ### 3.2 Spatial Requirements
 
-    > "The dataset MUST provide 2D radar composites with a spatial resolution of 1 kilometer or finer."
-    > "The valid sensing area MUST support at least one 256×256 pixel square crop that is fully contained within the radar sensing range."
-    > "The spatial domain, including resolution, size, and geographical coverage, MUST remain constant across all times in the archive."
+    - The dataset MUST provide 2D radar composites with a spatial resolution of 1 kilometer or finer.
+    - The valid sensing area MUST support at least one 256×256 pixel square crop that is fully contained within the radar sensing range.
+    - The spatial domain, including resolution, size, and geographical coverage, MUST remain constant across all times in the archive.
     """
     report += check_spatial_requirements(
         ds,
@@ -98,9 +99,9 @@ def validate_dataset(
     spec_text += """
     ### 3.3 Temporal Requirements
 
-    - "The timestep (the duration between successive time values for which a single data-point is valid) MAY be variable throughout the archive, but in that case a global attribute named `consistent_timestep_start` MUST be included to indicate the first timestamp where regular timestepping begins. In the absence of this attribute, the timestep MUST be regular throughout the archive."
-    - "Times for which data is missing MUST be given expicitly in the variable `missing_times` as CF-compliant time values. The timestep is defined as the interval between consecutive times (including missing times). These times MUST NOT be included in the main time coordinate."
-    - "Time values MUST be strictly monotonically increasing."
+    - The timestep (the duration between successive time values for which a single data-point is valid) MAY be variable throughout the archive, but in that case a global attribute named `consistent_timestep_start` MUST be included to indicate the first timestamp where regular timestepping begins. In the absence of this attribute, the timestep MUST be regular throughout the archive.
+    - Times for which data is missing MUST be given expicitly in the variable `missing_times` as CF-compliant time values. The timestep is defined as the interval between consecutive times (including missing times). These times MUST NOT be included in the main time coordinate.
+    - Time values MUST be strictly monotonically increasing.
     """
     report += check_temporal_requirements(
         ds,
@@ -115,7 +116,7 @@ def validate_dataset(
     spec_text += """
     ### 4.1 Chunking Strategy
 
-    > "The dataset MUST use a chunking strategy of 1 × height × width (one chunk per timestep)."
+    - The dataset MUST use a chunking strategy of 1 × height × width (one chunk per timestep).
     """
     report += check_chunking_strategy(
         ds,
@@ -125,9 +126,9 @@ def validate_dataset(
     spec_text += """
     ### 4.2 Compression
 
-    > "The main data arrays MUST use compression to reduce storage requirements."
-    > "ZSTD compression is RECOMMENDED for optimal performance of the main data arrays."
-    > "Coordinate arrays MAY use different compression algorithms (e.g., lz4) as appropriate."
+    - The main data arrays MUST use compression to reduce storage requirements.
+    - ZSTD compression is RECOMMENDED for optimal performance of the main data arrays.
+    - Coordinate arrays MAY use different compression algorithms (e.g., lz4) as appropriate.
     """
     report += check_compression(
         ds,
@@ -139,8 +140,8 @@ def validate_dataset(
     spec_text += """
     ### 4.3 Data Structure
 
-    > "The main data variable MUST be encoded with dimensions in the order: time × height (y, lat) × width (x, lon)."
-    > "The data type MUST be floating-point (float16, float32, or float64)."
+    - The main data variable MUST be encoded with dimensions in the order: time × height (y, lat) × width (x, lon).
+    - The data type MUST be floating-point (float16, float32, or float64).
     """
     report += check_data_structure(
         ds,
@@ -151,8 +152,8 @@ def validate_dataset(
     spec_text += """
     ### 4.4 Data Variable Naming and Attributes
 
-    > "The data variable name SHOULD be a CF convention standard name or use a sensible name from the ECMWF parameter database."
-    > "The data variable MUST include the `long_name`, `standard_name` and `units` attributes following CF conventions."
+    - The data variable name SHOULD be a CF convention standard name or use a sensible name from the ECMWF parameter database.
+    - The data variable MUST include the `long_name`, `standard_name` and `units` attributes following CF conventions.
     """
     allowed_standard_names = (
         "rainfall_flux",
@@ -169,9 +170,9 @@ def validate_dataset(
     spec_text += """
     ### 4.5 Georeferencing
 
-    > "The dataset MUST include proper georeferencing information following the GeoZarr specification."
-    > "The data variable MUST include a `grid_mapping` attribute that references the coordinate reference system (crs) variable."
-    > "The crs variable MUST include both a `spatial_ref` and a `crs_wkt` attribute with a WKT string."
+    - The dataset MUST include proper georeferencing information following the GeoZarr specification.
+    - The data variable MUST include a `grid_mapping` attribute that references the coordinate reference system (crs) variable.
+    - The crs variable MUST include both a `spatial_ref` and a `crs_wkt` attribute with a WKT string.
     """
     report += check_georeferencing(
         ds,
@@ -188,7 +189,7 @@ def validate_dataset(
     spec_text += """
     ### 5.1 Conditional Global Attributes
 
-    > "The global attribute `consistent_timestep_start` is CONDITIONALLY REQUIRED if the dataset uses a variable timestep. It MUST be an ISO formatted datetime string indicating the first timestamp where regular timestepping begins."
+    - The global attribute `consistent_timestep_start` is CONDITIONALLY REQUIRED if the dataset uses a variable timestep. It MUST be an ISO formatted datetime string indicating the first timestamp where regular timestepping begins.
     """
     report += check_conditional_global_attributes(
         ds,
@@ -198,9 +199,9 @@ def validate_dataset(
     spec_text += """
     ### 5.2 Licensing Requirements
 
-    > "The dataset MUST include a global `license` attribute containing a valid SPDX identifier."
-    > "The following licenses are RECOMMENDED: `CC-BY`, `CC-BY-SA`, `OGL`."
-    > "Licenses with `NC` or `ND` restrictions SHOULD generate warnings but MAY be accepted after review."
+    - The dataset MUST include a global `license` attribute containing a valid SPDX identifier.
+    - The following licenses are RECOMMENDED: `CC-BY`, `CC-BY-SA`, `OGL`.
+    - Licenses with `NC` or `ND` restrictions SHOULD generate warnings but MAY be accepted after review.
     """
     report += check_license(
         ds,
@@ -225,26 +226,26 @@ def validate_dataset(
     spec_text += """
     ### 5.3 Zarr Format
 
-    > "The dataset MUST use Zarr version 2 or version 3 format."
-    > "If Zarr version 2 is used, the dataset MUST include consolidated metadata."
+    - The dataset MUST use Zarr version 2 or version 3 format.
+    - If Zarr version 2 is used, the dataset MUST include consolidated metadata.
     """
     report += check_zarr_format(
         ds,
         allowed_versions=[2, 3],
         require_consolidated_if_v2=True,
-        storage_options=storage_options,
     )
 
     spec_text += """
     ### 5.4 MLCast Metadata
 
-    > "The dataset MUST include the following global attributes:
-    > - `mlcast_created_on`: ISO formatted datetime of dataset creation.
-    > - `mlcast_created_by`: Creator contact in `Name <email>` format.
-    > - `mlcast_created_with`: GitHub URL of the creating software including version (e.g., https://github.com/mlcast-community/mlcast-dataset-radklim@v0.1.0) and the repository/revision MUST exist.
-    > - `mlcast_dataset_version`: Dataset specification version (semver or calver).
-    > - `mlcast_dataset_identifier`: Unique dataset identifier formatted as `<country_code>-<entity>-<physical_variable>` by default.
-    > - `mlcast_dataset_identifier_format`: OPTIONAL format string that MUST start with `<country_code>-<entity>-<physical_variable>` and MAY include only the approved identifier parts: `country_code`, `entity`, `physical_variable`, `time_resolution`, `common_name`."
+    The dataset MUST include the following global attributes:
+
+    - `mlcast_created_on`: ISO formatted datetime of dataset creation.
+    - `mlcast_created_by`: Creator contact in `Name <email>` format.
+    - `mlcast_created_with`: GitHub URL of the creating software including version (e.g., https://github.com/mlcast-community/mlcast-dataset-radklim@v0.1.0) and the repository/revision MUST exist.
+    - `mlcast_dataset_version`: Dataset specification version (semver or calver).
+    - `mlcast_dataset_identifier`: Unique dataset identifier formatted as `<country_code>-<entity>-<physical_variable>` by default.
+    - `mlcast_dataset_identifier_format`: OPTIONAL format string that MUST start with `<country_code>-<entity>-<physical_variable>` and MAY include only the approved identifier parts: `country_code`, `entity`, `physical_variable`, `time_resolution`, `common_name`.
     """
     report += check_mlcast_metadata(ds)
 
@@ -257,17 +258,17 @@ def validate_dataset(
     spec_text += """
     ### 6.1 GDAL Compatibility
 
-    > "The dataset SHOULD expose georeferencing metadata readable by GDAL, including a CRS WKT."
-    > "A basic GeoTIFF export SHOULD roundtrip through GDAL with geotransform/projection metadata."
+    - The dataset SHOULD expose georeferencing metadata readable by GDAL, including a CRS WKT.
+    - A basic GeoTIFF export SHOULD roundtrip through GDAL with geotransform/projection metadata.
     """
     report += check_gdal_compatibility(ds)
 
     spec_text += """
     ### 6.2 Cartopy Compatibility
 
-    > "The CRS WKT SHOULD be parseable by cartopy."
-    > "Coordinate grids SHOULD transform cleanly into PlateCarree for mapping workflows."
+    - The CRS WKT SHOULD be parseable by cartopy.
+    - Coordinate grids SHOULD transform cleanly into PlateCarree for mapping workflows.
     """
     report += check_cartopy_compatibility(ds)
 
-    return report
+    return report, textwrap.dedent(spec_text)
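The spec module no longer opens the store itself: callers pass an already-open dataset and get back both the report and the dedented spec text. A minimal sketch mirroring what the updated CLI does (the store path is hypothetical):

import xarray as xr

from mlcast_dataset_validator.specs.source_data import radar_precipitation

ds = xr.open_zarr("data/radar.zarr")  # hypothetical local store
report, spec_text = radar_precipitation.validate_dataset(ds=ds)

report.console_print()  # rich table to stdout (or pass file=... to redirect)
with open("spec.md", "w") as f:
    f.write(spec_text)  # ready-to-publish Markdown with the new frontmatter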
--- a/mlcast_dataset_validator/utils/logging_decorator.py
+++ b/mlcast_dataset_validator/utils/logging_decorator.py
@@ -1,19 +1,5 @@
-from functools import wraps
+"""Compat shim: import logging helpers from specs.reporting."""
 
-from loguru import logger
+from ..specs.reporting import CHECK_REGISTRY, log_function_call
 
-
-def log_function_call(func):
-    """Decorator to log function calls with their arguments."""
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        logger.info(f"Applying {func.__name__} with {kwargs}")
-        report = func(*args, **kwargs)
-        # Set module and function name on the result object
-        for result in report.results:
-            result.module = func.__module__
-            result.function = func.__name__
-        return report
-
-    return wrapper
+__all__ = ["CHECK_REGISTRY", "log_function_call"]
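The shim keeps the 0.1.0 import path alive, so checks written against the old module keep working unchanged:

# Both paths now resolve to the same function object.
from mlcast_dataset_validator.specs.reporting import log_function_call as new_path
from mlcast_dataset_validator.utils.logging_decorator import log_function_call as old_path

assert new_path is old_path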
--- a/mlcast_dataset_validator-0.1.0.dist-info/METADATA
+++ b/mlcast_dataset_validator-0.2.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlcast-dataset-validator
-Version: 0.1.0
+Version: 0.2.0
 Summary: Add your description here
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
@@ -23,6 +23,8 @@ Dynamic: license-file
 
 # mlcast-dataset-validator
 
+[![Spec Docs](https://img.shields.io/badge/spec%20docs-HTML-blue)](https://mlcast-community.github.io/mlcast-dataset-validator/)
+
 Dataset validator for the MLCast Intake catalog ([mlcast-datasets](https://github.com/mlcast-community/mlcast-datasets)).
 
 ## What is this?
--- /dev/null
+++ b/mlcast_dataset_validator-0.2.0.dist-info/RECORD
@@ -0,0 +1,36 @@
+mlcast_dataset_validator/__init__.py,sha256=8EyAIaXvAvj4xzQVXeGeDuottFvUy4BsxgCpU9zfTdc,317
+mlcast_dataset_validator/checks/data_vars_filter.py,sha256=2bS6YRqPBWYYOEjRoUQRx16XpQ9lYoF8O8kVEXB117A,1641
+mlcast_dataset_validator/checks/coords/__init__.py,sha256=tpAlg9zunrUeyqJ4DYu605HR_v18A30-L0xLET4bO98,15
+mlcast_dataset_validator/checks/coords/names.py,sha256=v102OC2Usmd9X5gqSXWWcMd7iR7rjMSYC8xtT85RQe0,8372
+mlcast_dataset_validator/checks/coords/spatial.py,sha256=f7xj5L7dGLkDP6xMu_P1ZQDbbSFKcrGi1ZWUrCeej_w,3619
+mlcast_dataset_validator/checks/coords/temporal.py,sha256=SDcEYYudK4WjCCekj8Oa-EG8yhMeokGG0LC5KWCEbTc,12507
+mlcast_dataset_validator/checks/data_vars/__init__.py,sha256=x0QGejaph-2tbTVbEsOTqWG7uOvqWo-OIz0vFriF0Z4,15
+mlcast_dataset_validator/checks/data_vars/chunking.py,sha256=eGk5hEB-ksvwNMNGVJvhIVok-U4Cy0iPqwZaLAkZk8M,1692
+mlcast_dataset_validator/checks/data_vars/compression.py,sha256=0EbOc388nnXWOx-NFkrbreIfMrVO6zQDUSZSDdxNKUQ,4042
+mlcast_dataset_validator/checks/data_vars/data_structure.py,sha256=dvDl51D3ZIQStYsf3lyIE3lmQGMZc3WJ06suruwdUnc,1872
+mlcast_dataset_validator/checks/data_vars/data_variable.py,sha256=H36elAD1uUtGVCeaeE1V_getvAyQFWq8enaBeBBtqms,1943
+mlcast_dataset_validator/checks/data_vars/georeferencing.py,sha256=CbH6vK1ucoRKnZD2y_p02tvWwbJuVk_Ejr9QJzkP57c,2232
+mlcast_dataset_validator/checks/data_vars/naming.py,sha256=1HuL4C5CJQoE9b7Ww80b3G0fNB6FrlR7_7WfBh9yAlI,5457
+mlcast_dataset_validator/checks/global_attributes/__init__.py,sha256=OGycUmJKEN3J7KD5XDtwp3tIN20aJcWCfI4dh7NvLp0,15
+mlcast_dataset_validator/checks/global_attributes/conditional.py,sha256=HmqNfU7hbnU_cyUj9V2s1IqHiLdqYZEw4fxT9hftetI,1931
+mlcast_dataset_validator/checks/global_attributes/licensing.py,sha256=bPeZYIYgt5MguvXgyBRbi8nMoFZx4pGEtz7IrEfwcg4,4659
+mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py,sha256=21c_69cY3BwCmfX68esN7Ja7KJvNvn0a0941OFM7X3k,21663
+mlcast_dataset_validator/checks/global_attributes/zarr_format.py,sha256=nq3be1nR4wUs2GnUJP6EG3q49BcQsPGOMc-bt4snSeU,2671
+mlcast_dataset_validator/checks/tool_compatibility/__init__.py,sha256=lPaUZM1rQCeQCtpTxzhvvLB5WoRordTvJt5gJ8ApYcE,15
+mlcast_dataset_validator/checks/tool_compatibility/cartopy.py,sha256=0n1X4ANIZ6U11QxqVOOHIvcxSTpw8KpwijgNQ0hWpWU,5543
+mlcast_dataset_validator/checks/tool_compatibility/gdal.py,sha256=nACc8Kv-D5E47THxfLmbQGTj1bzSIvHrCqLlNUYq6CQ,7448
+mlcast_dataset_validator/specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mlcast_dataset_validator/specs/cli.py,sha256=G1c4rMeBocHvbd3Z9-BHt9PnCmC4nwqHuor9n46Mt0o,6645
+mlcast_dataset_validator/specs/reporting.py,sha256=jcoNw9fdv51TljtOGXflwMqdyXZ3cBDqtKJAVd9lVFI,7045
+mlcast_dataset_validator/specs/source_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mlcast_dataset_validator/specs/source_data/radar_precipitation.py,sha256=7ftMti8f0DlCmjxFZ-bqhIvIpmG7Kk5UwX2RN8Qebwc,10336
+mlcast_dataset_validator/specs/training_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mlcast_dataset_validator/utils/logging_decorator.py,sha256=nub3yic77svFZFp0AHpK9S1doW_kyfeoDRCfgZbj6LM,180
+mlcast_dataset_validator-0.2.0.dist-info/licenses/LICENSE,sha256=OJwqNMiebzDI5xO80428flMTXjYT_NTzGid_LrlAdQE,346
+mlcast_dataset_validator-0.2.0.dist-info/licenses/LICENSE-APACHE,sha256=zjN0GIESbq-pIfRMHPnKuYAtB09yJEqAZxJ2eTmUN0M,11347
+mlcast_dataset_validator-0.2.0.dist-info/licenses/LICENSE-BSD,sha256=nxwpuv65fwXrfZtDbnObrC5gyCryKO-QZoKdjoYX-Cw,1481
+mlcast_dataset_validator-0.2.0.dist-info/METADATA,sha256=7ALR2bg8ibf_Cme8_QrgHGk1VuJcEaXs7DR9O9_83EY,6340
+mlcast_dataset_validator-0.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+mlcast_dataset_validator-0.2.0.dist-info/entry_points.txt,sha256=agJXVv9X2zWETDGjZzEWVZi_BAivfFB72QgoCuKoW-I,84
+mlcast_dataset_validator-0.2.0.dist-info/top_level.txt,sha256=cn53cAYHj7S05A2j8JKGV30YgEbqylDYaRsFn6Ppp60,25
+mlcast_dataset_validator-0.2.0.dist-info/RECORD,,
--- a/mlcast_dataset_validator-0.1.0.dist-info/RECORD
+++ /dev/null
@@ -1,36 +0,0 @@
-mlcast_dataset_validator/__init__.py,sha256=8EyAIaXvAvj4xzQVXeGeDuottFvUy4BsxgCpU9zfTdc,317
-mlcast_dataset_validator/checks/data_vars_filter.py,sha256=2bS6YRqPBWYYOEjRoUQRx16XpQ9lYoF8O8kVEXB117A,1641
-mlcast_dataset_validator/checks/coords/__init__.py,sha256=tpAlg9zunrUeyqJ4DYu605HR_v18A30-L0xLET4bO98,15
-mlcast_dataset_validator/checks/coords/names.py,sha256=NSRAZ3tXVjamGpoO2oZIaDR0VWEUxMxWcZrf2UthYHI,8405
-mlcast_dataset_validator/checks/coords/spatial.py,sha256=67DNv9LXJ7h60viZi0lSXiz6Kewjm626G1vfnJO3ZEM,3652
-mlcast_dataset_validator/checks/coords/temporal.py,sha256=Rf7lG_w92xMEHZ5oBEb6aFl9Fu81YSILd4GRuCOGqyw,12540
-mlcast_dataset_validator/checks/data_vars/__init__.py,sha256=x0QGejaph-2tbTVbEsOTqWG7uOvqWo-OIz0vFriF0Z4,15
-mlcast_dataset_validator/checks/data_vars/chunking.py,sha256=WwywuPHzPuwkTM0UJjDVcD40Ns_nNFofihiFzNals1c,1725
-mlcast_dataset_validator/checks/data_vars/compression.py,sha256=UL8D5jd69HYtz884yXOPzKfEm6gQs3tNzRsuJGhJxts,4075
-mlcast_dataset_validator/checks/data_vars/data_structure.py,sha256=kBLahdABZF5dpPlhsZEomNhsp0APgNnmoNUVmWrmYIc,1905
-mlcast_dataset_validator/checks/data_vars/data_variable.py,sha256=P10qzfhV0P5X0xXyatoxBmzXf1lxTZKg1U_Ez_PTpLg,1976
-mlcast_dataset_validator/checks/data_vars/georeferencing.py,sha256=CKraolEGPXxgfy4q_9TYnYMIdegiGr0vdGOfECZa47o,2265
-mlcast_dataset_validator/checks/data_vars/naming.py,sha256=fZMpDJKNfOvtqs_1QYEUGmJxhp3BR_eTf6rjaqk7hb4,5490
-mlcast_dataset_validator/checks/global_attributes/__init__.py,sha256=OGycUmJKEN3J7KD5XDtwp3tIN20aJcWCfI4dh7NvLp0,15
-mlcast_dataset_validator/checks/global_attributes/conditional.py,sha256=z5i0AKEK42BT2_3XzleJD5w6i91VbVhatf3fNT9atpM,1964
-mlcast_dataset_validator/checks/global_attributes/licensing.py,sha256=sUCyVNx_PegagTROg57REll10jKT_KfU3zGuGnJPi-k,4692
-mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py,sha256=EoqFvRkobOkKdHcQSvItmAXRIVe-u1aNqGNv-bGnXUc,21658
-mlcast_dataset_validator/checks/global_attributes/zarr_format.py,sha256=kAy66Z5VZhXAnDVFiVM5mm3pyE78j3sLdwycUOVJlpI,2610
-mlcast_dataset_validator/checks/tool_compatibility/__init__.py,sha256=lPaUZM1rQCeQCtpTxzhvvLB5WoRordTvJt5gJ8ApYcE,15
-mlcast_dataset_validator/checks/tool_compatibility/cartopy.py,sha256=ofq82tH2w5Fe8QUlMvehkVozvzoAe3QzTJQXwINGt1E,5576
-mlcast_dataset_validator/checks/tool_compatibility/gdal.py,sha256=LemeGk-XugFg43b0sAJyWUz0M8PF1w1Iy-6N0QmZ2eY,7481
-mlcast_dataset_validator/specs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mlcast_dataset_validator/specs/base.py,sha256=uoIIdTXJBTi6QISUsWzkI2r1-gxMPhcED8OeghY3t-8,4702
-mlcast_dataset_validator/specs/cli.py,sha256=Kz8Ky9JNGR3l9UpNDGqDwDhjTWl9bkatdIe09NA13e8,5449
-mlcast_dataset_validator/specs/source_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mlcast_dataset_validator/specs/source_data/radar_precipitation.py,sha256=RFd3VBbz_Ce8YaoEexfG41SoSLLLmkfCY2JpfqGXr9c,10512
-mlcast_dataset_validator/specs/training_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mlcast_dataset_validator/utils/logging_decorator.py,sha256=s0zgRiTzMzyOrPrreNKjnyP_ftl8Tjp2fVMz62_vE-A,532
-mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE,sha256=OJwqNMiebzDI5xO80428flMTXjYT_NTzGid_LrlAdQE,346
-mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-APACHE,sha256=zjN0GIESbq-pIfRMHPnKuYAtB09yJEqAZxJ2eTmUN0M,11347
-mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-BSD,sha256=nxwpuv65fwXrfZtDbnObrC5gyCryKO-QZoKdjoYX-Cw,1481
-mlcast_dataset_validator-0.1.0.dist-info/METADATA,sha256=jn90Q5yk9-jky-aIZkaNOZe6z-VVE5XKGA941Q4YKiw,6210
-mlcast_dataset_validator-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-mlcast_dataset_validator-0.1.0.dist-info/entry_points.txt,sha256=agJXVv9X2zWETDGjZzEWVZi_BAivfFB72QgoCuKoW-I,84
-mlcast_dataset_validator-0.1.0.dist-info/top_level.txt,sha256=cn53cAYHj7S05A2j8JKGV30YgEbqylDYaRsFn6Ppp60,25
-mlcast_dataset_validator-0.1.0.dist-info/RECORD,,