mlcast-dataset-validator 0.1.0__py3-none-any.whl
- mlcast_dataset_validator/__init__.py +10 -0
- mlcast_dataset_validator/checks/coords/__init__.py +1 -0
- mlcast_dataset_validator/checks/coords/names.py +275 -0
- mlcast_dataset_validator/checks/coords/spatial.py +100 -0
- mlcast_dataset_validator/checks/coords/temporal.py +389 -0
- mlcast_dataset_validator/checks/data_vars/__init__.py +1 -0
- mlcast_dataset_validator/checks/data_vars/chunking.py +53 -0
- mlcast_dataset_validator/checks/data_vars/compression.py +130 -0
- mlcast_dataset_validator/checks/data_vars/data_structure.py +63 -0
- mlcast_dataset_validator/checks/data_vars/data_variable.py +62 -0
- mlcast_dataset_validator/checks/data_vars/georeferencing.py +67 -0
- mlcast_dataset_validator/checks/data_vars/naming.py +158 -0
- mlcast_dataset_validator/checks/data_vars_filter.py +49 -0
- mlcast_dataset_validator/checks/global_attributes/__init__.py +1 -0
- mlcast_dataset_validator/checks/global_attributes/conditional.py +67 -0
- mlcast_dataset_validator/checks/global_attributes/licensing.py +150 -0
- mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py +658 -0
- mlcast_dataset_validator/checks/global_attributes/zarr_format.py +88 -0
- mlcast_dataset_validator/checks/tool_compatibility/__init__.py +1 -0
- mlcast_dataset_validator/checks/tool_compatibility/cartopy.py +180 -0
- mlcast_dataset_validator/checks/tool_compatibility/gdal.py +251 -0
- mlcast_dataset_validator/specs/__init__.py +0 -0
- mlcast_dataset_validator/specs/base.py +146 -0
- mlcast_dataset_validator/specs/cli.py +176 -0
- mlcast_dataset_validator/specs/source_data/__init__.py +0 -0
- mlcast_dataset_validator/specs/source_data/radar_precipitation.py +273 -0
- mlcast_dataset_validator/specs/training_data/__init__.py +0 -0
- mlcast_dataset_validator/utils/logging_decorator.py +19 -0
- mlcast_dataset_validator-0.1.0.dist-info/METADATA +126 -0
- mlcast_dataset_validator-0.1.0.dist-info/RECORD +36 -0
- mlcast_dataset_validator-0.1.0.dist-info/WHEEL +5 -0
- mlcast_dataset_validator-0.1.0.dist-info/entry_points.txt +2 -0
- mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE +11 -0
- mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-APACHE +202 -0
- mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-BSD +26 -0
- mlcast_dataset_validator-0.1.0.dist-info/top_level.txt +1 -0
mlcast_dataset_validator/__init__.py
@@ -0,0 +1,10 @@
+"""Top-level package for the MLCast source data validator."""
+
+from importlib.metadata import PackageNotFoundError, version
+
+__all__ = ["__version__"]
+
+try:
+    __version__ = version("mlcast-dataset-validator")
+except PackageNotFoundError:  # pragma: no cover - package metadata unavailable
+    __version__ = "0.0.0"
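For reference, the fallback pattern above can be exercised outside the package; this is a minimal sketch using a hypothetical distribution name that is not installed:

from importlib.metadata import PackageNotFoundError, version

try:
    pkg_version = version("some-uninstalled-dist")  # hypothetical name
except PackageNotFoundError:
    pkg_version = "0.0.0"  # same placeholder the package falls back to

print(pkg_version)  # -> 0.0.0 when distribution metadata is absent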
mlcast_dataset_validator/checks/coords/__init__.py
@@ -0,0 +1 @@
+SECTION_ID = 3
mlcast_dataset_validator/checks/coords/names.py
@@ -0,0 +1,275 @@
+from typing import Dict, List, Sequence
+
+import xarray as xr
+
+from ...specs.base import ValidationReport
+from ...utils.logging_decorator import log_function_call
+from . import SECTION_ID as PARENT_SECTION_ID
+
+SECTION_ID = f"{PARENT_SECTION_ID}.1"
+
+# Normalized unit strings accepted for latitude coordinates
+_LAT_UNITS = {
+    "degrees_north",
+    "degree_north",
+    "degrees_n",
+    "degree_n",
+    "deg_n",
+}
+# Normalized unit strings accepted for longitude coordinates
+_LON_UNITS = {
+    "degrees_east",
+    "degree_east",
+    "degrees_e",
+    "degree_e",
+    "deg_e",
+}
+# Linear distance units accepted for projected coordinates
+_LINEAR_DISTANCE_UNITS = {
+    "m",
+    "meter",
+    "meters",
+    "metre",
+    "metres",
+    "km",
+    "kilometer",
+    "kilometers",
+    "kilometre",
+    "kilometres",
+}
+
+RuleSet = List[Dict[str, Sequence[str]]]
+
+# Metadata-driven rules for identifying CF coordinate requirements.
+# Each top-level key corresponds to a coordinate category (lat/lon/x/y/time).
+# The value is a list of rule dictionaries; a coordinate matches the category
+# if it satisfies *any* rule in the list (logical OR). Within each rule the
+# coordinate must satisfy *all* attribute/value pairs (logical AND). Some
+# rules check CF metadata (standard_name/units/axis) while others fall back
+# to name-based heuristics when metadata is absent.
+_COORD_RULES: Dict[str, RuleSet] = {
+    "lat": [
+        {"standard_name": ("latitude",)},
+        {"axis": ("Y",), "units": tuple(_LAT_UNITS)},
+        {"name": ("lat", "latitude")},
+    ],
+    "lon": [
+        {"standard_name": ("longitude",)},
+        {"axis": ("X",), "units": tuple(_LON_UNITS)},
+        {"name": ("lon", "longitude")},
+    ],
+    "x": [
+        {"standard_name": ("projection_x_coordinate",)},
+        {"axis": ("X",), "units": tuple(_LINEAR_DISTANCE_UNITS)},
+        {"name": ("x", "easting")},
+    ],
+    "y": [
+        {"standard_name": ("projection_y_coordinate",)},
+        {"axis": ("Y",), "units": tuple(_LINEAR_DISTANCE_UNITS)},
+        {"name": ("y", "northing")},
+    ],
+    "time": [
+        {"standard_name": ("time",)},
+        {"axis": ("T",)},
+        {"name": ("time",)},
+    ],
+}
+
+
+def _normalize(value: str) -> str:
+    """Lowercase-and-trim helper to normalize metadata values for comparison."""
+    return value.strip().lower()
+
+
+def _matches_rule(
+    coord_name: str, coord_var: xr.DataArray, rule: Dict[str, Sequence[str]]
+) -> bool:
+    """
+    Determine whether a coordinate satisfies a specific CF metadata rule.
+
+    Parameters
+    ----------
+    coord_name : str
+        Name of the coordinate variable.
+    coord_var : xr.DataArray
+        Coordinate data array with attached attributes.
+    rule : dict
+        Mapping of attribute names to allowed values (e.g., `standard_name`,
+        `units`, `axis`, or the synthetic `name`).
+
+    Returns
+    -------
+    bool
+        True if all required attributes match one of the allowed values.
+    """
+    for attr, allowed in rule.items():
+        if attr == "name":
+            value = coord_name
+        else:
+            value = coord_var.attrs.get(attr)
+
+        if value is None:
+            return False
+
+        if attr == "axis":
+            normalized_value = value.strip().upper()
+            allowed_values = {str(option).strip().upper() for option in allowed}
+        else:
+            normalized_value = _normalize(str(value))
+            allowed_values = {_normalize(str(option)) for option in allowed}
+
+        if normalized_value not in allowed_values:
+            return False
+
+    return True
+
+
+def _find_coordinates(ds: xr.Dataset, rules: RuleSet) -> List[str]:
+    """
+    Find coordinate names that satisfy at least one rule in the given set.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        Dataset whose coordinates should be inspected.
+    rules : list[dict]
+        Sequence of rule dictionaries describing acceptable metadata combinations.
+
+    Returns
+    -------
+    list[str]
+        Names of coordinates that matched any rule.
+    """
+    matches: List[str] = []
+    for coord_name in ds.coords:
+        coord_var = ds.coords[coord_name]
+        if any(_matches_rule(coord_name, coord_var, rule) for rule in rules):
+            matches.append(coord_name)
+    return matches
+
+
+def _format_coord_list(names: Sequence[str]) -> str:
+    """
+    Render a list of coordinate names for human-readable reporting.
+
+    Parameters
+    ----------
+    names : Sequence[str]
+        Coordinate names to concatenate.
+
+    Returns
+    -------
+    str
+        Comma-separated list (or 'none' if empty).
+    """
+    if not names:
+        return "none"
+    return ", ".join(f"'{name}'" for name in names)
+
+
+@log_function_call
+def check_coordinate_names(
+    ds: xr.Dataset,
+    *,
+    require_time_coord: bool = True,
+    require_projected_coords: bool = False,
+    require_latlon_coords: bool = False,
+) -> ValidationReport:
+    """
+    Validate that the dataset exposes CF-compliant coordinate variables.
+
+    The dataset must provide either:
+    - Latitude/longitude coordinates with CF-compliant metadata
+    - Projected x/y coordinates with CF-compliant metadata
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        Dataset to evaluate.
+    require_time_coord : bool, optional
+        Require at least one CF-compliant time coordinate. Defaults to True.
+    require_projected_coords : bool, optional
+        Require projected x/y coordinates. Defaults to False.
+    require_latlon_coords : bool, optional
+        Require latitude/longitude coordinates. Defaults to False.
+    """
+
+    report = ValidationReport()
+
+    time_coords = _find_coordinates(ds, _COORD_RULES["time"])
+    if time_coords:
+        report.add(
+            SECTION_ID,
+            "Time coordinate presence",
+            "PASS",
+            f"CF-compliant time coordinate(s) found: {_format_coord_list(time_coords)}",
+        )
+    elif require_time_coord:
+        report.add(
+            SECTION_ID,
+            "Time coordinate presence",
+            "FAIL",
+            "Dataset is missing a CF-compliant time coordinate (requires `standard_name=time`, `axis=T`, or a 'time' coordinate).",
+        )
+
+    lat_coords = _find_coordinates(ds, _COORD_RULES["lat"])
+    lon_coords = _find_coordinates(ds, _COORD_RULES["lon"])
+    x_coords = _find_coordinates(ds, _COORD_RULES["x"])
+    y_coords = _find_coordinates(ds, _COORD_RULES["y"])
+
+    geographic_ok = bool(lat_coords and lon_coords)
+    projected_ok = bool(x_coords and y_coords)
+
+    if geographic_ok:
+        report.add(
+            SECTION_ID,
+            "Latitude/longitude coordinates",
+            "PASS",
+            f"CF-compliant latitude ({_format_coord_list(lat_coords)}) and longitude ({_format_coord_list(lon_coords)}) coordinates detected.",
+        )
+    if projected_ok:
+        report.add(
+            SECTION_ID,
+            "Projected coordinates",
+            "PASS",
+            f"CF-compliant projected x ({_format_coord_list(x_coords)}) and y ({_format_coord_list(y_coords)}) coordinates detected.",
+        )
+
+    failures: List[str] = []
+    if require_latlon_coords and not geographic_ok:
+        failures.append(
+            "Latitude/longitude coordinates are required but no CF-compliant pair was found."
+        )
+    if require_projected_coords and not projected_ok:
+        failures.append(
+            "Projected x/y coordinates are required but no CF-compliant pair was found."
+        )
+
+    if not (geographic_ok or projected_ok):
+        missing_geo = []
+        if not lat_coords:
+            missing_geo.append("latitude")
+        if not lon_coords:
+            missing_geo.append("longitude")
+        missing_proj = []
+        if not x_coords:
+            missing_proj.append("projection_x_coordinate")
+        if not y_coords:
+            missing_proj.append("projection_y_coordinate")
+        detail = (
+            f"Latitude/longitude pair incomplete (missing CF-compliant {', '.join(missing_geo)} coordinate). "
+            f"Projected x/y pair incomplete (missing CF-compliant {', '.join(missing_proj)} coordinate)."
+        )
+        failures.append(
+            f"Dataset must include CF-compliant latitude/longitude or projected coordinates. {detail}"
+        )
+
+    if failures:
+        report.add(
+            SECTION_ID,
+            "Coordinate reference compliance",
+            "FAIL",
+            " ".join(failures),
+        )
+
+    return report
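To illustrate the OR-of-ANDs matching that the `_COORD_RULES` comment describes, here is a hypothetical call (dataset contents invented for the example) in which `x`/`y` match via both the `axis`+`units` rule and the name fallback, while `time` matches only the name fallback:

import numpy as np
import pandas as pd
import xarray as xr

from mlcast_dataset_validator.checks.coords.names import check_coordinate_names

ds = xr.Dataset(
    coords={
        "time": pd.date_range("2020-01-01", periods=3, freq="5min"),
        "x": ("x", np.arange(4) * 1000.0, {"axis": "X", "units": "m"}),
        "y": ("y", np.arange(4) * 1000.0, {"axis": "Y", "units": "m"}),
    }
)

# Expected: PASS for the time coordinate and the projected x/y pair.
# No lat/lon pair exists, but that alone is not a failure unless
# require_latlon_coords=True is passed.
report = check_coordinate_names(ds, require_projected_coords=True)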
mlcast_dataset_validator/checks/coords/spatial.py
@@ -0,0 +1,100 @@
+import xarray as xr
+
+from ...specs.base import ValidationReport
+from ...utils.logging_decorator import log_function_call
+from . import SECTION_ID as PARENT_SECTION_ID
+
+SECTION_ID = f"{PARENT_SECTION_ID}.2"
+
+
+@log_function_call
+def check_spatial_requirements(
+    ds: xr.Dataset,
+    max_resolution_km: float,
+    min_crop_size: tuple[int, int],
+    require_constant_domain: bool,
+) -> ValidationReport:
+    """
+    Validate spatial requirements for the dataset.
+
+    Parameters:
+        ds (xr.Dataset): The dataset to validate.
+        max_resolution_km (float): Maximum allowed spatial resolution in kilometers.
+        min_crop_size (tuple[int, int]): Minimum crop size as (height, width) in pixels.
+        require_constant_domain (bool): Whether the spatial domain must remain constant across timesteps.
+
+    Returns:
+        ValidationReport: A report containing the results of the spatial validation checks.
+    """
+    report = ValidationReport()
+
+    # Validate spatial resolution
+    if "x" in ds.coords and "y" in ds.coords:
+        try:
+            x_vals = ds.x.values
+            y_vals = ds.y.values
+            if len(x_vals) > 1 and len(y_vals) > 1:
+                x_res = abs(float(x_vals[1] - x_vals[0]))
+                y_res = abs(float(y_vals[1] - y_vals[0]))
+                if (
+                    x_res <= max_resolution_km * 1000
+                    and y_res <= max_resolution_km * 1000
+                ):
+                    report.add(
+                        SECTION_ID,
+                        "Spatial resolution ≤1km",
+                        "PASS",
+                        f"Resolution ({x_res:.1f}m × {y_res:.1f}m) ≤ {max_resolution_km}km",
+                    )
+                else:
+                    report.add(
+                        SECTION_ID,
+                        "Spatial resolution ≤1km",
+                        "FAIL",
+                        f"Resolution ({x_res:.1f}m × {y_res:.1f}m) exceeds {max_resolution_km}km limit",
+                    )
+        except Exception as e:
+            report.add(
+                SECTION_ID,
+                "Spatial resolution ≤1km",
+                "WARNING",
+                f"Could not verify spatial resolution: {e}",
+            )
+
+    # Find all grid_mapping data variables since we don't want to check those
+    grid_mapping_vars = set()
+    for var in ds.data_vars:
+        if "grid_mapping" in ds[var].attrs:
+            grid_mapping_vars.add(ds[var].attrs["grid_mapping"])
+    data_vars = set(ds.data_vars) - grid_mapping_vars
+
+    # Validate spatial coverage
+    for data_var in data_vars:
+        data_array = ds[data_var]
+        dims = data_array.dims
+        spatial_dims = [d for d in dims if d not in ["time", "t"]]
+        if len(spatial_dims) < 2:
+            report.add(
+                SECTION_ID,
+                "Spatial dimension check",
+                "FAIL",
+                f"Need at least 2 spatial dimensions for {data_var} ({dims})",
+            )
+            continue
+        spatial_sizes = [data_array.sizes[d] for d in spatial_dims]
+        if all(s >= min_crop_size[0] for s in spatial_sizes):
+            report.add(
+                SECTION_ID,
+                "256×256 pixel support",
+                "PASS",
+                f"Spatial dimensions {spatial_sizes} support {min_crop_size[0]}×{min_crop_size[1]} crops",
+            )
+        else:
+            report.add(
+                SECTION_ID,
+                "256×256 pixel support",
+                "FAIL",
+                f"Spatial dimensions {spatial_sizes} too small for {min_crop_size[0]}×{min_crop_size[1]} crops",
+            )
+
+    return report
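A hypothetical invocation of the spatial check (grid values invented for the example). Note two details visible in the body above: the crop-size test compares every spatial dimension against `min_crop_size[0]` only, and `require_constant_domain` is accepted but not referenced in the code shown here.

import numpy as np
import xarray as xr

from mlcast_dataset_validator.checks.coords.spatial import check_spatial_requirements

ds = xr.Dataset(
    {"precip": (("time", "y", "x"), np.zeros((2, 512, 512)))},
    coords={
        "x": np.arange(512) * 500.0,  # 500 m grid spacing
        "y": np.arange(512) * 500.0,
    },
)

# Expected: resolution PASS (500.0 m ≤ 1 km on both axes) and crop PASS
# (512 ≥ 256 along both spatial dims).
report = check_spatial_requirements(
    ds,
    max_resolution_km=1.0,
    min_crop_size=(256, 256),
    require_constant_domain=True,
)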