mlcast-dataset-validator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. mlcast_dataset_validator/__init__.py +10 -0
  2. mlcast_dataset_validator/checks/coords/__init__.py +1 -0
  3. mlcast_dataset_validator/checks/coords/names.py +275 -0
  4. mlcast_dataset_validator/checks/coords/spatial.py +100 -0
  5. mlcast_dataset_validator/checks/coords/temporal.py +389 -0
  6. mlcast_dataset_validator/checks/data_vars/__init__.py +1 -0
  7. mlcast_dataset_validator/checks/data_vars/chunking.py +53 -0
  8. mlcast_dataset_validator/checks/data_vars/compression.py +130 -0
  9. mlcast_dataset_validator/checks/data_vars/data_structure.py +63 -0
  10. mlcast_dataset_validator/checks/data_vars/data_variable.py +62 -0
  11. mlcast_dataset_validator/checks/data_vars/georeferencing.py +67 -0
  12. mlcast_dataset_validator/checks/data_vars/naming.py +158 -0
  13. mlcast_dataset_validator/checks/data_vars_filter.py +49 -0
  14. mlcast_dataset_validator/checks/global_attributes/__init__.py +1 -0
  15. mlcast_dataset_validator/checks/global_attributes/conditional.py +67 -0
  16. mlcast_dataset_validator/checks/global_attributes/licensing.py +150 -0
  17. mlcast_dataset_validator/checks/global_attributes/mlcast_metadata.py +658 -0
  18. mlcast_dataset_validator/checks/global_attributes/zarr_format.py +88 -0
  19. mlcast_dataset_validator/checks/tool_compatibility/__init__.py +1 -0
  20. mlcast_dataset_validator/checks/tool_compatibility/cartopy.py +180 -0
  21. mlcast_dataset_validator/checks/tool_compatibility/gdal.py +251 -0
  22. mlcast_dataset_validator/specs/__init__.py +0 -0
  23. mlcast_dataset_validator/specs/base.py +146 -0
  24. mlcast_dataset_validator/specs/cli.py +176 -0
  25. mlcast_dataset_validator/specs/source_data/__init__.py +0 -0
  26. mlcast_dataset_validator/specs/source_data/radar_precipitation.py +273 -0
  27. mlcast_dataset_validator/specs/training_data/__init__.py +0 -0
  28. mlcast_dataset_validator/utils/logging_decorator.py +19 -0
  29. mlcast_dataset_validator-0.1.0.dist-info/METADATA +126 -0
  30. mlcast_dataset_validator-0.1.0.dist-info/RECORD +36 -0
  31. mlcast_dataset_validator-0.1.0.dist-info/WHEEL +5 -0
  32. mlcast_dataset_validator-0.1.0.dist-info/entry_points.txt +2 -0
  33. mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE +11 -0
  34. mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-APACHE +202 -0
  35. mlcast_dataset_validator-0.1.0.dist-info/licenses/LICENSE-BSD +26 -0
  36. mlcast_dataset_validator-0.1.0.dist-info/top_level.txt +1 -0
mlcast_dataset_validator/__init__.py
@@ -0,0 +1,10 @@
+ """Top-level package for the MLCast source data validator."""
+
+ from importlib.metadata import PackageNotFoundError, version
+
+ __all__ = ["__version__"]
+
+ try:
+     __version__ = version("mlcast-dataset-validator")
+ except PackageNotFoundError:  # pragma: no cover - package metadata unavailable
+     __version__ = "0.0.0"
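The try/except above is the standard importlib.metadata pattern: the version string is read from the installed dist-info metadata, with a placeholder when the distribution is not installed (e.g. a raw source checkout). A minimal sketch of the behaviour (illustrative session; the distribution name is taken from the code above, the failing name is hypothetical):

    >>> from importlib.metadata import version
    >>> version("mlcast-dataset-validator")  # reads the installed dist-info METADATA
    '0.1.0'
    >>> version("no-such-distribution")      # hypothetical name: not installed
    Traceback (most recent call last):
      ...
    importlib.metadata.PackageNotFoundError: No package metadata was found for no-such-distribution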
mlcast_dataset_validator/checks/coords/__init__.py
@@ -0,0 +1 @@
+ SECTION_ID = 3
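This bare integer seeds the hierarchical section numbering used by the coordinate checks: each submodule imports it as PARENT_SECTION_ID and appends its own suffix, as the next hunks show. A sketch of the resulting identifiers (values taken from the code below):

    >>> PARENT_SECTION_ID = 3      # checks/coords/__init__.py
    >>> f"{PARENT_SECTION_ID}.1"   # checks/coords/names.py
    '3.1'
    >>> f"{PARENT_SECTION_ID}.2"   # checks/coords/spatial.py
    '3.2'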
mlcast_dataset_validator/checks/coords/names.py
@@ -0,0 +1,275 @@
+ from typing import Dict, List, Sequence
+
+ import xarray as xr
+
+ from ...specs.base import ValidationReport
+ from ...utils.logging_decorator import log_function_call
+ from . import SECTION_ID as PARENT_SECTION_ID
+
+ SECTION_ID = f"{PARENT_SECTION_ID}.1"
+
+ # Normalized unit strings accepted for latitude coordinates
+ _LAT_UNITS = {
+     "degrees_north",
+     "degree_north",
+     "degrees_n",
+     "degree_n",
+     "deg_n",
+ }
+ # Normalized unit strings accepted for longitude coordinates
+ _LON_UNITS = {
+     "degrees_east",
+     "degree_east",
+     "degrees_e",
+     "degree_e",
+     "deg_e",
+ }
+ # Linear distance units accepted for projected coordinates
+ _LINEAR_DISTANCE_UNITS = {
+     "m",
+     "meter",
+     "meters",
+     "metre",
+     "metres",
+     "km",
+     "kilometer",
+     "kilometers",
+     "kilometre",
+     "kilometres",
+ }
+
+ RuleSet = List[Dict[str, Sequence[str]]]
+
+ # Metadata-driven rules for identifying CF coordinate requirements.
+ # Each top-level key corresponds to a coordinate category (lat/lon/x/y/time).
+ # The value is a list of rule dictionaries; a coordinate matches the category
+ # if it satisfies *any* rule in the list (logical OR). Within each rule the
+ # coordinate must satisfy *all* attribute/value pairs (logical AND). Some
+ # rules check CF metadata (standard_name/units/axis) while others fall back
+ # to name-based heuristics when metadata is absent.
+ _COORD_RULES: Dict[str, RuleSet] = {
+     "lat": [
+         {"standard_name": ("latitude",)},
+         {"axis": ("Y",), "units": tuple(_LAT_UNITS)},
+         {"name": ("lat", "latitude")},
+     ],
+     "lon": [
+         {"standard_name": ("longitude",)},
+         {"axis": ("X",), "units": tuple(_LON_UNITS)},
+         {"name": ("lon", "longitude")},
+     ],
+     "x": [
+         {"standard_name": ("projection_x_coordinate",)},
+         {"axis": ("X",), "units": tuple(_LINEAR_DISTANCE_UNITS)},
+         {"name": ("x", "easting")},
+     ],
+     "y": [
+         {"standard_name": ("projection_y_coordinate",)},
+         {"axis": ("Y",), "units": tuple(_LINEAR_DISTANCE_UNITS)},
+         {"name": ("y", "northing")},
+     ],
+     "time": [
+         {"standard_name": ("time",)},
+         {"axis": ("T",)},
+         {"name": ("time",)},
+     ],
+ }
+
+
+ def _normalize(value: str) -> str:
+     """Lowercase-and-trim helper to normalize metadata values for comparison."""
+     return value.strip().lower()
+
+
+ def _matches_rule(
+     coord_name: str, coord_var: xr.DataArray, rule: Dict[str, Sequence[str]]
+ ) -> bool:
+     """
+     Determine whether a coordinate satisfies a specific CF metadata rule.
+
+     Parameters
+     ----------
+     coord_name : str
+         Name of the coordinate variable.
+     coord_var : xr.DataArray
+         Coordinate data array with attached attributes.
+     rule : dict
+         Mapping of attribute names to allowed values (e.g., `standard_name`,
+         `units`, `axis`, or the synthetic `name` key, which matches against
+         the coordinate name itself).
+
+     Returns
+     -------
+     bool
+         True if all required attributes match one of the allowed values.
+     """
+     for attr, allowed in rule.items():
+         if attr == "name":
+             value = coord_name
+         else:
+             value = coord_var.attrs.get(attr)
+
+         if value is None:
+             return False
+
+         if attr == "axis":
+             normalized_value = str(value).strip().upper()
+             allowed_values = {str(option).strip().upper() for option in allowed}
+         else:
+             normalized_value = _normalize(str(value))
+             allowed_values = {_normalize(str(option)) for option in allowed}
+
+         if normalized_value not in allowed_values:
+             return False
+
+     return True
+
+
+ def _find_coordinates(ds: xr.Dataset, rules: RuleSet) -> List[str]:
+     """
+     Find coordinate names that satisfy at least one rule in the given set.
+
+     Parameters
+     ----------
+     ds : xr.Dataset
+         Dataset whose coordinates should be inspected.
+     rules : list[dict]
+         Sequence of rule dictionaries describing acceptable metadata combinations.
+
+     Returns
+     -------
+     list[str]
+         Names of coordinates that matched any rule.
+     """
+     matches: List[str] = []
+     for coord_name in ds.coords:
+         coord_var = ds.coords[coord_name]
+         if any(_matches_rule(str(coord_name), coord_var, rule) for rule in rules):
+             matches.append(str(coord_name))
+     return matches
+
+
+ def _format_coord_list(names: Sequence[str]) -> str:
+     """
+     Render a list of coordinate names for human-readable reporting.
+
+     Parameters
+     ----------
+     names : Sequence[str]
+         Coordinate names to concatenate.
+
+     Returns
+     -------
+     str
+         Comma-separated list (or 'none' if empty).
+     """
+     if not names:
+         return "none"
+     return ", ".join(f"'{name}'" for name in names)
+
+
+ @log_function_call
+ def check_coordinate_names(
+     ds: xr.Dataset,
+     *,
+     require_time_coord: bool = True,
+     require_projected_coords: bool = False,
+     require_latlon_coords: bool = False,
+ ) -> ValidationReport:
+     """
+     Validate that the dataset exposes CF-compliant coordinate variables.
+
+     The dataset must provide at least one of:
+     - Latitude/longitude coordinates with CF-compliant metadata
+     - Projected x/y coordinates with CF-compliant metadata
+
+     Parameters
+     ----------
+     ds : xr.Dataset
+         Dataset to evaluate.
+     require_time_coord : bool, optional
+         Require at least one CF-compliant time coordinate. Defaults to True.
+     require_projected_coords : bool, optional
+         Require projected x/y coordinates. Defaults to False.
+     require_latlon_coords : bool, optional
+         Require latitude/longitude coordinates. Defaults to False.
+
+     Returns
+     -------
+     ValidationReport
+         Report containing PASS/FAIL entries for each coordinate check.
+     """
+     report = ValidationReport()
+
+     time_coords = _find_coordinates(ds, _COORD_RULES["time"])
+     if time_coords:
+         report.add(
+             SECTION_ID,
+             "Time coordinate presence",
+             "PASS",
+             f"CF-compliant time coordinate(s) found: {_format_coord_list(time_coords)}",
+         )
+     elif require_time_coord:
+         report.add(
+             SECTION_ID,
+             "Time coordinate presence",
+             "FAIL",
+             "Dataset is missing a CF-compliant time coordinate (requires `standard_name=time`, `axis=T`, or a 'time' coordinate).",
+         )
+
+     lat_coords = _find_coordinates(ds, _COORD_RULES["lat"])
+     lon_coords = _find_coordinates(ds, _COORD_RULES["lon"])
+     x_coords = _find_coordinates(ds, _COORD_RULES["x"])
+     y_coords = _find_coordinates(ds, _COORD_RULES["y"])
+
+     geographic_ok = bool(lat_coords and lon_coords)
+     projected_ok = bool(x_coords and y_coords)
+
+     if geographic_ok:
+         report.add(
+             SECTION_ID,
+             "Latitude/longitude coordinates",
+             "PASS",
+             f"CF-compliant latitude ({_format_coord_list(lat_coords)}) and longitude ({_format_coord_list(lon_coords)}) coordinates detected.",
+         )
+     if projected_ok:
+         report.add(
+             SECTION_ID,
+             "Projected coordinates",
+             "PASS",
+             f"CF-compliant projected x ({_format_coord_list(x_coords)}) and y ({_format_coord_list(y_coords)}) coordinates detected.",
+         )
+
+     failures: List[str] = []
+     if require_latlon_coords and not geographic_ok:
+         failures.append(
+             "Latitude/longitude coordinates are required but no CF-compliant pair was found."
+         )
+     if require_projected_coords and not projected_ok:
+         failures.append(
+             "Projected x/y coordinates are required but no CF-compliant pair was found."
+         )
+
+     if not (geographic_ok or projected_ok):
+         missing_geo = []
+         if not lat_coords:
+             missing_geo.append("latitude")
+         if not lon_coords:
+             missing_geo.append("longitude")
+         missing_proj = []
+         if not x_coords:
+             missing_proj.append("projection_x_coordinate")
+         if not y_coords:
+             missing_proj.append("projection_y_coordinate")
+         detail = (
+             f"Latitude/longitude pair incomplete (missing CF-compliant {', '.join(missing_geo)} coordinate). "
+             f"Projected x/y pair incomplete (missing CF-compliant {', '.join(missing_proj)} coordinate)."
+         )
+         failures.append(
+             f"Dataset must include CF-compliant latitude/longitude or projected coordinates. {detail}"
+         )
+
+     if failures:
+         report.add(
+             SECTION_ID,
+             "Coordinate reference compliance",
+             "FAIL",
+             " ".join(failures),
+         )
+
+     return report
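As the comment on `_COORD_RULES` describes, a coordinate is accepted for a category when *any* rule matches, and a rule matches only when *all* of its attribute/value pairs do. A minimal usage sketch, assuming the wheel is installed (the toy dataset and its coordinate values are illustrative, not part of the package):

    import numpy as np
    import xarray as xr

    from mlcast_dataset_validator.checks.coords.names import check_coordinate_names

    ds = xr.Dataset(
        coords={
            # Matches the time category via the standard_name rule.
            "time": xr.DataArray(np.arange(3), dims="time",
                                 attrs={"standard_name": "time"}),
            # Matches the x category: axis=X AND a linear distance unit.
            "x": xr.DataArray(np.arange(4) * 500.0, dims="x",
                              attrs={"axis": "X", "units": "m"}),
            # No attributes: matches the y category via the name fallback alone.
            "y": xr.DataArray(np.arange(4) * 500.0, dims="y"),
        }
    )

    # A projected x/y pair is found, so the check passes; lat/lon absence
    # only becomes a FAIL if require_latlon_coords=True is passed.
    report = check_coordinate_names(ds, require_projected_coords=True)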
mlcast_dataset_validator/checks/coords/spatial.py
@@ -0,0 +1,100 @@
+ import xarray as xr
+
+ from ...specs.base import ValidationReport
+ from ...utils.logging_decorator import log_function_call
+ from . import SECTION_ID as PARENT_SECTION_ID
+
+ SECTION_ID = f"{PARENT_SECTION_ID}.2"
+
+
+ @log_function_call
+ def check_spatial_requirements(
+     ds: xr.Dataset,
+     max_resolution_km: float,
+     min_crop_size: tuple[int, int],
+     require_constant_domain: bool,
+ ) -> ValidationReport:
+     """
+     Validate spatial requirements for the dataset.
+
+     Parameters:
+         ds (xr.Dataset): The dataset to validate.
+         max_resolution_km (float): Maximum allowed spatial resolution in kilometers.
+         min_crop_size (tuple[int, int]): Minimum crop size as (height, width) in pixels.
+         require_constant_domain (bool): Whether the spatial domain must remain constant
+             across timesteps. Accepted by the signature but not yet enforced by this check.
+
+     Returns:
+         ValidationReport: A report containing the results of the spatial validation checks.
+     """
+     report = ValidationReport()
+
+     resolution_check = f"Spatial resolution ≤{max_resolution_km:g}km"
+     crop_check = f"{min_crop_size[0]}×{min_crop_size[1]} pixel support"
+
+     # Validate spatial resolution. Projected x/y coordinates are assumed to be
+     # in metres, so the kilometre threshold is converted before comparing.
+     if "x" in ds.coords and "y" in ds.coords:
+         try:
+             x_vals = ds.x.values
+             y_vals = ds.y.values
+             if len(x_vals) > 1 and len(y_vals) > 1:
+                 x_res = abs(float(x_vals[1] - x_vals[0]))
+                 y_res = abs(float(y_vals[1] - y_vals[0]))
+                 if (
+                     x_res <= max_resolution_km * 1000
+                     and y_res <= max_resolution_km * 1000
+                 ):
+                     report.add(
+                         SECTION_ID,
+                         resolution_check,
+                         "PASS",
+                         f"Resolution ({x_res:.1f}m × {y_res:.1f}m) ≤ {max_resolution_km}km",
+                     )
+                 else:
+                     report.add(
+                         SECTION_ID,
+                         resolution_check,
+                         "FAIL",
+                         f"Resolution ({x_res:.1f}m × {y_res:.1f}m) exceeds {max_resolution_km}km limit",
+                     )
+         except Exception as e:
+             report.add(
+                 SECTION_ID,
+                 resolution_check,
+                 "WARNING",
+                 f"Could not verify spatial resolution: {e}",
+             )
+     else:
+         report.add(
+             SECTION_ID,
+             resolution_check,
+             "WARNING",
+             "No projected x/y coordinates found; spatial resolution was not verified.",
+         )
+
+     # Find all grid_mapping variables (CRS containers), since those carry no
+     # spatial data of their own and should not be checked.
+     grid_mapping_vars = set()
+     for var in ds.data_vars:
+         if "grid_mapping" in ds[var].attrs:
+             grid_mapping_vars.add(ds[var].attrs["grid_mapping"])
+     data_vars = set(ds.data_vars) - grid_mapping_vars
+
+     # Validate spatial coverage
+     for data_var in data_vars:
+         data_array = ds[data_var]
+         dims = data_array.dims
+         spatial_dims = [d for d in dims if d not in ["time", "t"]]
+         if len(spatial_dims) < 2:
+             report.add(
+                 SECTION_ID,
+                 "Spatial dimension check",
+                 "FAIL",
+                 f"Need at least 2 spatial dimensions for {data_var} ({dims})",
+             )
+             continue
+         # Compare the trailing two spatial dimensions against the requested
+         # (height, width) crop; trailing dims are conventionally (y, x).
+         spatial_sizes = [data_array.sizes[d] for d in spatial_dims]
+         crop_ok = all(
+             size >= required
+             for size, required in zip(spatial_sizes[-2:], min_crop_size)
+         )
+         if crop_ok:
+             report.add(
+                 SECTION_ID,
+                 crop_check,
+                 "PASS",
+                 f"Spatial dimensions {spatial_sizes} support {min_crop_size[0]}×{min_crop_size[1]} crops",
+             )
+         else:
+             report.add(
+                 SECTION_ID,
+                 crop_check,
+                 "FAIL",
+                 f"Spatial dimensions {spatial_sizes} too small for {min_crop_size[0]}×{min_crop_size[1]} crops",
+             )
+
+     return report
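A minimal sketch of the spatial check in use, assuming the wheel is installed (the 500 m grid and the variable name `precip` are illustrative, not part of the package):

    import numpy as np
    import xarray as xr

    from mlcast_dataset_validator.checks.coords.spatial import check_spatial_requirements

    ny, nx = 512, 512
    ds = xr.Dataset(
        data_vars={"precip": (("time", "y", "x"),
                              np.zeros((2, ny, nx), dtype="float32"))},
        coords={
            "time": np.arange(2),
            "y": np.arange(ny) * 500.0,  # 500 m spacing, assumed to be metres
            "x": np.arange(nx) * 500.0,
        },
    )

    # 500 m spacing is within the 1 km limit, and a 512×512 grid comfortably
    # supports 256×256 crops; require_constant_domain is accepted by the
    # signature but, per the code above, not yet enforced in this version.
    report = check_spatial_requirements(
        ds,
        max_resolution_km=1.0,
        min_crop_size=(256, 256),
        require_constant_domain=True,
    )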