rucola 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rucola/__init__.py +933 -0
- rucola/_algorithms.py +316 -0
- rucola/_homogeneity.py +827 -0
- rucola/_normalization.py +621 -0
- rucola/_preprocessing.py +90 -0
- rucola/_results.py +495 -0
- rucola-0.1.0.dist-info/METADATA +231 -0
- rucola-0.1.0.dist-info/RECORD +10 -0
- rucola-0.1.0.dist-info/WHEEL +4 -0
- rucola-0.1.0.dist-info/licenses/LICENSE.md +21 -0
rucola/__init__.py
ADDED
|
@@ -0,0 +1,933 @@
|
|
|
1
|
+
"""Homogenization toolbox for climate station data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Self
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
|
+
from rucola._algorithms import (
|
|
15
|
+
_VALID_GROUPS,
|
|
16
|
+
CorrectionMode,
|
|
17
|
+
GroupLabel,
|
|
18
|
+
NeighborInfo,
|
|
19
|
+
apply_correction,
|
|
20
|
+
build_correlation_cache,
|
|
21
|
+
build_distance_cache,
|
|
22
|
+
build_reference_series,
|
|
23
|
+
compute_correction_factor,
|
|
24
|
+
compute_q_series,
|
|
25
|
+
select_neighbors,
|
|
26
|
+
)
|
|
27
|
+
from rucola._homogeneity import (
|
|
28
|
+
BuishandTest, # noqa: F401
|
|
29
|
+
EasterlingPetersonTest, # noqa: F401
|
|
30
|
+
HomogenizationTest,
|
|
31
|
+
PettittTest, # noqa: F401
|
|
32
|
+
SNHTTest,
|
|
33
|
+
StarsTest, # noqa: F401
|
|
34
|
+
TestResult,
|
|
35
|
+
WorsleyTest, # noqa: F401
|
|
36
|
+
)
|
|
37
|
+
from rucola._normalization import ( # noqa: F401, TC001
|
|
38
|
+
BreakInfo,
|
|
39
|
+
BreakPredicate,
|
|
40
|
+
ConsensusRule,
|
|
41
|
+
MagnitudeAbove,
|
|
42
|
+
NeighborCountAbove,
|
|
43
|
+
NormalizationConfig,
|
|
44
|
+
NSignificantAbove,
|
|
45
|
+
SignalAbove,
|
|
46
|
+
StationIn,
|
|
47
|
+
StepIn,
|
|
48
|
+
TestSignificant,
|
|
49
|
+
YearBetween,
|
|
50
|
+
)
|
|
51
|
+
from rucola._results import (
|
|
52
|
+
CorrectionRecord,
|
|
53
|
+
DetectionRecord,
|
|
54
|
+
DetectionResult,
|
|
55
|
+
HomogenizationResult, # noqa: F401
|
|
56
|
+
StationDetection,
|
|
57
|
+
StationResult, # noqa: F401
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Maximum number of offending station IDs quoted in a validation error message;
# longer lists are truncated and suffixed with " ...".
_STATION_ID_SAMPLE_SIZE = 5
# Coordinate bounds in degrees. NOTE(review): presumably used to validate station
# latitude/longitude; the check that consumes them is not visible here — confirm.
_LAT_MIN, _LAT_MAX = -90.0, 90.0
_LON_MIN, _LON_MAX = -180.0, 180.0
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Internal state tracking across the 6-step procedure
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
|
|
70
|
+
class _StationState:
|
|
71
|
+
station_id: str
|
|
72
|
+
annual_original: pl.Series
|
|
73
|
+
annual_current: pl.Series # updated after each correction
|
|
74
|
+
years: pl.Series
|
|
75
|
+
_group: GroupLabel = field(default="", init=False, repr=False)
|
|
76
|
+
corrections: list[CorrectionRecord] = field(default_factory=list)
|
|
77
|
+
detections_by_step: dict[int, DetectionRecord] = field(default_factory=dict)
|
|
78
|
+
neighbors_by_step: dict[int, list[NeighborInfo]] = field(default_factory=dict)
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def group(self) -> GroupLabel:
|
|
82
|
+
return self._group
|
|
83
|
+
|
|
84
|
+
@group.setter
|
|
85
|
+
def group(self, value: GroupLabel) -> None:
|
|
86
|
+
if value not in _VALID_GROUPS:
|
|
87
|
+
msg = f"Invalid group {value!r}. Must be one of {sorted(_VALID_GROUPS - {''})}"
|
|
88
|
+
raise ValueError(msg)
|
|
89
|
+
self._group = value
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# Run configuration
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class RunConfig:
    """Configuration for the González-Rouco six-step detection procedure.

    Parameters
    ----------
    tests :
        Homogeneity tests to run at each step. ``None`` or an empty list falls
        back to ``[SNHTTest()]``. Pass multiple tests for consensus detection.
    mode :
        ``"ratio"`` (multiplicative, precipitation) or ``"difference"``
        (additive, temperature). Default: ``"ratio"``.
    run_consensus :
        How many tests must agree to classify a station as inhomogeneous
        during the six-step procedure. ``"majority"`` (default, more than half
        of the tests), ``"any"`` (at least one test, most sensitive), or
        ``"unanimous"`` (all tests).
        ``"strongest_signal"`` is accepted but treated as ``"any"`` at
        detection time; its special tiebreak behaviour only applies in
        :class:`NormalizationConfig`.
    min_series_years :
        Minimum non-null annual values required to process a station (default: 20).
        ``Rucola.run`` rejects values below ``2 * min_years_from_end + 1``, where
        ``min_years_from_end`` is the largest value among the configured tests.
    max_gap_years :
        Stations with a consecutive null gap exceeding this are excluded.
        ``None`` (default) disables the gap check.
    max_neighbors :
        Maximum number of reference stations (default: 10).
    min_correlation :
        Minimum Pearson correlation to include a neighbor (default: 0.5).
    max_distance_km :
        Search radius for neighbors in km. ``None`` (default) disables the
        filter. Setting it requires station coordinates; ``Rucola.run`` raises
        ``ValueError`` when the stations table has no ``latitude`` column.
    station_ids :
        Restrict the run to this subset of station IDs. ``None`` (default)
        keeps every station that has data.
    progress :
        When ``True``, ``Rucola.run`` shows a ``tqdm`` progress bar that
        advances once per step. If ``tqdm`` cannot be imported a warning is
        issued and the run continues without a bar (default: ``False``).
    on_step :
        Optional callback invoked as ``on_step(step, description)`` every time
        the run advances. ``step`` runs from 2 to 6 (the step about to start)
        and is 7 with description ``"done"`` once the procedure has finished.

    """

    tests: list[HomogenizationTest] | None = None
    mode: CorrectionMode = "ratio"
    run_consensus: ConsensusRule = "majority"
    min_series_years: int = 20
    max_gap_years: int | None = None
    max_neighbors: int = 10
    min_correlation: float = 0.5
    max_distance_km: float | None = None
    station_ids: list[str] | None = None
    progress: bool = False
    on_step: Callable[[int, str], None] | None = None
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# Main class
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Rucola:
|
|
150
|
+
"""Homogenization toolbox for climate station data.
|
|
151
|
+
|
|
152
|
+
Implements the six-step quality control and homogenization procedure from
|
|
153
|
+
González-Rouco et al. (2001), with six pluggable breakpoint tests
|
|
154
|
+
(SNHT, Buishand, Pettitt, Worsley, Easterling–Peterson, STARS) and an
|
|
155
|
+
iteratively refined reference pool.
|
|
156
|
+
|
|
157
|
+
Each instance represents one parameter (e.g. precipitation_height).
|
|
158
|
+
Use the ``from_*`` class methods to load data from different sources.
|
|
159
|
+
|
|
160
|
+
Minimum required columns
|
|
161
|
+
------------------------
|
|
162
|
+
stations : station_id (str), latitude (float), longitude (float)
|
|
163
|
+
values : station_id (str), date (date/datetime), value (float),
|
|
164
|
+
parameter (str)
|
|
165
|
+
|
|
166
|
+
References
|
|
167
|
+
----------
|
|
168
|
+
González-Rouco et al. (2001), J. Climate 14(5):964–978.
|
|
169
|
+
https://doi.org/10.1175/1520-0442(2001)014<0964:QCAHOP>2.0.CO;2
|
|
170
|
+
Alexandersson (1986), Int. J. Climatol. 6(6):661–675.
|
|
171
|
+
Alexandersson & Moberg (1997), Int. J. Climatol. 17(1):25–34.
|
|
172
|
+
Hanssen-Bauer & Førland (1994), J. Climate 7(7):1001–1013.
|
|
173
|
+
|
|
174
|
+
"""
|
|
175
|
+
|
|
176
|
+
STATIONS_REQUIRED: frozenset[str] = frozenset({"station_id", "latitude", "longitude"})
|
|
177
|
+
VALUES_REQUIRED: frozenset[str] = frozenset({"station_id", "date", "value"})
|
|
178
|
+
|
|
179
|
+
def __init__(
    self,
    values: pl.DataFrame,
    stations: pl.DataFrame | None = None,
    parameter: str | None = None,
) -> None:
    """Validate pre-loaded DataFrames and store them. Prefer the ``from_*`` class methods.

    ``values`` is checked first (required columns, value dtype, date order,
    duplicate dates, single parameter). When ``stations`` is given it is
    validated as well; otherwise a minimal stations table holding only the
    sorted unique ``station_id`` values found in ``values`` is created.
    ``parameter`` falls back to ``"unspecified"`` when empty or ``None``.
    """
    self._check_columns(values, self.VALUES_REQUIRED, "values")
    # The remaining value checks all take just the frame; order is significant
    # because the first failing check determines the error the caller sees.
    for validate in (
        self._check_value_dtype,
        self._check_date_order,
        self._check_duplicate_dates,
        self._check_single_parameter,
    ):
        validate(values)

    if stations is None:
        # No metadata supplied: derive a bare station list from the values.
        unique_ids = values["station_id"].unique().sort().to_list()
        station_frame = pl.DataFrame({"station_id": unique_ids})
    else:
        self._check_columns(stations, self.STATIONS_REQUIRED, "stations")
        self._check_station_coverage(values, stations)
        self._check_stations(stations)
        station_frame = stations

    self.stations: pl.DataFrame = station_frame
    self.values = values
    self.parameter = parameter or "unspecified"
|
|
203
|
+
|
|
204
|
+
# ------------------------------------------------------------------
|
|
205
|
+
# Constructors
|
|
206
|
+
# ------------------------------------------------------------------
|
|
207
|
+
|
|
208
|
+
@classmethod
def from_duckdb(
    cls,
    path: str | Path,
    parameter: str | None = None,
    stations_table: str | None = "stations",
    values_table: str = "values",
) -> Self:
    """Build an instance from tables stored in a DuckDB file.

    The database is opened read-only and each requested table is read in full.

    Parameters
    ----------
    path :
        Location of the .duckdb file.
    parameter :
        Label for the parameter held by this instance (e.g.
        ``"precipitation_height"``). It is metadata only — the table must
        already be filtered to a single parameter.
    stations_table :
        Name of the stations table (default: ``"stations"``). Pass ``None``
        (or an empty string) to load no station metadata.
    values_table :
        Name of the values table (default: ``"values"``).

    Raises
    ------
    ImportError
        If the optional ``duckdb`` dependency is not installed.

    Notes
    -----
    Table names are interpolated into the SQL text as-is, so they must come
    from trusted code, not from end-user input.

    """
    try:
        import duckdb  # noqa: PLC0415
    except ImportError as exc:
        msg = "DuckDB is required for from_duckdb(). Install it with: pip install rucola[duckdb]"
        raise ImportError(msg) from exc

    stations = None
    with duckdb.connect(str(path), read_only=True) as con:
        if stations_table:
            stations = con.execute(f"SELECT * FROM {stations_table}").pl()  # noqa: S608
        values = con.execute(f"SELECT * FROM {values_table}").pl()  # noqa: S608
    return cls._cast(values, stations, parameter=parameter)
|
|
242
|
+
|
|
243
|
+
@classmethod
def from_csv(
    cls,
    values_path: str | Path,
    stations_path: str | Path | None = None,
    parameter: str | None = None,
) -> Self:
    """Build an instance from CSV files.

    Both files are read with date parsing enabled. The values file must
    already be restricted to a single parameter. ``stations_path`` may be
    omitted (or empty) to run without station metadata.
    """
    values = pl.read_csv(str(values_path), try_parse_dates=True)
    if stations_path:
        stations = pl.read_csv(str(stations_path), try_parse_dates=True)
    else:
        stations = None
    return cls._cast(values, stations, parameter=parameter)
|
|
258
|
+
|
|
259
|
+
@classmethod
def from_polars(
    cls,
    values: pl.DataFrame,
    stations: pl.DataFrame | None = None,
    parameter: str | None = None,
) -> Self:
    """Build an instance from in-memory Polars DataFrames.

    ``values`` must already be restricted to a single parameter.
    ``stations`` may be left out to run without station metadata.
    """
    # Same hand-off to ``_cast`` as the other ``from_*`` constructors.
    instance = cls._cast(values, stations, parameter=parameter)
    return instance
|
|
272
|
+
|
|
273
|
+
# ------------------------------------------------------------------
|
|
274
|
+
# Core 6-step procedure
|
|
275
|
+
# ------------------------------------------------------------------
|
|
276
|
+
|
|
277
|
+
def run(  # noqa: C901, PLR0912, PLR0915
    self,
    config: RunConfig | None = None,
) -> DetectionResult:
    """Run the González-Rouco (2001) six-step homogenization procedure.

    Overview
    --------
    Step 0 (preparation): check that ``values`` is at annual resolution and
    pivot it to one column per station, indexed by year. Stations with fewer
    than ``min_series_years`` non-null values, or with a consecutive null gap
    longer than ``max_gap_years``, are left out of testing and reported with
    group ``INSUFFICIENT_DATA``.

    Steps 1–6 iteratively refine the reference station pool and apply
    corrections. Each step tests a subset of candidate stations using a
    progressively more reliable set of reference stations:

    Step 1 – First assessment.  ALL vs ALL.
             → groups H1 (homogeneous) and I1 (inhomogeneous).
             Corrections applied to I1.

    Step 2 – Adjusted references.  ALL vs (H1 + corrected I1).
             → groups H2, I2.
             Corrections applied to I2.

    Step 3 – Test corrected I2 series.  corr(I2) vs (H2 + corr(I2)).
             → groups HC3 (corrected, now homogeneous), IC3 (still inhomogeneous).

    Step 4 – Only homogeneous references.  (H2 + corr(I2)) vs (H2 + HC3).
             → groups H4, HC4, I4, IC4.
             Corrections applied to I4.

    Step 5 – Last single-break corrections.  I4 vs (H4 + HC4).
             → groups HC5, IC5.
             A further correction is applied to the stations labelled IC5.

    Step 6 – Double-break correction.  (IC4 + IC5) vs (H4 + HC4 + HC5).
             Two-pass: first correct the later break on the post-first-break
             sub-series, then correct the earlier break on the full series.
             → groups HCC6 or ICC6.

    Stations that never obtained a neighbor at any step are reported with
    group ``UNTESTABLE``.

    Parameters
    ----------
    config :
        Run configuration. Defaults to ``RunConfig()`` if not provided.

    Returns
    -------
    DetectionResult
        Raw detection data for all stations. Call ``.normalize()`` to
        obtain a ``HomogenizationResult`` with corrections applied.

    Raises
    ------
    ValueError
        If ``max_distance_km`` is set but the stations table lacks
        latitude/longitude columns, or if ``min_series_years`` is below
        ``2 * min_years_from_end + 1``.

    """
    cfg = config or RunConfig()

    _pbar: Any = None
    if cfg.progress:
        try:
            from tqdm import tqdm  # noqa: PLC0415
        except ImportError:
            import warnings  # noqa: PLC0415

            warnings.warn("tqdm is not installed; install it with: pip install rucola[tqdm]", stacklevel=2)
        else:
            _pbar = tqdm(total=6, desc="step 1/6", unit="step", leave=True)

    # Everything below runs under try/finally so the progress bar is closed
    # even when validation or one of the steps raises.
    try:
        if cfg.max_distance_km is not None and not {"latitude", "longitude"} <= set(self.stations.columns):
            msg = "max_distance_km requires a stations DataFrame with latitude/longitude columns."
            raise ValueError(msg)

        # ── Resolve tests and derive min_years_from_end ───────────────────
        _tests = cfg.tests or [SNHTTest()]
        min_years_from_end = max(t.min_years_from_end for t in _tests)

        # ── Parameter consistency check ───────────────────────────────────
        min_detectable = 2 * min_years_from_end + 1
        if cfg.min_series_years < min_detectable:
            msg = (
                f"min_series_years={cfg.min_series_years} is below the minimum detectable series length "
                f"({min_detectable} = 2 * min_years_from_end + 1 = 2 * {min_years_from_end} + 1). "
                "No break could ever pass the edge-effect guard. "
                f"Set min_series_years >= {min_detectable} or lower min_years_from_end."
            )
            raise ValueError(msg)

        # ── Step 0: validate resolution and pivot to wide ────────────────
        self._check_annual_resolution(self.values)
        annual_wide_base = (
            self.values.with_columns(pl.col("date").dt.year().alias("year"))
            .select("station_id", "year", "value")
            .pivot(on="station_id", index="year", values="value", aggregate_function="first")
            .sort("year")
        )
        years = annual_wide_base["year"]

        # Keep the stations-table order, restricted to stations that have data.
        base_columns = set(annual_wide_base.columns)
        candidate_ids = [sid for sid in self.stations["station_id"].to_list() if sid in base_columns]
        if cfg.station_ids is not None:
            allowed = set(cfg.station_ids)
            candidate_ids = [s for s in candidate_ids if s in allowed]

        # ── Pre-filter: series length and gap checks ──────────────────────
        def _max_gap(sid: str) -> int:
            """Return the longest run of consecutive null years for *sid*."""
            best, cur = 0, 0
            for v in annual_wide_base[sid].to_list():
                cur = cur + 1 if v is None else 0
                best = max(best, cur)
            return best

        insufficient_ids: list[str] = []
        filtered: list[str] = []
        for sid in candidate_ids:
            col = annual_wide_base[sid]
            too_short = col.drop_nulls().len() < cfg.min_series_years
            gapped = cfg.max_gap_years is not None and _max_gap(sid) > cfg.max_gap_years
            if too_short or gapped:
                insufficient_ids.append(sid)
            else:
                filtered.append(sid)
        candidate_ids = filtered

        # Initialise per-station state
        states: dict[str, _StationState] = {
            sid: _StationState(
                station_id=sid,
                annual_original=annual_wide_base[sid],
                annual_current=annual_wide_base[sid],
                years=years,
            )
            for sid in candidate_ids
        }

        # Value that leaves a series unchanged in the configured mode.
        neutral = 1.0 if cfg.mode == "ratio" else 0.0

        # ── helpers ──────────────────────────────────────────────────────

        def build_wide() -> pl.DataFrame:
            """Assemble a year-indexed frame of every candidate's current series."""
            data: dict = {"year": years.to_list()}
            for sid in candidate_ids:
                data[sid] = states[sid].annual_current.to_list()
            return pl.DataFrame(data)

        _dist_cache = build_distance_cache(self.stations) if cfg.max_distance_km is not None else None

        def _advance(step: int, desc: str) -> None:
            """Move the progress bar on by one and notify the ``on_step`` callback."""
            if _pbar is not None:
                _pbar.set_description(desc)
                _pbar.update(1)
            if cfg.on_step is not None:
                cfg.on_step(step, desc)

        def _meets_consensus(tr_list: list[TestResult]) -> bool:
            """Tell whether enough tests flag inhomogeneity under ``run_consensus``."""
            sig = sum(1 for t, tr in zip(_tests, tr_list, strict=False) if t.is_inhomogeneous(tr))
            n = len(_tests)
            if cfg.run_consensus == "unanimous":
                return sig == n
            if cfg.run_consensus == "majority":
                return sig > n // 2
            # "any" / "strongest_signal" — at detection time both require at least one inhomogeneous test
            return sig >= 1

        def _resolve_break(tr_list: list[TestResult]) -> int | None:
            """Return the break year of the significant test with the strongest relative signal."""
            sig = [tr for t, tr in zip(_tests, tr_list, strict=False) if t.is_inhomogeneous(tr)]
            if not sig:
                return None
            return max(sig, key=lambda r: r.relative_signal).break_year

        def run_tests_for(
            test_ids: list[str],
            ref_ids: set[str],
            step: int,
        ) -> dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]]:
            """Run all tests for each station; store the neighbors used in state.

            Stations for which no neighbor is found are absent from the result.
            """
            wide = build_wide()
            wide_columns = set(wide.columns)
            _corr_cache = build_correlation_cache(wide)
            results: dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]] = {}
            for cid in test_ids:
                if cid not in wide_columns:
                    continue
                nbrs = select_neighbors(
                    cid,
                    self.stations,
                    wide,
                    max_neighbors=cfg.max_neighbors,
                    min_correlation=cfg.min_correlation,
                    max_distance_km=cfg.max_distance_km,
                    allowed_ids=ref_ids,
                    dist_cache=_dist_cache,
                    corr_cache=_corr_cache,
                )
                if not nbrs:
                    continue
                ref = build_reference_series(wide, nbrs, cfg.mode)
                q = compute_q_series(wide[cid], ref, cfg.mode)
                tr_list = [t.detect(q, years) for t in _tests]
                states[cid].neighbors_by_step[step] = nbrs
                results[cid] = (tr_list, nbrs, q)
            return results

        def classify(
            ids: list[str] | set[str],
            results: dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]],
            inhomogeneous_label: GroupLabel,
            homogeneous_label: GroupLabel,
        ) -> tuple[set[str], set[str]]:
            """Label every station in *ids* and return ``(homogeneous, inhomogeneous)`` id sets.

            A station without a test result counts as homogeneous.
            """
            homogeneous: set[str] = set()
            inhomogeneous: set[str] = set()
            for sid in ids:
                if sid in results and _meets_consensus(results[sid][0]):
                    inhomogeneous.add(sid)
                    states[sid].group = inhomogeneous_label
                else:
                    homogeneous.add(sid)
                    states[sid].group = homogeneous_label
            return homogeneous, inhomogeneous

        def correct_stations(
            test_results: dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]],
            ids_to_correct: set[str],
            step: int,
        ) -> None:
            """Apply a correction where consensus is met; always store a DetectionRecord."""
            for cid in ids_to_correct:
                if cid not in test_results:
                    continue
                tr_list, _, q = test_results[cid]
                is_inh = _meets_consensus(tr_list)
                break_year = _resolve_break(tr_list)
                f = compute_correction_factor(q, years, break_year, cfg.mode) if is_inh else neutral
                if is_inh:
                    states[cid].annual_current = apply_correction(
                        states[cid].annual_current, years, break_year, f, cfg.mode
                    )
                    states[cid].corrections.append(CorrectionRecord(step=step, break_year=break_year, factor=f))
                states[cid].detections_by_step[step] = DetectionRecord(
                    step=step,
                    break_year=break_year,
                    factor=f,
                    test_results=tr_list,
                    was_applied=is_inh,
                )

        # ── Step 1 ────────────────────────────────────────────────────────
        all_ids = set(candidate_ids)
        r1 = run_tests_for(candidate_ids, all_ids, step=1)
        h1, i1 = classify(candidate_ids, r1, "I1", "H1")

        correct_stations(r1, i1, step=1)
        _advance(2, f"step 2/6 — {len(candidate_ids)} stations vs corrected pool")

        # ── Step 2 ────────────────────────────────────────────────────────
        ref2 = h1 | i1  # i1 now corrected in states
        r2 = run_tests_for(candidate_ids, ref2, step=2)
        h2, i2 = classify(candidate_ids, r2, "I2", "H2")

        correct_stations(r2, i2, step=2)
        _advance(3, f"step 3/6 — {len(i2)} corrected stations re-tested")

        # ── Step 3 ────────────────────────────────────────────────────────
        corrected_i2 = i2
        ref3 = h2 | corrected_i2
        r3 = run_tests_for(list(corrected_i2), ref3, step=3)
        # Only the homogeneous half (HC3) feeds the step-4 reference pool.
        hc3, _ = classify(corrected_i2, r3, "IC3", "HC3")

        # No corrections in step 3 (classification only, corrections already done in step 2)
        _advance(4, f"step 4/6 — {len(h2) + len(corrected_i2)} stations vs homogeneous references")

        # ── Step 4 ────────────────────────────────────────────────────────
        test4 = list(h2 | corrected_i2)
        ref4 = h2 | hc3
        r4 = run_tests_for(test4, ref4, step=4)
        h4, i4 = classify(h2, r4, "I4", "H4")
        hc4, ic4 = classify(corrected_i2, r4, "IC4", "HC4")

        correct_stations(r4, i4, step=4)
        _advance(5, f"step 5/6 — {len(i4)} remaining inhomogeneous stations")

        # ── Step 5 ────────────────────────────────────────────────────────
        ref5 = h4 | hc4
        r5 = run_tests_for(list(i4), ref5, step=5)
        hc5, ic5 = classify(i4, r5, "IC5", "HC5")

        # correct_stations re-checks consensus per station, so passing all of i4
        # corrects only the stations just labelled IC5; HC5 stations merely get
        # a DetectionRecord with was_applied=False.
        correct_stations(r5, i4, step=5)
        _advance(6, f"step 6/6 — double-break correction for {len(ic4 | ic5)} stations")

        # ── Step 6 ────────────────────────────────────────────────────────
        # For IC4 and IC5: two-break correction.
        # Pass 6a: omit data before first known break, test post-break sub-series.
        # Pass 6b: with the second break corrected, test full series for first break.
        # NOTE(review): this cache is built once, before any 6a correction, yet a
        # 6a correction changes the candidate's own series; the correlations used
        # for neighbor selection in 6b may therefore be slightly stale. Confirm
        # whether that is intended before changing it.
        _corr_cache_6b = build_correlation_cache(build_wide())

        double_break_ids = ic4 | ic5
        ref6 = h4 | hc4 | hc5

        years_list = years.to_list()

        for cid in double_break_ids:
            state = states[cid]
            if not state.corrections:
                continue
            first_break = state.corrections[0].break_year

            # find the index in years where the post-first-break segment starts
            start_idx = next((i for i, y in enumerate(years_list) if y >= first_break), None)
            if start_idx is None or (len(years_list) - start_idx) < 2 * min_years_from_end:
                continue

            # --- 6a: test only the post-first-break portion ---
            # Every candidate has a column in the wide frame, so the partial
            # frame is built straight from the per-station state.
            partial_wide = pl.DataFrame(
                {"year": years_list[start_idx:]}
                | {sid: states[sid].annual_current[start_idx:].to_list() for sid in candidate_ids},
            )
            nbrs6a = select_neighbors(
                cid,
                self.stations,
                partial_wide,
                max_neighbors=cfg.max_neighbors,
                min_correlation=cfg.min_correlation,
                max_distance_km=cfg.max_distance_km,
                allowed_ids=ref6,
                dist_cache=_dist_cache,
                corr_cache=build_correlation_cache(partial_wide),
            )
            if nbrs6a:
                ref_6a = build_reference_series(partial_wide, nbrs6a, cfg.mode)
                q_6a = compute_q_series(partial_wide[cid], ref_6a, cfg.mode)
                tr_6a = [t.detect(q_6a, years[start_idx:]) for t in _tests]
                states[cid].neighbors_by_step[61] = nbrs6a
                inh_6a = _meets_consensus(tr_6a)
                br_6a = _resolve_break(tr_6a)
                f_6a = compute_correction_factor(q_6a, years[start_idx:], br_6a, cfg.mode) if inh_6a else neutral
                if inh_6a:
                    states[cid].annual_current = apply_correction(
                        states[cid].annual_current,
                        years,
                        br_6a,
                        f_6a,
                        cfg.mode,
                    )
                    states[cid].corrections.append(CorrectionRecord(step=6, break_year=br_6a, factor=f_6a))
                states[cid].detections_by_step[61] = DetectionRecord(
                    step=61,
                    break_year=br_6a,
                    factor=f_6a,
                    test_results=tr_6a,
                    was_applied=inh_6a,
                )

            # --- 6b: test full series to correct the first break ---
            wide6b = build_wide()
            nbrs6b = select_neighbors(
                cid,
                self.stations,
                wide6b,
                max_neighbors=cfg.max_neighbors,
                min_correlation=cfg.min_correlation,
                max_distance_km=cfg.max_distance_km,
                allowed_ids=ref6,
                dist_cache=_dist_cache,
                corr_cache=_corr_cache_6b,
            )
            if not nbrs6b:
                states[cid].group = "ICC6"
                continue

            ref_6b = build_reference_series(wide6b, nbrs6b, cfg.mode)
            q_6b = compute_q_series(wide6b[cid], ref_6b, cfg.mode)
            tr_6b = [t.detect(q_6b, years) for t in _tests]
            states[cid].neighbors_by_step[62] = nbrs6b
            inh_6b = _meets_consensus(tr_6b)
            br_6b = _resolve_break(tr_6b)
            f_6b = compute_correction_factor(q_6b, years, br_6b, cfg.mode) if inh_6b else neutral
            states[cid].detections_by_step[62] = DetectionRecord(
                step=62,
                break_year=br_6b,
                factor=f_6b,
                test_results=tr_6b,
                was_applied=inh_6b,
            )
            if inh_6b:
                states[cid].annual_current = apply_correction(
                    states[cid].annual_current,
                    years,
                    br_6b,
                    f_6b,
                    cfg.mode,
                )
                states[cid].corrections.append(CorrectionRecord(step=6, break_year=br_6b, factor=f_6b))

            # --- 6c: re-test the doubly-corrected series — González-Rouco (2001) §3.b.
            # HCC6 means "homogeneous after two corrections"; ICC6 means a residual
            # inhomogeneity (e.g. a third break) survives. Without this re-test every
            # IC4/IC5 station would silently become HCC6.
            wide6c = build_wide()
            ref_6c = build_reference_series(wide6c, nbrs6b, cfg.mode)
            q_6c = compute_q_series(wide6c[cid], ref_6c, cfg.mode)
            tr_6c = [t.detect(q_6c, years) for t in _tests]
            still_inh = _meets_consensus(tr_6c)
            states[cid].detections_by_step[63] = DetectionRecord(
                step=63,
                break_year=_resolve_break(tr_6c),
                factor=neutral,
                test_results=tr_6c,
                was_applied=False,
            )
            states[cid].group = "ICC6" if still_inh else "HCC6"

        # mark any double-break stations we couldn't fully correct
        for cid in double_break_ids:
            if states[cid].group in ("IC4", "IC5"):
                states[cid].group = "ICC6"

        _advance(7, "done")

        # ── Build DetectionResult ─────────────────────────────────────────
        # Stations that never found a neighbor at any step → UNTESTABLE.
        # Use neighbors_by_step (set for all tested stations) rather than
        # detections_by_step (only set for stations passed to a correction
        # pass), so that stations tested and found homogeneous keep their
        # H* group.
        for sid in candidate_ids:
            if not states[sid].neighbors_by_step:
                states[sid].group = "UNTESTABLE"

        station_detections: dict[str, StationDetection] = {
            sid: StationDetection(
                station_id=sid,
                group=states[sid].group,
                annual_original=states[sid].annual_original,
                annual_corrected=states[sid].annual_current,
                years=years,
                detections_by_step=states[sid].detections_by_step,
                neighbors_by_step=states[sid].neighbors_by_step,
                corrections=states[sid].corrections,
            )
            for sid in candidate_ids
        }

        # Add pre-filtered stations with INSUFFICIENT_DATA label
        for sid in insufficient_ids:
            annual = annual_wide_base[sid]
            station_detections[sid] = StationDetection(
                station_id=sid,
                group="INSUFFICIENT_DATA",
                annual_original=annual,
                annual_corrected=annual,
                years=years,
            )

        return DetectionResult(
            station_detections=station_detections,
            parameter=self.parameter,
            mode=cfg.mode,
        )
    finally:
        if _pbar is not None:
            _pbar.close()
|
|
769
|
+
|
|
770
|
+
# ------------------------------------------------------------------
|
|
771
|
+
# Properties
|
|
772
|
+
# ------------------------------------------------------------------
|
|
773
|
+
|
|
774
|
+
@property
def n_stations(self) -> int:
    """Return the length of the ``stations`` table held by this instance."""
    station_table = self.stations
    return len(station_table)
|
|
778
|
+
|
|
779
|
+
@property
def date_range(self) -> tuple[str, str]:
    """Return the minimum and maximum of ``values["date"]``, each converted with ``str``."""
    dates = self.values["date"]
    earliest = dates.min()
    latest = dates.max()
    return (str(earliest), str(latest))
|
|
784
|
+
|
|
785
|
+
def __repr__(self) -> str:
|
|
786
|
+
"""Return a short summary string."""
|
|
787
|
+
lo, hi = self.date_range
|
|
788
|
+
return (
|
|
789
|
+
f"Rucola(parameter={self.parameter!r}, "
|
|
790
|
+
f"stations={self.n_stations}, "
|
|
791
|
+
f"records={len(self.values):,}, "
|
|
792
|
+
f"period={lo} – {hi})" # noqa: RUF001
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
# ------------------------------------------------------------------
|
|
796
|
+
# Internal helpers
|
|
797
|
+
# ------------------------------------------------------------------
|
|
798
|
+
|
|
799
|
+
@staticmethod
def _check_date_order(values: pl.DataFrame) -> None:
    """Validate that, within each station, dates never decrease from row to row.

    Raises:
        ValueError: If at least one station has a row whose date differs from
            the running maximum of that station's dates. The message reports
            how many stations are affected and a sorted sample of their ids
            (at most ``_STATION_ID_SAMPLE_SIZE`` of them).
    """
    # A row is in order when its date equals the cumulative maximum so far.
    date = pl.col("date")
    in_order = (date == date.cum_max()).all().alias("sorted")
    per_station = values.group_by("station_id").agg(in_order)
    offenders = per_station.filter(~pl.col("sorted"))["station_id"].to_list()
    if not offenders:
        return

    count = len(offenders)
    shown = sorted(offenders)[:_STATION_ID_SAMPLE_SIZE]
    tail = " ..." if count > _STATION_ID_SAMPLE_SIZE else ""
    msg = f"{count} station(s) in `values` have dates not sorted ascending: {shown}{tail}"
    raise ValueError(msg)
|
|
814
|
+
|
|
815
|
+
@staticmethod
def _check_duplicate_dates(values: pl.DataFrame) -> None:
    """Validate that no (station_id, date) pair occurs more than once.

    Raises:
        ValueError: If any station has several rows for the same date. The
            message reports how many stations are affected and a sorted sample
            of their ids (at most ``_STATION_ID_SAMPLE_SIZE`` of them).
    """
    pair_counts = values.group_by("station_id", "date").len()
    repeated_pairs = pair_counts.filter(pl.col("len") > 1)
    # A station may repeat several dates; report each station only once.
    offenders = repeated_pairs["station_id"].unique().to_list()
    if not offenders:
        return

    count = len(offenders)
    shown = sorted(offenders)[:_STATION_ID_SAMPLE_SIZE]
    tail = " ..." if count > _STATION_ID_SAMPLE_SIZE else ""
    msg = f"{count} station(s) in `values` have duplicate dates: {shown}{tail}"
    raise ValueError(msg)
|
|
831
|
+
|
|
832
|
+
@staticmethod
def _check_single_parameter(values: pl.DataFrame) -> None:
    """Validate that the optional ``parameter`` column holds at most one distinct value.

    Frames without a ``parameter`` column pass unchecked; null entries are
    ignored when counting distinct values.

    Raises:
        ValueError: If more than one distinct non-null parameter is present.
    """
    if "parameter" not in values.columns:
        return

    distinct = values["parameter"].drop_nulls().unique().to_list()
    if len(distinct) <= 1:
        return

    msg = (
        f"`values` contains multiple parameters: {sorted(distinct)!r}. "
        "Filter to a single parameter before loading."
    )
    raise ValueError(msg)
|
|
844
|
+
|
|
845
|
+
@staticmethod
def _check_annual_resolution(values: pl.DataFrame) -> None:
    """Raise if values contains more than one record per station per year.

    Rows are grouped by ``station_id`` and the calendar year of ``date``; the
    largest group size decides the outcome. An empty ``values`` table has no
    station-years and therefore passes.

    Raises:
        ValueError: If any (station, year) group holds more than one row.
    """
    max_per: int | None = (
        values.with_columns(pl.col("date").dt.year().alias("_year"))
        .group_by("station_id", "_year")
        .len()
        .select(pl.col("len").max().cast(pl.Int64))
        .item()
    )
    # With zero groups the aggregated maximum is null and `.item()` yields
    # None; comparing None with an int would raise an unrelated TypeError.
    if max_per is not None and max_per > 1:
        msg = (
            f"values has sub-annual resolution ({max_per} records per station-year). "
            "Pre-aggregate to annual using rucola._preprocessing.compute_annual_totals "
            "or compute_annual_means before passing to Rucola."
        )
        raise ValueError(msg)
|
|
862
|
+
|
|
863
|
+
@staticmethod
|
|
864
|
+
def _check_value_dtype(values: pl.DataFrame) -> None:
|
|
865
|
+
"""Raise if the `value` column is not numeric."""
|
|
866
|
+
dtype = values["value"].dtype
|
|
867
|
+
if not dtype.is_numeric():
|
|
868
|
+
msg = f"`values.value` must be numeric, got {dtype}. Cast to Float64 before loading."
|
|
869
|
+
raise TypeError(msg)
|
|
870
|
+
|
|
871
|
+
@staticmethod
def _check_stations(stations: pl.DataFrame) -> None:
    """Validate the station table: unique ids, non-null and in-range coordinates.

    The checks run in a fixed order (duplicate ids, null coordinates,
    latitude bounds, longitude bounds) and the first one that fails raises.
    Bounds come from the module constants ``_LAT_MIN``/``_LAT_MAX`` and
    ``_LON_MIN``/``_LON_MAX``.

    Raises:
        ValueError: On the first failing check; the message lists the sorted
            offending station ids.
    """
    lat = pl.col("latitude")
    lon = pl.col("longitude")

    # 1. Every station_id may appear only once.
    id_counts = stations.group_by("station_id").len()
    repeated = id_counts.filter(pl.col("len") > 1)["station_id"].to_list()
    if repeated:
        msg = f"`stations` has duplicate station_id(s): {sorted(repeated)}"
        raise ValueError(msg)

    # 2. Both coordinates must be present.
    coord_is_null = lat.is_null() | lon.is_null()
    without_coords = stations.filter(coord_is_null)["station_id"].to_list()
    if without_coords:
        msg = f"`stations` has null latitude/longitude for station_id(s): {sorted(without_coords)}"
        raise ValueError(msg)

    # 3. Latitude must lie within the configured bounds.
    lat_out_of_range = (lat < _LAT_MIN) | (lat > _LAT_MAX)
    wrong_lat = stations.filter(lat_out_of_range)["station_id"].to_list()
    if wrong_lat:
        msg = f"`stations` has latitude outside [{_LAT_MIN}, {_LAT_MAX}] for station_id(s): {sorted(wrong_lat)}"
        raise ValueError(msg)

    # 4. Longitude must lie within the configured bounds.
    lon_out_of_range = (lon < _LON_MIN) | (lon > _LON_MAX)
    wrong_lon = stations.filter(lon_out_of_range)["station_id"].to_list()
    if wrong_lon:
        msg = f"`stations` has longitude outside [{_LON_MIN}, {_LON_MAX}] for station_id(s): {sorted(wrong_lon)}"
        raise ValueError(msg)
|
|
899
|
+
|
|
900
|
+
@staticmethod
|
|
901
|
+
def _check_columns(df: pl.DataFrame, required: frozenset[str], name: str) -> None:
|
|
902
|
+
missing = required - set(df.columns)
|
|
903
|
+
if missing:
|
|
904
|
+
msg = f"`{name}` is missing required column(s): {sorted(missing)}"
|
|
905
|
+
raise ValueError(msg)
|
|
906
|
+
|
|
907
|
+
@staticmethod
def _check_station_coverage(values: pl.DataFrame, stations: pl.DataFrame) -> None:
    """Validate that every ``station_id`` used in ``values`` also appears in ``stations``.

    Raises:
        ValueError: If some ids occur only in ``values``. The message reports
            how many and a sorted sample of them (at most
            ``_STATION_ID_SAMPLE_SIZE``).
    """
    ids_in_values = set(values["station_id"].unique().to_list())
    ids_in_stations = set(stations["station_id"].to_list())
    orphaned = ids_in_values.difference(ids_in_stations)
    if not orphaned:
        return

    count = len(orphaned)
    shown = sorted(orphaned)[:_STATION_ID_SAMPLE_SIZE]
    tail = " ..." if count > _STATION_ID_SAMPLE_SIZE else ""
    msg = f"{count} station_id(s) in `values` have no entry in `stations`: {shown}{tail}"
    raise ValueError(msg)
|
|
919
|
+
|
|
920
|
+
@classmethod
def _cast(
    cls,
    values: pl.DataFrame,
    stations: pl.DataFrame | None = None,
    parameter: str | None = None,
) -> Self:
    """Coerce key column types, then build an instance from the result.

    ``station_id`` is cast to ``pl.String`` in ``values`` and, when given, in
    ``stations``. A ``date`` column in ``values`` whose dtype is neither
    ``pl.Date`` nor ``pl.Datetime`` is cast to ``pl.Date``.

    Args:
        values: Observation table; must contain ``station_id``.
        stations: Optional station table; passed through as ``None`` if absent.
        parameter: Forwarded unchanged to the constructor.

    Returns:
        ``cls(values=..., stations=..., parameter=...)`` with the coerced frames.
    """
    id_as_text = pl.col("station_id").cast(pl.String)

    typed_values = values.with_columns(id_as_text)
    typed_stations = None if stations is None else stations.with_columns(id_as_text)

    if "date" in typed_values.columns:
        temporal_dtypes = (pl.Date, pl.Datetime)
        if typed_values["date"].dtype not in temporal_dtypes:
            typed_values = typed_values.with_columns(pl.col("date").cast(pl.Date))

    return cls(values=typed_values, stations=typed_stations, parameter=parameter)
|