rucola 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rucola/__init__.py ADDED
@@ -0,0 +1,933 @@
1
+ """Homogenization toolbox for climate station data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Self
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+ from pathlib import Path
11
+
12
+ import polars as pl
13
+
14
+ from rucola._algorithms import (
15
+ _VALID_GROUPS,
16
+ CorrectionMode,
17
+ GroupLabel,
18
+ NeighborInfo,
19
+ apply_correction,
20
+ build_correlation_cache,
21
+ build_distance_cache,
22
+ build_reference_series,
23
+ compute_correction_factor,
24
+ compute_q_series,
25
+ select_neighbors,
26
+ )
27
+ from rucola._homogeneity import (
28
+ BuishandTest, # noqa: F401
29
+ EasterlingPetersonTest, # noqa: F401
30
+ HomogenizationTest,
31
+ PettittTest, # noqa: F401
32
+ SNHTTest,
33
+ StarsTest, # noqa: F401
34
+ TestResult,
35
+ WorsleyTest, # noqa: F401
36
+ )
37
+ from rucola._normalization import ( # noqa: F401, TC001
38
+ BreakInfo,
39
+ BreakPredicate,
40
+ ConsensusRule,
41
+ MagnitudeAbove,
42
+ NeighborCountAbove,
43
+ NormalizationConfig,
44
+ NSignificantAbove,
45
+ SignalAbove,
46
+ StationIn,
47
+ StepIn,
48
+ TestSignificant,
49
+ YearBetween,
50
+ )
51
+ from rucola._results import (
52
+ CorrectionRecord,
53
+ DetectionRecord,
54
+ DetectionResult,
55
+ HomogenizationResult, # noqa: F401
56
+ StationDetection,
57
+ StationResult, # noqa: F401
58
+ )
59
+
60
# Maximum number of offending station IDs quoted in a validation error message;
# longer lists are truncated and suffixed with " ...".
_STATION_ID_SAMPLE_SIZE = 5
# Inclusive range accepted for the `latitude` column by `_check_stations`.
_LAT_MIN, _LAT_MAX = -90.0, 90.0
# Inclusive range accepted for the `longitude` column by `_check_stations`.
_LON_MIN, _LON_MAX = -180.0, 180.0
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Internal state tracking across the 6-step procedure
66
+ # ---------------------------------------------------------------------------
67
+
68
+
69
+ @dataclass
70
+ class _StationState:
71
+ station_id: str
72
+ annual_original: pl.Series
73
+ annual_current: pl.Series # updated after each correction
74
+ years: pl.Series
75
+ _group: GroupLabel = field(default="", init=False, repr=False)
76
+ corrections: list[CorrectionRecord] = field(default_factory=list)
77
+ detections_by_step: dict[int, DetectionRecord] = field(default_factory=dict)
78
+ neighbors_by_step: dict[int, list[NeighborInfo]] = field(default_factory=dict)
79
+
80
+ @property
81
+ def group(self) -> GroupLabel:
82
+ return self._group
83
+
84
+ @group.setter
85
+ def group(self, value: GroupLabel) -> None:
86
+ if value not in _VALID_GROUPS:
87
+ msg = f"Invalid group {value!r}. Must be one of {sorted(_VALID_GROUPS - {''})}"
88
+ raise ValueError(msg)
89
+ self._group = value
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Run configuration
94
+ # ---------------------------------------------------------------------------
95
+
96
+
97
@dataclass
class RunConfig:
    """Configuration for the González-Rouco six-step detection procedure.

    Parameters
    ----------
    tests :
        Homogeneity tests to run at each step. ``None`` (the default) makes
        ``Rucola.run`` fall back to ``[SNHTTest()]``.
        Pass multiple tests for consensus detection.
    mode :
        ``"ratio"`` (multiplicative, precipitation) or ``"difference"``
        (additive, temperature).
    run_consensus :
        How many tests must agree to classify a station as inhomogeneous
        during the six-step procedure. ``"majority"`` (default),
        ``"any"`` (most sensitive), or ``"unanimous"``.
        ``"strongest_signal"`` is accepted but treated as ``"any"`` at
        detection time; its special tiebreak behaviour only applies in
        :class:`NormalizationConfig`.
    min_series_years :
        Minimum non-null annual values required to process a station (default: 20).
    max_gap_years :
        Stations with a consecutive null gap exceeding this are excluded.
        ``None`` (the default) disables the gap filter.
    max_neighbors :
        Maximum number of reference stations (default: 10).
    min_correlation :
        Minimum Pearson correlation to include a neighbor (default: 0.5).
    max_distance_km :
        Search radius for neighbors in km. ``None`` disables the filter.
    station_ids :
        Restrict the run to this subset of station IDs.
    progress :
        If true, ``Rucola.run`` shows a ``tqdm`` progress bar; when ``tqdm``
        cannot be imported a warning is emitted instead (default: ``False``).
    on_step :
        Optional callback invoked as ``on_step(step, description)`` each time
        ``Rucola.run`` advances to the next step.

    """

    tests: list[HomogenizationTest] | None = None
    mode: CorrectionMode = "ratio"
    run_consensus: ConsensusRule = "majority"
    min_series_years: int = 20
    max_gap_years: int | None = None
    max_neighbors: int = 10
    min_correlation: float = 0.5
    max_distance_km: float | None = None
    station_ids: list[str] | None = None
    progress: bool = False
    on_step: Callable[[int, str], None] | None = None
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # Main class
146
+ # ---------------------------------------------------------------------------
147
+
148
+
149
+ class Rucola:
150
+ """Homogenization toolbox for climate station data.
151
+
152
+ Implements the six-step quality control and homogenization procedure from
153
+ González-Rouco et al. (2001), with six pluggable breakpoint tests
154
+ (SNHT, Buishand, Pettitt, Worsley, Easterling–Peterson, STARS) and an
155
+ iteratively refined reference pool.
156
+
157
+ Each instance represents one parameter (e.g. precipitation_height).
158
+ Use the ``from_*`` class methods to load data from different sources.
159
+
160
+ Minimum required columns
161
+ ------------------------
162
+ stations : station_id (str), latitude (float), longitude (float)
163
+ values : station_id (str), date (date/datetime), value (float);
164
+ an optional parameter (str) column must hold a single value
165
+
166
+ References
167
+ ----------
168
+ González-Rouco et al. (2001), J. Climate 14(5):964–978.
169
+ https://doi.org/10.1175/1520-0442(2001)014<0964:QCAHOP>2.0.CO;2
170
+ Alexandersson (1986), Int. J. Climatol. 6(6):661–675.
171
+ Alexandersson & Moberg (1997), Int. J. Climatol. 17(1):25–34.
172
+ Hanssen-Bauer & Førland (1994), J. Climate 7(7):1001–1013.
173
+
174
+ """
175
+
176
+ STATIONS_REQUIRED: frozenset[str] = frozenset({"station_id", "latitude", "longitude"})
177
+ VALUES_REQUIRED: frozenset[str] = frozenset({"station_id", "date", "value"})
178
+
179
def __init__(
    self,
    values: pl.DataFrame,
    stations: pl.DataFrame | None = None,
    parameter: str | None = None,
) -> None:
    """Validate pre-loaded DataFrames and store them. Prefer the ``from_*`` class methods.

    ``values`` is checked for required columns, a numeric ``value`` column,
    per-station date ordering, duplicate dates and a single parameter.
    When ``stations`` is omitted, a one-column station table is derived from
    the sorted unique ``station_id`` values. A falsy ``parameter`` is stored
    as ``"unspecified"``.
    """
    self._check_columns(values, self.VALUES_REQUIRED, "values")
    for validate in (
        self._check_value_dtype,
        self._check_date_order,
        self._check_duplicate_dates,
        self._check_single_parameter,
    ):
        validate(values)

    if stations is None:
        # No metadata supplied: fall back to a bare table of the station IDs seen in `values`.
        known_ids = values["station_id"].unique().sort().to_list()
        self.stations: pl.DataFrame = pl.DataFrame({"station_id": known_ids})
    else:
        self._check_columns(stations, self.STATIONS_REQUIRED, "stations")
        self._check_station_coverage(values, stations)
        self._check_stations(stations)
        self.stations = stations

    self.values = values
    self.parameter = parameter if parameter else "unspecified"
203
+
204
+ # ------------------------------------------------------------------
205
+ # Constructors
206
+ # ------------------------------------------------------------------
207
+
208
@classmethod
def from_duckdb(
    cls,
    path: str | Path,
    parameter: str | None = None,
    stations_table: str | None = "stations",
    values_table: str = "values",
) -> Self:
    """Build an instance from tables stored in a DuckDB file.

    The database is opened read-only. Table names are interpolated directly
    into the SQL text, so they must be trusted identifiers.

    Parameters
    ----------
    path :
        Path to the .duckdb file.
    parameter :
        Label for the parameter stored in this instance (e.g.
        ``"precipitation_height"``). Pure metadata — filter the table to a
        single parameter before calling this method.
    stations_table :
        Name of the stations table (default: ``"stations"``). Pass ``None``
        to run without station metadata.
    values_table :
        Name of the values table (default: ``"values"``).

    Raises
    ------
    ImportError
        If the optional ``duckdb`` dependency is not installed.

    """
    try:
        import duckdb  # noqa: PLC0415
    except ImportError as exc:
        msg = "DuckDB is required for from_duckdb(). Install it with: pip install rucola[duckdb]"
        raise ImportError(msg) from exc

    station_frame = None
    with duckdb.connect(str(path), read_only=True) as connection:
        if stations_table:
            station_frame = connection.execute(f"SELECT * FROM {stations_table}").pl()  # noqa: S608
        value_frame = connection.execute(f"SELECT * FROM {values_table}").pl()  # noqa: S608
    return cls._cast(value_frame, station_frame, parameter=parameter)
242
+
243
@classmethod
def from_csv(
    cls,
    values_path: str | Path,
    stations_path: str | Path | None = None,
    parameter: str | None = None,
) -> Self:
    """Build an instance from CSV files, letting Polars try to parse date columns.

    ``values`` must already be filtered to a single parameter.
    ``stations_path`` may be left out to run without station metadata.
    """
    value_frame = pl.read_csv(str(values_path), try_parse_dates=True)
    if stations_path:
        station_frame = pl.read_csv(str(stations_path), try_parse_dates=True)
    else:
        station_frame = None
    return cls._cast(value_frame, station_frame, parameter=parameter)
258
+
259
+ @classmethod
260
+ def from_polars(
261
+ cls,
262
+ values: pl.DataFrame,
263
+ stations: pl.DataFrame | None = None,
264
+ parameter: str | None = None,
265
+ ) -> Self:
266
+ """Load from Polars DataFrames.
267
+
268
+ Pre-filter ``values`` to a single parameter before calling this method.
269
+ ``stations`` is optional; omit it to run without station metadata.
270
+ """
271
+ return cls._cast(values, stations, parameter=parameter)
272
+
273
+ # ------------------------------------------------------------------
274
+ # Core 6-step procedure
275
+ # ------------------------------------------------------------------
276
+
277
def run(  # noqa: C901, PLR0912, PLR0915
    self,
    config: RunConfig | None = None,
) -> DetectionResult:
    """Run the González-Rouco (2001) six-step homogenization procedure.

    Overview
    --------
    Step 0 (pre-processing): check that ``values`` holds at most one record
    per station-year, pivot it to a wide year-by-station table, and set aside
    stations failing the ``min_series_years`` / ``max_gap_years`` filters as
    ``INSUFFICIENT_DATA``. No winsorizing or annual aggregation happens here;
    sub-annual input is rejected and must be aggregated beforehand.

    Steps 1–6 iteratively refine the reference station pool and apply
    corrections. Each step tests a subset of candidate stations using a
    progressively more reliable set of reference stations:

    Step 1 – First assessment. ALL vs ALL.
    → groups H1 (homogeneous) and I1 (inhomogeneous).
    Corrections applied to I1.

    Step 2 – Adjusted references. ALL vs (H1 + corrected I1).
    → groups H2, I2.
    Corrections applied to I2.

    Step 3 – Test corrected I2 series. corr(I2) vs (H2 + corr(I2)).
    → groups HC3 (corrected, now homogeneous), IC3 (still inhomogeneous).

    Step 4 – Only homogeneous references. (H2 + corr(I2)) vs (H2 + HC3).
    → groups H4, HC4, I4, IC4.
    Corrections applied to I4.

    Step 5 – Last single-break corrections. I4 vs (H4 + HC4).
    → groups HC5, IC5.

    Step 6 – Double-break correction. (IC4 + IC5) vs (H4 + HC4 + HC5).
    Two-pass: first correct the later break on the post-first-break
    sub-series, then correct the earlier break on the full series.
    → groups HCC6 or ICC6.

    Parameters
    ----------
    config :
        Run configuration. Defaults to ``RunConfig()`` if not provided.

    Returns
    -------
    DetectionResult
        Raw detection data for all stations. Call ``.normalize()`` to
        obtain a ``HomogenizationResult`` with corrections applied.

    Raises
    ------
    ValueError
        If ``max_distance_km`` is set but the stations table has no
        ``latitude`` column, if ``min_series_years`` is below
        ``2 * min_years_from_end + 1``, or if ``values`` has more than one
        record per station-year.

    """
    cfg = config or RunConfig()

    # NOTE(review): the bar is only closed on the normal exit path at the end of
    # this method; an exception raised in between leaves it open.
    _pbar: Any = None
    if cfg.progress:
        try:
            from tqdm import tqdm  # noqa: PLC0415
            _pbar = tqdm(total=6, desc="step 1/6", unit="step", leave=True)
        except ImportError:
            import warnings  # noqa: PLC0415
            warnings.warn("tqdm is not installed; install it with: pip install rucola[tqdm]", stacklevel=2)

    if cfg.max_distance_km is not None and "latitude" not in self.stations.columns:
        msg = "max_distance_km requires a stations DataFrame with latitude/longitude columns."
        raise ValueError(msg)

    # ── Resolve tests and derive min_years_from_end ───────────────────
    _tests = cfg.tests or [SNHTTest()]
    # Use the largest edge guard among the configured tests.
    min_years_from_end = max(t.min_years_from_end for t in _tests)

    # ── Parameter consistency check ───────────────────────────────────
    min_detectable = 2 * min_years_from_end + 1
    if cfg.min_series_years < min_detectable:
        msg = (
            f"min_series_years={cfg.min_series_years} is below the minimum detectable series length "
            f"({min_detectable} = 2 * min_years_from_end + 1 = 2 * {min_years_from_end} + 1). "
            "No break could ever pass the edge-effect guard. "
            f"Set min_series_years >= {min_detectable} or lower min_years_from_end."
        )
        raise ValueError(msg)

    # ── Step 0: validate resolution and pivot to wide ────────────────
    self._check_annual_resolution(self.values)
    # One row per year, one column per station_id.
    annual_wide_base = (
        self.values.with_columns(pl.col("date").dt.year().alias("year"))
        .select("station_id", "year", "value")
        .pivot(on="station_id", index="year", values="value", aggregate_function="first")
        .sort("year")
    )
    years = annual_wide_base["year"]

    # Only stations listed in `self.stations` that also have a column in the pivot take part.
    candidate_ids = [sid for sid in self.stations["station_id"].to_list() if sid in annual_wide_base.columns]
    if cfg.station_ids is not None:
        allowed = set(cfg.station_ids)
        candidate_ids = [s for s in candidate_ids if s in allowed]

    # ── Pre-filter: series length and gap checks ──────────────────────
    # Length of the longest run of consecutive nulls in the station's column.
    def _max_gap(sid: str) -> int:
        best, cur = 0, 0
        for v in annual_wide_base[sid].to_list():
            cur = cur + 1 if v is None else 0
            best = max(best, cur)
        return best

    insufficient_ids: list[str] = []
    filtered: list[str] = []
    for sid in candidate_ids:
        col = annual_wide_base[sid]
        too_short = col.drop_nulls().len() < cfg.min_series_years
        gapped = cfg.max_gap_years is not None and _max_gap(sid) > cfg.max_gap_years
        if too_short or gapped:
            insufficient_ids.append(sid)
        else:
            filtered.append(sid)
    candidate_ids = filtered

    # Initialise per-station state
    states: dict[str, _StationState] = {
        sid: _StationState(
            station_id=sid,
            annual_original=annual_wide_base[sid],
            annual_current=annual_wide_base[sid],
            years=years,
        )
        for sid in candidate_ids
    }

    # ── helpers ──────────────────────────────────────────────────────

    # Rebuild the wide table from the *current* (possibly corrected) series of every candidate.
    def build_wide() -> pl.DataFrame:
        data: dict = {"year": years.to_list()}
        for sid in candidate_ids:
            data[sid] = states[sid].annual_current.to_list()
        return pl.DataFrame(data)

    # The distance cache is only built when a distance filter is configured.
    _dist_cache = build_distance_cache(self.stations) if cfg.max_distance_km is not None else None

    # Report progress to the optional progress bar and the optional `on_step` callback.
    def _advance(step: int, desc: str) -> None:
        if _pbar is not None:
            _pbar.set_description(desc)
            _pbar.update(1)
        if cfg.on_step is not None:
            cfg.on_step(step, desc)

    # Apply `cfg.run_consensus` to the per-test results of one station.
    def _meets_consensus(tr_list: list[TestResult]) -> bool:
        sig = sum(1 for t, tr in zip(_tests, tr_list, strict=False) if t.is_inhomogeneous(tr))
        n = len(_tests)
        if cfg.run_consensus == "unanimous":
            return sig == n
        if cfg.run_consensus == "majority":
            return sig > n // 2
        # "any" / "strongest_signal" — at detection time both require at least one inhomogeneous test
        return sig >= 1

    # Break year of the inhomogeneous result with the largest relative_signal, or None if none is inhomogeneous.
    def _resolve_break(tr_list: list[TestResult]) -> int | None:
        sig = [tr for t, tr in zip(_tests, tr_list, strict=False) if t.is_inhomogeneous(tr)]
        if not sig:
            return None
        return max(sig, key=lambda r: r.relative_signal).break_year

    def run_tests_for(
        test_ids: list[str],
        ref_ids: set[str],
        step: int,
    ) -> dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]]:
        """Run all tests for each station; record the neighbours used in state.

        Stations with no column in the wide table or no selected neighbours are
        left out of the returned mapping.
        """
        wide = build_wide()
        _corr_cache = build_correlation_cache(wide)
        results: dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]] = {}
        for cid in test_ids:
            if cid not in wide.columns:
                continue
            nbrs = select_neighbors(
                cid,
                self.stations,
                wide,
                max_neighbors=cfg.max_neighbors,
                min_correlation=cfg.min_correlation,
                max_distance_km=cfg.max_distance_km,
                allowed_ids=ref_ids,
                dist_cache=_dist_cache,
                corr_cache=_corr_cache,
            )
            if not nbrs:
                continue
            ref = build_reference_series(wide, nbrs, cfg.mode)
            q = compute_q_series(wide[cid], ref, cfg.mode)
            tr_list = [t.detect(q, years) for t in _tests]
            states[cid].neighbors_by_step[step] = nbrs
            results[cid] = (tr_list, nbrs, q)
        return results

    def correct_stations(
        test_results: dict[str, tuple[list[TestResult], list[NeighborInfo], pl.Series]],
        ids_to_correct: set[str],
        step: int,
    ) -> None:
        """Apply a correction where the consensus rule is met; store a DetectionRecord either way."""
        for cid in ids_to_correct:
            if cid not in test_results:
                continue
            tr_list, _, q = test_results[cid]
            is_inh = _meets_consensus(tr_list)
            break_year = _resolve_break(tr_list)
            # Factor recorded when no correction is applied: 1.0 in ratio mode, 0.0 otherwise.
            neutral = 1.0 if cfg.mode == "ratio" else 0.0
            f = compute_correction_factor(q, years, break_year, cfg.mode) if is_inh else neutral
            if is_inh:
                states[cid].annual_current = apply_correction(
                    states[cid].annual_current, years, break_year, f, cfg.mode
                )
                states[cid].corrections.append(CorrectionRecord(step=step, break_year=break_year, factor=f))
            states[cid].detections_by_step[step] = DetectionRecord(
                step=step,
                break_year=break_year,
                factor=f,
                test_results=tr_list,
                was_applied=is_inh,
            )

    # ── Step 1 ────────────────────────────────────────────────────────
    all_ids = set(candidate_ids)
    r1 = run_tests_for(candidate_ids, all_ids, step=1)

    # In every classification loop below, a station missing from the step's
    # results (skipped by run_tests_for) lands in the homogeneous group.
    h1: set[str] = set()
    i1: set[str] = set()
    for sid in candidate_ids:
        if sid in r1 and _meets_consensus(r1[sid][0]):
            i1.add(sid)
            states[sid].group = "I1"
        else:
            h1.add(sid)
            states[sid].group = "H1"

    correct_stations(r1, i1, step=1)
    _advance(2, f"step 2/6 — {len(candidate_ids)} stations vs corrected pool")

    # ── Step 2 ────────────────────────────────────────────────────────
    ref2 = h1 | i1  # i1 now corrected in states
    r2 = run_tests_for(candidate_ids, ref2, step=2)

    h2: set[str] = set()
    i2: set[str] = set()
    for sid in candidate_ids:
        if sid in r2 and _meets_consensus(r2[sid][0]):
            i2.add(sid)
            states[sid].group = "I2"
        else:
            h2.add(sid)
            states[sid].group = "H2"

    correct_stations(r2, i2, step=2)
    _advance(3, f"step 3/6 — {len(i2)} corrected stations re-tested")

    # ── Step 3 ────────────────────────────────────────────────────────
    corrected_i2 = i2
    ref3 = h2 | corrected_i2
    r3 = run_tests_for(list(corrected_i2), ref3, step=3)

    hc3: set[str] = set()
    ic3: set[str] = set()
    for sid in corrected_i2:
        if sid in r3 and _meets_consensus(r3[sid][0]):
            ic3.add(sid)
            states[sid].group = "IC3"
        else:
            hc3.add(sid)
            states[sid].group = "HC3"

    # No corrections in step 3 (classification only, corrections already done in step 2)
    _advance(4, f"step 4/6 — {len(h2) + len(corrected_i2)} stations vs homogeneous references")

    # ── Step 4 ────────────────────────────────────────────────────────
    test4 = list(h2 | corrected_i2)
    ref4 = h2 | hc3
    r4 = run_tests_for(test4, ref4, step=4)

    h4: set[str] = set()
    i4: set[str] = set()
    hc4: set[str] = set()
    ic4: set[str] = set()
    for sid in h2:
        if sid in r4 and _meets_consensus(r4[sid][0]):
            i4.add(sid)
            states[sid].group = "I4"
        else:
            h4.add(sid)
            states[sid].group = "H4"
    for sid in corrected_i2:
        if sid in r4 and _meets_consensus(r4[sid][0]):
            ic4.add(sid)
            states[sid].group = "IC4"
        else:
            hc4.add(sid)
            states[sid].group = "HC4"

    # Only I4 is corrected here; IC4 stations are deferred to the double-break pass in step 6.
    correct_stations(r4, i4, step=4)
    _advance(5, f"step 5/6 — {len(i4)} remaining inhomogeneous stations")

    # ── Step 5 ────────────────────────────────────────────────────────
    ref5 = h4 | hc4
    r5 = run_tests_for(list(i4), ref5, step=5)

    hc5: set[str] = set()
    ic5: set[str] = set()
    for sid in i4:
        if sid in r5 and _meets_consensus(r5[sid][0]):
            ic5.add(sid)
            states[sid].group = "IC5"
        else:
            hc5.add(sid)
            states[sid].group = "HC5"

    # correct_stations only corrects the I4 stations whose step-5 tests meet
    # consensus (the IC5 group); HC5 stations just get a DetectionRecord with
    # was_applied=False.
    correct_stations(r5, i4, step=5)
    _advance(6, f"step 6/6 — double-break correction for {len(ic4 | ic5)} stations")

    # ── Step 6 ────────────────────────────────────────────────────────
    # For IC4 and IC5: two-break correction.
    # Pass 6a: omit data before first known break, test post-break sub-series.
    # Pass 6b: with the second break corrected, test full series for first break.
    # NOTE(review): this cache is built once, before any step-6 correction is
    # applied. If build_correlation_cache stores values derived from the current
    # series, pass 6b selects neighbours with pre-6a correlations — confirm
    # that this is intended.
    _corr_cache_6b = build_correlation_cache(build_wide())

    double_break_ids = ic4 | ic5
    ref6 = h4 | hc4 | hc5

    years_list = years.to_list()

    for cid in double_break_ids:
        state = states[cid]
        # Without an earlier correction there is no known first break to split on;
        # the station keeps IC4/IC5 here and is relabelled ICC6 after the loop.
        if not state.corrections:
            continue
        first_break = state.corrections[0].break_year

        # find the index in years where the post-first-break segment starts
        start_idx = next((i for i, y in enumerate(years_list) if y >= first_break), None)
        if start_idx is None or (len(years_list) - start_idx) < 2 * min_years_from_end:
            continue

        # --- 6a: test only the post-first-break portion ---
        # build_wide() emits one column per candidate id, and it is rebuilt on every iteration.
        _wide_cols = set(build_wide().columns)
        partial_wide = pl.DataFrame(
            {"year": years_list[start_idx:]}
            | {
                sid: states[sid].annual_current[start_idx:].to_list()
                for sid in candidate_ids
                if sid in _wide_cols
            },
        )
        nbrs6a = select_neighbors(
            cid,
            self.stations,
            partial_wide,
            max_neighbors=cfg.max_neighbors,
            min_correlation=cfg.min_correlation,
            max_distance_km=cfg.max_distance_km,
            allowed_ids=ref6,
            dist_cache=_dist_cache,
            corr_cache=build_correlation_cache(partial_wide),
        )
        if nbrs6a:
            ref_6a = build_reference_series(partial_wide, nbrs6a, cfg.mode)
            q_6a = compute_q_series(partial_wide[cid], ref_6a, cfg.mode)
            tr_6a = [t.detect(q_6a, years[start_idx:]) for t in _tests]
            # Sub-steps of step 6 are stored under keys 61 (6a), 62 (6b) and 63 (6c).
            states[cid].neighbors_by_step[61] = nbrs6a
            inh_6a = _meets_consensus(tr_6a)
            br_6a = _resolve_break(tr_6a)
            neutral = 1.0 if cfg.mode == "ratio" else 0.0
            f_6a = compute_correction_factor(q_6a, years[start_idx:], br_6a, cfg.mode) if inh_6a else neutral
            if inh_6a:
                # The factor comes from the sub-series but is applied to the full-length series.
                states[cid].annual_current = apply_correction(
                    states[cid].annual_current,
                    years,
                    br_6a,
                    f_6a,
                    cfg.mode,
                )
                states[cid].corrections.append(CorrectionRecord(step=6, break_year=br_6a, factor=f_6a))
            states[cid].detections_by_step[61] = DetectionRecord(
                step=61,
                break_year=br_6a,
                factor=f_6a,
                test_results=tr_6a,
                was_applied=inh_6a,
            )

        # --- 6b: test full series to correct the first break ---
        wide6b = build_wide()
        nbrs6b = select_neighbors(
            cid,
            self.stations,
            wide6b,
            max_neighbors=cfg.max_neighbors,
            min_correlation=cfg.min_correlation,
            max_distance_km=cfg.max_distance_km,
            allowed_ids=ref6,
            dist_cache=_dist_cache,
            corr_cache=_corr_cache_6b,
        )
        if not nbrs6b:
            states[cid].group = "ICC6"
            continue

        ref_6b = build_reference_series(wide6b, nbrs6b, cfg.mode)
        q_6b = compute_q_series(wide6b[cid], ref_6b, cfg.mode)
        tr_6b = [t.detect(q_6b, years) for t in _tests]
        states[cid].neighbors_by_step[62] = nbrs6b
        inh_6b = _meets_consensus(tr_6b)
        br_6b = _resolve_break(tr_6b)
        neutral = 1.0 if cfg.mode == "ratio" else 0.0
        f_6b = compute_correction_factor(q_6b, years, br_6b, cfg.mode) if inh_6b else neutral
        states[cid].detections_by_step[62] = DetectionRecord(
            step=62,
            break_year=br_6b,
            factor=f_6b,
            test_results=tr_6b,
            was_applied=inh_6b,
        )
        if inh_6b:
            states[cid].annual_current = apply_correction(
                states[cid].annual_current,
                years,
                br_6b,
                f_6b,
                cfg.mode,
            )
            states[cid].corrections.append(CorrectionRecord(step=6, break_year=br_6b, factor=f_6b))

        # --- 6c: re-test the doubly-corrected series — González-Rouco (2001) §3.b.
        # HCC6 means "homogeneous after two corrections"; ICC6 means a residual
        # inhomogeneity (e.g. a third break) survives. Without this re-test every
        # IC4/IC5 station would silently become HCC6.
        # The re-test reuses the neighbours selected in 6b; no correction is applied here.
        wide6c = build_wide()
        ref_6c = build_reference_series(wide6c, nbrs6b, cfg.mode)
        q_6c = compute_q_series(wide6c[cid], ref_6c, cfg.mode)
        tr_6c = [t.detect(q_6c, years) for t in _tests]
        still_inh = _meets_consensus(tr_6c)
        states[cid].detections_by_step[63] = DetectionRecord(
            step=63,
            break_year=_resolve_break(tr_6c),
            factor=1.0 if cfg.mode == "ratio" else 0.0,
            test_results=tr_6c,
            was_applied=False,
        )
        states[cid].group = "ICC6" if still_inh else "HCC6"

    # mark any double-break stations we couldn't fully correct
    for cid in double_break_ids:
        if states[cid].group in ("IC4", "IC5"):
            states[cid].group = "ICC6"

    _advance(7, "done")
    if _pbar is not None:
        _pbar.close()

    # ── Build DetectionResult ─────────────────────────────────────────
    # Stations that never found a neighbor at any step → UNTESTABLE.
    # Use neighbors_by_step (set for all tested stations) rather than
    # detections_by_step (only set for stations that were inhomogeneous),
    # so that stations tested and found homogeneous keep their H* group.
    for sid in candidate_ids:
        if not states[sid].neighbors_by_step:
            states[sid].group = "UNTESTABLE"

    station_detections: dict[str, StationDetection] = {
        sid: StationDetection(
            station_id=sid,
            group=states[sid].group,
            annual_original=states[sid].annual_original,
            annual_corrected=states[sid].annual_current,
            years=years,
            detections_by_step=states[sid].detections_by_step,
            neighbors_by_step=states[sid].neighbors_by_step,
            corrections=states[sid].corrections,
        )
        for sid in candidate_ids
    }

    # Add pre-filtered stations with INSUFFICIENT_DATA label
    for sid in insufficient_ids:
        annual = annual_wide_base[sid]
        station_detections[sid] = StationDetection(
            station_id=sid,
            group="INSUFFICIENT_DATA",
            annual_original=annual,
            annual_corrected=annual,
            years=years,
        )

    return DetectionResult(
        station_detections=station_detections,
        parameter=self.parameter,
        mode=cfg.mode,
    )
769
+
770
+ # ------------------------------------------------------------------
771
+ # Properties
772
+ # ------------------------------------------------------------------
773
+
774
+ @property
775
+ def n_stations(self) -> int:
776
+ """Number of stations in the dataset."""
777
+ return len(self.stations)
778
+
779
+ @property
780
+ def date_range(self) -> tuple[str, str]:
781
+ """Earliest and latest date in the values table as ISO strings."""
782
+ d = self.values["date"]
783
+ return str(d.min()), str(d.max())
784
+
785
+ def __repr__(self) -> str:
786
+ """Return a short summary string."""
787
+ lo, hi = self.date_range
788
+ return (
789
+ f"Rucola(parameter={self.parameter!r}, "
790
+ f"stations={self.n_stations}, "
791
+ f"records={len(self.values):,}, "
792
+ f"period={lo} – {hi})" # noqa: RUF001
793
+ )
794
+
795
+ # ------------------------------------------------------------------
796
+ # Internal helpers
797
+ # ------------------------------------------------------------------
798
+
799
@staticmethod
def _check_date_order(values: pl.DataFrame) -> None:
    """Raise ``ValueError`` if any station's dates ever fall below their running maximum.

    The message names up to ``_STATION_ID_SAMPLE_SIZE`` offending station IDs.
    """
    # A row is in order when its date equals the cumulative maximum up to that row.
    in_order = (pl.col("date") == pl.col("date").cum_max()).all().alias("sorted")
    per_station = values.group_by("station_id").agg(in_order)
    offenders = per_station.filter(~pl.col("sorted"))["station_id"].to_list()
    if not offenders:
        return
    count = len(offenders)
    shown = sorted(offenders)[:_STATION_ID_SAMPLE_SIZE]
    tail = " ..." if count > _STATION_ID_SAMPLE_SIZE else ""
    msg = f"{count} station(s) in `values` have dates not sorted ascending: {shown}{tail}"
    raise ValueError(msg)
814
+
815
@staticmethod
def _check_duplicate_dates(values: pl.DataFrame) -> None:
    """Raise ``ValueError`` if a ``(station_id, date)`` pair occurs more than once.

    The message names up to ``_STATION_ID_SAMPLE_SIZE`` offending station IDs.
    """
    pair_counts = values.group_by("station_id", "date").len()
    repeated_pairs = pair_counts.filter(pl.col("len") > 1)
    offenders = repeated_pairs["station_id"].unique().to_list()
    if not offenders:
        return
    count = len(offenders)
    shown = sorted(offenders)[:_STATION_ID_SAMPLE_SIZE]
    tail = " ..." if count > _STATION_ID_SAMPLE_SIZE else ""
    msg = f"{count} station(s) in `values` have duplicate dates: {shown}{tail}"
    raise ValueError(msg)
831
+
832
+ @staticmethod
833
+ def _check_single_parameter(values: pl.DataFrame) -> None:
834
+ """Raise if `values` contains more than one parameter value."""
835
+ if "parameter" not in values.columns:
836
+ return
837
+ params = values["parameter"].drop_nulls().unique().to_list()
838
+ if len(params) > 1:
839
+ msg = (
840
+ f"`values` contains multiple parameters: {sorted(params)!r}. "
841
+ "Filter to a single parameter before loading."
842
+ )
843
+ raise ValueError(msg)
844
+
845
@staticmethod
def _check_annual_resolution(values: pl.DataFrame) -> None:
    """Raise ``ValueError`` if ``values`` holds more than one record for any station-year.

    An empty ``values`` frame has no station-year groups, so the maximum
    group size is null; that case is accepted instead of being compared
    against 1.
    """
    max_per: int | None = (
        values.with_columns(pl.col("date").dt.year().alias("_year"))
        .group_by("station_id", "_year")
        .len()
        .select(pl.col("len").max().cast(pl.Int64))
        .item()
    )
    # `max()` over zero groups yields null (None); `None > 1` would raise TypeError.
    if max_per is None or max_per <= 1:
        return
    msg = (
        f"values has sub-annual resolution ({max_per} records per station-year). "
        "Pre-aggregate to annual using rucola._preprocessing.compute_annual_totals "
        "or compute_annual_means before passing to Rucola."
    )
    raise ValueError(msg)
862
+
863
+ @staticmethod
864
+ def _check_value_dtype(values: pl.DataFrame) -> None:
865
+ """Raise if the `value` column is not numeric."""
866
+ dtype = values["value"].dtype
867
+ if not dtype.is_numeric():
868
+ msg = f"`values.value` must be numeric, got {dtype}. Cast to Float64 before loading."
869
+ raise TypeError(msg)
870
+
871
@staticmethod
def _check_stations(stations: pl.DataFrame) -> None:
    """Validate the stations table, raising ``ValueError`` on the first problem found.

    Checks run in this order: duplicate ``station_id`` values, null
    latitude or longitude, latitude outside ``[_LAT_MIN, _LAT_MAX]``,
    longitude outside ``[_LON_MIN, _LON_MAX]``.
    """
    id_counts = stations.group_by("station_id").len()
    repeated = id_counts.filter(pl.col("len") > 1)["station_id"].to_list()
    if repeated:
        msg = f"`stations` has duplicate station_id(s): {sorted(repeated)}"
        raise ValueError(msg)

    coord_missing = pl.col("latitude").is_null() | pl.col("longitude").is_null()
    without_coords = stations.filter(coord_missing)["station_id"].to_list()
    if without_coords:
        msg = f"`stations` has null latitude/longitude for station_id(s): {sorted(without_coords)}"
        raise ValueError(msg)

    lat_out_of_range = (pl.col("latitude") < _LAT_MIN) | (pl.col("latitude") > _LAT_MAX)
    lat_offenders = stations.filter(lat_out_of_range)["station_id"].to_list()
    if lat_offenders:
        msg = f"`stations` has latitude outside [{_LAT_MIN}, {_LAT_MAX}] for station_id(s): {sorted(lat_offenders)}"
        raise ValueError(msg)

    lon_out_of_range = (pl.col("longitude") < _LON_MIN) | (pl.col("longitude") > _LON_MAX)
    lon_offenders = stations.filter(lon_out_of_range)["station_id"].to_list()
    if lon_offenders:
        msg = f"`stations` has longitude outside [{_LON_MIN}, {_LON_MAX}] for station_id(s): {sorted(lon_offenders)}"
        raise ValueError(msg)
899
+
900
+ @staticmethod
901
+ def _check_columns(df: pl.DataFrame, required: frozenset[str], name: str) -> None:
902
+ missing = required - set(df.columns)
903
+ if missing:
904
+ msg = f"`{name}` is missing required column(s): {sorted(missing)}"
905
+ raise ValueError(msg)
906
+
907
+ @staticmethod
908
+ def _check_station_coverage(values: pl.DataFrame, stations: pl.DataFrame) -> None:
909
+ """All station_ids in values must have a matching entry in stations."""
910
+ value_ids = set(values["station_id"].unique().to_list())
911
+ station_ids = set(stations["station_id"].to_list())
912
+ missing = value_ids - station_ids
913
+ if missing:
914
+ n = len(missing)
915
+ sample = sorted(missing)[:_STATION_ID_SAMPLE_SIZE]
916
+ suffix = " ..." if n > _STATION_ID_SAMPLE_SIZE else ""
917
+ msg = f"{n} station_id(s) in `values` have no entry in `stations`: {sample}{suffix}"
918
+ raise ValueError(msg)
919
+
920
@classmethod
def _cast(
    cls,
    values: pl.DataFrame,
    stations: pl.DataFrame | None = None,
    parameter: str | None = None,
) -> Self:
    """Coerce key columns to the expected dtypes, then construct the instance.

    ``station_id`` is cast to ``pl.String`` in both frames. A ``date``
    column that is neither ``pl.Date`` nor ``pl.Datetime`` is cast to
    ``pl.Date``.
    """
    id_as_text = pl.col("station_id").cast(pl.String)
    values = values.with_columns(id_as_text)
    if stations is not None:
        stations = stations.with_columns(id_as_text)

    needs_date_cast = "date" in values.columns and values["date"].dtype not in (pl.Date, pl.Datetime)
    if needs_date_cast:
        values = values.with_columns(pl.col("date").cast(pl.Date))
    return cls(values=values, stations=stations, parameter=parameter)