rucola 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rucola-0.1.0/LICENSE.md +21 -0
- rucola-0.1.0/PKG-INFO +231 -0
- rucola-0.1.0/README.md +204 -0
- rucola-0.1.0/pyproject.toml +207 -0
- rucola-0.1.0/src/rucola/__init__.py +933 -0
- rucola-0.1.0/src/rucola/_algorithms.py +316 -0
- rucola-0.1.0/src/rucola/_homogeneity.py +827 -0
- rucola-0.1.0/src/rucola/_normalization.py +621 -0
- rucola-0.1.0/src/rucola/_preprocessing.py +90 -0
- rucola-0.1.0/src/rucola/_results.py +495 -0
rucola-0.1.0/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 earthobservations
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
rucola-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rucola
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Climate station data homogenization implementing the González-Rouco et al. (2001) six-step procedure with six pluggable breakpoint tests
|
|
5
|
+
Keywords: climate,climatology,homogenization,inhomogeneity,meteorology,SNHT,Buishand,Pettitt,Worsley,Easterling-Peterson,STARS,Rodionov,breakpoint-detection,changepoint,reference-series,detection,correction,time-series,station-data,quality-control
|
|
6
|
+
Author: Benjamin Gutzmann, Claude
|
|
7
|
+
Author-email: Benjamin Gutzmann <benjamin@eobs.org>, Claude <noreply@anthropic.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE.md
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Atmospheric Science
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Dist: polars>=1.41.2,<2.0.0
|
|
21
|
+
Requires-Dist: duckdb>=1.5.3,<2.0.0 ; extra == 'duckdb'
|
|
22
|
+
Requires-Dist: tqdm>=4.0,<5.0.0 ; extra == 'tqdm'
|
|
23
|
+
Requires-Python: >=3.12
|
|
24
|
+
Provides-Extra: duckdb
|
|
25
|
+
Provides-Extra: tqdm
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
<p align="center"><img src="https://upload.wikimedia.org/wikipedia/commons/8/8d/Rukola.JPG" alt="rucola" width="400"></p>
|
|
29
|
+
|
|
30
|
+
# rucola
|
|
31
|
+
|
|
32
|
+
Climate station data homogenization implementing the six-step procedure from [González-Rouco et al. (2001)](https://journals.ametsoc.org/view/journals/clim/14/5/1520-0442_2001_014_0964_qcahop_2.0.co_2.xml), with six pluggable breakpoint tests.
|
|
33
|
+
|
|
34
|
+
[Tests](https://github.com/earthobservations/rucola/actions/workflows/tests.yml)
|
|
35
|
+
[Docs](https://earthobservations.github.io/rucola/)
|
|
36
|
+
[Coverage](https://codecov.io/gh/earthobservations/rucola)
|
|
37
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
38
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
39
|
+
[Downloads](https://pepy.tech/project/rucola)
|
|
40
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
41
|
+
[License](https://github.com/earthobservations/rucola/blob/main/LICENSE.md)
|
|
42
|
+
|
|
43
|
+
**[Documentation](https://earthobservations.github.io/rucola/)**
|
|
44
|
+
|
|
45
|
+
> **Beta:** rucola is under active development. The API may change between minor versions before a stable 1.0 release.
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
Long climate records from ground stations are frequently affected by non-climatic discontinuities — station relocations, instrument replacements, and changes in observation practice. rucola detects and corrects these breakpoints using an iterative reference-station approach:
|
|
50
|
+
|
|
51
|
+
1. Build a normalized Q-series for each candidate station relative to its neighbors
|
|
52
|
+
2. Apply one or more statistical breakpoint tests to the Q-series
|
|
53
|
+
3. Correct detected breaks and refine the reference pool across six steps
|
|
54
|
+
|
|
55
|
+
Six tests are available: **SNHT** (Alexandersson 1986), **Buishand range** (Buishand 1982), **Pettitt** (Pettitt 1979), **Worsley likelihood ratio** (Worsley 1979), **Easterling–Peterson two-phase regression** (Easterling & Peterson 1995), and **STARS** sequential regime-shift test (Rodionov 2004). Tests can be run individually or in consensus combinations.
|
|
56
|
+
|
|
57
|
+
Both **ratio** (multiplicative, for precipitation) and **difference** (additive, for temperature) correction modes are supported.
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install rucola # core (Polars + CSV)
|
|
63
|
+
pip install "rucola[duckdb]" # with DuckDB support
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick start
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import rucola
|
|
70
|
+
|
|
71
|
+
r = rucola.Rucola.from_csv("values.csv", "stations.csv")
|
|
72
|
+
|
|
73
|
+
# Run the six-step procedure
|
|
74
|
+
detection = r.run(rucola.RunConfig(mode="ratio"))
|
|
75
|
+
|
|
76
|
+
# Apply corrections
|
|
77
|
+
result = detection.normalize()
|
|
78
|
+
|
|
79
|
+
print(result.summary)
|
|
80
|
+
print(result.corrections)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Input format
|
|
84
|
+
|
|
85
|
+
| Table | Required columns |
|
|
86
|
+
|---|---|
|
|
87
|
+
| `stations` | `station_id`, `latitude`, `longitude` |
|
|
88
|
+
| `values` | `station_id`, `date`, `value`, `parameter` |
|
|
89
|
+
|
|
90
|
+
Values must be at **annual resolution** and pre-filtered to a **single parameter**.
|
|
91
|
+
|
|
92
|
+
### Loaders
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# from Polars DataFrames
|
|
96
|
+
rucola.Rucola.from_polars(values_df, stations_df)
|
|
97
|
+
|
|
98
|
+
# from CSV files
|
|
99
|
+
rucola.Rucola.from_csv("values.csv", "stations.csv")
|
|
100
|
+
|
|
101
|
+
# from a DuckDB file (requires: pip install "rucola[duckdb]")
|
|
102
|
+
rucola.Rucola.from_duckdb("climate.duckdb")
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Multiple tests with consensus detection
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
detection = r.run(
|
|
109
|
+
rucola.RunConfig(
|
|
110
|
+
tests=[
|
|
111
|
+
rucola.SNHTTest(),
|
|
112
|
+
rucola.BuishandTest(),
|
|
113
|
+
rucola.PettittTest(),
|
|
114
|
+
rucola.StarsTest(l=10), # sequential regime-shift test
|
|
115
|
+
],
|
|
116
|
+
mode="ratio",
|
|
117
|
+
)
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Normalization options
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from rucola import NormalizationConfig
|
|
125
|
+
|
|
126
|
+
result = detection.normalize(
|
|
127
|
+
NormalizationConfig(
|
|
128
|
+
consensus="majority", # require >50 % of tests to agree
|
|
129
|
+
tiebreak="strongest_signal",
|
|
130
|
+
break_window_years=3,
|
|
131
|
+
min_correction_magnitude=0.02,
|
|
132
|
+
min_relative_signal=1.2, # require signal 1.2× the critical value
|
|
133
|
+
min_years_from_end=5, # reject edge-effect artefacts (Hawkins 1977)
|
|
134
|
+
)
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Break predicates
|
|
139
|
+
|
|
140
|
+
Use composable predicates to filter which detected breaks are applied. Combine them with `&`, `|`, and `~`:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from rucola import (
|
|
144
|
+
NormalizationConfig,
|
|
145
|
+
YearBetween, StationIn, StepIn,
|
|
146
|
+
MagnitudeAbove, SignalAbove,
|
|
147
|
+
NSignificantAbove, NeighborCountAbove,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# Trusted year window + minimum correction size
|
|
151
|
+
result = detection.normalize(
|
|
152
|
+
NormalizationConfig(
|
|
153
|
+
predicate=YearBetween(min=1960, max=2010) & MagnitudeAbove(threshold=0.05)
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Only correct a specific set of stations
|
|
158
|
+
result = detection.normalize(
|
|
159
|
+
NormalizationConfig(predicate=StationIn({"S1", "S3", "S7"}))
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# Require strong evidence: signal 1.5× critical value, at least 3 tests agree,
|
|
163
|
+
# detected from a solid reference pool, skip early unreliable steps
|
|
164
|
+
result = detection.normalize(
|
|
165
|
+
NormalizationConfig(
|
|
166
|
+
predicate=SignalAbove(1.5) & NSignificantAbove(3)
|
|
167
|
+
& NeighborCountAbove(4) & ~StepIn({1, 2})
|
|
168
|
+
)
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Predicates are fully serializable via `to_dict()` / `BreakPredicate.from_dict()`.
|
|
173
|
+
|
|
174
|
+
### Saving and loading results
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
detection.to_json("detection.json")
|
|
178
|
+
result.to_json("result.json")
|
|
179
|
+
|
|
180
|
+
detection = rucola.DetectionResult.from_json("detection.json")
|
|
181
|
+
result = rucola.HomogenizationResult.from_json("result.json")
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Development
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
git clone https://github.com/earthobservations/rucola
|
|
188
|
+
cd rucola
|
|
189
|
+
uv sync --all-groups
|
|
190
|
+
|
|
191
|
+
# code quality
|
|
192
|
+
poe format # auto-format with ruff
|
|
193
|
+
poe lint # check formatting and linting
|
|
194
|
+
poe type # type-check with ty
|
|
195
|
+
|
|
196
|
+
# testing
|
|
197
|
+
poe test # fast unit + constructor tests (default)
|
|
198
|
+
poe test-unit # unit tests only, verbose
|
|
199
|
+
poe test-slow # full pipeline tests, verbose
|
|
200
|
+
poe test-integration # DWD integration tests (requires network)
|
|
201
|
+
poe test-all # everything, verbose
|
|
202
|
+
poe coverage # run tests and generate coverage.xml for Codecov
|
|
203
|
+
|
|
204
|
+
# docs
|
|
205
|
+
poe docs-serve # live preview at localhost:8000
|
|
206
|
+
poe docs-build # build static site
|
|
207
|
+
|
|
208
|
+
# security / hygiene
|
|
209
|
+
poe audit # scan dependencies for vulnerabilities
|
|
210
|
+
poe deptry # check for unused/missing dependencies
|
|
211
|
+
poe zizmor # audit GitHub Actions workflows
|
|
212
|
+
|
|
213
|
+
# all-in-one
|
|
214
|
+
poe check # lint + type + audit + test
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## References
|
|
218
|
+
|
|
219
|
+
- González-Rouco et al. (2001), *J. Climate* 14(5):964–978. [doi:10.1175/1520-0442(2001)014<0964:QCAHOP>2.0.CO;2](https://doi.org/10.1175/1520-0442(2001)014%3C0964:QCAHOP%3E2.0.CO;2)
|
|
220
|
+
- Hawkins (1977), *Biometrika* 64(2):279–288. [doi:10.1093/biomet/64.2.279](https://doi.org/10.1093/biomet/64.2.279)
|
|
221
|
+
- Alexandersson (1986), *Int. J. Climatol.* 6(6):661–675. [doi:10.1002/joc.3370060607](https://doi.org/10.1002/joc.3370060607)
|
|
222
|
+
- Alexandersson & Moberg (1997), *Int. J. Climatol.* 17(1):25–34. [doi:10.1002/(SICI)1097-0088(199701)17:1<25::AID-JOC103>3.0.CO;2-J](https://doi.org/10.1002/(SICI)1097-0088(199701)17:1%3C25::AID-JOC103%3E3.0.CO;2-J)
|
|
223
|
+
- Buishand (1982), *J. Hydrol.* 58(1–2):11–29. [doi:10.1016/0022-1694(82)90066-X](https://doi.org/10.1016/0022-1694(82)90066-X)
|
|
224
|
+
- Pettitt (1979), *Appl. Stat.* 28(2):126–135. [doi:10.2307/2346729](https://doi.org/10.2307/2346729)
|
|
225
|
+
- Worsley (1979), *J. Amer. Statist. Assoc.* 74(366):365–367. [doi:10.1080/01621459.1979.10482519](https://doi.org/10.1080/01621459.1979.10482519)
|
|
226
|
+
- Easterling & Peterson (1995), *Int. J. Climatol.* 15(4):369–377. [doi:10.1002/joc.3370150403](https://doi.org/10.1002/joc.3370150403)
|
|
227
|
+
- Rodionov (2004), *Geophys. Res. Lett.* 31, L09204. [doi:10.1029/2004GL019448](https://doi.org/10.1029/2004GL019448)
|
|
228
|
+
|
|
229
|
+
## Authors
|
|
230
|
+
|
|
231
|
+
rucola was created by [Benjamin Gutzmann](mailto:benjamin@eobs.org), with the majority of the implementation written by [Claude](https://claude.ai) (Anthropic).
|
rucola-0.1.0/README.md
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
<p align="center"><img src="https://upload.wikimedia.org/wikipedia/commons/8/8d/Rukola.JPG" alt="rucola" width="400"></p>
|
|
2
|
+
|
|
3
|
+
# rucola
|
|
4
|
+
|
|
5
|
+
Climate station data homogenization implementing the six-step procedure from [González-Rouco et al. (2001)](https://journals.ametsoc.org/view/journals/clim/14/5/1520-0442_2001_014_0964_qcahop_2.0.co_2.xml), with six pluggable breakpoint tests.
|
|
6
|
+
|
|
7
|
+
[Tests](https://github.com/earthobservations/rucola/actions/workflows/tests.yml)
|
|
8
|
+
[Docs](https://earthobservations.github.io/rucola/)
|
|
9
|
+
[Coverage](https://codecov.io/gh/earthobservations/rucola)
|
|
10
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
11
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
12
|
+
[Downloads](https://pepy.tech/project/rucola)
|
|
13
|
+
[PyPI](https://pypi.org/project/rucola/)
|
|
14
|
+
[License](https://github.com/earthobservations/rucola/blob/main/LICENSE.md)
|
|
15
|
+
|
|
16
|
+
**[Documentation](https://earthobservations.github.io/rucola/)**
|
|
17
|
+
|
|
18
|
+
> **Beta:** rucola is under active development. The API may change between minor versions before a stable 1.0 release.
|
|
19
|
+
|
|
20
|
+
## Overview
|
|
21
|
+
|
|
22
|
+
Long climate records from ground stations are frequently affected by non-climatic discontinuities — station relocations, instrument replacements, and changes in observation practice. rucola detects and corrects these breakpoints using an iterative reference-station approach:
|
|
23
|
+
|
|
24
|
+
1. Build a normalized Q-series for each candidate station relative to its neighbors
|
|
25
|
+
2. Apply one or more statistical breakpoint tests to the Q-series
|
|
26
|
+
3. Correct detected breaks and refine the reference pool across six steps
|
|
27
|
+
|
|
28
|
+
Six tests are available: **SNHT** (Alexandersson 1986), **Buishand range** (Buishand 1982), **Pettitt** (Pettitt 1979), **Worsley likelihood ratio** (Worsley 1979), **Easterling–Peterson two-phase regression** (Easterling & Peterson 1995), and **STARS** sequential regime-shift test (Rodionov 2004). Tests can be run individually or in consensus combinations.
|
|
29
|
+
|
|
30
|
+
Both **ratio** (multiplicative, for precipitation) and **difference** (additive, for temperature) correction modes are supported.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install rucola # core (Polars + CSV)
|
|
36
|
+
pip install "rucola[duckdb]" # with DuckDB support
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import rucola
|
|
43
|
+
|
|
44
|
+
r = rucola.Rucola.from_csv("values.csv", "stations.csv")
|
|
45
|
+
|
|
46
|
+
# Run the six-step procedure
|
|
47
|
+
detection = r.run(rucola.RunConfig(mode="ratio"))
|
|
48
|
+
|
|
49
|
+
# Apply corrections
|
|
50
|
+
result = detection.normalize()
|
|
51
|
+
|
|
52
|
+
print(result.summary)
|
|
53
|
+
print(result.corrections)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Input format
|
|
57
|
+
|
|
58
|
+
| Table | Required columns |
|
|
59
|
+
|---|---|
|
|
60
|
+
| `stations` | `station_id`, `latitude`, `longitude` |
|
|
61
|
+
| `values` | `station_id`, `date`, `value`, `parameter` |
|
|
62
|
+
|
|
63
|
+
Values must be at **annual resolution** and pre-filtered to a **single parameter**.
|
|
64
|
+
|
|
65
|
+
### Loaders
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# from Polars DataFrames
|
|
69
|
+
rucola.Rucola.from_polars(values_df, stations_df)
|
|
70
|
+
|
|
71
|
+
# from CSV files
|
|
72
|
+
rucola.Rucola.from_csv("values.csv", "stations.csv")
|
|
73
|
+
|
|
74
|
+
# from a DuckDB file (requires: pip install "rucola[duckdb]")
|
|
75
|
+
rucola.Rucola.from_duckdb("climate.duckdb")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Multiple tests with consensus detection
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
detection = r.run(
|
|
82
|
+
rucola.RunConfig(
|
|
83
|
+
tests=[
|
|
84
|
+
rucola.SNHTTest(),
|
|
85
|
+
rucola.BuishandTest(),
|
|
86
|
+
rucola.PettittTest(),
|
|
87
|
+
rucola.StarsTest(l=10), # sequential regime-shift test
|
|
88
|
+
],
|
|
89
|
+
mode="ratio",
|
|
90
|
+
)
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Normalization options
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from rucola import NormalizationConfig
|
|
98
|
+
|
|
99
|
+
result = detection.normalize(
|
|
100
|
+
NormalizationConfig(
|
|
101
|
+
consensus="majority", # require >50 % of tests to agree
|
|
102
|
+
tiebreak="strongest_signal",
|
|
103
|
+
break_window_years=3,
|
|
104
|
+
min_correction_magnitude=0.02,
|
|
105
|
+
min_relative_signal=1.2, # require signal 1.2× the critical value
|
|
106
|
+
min_years_from_end=5, # reject edge-effect artefacts (Hawkins 1977)
|
|
107
|
+
)
|
|
108
|
+
)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Break predicates
|
|
112
|
+
|
|
113
|
+
Use composable predicates to filter which detected breaks are applied. Combine them with `&`, `|`, and `~`:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from rucola import (
|
|
117
|
+
NormalizationConfig,
|
|
118
|
+
YearBetween, StationIn, StepIn,
|
|
119
|
+
MagnitudeAbove, SignalAbove,
|
|
120
|
+
NSignificantAbove, NeighborCountAbove,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Trusted year window + minimum correction size
|
|
124
|
+
result = detection.normalize(
|
|
125
|
+
NormalizationConfig(
|
|
126
|
+
predicate=YearBetween(min=1960, max=2010) & MagnitudeAbove(threshold=0.05)
|
|
127
|
+
)
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
# Only correct a specific set of stations
|
|
131
|
+
result = detection.normalize(
|
|
132
|
+
NormalizationConfig(predicate=StationIn({"S1", "S3", "S7"}))
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# Require strong evidence: signal 1.5× critical value, at least 3 tests agree,
|
|
136
|
+
# detected from a solid reference pool, skip early unreliable steps
|
|
137
|
+
result = detection.normalize(
|
|
138
|
+
NormalizationConfig(
|
|
139
|
+
predicate=SignalAbove(1.5) & NSignificantAbove(3)
|
|
140
|
+
& NeighborCountAbove(4) & ~StepIn({1, 2})
|
|
141
|
+
)
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Predicates are fully serializable via `to_dict()` / `BreakPredicate.from_dict()`.
|
|
146
|
+
|
|
147
|
+
### Saving and loading results
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
detection.to_json("detection.json")
|
|
151
|
+
result.to_json("result.json")
|
|
152
|
+
|
|
153
|
+
detection = rucola.DetectionResult.from_json("detection.json")
|
|
154
|
+
result = rucola.HomogenizationResult.from_json("result.json")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Development
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
git clone https://github.com/earthobservations/rucola
|
|
161
|
+
cd rucola
|
|
162
|
+
uv sync --all-groups
|
|
163
|
+
|
|
164
|
+
# code quality
|
|
165
|
+
poe format # auto-format with ruff
|
|
166
|
+
poe lint # check formatting and linting
|
|
167
|
+
poe type # type-check with ty
|
|
168
|
+
|
|
169
|
+
# testing
|
|
170
|
+
poe test # fast unit + constructor tests (default)
|
|
171
|
+
poe test-unit # unit tests only, verbose
|
|
172
|
+
poe test-slow # full pipeline tests, verbose
|
|
173
|
+
poe test-integration # DWD integration tests (requires network)
|
|
174
|
+
poe test-all # everything, verbose
|
|
175
|
+
poe coverage # run tests and generate coverage.xml for Codecov
|
|
176
|
+
|
|
177
|
+
# docs
|
|
178
|
+
poe docs-serve # live preview at localhost:8000
|
|
179
|
+
poe docs-build # build static site
|
|
180
|
+
|
|
181
|
+
# security / hygiene
|
|
182
|
+
poe audit # scan dependencies for vulnerabilities
|
|
183
|
+
poe deptry # check for unused/missing dependencies
|
|
184
|
+
poe zizmor # audit GitHub Actions workflows
|
|
185
|
+
|
|
186
|
+
# all-in-one
|
|
187
|
+
poe check # lint + type + audit + test
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## References
|
|
191
|
+
|
|
192
|
+
- González-Rouco et al. (2001), *J. Climate* 14(5):964–978. [doi:10.1175/1520-0442(2001)014<0964:QCAHOP>2.0.CO;2](https://doi.org/10.1175/1520-0442(2001)014%3C0964:QCAHOP%3E2.0.CO;2)
|
|
193
|
+
- Hawkins (1977), *Biometrika* 64(2):279–288. [doi:10.1093/biomet/64.2.279](https://doi.org/10.1093/biomet/64.2.279)
|
|
194
|
+
- Alexandersson (1986), *Int. J. Climatol.* 6(6):661–675. [doi:10.1002/joc.3370060607](https://doi.org/10.1002/joc.3370060607)
|
|
195
|
+
- Alexandersson & Moberg (1997), *Int. J. Climatol.* 17(1):25–34. [doi:10.1002/(SICI)1097-0088(199701)17:1<25::AID-JOC103>3.0.CO;2-J](https://doi.org/10.1002/(SICI)1097-0088(199701)17:1%3C25::AID-JOC103%3E3.0.CO;2-J)
|
|
196
|
+
- Buishand (1982), *J. Hydrol.* 58(1–2):11–29. [doi:10.1016/0022-1694(82)90066-X](https://doi.org/10.1016/0022-1694(82)90066-X)
|
|
197
|
+
- Pettitt (1979), *Appl. Stat.* 28(2):126–135. [doi:10.2307/2346729](https://doi.org/10.2307/2346729)
|
|
198
|
+
- Worsley (1979), *J. Amer. Statist. Assoc.* 74(366):365–367. [doi:10.1080/01621459.1979.10482519](https://doi.org/10.1080/01621459.1979.10482519)
|
|
199
|
+
- Easterling & Peterson (1995), *Int. J. Climatol.* 15(4):369–377. [doi:10.1002/joc.3370150403](https://doi.org/10.1002/joc.3370150403)
|
|
200
|
+
- Rodionov (2004), *Geophys. Res. Lett.* 31, L09204. [doi:10.1029/2004GL019448](https://doi.org/10.1029/2004GL019448)
|
|
201
|
+
|
|
202
|
+
## Authors
|
|
203
|
+
|
|
204
|
+
rucola was created by [Benjamin Gutzmann](mailto:benjamin@eobs.org), with the majority of the implementation written by [Claude](https://claude.ai) (Anthropic).
|
rucola-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build>=0.11.19,<0.12"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rucola"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Climate station data homogenization implementing the González-Rouco et al. (2001) six-step procedure with six pluggable breakpoint tests"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE.md"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Benjamin Gutzmann", email = "benjamin@eobs.org" },
|
|
15
|
+
{ name = "Claude", email = "noreply@anthropic.com" },
|
|
16
|
+
]
|
|
17
|
+
keywords = [
|
|
18
|
+
"climate",
|
|
19
|
+
"climatology",
|
|
20
|
+
"homogenization",
|
|
21
|
+
"inhomogeneity",
|
|
22
|
+
"meteorology",
|
|
23
|
+
"SNHT",
|
|
24
|
+
"Buishand",
|
|
25
|
+
"Pettitt",
|
|
26
|
+
"Worsley",
|
|
27
|
+
"Easterling-Peterson",
|
|
28
|
+
"STARS",
|
|
29
|
+
"Rodionov",
|
|
30
|
+
"breakpoint-detection",
|
|
31
|
+
"changepoint",
|
|
32
|
+
"reference-series",
|
|
33
|
+
"detection",
|
|
34
|
+
"correction",
|
|
35
|
+
"time-series",
|
|
36
|
+
"station-data",
|
|
37
|
+
"quality-control",
|
|
38
|
+
]
|
|
39
|
+
classifiers = [
|
|
40
|
+
"Development Status :: 4 - Beta",
|
|
41
|
+
"Intended Audience :: Science/Research",
|
|
42
|
+
"Operating System :: OS Independent",
|
|
43
|
+
"Programming Language :: Python :: 3",
|
|
44
|
+
"Programming Language :: Python :: 3.12",
|
|
45
|
+
"Programming Language :: Python :: 3.13",
|
|
46
|
+
"Programming Language :: Python :: 3.14",
|
|
47
|
+
"Topic :: Scientific/Engineering :: Atmospheric Science",
|
|
48
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
49
|
+
"Typing :: Typed",
|
|
50
|
+
]
|
|
51
|
+
dependencies = [
|
|
52
|
+
"polars>=1.41.2,<2.0.0",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[project.optional-dependencies]
|
|
56
|
+
duckdb = ["duckdb>=1.5.3,<2.0.0"]
|
|
57
|
+
tqdm = ["tqdm>=4.0,<5.0.0"]
|
|
58
|
+
|
|
59
|
+
[dependency-groups]
|
|
60
|
+
dev = [
|
|
61
|
+
"deptry>=0.23.0,<1.0.0",
|
|
62
|
+
"poethepoet>=0.46.0,<1.0.0",
|
|
63
|
+
"ruff>=0.15.16,<1.0.0",
|
|
64
|
+
"ty>=0.0.43,<1.0.0",
|
|
65
|
+
"zizmor>=1.0.0,<2.0.0",
|
|
66
|
+
]
|
|
67
|
+
test = [
|
|
68
|
+
"mktestdocs>=0.2.5,<1.0.0",
|
|
69
|
+
"pytest>=9.0.3,<10.0.0",
|
|
70
|
+
"pytest-cov>=7.1.0,<8.0.0",
|
|
71
|
+
"wetterdienst>=0.121.1,<1.0.0",
|
|
72
|
+
]
|
|
73
|
+
docs = [
|
|
74
|
+
"mike>=2.1.3,<3",
|
|
75
|
+
"mkdocs>=1.4.3,<2",
|
|
76
|
+
"mkdocs-material>=9.1.15,<10",
|
|
77
|
+
"mkdocstrings-python>=1.1.2,<3",
|
|
78
|
+
]
|
|
79
|
+
notebooks = [
|
|
80
|
+
"altair>=6.2.1,<7.0.0",
|
|
81
|
+
"duckdb>=1.5.3,<2.0.0",
|
|
82
|
+
"marimo>=0.23.9,<1",
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
[tool.poe.tasks.format]
|
|
86
|
+
sequence = [
|
|
87
|
+
{ cmd = "uv run ruff format" },
|
|
88
|
+
{ cmd = "uv run ruff check --fix" },
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
[tool.poe.tasks.lint]
|
|
92
|
+
sequence = [
|
|
93
|
+
{ cmd = "uv run ruff format --check" },
|
|
94
|
+
{ cmd = "uv run ruff check" },
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
[tool.poe.tasks.type]
|
|
98
|
+
cmd = "uv run ty check"
|
|
99
|
+
|
|
100
|
+
[tool.poe.tasks.test]
|
|
101
|
+
cmd = "uv run --group test pytest -m 'not integration and not slow'"
|
|
102
|
+
|
|
103
|
+
[tool.poe.tasks.test-unit]
|
|
104
|
+
cmd = "uv run --group test pytest -m unit -v"
|
|
105
|
+
|
|
106
|
+
[tool.poe.tasks.test-slow]
|
|
107
|
+
cmd = "uv run --group test pytest -m 'slow and not integration' -v"
|
|
108
|
+
|
|
109
|
+
[tool.poe.tasks.test-integration]
|
|
110
|
+
cmd = "uv run --group test pytest -m integration -v"
|
|
111
|
+
|
|
112
|
+
[tool.poe.tasks.test-all]
|
|
113
|
+
cmd = "uv run --group test pytest -v"
|
|
114
|
+
|
|
115
|
+
[tool.poe.tasks.docs-serve]
|
|
116
|
+
cmd = "uv run --group docs mkdocs serve"
|
|
117
|
+
|
|
118
|
+
[tool.poe.tasks.docs-build]
|
|
119
|
+
cmd = "uv run --group docs mkdocs build"
|
|
120
|
+
|
|
121
|
+
[tool.poe.tasks.coverage]
|
|
122
|
+
cmd = "uv run --group test pytest --cov-report=xml"
|
|
123
|
+
|
|
124
|
+
[tool.poe.tasks.audit]
|
|
125
|
+
cmd = "uv audit"
|
|
126
|
+
|
|
127
|
+
[tool.poe.tasks.deptry]
|
|
128
|
+
cmd = "uv run deptry src"
|
|
129
|
+
|
|
130
|
+
[tool.poe.tasks.zizmor]
|
|
131
|
+
cmd = "uv run zizmor ."
|
|
132
|
+
|
|
133
|
+
[tool.poe.tasks.check]
|
|
134
|
+
sequence = [
|
|
135
|
+
{ ref = "lint" },
|
|
136
|
+
{ ref = "type" },
|
|
137
|
+
{ ref = "audit" },
|
|
138
|
+
{ ref = "test" },
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
[tool.pytest.ini_options]
|
|
142
|
+
testpaths = ["tests"]
|
|
143
|
+
addopts = "--cov --cov-report=term-missing -q"
|
|
144
|
+
filterwarnings = [
|
|
145
|
+
"ignore:cannot collect test class 'TestResult':pytest.PytestCollectionWarning",
|
|
146
|
+
]
|
|
147
|
+
markers = [
|
|
148
|
+
"integration: requires network access to wetterdienst / DWD API",
|
|
149
|
+
"slow: computationally expensive; excluded from the default test run",
|
|
150
|
+
"unit: fast pure-function tests",
|
|
151
|
+
]
|
|
152
|
+
|
|
153
|
+
[tool.coverage.run]
|
|
154
|
+
source = ["rucola"]
|
|
155
|
+
branch = true
|
|
156
|
+
omit = []
|
|
157
|
+
|
|
158
|
+
[tool.coverage.report]
|
|
159
|
+
show_missing = true
|
|
160
|
+
skip_covered = false
|
|
161
|
+
skip_empty = true
|
|
162
|
+
precision = 1
|
|
163
|
+
exclude_lines = [
|
|
164
|
+
"pragma: no cover",
|
|
165
|
+
"if TYPE_CHECKING:",
|
|
166
|
+
"@abstractmethod",
|
|
167
|
+
"raise NotImplementedError",
|
|
168
|
+
]
|
|
169
|
+
|
|
170
|
+
[tool.ruff]
|
|
171
|
+
include = ["src/**/*.py", "tests/**/*.py", "notebooks/**/*.py"]
|
|
172
|
+
line-length = 120
|
|
173
|
+
|
|
174
|
+
[tool.ruff.lint]
|
|
175
|
+
select = ["ALL"]
|
|
176
|
+
ignore = [
|
|
177
|
+
"COM812", # conflicts with ruff formatter
|
|
178
|
+
"D203", # incompatible with D211
|
|
179
|
+
"D213", # incompatible with D212
|
|
180
|
+
"RUF002", # ambiguous unicode in docstrings (intentional in scientific notation)
|
|
181
|
+
"RUF003", # ambiguous unicode in comments (intentional in scientific notation)
|
|
182
|
+
]
|
|
183
|
+
|
|
184
|
+
[tool.ruff.lint.per-file-ignores]
|
|
185
|
+
"notebooks/**" = [
|
|
186
|
+
"ANN", # no type annotations in notebook cells
|
|
187
|
+
"D", # no docstrings in notebook cells
|
|
188
|
+
"I001", # import sorting inside cell functions
|
|
189
|
+
"N803", # marimo passes class names (PascalCase) as cell arguments
|
|
190
|
+
"PLC0415", # imports inside cell functions are expected
|
|
191
|
+
"PLR0913", # cell functions can have many reactive dependencies
|
|
192
|
+
"PLR1711", # marimo cells need bare `return` to signal no output
|
|
193
|
+
"RUF001", # en-dashes intentional in markdown strings
|
|
194
|
+
"RUF002", # en-dashes intentional in docstrings
|
|
195
|
+
"ARG001", # unused args are reactive deps in marimo
|
|
196
|
+
"B018", # bare expressions display output in marimo cells
|
|
197
|
+
"C901", # complex cell functions are acceptable in notebooks
|
|
198
|
+
"ERA001", # commented-out code blocks are intentional usage recipes
|
|
199
|
+
"N806", # uppercase _CONSTANTS inside cell functions are intentional
|
|
200
|
+
"PERF401", # list.append in loops is fine in notebooks
|
|
201
|
+
"PLR0915", # large cells can exceed the statement limit
|
|
202
|
+
"PLR2004", # magic values are readable in notebook context
|
|
203
|
+
]
|
|
204
|
+
"tests/**" = [
|
|
205
|
+
"S101", # assert is required in pytest tests
|
|
206
|
+
"S311", # seeded PRNG is intentional for reproducible test fixtures
|
|
207
|
+
]
|