pyreclaim 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ """Feature engineering and transformation for RECLAIM :no-index:"""
@@ -0,0 +1,81 @@
+ import pandas as pd
+ import numpy as np
+
+ def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Engineer and transform features in a reservoir/catchment dataset.
+
+     Features are first engineered in raw (linear) space, then log transformations
+     are applied in a single pass to avoid double-logging.
+
+     Log-transformed columns are prefixed with ``log_`` to clearly indicate their state.
+
+     Required input columns (abbreviations):
+     - CA, DCA, OBC, HGT, RA, RP, FL
+     - SA_mean, SA_mean_clip, SA_std, SA_kurt
+     - PAI, MAI, MAO, I_std, O_std, MAR
+     - OEY, BY, VGF, VLF
+     - Land cover: LCAS, LCC, LCG, LCT, LCS, LCHV, LCM, LCSV, LCBS, LCSG, LCWB
+     - COAR, SAND, NSSC2_mean
+     """
+
+     # Work on a copy so the caller's DataFrame is not mutated
+     df = df.copy()
+
+     # Ensure required columns exist
+     required_cols = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
+                      'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt',
+                      'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
+                      'OEY', 'BY', 'VGF', 'VLF',
+                      'LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM', 'LCSV', 'LCBS', 'LCSG', 'LCWB',
+                      'COAR', 'SAND', 'NSSC2_mean']
+     for col in required_cols:
+         if col not in df.columns:
+             df[col] = np.nan
+
+     # -------------------------
+     # ENGINEER RAW FEATURES
+     # -------------------------
+     # Mean annual inflow volume (million m³/year) relative to built capacity (million m³)
+     inflow_cap_ratio = (df['MAI'] * 3600 * 24 * 365.25 / 1e6) / df['OBC']
+
+     feature_dict = {
+         "AGE": df["OEY"] - df["BY"],
+         "ROBC": df["OBC"] / df["CA"],
+         "NVGF": df["VGF"] - df["VLF"],
+         "GC": df["RA"] / (df["RP"] ** 2),
+         "rain_per_area": np.where(df["CA"] != 0, df["MAR"] / df["CA"], df["MAR"]),
+         "R_tree_bare": np.where(df["LCBS"] != 0, df["LCT"] / df["LCBS"], df["LCT"]),
+         "R_shrub_bare": np.where(df["LCBS"] != 0, df["LCS"] / df["LCBS"], df["LCS"]),
+         "R_coarse_sand": df["COAR"] / df["SAND"],
+         "RT": df["OBC"] * 1e6 / (df["MAI"] * 3600 * 24 * 365.25),
+         "TE": np.exp(-0.0079 * inflow_cap_ratio) * 100,
+         "ECLR": np.exp(-0.0079 * inflow_cap_ratio) * 100 * df["NSSC2_mean"] * inflow_cap_ratio,
+         "ESR": np.exp(-0.0079 * inflow_cap_ratio) * 100 * df["NSSC2_mean"] * inflow_cap_ratio * df["OBC"] / 100,
+         "rel_SA_mean_clip": df["SA_mean_clip"] / df["RA"],
+         "R_SA_cap": df["SA_mean_clip"] / df["OBC"],
+         "SIN": df["MAI"] * df["NSSC2_mean"],
+         "SOUT": df["MAO"] * df["NSSC2_mean"],
+     }
+
+     # Land cover: convert percentage shares to absolute areas.
+     # Overwrite the columns in place (rather than adding them to feature_dict)
+     # so the concat below does not create duplicate column labels.
+     lc_cols = ['LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM', 'LCSV', 'LCBS', 'LCSG', 'LCWB']
+     for col in lc_cols:
+         df[col] = df["CA"] * df[col] / 100
+
+     df = pd.concat([df, pd.DataFrame(feature_dict, index=df.index)], axis=1)
+
+     # -------------------------
+     # APPLY LOG TRANSFORMATIONS
+     # -------------------------
+     log_candidates = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
+                       'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt', 'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
+                       'rain_per_area', 'GC', 'TE', 'ECLR', 'SIN', 'SOUT'] + lc_cols
+
+     for col in log_candidates:
+         log_col = f'log_{col}'  # 'log_' prefix marks transformed columns and avoids double-logging
+         df[log_col] = np.log(df[col].clip(lower=1e-15))
+
+     return df
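
For orientation, a minimal usage sketch (not itself part of the packaged files), assuming engineer_and_transform_features has been imported from the installed package; the input values are illustrative, and any required column left out is filled with NaN by the function itself:

    import pandas as pd

    # Hypothetical single-reservoir input; omitted required columns become NaN.
    # Units are assumed from the formulas above (OBC in million m³, MAI in m³/s).
    raw = pd.DataFrame([{
        "CA": 1200.0,             # catchment area
        "OBC": 450.0,             # original built capacity
        "MAI": 35.0,              # mean annual inflow
        "OEY": 2015, "BY": 1960,  # observation end year, build year
    }])

    features = engineer_and_transform_features(raw)
    print(features[["AGE", "RT", "log_CA"]])  # engineered and log-prefixed columns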
@@ -0,0 +1 @@
+ """Dynamic features for RECLAIM :no-index:"""
@@ -0,0 +1,106 @@
+ import pandas as pd
+ import numpy as np
+ from typing import Dict, Sequence
+
+ from reclaim.dynamic_features.utils.rainfall import (
+     mean_annual_rainfall_mm,
+     mean_annual_rainy_days,
+ )
+ from reclaim.dynamic_features.utils.statistical_metrics import (
+     annual_mean,
+     annual_std,
+     coefficient_of_variation,
+     skewness,
+     kurtosis_val,
+ )
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+
+
+ def catchment_based_dynamic_features(
+     variable_info: Dict[str, Dict[str, str]],
+     observation_period: Sequence[int],
+ ) -> pd.DataFrame:
+     """
+     Compute dynamic catchment-based features for a single reservoir's catchment,
+     using precipitation, temperature, and wind speed time series.
+
+     Required time series keys (case-sensitive):
+
+     - "precip": Daily precipitation in mm
+     - "tmin": Daily minimum temperature in °C
+     - "tmax": Daily maximum temperature in °C
+     - "wind": Daily wind speed in m/s
+
+     Parameters
+     ----------
+     variable_info : dict
+         Dictionary of input series metadata.
+         Each key corresponds to a variable (precip, tmin, tmax, wind).
+         Each value is a dict with the following structure::
+
+             {
+                 "path": str,
+                 "time_column": str,
+                 "data_column": str
+             }
+
+     observation_period : sequence[int]
+         Two-element sequence [OSY, OEY] specifying the observation period to clip the series to.
+
+     Returns
+     -------
+     pd.DataFrame
+         A one-row DataFrame containing the computed catchment-based features.
+
+     Notes
+     -----
+     - Precipitation features are reported as mm/year (MAR) and day counts (rainy days).
+     - Wind statistics include mean, std, CV, skewness, and kurtosis.
+     - Temperature features are simple annual means (°C).
+     """
+
+     variable_features = {
+         "precip": {
+             "MAR": mean_annual_rainfall_mm,
+             "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
+             "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
+             "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
+         },
+         "tmin": {
+             "tmin_mean": annual_mean,
+         },
+         "tmax": {
+             "tmax_mean": annual_mean,
+         },
+         "wind": {
+             "wind_mean": annual_mean,
+             "wind_std": annual_std,
+             "wind_cv": coefficient_of_variation,
+             "wind_skew": skewness,
+             "wind_kurt": kurtosis_val,
+         },
+     }
+
+     results = {}
+
+     for var, feat_dict in variable_features.items():
+         if var not in variable_info:
+             # Fill with NaN if the variable was not provided
+             for feat in feat_dict.keys():
+                 results[feat] = np.nan
+             continue
+
+         path = variable_info[var]["path"]
+         time_col = variable_info[var]["time_column"]
+         data_col = variable_info[var]["data_column"]
+
+         for feat, func in feat_dict.items():
+             try:
+                 df_feat = compute_ts_aggregate(
+                     path, time_col, data_col, func, feat, observation_period
+                 )
+                 results[feat] = df_feat.iloc[0, 0]  # extract scalar
+             except Exception:
+                 results[feat] = np.nan
+
+     return pd.DataFrame([results])
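
A sketch of how catchment_based_dynamic_features might be called; the paths and column names are placeholders rather than files shipped with the package:

    variable_info = {
        "precip": {"path": "data/precip.csv", "time_column": "date", "data_column": "precip_mm"},
        "wind":   {"path": "data/wind.csv",   "time_column": "date", "data_column": "wind_ms"},
        # "tmin" and "tmax" omitted on purpose: their features come back as NaN
    }

    features = catchment_based_dynamic_features(variable_info, observation_period=[1990, 2020])
    print(features.T)  # one row: MAR, rainy-day counts, wind statistics; tmin/tmax NaN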
@@ -0,0 +1,147 @@
+ import pandas as pd
+ import numpy as np
+ from typing import Dict, Sequence
+
+ from reclaim.dynamic_features.utils.statistical_metrics import (
+     annual_mean,
+     annual_std,
+     skewness,
+     kurtosis_val,
+     coefficient_of_variation,
+     max_days_above_90th,
+     max_annual_persistence,
+ )
+ from reclaim.dynamic_features.utils.inflow_outflow import (
+     mean_annual_flow_m3_per_s,
+     mean_annual_flow_std_m3_per_s,
+     max_annual_flow_m3_per_s,
+     mean_annual_flow_variability,
+ )
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+
+
+ def reservoir_based_dynamic_features(
+     variable_info: Dict[str, Dict[str, str]],
+     observation_period: Sequence[int],
+ ) -> pd.DataFrame:
+     """
+     Compute dynamic reservoir features for a single reservoir using inflow, outflow,
+     surface area, evaporation, and sediment-related time series.
+
+     Required time series keys (case-sensitive):
+
+     - ``inflow``: Daily inflow in m³/day
+     - ``outflow``: Daily outflow in m³/day
+     - ``evaporation``: Daily evaporation in mm/day
+     - ``surface_area``: Reservoir surface area in km²
+     - ``nssc``: Normalized suspended sediment concentration variant 1 (red/green) (dimensionless)
+     - ``nssc2``: Normalized suspended sediment concentration variant 2 (near-infrared/red) (dimensionless)
+
+     Parameters
+     ----------
+     variable_info : dict
+         Dictionary of input series metadata.
+         Each key corresponds to a variable (``inflow``, ``outflow``, ``evaporation``, ``surface_area``, ``nssc``, ``nssc2``).
+         Each value is a dict with the following structure::
+
+             {
+                 "path": str,          # Path to the CSV file
+                 "time_column": str,   # Name of the datetime column
+                 "data_column": str    # Name of the variable column
+             }
+
+     Example::
+
+         {
+             "inflow": {"path": "data/inflow.csv", "time_column": "date", "data_column": "inflow (m3/d)"},
+             "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"}
+         }
+
+     observation_period : sequence[int]
+         Two-element sequence [OSY, OEY] specifying the observation period to clip the series to.
+
+     Returns
+     -------
+     pd.DataFrame
+         A one-row DataFrame containing the computed reservoir dynamic features.
+         Missing variables in ``variable_info`` will result in NaN values for their features.
+
+     Notes
+     -----
+     - All inflow/outflow metrics are converted to m³/s internally.
+     - Surface area statistics are reported both for the full record and the clipped period.
+     - NSSC statistics are dimensionless.
+     """
+
+     # Define which features depend on which variable
+     variable_features = {
+         "inflow": {
+             "MAI": mean_annual_flow_m3_per_s,
+             "PAI": max_annual_flow_m3_per_s,
+             "I_cv": mean_annual_flow_variability,
+             "I_std": mean_annual_flow_std_m3_per_s,
+             "I_above_90": max_days_above_90th,
+             "I_max_persis": max_annual_persistence,
+         },
+         "outflow": {
+             "MAO": mean_annual_flow_m3_per_s,
+             "O_std": mean_annual_flow_std_m3_per_s,
+             "O_cv": mean_annual_flow_variability,
+         },
+         "evaporation": {
+             "E_mean": annual_mean,
+             "E_std": annual_std,
+         },
+         "surface_area": {
+             "SA_mean": annual_mean,
+             "SA_std": annual_std,
+             "SA_cv": coefficient_of_variation,
+             "SA_skew": skewness,
+             "SA_kurt": kurtosis_val,
+             "SA_mean_clip": annual_mean,
+             "SA_above_90": max_days_above_90th,
+         },
+         "nssc": {
+             "NSSC1_mean": annual_mean,
+             "NSSC1_std": annual_std,
+             "NSSC1_cv": coefficient_of_variation,
+             "NSSC1_skew": skewness,
+             "NSSC1_kurt": kurtosis_val,
+         },
+         "nssc2": {
+             "NSSC2_mean": annual_mean,
+             "NSSC2_above_90": max_days_above_90th,
+             "NSSC2_max_persis": max_annual_persistence,
+         },
+     }
+
+     results = {}
+
+     # Loop through required variables
+     for var, feat_dict in variable_features.items():
+         if var not in variable_info:
+             # Fill with NaN if variable not provided
+             for feat in feat_dict.keys():
+                 results[feat] = np.nan
+             continue
+
+         path = variable_info[var]["path"]
+         time_col = variable_info[var]["time_column"]
+         data_col = variable_info[var]["data_column"]
+
+         # Some features are clipped to the observation period; others use the full record
+         for feat, func in feat_dict.items():
+             if var == "surface_area" and feat in ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt"]:
+                 obs_period = None  # full record
+             else:
+                 obs_period = observation_period
+
+             try:
+                 df_feat = compute_ts_aggregate(
+                     path, time_col, data_col, func, feat, obs_period
+                 )
+                 results[feat] = df_feat.iloc[0, 0]  # single value
+             except Exception:
+                 results[feat] = np.nan
+
+     return pd.DataFrame([results])
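
Because both builders return one-row DataFrames, a complete dynamic feature vector for a reservoir can be assembled by column-wise concatenation; res_info and cat_info below are hypothetical variable_info dicts of the documented shape:

    import pandas as pd

    res = reservoir_based_dynamic_features(res_info, observation_period=[1990, 2020])
    cat = catchment_based_dynamic_features(cat_info, observation_period=[1990, 2020])
    dynamic_features = pd.concat([res, cat], axis=1)  # one row, all dynamic features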
@@ -0,0 +1 @@
+ """Utility functions to generate dynamic features for RECLAIM input dataset :no-index:"""
@@ -0,0 +1,96 @@
+ import pandas as pd
+
+ SECONDS_PER_DAY = 24 * 3600
+ DAYS_PER_YEAR = 365.25
+ SECONDS_PER_YEAR = SECONDS_PER_DAY * DAYS_PER_YEAR
+
+ def mean_annual_flow_m3_per_s(ts: pd.Series) -> float:
+     """
+     Computes the mean annual flow in m³/s from a time series of daily flow in m³/day.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily flow values in m³/day, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean annual flow in m³/s.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Annual flow volumes (m³/year), averaged across years, then converted to m³/s
+     annual_totals = ts.groupby(ts.index.year).sum()
+     mean_annual = annual_totals.mean()
+     return mean_annual / SECONDS_PER_YEAR
+
+
+ def mean_annual_flow_std_m3_per_s(ts: pd.Series) -> float:
+     """
+     Computes the mean annual standard deviation of daily flow, expressed in m³/s.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily flow values in m³/day, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean annual standard deviation of flow in m³/s.
+     """
+     if ts.empty:
+         return float('nan')
+
+     annual_std = ts.groupby(ts.index.year).std()
+     annual_std_m3_per_s = annual_std / SECONDS_PER_DAY  # m³/day -> m³/s
+     return annual_std_m3_per_s.mean()
+
+
+ def max_annual_flow_m3_per_s(ts: pd.Series) -> float:
+     """
+     Computes the maximum annual mean flow in m³/s from a daily flow series,
+     i.e. the largest annual flow volume expressed as a mean rate.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily flow values in m³/day, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Maximum annual mean flow in m³/s.
+     """
+     if ts.empty:
+         return float('nan')
+
+     annual_totals = ts.groupby(ts.index.year).sum()
+     max_annual = annual_totals.max()
+     return max_annual / SECONDS_PER_YEAR
+
+
+ def mean_annual_flow_variability(ts: pd.Series) -> float:
+     """
+     Computes the mean annual variability (coefficient of variation) of daily flow.
+
+     CV = std / mean within each year.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily flow values in m³/day, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean coefficient of variation across all years (unitless).
+     """
+     if ts.empty:
+         return float('nan')
+
+     annual_stats = ts.groupby(ts.index.year).agg(['mean', 'std'])
+     annual_stats['cv'] = annual_stats['std'] / annual_stats['mean']
+     return annual_stats['cv'].mean()
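
As a quick sanity check on the unit handling, a constant flow of 86,400 m³/day should read as 1 m³/s; the small residual comes from dividing calendar-year totals by the fixed 365.25-day year:

    import pandas as pd

    idx = pd.date_range("2000-01-01", "2001-12-31", freq="D")
    ts = pd.Series(24 * 3600.0, index=idx)  # 86,400 m³/day == 1 m³/s

    print(mean_annual_flow_m3_per_s(ts))  # ≈ 1.0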
@@ -0,0 +1,49 @@
+ import pandas as pd
+
+ def mean_annual_rainfall_mm(ts: pd.Series) -> float:
+     """
+     Calculates the mean annual rainfall in mm from a time series of daily rainfall in mm.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily rainfall values in mm, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean annual rainfall in mm.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Total rainfall for each year (mm/year)
+     annual_totals_mm = ts.groupby(ts.index.year).sum()
+
+     # Return mean annual rainfall (mm/year)
+     return annual_totals_mm.mean()
+
+ def mean_annual_rainy_days(ts: pd.Series, threshold: float = 100.0) -> float:
+     """
+     Calculates the mean annual number of days on which daily rainfall exceeds a threshold.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily rainfall values in mm, indexed by datetime.
+     threshold : float, optional
+         Rainfall threshold in mm defining a "rainy day" (default is 100 mm).
+
+     Returns
+     -------
+     float
+         Mean annual number of days exceeding the threshold.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Count days above the threshold for each year
+     rainy_days_per_year = ts.groupby(ts.index.year).apply(lambda x: (x > threshold).sum())
+
+     # Return the mean number of rainy days across years
+     return rainy_days_per_year.mean()
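
A short self-contained example with synthetic rainfall (gamma-distributed daily totals are only a convenient stand-in for real data):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    idx = pd.date_range("2010-01-01", "2019-12-31", freq="D")
    rain = pd.Series(rng.gamma(shape=0.4, scale=8.0, size=len(idx)), index=idx)

    print(mean_annual_rainfall_mm(rain))               # mean total, mm/year
    print(mean_annual_rainy_days(rain, threshold=10))  # mean days/year above 10 mm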
@@ -0,0 +1,193 @@
+ import pandas as pd
+ import numpy as np
+
+ from scipy.stats import skew, kurtosis
+
+ def annual_mean(ts: pd.Series) -> float:
+     """
+     Calculates the mean of annual means from a time series.
+     The annual mean is computed for each year from daily values.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily values, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean of the annual mean values across all years.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Group by year and calculate the mean for each year
+     annual_means = ts.groupby(ts.index.year).mean()
+
+     # Return the mean of these annual means
+     return annual_means.mean()
+
+ def annual_std(ts: pd.Series) -> float:
+     """
+     Calculates the mean annual standard deviation from a time series.
+     The standard deviation is computed for each year from daily values.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily values, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Mean standard deviation across all years.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Group by year and compute the standard deviation for each year
+     annual_std_values = ts.groupby(ts.index.year).std()
+
+     # Return the mean standard deviation across years
+     return annual_std_values.mean()
+
+ # Skewness
+ def skewness(ts: pd.Series) -> float:
+     """
+     Calculates the skewness of the given time series.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Skewness of the time series (unitless).
+     """
+     if ts.empty:
+         return float('nan')
+     return skew(ts.dropna())
+
+ # Kurtosis
+ def kurtosis_val(ts: pd.Series) -> float:
+     """
+     Calculates the kurtosis of the given time series.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Kurtosis of the time series (Fisher's excess kurtosis, unitless).
+     """
+     if ts.empty:
+         return float('nan')
+     return kurtosis(ts.dropna(), fisher=True)
+
+ # Coefficient of variation
+ def coefficient_of_variation(ts: pd.Series) -> float:
+     """
+     Calculates the coefficient of variation (CV) of the given time series.
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Coefficient of variation (std/mean, unitless).
+     """
+     if ts.empty:
+         return float('nan')
+     mean_val = ts.mean()
+     if mean_val == 0:
+         return float('nan')
+     return ts.std() / mean_val
+
+
+ def max_days_above_90th(ts: pd.Series) -> float:
+     """
+     Calculates the maximum number of days per year on which the daily values
+     exceed the 90th percentile threshold (computed over the entire time series).
+
+     Parameters
+     ----------
+     ts : pd.Series
+         Time series of daily values, indexed by datetime.
+
+     Returns
+     -------
+     float
+         Maximum count of days above the 90th percentile across years.
+     """
+     if ts.empty:
+         return float('nan')
+
+     # Compute the global 90th percentile threshold
+     threshold = np.nanpercentile(ts, 90)
+
+     # Boolean series: True if value > threshold
+     above_threshold = ts > threshold
+
+     # Count exceedances per year
+     annual_counts = above_threshold.groupby(ts.index.year).sum()
+
+     # Return the maximum count across years
+     return float(annual_counts.max()) if not annual_counts.empty else float('nan')
+
+ def max_annual_persistence(timeseries: pd.Series, threshold: float = 1 / np.e, min_periods: int = 30) -> float:
+     """
+     Compute the persistence (decorrelation time) of a time series year by year,
+     returning the maximum across years.
+
+     Parameters
+     ----------
+     timeseries : pd.Series
+         A datetime-indexed series of daily values.
+     threshold : float, optional
+         Autocorrelation cutoff (default 1/e ≈ 0.368).
+     min_periods : int, optional
+         Minimum number of days required in a year to compute the autocorrelation.
+
+     Returns
+     -------
+     float
+         Maximum persistence (days) across all years, or NaN if no year has enough data.
+     """
+
+     results = {}
+
+     # Group by calendar year
+     for year, group in timeseries.groupby(timeseries.index.year):
+         group = group.dropna()
+         if len(group) < min_periods or group.std() == 0:
+             # Too few values, or a constant series with no decorrelation time
+             continue
+
+         # Normalize (remove mean, divide by std)
+         x = (group - group.mean()) / group.std()
+         n = len(x)
+
+         # Biased sample autocorrelation via np.correlate
+         acf = np.correlate(x, x, mode='full') / n
+         acf = acf[n - 1:]   # keep non-negative lags
+         acf = acf / acf[0]  # normalize so the lag-0 autocorrelation equals 1
+
+         # First lag at which the autocorrelation drops below the threshold
+         persistence = np.argmax(acf < threshold)
+         if persistence == 0:  # acf never drops below the threshold
+             persistence = len(acf) - 1
+
+         results[year] = int(persistence)
+
+     if not results:
+         return float('nan')
+
+     return float(max(results.values()))
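
To see the persistence metric behave as expected, note that an AR(1) process with coefficient 0.9 has a theoretical decorrelation time of -1/ln(0.9) ≈ 9.5 days, so max_annual_persistence should report a value of that order on a synthetic series:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    idx = pd.date_range("2000-01-01", "2004-12-31", freq="D")
    x = np.zeros(len(idx))
    for t in range(1, len(x)):
        x[t] = 0.9 * x[t - 1] + rng.normal()  # AR(1) with phi = 0.9

    print(max_annual_persistence(pd.Series(x, index=idx)))  # on the order of 10 days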