pyreclaim 0.3.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- pyreclaim-0.3.0.dist-info/METADATA
+++ pyreclaim-0.5.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyreclaim
-Version: 0.3.0
+Version: 0.5.0
 Summary: Reservoir Estimation of Capacity Loss using AI based Methods
 Author-email: Sanchit Minocha <msanchit@uw.edu>
 License: GNU GENERAL PUBLIC LICENSE
@@ -698,6 +698,12 @@ Requires-Dist: joblib
 Requires-Dist: xgboost
 Requires-Dist: lightgbm
 Requires-Dist: catboost
+Requires-Dist: openpyxl
+Requires-Dist: netcdf4
+Requires-Dist: dask
+Requires-Dist: rioxarray
+Requires-Dist: matplotlib
+Requires-Dist: tqdm
 Dynamic: license-file
 
 <div align="center">
@@ -737,7 +743,7 @@ pip install pyreclaim
 
 To generate features for reservoirs using the **RECLAIM** framework and the [`pyreclaim`](https://pypi.org/project/pyreclaim/) Python package, you will need the global datasets.
 
-You can download all required global datasets from the Open Science Framework (OSF):
+You can download all required global datasets from the Zenodo Repository:
 
 [Download Global Datasets](https://doi.org/10.5281/zenodo.17230533)
 
@@ -763,7 +769,7 @@ from reclaim.reclaim import Reclaim
 reservoir_static = {
     "obc": 150.0,
     "hgt": 45.0,
-    "mrb": "Ganges",
+    "mrb": 4030033640,
     "lat": 25.6,
     "lon": 81.9,
     "reservoir_polygon": reservoir_polygon,
--- /dev/null
+++ pyreclaim-0.5.0.dist-info/RECORD
@@ -0,0 +1,28 @@
+pyreclaim-0.5.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+reclaim/__init__.py,sha256=PnNGOrv5as3H8_WWLE3QqHR2LnqMgutXD8lDA-8PnCE,33
+reclaim/generate_features.py,sha256=td-XQtuYN8HiEkDV1yVBr8wSq9MAXVg0LbGY0U2ytZI,11300
+reclaim/reclaim.py,sha256=nUgE0gN4F76HGTIikR5GJZVV4KpL6HmUVuVUEhui6n8,20402
+reclaim/derived_features/__init__.py,sha256=boN0ilez7nbWdn4RKD4n7whlGrg_X-zxrqMv5RA6cmg,67
+reclaim/derived_features/feature_engineering_and_transformation.py,sha256=BzgITsvE1WTvVkf0-7ivg_-OaAnKWPfVz48GY9BWOzA,5689
+reclaim/dynamic_features/__init__.py,sha256=gjZmRYl9zCccSenz6f38EKntrV6ueT0_9MHRrjtw2Og,45
+reclaim/dynamic_features/catchment_dynamic.py,sha256=MKJwfZDepazDM6h5XnUxMFFhGSnkYSq7YBgxOourJ2s,3619
+reclaim/dynamic_features/reservoir_dynamic.py,sha256=IeNFwjkOVukNA4JeYJe-Eglf13KBypeFFDLYgacQtFg,5062
+reclaim/dynamic_features/utils/__init__.py,sha256=_mTSUeDu1jG5oBHvDKkMPx2A3xtIuSCpGpcMIURzmkU,89
+reclaim/dynamic_features/utils/catchment_meteorology.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+reclaim/dynamic_features/utils/inflow_outflow.py,sha256=vD5CTAR12gCd56y8zDpzKfzJvG_qU_uJFD5OAajyb-g,2384
+reclaim/dynamic_features/utils/rainfall.py,sha256=oWn1RbeHHmhZ--N7bx9UiJptRrPynYGXe1HTfPjHjWg,1387
+reclaim/dynamic_features/utils/statistical_metrics.py,sha256=qtiLtFCqtOa6sDkpCmZmGsS_kub4rkYqUKgSEODZea4,4881
+reclaim/dynamic_features/utils/ts_aggregate.py,sha256=veHO6h6YaO6nP7kN-sFKALqT-jMlLZ3FNCtfNfGC41c,3689
+reclaim/static_features/__init__.py,sha256=QRDAk3sgrGX-Oq8oRY5RESkWniVhcAPoQUa6jDJ59Zo,44
+reclaim/static_features/catchment_static.py,sha256=uCC-0r3R_awIWaEtkZ1t3jEXbUN4mXxKyWvuVbi_XWo,5933
+reclaim/static_features/reservoir_static.py,sha256=jo8s9Wjr8PIwQ4d8wHKqefFf3r2WQ9MqZpwcnz6848U,4652
+reclaim/static_features/utils/__init__.py,sha256=y-4GVIqARI0g8s2FVlTfx_4XprcPQ9HP_2nbhSw1NVA,88
+reclaim/static_features/utils/aec_shape.py,sha256=Tyew9tvPvNFpMOtPBf2GhdPBXoEjXr-VA2vyfY6ugYk,3270
+reclaim/static_features/utils/area_perimeter.py,sha256=yDLpxjyfGDQdL121w65X5oWLxCSwtSuPvJa7n_mtBl0,1338
+reclaim/static_features/utils/basin_names.py,sha256=-6PCuuaGqAP1KfhbrCM6DYCDQ5hkZp7nte099Ks6NjI,2024
+reclaim/static_features/utils/catchment_agreggate.py,sha256=Dvb7_vaeC-r69EXs2SOvDenhfWjp0-fbdmo492vqBoE,12083
+reclaim/static_features/utils/flow_length.py,sha256=BuYhQxss3x1TXlg6JxPvcV1wAkrCPEl6c0IEd935lfc,19889
+pyreclaim-0.5.0.dist-info/METADATA,sha256=qFIdVYDNazCXY-cQhLnNIfOZl-q54zIsTRyoegk8t8k,45510
+pyreclaim-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+pyreclaim-0.5.0.dist-info/top_level.txt,sha256=uO95g-XnD3UQLLqi3q30muhaC9VqO04IqCbwmfsGmW4,8
+pyreclaim-0.5.0.dist-info/RECORD,,
--- pyreclaim-0.3.0.dist-info/WHEEL
+++ pyreclaim-0.5.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
--- reclaim/derived_features/feature_engineering_and_transformation.py
+++ reclaim/derived_features/feature_engineering_and_transformation.py
@@ -1,6 +1,45 @@
 import pandas as pd
 import numpy as np
 
+ALL_FEATURES = [
+    'log_OBC', 'log_HGT', 'MRB', 'LAT', 'LON',
+    'log_RA', 'log_RP', 'log_FL',
+    'log_CA', 'log_DCA',
+
+    'AECS', 'AECC', 'AECI',
+
+    'log_LCAS', 'log_LCC',
+    'log_LCG', 'log_LCT', 'log_LCS',
+    'log_LCHV', 'log_LCM',
+    'log_LCSV', 'log_LCBS',
+    'log_LCSG', 'log_LCWB', 'DLC',
+
+    'COAR', 'SAND', 'SILT', 'CLAY', 'BULK',
+
+    'ELEV', 'SLOP', 'CURV', 'ASP', 'HILL',
+
+    'log_MAI', 'log_PAI', 'I_cv',
+    'log_I_std', 'I_above_90', 'I_max_persis',
+    'log_MAO', 'log_O_std', 'O_cv',
+    'E_mean', 'E_std',
+    'log_SA_mean', 'log_SA_std', 'SA_cv', 'SA_skew', 'log_SA_kurt',
+    'log_SA_mean_clip', 'SA_above_90',
+    'NSSC1_mean', 'NSSC1_std', 'NSSC1_cv', 'NSSC1_skew', 'NSSC1_kurt',
+    'NSSC2_mean', 'NSSC2_above_90', 'NSSC2_max_persis',
+
+    'log_MAR', '#_rain_above_10', '#_rain_above_50', '#_rain_above_100',
+    'tmin_mean', 'tmax_mean',
+    'wind_mean', 'wind_std', 'wind_cv', 'wind_skew', 'wind_kurt',
+
+    'AGE', 'log_ROBC', 'log_GC',
+    'NVGF',
+    'R_tree_bare', 'R_shrub_bare', 'R_coarse_sand',
+    'log_rel_SA_mean_clip', 'log_R_SA_cap',
+    'log_rain_per_area',
+    'log_TE', 'log_RT', 'log_ECLR', 'ESR',
+    'log_SIN', 'log_SOUT',
+]
+
 def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
     """
     Engineer and transform features in reservoir/catchment dataset.
@@ -54,22 +93,44 @@ def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
         "SOUT": df["MAO"] * df["NSSC2_mean"],
     }
 
-    # Land cover log-area features
-    lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
-    for col in lc_cols:
-        feature_dict[col] = df["CA"] * df[col] / 100
-
     df = pd.concat([df, pd.DataFrame(feature_dict)], axis=1)
 
+    # Land cover log-area features
+    lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
+    # for col in lc_cols:
+    #     df[col] = df["CA"] * df[col] / 100
+    # Doing calculation along with taking log as done in model training. results will slightly differ for cases where percentage of LC is 0.
+
     # -------------------------
     # APPLY LOG TRANSFORMATIONS
     # -------------------------
     log_candidates = ['CA','DCA','OBC','HGT','RA','RP','FL',
                       'SA_mean','SA_mean_clip','SA_std','SA_kurt','PAI','MAI','MAO','I_std','O_std','MAR',
-                      'rain_per_area','GC','TE','ECLR','SIN','SOUT'] + lc_cols
+                      'ROBC','rain_per_area','GC','TE','RT','ECLR','SIN','SOUT', 'rel_SA_mean_clip', 'R_SA_cap'] + lc_cols
 
     for col in log_candidates:
         log_col = f'log_{col}'  # add prefix to avoid double log
-        df[log_col] = np.log(df[col].clip(lower=1e-15))
+        try:
+            if col in ['ECLR','SIN','SOUT']:
+                # Land cover columns can be zero (upto 15 decimal places), clip at 1e-15
+                df[log_col] = np.log(df[col].clip(lower=1e-15))
+            elif col in ['rain_per_area']:
+                # Rain per area can be zero (upto 10 decimal places), clip at 1e-10
+                df[log_col] = np.log(df[col].clip(lower=1e-10))
+            elif col in lc_cols:
+                df[log_col] = np.log(df["CA"].clip(lower=1e-6)) + np.log(df[col].clip(lower=1e-6)) - np.log(100)
+            else:
+                # All other columns can be zero (upto 6 decimal places), clip at 1e-6
+                df[log_col] = np.log(df[col].clip(lower=1e-6))
+        except Exception as e:
+            raise ValueError(f"Error applying log transform to column '{col}': {e}")
+
+    # Process DLc as categorical column
+    df['DLC'] = df['DLC'].astype(int).fillna(0)
+
+    # Add empty columns for any missing features
+    for feature in ALL_FEATURES:
+        if feature not in df.columns:
+            df[feature] = np.nan
 
     return df
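
The land-cover change above replaces `log(CA * pct / 100)`, clipped at 1e-15, with the sum `log(CA) + log(pct) - log(100)`, each term clipped at 1e-6 to match model training. A small sketch with illustrative values only, showing why the two forms diverge exactly when a land-cover percentage is 0 (the case the in-code comment calls out):

```python
import numpy as np
import pandas as pd

ca = pd.Series([1000.0])   # catchment area
pct = pd.Series([0.0])     # land-cover percentage of zero

# 0.3.0 behaviour: product first, then log of the clipped product
old = np.log((ca * pct / 100).clip(lower=1e-15))
# log(1e-15) ≈ -34.5

# 0.5.0 behaviour: sum of clipped logs, as done in model training
new = np.log(ca.clip(lower=1e-6)) + np.log(pct.clip(lower=1e-6)) - np.log(100)
# log(1000) + log(1e-6) - log(100) ≈ 6.9 - 13.8 - 4.6 ≈ -11.5
```

For nonzero percentages the two expressions agree; only the clipping floor behaves differently.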
--- reclaim/dynamic_features/catchment_dynamic.py
+++ reclaim/dynamic_features/catchment_dynamic.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from typing import Dict, Sequence
+from typing import Dict, Sequence, List
 
 from reclaim.dynamic_features.utils.rainfall import (
     mean_annual_rainfall_mm,
@@ -13,12 +13,34 @@ from reclaim.dynamic_features.utils.statistical_metrics import (
     skewness,
     kurtosis_val,
 )
-from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
+
+VARIABLE_FEATURES = {
+    "precip": {
+        "MAR": mean_annual_rainfall_mm,
+        "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
+        "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
+        "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
+    },
+    "tmin": {
+        "tmin_mean": annual_mean,
+    },
+    "tmax": {
+        "tmax_mean": annual_mean,
+    },
+    "wind": {
+        "wind_mean": annual_mean,
+        "wind_std": annual_std,
+        "wind_cv": coefficient_of_variation,
+        "wind_skew": skewness,
+        "wind_kurt": kurtosis_val,
+    },
+}
 
 
 def catchment_based_dynamic_features(
     variable_info: Dict[str, Dict[str, str]],
-    observation_period: Sequence[int],
+    observation_intervals: List[Sequence[int]],
 ) -> pd.DataFrame:
     """
     Compute dynamic catchment-based features for a single reservoir's catchment,
@@ -41,63 +63,47 @@
             "time_column": str,
             "data_column": str
         }
-
-    observation_period : sequence[int]
-        Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
+
+    observation_intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
 
     Returns
     -------
     pd.DataFrame
-        A one-row DataFrame containing the computed catchment-based features.
-
+        A DataFrame containing as many rows as there are observation intervals and columns corresponding to the computed catchment-based features.
+        Missing variables in ``variable_info`` will result in NaN values for their features.
     Notes
     -----
     - Precipitation features are reported as mm/year (for MAR) and counts (rainy days).
     - Wind statistics include mean, std, CV, skewness, kurtosis.
     - Temperature features are simple annual means (°C).
     """
-
-    variable_features = {
-        "precip": {
-            "MAR": mean_annual_rainfall_mm,
-            "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
-            "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
-            "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
-        },
-        "tmin": {
-            "tmin_mean": annual_mean,
-        },
-        "tmax": {
-            "tmax_mean": annual_mean,
-        },
-        "wind": {
-            "wind_mean": annual_mean,
-            "wind_std": annual_std,
-            "wind_cv": coefficient_of_variation,
-            "wind_skew": skewness,
-            "wind_kurt": kurtosis_val,
-        },
-    }
 
-    results = {}
+    all_vars = []
 
-    for var, feat_dict in variable_features.items():
+    for var, feat_dict in VARIABLE_FEATURES.items():
         if var not in variable_info:
-            for feat in feat_dict.keys():
-                results[feat] = np.nan
+            all_vars.append(
+                pd.DataFrame(np.nan, index=range(len(observation_intervals)),
+                             columns=feat_dict.keys())
+            )
             continue
 
         path = variable_info[var]["path"]
         time_col = variable_info[var]["time_column"]
         data_col = variable_info[var]["data_column"]
+
+        try:
+            df_var = compute_ts_aggregates(
+                ts_csv_path=path,
+                time_column=time_col,
+                value_column=data_col,
+                feature_functions=feat_dict,
+                intervals=observation_intervals,
+            )
+            all_vars.append(df_var)
+        except Exception:
+            df_var = pd.DataFrame()
+            all_vars.append(df_var)
 
-        for feat, func in feat_dict.items():
-            try:
-                df_feat = compute_ts_aggregate(
-                    path, time_col, data_col, func, feat, observation_period
-                )
-                results[feat] = df_feat.iloc[0, 0]  # extract scalar
-            except Exception:
-                results[feat] = np.nan
-
-    return pd.DataFrame([results])
+    return pd.concat(all_vars, axis=1)
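
With this change, `catchment_based_dynamic_features` returns one feature row per interval instead of a single row for one observation period. A hedged usage sketch (paths and column names are placeholders in the style of the package docstrings, not values from the diff):

```python
from reclaim.dynamic_features.catchment_dynamic import catchment_based_dynamic_features

variable_info = {
    # only "precip" provided; tmin/tmax/wind features come back as NaN columns
    "precip": {"path": "data/precip.csv", "time_column": "date", "data_column": "precip (mm/d)"},
}

# two intervals -> a two-row DataFrame of catchment features
features = catchment_based_dynamic_features(
    variable_info=variable_info,
    observation_intervals=[[2000, 2009], [2010, 2019]],
)
```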
--- reclaim/dynamic_features/reservoir_dynamic.py
+++ reclaim/dynamic_features/reservoir_dynamic.py
@@ -1,7 +1,7 @@
 import os
 import pandas as pd
 import numpy as np
-from typing import Dict, Sequence, Union, Callable
+from typing import Dict, Sequence, Union, Callable, List
 
 from reclaim.dynamic_features.utils.statistical_metrics import (
     annual_mean,
@@ -18,11 +18,53 @@ from reclaim.dynamic_features.utils.inflow_outflow import (
     max_annual_flow_m3_per_s,
     mean_annual_flow_variability
 )
-from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
+from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
+
+# Define which features depend on which variable
+VARIABLE_FEATURES = {
+    "inflow": {
+        "MAI": mean_annual_flow_m3_per_s,
+        "PAI": max_annual_flow_m3_per_s,
+        "I_cv": mean_annual_flow_variability,
+        "I_std": mean_annual_flow_std_m3_per_s,
+        "I_above_90": max_days_above_90th,
+        "I_max_persis": max_annual_persistence,
+    },
+    "outflow": {
+        "MAO": mean_annual_flow_m3_per_s,
+        "O_std": mean_annual_flow_std_m3_per_s,
+        "O_cv": mean_annual_flow_variability,
+    },
+    "evaporation": {
+        "E_mean": annual_mean,
+        "E_std": annual_std,
+    },
+    "surface_area": {
+        "SA_mean": annual_mean,
+        "SA_std": annual_std,
+        "SA_cv": coefficient_of_variation,
+        "SA_skew": skewness,
+        "SA_kurt": kurtosis_val,
+        "SA_mean_clip": annual_mean,
+        "SA_above_90": max_days_above_90th,
+    },
+    "nssc": {
+        "NSSC1_mean": annual_mean,
+        "NSSC1_std": annual_std,
+        "NSSC1_cv": coefficient_of_variation,
+        "NSSC1_skew": skewness,
+        "NSSC1_kurt": kurtosis_val,
+    },
+    "nssc2": {
+        "NSSC2_mean": annual_mean,
+        "NSSC2_above_90": max_days_above_90th,
+        "NSSC2_max_persis": max_annual_persistence,
+    },
+}
 
 def reservoir_based_dynamic_features(
     variable_info: Dict[str, Dict[str, str]],
-    observation_period: Sequence[int],
+    observation_intervals: List[Sequence[int]]
 ) -> pd.DataFrame:
     """
     Compute dynamic reservoir features for a single reservoir using inflow, outflow,
@@ -57,13 +99,13 @@
             "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"}
         }
 
-    observation_period : sequence[int]
-        Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
+    observation_intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
 
     Returns
     -------
     pd.DataFrame
-        A one-row DataFrame containing the computed reservoir dynamic features.
+        A DataFrame containing as many rows as ``observation_intervals`` and columns corresponding to the computed reservoir dynamic features.
         Missing variables in ``variable_info`` will result in NaN values for their features.
 
     Notes
@@ -74,75 +116,31 @@
     - If a variable is missing in ``variable_info``, its corresponding features are NaN.
     """
 
-    # Define which features depend on which variable
-    variable_features = {
-        "inflow": {
-            "MAI": mean_annual_flow_m3_per_s,
-            "PAI": max_annual_flow_m3_per_s,
-            "I_cv": mean_annual_flow_variability,
-            "I_std": mean_annual_flow_std_m3_per_s,
-            "I_above_90": max_days_above_90th,
-            "I_max_persis": max_annual_persistence,
-        },
-        "outflow": {
-            "MAO": mean_annual_flow_m3_per_s,
-            "O_std": mean_annual_flow_std_m3_per_s,
-            "O_cv": mean_annual_flow_variability,
-        },
-        "evaporation": {
-            "E_mean": annual_mean,
-            "E_std": annual_std,
-        },
-        "surface_area": {
-            "SA_mean": annual_mean,
-            "SA_std": annual_std,
-            "SA_cv": coefficient_of_variation,
-            "SA_skew": skewness,
-            "SA_kurt": kurtosis_val,
-            "SA_mean_clip": annual_mean,
-            "SA_above_90": max_days_above_90th,
-        },
-        "nssc": {
-            "NSSC1_mean": annual_mean,
-            "NSSC1_std": annual_std,
-            "NSSC1_cv": coefficient_of_variation,
-            "NSSC1_skew": skewness,
-            "NSSC1_kurt": kurtosis_val,
-        },
-        "nssc2": {
-            "NSSC2_mean": annual_mean,
-            "NSSC2_above_90": max_days_above_90th,
-            "NSSC2_max_persis": max_annual_persistence,
-        },
-    }
-
-    results = {}
+    all_vars = []
 
     # Loop through required variables
-    for var, feat_dict in variable_features.items():
+    for var, feat_dict in VARIABLE_FEATURES.items():
         if var not in variable_info:
-            # Fill with NaN if variable not provided
-            for feat in feat_dict.keys():
-                results[feat] = np.nan
+            all_vars.append(
+                pd.DataFrame(np.nan, index=range(len(observation_intervals)),
+                             columns=feat_dict.keys())
+            )
             continue
 
         path = variable_info[var]["path"]
         time_col = variable_info[var]["time_column"]
         data_col = variable_info[var]["data_column"]
-
-        # Some features require clipping, others use full record
-        for feat, func in feat_dict.items():
-            if var == "surface_area" and feat in ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt"]:
-                obs_period = None  # full record
-            else:
-                obs_period = observation_period
-
-            try:
-                df_feat = compute_ts_aggregate(
-                    path, time_col, data_col, func, feat, obs_period
-                )
-                results[feat] = df_feat.iloc[0, 0]  # single value
-            except Exception:
-                results[feat] = np.nan
-
-    return pd.DataFrame([results])
+        try:
+            df_var = compute_ts_aggregates(
+                ts_csv_path=path,
+                time_column=time_col,
+                value_column=data_col,
+                feature_functions=feat_dict,
+                intervals=observation_intervals,
+            )
+            all_vars.append(df_var)
+        except Exception:
+            df_var = pd.DataFrame()
+            all_vars.append(df_var)
+
+    return pd.concat(all_vars, axis=1)
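
Note that the full-record handling 0.3.0 implemented inline (the `obs_period = None` branch for `SA_mean` through `SA_kurt`) now lives in `FULL_RECORD_FEATURES` in `ts_aggregate.py` below, where those features, plus `NSSC2_max_persis`, are computed over the series up to each interval's end year. A hedged sketch of the new call, reusing the path and column names from the function's own docstring example (everything else is a placeholder):

```python
from reclaim.dynamic_features.reservoir_dynamic import reservoir_based_dynamic_features
from reclaim.dynamic_features.utils.ts_aggregate import build_intervals

variable_info = {
    "inflow": {"path": "data/inflow.csv", "time_column": "date", "data_column": "inflow (m3/d)"},
    "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"},
}

# one feature row per interval; evaporation/surface_area/nssc features are NaN here
features = reservoir_based_dynamic_features(
    variable_info=variable_info,
    observation_intervals=build_intervals(1990, 2020, 5),
)
```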
--- reclaim/dynamic_features/utils/ts_aggregate.py
+++ reclaim/dynamic_features/utils/ts_aggregate.py
@@ -1,13 +1,40 @@
 import pandas as pd
-from typing import Callable, Union, Sequence
+import numpy as np
+from pathlib import Path
+from typing import Callable, Union, Sequence, List, Dict
 
-def compute_ts_aggregate(
+FULL_RECORD_FEATURES = ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt", "NSSC2_max_persis"]
+
+def build_intervals(start_year, end_year, time_interval):
+    total_years = end_year - start_year + 1
+
+    # Case 1: Entire window shorter than interval
+    if total_years <= time_interval:
+        return [[start_year, end_year]]
+
+    remainder = total_years % time_interval
+    outputs = []
+
+    # First interval absorbs remainder (if any)
+    first_len = time_interval + remainder if remainder != 0 else time_interval
+    first_end = min(start_year + first_len - 1, end_year)
+    outputs.append([start_year, first_end])
+
+    # Remaining intervals
+    current_start = first_end + 1
+    while current_start <= end_year:
+        current_end = current_start + time_interval - 1
+        outputs.append([current_start, min(current_end, end_year)])
+        current_start = current_end + 1
+
+    return outputs
+
+def compute_ts_aggregates(
     ts_csv_path: str,
     time_column: str,
     value_column: str,
-    feature_function: Callable,
-    feature_name: str,
-    observation_period: Union[Sequence[int], None] = None
+    feature_functions: Dict[str, Callable],
+    intervals: List[Sequence[int]],
 ) -> pd.DataFrame:
     """
     Compute an aggregate feature from a user-provided time series CSV for a single reservoir.
@@ -20,44 +47,58 @@ def compute_ts_aggregate(
         Name of the column representing dates/timestamps.
     value_column : str
         Name of the column representing the variable values.
-    feature_function : Callable
-        Function that takes a pd.Series (the time series) and returns a single value.
-    feature_name : str
-        Name of the column to store the computed feature in the returned DataFrame.
-    observation_period : list or tuple of two ints, optional
-        [start_year, end_year] to clip the time series. If None, no clipping is applied.
+    feature_functions : Dict[str, Callable]
+        Dictionary where keys are feature names (column names for output DataFrame) and values are functions that take a pd.Series and return a single value.
+    intervals : list of list of int
+        List of [start_year, end_year] intervals to compute features over.
 
     Returns
     -------
     pd.DataFrame
         A single-row DataFrame containing the computed feature with the specified column name.
     """
-
-    # Load the CSV
+    # --- Read CSV ONCE ---
+    # Check if path exists
+    if not Path(ts_csv_path).is_file():
+        raise FileNotFoundError(f"CSV file not found at path: {ts_csv_path}")
+
     df = pd.read_csv(ts_csv_path)
     if df.empty:
         raise ValueError(f"CSV at {ts_csv_path} is empty.")
+
+    # Ensure columns exist
+    if time_column not in df.columns:
+        raise ValueError(f"Time column '{time_column}' not found in CSV.")
+    if value_column not in df.columns:
+        raise ValueError(f"Value column '{value_column}' not found in CSV.")
 
     # Ensure time column is datetime
     df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
     if df[time_column].isna().all():
         raise ValueError(f"Time column '{time_column}' could not be converted to datetime.")
 
-    # Set index
-    ts = df.set_index(time_column)[value_column]
+    # Set index
+    ts = df.set_index(time_column)[value_column].sort_index()
+
+    if ts.empty:
+        raise ValueError("Time series is completely empty. Please check the data or avoid providing this variable.")
+
+    rows = []
 
-    # Clip to observation period if provided
-    if observation_period is not None:
-        start_year, end_year = observation_period
-        ts = ts[(ts.index.year >= start_year) & (ts.index.year <= end_year)]
+    for osy, oey in intervals:
+        ts_clip = ts[(ts.index.year >= osy) & (ts.index.year <= oey)]
+        ts_till_end_year = ts[ts.index.year <= oey]
 
-    # Remove NaNs
-    ts_clean = ts.dropna()
-    if ts_clean.empty:
-        raise ValueError("Time series has no valid data after clipping/removing NaNs.")
+        row = {}
+        for feat, func in feature_functions.items():
+            try:
+                if feat in FULL_RECORD_FEATURES:
+                    row[feat] = func(ts_till_end_year) if not ts_till_end_year.empty else np.nan
+                else:
+                    row[feat] = func(ts_clip) if not ts_clip.empty else np.nan
+            except Exception:
+                row[feat] = np.nan
 
-    # Apply user-defined feature function
-    feature_value = feature_function(ts_clean)
+        rows.append(row)
 
-    # Return as single-row DataFrame with user-specified column name
-    return pd.DataFrame({feature_name: [feature_value]})
+    return pd.DataFrame(rows)
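
`build_intervals` is new in 0.5.0: it splits an observation window into fixed-length chunks, with the first chunk absorbing any remainder. A quick illustration of its output, following the logic in the hunk above:

```python
from reclaim.dynamic_features.utils.ts_aggregate import build_intervals

# 13 years split into 5-year chunks: the first chunk absorbs the 3-year remainder
build_intervals(2000, 2012, 5)   # -> [[2000, 2007], [2008, 2012]]

# a window shorter than the interval collapses to a single pair
build_intervals(2018, 2020, 5)   # -> [[2018, 2020]]
```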