pyreclaim 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/PKG-INFO +3 -1
  2. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/pyproject.toml +4 -2
  3. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/setup.py +1 -1
  4. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/PKG-INFO +3 -1
  5. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/SOURCES.txt +1 -0
  6. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/requires.txt +2 -0
  7. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/feature_engineering_and_transformation.py +59 -4
  8. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/catchment_dynamic.py +50 -44
  9. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/reservoir_dynamic.py +68 -71
  10. pyreclaim-0.5.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +104 -0
  11. pyreclaim-0.5.0/src/reclaim/generate_features.py +297 -0
  12. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/reclaim.py +9 -0
  13. pyreclaim-0.5.0/src/reclaim/static_features/catchment_static.py +202 -0
  14. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/reservoir_static.py +41 -12
  15. pyreclaim-0.5.0/src/reclaim/static_features/utils/basin_names.py +78 -0
  16. pyreclaim-0.5.0/src/reclaim/static_features/utils/catchment_agreggate.py +355 -0
  17. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/flow_length.py +65 -1
  18. pyreclaim-0.4.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +0 -69
  19. pyreclaim-0.4.0/src/reclaim/generate_features.py +0 -158
  20. pyreclaim-0.4.0/src/reclaim/static_features/catchment_static.py +0 -127
  21. pyreclaim-0.4.0/src/reclaim/static_features/utils/catchment_agreggate.py +0 -148
  22. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/LICENSE +0 -0
  23. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/README.md +0 -0
  24. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/setup.cfg +0 -0
  25. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/dependency_links.txt +0 -0
  26. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/top_level.txt +0 -0
  27. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/__init__.py +0 -0
  28. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/__init__.py +0 -0
  29. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/__init__.py +0 -0
  30. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/__init__.py +0 -0
  31. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/catchment_meteorology.py +0 -0
  32. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/inflow_outflow.py +0 -0
  33. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/rainfall.py +0 -0
  34. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/statistical_metrics.py +0 -0
  35. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/__init__.py +0 -0
  36. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/__init__.py +0 -0
  37. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/aec_shape.py +0 -0
  38. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/area_perimeter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyreclaim
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Reservoir Estimation of Capacity Loss using AI based Methods
5
5
  Author-email: Sanchit Minocha <msanchit@uw.edu>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -702,6 +702,8 @@ Requires-Dist: openpyxl
702
702
  Requires-Dist: netcdf4
703
703
  Requires-Dist: dask
704
704
  Requires-Dist: rioxarray
705
+ Requires-Dist: matplotlib
706
+ Requires-Dist: tqdm
705
707
  Dynamic: license-file
706
708
 
707
709
  <div align="center">
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "pyreclaim"
7
- version = "v0.4.0"
7
+ version = "v0.5.0"
8
8
  authors = [
9
9
  { name="Sanchit Minocha", email="msanchit@uw.edu" },
10
10
  ]
@@ -33,7 +33,9 @@ dependencies = [
33
33
  "openpyxl",
34
34
  "netcdf4",
35
35
  "dask",
36
- "rioxarray"
36
+ "rioxarray",
37
+ "matplotlib",
38
+ "tqdm"
37
39
  ]
38
40
 
39
41
  [project.urls]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name = "pyreclaim",
6
- version = "v0.4.0",
6
+ version = "v0.5.0",
7
7
  license = "GPL-3.0",
8
8
  package_dir = {"": "src"}
9
9
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyreclaim
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Reservoir Estimation of Capacity Loss using AI based Methods
5
5
  Author-email: Sanchit Minocha <msanchit@uw.edu>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -702,6 +702,8 @@ Requires-Dist: openpyxl
702
702
  Requires-Dist: netcdf4
703
703
  Requires-Dist: dask
704
704
  Requires-Dist: rioxarray
705
+ Requires-Dist: matplotlib
706
+ Requires-Dist: tqdm
705
707
  Dynamic: license-file
706
708
 
707
709
  <div align="center">
@@ -27,5 +27,6 @@ src/reclaim/static_features/reservoir_static.py
27
27
  src/reclaim/static_features/utils/__init__.py
28
28
  src/reclaim/static_features/utils/aec_shape.py
29
29
  src/reclaim/static_features/utils/area_perimeter.py
30
+ src/reclaim/static_features/utils/basin_names.py
30
31
  src/reclaim/static_features/utils/catchment_agreggate.py
31
32
  src/reclaim/static_features/utils/flow_length.py
@@ -15,3 +15,5 @@ openpyxl
15
15
  netcdf4
16
16
  dask
17
17
  rioxarray
18
+ matplotlib
19
+ tqdm
@@ -1,6 +1,45 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
 
4
+ ALL_FEATURES = [
5
+ 'log_OBC', 'log_HGT', 'MRB', 'LAT', 'LON',
6
+ 'log_RA', 'log_RP', 'log_FL',
7
+ 'log_CA', 'log_DCA',
8
+
9
+ 'AECS', 'AECC','AECI',
10
+
11
+ 'log_LCAS', 'log_LCC',
12
+ 'log_LCG', 'log_LCT', 'log_LCS',
13
+ 'log_LCHV', 'log_LCM',
14
+ 'log_LCSV','log_LCBS',
15
+ 'log_LCSG', 'log_LCWB','DLC',
16
+
17
+ 'COAR', 'SAND', 'SILT', 'CLAY', 'BULK',
18
+
19
+ 'ELEV', 'SLOP', 'CURV', 'ASP', 'HILL',
20
+
21
+ 'log_MAI', 'log_PAI', 'I_cv',
22
+ 'log_I_std','I_above_90', 'I_max_persis',
23
+ 'log_MAO', 'log_O_std', 'O_cv',
24
+ 'E_mean', 'E_std',
25
+ 'log_SA_mean', 'log_SA_std', 'SA_cv', 'SA_skew', 'log_SA_kurt',
26
+ 'log_SA_mean_clip', 'SA_above_90',
27
+ 'NSSC1_mean', 'NSSC1_std', 'NSSC1_cv', 'NSSC1_skew', 'NSSC1_kurt',
28
+ 'NSSC2_mean', 'NSSC2_above_90', 'NSSC2_max_persis',
29
+
30
+ 'log_MAR', '#_rain_above_10', '#_rain_above_50', '#_rain_above_100',
31
+ 'tmin_mean', 'tmax_mean',
32
+ 'wind_mean', 'wind_std', 'wind_cv', 'wind_skew', 'wind_kurt',
33
+
34
+ 'AGE', 'log_ROBC', 'log_GC',
35
+ 'NVGF',
36
+ 'R_tree_bare', 'R_shrub_bare', 'R_coarse_sand',
37
+ 'log_rel_SA_mean_clip', 'log_R_SA_cap',
38
+ 'log_rain_per_area',
39
+ 'log_TE', 'log_RT', 'log_ECLR', 'ESR',
40
+ 'log_SIN', 'log_SOUT',
41
+ ]
42
+
4
43
  def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
5
44
  """
6
45
  Engineer and transform features in reservoir/catchment dataset.
@@ -58,24 +97,40 @@ def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
58
97
 
59
98
  # Land cover log-area features
60
99
  lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
61
- for col in lc_cols:
62
- df[col] = df["CA"] * df[col] / 100
100
+ # for col in lc_cols:
101
+ # df[col] = df["CA"] * df[col] / 100
102
+ # The calculation is done together with taking the log, as in model training. Results will slightly differ for cases where the percentage of LC is 0.
63
103
 
64
104
  # -------------------------
65
105
  # APPLY LOG TRANSFORMATIONS
66
106
  # -------------------------
67
107
  log_candidates = ['CA','DCA','OBC','HGT','RA','RP','FL',
68
108
  'SA_mean','SA_mean_clip','SA_std','SA_kurt','PAI','MAI','MAO','I_std','O_std','MAR',
69
- 'ROBC','rain_per_area','GC','TE','RT','ECLR','ESR','SIN','SOUT'] + lc_cols
109
+ 'ROBC','rain_per_area','GC','TE','RT','ECLR','SIN','SOUT', 'rel_SA_mean_clip', 'R_SA_cap'] + lc_cols
70
110
 
71
111
  for col in log_candidates:
72
112
  log_col = f'log_{col}' # add prefix to avoid double log
73
113
  try:
74
- df[log_col] = np.log(df[col].clip(lower=1e-15))
114
+ if col in ['ECLR','SIN','SOUT']:
115
+ # Land cover columns can be zero (upto 15 decimal places), clip at 1e-15
116
+ df[log_col] = np.log(df[col].clip(lower=1e-15))
117
+ elif col in ['rain_per_area']:
118
+ # Rain per area can be zero (upto 10 decimal places), clip at 1e-10
119
+ df[log_col] = np.log(df[col].clip(lower=1e-10))
120
+ elif col in lc_cols:
121
+ df[log_col] = np.log(df["CA"].clip(lower=1e-6)) + np.log(df[col].clip(lower=1e-6)) - np.log(100)
122
+ else:
123
+ # All other columns can be zero (upto 6 decimal places), clip at 1e-6
124
+ df[log_col] = np.log(df[col].clip(lower=1e-6))
75
125
  except Exception as e:
76
126
  raise ValueError(f"Error applying log transform to column '{col}': {e}")
77
127
 
78
128
  # Process DLc as categorical column
79
129
  df['DLC'] = df['DLC'].astype(int).fillna(0)
80
130
 
131
+ # Add empty columns for any missing features
132
+ for feature in ALL_FEATURES:
133
+ if feature not in df.columns:
134
+ df[feature] = np.nan
135
+
81
136
  return df
@@ -1,6 +1,6 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
- from typing import Dict, Sequence
3
+ from typing import Dict, Sequence, List
4
4
 
5
5
  from reclaim.dynamic_features.utils.rainfall import (
6
6
  mean_annual_rainfall_mm,
@@ -13,12 +13,34 @@ from reclaim.dynamic_features.utils.statistical_metrics import (
13
13
  skewness,
14
14
  kurtosis_val,
15
15
  )
16
- from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
16
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
17
+
18
+ VARIABLE_FEATURES = {
19
+ "precip": {
20
+ "MAR": mean_annual_rainfall_mm,
21
+ "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
22
+ "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
23
+ "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
24
+ },
25
+ "tmin": {
26
+ "tmin_mean": annual_mean,
27
+ },
28
+ "tmax": {
29
+ "tmax_mean": annual_mean,
30
+ },
31
+ "wind": {
32
+ "wind_mean": annual_mean,
33
+ "wind_std": annual_std,
34
+ "wind_cv": coefficient_of_variation,
35
+ "wind_skew": skewness,
36
+ "wind_kurt": kurtosis_val,
37
+ },
38
+ }
17
39
 
18
40
 
19
41
  def catchment_based_dynamic_features(
20
42
  variable_info: Dict[str, Dict[str, str]],
21
- observation_period: Sequence[int],
43
+ observation_intervals: List[Sequence[int]],
22
44
  ) -> pd.DataFrame:
23
45
  """
24
46
  Compute dynamic catchment-based features for a single reservoir's catchment,
@@ -41,63 +63,47 @@ def catchment_based_dynamic_features(
41
63
  "time_column": str,
42
64
  "data_column": str
43
65
  }
44
-
45
- observation_period : sequence[int]
46
- Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
66
+
67
+ observation_intervals : list of list of int
68
+ List of [start_year, end_year] intervals to compute features over.
47
69
 
48
70
  Returns
49
71
  -------
50
72
  pd.DataFrame
51
- A one-row DataFrame containing the computed catchment-based features.
52
-
73
+ A DataFrame containing as many rows as there are observation intervals and columns corresponding to the computed catchment-based features.
74
+ Missing variables in ``variable_info`` will result in NaN values for their features.
53
75
  Notes
54
76
  -----
55
77
  - Precipitation features are reported as mm/year (for MAR) and counts (rainy days).
56
78
  - Wind statistics include mean, std, CV, skewness, kurtosis.
57
79
  - Temperature features are simple annual means (°C).
58
80
  """
59
-
60
- variable_features = {
61
- "precip": {
62
- "MAR": mean_annual_rainfall_mm,
63
- "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
64
- "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
65
- "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
66
- },
67
- "tmin": {
68
- "tmin_mean": annual_mean,
69
- },
70
- "tmax": {
71
- "tmax_mean": annual_mean,
72
- },
73
- "wind": {
74
- "wind_mean": annual_mean,
75
- "wind_std": annual_std,
76
- "wind_cv": coefficient_of_variation,
77
- "wind_skew": skewness,
78
- "wind_kurt": kurtosis_val,
79
- },
80
- }
81
81
 
82
- results = {}
82
+ all_vars = []
83
83
 
84
- for var, feat_dict in variable_features.items():
84
+ for var, feat_dict in VARIABLE_FEATURES.items():
85
85
  if var not in variable_info:
86
- for feat in feat_dict.keys():
87
- results[feat] = np.nan
86
+ all_vars.append(
87
+ pd.DataFrame(np.nan, index=range(len(observation_intervals)),
88
+ columns=feat_dict.keys())
89
+ )
88
90
  continue
89
91
 
90
92
  path = variable_info[var]["path"]
91
93
  time_col = variable_info[var]["time_column"]
92
94
  data_col = variable_info[var]["data_column"]
95
+
96
+ try:
97
+ df_var = compute_ts_aggregates(
98
+ ts_csv_path=path,
99
+ time_column=time_col,
100
+ value_column=data_col,
101
+ feature_functions=feat_dict,
102
+ intervals=observation_intervals,
103
+ )
104
+ all_vars.append(df_var)
105
+ except Exception:
106
+ df_var = pd.DataFrame()
107
+ all_vars.append(df_var)
93
108
 
94
- for feat, func in feat_dict.items():
95
- try:
96
- df_feat = compute_ts_aggregate(
97
- path, time_col, data_col, func, feat, observation_period
98
- )
99
- results[feat] = df_feat.iloc[0, 0] # extract scalar
100
- except Exception:
101
- results[feat] = np.nan
102
-
103
- return pd.DataFrame([results])
109
+ return pd.concat(all_vars, axis=1)
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import pandas as pd
3
3
  import numpy as np
4
- from typing import Dict, Sequence, Union, Callable
4
+ from typing import Dict, Sequence, Union, Callable, List
5
5
 
6
6
  from reclaim.dynamic_features.utils.statistical_metrics import (
7
7
  annual_mean,
@@ -18,11 +18,53 @@ from reclaim.dynamic_features.utils.inflow_outflow import (
18
18
  max_annual_flow_m3_per_s,
19
19
  mean_annual_flow_variability
20
20
  )
21
- from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
21
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
22
+
23
+ # Define which features depend on which variable
24
+ VARIABLE_FEATURES = {
25
+ "inflow": {
26
+ "MAI": mean_annual_flow_m3_per_s,
27
+ "PAI": max_annual_flow_m3_per_s,
28
+ "I_cv": mean_annual_flow_variability,
29
+ "I_std": mean_annual_flow_std_m3_per_s,
30
+ "I_above_90": max_days_above_90th,
31
+ "I_max_persis": max_annual_persistence,
32
+ },
33
+ "outflow": {
34
+ "MAO": mean_annual_flow_m3_per_s,
35
+ "O_std": mean_annual_flow_std_m3_per_s,
36
+ "O_cv": mean_annual_flow_variability,
37
+ },
38
+ "evaporation": {
39
+ "E_mean": annual_mean,
40
+ "E_std": annual_std,
41
+ },
42
+ "surface_area": {
43
+ "SA_mean": annual_mean,
44
+ "SA_std": annual_std,
45
+ "SA_cv": coefficient_of_variation,
46
+ "SA_skew": skewness,
47
+ "SA_kurt": kurtosis_val,
48
+ "SA_mean_clip": annual_mean,
49
+ "SA_above_90": max_days_above_90th,
50
+ },
51
+ "nssc": {
52
+ "NSSC1_mean": annual_mean,
53
+ "NSSC1_std": annual_std,
54
+ "NSSC1_cv": coefficient_of_variation,
55
+ "NSSC1_skew": skewness,
56
+ "NSSC1_kurt": kurtosis_val,
57
+ },
58
+ "nssc2": {
59
+ "NSSC2_mean": annual_mean,
60
+ "NSSC2_above_90": max_days_above_90th,
61
+ "NSSC2_max_persis": max_annual_persistence,
62
+ },
63
+ }
22
64
 
23
65
  def reservoir_based_dynamic_features(
24
66
  variable_info: Dict[str, Dict[str, str]],
25
- observation_period: Sequence[int],
67
+ observation_intervals: List[Sequence[int]]
26
68
  ) -> pd.DataFrame:
27
69
  """
28
70
  Compute dynamic reservoir features for a single reservoir using inflow, outflow,
@@ -57,13 +99,13 @@ def reservoir_based_dynamic_features(
57
99
  "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"}
58
100
  }
59
101
 
60
- observation_period : sequence[int]
61
- Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
102
+ observation_intervals : list of list of int
103
+ List of [start_year, end_year] intervals to compute features over.
62
104
 
63
105
  Returns
64
106
  -------
65
107
  pd.DataFrame
66
- A one-row DataFrame containing the computed reservoir dynamic features.
108
+ A DataFrame containing as many rows as ``observation_intervals`` and columns corresponding to the computed reservoir dynamic features.
67
109
  Missing variables in ``variable_info`` will result in NaN values for their features.
68
110
 
69
111
  Notes
@@ -74,76 +116,31 @@ def reservoir_based_dynamic_features(
74
116
  - If a variable is missing in ``variable_info``, its corresponding features are NaN.
75
117
  """
76
118
 
77
- # Define which features depend on which variable
78
- variable_features = {
79
- "inflow": {
80
- "MAI": mean_annual_flow_m3_per_s,
81
- "PAI": max_annual_flow_m3_per_s,
82
- "I_cv": mean_annual_flow_variability,
83
- "I_std": mean_annual_flow_std_m3_per_s,
84
- "I_above_90": max_days_above_90th,
85
- "I_max_persis": max_annual_persistence,
86
- },
87
- "outflow": {
88
- "MAO": mean_annual_flow_m3_per_s,
89
- "O_std": mean_annual_flow_std_m3_per_s,
90
- "O_cv": mean_annual_flow_variability,
91
- },
92
- "evaporation": {
93
- "E_mean": annual_mean,
94
- "E_std": annual_std,
95
- },
96
- "surface_area": {
97
- "SA_mean": annual_mean,
98
- "SA_std": annual_std,
99
- "SA_cv": coefficient_of_variation,
100
- "SA_skew": skewness,
101
- "SA_kurt": kurtosis_val,
102
- "SA_mean_clip": annual_mean,
103
- "SA_above_90": max_days_above_90th,
104
- },
105
- "nssc": {
106
- "NSSC1_mean": annual_mean,
107
- "NSSC1_std": annual_std,
108
- "NSSC1_cv": coefficient_of_variation,
109
- "NSSC1_skew": skewness,
110
- "NSSC1_kurt": kurtosis_val,
111
- },
112
- "nssc2": {
113
- "NSSC2_mean": annual_mean,
114
- "NSSC2_above_90": max_days_above_90th,
115
- "NSSC2_max_persis": max_annual_persistence,
116
- },
117
- }
118
-
119
- results = {}
119
+ all_vars = []
120
120
 
121
121
  # Loop through required variables
122
- for var, feat_dict in variable_features.items():
122
+ for var, feat_dict in VARIABLE_FEATURES.items():
123
123
  if var not in variable_info:
124
- # Fill with NaN if variable not provided
125
- for feat in feat_dict.keys():
126
- results[feat] = np.nan
124
+ all_vars.append(
125
+ pd.DataFrame(np.nan, index=range(len(observation_intervals)),
126
+ columns=feat_dict.keys())
127
+ )
127
128
  continue
128
129
 
129
130
  path = variable_info[var]["path"]
130
131
  time_col = variable_info[var]["time_column"]
131
132
  data_col = variable_info[var]["data_column"]
132
-
133
- # Some features require clipping, others use full record
134
- for feat, func in feat_dict.items():
135
- if var == "surface_area" and feat in ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt"]:
136
- obs_period = None # full record
137
- else:
138
- obs_period = observation_period
139
-
140
- try:
141
- df_feat = compute_ts_aggregate(
142
- path, time_col, data_col, func, feat, obs_period
143
- )
144
- results[feat] = df_feat.iloc[0, 0] # single value
145
- except Exception as e:
146
- print(f"Failed to compute {feat} due to error: {e}. Setting as NaN.")
147
- results[feat] = np.nan
148
-
149
- return pd.DataFrame([results])
133
+ try:
134
+ df_var = compute_ts_aggregates(
135
+ ts_csv_path=path,
136
+ time_column=time_col,
137
+ value_column=data_col,
138
+ feature_functions=feat_dict,
139
+ intervals=observation_intervals,
140
+ )
141
+ all_vars.append(df_var)
142
+ except Exception:
143
+ df_var = pd.DataFrame()
144
+ all_vars.append(df_var)
145
+
146
+ return pd.concat(all_vars, axis=1)
@@ -0,0 +1,104 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from typing import Callable, Union, Sequence, List, Dict
5
+
6
+ FULL_RECORD_FEATURES = ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt", "NSSC2_max_persis"]
7
+
8
+ def build_intervals(start_year, end_year, time_interval):
9
+ total_years = end_year - start_year + 1
10
+
11
+ # Case 1: Entire window shorter than interval
12
+ if total_years <= time_interval:
13
+ return [[start_year, end_year]]
14
+
15
+ remainder = total_years % time_interval
16
+ outputs = []
17
+
18
+ # First interval absorbs remainder (if any)
19
+ first_len = time_interval + remainder if remainder != 0 else time_interval
20
+ first_end = min(start_year + first_len - 1, end_year)
21
+ outputs.append([start_year, first_end])
22
+
23
+ # Remaining intervals
24
+ current_start = first_end + 1
25
+ while current_start <= end_year:
26
+ current_end = current_start + time_interval - 1
27
+ outputs.append([current_start, min(current_end, end_year)])
28
+ current_start = current_end + 1
29
+
30
+ return outputs
31
+
32
+ def compute_ts_aggregates(
33
+ ts_csv_path: str,
34
+ time_column: str,
35
+ value_column: str,
36
+ feature_functions: Dict[str, Callable],
37
+ intervals: List[Sequence[int]],
38
+ ) -> pd.DataFrame:
39
+ """
40
+ Compute aggregate features over one or more year intervals from a user-provided time series CSV for a single reservoir.
41
+
42
+ Parameters
43
+ ----------
44
+ ts_csv_path : str
45
+ Path to the CSV file containing the time series.
46
+ time_column : str
47
+ Name of the column representing dates/timestamps.
48
+ value_column : str
49
+ Name of the column representing the variable values.
50
+ feature_functions : Dict[str, Callable]
51
+ Dictionary where keys are feature names (column names for output DataFrame) and values are functions that take a pd.Series and return a single value.
52
+ intervals : list of list of int
53
+ List of [start_year, end_year] intervals to compute features over.
54
+
55
+ Returns
56
+ -------
57
+ pd.DataFrame
58
+ A DataFrame with one row per interval in ``intervals`` and one column per feature in ``feature_functions``.
59
+ """
60
+ # --- Read CSV ONCE ---
61
+ # Check if path exists
62
+ if not Path(ts_csv_path).is_file():
63
+ raise FileNotFoundError(f"CSV file not found at path: {ts_csv_path}")
64
+
65
+ df = pd.read_csv(ts_csv_path)
66
+ if df.empty:
67
+ raise ValueError(f"CSV at {ts_csv_path} is empty.")
68
+
69
+ # Ensure columns exist
70
+ if time_column not in df.columns:
71
+ raise ValueError(f"Time column '{time_column}' not found in CSV.")
72
+ if value_column not in df.columns:
73
+ raise ValueError(f"Value column '{value_column}' not found in CSV.")
74
+
75
+ # Ensure time column is datetime
76
+ df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
77
+ if df[time_column].isna().all():
78
+ raise ValueError(f"Time column '{time_column}' could not be converted to datetime.")
79
+
80
+ # Set index
81
+ ts = df.set_index(time_column)[value_column].sort_index()
82
+
83
+ if ts.empty:
84
+ raise ValueError("Time series is completely empty. Please check the data or avoid providing this variable.")
85
+
86
+ rows = []
87
+
88
+ for osy, oey in intervals:
89
+ ts_clip = ts[(ts.index.year >= osy) & (ts.index.year <= oey)]
90
+ ts_till_end_year = ts[ts.index.year <= oey]
91
+
92
+ row = {}
93
+ for feat, func in feature_functions.items():
94
+ try:
95
+ if feat in FULL_RECORD_FEATURES:
96
+ row[feat] = func(ts_till_end_year) if not ts_till_end_year.empty else np.nan
97
+ else:
98
+ row[feat] = func(ts_clip) if not ts_clip.empty else np.nan
99
+ except Exception:
100
+ row[feat] = np.nan
101
+
102
+ rows.append(row)
103
+
104
+ return pd.DataFrame(rows)