pyreclaim 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/PKG-INFO +3 -1
  2. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/pyproject.toml +4 -2
  3. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/setup.py +1 -1
  4. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/PKG-INFO +3 -1
  5. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/SOURCES.txt +1 -0
  6. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/requires.txt +2 -0
  7. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/feature_engineering_and_transformation.py +59 -4
  8. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/catchment_dynamic.py +50 -44
  9. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/reservoir_dynamic.py +68 -71
  10. pyreclaim-0.5.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +104 -0
  11. pyreclaim-0.5.0/src/reclaim/generate_features.py +297 -0
  12. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/reclaim.py +9 -0
  13. pyreclaim-0.5.0/src/reclaim/static_features/catchment_static.py +202 -0
  14. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/reservoir_static.py +41 -12
  15. pyreclaim-0.5.0/src/reclaim/static_features/utils/basin_names.py +78 -0
  16. pyreclaim-0.5.0/src/reclaim/static_features/utils/catchment_agreggate.py +355 -0
  17. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/flow_length.py +65 -1
  18. pyreclaim-0.4.0/src/reclaim/dynamic_features/utils/ts_aggregate.py +0 -69
  19. pyreclaim-0.4.0/src/reclaim/generate_features.py +0 -158
  20. pyreclaim-0.4.0/src/reclaim/static_features/catchment_static.py +0 -127
  21. pyreclaim-0.4.0/src/reclaim/static_features/utils/catchment_agreggate.py +0 -148
  22. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/LICENSE +0 -0
  23. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/README.md +0 -0
  24. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/setup.cfg +0 -0
  25. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/dependency_links.txt +0 -0
  26. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/pyreclaim.egg-info/top_level.txt +0 -0
  27. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/__init__.py +0 -0
  28. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/derived_features/__init__.py +0 -0
  29. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/__init__.py +0 -0
  30. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/__init__.py +0 -0
  31. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/catchment_meteorology.py +0 -0
  32. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/inflow_outflow.py +0 -0
  33. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/rainfall.py +0 -0
  34. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/dynamic_features/utils/statistical_metrics.py +0 -0
  35. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/__init__.py +0 -0
  36. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/__init__.py +0 -0
  37. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/aec_shape.py +0 -0
  38. {pyreclaim-0.4.0 → pyreclaim-0.5.0}/src/reclaim/static_features/utils/area_perimeter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyreclaim
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Reservoir Estimation of Capacity Loss using AI based Methods
5
5
  Author-email: Sanchit Minocha <msanchit@uw.edu>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -702,6 +702,8 @@ Requires-Dist: openpyxl
702
702
  Requires-Dist: netcdf4
703
703
  Requires-Dist: dask
704
704
  Requires-Dist: rioxarray
705
+ Requires-Dist: matplotlib
706
+ Requires-Dist: tqdm
705
707
  Dynamic: license-file
706
708
 
707
709
  <div align="center">
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "pyreclaim"
7
- version = "v0.4.0"
7
+ version = "v0.5.0"
8
8
  authors = [
9
9
  { name="Sanchit Minocha", email="msanchit@uw.edu" },
10
10
  ]
@@ -33,7 +33,9 @@ dependencies = [
33
33
  "openpyxl",
34
34
  "netcdf4",
35
35
  "dask",
36
- "rioxarray"
36
+ "rioxarray",
37
+ "matplotlib",
38
+ "tqdm"
37
39
  ]
38
40
 
39
41
  [project.urls]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name = "pyreclaim",
6
- version = "v0.4.0",
6
+ version = "v0.5.0",
7
7
  license = "GPL-3.0",
8
8
  package_dir = {"": "src"}
9
9
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyreclaim
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Reservoir Estimation of Capacity Loss using AI based Methods
5
5
  Author-email: Sanchit Minocha <msanchit@uw.edu>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -702,6 +702,8 @@ Requires-Dist: openpyxl
702
702
  Requires-Dist: netcdf4
703
703
  Requires-Dist: dask
704
704
  Requires-Dist: rioxarray
705
+ Requires-Dist: matplotlib
706
+ Requires-Dist: tqdm
705
707
  Dynamic: license-file
706
708
 
707
709
  <div align="center">
@@ -27,5 +27,6 @@ src/reclaim/static_features/reservoir_static.py
27
27
  src/reclaim/static_features/utils/__init__.py
28
28
  src/reclaim/static_features/utils/aec_shape.py
29
29
  src/reclaim/static_features/utils/area_perimeter.py
30
+ src/reclaim/static_features/utils/basin_names.py
30
31
  src/reclaim/static_features/utils/catchment_agreggate.py
31
32
  src/reclaim/static_features/utils/flow_length.py
@@ -15,3 +15,5 @@ openpyxl
15
15
  netcdf4
16
16
  dask
17
17
  rioxarray
18
+ matplotlib
19
+ tqdm
@@ -1,6 +1,45 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
 
4
+ ALL_FEATURES = [
5
+ 'log_OBC', 'log_HGT', 'MRB', 'LAT', 'LON',
6
+ 'log_RA', 'log_RP', 'log_FL',
7
+ 'log_CA', 'log_DCA',
8
+
9
+ 'AECS', 'AECC','AECI',
10
+
11
+ 'log_LCAS', 'log_LCC',
12
+ 'log_LCG', 'log_LCT', 'log_LCS',
13
+ 'log_LCHV', 'log_LCM',
14
+ 'log_LCSV','log_LCBS',
15
+ 'log_LCSG', 'log_LCWB','DLC',
16
+
17
+ 'COAR', 'SAND', 'SILT', 'CLAY', 'BULK',
18
+
19
+ 'ELEV', 'SLOP', 'CURV', 'ASP', 'HILL',
20
+
21
+ 'log_MAI', 'log_PAI', 'I_cv',
22
+ 'log_I_std','I_above_90', 'I_max_persis',
23
+ 'log_MAO', 'log_O_std', 'O_cv',
24
+ 'E_mean', 'E_std',
25
+ 'log_SA_mean', 'log_SA_std', 'SA_cv', 'SA_skew', 'log_SA_kurt',
26
+ 'log_SA_mean_clip', 'SA_above_90',
27
+ 'NSSC1_mean', 'NSSC1_std', 'NSSC1_cv', 'NSSC1_skew', 'NSSC1_kurt',
28
+ 'NSSC2_mean', 'NSSC2_above_90', 'NSSC2_max_persis',
29
+
30
+ 'log_MAR', '#_rain_above_10', '#_rain_above_50', '#_rain_above_100',
31
+ 'tmin_mean', 'tmax_mean',
32
+ 'wind_mean', 'wind_std', 'wind_cv', 'wind_skew', 'wind_kurt',
33
+
34
+ 'AGE', 'log_ROBC', 'log_GC',
35
+ 'NVGF',
36
+ 'R_tree_bare', 'R_shrub_bare', 'R_coarse_sand',
37
+ 'log_rel_SA_mean_clip', 'log_R_SA_cap',
38
+ 'log_rain_per_area',
39
+ 'log_TE', 'log_RT', 'log_ECLR', 'ESR',
40
+ 'log_SIN', 'log_SOUT',
41
+ ]
42
+
4
43
  def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
5
44
  """
6
45
  Engineer and transform features in reservoir/catchment dataset.
@@ -58,24 +97,40 @@ def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
58
97
 
59
98
  # Land cover log-area features
60
99
  lc_cols = ['LCAS','LCC','LCG','LCT','LCS','LCHV','LCM','LCSV','LCBS','LCSG','LCWB']
61
- for col in lc_cols:
62
- df[col] = df["CA"] * df[col] / 100
100
+ # for col in lc_cols:
101
+ # df[col] = df["CA"] * df[col] / 100
102
+ # The calculation is done together with taking the log, as in model training. Results will slightly differ for cases where the percentage of LC is 0.
63
103
 
64
104
  # -------------------------
65
105
  # APPLY LOG TRANSFORMATIONS
66
106
  # -------------------------
67
107
  log_candidates = ['CA','DCA','OBC','HGT','RA','RP','FL',
68
108
  'SA_mean','SA_mean_clip','SA_std','SA_kurt','PAI','MAI','MAO','I_std','O_std','MAR',
69
- 'ROBC','rain_per_area','GC','TE','RT','ECLR','ESR','SIN','SOUT'] + lc_cols
109
+ 'ROBC','rain_per_area','GC','TE','RT','ECLR','SIN','SOUT', 'rel_SA_mean_clip', 'R_SA_cap'] + lc_cols
70
110
 
71
111
  for col in log_candidates:
72
112
  log_col = f'log_{col}' # add prefix to avoid double log
73
113
  try:
74
- df[log_col] = np.log(df[col].clip(lower=1e-15))
114
+ if col in ['ECLR','SIN','SOUT']:
115
+ # Land cover columns can be zero (upto 15 decimal places), clip at 1e-15
116
+ df[log_col] = np.log(df[col].clip(lower=1e-15))
117
+ elif col in ['rain_per_area']:
118
+ # Rain per area can be zero (upto 10 decimal places), clip at 1e-10
119
+ df[log_col] = np.log(df[col].clip(lower=1e-10))
120
+ elif col in lc_cols:
121
+ df[log_col] = np.log(df["CA"].clip(lower=1e-6)) + np.log(df[col].clip(lower=1e-6)) - np.log(100)
122
+ else:
123
+ # All other columns can be zero (upto 6 decimal places), clip at 1e-6
124
+ df[log_col] = np.log(df[col].clip(lower=1e-6))
75
125
  except Exception as e:
76
126
  raise ValueError(f"Error applying log transform to column '{col}': {e}")
77
127
 
78
128
  # Process DLc as categorical column
79
129
  df['DLC'] = df['DLC'].astype(int).fillna(0)
80
130
 
131
+ # Add empty columns for any missing features
132
+ for feature in ALL_FEATURES:
133
+ if feature not in df.columns:
134
+ df[feature] = np.nan
135
+
81
136
  return df
@@ -1,6 +1,6 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
- from typing import Dict, Sequence
3
+ from typing import Dict, Sequence, List
4
4
 
5
5
  from reclaim.dynamic_features.utils.rainfall import (
6
6
  mean_annual_rainfall_mm,
@@ -13,12 +13,34 @@ from reclaim.dynamic_features.utils.statistical_metrics import (
13
13
  skewness,
14
14
  kurtosis_val,
15
15
  )
16
- from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
16
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
17
+
18
+ VARIABLE_FEATURES = {
19
+ "precip": {
20
+ "MAR": mean_annual_rainfall_mm,
21
+ "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
22
+ "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
23
+ "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
24
+ },
25
+ "tmin": {
26
+ "tmin_mean": annual_mean,
27
+ },
28
+ "tmax": {
29
+ "tmax_mean": annual_mean,
30
+ },
31
+ "wind": {
32
+ "wind_mean": annual_mean,
33
+ "wind_std": annual_std,
34
+ "wind_cv": coefficient_of_variation,
35
+ "wind_skew": skewness,
36
+ "wind_kurt": kurtosis_val,
37
+ },
38
+ }
17
39
 
18
40
 
19
41
  def catchment_based_dynamic_features(
20
42
  variable_info: Dict[str, Dict[str, str]],
21
- observation_period: Sequence[int],
43
+ observation_intervals: List[Sequence[int]],
22
44
  ) -> pd.DataFrame:
23
45
  """
24
46
  Compute dynamic catchment-based features for a single reservoir's catchment,
@@ -41,63 +63,47 @@ def catchment_based_dynamic_features(
41
63
  "time_column": str,
42
64
  "data_column": str
43
65
  }
44
-
45
- observation_period : sequence[int]
46
- Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
66
+
67
+ observation_intervals : list of list of int
68
+ List of [start_year, end_year] intervals to compute features over.
47
69
 
48
70
  Returns
49
71
  -------
50
72
  pd.DataFrame
51
- A one-row DataFrame containing the computed catchment-based features.
52
-
73
+ A DataFrame containing as many rows as there are observation intervals and columns corresponding to the computed catchment-based features.
74
+ Missing variables in ``variable_info`` will result in NaN values for their features.
53
75
  Notes
54
76
  -----
55
77
  - Precipitation features are reported as mm/year (for MAR) and counts (rainy days).
56
78
  - Wind statistics include mean, std, CV, skewness, kurtosis.
57
79
  - Temperature features are simple annual means (°C).
58
80
  """
59
-
60
- variable_features = {
61
- "precip": {
62
- "MAR": mean_annual_rainfall_mm,
63
- "#_rain_above_10": lambda ts: mean_annual_rainy_days(ts, threshold=10.0),
64
- "#_rain_above_50": lambda ts: mean_annual_rainy_days(ts, threshold=50.0),
65
- "#_rain_above_100": lambda ts: mean_annual_rainy_days(ts, threshold=100.0),
66
- },
67
- "tmin": {
68
- "tmin_mean": annual_mean,
69
- },
70
- "tmax": {
71
- "tmax_mean": annual_mean,
72
- },
73
- "wind": {
74
- "wind_mean": annual_mean,
75
- "wind_std": annual_std,
76
- "wind_cv": coefficient_of_variation,
77
- "wind_skew": skewness,
78
- "wind_kurt": kurtosis_val,
79
- },
80
- }
81
81
 
82
- results = {}
82
+ all_vars = []
83
83
 
84
- for var, feat_dict in variable_features.items():
84
+ for var, feat_dict in VARIABLE_FEATURES.items():
85
85
  if var not in variable_info:
86
- for feat in feat_dict.keys():
87
- results[feat] = np.nan
86
+ all_vars.append(
87
+ pd.DataFrame(np.nan, index=range(len(observation_intervals)),
88
+ columns=feat_dict.keys())
89
+ )
88
90
  continue
89
91
 
90
92
  path = variable_info[var]["path"]
91
93
  time_col = variable_info[var]["time_column"]
92
94
  data_col = variable_info[var]["data_column"]
95
+
96
+ try:
97
+ df_var = compute_ts_aggregates(
98
+ ts_csv_path=path,
99
+ time_column=time_col,
100
+ value_column=data_col,
101
+ feature_functions=feat_dict,
102
+ intervals=observation_intervals,
103
+ )
104
+ all_vars.append(df_var)
105
+ except Exception:
106
+ df_var = pd.DataFrame()
107
+ all_vars.append(df_var)
93
108
 
94
- for feat, func in feat_dict.items():
95
- try:
96
- df_feat = compute_ts_aggregate(
97
- path, time_col, data_col, func, feat, observation_period
98
- )
99
- results[feat] = df_feat.iloc[0, 0] # extract scalar
100
- except Exception:
101
- results[feat] = np.nan
102
-
103
- return pd.DataFrame([results])
109
+ return pd.concat(all_vars, axis=1)
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import pandas as pd
3
3
  import numpy as np
4
- from typing import Dict, Sequence, Union, Callable
4
+ from typing import Dict, Sequence, Union, Callable, List
5
5
 
6
6
  from reclaim.dynamic_features.utils.statistical_metrics import (
7
7
  annual_mean,
@@ -18,11 +18,53 @@ from reclaim.dynamic_features.utils.inflow_outflow import (
18
18
  max_annual_flow_m3_per_s,
19
19
  mean_annual_flow_variability
20
20
  )
21
- from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregate
21
+ from reclaim.dynamic_features.utils.ts_aggregate import compute_ts_aggregates
22
+
23
+ # Define which features depend on which variable
24
+ VARIABLE_FEATURES = {
25
+ "inflow": {
26
+ "MAI": mean_annual_flow_m3_per_s,
27
+ "PAI": max_annual_flow_m3_per_s,
28
+ "I_cv": mean_annual_flow_variability,
29
+ "I_std": mean_annual_flow_std_m3_per_s,
30
+ "I_above_90": max_days_above_90th,
31
+ "I_max_persis": max_annual_persistence,
32
+ },
33
+ "outflow": {
34
+ "MAO": mean_annual_flow_m3_per_s,
35
+ "O_std": mean_annual_flow_std_m3_per_s,
36
+ "O_cv": mean_annual_flow_variability,
37
+ },
38
+ "evaporation": {
39
+ "E_mean": annual_mean,
40
+ "E_std": annual_std,
41
+ },
42
+ "surface_area": {
43
+ "SA_mean": annual_mean,
44
+ "SA_std": annual_std,
45
+ "SA_cv": coefficient_of_variation,
46
+ "SA_skew": skewness,
47
+ "SA_kurt": kurtosis_val,
48
+ "SA_mean_clip": annual_mean,
49
+ "SA_above_90": max_days_above_90th,
50
+ },
51
+ "nssc": {
52
+ "NSSC1_mean": annual_mean,
53
+ "NSSC1_std": annual_std,
54
+ "NSSC1_cv": coefficient_of_variation,
55
+ "NSSC1_skew": skewness,
56
+ "NSSC1_kurt": kurtosis_val,
57
+ },
58
+ "nssc2": {
59
+ "NSSC2_mean": annual_mean,
60
+ "NSSC2_above_90": max_days_above_90th,
61
+ "NSSC2_max_persis": max_annual_persistence,
62
+ },
63
+ }
22
64
 
23
65
  def reservoir_based_dynamic_features(
24
66
  variable_info: Dict[str, Dict[str, str]],
25
- observation_period: Sequence[int],
67
+ observation_intervals: List[Sequence[int]]
26
68
  ) -> pd.DataFrame:
27
69
  """
28
70
  Compute dynamic reservoir features for a single reservoir using inflow, outflow,
@@ -57,13 +99,13 @@ def reservoir_based_dynamic_features(
57
99
  "outflow": {"path": "data/outflow.csv", "time_column": "date", "data_column": "outflow (m3/d)"}
58
100
  }
59
101
 
60
- observation_period : sequence[int]
61
- Two-element sequence [OSY, OEY] specifying the observation period to clip the series.
102
+ observation_intervals : list of list of int
103
+ List of [start_year, end_year] intervals to compute features over.
62
104
 
63
105
  Returns
64
106
  -------
65
107
  pd.DataFrame
66
- A one-row DataFrame containing the computed reservoir dynamic features.
108
+ A DataFrame containing as many rows as ``observation_intervals`` and columns corresponding to the computed reservoir dynamic features.
67
109
  Missing variables in ``variable_info`` will result in NaN values for their features.
68
110
 
69
111
  Notes
@@ -74,76 +116,31 @@ def reservoir_based_dynamic_features(
74
116
  - If a variable is missing in ``variable_info``, its corresponding features are NaN.
75
117
  """
76
118
 
77
- # Define which features depend on which variable
78
- variable_features = {
79
- "inflow": {
80
- "MAI": mean_annual_flow_m3_per_s,
81
- "PAI": max_annual_flow_m3_per_s,
82
- "I_cv": mean_annual_flow_variability,
83
- "I_std": mean_annual_flow_std_m3_per_s,
84
- "I_above_90": max_days_above_90th,
85
- "I_max_persis": max_annual_persistence,
86
- },
87
- "outflow": {
88
- "MAO": mean_annual_flow_m3_per_s,
89
- "O_std": mean_annual_flow_std_m3_per_s,
90
- "O_cv": mean_annual_flow_variability,
91
- },
92
- "evaporation": {
93
- "E_mean": annual_mean,
94
- "E_std": annual_std,
95
- },
96
- "surface_area": {
97
- "SA_mean": annual_mean,
98
- "SA_std": annual_std,
99
- "SA_cv": coefficient_of_variation,
100
- "SA_skew": skewness,
101
- "SA_kurt": kurtosis_val,
102
- "SA_mean_clip": annual_mean,
103
- "SA_above_90": max_days_above_90th,
104
- },
105
- "nssc": {
106
- "NSSC1_mean": annual_mean,
107
- "NSSC1_std": annual_std,
108
- "NSSC1_cv": coefficient_of_variation,
109
- "NSSC1_skew": skewness,
110
- "NSSC1_kurt": kurtosis_val,
111
- },
112
- "nssc2": {
113
- "NSSC2_mean": annual_mean,
114
- "NSSC2_above_90": max_days_above_90th,
115
- "NSSC2_max_persis": max_annual_persistence,
116
- },
117
- }
118
-
119
- results = {}
119
+ all_vars = []
120
120
 
121
121
  # Loop through required variables
122
- for var, feat_dict in variable_features.items():
122
+ for var, feat_dict in VARIABLE_FEATURES.items():
123
123
  if var not in variable_info:
124
- # Fill with NaN if variable not provided
125
- for feat in feat_dict.keys():
126
- results[feat] = np.nan
124
+ all_vars.append(
125
+ pd.DataFrame(np.nan, index=range(len(observation_intervals)),
126
+ columns=feat_dict.keys())
127
+ )
127
128
  continue
128
129
 
129
130
  path = variable_info[var]["path"]
130
131
  time_col = variable_info[var]["time_column"]
131
132
  data_col = variable_info[var]["data_column"]
132
-
133
- # Some features require clipping, others use full record
134
- for feat, func in feat_dict.items():
135
- if var == "surface_area" and feat in ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt"]:
136
- obs_period = None # full record
137
- else:
138
- obs_period = observation_period
139
-
140
- try:
141
- df_feat = compute_ts_aggregate(
142
- path, time_col, data_col, func, feat, obs_period
143
- )
144
- results[feat] = df_feat.iloc[0, 0] # single value
145
- except Exception as e:
146
- print(f"Failed to compute {feat} due to error: {e}. Setting as NaN.")
147
- results[feat] = np.nan
148
-
149
- return pd.DataFrame([results])
133
+ try:
134
+ df_var = compute_ts_aggregates(
135
+ ts_csv_path=path,
136
+ time_column=time_col,
137
+ value_column=data_col,
138
+ feature_functions=feat_dict,
139
+ intervals=observation_intervals,
140
+ )
141
+ all_vars.append(df_var)
142
+ except Exception:
143
+ df_var = pd.DataFrame()
144
+ all_vars.append(df_var)
145
+
146
+ return pd.concat(all_vars, axis=1)
@@ -0,0 +1,104 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from typing import Callable, Union, Sequence, List, Dict
5
+
6
+ FULL_RECORD_FEATURES = ["SA_mean", "SA_std", "SA_cv", "SA_skew", "SA_kurt", "NSSC2_max_persis"]
7
+
8
+ def build_intervals(start_year, end_year, time_interval):
9
+ total_years = end_year - start_year + 1
10
+
11
+ # Case 1: Entire window shorter than interval
12
+ if total_years <= time_interval:
13
+ return [[start_year, end_year]]
14
+
15
+ remainder = total_years % time_interval
16
+ outputs = []
17
+
18
+ # First interval absorbs remainder (if any)
19
+ first_len = time_interval + remainder if remainder != 0 else time_interval
20
+ first_end = min(start_year + first_len - 1, end_year)
21
+ outputs.append([start_year, first_end])
22
+
23
+ # Remaining intervals
24
+ current_start = first_end + 1
25
+ while current_start <= end_year:
26
+ current_end = current_start + time_interval - 1
27
+ outputs.append([current_start, min(current_end, end_year)])
28
+ current_start = current_end + 1
29
+
30
+ return outputs
31
+
32
+ def compute_ts_aggregates(
33
+ ts_csv_path: str,
34
+ time_column: str,
35
+ value_column: str,
36
+ feature_functions: Dict[str, Callable],
37
+ intervals: List[Sequence[int]],
38
+ ) -> pd.DataFrame:
39
+ """
40
+ Compute aggregate features over one or more year intervals from a user-provided time series CSV for a single reservoir.
41
+
42
+ Parameters
43
+ ----------
44
+ ts_csv_path : str
45
+ Path to the CSV file containing the time series.
46
+ time_column : str
47
+ Name of the column representing dates/timestamps.
48
+ value_column : str
49
+ Name of the column representing the variable values.
50
+ feature_functions : Dict[str, Callable]
51
+ Dictionary where keys are feature names (column names for output DataFrame) and values are functions that take a pd.Series and return a single value.
52
+ intervals : list of list of int
53
+ List of [start_year, end_year] intervals to compute features over.
54
+
55
+ Returns
56
+ -------
57
+ pd.DataFrame
58
+ A DataFrame with one row per interval in ``intervals`` and one column per feature in ``feature_functions``.
59
+ """
60
+ # --- Read CSV ONCE ---
61
+ # Check if path exists
62
+ if not Path(ts_csv_path).is_file():
63
+ raise FileNotFoundError(f"CSV file not found at path: {ts_csv_path}")
64
+
65
+ df = pd.read_csv(ts_csv_path)
66
+ if df.empty:
67
+ raise ValueError(f"CSV at {ts_csv_path} is empty.")
68
+
69
+ # Ensure columns exist
70
+ if time_column not in df.columns:
71
+ raise ValueError(f"Time column '{time_column}' not found in CSV.")
72
+ if value_column not in df.columns:
73
+ raise ValueError(f"Value column '{value_column}' not found in CSV.")
74
+
75
+ # Ensure time column is datetime
76
+ df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
77
+ if df[time_column].isna().all():
78
+ raise ValueError(f"Time column '{time_column}' could not be converted to datetime.")
79
+
80
+ # Set index
81
+ ts = df.set_index(time_column)[value_column].sort_index()
82
+
83
+ if ts.empty:
84
+ raise ValueError("Time series is completely empty. Please check the data or avoid providing this variable.")
85
+
86
+ rows = []
87
+
88
+ for osy, oey in intervals:
89
+ ts_clip = ts[(ts.index.year >= osy) & (ts.index.year <= oey)]
90
+ ts_till_end_year = ts[ts.index.year <= oey]
91
+
92
+ row = {}
93
+ for feat, func in feature_functions.items():
94
+ try:
95
+ if feat in FULL_RECORD_FEATURES:
96
+ row[feat] = func(ts_till_end_year) if not ts_till_end_year.empty else np.nan
97
+ else:
98
+ row[feat] = func(ts_clip) if not ts_clip.empty else np.nan
99
+ except Exception:
100
+ row[feat] = np.nan
101
+
102
+ rows.append(row)
103
+
104
+ return pd.DataFrame(rows)