PyPI - lecrapaud - Versions diffs - 0.8.2__tar.gz → 0.8.4__tar.gz - Mend

lecrapaud 0.8.2.tar.gz → 0.8.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (43) hide show

{lecrapaud-0.8.2 → lecrapaud-0.8.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.8.2
+Version: 0.8.4
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet

{lecrapaud-0.8.2 → lecrapaud-0.8.4}/lecrapaud/feature_selection.py RENAMED Viewed

@@ -135,10 +135,9 @@ class FeatureSelectionEngine:
         logger.info(f"Starting feature selection for TARGET_{target_number}...")
         clean_directory(self.fs_dir_target)
-        # Let's start by removing extremly correlated features
-        # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent features
-        # TODO: we could also remove low variance features
-        self.X = self.remove_constant_columns()
+        # Let's start by removing very low variance feature and extremly correlated features
+        # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent, non constant, non full 0 features
+        self.X = self.remove_low_variance_columns()
         features_uncorrelated, features_correlated = self.remove_correlated_features(
             90, vizualize=False
         )
@@ -353,34 +352,31 @@ class FeatureSelectionEngine:
     # Remove correlation
     # ------------------
-    def remove_constant_columns(
-        self,
-        threshold: float = 0.99,
-    ) -> pd.DataFrame:
+    def remove_low_variance_columns(self, threshold: float = 1e-10) -> pd.DataFrame:
         """
-            Removes constant or almost constant columns from a DataFrame.
+        Removes columns with very low variance (including constant columns).
         Parameters:
-            threshold (float): Max proportion for a single value (default 0.99 = 99%).
+            threshold (float): Minimum variance required to keep a column.
+                            Default is 1e-10 to eliminate near-constant features.
         Returns:
-            pd.DataFrame: Cleaned DataFrame.
+            pd.DataFrame: Cleaned DataFrame without low-variance columns.
         """
+        X = self.X
-        to_drop = []
-        for col in self.X.columns:
-            value_counts = self.X[col].value_counts(dropna=False, normalize=True)
-            if value_counts.empty:
-                to_drop.append(col)
-            elif value_counts.iloc[0] >= threshold:
-                to_drop.append(col)
+        low_var_cols = [
+            col
+            for col in X.columns
+            if pd.api.types.is_numeric_dtype(X[col])
+            and np.nanvar(X[col].values) < threshold
+        ]
-        if to_drop:
-            logger.info(f"🔍 Removed {len(to_drop)} constant/almost constant columns:")
-            logger.info(to_drop)
+        if low_var_cols:
+            logger.info(f"🧹 Removed {len(low_var_cols)} low-variance columns:")
+            logger.info(low_var_cols)
-        return self.X.drop(columns=to_drop, errors="ignore")
+        return X.drop(columns=low_var_cols, errors="ignore")
     def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
         X = self.X
@@ -866,6 +862,7 @@ class PreprocessModel:
             train_scaled = None
             val_scaled = None
             test_scaled = None
+            scalers_y = None
         # save data
         joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")

{lecrapaud-0.8.2 → lecrapaud-0.8.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.8.2"
+version = "0.8.4"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}