PyPI - lecrapaud - Versions diffs - 0.20.0__tar.gz → 0.20.1__tar.gz - Mend

lecrapaud 0.20.0 (tar.gz) → 0.20.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (50)

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lecrapaud
-Version: 0.20.0
+Version: 0.20.1
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 License-File: LICENSE

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/api.py RENAMED Viewed

@@ -165,6 +165,12 @@ class ExperimentEngine:
     def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
         """Initialize the experiment engine with either new or existing experiment."""
+        # Set all kwargs as instance attributes
+        if "models_idx" in kwargs:
+            kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
+        for key, value in kwargs.items():
+            setattr(self, key, value)
         if id:
             self.experiment = Experiment.get(id)
             kwargs.update(self.experiment.context)
@@ -180,12 +186,6 @@ class ExperimentEngine:
                 )
             self.experiment = create_experiment(data=data, **kwargs)
-        # Set all kwargs as instance attributes
-        for key, value in kwargs.items():
-            if key == "models_idx":
-                value = normalize_models_idx(value)
-            setattr(self, key, value)
     def train(self, data, best_params=None):
         logger.info("Running training...")
@@ -309,12 +309,8 @@ class ExperimentEngine:
     def feature_engineering(self, data, for_training=True):
         app = FeatureEngineeringEngine(
             data=data,
-            columns_drop=getattr(self, "columns_drop", []),
-            columns_boolean=getattr(self, "columns_boolean", []),
-            columns_date=getattr(self, "columns_date", []),
-            columns_te_groupby=getattr(self, "columns_te_groupby", []),
-            columns_te_target=getattr(self, "columns_te_target", []),
-            for_training=getattr(self, "for_training", True),
+            experiment=self.experiment,
+            for_training=for_training,
         )
         data = app.run()
         return data
@@ -322,21 +318,7 @@ class ExperimentEngine:
     def preprocess_feature(self, data, for_training=True):
         app = PreprocessFeature(
             data=data,
-            experiment=getattr(self, "experiment", None),
-            time_series=getattr(self, "time_series", False),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
-            val_size=getattr(self, "val_size", 0.2),
-            test_size=getattr(self, "test_size", 0.2),
-            columns_pca=getattr(self, "columns_pca", []),
-            pca_temporal=getattr(self, "pca_temporal", []),
-            pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
-            columns_onehot=getattr(self, "columns_onehot", []),
-            columns_binary=getattr(self, "columns_binary", []),
-            columns_ordinal=getattr(self, "columns_ordinal", []),
-            columns_frequency=getattr(self, "columns_frequency", []),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
+            experiment=self.experiment,
         )
         if for_training:
             train, val, test = app.run()
@@ -351,7 +333,6 @@ class ExperimentEngine:
                 train=train,
                 target_number=target_number,
                 experiment=self.experiment,
-                target_clf=self.target_clf,
             )
             app.run()
         self.experiment = Experiment.get(self.experiment.id)
@@ -368,14 +349,7 @@ class ExperimentEngine:
             train=train,
             val=val,
             test=test,
-            experiment=getattr(self, "experiment", None),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
-            models_idx=getattr(self, "models_idx", []),
-            time_series=getattr(self, "time_series", False),
-            max_timesteps=getattr(self, "max_timesteps", 120),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
+            experiment=self.experiment,
         )
         if for_training:
             data, reshaped_data = app.run()
@@ -390,25 +364,13 @@ class ExperimentEngine:
                 data=data,
                 reshaped_data=reshaped_data,
                 target_number=target_number,
-                experiment=getattr(self, "experiment", None),
-                target_clf=getattr(self, "target_clf", []),
-                models_idx=getattr(self, "models_idx", []),
-                time_series=getattr(self, "time_series", False),
-                date_column=getattr(self, "date_column", None),
-                group_column=getattr(self, "group_column", None),
-                target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
+                experiment=self.experiment,
             )
             if best_params and target_number not in best_params.keys():
                 raise ValueError(
                     f"Target {target_number} not found in best_params passed as argument"
                 )
             app.run(
-                self.experiment_name,
-                perform_hyperopt=self.perform_hyperopt,
-                number_of_trials=self.number_of_trials,
-                perform_crossval=self.perform_crossval,
-                plot=self.plot,
-                preserve_model=self.preserve_model,
                 best_params=best_params[target_number] if best_params else None,
             )

lecrapaud-0.20.1/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""add number_of_targets and remove other fields from experiments
+Revision ID: 0a8fb7826e9b
+Revises: 033e0f7eca4f
+Create Date: 2025-10-28 20:06:54.792631
+"""
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+# revision identifiers, used by Alembic.
+revision: str = '0a8fb7826e9b'
+down_revision: Union[str, None] = '033e0f7eca4f'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
+    op.drop_column('lecrapaud_experiments', 'corr_threshold')
+    op.drop_column('lecrapaud_experiments', 'max_features')
+    op.drop_column('lecrapaud_experiments', 'percentile')
+    op.drop_column('lecrapaud_experiments', 'type')
+    op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
+    op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
+    # ### end Alembic commands ###
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
+    op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
+    op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
+    op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
+    op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
+    op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
+    op.drop_column('lecrapaud_experiments', 'number_of_targets')
+    # ### end Alembic commands ###

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/experiment.py RENAMED Viewed

@@ -50,10 +50,43 @@ class Experiment(Base):
     )
     name = Column(String(255), nullable=False)
     path = Column(String(255))  # we do not have this at creation time
-    type = Column(String(50), nullable=False)
     size = Column(Integer, nullable=False)
     train_size = Column(Integer)
     val_size = Column(Integer)
+    test_size = Column(Integer)
+    number_of_groups = Column(Integer)
+    list_of_groups = Column(JSON)
+    number_of_targets = Column(Integer)
+    start_date = Column(DateTime)
+    end_date = Column(DateTime)
+    train_start_date = Column(DateTime)
+    train_end_date = Column(DateTime)
+    val_start_date = Column(DateTime)
+    val_end_date = Column(DateTime)
+    test_start_date = Column(DateTime)
+    test_end_date = Column(DateTime)
+    context = Column(JSON)
+    feature_selections = relationship(
+        "FeatureSelection",
+        back_populates="experiment",
+        cascade="all, delete-orphan",
+        lazy="selectin",
+    )
+    targets = relationship(
+        "Target",
+        secondary=lecrapaud_experiment_target_association,
+        back_populates="experiments",
+        lazy="selectin",
+    )
+    __table_args__ = (
+        UniqueConstraint(
+            "name",
+            name="uq_experiments_composite",
+        ),
+    )
     # Relationships
     model_selections = relationship(
@@ -68,16 +101,9 @@ class Experiment(Base):
         """Best RMSE score across all model selections and trainings."""
         # Get the minimum RMSE for each model selection
         min_scores = [
-            min(
-                mss.rmse
-                for mss in ms.model_selection_scores
-                if mss.rmse is not None
-            )
+            min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
             for ms in self.model_selections
-            if any(
-                mss.rmse is not None
-                for mss in ms.model_selection_scores
-            )
+            if any(mss.rmse is not None for mss in ms.model_selection_scores)
         ]
         return min(min_scores) if min_scores else None
@@ -92,10 +118,7 @@ class Experiment(Base):
                 if mss.logloss is not None
             )
             for ms in self.model_selections
-            if any(
-                mss.logloss is not None
-                for mss in ms.model_selection_scores
-            )
+            if any(mss.logloss is not None for mss in ms.model_selection_scores)
         ]
         return min(min_scores) if min_scores else None
@@ -104,16 +127,9 @@ class Experiment(Base):
         """Average RMSE score across all model selections and trainings."""
         # Get the minimum RMSE for each model selection
         min_scores = [
-            min(
-                mss.rmse
-                for mss in ms.model_selection_scores
-                if mss.rmse is not None
-            )
+            min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
             for ms in self.model_selections
-            if any(
-                mss.rmse is not None
-                for mss in ms.model_selection_scores
-            )
+            if any(mss.rmse is not None for mss in ms.model_selection_scores)
         ]
         return mean(min_scores) if min_scores else None
@@ -128,50 +144,10 @@ class Experiment(Base):
                 if mss.logloss is not None
             )
             for ms in self.model_selections
-            if any(
-                mss.logloss is not None
-                for mss in ms.model_selection_scores
-            )
+            if any(mss.logloss is not None for mss in ms.model_selection_scores)
         ]
         return mean(min_scores) if min_scores else None
-    test_size = Column(Integer)
-    corr_threshold = Column(Float, nullable=False)
-    max_features = Column(Integer, nullable=False)
-    percentile = Column(Float, nullable=False)
-    number_of_groups = Column(Integer)
-    list_of_groups = Column(JSON)
-    start_date = Column(DateTime)
-    end_date = Column(DateTime)
-    train_start_date = Column(DateTime)
-    train_end_date = Column(DateTime)
-    val_start_date = Column(DateTime)
-    val_end_date = Column(DateTime)
-    test_start_date = Column(DateTime)
-    test_end_date = Column(DateTime)
-    context = Column(JSON)
-    feature_selections = relationship(
-        "FeatureSelection",
-        back_populates="experiment",
-        cascade="all, delete-orphan",
-        lazy="selectin",
-    )
-    targets = relationship(
-        "Target",
-        secondary=lecrapaud_experiment_target_association,
-        back_populates="experiments",
-        lazy="selectin",
-    )
-    __table_args__ = (
-        UniqueConstraint(
-            "name",
-            name="uq_experiments_composite",
-        ),
-    )
     @classmethod
     @with_db
     def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
@@ -354,19 +330,18 @@ class Experiment(Base):
         # Get the best model score based on lowest logloss or rmse
         model_scores = best_model_selection.model_selection_scores
         # Determine if we should use logloss or rmse based on what's available
         if any(ms.logloss is not None for ms in model_scores):
             # Classification: find lowest logloss
             best_score = min(
                 (ms for ms in model_scores if ms.logloss is not None),
-                key=lambda x: x.logloss
+                key=lambda x: x.logloss,
             )
         elif any(ms.rmse is not None for ms in model_scores):
             # Regression: find lowest rmse
             best_score = min(
-                (ms for ms in model_scores if ms.rmse is not None),
-                key=lambda x: x.rmse
+                (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
             )
         else:
             return {
@@ -398,12 +373,8 @@ class Experiment(Base):
         # Get the model info
         model_info = {
-            "model_type": (
-                score.model.model_type if score.model else "unknown"
-            ),
-            "model_name": (
-                score.model.name if score.model else "unknown"
-            ),
+            "model_type": (score.model.model_type if score.model else "unknown"),
+            "model_name": (score.model.name if score.model else "unknown"),
             "training_time_seconds": score.training_time,
         }
@@ -434,7 +405,9 @@ class Experiment(Base):
         return features
     @with_db
-    def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
+    def get_all_features(
+        self, date_column: str = None, group_column: str = None, db=None
+    ):
         # Ensure we have a fresh instance attached to the session
         self = db.merge(self)
         target_idx = [target.id for target in self.targets]

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/experiment.py RENAMED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import pandas as pd
 import joblib
+from datetime import datetime
 # Set up coverage file path
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
@@ -15,9 +16,6 @@ from lecrapaud.db.session import get_db
 def create_experiment(
     data: pd.DataFrame | str,
-    corr_threshold,
-    percentile,
-    max_features,
     date_column,
     group_column,
     experiment_name,
@@ -42,7 +40,10 @@ def create_experiment(
         targets = [
             target for target in all_targets if target.name in data.columns.str.upper()
         ]
-        experiment_name = f"{experiment_name}_{groups["number_of_groups"] if group_column else 'ng'}_{corr_threshold}_{percentile}_{max_features}_{dates['start_date'].date() if date_column else 'nd'}_{dates['end_date'].date() if date_column else 'nd'}"
+        experiment_name = (
+            f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        )
+        number_of_targets = len(targets)
         experiment_dir = f"{tmp_dir}/{experiment_name}"
         preprocessing_dir = f"{experiment_dir}/preprocessing"
@@ -55,26 +56,20 @@ def create_experiment(
             db=db,
             name=experiment_name,
             path=Path(experiment_dir).resolve(),
-            type="training",
             size=data.shape[0],
-            corr_threshold=corr_threshold,
-            percentile=percentile,
-            max_features=max_features,
+            number_of_targets=number_of_targets,
             **groups,
             **dates,
             context={
-                "corr_threshold": corr_threshold,
-                "percentile": percentile,
-                "max_features": max_features,
                 "date_column": date_column,
                 "group_column": group_column,
                 "experiment_name": experiment_name,
                 **kwargs,
             },
         )
         # Set targets relationship after creation/update
         experiment.targets = targets
         experiment.save(db=db)
         return experiment

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/feature_engineering.py RENAMED Viewed

@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
     def __init__(
         self,
         data: pd.DataFrame,
-        columns_drop: list[str] = [],
-        columns_boolean: list[str] = [],
-        columns_date: list[str] = [],
-        columns_te_groupby: list[str] = [],
-        columns_te_target: list[str] = [],
+        experiment,
         for_training: bool = True,
         **kwargs,
     ):
         self.data = data
-        self.columns_drop = columns_drop
-        self.columns_boolean = columns_boolean
-        self.columns_date = columns_date
-        self.columns_te_groupby = columns_te_groupby
-        self.columns_te_target = columns_te_target
+        self.experiment = experiment
         self.for_training = for_training
+        # Get all parameters from experiment context
+        self.columns_drop = self.experiment.context.get("columns_drop", [])
+        self.columns_boolean = self.experiment.context.get("columns_boolean", [])
+        self.columns_date = self.experiment.context.get("columns_date", [])
+        self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
+        self.columns_te_target = self.experiment.context.get("columns_te_target", [])
     def run(self) -> pd.DataFrame:
         # drop columns
@@ -316,41 +315,30 @@ class PreprocessFeature:
         self,
         data: pd.DataFrame,
         experiment,
-        time_series: bool = False,
-        date_column: str | None = None,
-        group_column: str | None = None,
-        val_size: float = 0.2,
-        test_size: float = 0.2,
-        columns_pca: list[str] = [],
-        pca_temporal: list[dict[str, list[str]]] = [],
-        pca_cross_sectional: list[dict[str, list[str]]] = [],
-        columns_onehot: list[str] = [],
-        columns_binary: list[str] = [],
-        columns_ordinal: list[str] = [],
-        columns_frequency: list[str] = [],
-        target_numbers: list = [],
-        target_clf: list = [],
         **kwargs,
     ):
         self.data = data
         self.data.columns = self.data.columns.str.upper()
         self.experiment = experiment
-        self.columns_pca = [col.upper() for col in columns_pca]
-        self.pca_temporal = pca_temporal
-        self.pca_cross_sectional = pca_cross_sectional
-        self.columns_onehot = [col.upper() for col in columns_onehot]
-        self.columns_binary = [col.upper() for col in columns_binary]
-        self.columns_ordinal = [col.upper() for col in columns_ordinal]
-        self.columns_frequency = [col.upper() for col in columns_frequency]
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.val_size = val_size
-        self.test_size = test_size
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+        self.val_size = context.get("val_size", 0.2)
+        self.test_size = context.get("test_size", 0.2)
+        self.target_numbers = context.get("target_numbers", [])
+        self.target_clf = context.get("target_clf", [])
+        # Handle list parameters with uppercase conversion
+        self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
+        self.pca_temporal = context.get("pca_temporal", [])
+        self.pca_cross_sectional = context.get("pca_cross_sectional", [])
+        self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
+        self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
+        self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
+        self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/feature_selection.py RENAMED Viewed

@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
 class FeatureSelectionEngine:
-    def __init__(self, train, experiment, target_number, target_clf, **kwargs):
+    def __init__(self, train, experiment, target_number, **kwargs):
         self.experiment = experiment
         self.train = train
         self.target_number = target_number
-        self.target_clf = target_clf
+        # Get all parameters from experiment context
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
+        self.percentile = self.experiment.context.get("percentile", 20)
+        self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
+        self.max_features = self.experiment.context.get("max_features", 50)
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.percentile = self.experiment.percentile
-        self.corr_threshold = self.experiment.corr_threshold
-        self.max_features = self.experiment.max_features
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
@@ -274,6 +277,38 @@ class FeatureSelectionEngine:
         features_selected.drop_duplicates("features", inplace=True)
         features_selected_list = features_selected["features"].values.tolist()
+        # Save ensemble features before correlation (aggregated features)
+        logger.info("Saving ensemble features before correlation...")
+        all_features_in_data = self.X.columns.tolist()
+        ensemble_rows = []
+        # Add global rank for selected features
+        features_selected_with_global_rank = features_selected.copy()
+        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+        for feature in all_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+                global_rank = None
+                if is_selected:
+                    global_rank = features_selected_with_global_rank[
+                        features_selected_with_global_rank["features"] == feature
+                    ]["global_rank"].values[0]
+                ensemble_rows.append({
+                    "feature_selection_id": feature_selection.id,
+                    "feature_id": feature_id,
+                    "method": "ensemble",
+                    "score": None,
+                    "pvalue": None,
+                    "support": 2 if is_selected else 0,  # 2 = in aggregated features
+                    "rank": global_rank,
+                    "training_time": 0,
+                })
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
@@ -302,12 +337,46 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
         features = features[:max_features]
         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features with categorical features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+        # Add categorical features to ensemble if not already present
+        if target_type == "classification":
+            for cat_feature in categorical_features_selected:
+                feature_id = feature_map.get(cat_feature)
+                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
+                    ensemble_rows.append({
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": 2,  # 2 = in final selection (categorical)
+                        "rank": None,  # No rank for categorical features added at the end
+                        "training_time": 0,
+                    })
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
@@ -440,13 +509,18 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time
         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )
         feat_scores.to_csv(
@@ -803,33 +877,28 @@ class PreprocessModel:
         val,
         test,
         experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
         **kwargs,
     ):
         self.train = train
         self.val = val
         self.test = test
         self.experiment = experiment
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.max_timesteps = max_timesteps
-        self.group_column = group_column
-        self.date_column = date_column
+        # Get all parameters from experiment context
+        self.target_numbers = self.experiment.context.get("target_numbers", [])
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.models_idx = self.experiment.context.get("models_idx", [])
+        self.time_series = self.experiment.context.get("time_series", False)
+        self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
+        self.group_column = self.experiment.context.get("group_column", None)
+        self.date_column = self.experiment.context.get("date_column", None)
         self.experiment_dir = experiment.path
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
         self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
+            date_column=self.date_column, group_column=self.group_column
         )
     def run(self):

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/model_selection.py RENAMED Viewed

@@ -1017,24 +1017,24 @@ class ModelSelectionEngine:
         data,
         reshaped_data,
         target_number,
-        target_clf,
         experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
         **kwargs,
     ):
         self.data = data
         self.reshaped_data = reshaped_data
         self.target_number = target_number
         self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.target_clf = context.get("target_clf", [])
+        self.models_idx = context.get("models_idx", [])
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+        # Handle target_clf_thresholds
+        target_clf_thresholds = context.get("target_clf_thresholds", {})
         self.target_clf_thresholds = (
             target_clf_thresholds[target_number]
             if target_number in target_clf_thresholds.keys()
@@ -1056,25 +1056,19 @@ class ModelSelectionEngine:
         )
     # Main training function
-    def run(
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,  # This controls CV during hyperopt, not after
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def run(self, best_params=None):
         """
         Selects the best models based on a target variable, optionally performing hyperparameter optimization
         and cross-validation, and manages outputs in a session-specific directory.
         """
-        self.experiment_name = experiment_name
-        self.plot = plot
-        self.number_of_trials = number_of_trials
-        self.perform_crossval = perform_crossval
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
@@ -1141,13 +1135,13 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
             logger.info(
-                f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
             # Getting data
@@ -1204,7 +1198,7 @@ class ModelSelectionEngine:
             # Tuning hyperparameters
             start = time.time()
-            if perform_hyperopt:
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )

{lecrapaud-0.20.0 → lecrapaud-0.20.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.20.0"
+version = "0.20.1"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}