lecrapaud 0.20.0__py3-none-any.whl → 0.20.2__py3-none-any.whl
This diff shows the content changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud has been flagged as potentially problematic; consult the package's registry page for the full advisory details.
- lecrapaud/api.py +11 -49
- lecrapaud/config.py +3 -2
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +42 -0
- lecrapaud/db/models/experiment.py +48 -75
- lecrapaud/experiment.py +13 -15
- lecrapaud/feature_engineering.py +28 -40
- lecrapaud/feature_selection.py +90 -21
- lecrapaud/model_selection.py +24 -30
- lecrapaud/utils.py +4 -4
- lecrapaud-0.20.2.dist-info/METADATA +344 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/RECORD +13 -12
- lecrapaud-0.20.0.dist-info/METADATA +0 -250
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/WHEEL +0 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/licenses/LICENSE +0 -0
lecrapaud/api.py
CHANGED
|
@@ -165,6 +165,12 @@ class ExperimentEngine:
|
|
|
165
165
|
|
|
166
166
|
def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
|
|
167
167
|
"""Initialize the experiment engine with either new or existing experiment."""
|
|
168
|
+
# Set all kwargs as instance attributes
|
|
169
|
+
if "models_idx" in kwargs:
|
|
170
|
+
kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
|
|
171
|
+
for key, value in kwargs.items():
|
|
172
|
+
setattr(self, key, value)
|
|
173
|
+
|
|
168
174
|
if id:
|
|
169
175
|
self.experiment = Experiment.get(id)
|
|
170
176
|
kwargs.update(self.experiment.context)
|
|
@@ -180,12 +186,6 @@ class ExperimentEngine:
|
|
|
180
186
|
)
|
|
181
187
|
self.experiment = create_experiment(data=data, **kwargs)
|
|
182
188
|
|
|
183
|
-
# Set all kwargs as instance attributes
|
|
184
|
-
for key, value in kwargs.items():
|
|
185
|
-
if key == "models_idx":
|
|
186
|
-
value = normalize_models_idx(value)
|
|
187
|
-
setattr(self, key, value)
|
|
188
|
-
|
|
189
189
|
def train(self, data, best_params=None):
|
|
190
190
|
logger.info("Running training...")
|
|
191
191
|
|
|
@@ -309,12 +309,8 @@ class ExperimentEngine:
|
|
|
309
309
|
def feature_engineering(self, data, for_training=True):
|
|
310
310
|
app = FeatureEngineeringEngine(
|
|
311
311
|
data=data,
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
columns_date=getattr(self, "columns_date", []),
|
|
315
|
-
columns_te_groupby=getattr(self, "columns_te_groupby", []),
|
|
316
|
-
columns_te_target=getattr(self, "columns_te_target", []),
|
|
317
|
-
for_training=getattr(self, "for_training", True),
|
|
312
|
+
experiment=self.experiment,
|
|
313
|
+
for_training=for_training,
|
|
318
314
|
)
|
|
319
315
|
data = app.run()
|
|
320
316
|
return data
|
|
@@ -322,21 +318,7 @@ class ExperimentEngine:
|
|
|
322
318
|
def preprocess_feature(self, data, for_training=True):
|
|
323
319
|
app = PreprocessFeature(
|
|
324
320
|
data=data,
|
|
325
|
-
experiment=
|
|
326
|
-
time_series=getattr(self, "time_series", False),
|
|
327
|
-
date_column=getattr(self, "date_column", None),
|
|
328
|
-
group_column=getattr(self, "group_column", None),
|
|
329
|
-
val_size=getattr(self, "val_size", 0.2),
|
|
330
|
-
test_size=getattr(self, "test_size", 0.2),
|
|
331
|
-
columns_pca=getattr(self, "columns_pca", []),
|
|
332
|
-
pca_temporal=getattr(self, "pca_temporal", []),
|
|
333
|
-
pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
|
|
334
|
-
columns_onehot=getattr(self, "columns_onehot", []),
|
|
335
|
-
columns_binary=getattr(self, "columns_binary", []),
|
|
336
|
-
columns_ordinal=getattr(self, "columns_ordinal", []),
|
|
337
|
-
columns_frequency=getattr(self, "columns_frequency", []),
|
|
338
|
-
target_numbers=getattr(self, "target_numbers", []),
|
|
339
|
-
target_clf=getattr(self, "target_clf", []),
|
|
321
|
+
experiment=self.experiment,
|
|
340
322
|
)
|
|
341
323
|
if for_training:
|
|
342
324
|
train, val, test = app.run()
|
|
@@ -351,7 +333,6 @@ class ExperimentEngine:
|
|
|
351
333
|
train=train,
|
|
352
334
|
target_number=target_number,
|
|
353
335
|
experiment=self.experiment,
|
|
354
|
-
target_clf=self.target_clf,
|
|
355
336
|
)
|
|
356
337
|
app.run()
|
|
357
338
|
self.experiment = Experiment.get(self.experiment.id)
|
|
@@ -368,14 +349,7 @@ class ExperimentEngine:
|
|
|
368
349
|
train=train,
|
|
369
350
|
val=val,
|
|
370
351
|
test=test,
|
|
371
|
-
experiment=
|
|
372
|
-
target_numbers=getattr(self, "target_numbers", []),
|
|
373
|
-
target_clf=getattr(self, "target_clf", []),
|
|
374
|
-
models_idx=getattr(self, "models_idx", []),
|
|
375
|
-
time_series=getattr(self, "time_series", False),
|
|
376
|
-
max_timesteps=getattr(self, "max_timesteps", 120),
|
|
377
|
-
date_column=getattr(self, "date_column", None),
|
|
378
|
-
group_column=getattr(self, "group_column", None),
|
|
352
|
+
experiment=self.experiment,
|
|
379
353
|
)
|
|
380
354
|
if for_training:
|
|
381
355
|
data, reshaped_data = app.run()
|
|
@@ -390,25 +364,13 @@ class ExperimentEngine:
|
|
|
390
364
|
data=data,
|
|
391
365
|
reshaped_data=reshaped_data,
|
|
392
366
|
target_number=target_number,
|
|
393
|
-
experiment=
|
|
394
|
-
target_clf=getattr(self, "target_clf", []),
|
|
395
|
-
models_idx=getattr(self, "models_idx", []),
|
|
396
|
-
time_series=getattr(self, "time_series", False),
|
|
397
|
-
date_column=getattr(self, "date_column", None),
|
|
398
|
-
group_column=getattr(self, "group_column", None),
|
|
399
|
-
target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
|
|
367
|
+
experiment=self.experiment,
|
|
400
368
|
)
|
|
401
369
|
if best_params and target_number not in best_params.keys():
|
|
402
370
|
raise ValueError(
|
|
403
371
|
f"Target {target_number} not found in best_params passed as argument"
|
|
404
372
|
)
|
|
405
373
|
app.run(
|
|
406
|
-
self.experiment_name,
|
|
407
|
-
perform_hyperopt=self.perform_hyperopt,
|
|
408
|
-
number_of_trials=self.number_of_trials,
|
|
409
|
-
perform_crossval=self.perform_crossval,
|
|
410
|
-
plot=self.plot,
|
|
411
|
-
preserve_model=self.preserve_model,
|
|
412
374
|
best_params=best_params[target_number] if best_params else None,
|
|
413
375
|
)
|
|
414
376
|
|
lecrapaud/config.py
CHANGED
|
@@ -32,6 +32,7 @@ DB_URI: str = (
|
|
|
32
32
|
)
|
|
33
33
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
34
34
|
LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
|
|
35
|
-
LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)
|
|
36
35
|
LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
|
|
37
|
-
LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
|
|
36
|
+
LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
|
|
37
|
+
"LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
|
|
38
|
+
).lower()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""add number_of_targets and remove other fields from experiments
|
|
2
|
+
|
|
3
|
+
Revision ID: 0a8fb7826e9b
|
|
4
|
+
Revises: 033e0f7eca4f
|
|
5
|
+
Create Date: 2025-10-28 20:06:54.792631
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from typing import Sequence, Union
|
|
9
|
+
|
|
10
|
+
from alembic import op
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
from sqlalchemy.dialects import mysql
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision: str = '0a8fb7826e9b'
|
|
16
|
+
down_revision: Union[str, None] = '033e0f7eca4f'
|
|
17
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
18
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
|
|
24
|
+
op.drop_column('lecrapaud_experiments', 'corr_threshold')
|
|
25
|
+
op.drop_column('lecrapaud_experiments', 'max_features')
|
|
26
|
+
op.drop_column('lecrapaud_experiments', 'percentile')
|
|
27
|
+
op.drop_column('lecrapaud_experiments', 'type')
|
|
28
|
+
op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
|
|
29
|
+
op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
|
|
30
|
+
# ### end Alembic commands ###
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def downgrade() -> None:
|
|
34
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
35
|
+
op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
|
|
36
|
+
op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
|
|
37
|
+
op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
|
|
38
|
+
op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
|
|
39
|
+
op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
|
|
40
|
+
op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
|
|
41
|
+
op.drop_column('lecrapaud_experiments', 'number_of_targets')
|
|
42
|
+
# ### end Alembic commands ###
|
|
@@ -50,10 +50,43 @@ class Experiment(Base):
|
|
|
50
50
|
)
|
|
51
51
|
name = Column(String(255), nullable=False)
|
|
52
52
|
path = Column(String(255)) # we do not have this at creation time
|
|
53
|
-
type = Column(String(50), nullable=False)
|
|
54
53
|
size = Column(Integer, nullable=False)
|
|
55
54
|
train_size = Column(Integer)
|
|
56
55
|
val_size = Column(Integer)
|
|
56
|
+
test_size = Column(Integer)
|
|
57
|
+
number_of_groups = Column(Integer)
|
|
58
|
+
list_of_groups = Column(JSON)
|
|
59
|
+
number_of_targets = Column(Integer)
|
|
60
|
+
start_date = Column(DateTime)
|
|
61
|
+
end_date = Column(DateTime)
|
|
62
|
+
train_start_date = Column(DateTime)
|
|
63
|
+
train_end_date = Column(DateTime)
|
|
64
|
+
val_start_date = Column(DateTime)
|
|
65
|
+
val_end_date = Column(DateTime)
|
|
66
|
+
test_start_date = Column(DateTime)
|
|
67
|
+
test_end_date = Column(DateTime)
|
|
68
|
+
context = Column(JSON)
|
|
69
|
+
|
|
70
|
+
feature_selections = relationship(
|
|
71
|
+
"FeatureSelection",
|
|
72
|
+
back_populates="experiment",
|
|
73
|
+
cascade="all, delete-orphan",
|
|
74
|
+
lazy="selectin",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
targets = relationship(
|
|
78
|
+
"Target",
|
|
79
|
+
secondary=lecrapaud_experiment_target_association,
|
|
80
|
+
back_populates="experiments",
|
|
81
|
+
lazy="selectin",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
__table_args__ = (
|
|
85
|
+
UniqueConstraint(
|
|
86
|
+
"name",
|
|
87
|
+
name="uq_experiments_composite",
|
|
88
|
+
),
|
|
89
|
+
)
|
|
57
90
|
|
|
58
91
|
# Relationships
|
|
59
92
|
model_selections = relationship(
|
|
@@ -68,16 +101,9 @@ class Experiment(Base):
|
|
|
68
101
|
"""Best RMSE score across all model selections and trainings."""
|
|
69
102
|
# Get the minimum RMSE for each model selection
|
|
70
103
|
min_scores = [
|
|
71
|
-
min(
|
|
72
|
-
mss.rmse
|
|
73
|
-
for mss in ms.model_selection_scores
|
|
74
|
-
if mss.rmse is not None
|
|
75
|
-
)
|
|
104
|
+
min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
|
|
76
105
|
for ms in self.model_selections
|
|
77
|
-
if any(
|
|
78
|
-
mss.rmse is not None
|
|
79
|
-
for mss in ms.model_selection_scores
|
|
80
|
-
)
|
|
106
|
+
if any(mss.rmse is not None for mss in ms.model_selection_scores)
|
|
81
107
|
]
|
|
82
108
|
return min(min_scores) if min_scores else None
|
|
83
109
|
|
|
@@ -92,10 +118,7 @@ class Experiment(Base):
|
|
|
92
118
|
if mss.logloss is not None
|
|
93
119
|
)
|
|
94
120
|
for ms in self.model_selections
|
|
95
|
-
if any(
|
|
96
|
-
mss.logloss is not None
|
|
97
|
-
for mss in ms.model_selection_scores
|
|
98
|
-
)
|
|
121
|
+
if any(mss.logloss is not None for mss in ms.model_selection_scores)
|
|
99
122
|
]
|
|
100
123
|
return min(min_scores) if min_scores else None
|
|
101
124
|
|
|
@@ -104,16 +127,9 @@ class Experiment(Base):
|
|
|
104
127
|
"""Average RMSE score across all model selections and trainings."""
|
|
105
128
|
# Get the minimum RMSE for each model selection
|
|
106
129
|
min_scores = [
|
|
107
|
-
min(
|
|
108
|
-
mss.rmse
|
|
109
|
-
for mss in ms.model_selection_scores
|
|
110
|
-
if mss.rmse is not None
|
|
111
|
-
)
|
|
130
|
+
min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
|
|
112
131
|
for ms in self.model_selections
|
|
113
|
-
if any(
|
|
114
|
-
mss.rmse is not None
|
|
115
|
-
for mss in ms.model_selection_scores
|
|
116
|
-
)
|
|
132
|
+
if any(mss.rmse is not None for mss in ms.model_selection_scores)
|
|
117
133
|
]
|
|
118
134
|
return mean(min_scores) if min_scores else None
|
|
119
135
|
|
|
@@ -128,50 +144,10 @@ class Experiment(Base):
|
|
|
128
144
|
if mss.logloss is not None
|
|
129
145
|
)
|
|
130
146
|
for ms in self.model_selections
|
|
131
|
-
if any(
|
|
132
|
-
mss.logloss is not None
|
|
133
|
-
for mss in ms.model_selection_scores
|
|
134
|
-
)
|
|
147
|
+
if any(mss.logloss is not None for mss in ms.model_selection_scores)
|
|
135
148
|
]
|
|
136
149
|
return mean(min_scores) if min_scores else None
|
|
137
150
|
|
|
138
|
-
test_size = Column(Integer)
|
|
139
|
-
corr_threshold = Column(Float, nullable=False)
|
|
140
|
-
max_features = Column(Integer, nullable=False)
|
|
141
|
-
percentile = Column(Float, nullable=False)
|
|
142
|
-
number_of_groups = Column(Integer)
|
|
143
|
-
list_of_groups = Column(JSON)
|
|
144
|
-
start_date = Column(DateTime)
|
|
145
|
-
end_date = Column(DateTime)
|
|
146
|
-
train_start_date = Column(DateTime)
|
|
147
|
-
train_end_date = Column(DateTime)
|
|
148
|
-
val_start_date = Column(DateTime)
|
|
149
|
-
val_end_date = Column(DateTime)
|
|
150
|
-
test_start_date = Column(DateTime)
|
|
151
|
-
test_end_date = Column(DateTime)
|
|
152
|
-
context = Column(JSON)
|
|
153
|
-
|
|
154
|
-
feature_selections = relationship(
|
|
155
|
-
"FeatureSelection",
|
|
156
|
-
back_populates="experiment",
|
|
157
|
-
cascade="all, delete-orphan",
|
|
158
|
-
lazy="selectin",
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
targets = relationship(
|
|
162
|
-
"Target",
|
|
163
|
-
secondary=lecrapaud_experiment_target_association,
|
|
164
|
-
back_populates="experiments",
|
|
165
|
-
lazy="selectin",
|
|
166
|
-
)
|
|
167
|
-
|
|
168
|
-
__table_args__ = (
|
|
169
|
-
UniqueConstraint(
|
|
170
|
-
"name",
|
|
171
|
-
name="uq_experiments_composite",
|
|
172
|
-
),
|
|
173
|
-
)
|
|
174
|
-
|
|
175
151
|
@classmethod
|
|
176
152
|
@with_db
|
|
177
153
|
def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
|
|
@@ -354,19 +330,18 @@ class Experiment(Base):
|
|
|
354
330
|
|
|
355
331
|
# Get the best model score based on lowest logloss or rmse
|
|
356
332
|
model_scores = best_model_selection.model_selection_scores
|
|
357
|
-
|
|
333
|
+
|
|
358
334
|
# Determine if we should use logloss or rmse based on what's available
|
|
359
335
|
if any(ms.logloss is not None for ms in model_scores):
|
|
360
336
|
# Classification: find lowest logloss
|
|
361
337
|
best_score = min(
|
|
362
338
|
(ms for ms in model_scores if ms.logloss is not None),
|
|
363
|
-
key=lambda x: x.logloss
|
|
339
|
+
key=lambda x: x.logloss,
|
|
364
340
|
)
|
|
365
341
|
elif any(ms.rmse is not None for ms in model_scores):
|
|
366
342
|
# Regression: find lowest rmse
|
|
367
343
|
best_score = min(
|
|
368
|
-
(ms for ms in model_scores if ms.rmse is not None),
|
|
369
|
-
key=lambda x: x.rmse
|
|
344
|
+
(ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
|
|
370
345
|
)
|
|
371
346
|
else:
|
|
372
347
|
return {
|
|
@@ -398,12 +373,8 @@ class Experiment(Base):
|
|
|
398
373
|
|
|
399
374
|
# Get the model info
|
|
400
375
|
model_info = {
|
|
401
|
-
"model_type": (
|
|
402
|
-
|
|
403
|
-
),
|
|
404
|
-
"model_name": (
|
|
405
|
-
score.model.name if score.model else "unknown"
|
|
406
|
-
),
|
|
376
|
+
"model_type": (score.model.model_type if score.model else "unknown"),
|
|
377
|
+
"model_name": (score.model.name if score.model else "unknown"),
|
|
407
378
|
"training_time_seconds": score.training_time,
|
|
408
379
|
}
|
|
409
380
|
|
|
@@ -434,7 +405,9 @@ class Experiment(Base):
|
|
|
434
405
|
return features
|
|
435
406
|
|
|
436
407
|
@with_db
|
|
437
|
-
def get_all_features(
|
|
408
|
+
def get_all_features(
|
|
409
|
+
self, date_column: str = None, group_column: str = None, db=None
|
|
410
|
+
):
|
|
438
411
|
# Ensure we have a fresh instance attached to the session
|
|
439
412
|
self = db.merge(self)
|
|
440
413
|
target_idx = [target.id for target in self.targets]
|
lecrapaud/experiment.py
CHANGED
|
@@ -3,6 +3,7 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import joblib
|
|
6
|
+
from datetime import datetime
|
|
6
7
|
|
|
7
8
|
# Set up coverage file path
|
|
8
9
|
os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
|
|
@@ -15,18 +16,18 @@ from lecrapaud.db.session import get_db
|
|
|
15
16
|
|
|
16
17
|
def create_experiment(
|
|
17
18
|
data: pd.DataFrame | str,
|
|
18
|
-
corr_threshold,
|
|
19
|
-
percentile,
|
|
20
|
-
max_features,
|
|
21
|
-
date_column,
|
|
22
|
-
group_column,
|
|
23
19
|
experiment_name,
|
|
20
|
+
date_column=None,
|
|
21
|
+
group_column=None,
|
|
24
22
|
**kwargs,
|
|
25
23
|
):
|
|
26
24
|
if isinstance(data, str):
|
|
27
25
|
path = f"{data}/data/full.pkl"
|
|
28
26
|
data = joblib.load(path)
|
|
29
27
|
|
|
28
|
+
if kwargs.get("time_series") and not date_column:
|
|
29
|
+
raise ValueError("date_column must be provided for time series experiments")
|
|
30
|
+
|
|
30
31
|
dates = {}
|
|
31
32
|
if date_column:
|
|
32
33
|
dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
|
|
@@ -42,7 +43,10 @@ def create_experiment(
|
|
|
42
43
|
targets = [
|
|
43
44
|
target for target in all_targets if target.name in data.columns.str.upper()
|
|
44
45
|
]
|
|
45
|
-
experiment_name =
|
|
46
|
+
experiment_name = (
|
|
47
|
+
f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
48
|
+
)
|
|
49
|
+
number_of_targets = len(targets)
|
|
46
50
|
|
|
47
51
|
experiment_dir = f"{tmp_dir}/{experiment_name}"
|
|
48
52
|
preprocessing_dir = f"{experiment_dir}/preprocessing"
|
|
@@ -55,26 +59,20 @@ def create_experiment(
|
|
|
55
59
|
db=db,
|
|
56
60
|
name=experiment_name,
|
|
57
61
|
path=Path(experiment_dir).resolve(),
|
|
58
|
-
type="training",
|
|
59
62
|
size=data.shape[0],
|
|
60
|
-
|
|
61
|
-
percentile=percentile,
|
|
62
|
-
max_features=max_features,
|
|
63
|
+
number_of_targets=number_of_targets,
|
|
63
64
|
**groups,
|
|
64
65
|
**dates,
|
|
65
66
|
context={
|
|
66
|
-
"corr_threshold": corr_threshold,
|
|
67
|
-
"percentile": percentile,
|
|
68
|
-
"max_features": max_features,
|
|
69
67
|
"date_column": date_column,
|
|
70
68
|
"group_column": group_column,
|
|
71
69
|
"experiment_name": experiment_name,
|
|
72
70
|
**kwargs,
|
|
73
71
|
},
|
|
74
72
|
)
|
|
75
|
-
|
|
73
|
+
|
|
76
74
|
# Set targets relationship after creation/update
|
|
77
75
|
experiment.targets = targets
|
|
78
76
|
experiment.save(db=db)
|
|
79
|
-
|
|
77
|
+
|
|
80
78
|
return experiment
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
|
|
|
87
87
|
def __init__(
|
|
88
88
|
self,
|
|
89
89
|
data: pd.DataFrame,
|
|
90
|
-
|
|
91
|
-
columns_boolean: list[str] = [],
|
|
92
|
-
columns_date: list[str] = [],
|
|
93
|
-
columns_te_groupby: list[str] = [],
|
|
94
|
-
columns_te_target: list[str] = [],
|
|
90
|
+
experiment,
|
|
95
91
|
for_training: bool = True,
|
|
96
92
|
**kwargs,
|
|
97
93
|
):
|
|
98
94
|
self.data = data
|
|
99
|
-
self.
|
|
100
|
-
self.columns_boolean = columns_boolean
|
|
101
|
-
self.columns_date = columns_date
|
|
102
|
-
self.columns_te_groupby = columns_te_groupby
|
|
103
|
-
self.columns_te_target = columns_te_target
|
|
95
|
+
self.experiment = experiment
|
|
104
96
|
self.for_training = for_training
|
|
97
|
+
|
|
98
|
+
# Get all parameters from experiment context
|
|
99
|
+
self.columns_drop = self.experiment.context.get("columns_drop", [])
|
|
100
|
+
self.columns_boolean = self.experiment.context.get("columns_boolean", [])
|
|
101
|
+
self.columns_date = self.experiment.context.get("columns_date", [])
|
|
102
|
+
self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
|
|
103
|
+
self.columns_te_target = self.experiment.context.get("columns_te_target", [])
|
|
105
104
|
|
|
106
105
|
def run(self) -> pd.DataFrame:
|
|
107
106
|
# drop columns
|
|
@@ -316,41 +315,30 @@ class PreprocessFeature:
|
|
|
316
315
|
self,
|
|
317
316
|
data: pd.DataFrame,
|
|
318
317
|
experiment,
|
|
319
|
-
time_series: bool = False,
|
|
320
|
-
date_column: str | None = None,
|
|
321
|
-
group_column: str | None = None,
|
|
322
|
-
val_size: float = 0.2,
|
|
323
|
-
test_size: float = 0.2,
|
|
324
|
-
columns_pca: list[str] = [],
|
|
325
|
-
pca_temporal: list[dict[str, list[str]]] = [],
|
|
326
|
-
pca_cross_sectional: list[dict[str, list[str]]] = [],
|
|
327
|
-
columns_onehot: list[str] = [],
|
|
328
|
-
columns_binary: list[str] = [],
|
|
329
|
-
columns_ordinal: list[str] = [],
|
|
330
|
-
columns_frequency: list[str] = [],
|
|
331
|
-
target_numbers: list = [],
|
|
332
|
-
target_clf: list = [],
|
|
333
318
|
**kwargs,
|
|
334
319
|
):
|
|
335
320
|
self.data = data
|
|
336
321
|
self.data.columns = self.data.columns.str.upper()
|
|
337
|
-
|
|
338
322
|
self.experiment = experiment
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
self.
|
|
343
|
-
self.
|
|
344
|
-
self.
|
|
345
|
-
self.
|
|
346
|
-
self.
|
|
347
|
-
self.
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
self.
|
|
352
|
-
self.
|
|
353
|
-
self.
|
|
323
|
+
|
|
324
|
+
# Get all parameters from experiment context
|
|
325
|
+
context = self.experiment.context
|
|
326
|
+
self.time_series = context.get("time_series", False)
|
|
327
|
+
self.date_column = context.get("date_column", None)
|
|
328
|
+
self.group_column = context.get("group_column", None)
|
|
329
|
+
self.val_size = context.get("val_size", 0.2)
|
|
330
|
+
self.test_size = context.get("test_size", 0.2)
|
|
331
|
+
self.target_numbers = context.get("target_numbers", [])
|
|
332
|
+
self.target_clf = context.get("target_clf", [])
|
|
333
|
+
|
|
334
|
+
# Handle list parameters with uppercase conversion
|
|
335
|
+
self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
|
|
336
|
+
self.pca_temporal = context.get("pca_temporal", [])
|
|
337
|
+
self.pca_cross_sectional = context.get("pca_cross_sectional", [])
|
|
338
|
+
self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
|
|
339
|
+
self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
|
|
340
|
+
self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
|
|
341
|
+
self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
|
|
354
342
|
|
|
355
343
|
self.experiment_dir = self.experiment.path
|
|
356
344
|
self.experiment_id = self.experiment.id
|