lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/db/models/experiment.py CHANGED
@@ -1,5 +1,7 @@
  from itertools import chain
  import joblib
+ import pandas as pd
+ import os

  from sqlalchemy import (
      Column,
@@ -14,18 +16,18 @@ from sqlalchemy import (
      TIMESTAMP,
      UniqueConstraint,
      func,
+     event,
  )
- from sqlalchemy.orm import relationship, aliased
+ from sqlalchemy.orm import relationship, aliased, mapper
  from sqlalchemy.ext.hybrid import hybrid_property
  from sqlalchemy import func
  from statistics import fmean as mean
  from lecrapaud.db.models.model_selection import ModelSelection
- from lecrapaud.db.models.model_training import ModelTraining
- from lecrapaud.db.models.score import Score
+ from lecrapaud.db.models.model_selection_score import ModelSelectionScore

  from lecrapaud.db.models.base import Base, with_db
  from lecrapaud.db.models.utils import create_association_table
- from lecrapaud.utils import logger, contains_best
+ from lecrapaud.utils import logger, contains_best, strip_timestamp_suffix

  # jointures
  lecrapaud_experiment_target_association = create_association_table(
@@ -51,105 +53,13 @@ class Experiment(Base):
      )
      name = Column(String(255), nullable=False)
      path = Column(String(255))  # we do not have this at creation time
-     type = Column(String(50), nullable=False)
      size = Column(Integer, nullable=False)
      train_size = Column(Integer)
      val_size = Column(Integer)
-
-     # Relationships
-     model_selections = relationship(
-         "ModelSelection",
-         back_populates="experiment",
-         cascade="all, delete-orphan",
-         lazy="selectin",
-     )
-
-     @hybrid_property
-     def best_rmse(self):
-         """Best RMSE score across all model selections and trainings."""
-         # Get the minimum RMSE for each model selection
-         min_scores = [
-             min(
-                 score.rmse
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.rmse is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.rmse is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return min(min_scores) if min_scores else None
-
-     @hybrid_property
-     def best_logloss(self):
-         """Best LogLoss score across all model selections and trainings."""
-         # Get the minimum LogLoss for each model selection
-         min_scores = [
-             min(
-                 score.logloss
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.logloss is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.logloss is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return min(min_scores) if min_scores else None
-
-     @hybrid_property
-     def avg_rmse(self):
-         """Average RMSE score across all model selections and trainings."""
-         # Get the minimum RMSE for each model selection
-         min_scores = [
-             min(
-                 score.rmse
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.rmse is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.rmse is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return mean(min_scores) if min_scores else None
-
-     @hybrid_property
-     def avg_logloss(self):
-         """Average LogLoss score across all model selections and trainings."""
-         # Get the minimum LogLoss for each model selection
-         min_scores = [
-             min(
-                 score.logloss
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.logloss is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.logloss is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return mean(min_scores) if min_scores else None
-
      test_size = Column(Integer)
-     corr_threshold = Column(Float, nullable=False)
-     max_features = Column(Integer, nullable=False)
-     percentile = Column(Float, nullable=False)
      number_of_groups = Column(Integer)
      list_of_groups = Column(JSON)
+     number_of_targets = Column(Integer)
      start_date = Column(DateTime)
      end_date = Column(DateTime)
      train_start_date = Column(DateTime)
@@ -181,6 +91,142 @@ class Experiment(Base):
          ),
      )

+     # Relationships
+     model_selections = relationship(
+         "ModelSelection",
+         back_populates="experiment",
+         cascade="all, delete-orphan",
+         lazy="selectin",
+     )
+
+     # Hooks
+     # @event.listens_to(Experiment, "after_commit")
+     # def set_score(mapper, connection, target):
+     #     target.score = target.score
+
+     # Properties
+     @hybrid_property
+     def rmse_scores(self):
+         """Best RMSE scores across all model selections, for each target."""
+         # Get the minimum RMSE for each model selection
+         min_scores = [
+             min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
+             for ms in self.model_selections
+             if any(mss.rmse is not None for mss in ms.model_selection_scores)
+         ]
+
+         if not min_scores:
+             # fallback to path if no model_selection_scores found
+             for target in self.targets:
+                 path = f"{self.path}/{target.name}/scores_tracking.csv"
+                 if not os.path.exists(path):
+                     continue
+                 score = pd.read_csv(path)
+                 if "RMSE" not in score.columns:
+                     continue
+                 min_scores.append(min(score["RMSE"]))
+
+         return min_scores
+
+     @hybrid_property
+     def logloss_scores(self):
+         """Best LogLoss scores across all model selections, for each target."""
+         # Get the minimum LogLoss for each model selection
+         min_scores = [
+             min(
+                 mss.logloss
+                 for mss in ms.model_selection_scores
+                 if mss.logloss is not None
+             )
+             for ms in self.model_selections
+             if any(mss.logloss is not None for mss in ms.model_selection_scores)
+         ]
+
+         if not min_scores:
+             # fallback to path if no model_selection_scores found
+             for target in self.targets:
+                 path = f"{self.path}/{target.name}/scores_tracking.csv"
+                 if not os.path.exists(path):
+                     continue
+                 score = pd.read_csv(path)
+                 if "LOGLOSS" not in score.columns:
+                     continue
+                 min_scores.append(min(score["LOGLOSS"]))
+
+         return min_scores
+
+     @hybrid_property
+     def best_rmse(self):
+         """Best RMSE score within targets, across all model selections."""
+         return min(self.rmse_scores) if self.rmse_scores else None
+
+     @hybrid_property
+     def best_logloss(self):
+         """Best LogLoss score within targets, across all model selections."""
+         return min(self.logloss_scores) if self.logloss_scores else None
+
+     @hybrid_property
+     def score(self):
+         # Calculate a combined score: average of normalized best RMSE and LogLoss per target
+         # This ensures we're comparing apples to apples by normalizing the scores
+
+         if not self.rmse_scores and not self.logloss_scores:
+             logger.error("No experiments found with RMSE or LogLoss scores.")
+             return None
+
+         # Normalize scores (subtract min and divide by range)
+         # Guard against division by zero when only one observation or all equal
+         # Gather all the data from similar experiments to calculate the range
+
+         similar_experiments = Experiment.get_all_by_name(name=self.name)
+         if not similar_experiments:
+             similar_experiments = [self]
+         rmse_scores = [
+             score for exp in similar_experiments for score in exp.rmse_scores or []
+         ]
+         logloss_scores = [
+             score for exp in similar_experiments for score in exp.logloss_scores or []
+         ]
+
+         min_rmse = min(rmse_scores)
+         max_rmse = max(rmse_scores)
+         range_rmse = max_rmse - min_rmse
+         min_logloss = min(logloss_scores)
+         max_logloss = max(logloss_scores)
+         range_logloss = max_logloss - min_logloss
+
+         # Calculate combined score for each experiment
+         normed_scores = []
+         for rmse_score in self.rmse_scores:
+             # Normalize both scores (safe when range == 0)
+             norm_rmse = 0.0 if range_rmse == 0 else (rmse_score - min_rmse) / range_rmse
+             normed_scores.append(norm_rmse)
+
+         for logloss_score in self.logloss_scores:
+             norm_logloss = (
+                 0.0
+                 if range_logloss == 0
+                 else (logloss_score - min_logloss) / range_logloss
+             )
+             normed_scores.append(norm_logloss)
+
+         # Calculate score (average of normalized scores)
+         score = sum(normed_scores) / len(normed_scores)
+
+         return score
+
+     @hybrid_property
+     def avg_rmse(self):
+         """Average within targets of best RMSE score across all model selections."""
+         return mean(self.rmse_scores) if self.rmse_scores else None
+
+     @hybrid_property
+     def avg_logloss(self):
+         """Average within targets of best LogLoss score across all model selections."""
+         return mean(self.logloss_scores) if self.logloss_scores else None
+
+     # Class methods
      @classmethod
      @with_db
      def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
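The new `score` property min-max normalizes each target's best RMSE and LogLoss against the range seen across similarly named experiments, then averages the normalized values (lower is better). A minimal standalone sketch of that normalization, using made-up numbers rather than real experiment data:

    # Toy illustration of the combined-score normalization (hypothetical values).
    rmse_scores = [0.42, 0.55]            # best RMSE per target for this experiment
    logloss_scores = [0.31]               # best LogLoss per target for this experiment
    all_rmse = [0.40, 0.42, 0.55, 0.60]   # pooled across similar experiments
    all_logloss = [0.25, 0.31, 0.50]

    def normalize(value, pool):
        lo, hi = min(pool), max(pool)
        return 0.0 if hi == lo else (value - lo) / (hi - lo)  # guard against range == 0

    normed = [normalize(v, all_rmse) for v in rmse_scores]
    normed += [normalize(v, all_logloss) for v in logloss_scores]
    score = sum(normed) / len(normed)     # lower is better
    print(round(score, 3))

When a pool contains a single value the range is zero, so the guard returns 0.0 instead of dividing by zero, mirroring the `range == 0` branches in the property above.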
@@ -194,10 +240,11 @@ class Experiment(Base):
          Returns:
              Experiment or None: The most recent matching experiment or None if not found
          """
+         base_name = strip_timestamp_suffix(name)
          if name is not None:
              return (
                  db.query(cls)
-                 .filter(cls.name.ilike(f"%{name}%"))
+                 .filter(cls.name.ilike(f"%{base_name}%"))
                  .order_by(cls.created_at.desc())
                  .limit(limit)
                  .all()
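Both name lookups now run the requested name through `strip_timestamp_suffix` before the ILIKE match, so differently timestamped runs of the same experiment resolve to one group. The helper lives in `lecrapaud/utils.py` and its body is not shown in this diff; a plausible sketch, assuming the suffix is a trailing `_YYYYMMDD_HHMMSS`-style timestamp:

    import re

    def strip_timestamp_suffix(name: str) -> str:
        # Hypothetical implementation: drop a trailing "_20251028_200637"-style suffix.
        return re.sub(r"_\d{8}_\d{6}$", "", name) if name else name

    print(strip_timestamp_suffix("my_experiment_20251028_200637"))  # -> "my_experiment"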
@@ -217,27 +264,24 @@
          Returns:
              Experiment or None: The most recent matching experiment or None if not found
          """
+         base_name = strip_timestamp_suffix(name)
          return (
              db.query(cls)
-             .filter(cls.name.ilike(f"%{name}%"))
+             .filter(cls.name.ilike(f"%{base_name}%"))
              .order_by(cls.created_at.desc())
              .first()
          )

      @classmethod
      @with_db
-     def get_best_by_score(cls, name: str, metric="both", db=None):
+     def get_best_by_score(cls, name: str, db=None):
          """
-         Find the experiment with the best score based on average RMSE, LogLoss, or both.
-
-         Args:
-             metric (str): 'rmse', 'logloss', or 'both' to determine which score to optimize
-             db: SQLAlchemy session
+         Find the experiment with the best normalized score across RMSE and LogLoss.

          Returns:
              Experiment or None: The experiment with the best score or None if not found
          """
-         experiments = db.query(cls).filter(cls.name.ilike(f"%{name}%")).all()
+         experiments = Experiment.get_all_by_name(name=name)
          if not experiments:
              logger.error(f"No experiments found with the given name: {name}")
              return None
@@ -255,66 +299,22 @@
             )
             return None

-         if metric == "both":
-             # Calculate a combined score: average of normalized RMSE and LogLoss
-             # This ensures we're comparing apples to apples by normalizing the scores
-
-             # Get all scores
-             rmse_scores = [e.avg_rmse for e in experiments if e.avg_rmse is not None]
-             logloss_scores = [
-                 e.avg_logloss for e in experiments if e.avg_logloss is not None
-             ]
-
-             if not rmse_scores or not logloss_scores:
-                 logger.error(
-                     "No experiments found with both RMSE and LogLoss scores. Maybe try with only one metric."
-                 )
-                 return None
-
-             # Normalize scores (subtract min and divide by range)
-             min_rmse = min(rmse_scores)
-             range_rmse = max(rmse_scores) - min_rmse
-             min_logloss = min(logloss_scores)
-             range_logloss = max(logloss_scores) - min_logloss
-
-             # Calculate combined score for each experiment
-             experiment_scores = []
-             for experiment in experiments:
-                 if experiment.avg_rmse is None or experiment.avg_logloss is None:
-                     continue
-
-                 # Normalize both scores
-                 norm_rmse = (experiment.avg_rmse - min_rmse) / range_rmse
-                 norm_logloss = (experiment.avg_logloss - min_logloss) / range_logloss
+         scored_experiments = []
+         for experiment in experiments:
+             score = experiment.score
+             if score is not None:
+                 scored_experiments.append((experiment, score))

-                 # Calculate combined score (average of normalized scores)
-                 combined_score = (norm_rmse + norm_logloss) / 2
-                 experiment_scores.append((experiment, combined_score))
-
-             # Sort by combined score (ascending since lower is better)
-             experiment_scores.sort(key=lambda x: x[1])
-
-             return experiment_scores[0][0] if experiment_scores else None
-
-         elif metric == "rmse" or metric == "logloss":
-             # For single metric case (rmse or logloss)
-
-             # Filter out experiments without scores and sort by the selected metric
-             filtered_experiments = []
-             for exp in experiments:
-                 score = exp.avg_rmse if metric == "rmse" else exp.avg_logloss
-                 if score is not None:
-                     filtered_experiments.append((exp, score))
+         if not scored_experiments:
+             logger.error(
+                 f"No experiments with calculable scores found with the given name: {name}"
+             )
+             return None

-             if not filtered_experiments:
-                 return None
-
-             # Sort by score (ascending since lower is better)
-             filtered_experiments.sort(key=lambda x: x[1])
-             return filtered_experiments[0][0]
-         else:
-             raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
+         scored_experiments.sort(key=lambda x: x[1])
+         return scored_experiments[0][0]

+     # Instance methods
      def best_score(self, target_number: int) -> dict:
          """
          Returns the scores for the best model of the specified target.
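With the `metric` argument removed, `get_best_by_score` now ranks experiments by the combined `score` property alone and returns the lowest one. A usage sketch (the experiment name is a placeholder; the session is injected by `@with_db`):

    from lecrapaud.db.models.experiment import Experiment

    best = Experiment.get_best_by_score(name="my_experiment")  # hypothetical name
    if best is not None:
        print(best.name, best.best_rmse, best.best_logloss)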
@@ -342,7 +342,7 @@ class Experiment(Base):
              (ms for ms in self.model_selections if ms.target_id == target.id), None
          )

-         if not best_model_selection or not best_model_selection.model_trainings:
+         if not best_model_selection or not best_model_selection.model_selection_scores:
              return {
                  "experiment_name": self.name,
                  "target_number": target_number,
@@ -350,22 +350,31 @@ class Experiment(Base):
                  "scores": {},
              }

-         # Get the best model training (assuming the first one is the best)
-         best_training = best_model_selection.model_trainings[0]
-
-         # Get the validation score for this training
-         validation_scores = [s for s in best_training.score if s.type == "validation"]
+         # Get the best model score based on lowest logloss or rmse
+         model_scores = best_model_selection.model_selection_scores

-         if not validation_scores:
+         # Determine if we should use logloss or rmse based on what's available
+         if any(ms.logloss is not None for ms in model_scores):
+             # Classification: find lowest logloss
+             best_score = min(
+                 (ms for ms in model_scores if ms.logloss is not None),
+                 key=lambda x: x.logloss,
+             )
+         elif any(ms.rmse is not None for ms in model_scores):
+             # Regression: find lowest rmse
+             best_score = min(
+                 (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
+             )
+         else:
              return {
                  "experiment_name": self.name,
                  "target_number": target_number,
-                 "error": "No validation scores found for the best model",
+                 "error": "No scores found for the best model",
                  "scores": {},
              }

-         # Get all available metrics from the first validation score
-         score = validation_scores[0]
+         # Use the best score found
+         score = best_score
          available_metrics = [
              "rmse",
              "mae",
@@ -386,13 +395,9 @@ class Experiment(Base):

          # Get the model info
          model_info = {
-             "model_type": (
-                 best_training.model.model_type if best_training.model else "unknown"
-             ),
-             "model_name": (
-                 best_training.model.name if best_training.model else "unknown"
-             ),
-             "training_time_seconds": best_training.training_time,
+             "model_type": (score.model.model_type if score.model else "unknown"),
+             "model_name": (score.model.name if score.model else "unknown"),
+             "training_time_seconds": score.training_time,
          }

          return {
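`best_score` no longer assumes the first `ModelTraining` is the best: it picks the `ModelSelectionScore` with the lowest logloss (classification) or RMSE (regression) and reads the model info straight from that row. A hedged usage sketch, assuming `experiment` is an already-loaded `Experiment` with at least one model selection for the target:

    result = experiment.best_score(target_number=1)
    print(result["experiment_name"], result["target_number"])
    print(result["scores"])  # metric values recorded for the best model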
@@ -402,7 +407,10 @@ class Experiment(Base):
              "scores": scores,
          }

-     def get_features(self, target_number: int):
+     @with_db
+     def get_features(self, target_number: int, db=None):
+         # Ensure we have a fresh instance attached to the session
+         self = db.merge(self)
          targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
          if targets:
              target_id = targets[0].id
@@ -418,7 +426,12 @@ class Experiment(Base):
          features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
          return features

-     def get_all_features(self, date_column: str = None, group_column: str = None):
+     @with_db
+     def get_all_features(
+         self, date_column: str = None, group_column: str = None, db=None
+     ):
+         # Ensure we have a fresh instance attached to the session
+         self = db.merge(self)
          target_idx = [target.id for target in self.targets]
          _all_features = chain.from_iterable(
              [f.name for f in fs.features]
lecrapaud/db/models/feature_selection.py CHANGED
@@ -115,7 +115,4 @@ class FeatureSelection(Base):
              if feature not in self.features:
                  self.features.append(feature)

-         db.flush()
-         db.refresh(self)
-         print(self.features)
          return self
lecrapaud/db/models/feature_selection_rank.py CHANGED
@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
              name="uq_feature_selection_rank_composite",
          ),
      )
-
-     @classmethod
-     @with_db
-     def bulk_upsert(cls, rows, db=None):
-         stmt = insert(cls).values(rows)
-
-         update_fields = {
-             key: stmt.inserted[key]
-             for key in rows[0]
-             if key not in ("feature_selection_id", "feature_id", "method")
-         }
-
-         stmt = stmt.on_duplicate_key_update(**update_fields)
-
-         db.execute(stmt)
-         db.commit()
-
-         return len(rows)
lecrapaud/db/models/model_selection.py CHANGED
@@ -54,8 +54,8 @@ class ModelSelection(Base):
      )

      best_model = relationship("Model", lazy="selectin")
-     model_trainings = relationship(
-         "ModelTraining",
+     model_selection_scores = relationship(
+         "ModelSelectionScore",
          back_populates="model_selection",
          cascade="all, delete-orphan",
          lazy="selectin",
lecrapaud/db/models/{score.py → model_selection_score.py} RENAMED
@@ -3,10 +3,11 @@ from sqlalchemy import (
      Integer,
      String,
      Float,
+     JSON,
      ForeignKey,
      BigInteger,
      TIMESTAMP,
-     JSON,
+     UniqueConstraint,
  )
  from sqlalchemy import func
  from sqlalchemy.orm import relationship
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
  from lecrapaud.config import LECRAPAUD_TABLE_PREFIX


- class Score(Base):
+ class ModelSelectionScore(Base):
+     __tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
+
      id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
      created_at = Column(
          TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
@@ -25,10 +28,21 @@ class Score(Base):
          onupdate=func.now(),
          nullable=False,
      )
-     type = Column(
-         String(50), nullable=False
-     )  # either hyperopts or validation or crossval
+
+     # From ModelTraining
+     best_params = Column(JSON)
+     model_path = Column(String(255))
      training_time = Column(Integer)
+     model_id = Column(
+         BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
+     )
+     model_selection_id = Column(
+         BigInteger,
+         ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
+         nullable=False,
+     )
+
+     # From Score (excluding type and training_time which is already in ModelTraining)
      eval_data_std = Column(Float)
      rmse = Column(Float)
      rmse_std_ratio = Column(Float)
@@ -50,12 +64,15 @@ class Score(Base):
      precision_at_threshold = Column(Float)
      recall_at_threshold = Column(Float)
      f1_at_threshold = Column(Float)
-     model_training_id = Column(
-         BigInteger,
-         ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
-         nullable=False,
-     )

-     model_trainings = relationship(
-         "ModelTraining", back_populates="score", lazy="selectin"
+     # Relationships
+     model = relationship("Model", lazy="selectin")
+     model_selection = relationship(
+         "ModelSelection", back_populates="model_selection_scores", lazy="selectin"
      )
+
+     __table_args__ = (
+         UniqueConstraint(
+             "model_id", "model_selection_id", name="uq_model_selection_score_composite"
+         ),
+     )
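`ModelSelectionScore` now holds one row per (model, model selection) pair, enforced by the composite unique constraint, and folds in the columns that previously lived on `ModelTraining`. A query sketch for fetching the lowest-RMSE row of a given selection (`db` and `selection_id` are placeholders for an open session and an existing id):

    from lecrapaud.db.models.model_selection_score import ModelSelectionScore

    best = (
        db.query(ModelSelectionScore)
        .filter(
            ModelSelectionScore.model_selection_id == selection_id,
            ModelSelectionScore.rmse.isnot(None),
        )
        .order_by(ModelSelectionScore.rmse.asc())
        .first()
    )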
lecrapaud/db/session.py CHANGED
@@ -27,6 +27,9 @@

  def init_db(uri: str = None):
      global _engine, _SessionLocal, DATABASE_URL, DB_URI
+     if _SessionLocal is not None:
+         return
+
      if uri:
          if "mysql://" in uri and "pymysql://" not in uri:
              uri = uri.replace("mysql://", "mysql+pymysql://")
@@ -73,6 +76,7 @@
          autocommit=False,
          autoflush=False,
          bind=_engine,
+         expire_on_commit=False,  # Prevent detached instance errors
      )

      # Step 5: Apply Alembic migrations programmatically
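The early return makes `init_db` idempotent once `_SessionLocal` is set, and `expire_on_commit=False` keeps loaded attributes readable after a commit, which is what the new `db.merge(self)` calls in `Experiment.get_features` / `get_all_features` rely on. A minimal sketch of the same pattern outside lecrapaud, assuming an in-memory SQLite engine:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    _SessionLocal = None

    def init_db(uri: str = "sqlite:///:memory:"):
        global _SessionLocal
        if _SessionLocal is not None:  # already initialised: second call is a no-op
            return
        engine = create_engine(uri)
        _SessionLocal = sessionmaker(
            autoflush=False,
            bind=engine,
            expire_on_commit=False,  # instances stay usable after commit
        )

    init_db()
    init_db()  # returns immediately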