PyPI - lecrapaud - Versions diffs - 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl - Mend

lecrapaud 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (23) hide show

lecrapaud/api.py +3 -0
lecrapaud/config.py +1 -0
lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
lecrapaud/db/models/__init__.py +2 -4
lecrapaud/db/models/base.py +103 -65
lecrapaud/db/models/experiment.py +53 -46
lecrapaud/db/models/feature_selection.py +0 -3
lecrapaud/db/models/feature_selection_rank.py +0 -18
lecrapaud/db/models/model_selection.py +2 -2
lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
lecrapaud/db/session.py +1 -0
lecrapaud/experiment.py +7 -4
lecrapaud/feature_engineering.py +6 -9
lecrapaud/feature_selection.py +0 -1
lecrapaud/model_selection.py +478 -170
lecrapaud/search_space.py +2 -1
lecrapaud/utils.py +22 -2
{lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/METADATA +1 -1
{lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/RECORD +22 -21
lecrapaud/db/models/model_training.py +0 -64
{lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/WHEEL +0 -0
{lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/licenses/LICENSE +0 -0

lecrapaud/db/models/experiment.py CHANGED Viewed

@@ -20,8 +20,7 @@ from sqlalchemy.ext.hybrid import hybrid_property
 from sqlalchemy import func
 from statistics import fmean as mean
 from lecrapaud.db.models.model_selection import ModelSelection
-from lecrapaud.db.models.model_training import ModelTraining
-from lecrapaud.db.models.score import Score
+from lecrapaud.db.models.model_selection_score import ModelSelectionScore
 from lecrapaud.db.models.base import Base, with_db
 from lecrapaud.db.models.utils import create_association_table
@@ -70,16 +69,14 @@ class Experiment(Base):
         # Get the minimum RMSE for each model selection
         min_scores = [
             min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
+                mss.rmse
+                for mss in ms.model_selection_scores
+                if mss.rmse is not None
             )
             for ms in self.model_selections
             if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.rmse is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return min(min_scores) if min_scores else None
@@ -90,16 +87,14 @@ class Experiment(Base):
         # Get the minimum LogLoss for each model selection
         min_scores = [
             min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
+                mss.logloss
+                for mss in ms.model_selection_scores
+                if mss.logloss is not None
             )
             for ms in self.model_selections
             if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.logloss is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return min(min_scores) if min_scores else None
@@ -110,16 +105,14 @@ class Experiment(Base):
         # Get the minimum RMSE for each model selection
         min_scores = [
             min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
+                mss.rmse
+                for mss in ms.model_selection_scores
+                if mss.rmse is not None
             )
             for ms in self.model_selections
             if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.rmse is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return mean(min_scores) if min_scores else None
@@ -130,16 +123,14 @@ class Experiment(Base):
         # Get the minimum LogLoss for each model selection
         min_scores = [
             min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
+                mss.logloss
+                for mss in ms.model_selection_scores
+                if mss.logloss is not None
             )
             for ms in self.model_selections
             if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.logloss is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return mean(min_scores) if min_scores else None
@@ -353,7 +344,7 @@ class Experiment(Base):
             (ms for ms in self.model_selections if ms.target_id == target.id), None
         )
-        if not best_model_selection or not best_model_selection.model_trainings:
+        if not best_model_selection or not best_model_selection.model_selection_scores:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
@@ -361,22 +352,32 @@ class Experiment(Base):
                 "scores": {},
             }
-        # Get the best model training (assuming the first one is the best)
-        best_training = best_model_selection.model_trainings[0]
-        # Get the validation score for this training
-        validation_scores = [s for s in best_training.score if s.type == "validation"]
-        if not validation_scores:
+        # Get the best model score based on lowest logloss or rmse
+        model_scores = best_model_selection.model_selection_scores
+        # Determine if we should use logloss or rmse based on what's available
+        if any(ms.logloss is not None for ms in model_scores):
+            # Classification: find lowest logloss
+            best_score = min(
+                (ms for ms in model_scores if ms.logloss is not None),
+                key=lambda x: x.logloss
+            )
+        elif any(ms.rmse is not None for ms in model_scores):
+            # Regression: find lowest rmse
+            best_score = min(
+                (ms for ms in model_scores if ms.rmse is not None),
+                key=lambda x: x.rmse
+            )
+        else:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
-                "error": "No validation scores found for the best model",
+                "error": "No scores found for the best model",
                 "scores": {},
             }
-        # Get all available metrics from the first validation score
-        score = validation_scores[0]
+        # Use the best score found
+        score = best_score
         available_metrics = [
             "rmse",
             "mae",
@@ -398,12 +399,12 @@ class Experiment(Base):
         # Get the model info
         model_info = {
             "model_type": (
-                best_training.model.model_type if best_training.model else "unknown"
+                score.model.model_type if score.model else "unknown"
             ),
             "model_name": (
-                best_training.model.name if best_training.model else "unknown"
+                score.model.name if score.model else "unknown"
             ),
-            "training_time_seconds": best_training.training_time,
+            "training_time_seconds": score.training_time,
         }
         return {
@@ -413,7 +414,10 @@ class Experiment(Base):
             "scores": scores,
         }
-    def get_features(self, target_number: int):
+    @with_db
+    def get_features(self, target_number: int, db=None):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
         if targets:
             target_id = targets[0].id
@@ -429,7 +433,10 @@ class Experiment(Base):
         features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
         return features
-    def get_all_features(self, date_column: str = None, group_column: str = None):
+    @with_db
+    def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         target_idx = [target.id for target in self.targets]
         _all_features = chain.from_iterable(
             [f.name for f in fs.features]

lecrapaud/db/models/feature_selection.py CHANGED Viewed

@@ -115,7 +115,4 @@ class FeatureSelection(Base):
             if feature not in self.features:
                 self.features.append(feature)
-        db.flush()
-        db.refresh(self)
-        print(self.features)
         return self

lecrapaud/db/models/feature_selection_rank.py CHANGED Viewed

@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
             name="uq_feature_selection_rank_composite",
         ),
     )
-    @classmethod
-    @with_db
-    def bulk_upsert(cls, rows, db=None):
-        stmt = insert(cls).values(rows)
-        update_fields = {
-            key: stmt.inserted[key]
-            for key in rows[0]
-            if key not in ("feature_selection_id", "feature_id", "method")
-        }
-        stmt = stmt.on_duplicate_key_update(**update_fields)
-        db.execute(stmt)
-        db.commit()
-        return len(rows)

lecrapaud/db/models/model_selection.py CHANGED Viewed

@@ -54,8 +54,8 @@ class ModelSelection(Base):
     )
     best_model = relationship("Model", lazy="selectin")
-    model_trainings = relationship(
-        "ModelTraining",
+    model_selection_scores = relationship(
+        "ModelSelectionScore",
         back_populates="model_selection",
         cascade="all, delete-orphan",
         lazy="selectin",

lecrapaud/db/models/{score.py → model_selection_score.py} RENAMED Viewed

@@ -3,10 +3,11 @@ from sqlalchemy import (
     Integer,
     String,
     Float,
+    JSON,
     ForeignKey,
     BigInteger,
     TIMESTAMP,
-    JSON,
+    UniqueConstraint,
 )
 from sqlalchemy import func
 from sqlalchemy.orm import relationship
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
 from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
-class Score(Base):
+class ModelSelectionScore(Base):
+    __tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
     id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
     created_at = Column(
         TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
@@ -25,10 +28,21 @@ class Score(Base):
         onupdate=func.now(),
         nullable=False,
     )
-    type = Column(
-        String(50), nullable=False
-    )  # either hyperopts or validation or crossval
+    # From ModelTraining
+    best_params = Column(JSON)
+    model_path = Column(String(255))
     training_time = Column(Integer)
+    model_id = Column(
+        BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
+    )
+    model_selection_id = Column(
+        BigInteger,
+        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
+        nullable=False,
+    )
+    # From Score (excluding type and training_time which is already in ModelTraining)
     eval_data_std = Column(Float)
     rmse = Column(Float)
     rmse_std_ratio = Column(Float)
@@ -50,12 +64,15 @@ class Score(Base):
     precision_at_threshold = Column(Float)
     recall_at_threshold = Column(Float)
     f1_at_threshold = Column(Float)
-    model_training_id = Column(
-        BigInteger,
-        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
-        nullable=False,
-    )
-    model_trainings = relationship(
-        "ModelTraining", back_populates="score", lazy="selectin"
+    # Relationships
+    model = relationship("Model", lazy="selectin")
+    model_selection = relationship(
+        "ModelSelection", back_populates="model_selection_scores", lazy="selectin"
     )
+    __table_args__ = (
+        UniqueConstraint(
+            "model_id", "model_selection_id", name="uq_model_selection_score_composite"
+        ),
+    )

lecrapaud/db/session.py CHANGED Viewed

@@ -73,6 +73,7 @@ def init_db(uri: str = None):
         autocommit=False,
         autoflush=False,
         bind=_engine,
+        expire_on_commit=False,  # Prevent detached instance errors
     )
     # Step 5: Apply Alembic migrations programmatically

lecrapaud/experiment.py CHANGED Viewed

@@ -35,7 +35,7 @@ def create_experiment(
     groups = {}
     if group_column:
         groups["number_of_groups"] = data[group_column].nunique()
-        groups["list_of_groups"] = data[group_column].unique().tolist().sort()
+        groups["list_of_groups"] = sorted(data[group_column].unique().tolist())
     with get_db() as db:
         all_targets = Target.get_all(db=db)
@@ -50,8 +50,8 @@ def create_experiment(
         os.makedirs(preprocessing_dir, exist_ok=True)
         os.makedirs(data_dir, exist_ok=True)
+        # Create or update experiment (without targets relation)
         experiment = Experiment.upsert(
-            match_fields=["name"],
             db=db,
             name=experiment_name,
             path=Path(experiment_dir).resolve(),
@@ -62,7 +62,6 @@ def create_experiment(
             max_features=max_features,
             **groups,
             **dates,
-            targets=targets,
             context={
                 "corr_threshold": corr_threshold,
                 "percentile": percentile,
@@ -73,5 +72,9 @@ def create_experiment(
                 **kwargs,
             },
         )
+        # Set targets relationship after creation/update
+        experiment.targets = targets
+        experiment.save(db=db)
         return experiment

lecrapaud/feature_engineering.py CHANGED Viewed

@@ -483,8 +483,8 @@ class PreprocessFeature:
                 f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
             )
-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes and dates
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -545,8 +545,8 @@ class PreprocessFeature:
         for name, data in zip(["train", "val", "test"], [train, val, test]):
             logger.info(f"{data.shape} {name} data")
-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -838,8 +838,7 @@ class PreprocessFeature:
             # Upsert features in bulk if we have any features
             if all_feature_names:
-                Feature.upsert_bulk(
-                    match_fields=["name"],
+                Feature.bulk_upsert(
                     name=all_feature_names,
                     type=all_feature_types,
                 )
@@ -855,9 +854,7 @@ class PreprocessFeature:
                 for target in target_names
             ]
-            Target.upsert_bulk(
-                match_fields=["name"], name=target_names, type=target_types
-            )
+            Target.bulk_upsert(name=target_names, type=target_types)
             # Get all the upserted objects
             targets = Target.filter(name__in=target_names)

lecrapaud/feature_selection.py CHANGED Viewed

@@ -115,7 +115,6 @@ class FeatureSelectionEngine:
         max_features = self.max_features
         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )

lecrapaud 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl

Potentially problematic release.

lecrapaud 0.19.2__py3-none-any.whl → 0.20.0__py3-none-any.whl