lecrapaud 0.18.10__tar.gz → 0.19.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (48)
  1. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/PKG-INFO +2 -1
  2. lecrapaud-0.19.0/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
  3. lecrapaud-0.19.0/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
  4. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/experiment.py +1 -1
  5. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/score.py +1 -0
  6. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/model_selection.py +120 -14
  7. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/search_space.py +36 -0
  8. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/pyproject.toml +2 -1
  9. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/LICENSE +0 -0
  10. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/README.md +0 -0
  11. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/__init__.py +0 -0
  12. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/api.py +0 -0
  13. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/config.py +0 -0
  14. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/__init__.py +0 -0
  15. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/README +0 -0
  16. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/env.py +0 -0
  17. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  18. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  19. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  20. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  21. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  22. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  23. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/alembic.ini +0 -0
  24. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/__init__.py +0 -0
  25. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/base.py +0 -0
  26. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/feature.py +0 -0
  27. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/feature_selection.py +0 -0
  28. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  29. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/model.py +0 -0
  30. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/model_selection.py +0 -0
  31. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/model_training.py +0 -0
  32. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/target.py +0 -0
  33. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/utils.py +0 -0
  34. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/session.py +0 -0
  35. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/directories.py +0 -0
  36. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/experiment.py +0 -0
  37. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/feature_engineering.py +0 -0
  38. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/feature_selection.py +0 -0
  39. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/integrations/openai_integration.py +0 -0
  40. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/jobs/__init__.py +0 -0
  41. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/jobs/config.py +0 -0
  42. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/jobs/scheduler.py +0 -0
  43. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/jobs/tasks.py +0 -0
  44. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  45. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  46. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  47. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  48. {lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/utils.py +0 -0
{lecrapaud-0.18.10 → lecrapaud-0.19.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.18.10
+Version: 0.19.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
@@ -8,6 +8,7 @@ Requires-Python: ==3.12.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: catboost (>=1.2.8)
 Requires-Dist: category-encoders (>=2.8.1)
 Requires-Dist: celery (>=5.5.3)
 Requires-Dist: ftfy (>=6.3.1)

lecrapaud-0.19.0/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py (new file)
@@ -0,0 +1,34 @@
+"""add avg precision to score
+
+Revision ID: c36e9fee22b9
+Revises: 7ed9963e732f
+Create Date: 2025-08-28 15:16:34.657593
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
+
+# revision identifiers, used by Alembic.
+revision: str = "c36e9fee22b9"
+down_revision: Union[str, None] = "7ed9963e732f"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        f"{LECRAPAUD_TABLE_PREFIX}_scores",
+        sa.Column("avg_precision", sa.Float(), nullable=True),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_scores", "avg_precision")
+    # ### end Alembic commands ###
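
For context, a minimal sketch of the effect of this revision, checked with SQLAlchemy's inspector after an upgrade. The table name assumes LECRAPAUD_TABLE_PREFIX resolves to "lecrapaud", and the connection URL is a placeholder:

    import sqlalchemy as sa

    # Placeholder URL; point this at the database lecrapaud is configured against.
    engine = sa.create_engine("mysql+pymysql://user:pass@localhost/lecrapaud")
    # After upgrading to c36e9fee22b9, the scores table gains a nullable FLOAT column.
    columns = {c["name"] for c in sa.inspect(engine).get_columns("lecrapaud_scores")}
    assert "avg_precision" in columns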

lecrapaud-0.19.0/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py (new file)
@@ -0,0 +1,44 @@
+"""change name column
+
+Revision ID: 8b11c1ba982e
+Revises: c36e9fee22b9
+Create Date: 2025-08-28 16:22:45.528296
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
+
+# revision identifiers, used by Alembic.
+revision: str = "8b11c1ba982e"
+down_revision: Union[str, None] = "c36e9fee22b9"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column(
+        f"{LECRAPAUD_TABLE_PREFIX}_experiments",
+        "name",
+        existing_type=mysql.VARCHAR(length=50),
+        type_=sa.String(length=255),
+        existing_nullable=False,
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column(
+        f"{LECRAPAUD_TABLE_PREFIX}_experiments",
+        "name",
+        existing_type=sa.String(length=255),
+        type_=mysql.VARCHAR(length=50),
+        existing_nullable=False,
+    )
+    # ### end Alembic commands ###
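
Both new revisions chain off 7ed9963e732f, so a single upgrade applies them in order. A sketch using Alembic's command API, assuming the packaged alembic.ini works as-is from the repository root:

    from alembic import command
    from alembic.config import Config

    cfg = Config("lecrapaud/db/alembic.ini")  # path taken from the file list above
    # Runs c36e9fee22b9 (avg_precision column) then 8b11c1ba982e (name -> VARCHAR(255)).
    command.upgrade(cfg, "8b11c1ba982e")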

{lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/experiment.py
@@ -49,7 +49,7 @@ class Experiment(Base):
         onupdate=func.now(),
         nullable=False,
     )
-    name = Column(String(50), nullable=False)
+    name = Column(String(255), nullable=False)
     path = Column(String(255))  # we do not have this at creation time
     type = Column(String(50), nullable=False)
     size = Column(Integer, nullable=False)

{lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/db/models/score.py
@@ -45,6 +45,7 @@ class Score(Base):
     recall = Column(Float)
     f1 = Column(Float)
     roc_auc = Column(Float)
+    avg_precision = Column(Float)
     thresholds = Column(JSON)
     precision_at_threshold = Column(Float)
     recall_at_threshold = Column(Float)

{lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/model_selection.py
@@ -14,8 +14,6 @@ import pickle
 from pydantic import BaseModel
 import ast
 
-os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
-
 # ML models
 from sklearn.model_selection import TimeSeriesSplit
 from sklearn.calibration import CalibratedClassifierCV
@@ -80,6 +78,8 @@ from lecrapaud.db import (
     Experiment,
 )
 
+os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
+
 # Reproducible result
 keras.utils.set_random_seed(42)
 np.random.seed(42)
@@ -157,8 +157,10 @@ class ModelEngine:
     def fit(self, *args):
         if self.recurrent:
             fit = self.fit_recurrent
-        elif (self.create_model == "lgb") or (self.create_model == "xgb"):
+        elif (self.model_name == "lgb") or (self.model_name == "xgb"):
             fit = self.fit_boosting
+        elif self.model_name == "catboost":
+            fit = self.fit_catboost
         else:
             fit = self.fit_sklearn
         model = fit(*args)
@@ -201,17 +203,110 @@ class ModelEngine:
 
         return model
 
-    def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+    def fit_catboost(self, x_train, y_train, x_val, y_val, params):
         """
-        This is using lightGBM or XGboost C++ librairies
+        Train CatBoost models with native early stopping and log metrics to TensorBoard.
+        Also supports plotting of the primary eval metric if self.plot is True.
         """
-        lightGBM = self.create_model == "lgb"
+        # Prepare constructor parameters
+        ctor_params = dict(params) if params else {}
+        early_stopping_rounds = ctor_params.pop("early_stopping_rounds", None)
+        # Alias support: num_boost_round -> iterations
+        num_boost_round = ctor_params.pop("num_boost_round", None)
+        if num_boost_round is not None and "iterations" not in ctor_params:
+            ctor_params["iterations"] = num_boost_round
+
+        # Determine classification/regression setup
+        labels = np.unique(y_train)
+        num_class = (
+            labels.size
+            if self.target_type == "classification" and labels.size > 2
+            else 1
+        )
+
+        if self.target_type == "regression":
+            ctor_params.setdefault("loss_function", "RMSE")
+            eval_metric = ctor_params.get("eval_metric", "RMSE")
+        else:
+            if num_class <= 2:
+                ctor_params.setdefault("loss_function", "Logloss")
+                eval_metric = ctor_params.get("eval_metric", "Logloss")
+            else:
+                ctor_params.setdefault("loss_function", "MultiClass")
+                eval_metric = ctor_params.get("eval_metric", "MultiClass")
+        ctor_params.setdefault("eval_metric", eval_metric)
+
+        # Instantiate CatBoost model from provided constructor
+        model = self.create_model(**ctor_params, allow_writing_files=False)
+
+        # Train with eval_set and early stopping
+        logger.info(f"Fitting the model {self.model_name}...")
+        logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
+        logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
+
+        model.fit(
+            x_train,
+            y_train,
+            eval_set=[(x_val, y_val)],
+            use_best_model=True,
+            early_stopping_rounds=early_stopping_rounds,
+            verbose=False,
+        )
+
+        # Retrieve evaluation results
+        evals_result = model.get_evals_result()
+        # CatBoost commonly uses 'learn' and 'validation' (or 'validation_0')
+        learn_key = "learn"
+        val_key = None
+        for k in evals_result.keys():
+            if k != learn_key:
+                val_key = k
+                break
+
+        # Ensure eval_metric exists; otherwise fall back to first available metric
+        if eval_metric not in evals_result.get(learn_key, {}):
+            if evals_result.get(learn_key):
+                eval_metric = next(iter(evals_result[learn_key].keys()))
+
+        # TensorBoard logging
+        writer = SummaryWriter(self.log_dir)
+        try:
+            # learn_scores = evals_result.get(learn_key, {}).get(eval_metric, [])
+            val_scores = (
+                evals_result.get(val_key, {}).get(eval_metric, []) if val_key else []
+            )
+            # for i, v in enumerate(learn_scores):
+            #     writer.add_scalar(f"CatBoost/train/{eval_metric}", v, i)
+            for i, v in enumerate(val_scores):
+                writer.add_scalar(f"CatBoost/{eval_metric}", v, i)
+        finally:
+            writer.close()
+
+        # Optional plotting of training progress
+        if self.plot and eval_metric and learn_key in evals_result and val_key:
+            logs = {
+                "train": evals_result[learn_key].get(eval_metric, []),
+                "val": evals_result[val_key].get(eval_metric, []),
+            }
+            plot_training_progress(
+                logs=logs,
+                model_name=self.model_name,
+                target_number=self.target_number,
+                title_suffix=f"Training Progress - {eval_metric}",
+            )
+
+        # Attach metadata for consistency with sklearn path
+        model.model_name = self.model_name
+        model.target_type = self.target_type
+        logger.info(f"Successfully created a {model.model_name} at {datetime.now()}")
 
-        # Experiments
-        boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
-        train_data = boosting_dataset(x_train, label=y_train)
-        val_data = boosting_dataset(x_val, label=y_val)
+        self._model = model
+        return model
 
+    def fit_boosting(self, x_train, y_train, x_val, y_val, params):
+        """
+        This is using lightGBM or XGboost C++ librairies
+        """
         # Create a TensorBoardX writer
         writer = SummaryWriter(self.log_dir)
         evals_result = {}
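
The new fit_catboost path leans entirely on CatBoost's native API: eval_set plus early_stopping_rounds with use_best_model=True, then get_evals_result() for the per-iteration curves it logs to TensorBoard. A standalone sketch of those calls on synthetic data (not lecrapaud's wrapper itself):

    import numpy as np
    from catboost import CatBoostClassifier

    rng = np.random.default_rng(42)
    X = rng.normal(size=(500, 8))
    y = (X[:, 0] + rng.normal(scale=0.5, size=500) > 0).astype(int)
    X_train, X_val, y_train, y_val = X[:400], X[400:], y[:400], y[400:]

    model = CatBoostClassifier(
        iterations=500,
        loss_function="Logloss",
        eval_metric="Logloss",
        allow_writing_files=False,  # same flag fit_catboost forces on
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        use_best_model=True,           # keep the iteration with the best val score
        early_stopping_rounds=20,
        verbose=False,
    )
    # Returns {"learn": {...}, "validation": {...}}-style dicts, which is what
    # the new code walks to locate the validation key.
    print(model.get_evals_result().keys())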
@@ -223,11 +318,13 @@ class ModelEngine:
             if self.target_type == "classification" and labels.size > 2
             else 1
         )
-        logger.info("Fitting the model...")
+        logger.info(f"Fitting the model {self.model_name}...")
         logger.info(f"x_train shape: {x_train.shape}, x_val shape: {x_val.shape}")
         logger.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")
 
-        if lightGBM:
+        if self.model_name == "lgb":
+            train_data = lgb.Dataset(x_train, label=y_train)
+            val_data = lgb.Dataset(x_val, label=y_val)
 
             def tensorboard_callback(env):
                 for i, metric in enumerate(env.evaluation_result_list):
@@ -252,18 +349,23 @@ class ModelEngine:
                     "objective": loss,
                     "metric": eval_metric,
                     "num_class": num_class,
+                    "verbose": -1,
                 },
                 num_boost_round=params["num_boost_round"],
                 train_set=train_data,
                 valid_sets=[train_data, val_data],
                 valid_names=["train", "val"],
                 callbacks=[
-                    lgb.early_stopping(stopping_rounds=params["early_stopping_rounds"]),
+                    lgb.early_stopping(
+                        stopping_rounds=params["early_stopping_rounds"], verbose=False
+                    ),
                     lgb.record_evaluation(evals_result),
                     tensorboard_callback,
                 ],
             )
         else:
+            train_data = xgb.DMatrix(x_train, label=y_train)
+            val_data = xgb.DMatrix(x_val, label=y_val)
 
             class TensorBoardCallback(xgb.callback.TrainingCallback):
@@ -300,6 +402,7 @@ class ModelEngine:
                 if self.target_type == "regression"
                 else ("logloss" if num_class <= 2 else "mlogloss")
             )
+            xgb.set_config(verbosity=0)
             model = xgb.train(
                 params={
                     **params["model_params"],
@@ -318,7 +421,7 @@ class ModelEngine:
                     tensorboard_callback,
                 ],
                 evals_result=evals_result,  # Record evaluation result
-                verbose_eval=0,
+                verbose_eval=10000,
             )
 
         model.model_name = self.create_model
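
These fit_boosting hunks mostly silence per-iteration chatter: LightGBM gets "verbose": -1 in its params plus a non-verbose early-stopping callback, while XGBoost's global verbosity is zeroed and verbose_eval is raised to 10000 so evaluation lines print only every 10000th round. A standalone sketch of the same knobs on synthetic data:

    import numpy as np
    import lightgbm as lgb
    import xgboost as xgb

    xgb.set_config(verbosity=0)  # global XGBoost log level, as in the diff

    X = np.random.rand(200, 5)
    y = np.random.randint(0, 2, 200)
    train_data = lgb.Dataset(X[:150], label=y[:150])
    val_data = lgb.Dataset(X[150:], label=y[150:])
    booster = lgb.train(
        {"objective": "binary", "verbose": -1},  # -1 suppresses LightGBM info/warnings
        train_data,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )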
@@ -1365,6 +1468,9 @@ def evaluate(
             average=("binary" if num_classes == 2 else "macro"),
         )
         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
+        score["AVG_PRECISION"] = average_precision_score(
+            y_true, y_pred_proba, average="macro"
+        )
 
         # Store the complete thresholds dictionary
         if len(target_clf_thresholds.keys()) > 1:
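
AVG_PRECISION here is sklearn's average_precision_score, which summarizes the precision-recall curve and is often more informative than ROC AUC on imbalanced classes. A minimal binary example (with average="macro", per-class scores are averaged when targets and scores are given in a one-hot/multilabel layout):

    import numpy as np
    from sklearn.metrics import average_precision_score

    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])  # P(class 1) per sample
    print(average_precision_score(y_true, y_score))  # 0.8333...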

{lecrapaud-0.18.10 → lecrapaud-0.19.0}/lecrapaud/search_space.py
@@ -15,6 +15,7 @@ from sklearn.naive_bayes import GaussianNB
 # Ensemble models
 from lightgbm import LGBMRegressor, LGBMClassifier
 from xgboost import XGBRegressor, XGBClassifier
+from catboost import CatBoostRegressor, CatBoostClassifier
 from sklearn.ensemble import (
     RandomForestRegressor,
     AdaBoostRegressor,
@@ -464,6 +465,41 @@ ml_models = [
             },
         },
     },
+    {
+        "model_name": "catboost",
+        "recurrent": False,
+        "need_scaling": False,
+        "classification": {
+            "create_model": CatBoostClassifier,
+            "search_params": {
+                "iterations": tune.randint(50, 1000),
+                "num_boost_round": tune.randint(50, 1000),
+                "early_stopping_rounds": tune.randint(5, 50),
+                "learning_rate": tune.loguniform(1e-4, 0.5),
+                "depth": tune.randint(3, 10),
+                "l2_leaf_reg": tune.loguniform(1e-5, 10),
+                "bagging_temperature": tune.uniform(0.0, 1.0),
+                "rsm": tune.quniform(0.6, 1.0, 0.05),
+                "random_state": 42,
+                "verbose": False,
+            },
+        },
+        "regression": {
+            "create_model": CatBoostRegressor,
+            "search_params": {
+                "iterations": tune.randint(50, 1000),
+                "num_boost_round": tune.randint(50, 1000),
+                "early_stopping_rounds": tune.randint(5, 50),
+                "learning_rate": tune.loguniform(1e-4, 0.5),
+                "depth": tune.randint(3, 10),
+                "l2_leaf_reg": tune.loguniform(1e-5, 10),
+                "bagging_temperature": tune.uniform(0.0, 1.0),
+                "rsm": tune.quniform(0.6, 1.0, 0.05),
+                "random_state": 42,
+                "verbose": False,
+            },
+        },
+    },
 ]
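
The search_params above follow Ray Tune's search-space API (tune.randint, tune.loguniform, tune.quniform, ...). A sketch of drawing one concrete configuration from such a space, assuming each value is a Tune Domain exposing .sample():

    from ray import tune

    search_params = {
        "iterations": tune.randint(50, 1000),
        "learning_rate": tune.loguniform(1e-4, 0.5),
        "depth": tune.randint(3, 10),
        "rsm": tune.quniform(0.6, 1.0, 0.05),
    }
    config = {
        k: (v.sample() if hasattr(v, "sample") else v)
        for k, v in search_params.items()
    }
    print(config)  # e.g. {"iterations": 512, "learning_rate": 0.013, ...}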

{lecrapaud-0.18.10 → lecrapaud-0.19.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.18.10"
+version = "0.19.0"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}
@@ -9,6 +9,7 @@ license = {text = "Apache License"}
 readme = "README.md"
 requires-python = "==3.12.*"
 dependencies = [
+    "catboost>=1.2.8",
     "category-encoders>=2.8.1",
     "celery>=5.5.3",
     "ftfy>=6.3.1",