lecrapaud 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/api.py +84 -61
- lecrapaud/config.py +6 -2
- lecrapaud/db/alembic/versions/{2025_06_20_1924-1edada319fd7_initial_setup.py → 2025_06_23_1748-f089dfb7e3ba_.py} +20 -20
- lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +34 -0
- lecrapaud/db/models/__init__.py +14 -2
- lecrapaud/db/models/base.py +48 -2
- lecrapaud/db/models/{dataset.py → experiment.py} +23 -25
- lecrapaud/db/models/feature_selection.py +5 -5
- lecrapaud/db/models/model_selection.py +5 -5
- lecrapaud/db/models/score.py +3 -1
- lecrapaud/db/models/target.py +4 -4
- lecrapaud/db/session.py +4 -4
- lecrapaud/directories.py +0 -2
- lecrapaud/experiment.py +25 -18
- lecrapaud/feature_engineering.py +53 -24
- lecrapaud/feature_selection.py +41 -36
- lecrapaud/jobs/tasks.py +4 -4
- lecrapaud/model_selection.py +268 -261
- lecrapaud/search_space.py +23 -4
- lecrapaud/utils.py +2 -2
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/METADATA +2 -2
- lecrapaud-0.7.0.dist-info/RECORD +43 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +0 -71
- lecrapaud/services/indicators.py +0 -309
- lecrapaud/speed_tests/experiments.py +0 -139
- lecrapaud/speed_tests/trash.py +0 -37
- lecrapaud-0.5.1.dist-info/RECORD +0 -46
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/WHEEL +0 -0
lecrapaud/model_selection.py
CHANGED
@@ -75,7 +75,7 @@ from lecrapaud.db import (
     ModelTraining,
     Score,
     Target,
-    Dataset,
+    Experiment,
 )
 
 # Reproducible result
@@ -144,8 +144,12 @@ class ModelEngine:
         self.plot = plot
         self.log_dir = log_dir
 
-        if self.need_scaling and self.target_type == "regression":
-
+        if self.path and self.need_scaling and self.target_type == "regression":
+            preprocessing_dir = Path(f"{self.path}/../preprocessing")
+            target_number = self.path.split("/")[-1].split("_")[-1]
+            self.scaler_y = joblib.load(
+                preprocessing_dir / f"scaler_y_{target_number}.pkl"
+            )
         else:
             self.scaler_y = None
 
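For orientation, the scaler lookup added above leans on the experiment directory layout: `self.path` ends in `TARGET_<n>`, and the matching scaler sits one level up under `preprocessing/`. A minimal standalone sketch of that convention, with a hypothetical path (only the parsing logic is taken from the diff):

from pathlib import Path

import joblib

path = "/experiments/demo/TARGET_3"  # hypothetical ModelEngine.path
preprocessing_dir = Path(f"{path}/../preprocessing")  # /experiments/demo/preprocessing
target_number = path.split("/")[-1].split("_")[-1]  # "3"
scaler_y = joblib.load(preprocessing_dir / f"scaler_y_{target_number}.pkl")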
@@ -204,7 +208,7 @@ class ModelEngine:
         """
         lightGBM = self.create_model == "lgb"
 
-        #
+        # Experiments
         boosting_dataset = lgb.Dataset if lightGBM else xgb.DMatrix
         train_data = boosting_dataset(x_train, label=y_train)
         val_data = boosting_dataset(x_val, label=y_val)
@@ -482,7 +486,7 @@ class ModelEngine:
 
     def predict(
         self,
-        data: pd.DataFrame,
+        data: pd.DataFrame | np.ndarray,
         threshold: float = 0.5,
     ):
         """Function to get prediction from model. Support sklearn, keras and boosting models such as xgboost and lgboost
@@ -500,77 +504,58 @@ class ModelEngine:
         if self.threshold and threshold == 0.5:
             threshold = self.threshold
 
-
-
-
-
-
-
-
-
-
+        # Determine index for output
+        if isinstance(data, pd.DataFrame):
+            index = data.index
+        elif isinstance(data, np.ndarray):
+            index = pd.RangeIndex(start=0, stop=data.shape[0])
+        else:
+            raise ValueError(
+                "Unsupported data type: expected pd.DataFrame or np.ndarray"
+            )
+
+        # Keras, LightGBM, XGBoost
+        if self.recurrent or self.model_name in ["lgb", "xgb"]:
+            if self.model_name == "xgb":
+                data_input = xgb.DMatrix(data)
+                pred_raw = model.predict(data_input)
             else:
-
-                pred = model.predict(data)
-                if pred.shape[1] == 1:
-                    pred = pred.reshape(-1)
+                pred_raw = model.predict(data)
 
-            if
-
+            if pred_raw.ndim == 1:
+                pred_raw = pred_raw.reshape(-1, 1)
 
+            if self.target_type == "classification":
+                num_class = pred_raw.shape[1] if pred_raw.ndim > 1 else 2
                 if num_class <= 2:
-
-
-                        {
-                            0: 1 - pred,  # Probability of class 0
-                            1: pred,  # Probability of class 1
-                        },
+                    pred_proba = pd.DataFrame(
+                        {0: 1 - pred_raw.ravel(), 1: pred_raw.ravel()}, index=index
                     )
                 else:
-
-
-
-                # Get final predictions (argmax for multi-class, threshold for binary)
-                if num_class == 2:
-                    pred_df["PRED"] = np.where(
-                        pred_df[1] >= threshold, 1, 0
-                    )  # Class 1 if prob >= threshold
-                else:
-                    pred_df["PRED"] = pred_df.idxmax(
-                        axis=1
-                    )  # Class with highest probability for multiclasses
-
-                # Reorder columns to show predicted class first, then probabilities
-                pred = pred_df[["PRED"] + list(range(num_class))]
+                    pred_proba = pd.DataFrame(
+                        pred_raw, columns=range(num_class), index=index
+                    )
 
+                pred_df = apply_thresholds(pred_proba, threshold, pred_proba.columns)
             else:
-
+                pred_df = pd.Series(pred_raw.ravel(), index=index, name="PRED")
 
-
-        if model.model_name in ["lgb", "xgb"]:
-            pred.index = data.index
+        # Sklearn
         else:
-            # sk learn
-            pred = pd.Series(model.predict(data), index=data.index, name="PRED")
             if self.target_type == "classification":
                 pred_proba = pd.DataFrame(
                     model.predict_proba(data),
-                    index=
+                    index=index,
                     columns=[
                         int(c) if isinstance(c, float) and c.is_integer() else c
                         for c in model.classes_
                     ],
                 )
+                pred_df = apply_thresholds(pred_proba, threshold, model.classes_)
+            else:
+                pred_df = pd.Series(model.predict(data), index=index, name="PRED")
 
-
-        if len(model.classes_) == 2:
-            positive_class = model.classes_[1]  # Assuming classes are ordered
-            pred = (pred_proba[positive_class] >= threshold).astype(int)
-            pred.name = "PRED"
-
-        pred = pd.concat([pred, pred_proba], axis=1)
-
-        return pred
+        return pred_df
 
     def save(self, path):
         if self.recurrent:
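The rewritten `predict` funnels every backend through one shape contract before thresholding: raw outputs are normalized to a 2-D array, binary scores are expanded into explicit class-0/class-1 probability columns, and the row index is derived from the input type. A minimal sketch of that normalization with toy values (standalone, not a call into the package):

import numpy as np
import pandas as pd

pred_raw = np.array([0.2, 0.7, 0.9])  # 1-D binary scores, e.g. from lgb or keras
if pred_raw.ndim == 1:
    pred_raw = pred_raw.reshape(-1, 1)  # normalize to 2-D, one column per class

index = pd.RangeIndex(start=0, stop=pred_raw.shape[0])  # the np.ndarray input case
num_class = pred_raw.shape[1] if pred_raw.ndim > 1 else 2
if num_class <= 2:
    # expand a single positive-class score into explicit 0/1 probability columns
    pred_proba = pd.DataFrame(
        {0: 1 - pred_raw.ravel(), 1: pred_raw.ravel()}, index=index
    )
# pred_proba is what predict() then hands to apply_thresholds(...)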
@@ -640,11 +625,13 @@ def trainable(
     y_val,
     model_name,
     target_type,
-
+    experiment_name,
     target_number,
     create_model,
     type_name="hyperopts",
     plot=False,
+    log_dir=None,
+    target_clf_thresholds: dict = None,
 ):
     """Standalone version of train_model that doesn't depend on self"""
     # Create model engine
@@ -653,10 +640,11 @@ def trainable(
         target_type=target_type,
         create_model=create_model,
         plot=plot,
+        log_dir=log_dir,
     )
 
     logger.info(
-        f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {
+        f"TARGET_{target_number} - Training a {model.model_name} at {datetime.now()} : {experiment_name}, TARGET_{target_number}"
     )
 
     if model.recurrent:
@@ -696,17 +684,13 @@ def trainable(
     # Evaluate model
     score = {
         "DATE": datetime.now(),
-        "SESSION": session_name,
-        "TRAIN_DATA": x_train.shape[0],
-        "VAL_DATA": x_val.shape[0],
-        "FEATURES": x_train.shape[-1],
         "MODEL_NAME": model.model_name,
         "TYPE": type_name,
         "TRAINING_TIME": stop - start,
         "EVAL_DATA_STD": prediction["TARGET"].std(),
     }
 
-    score.update(evaluate(prediction, target_type))
+    score.update(evaluate(prediction, target_type, target_clf_thresholds))
 
     if type_name == "hyperopts":
         session.report(metrics=score)
@@ -723,41 +707,47 @@ class ModelSelectionEngine:
         reshaped_data,
         target_number,
         target_clf,
-
+        experiment,
         models_idx,
         time_series,
         date_column,
         group_column,
+        target_clf_thresholds,
         **kwargs,
     ):
         self.data = data
         self.reshaped_data = reshaped_data
         self.target_number = target_number
-        self.
+        self.experiment = experiment
         self.target_clf = target_clf
         self.models_idx = models_idx
         self.time_series = time_series
         self.date_column = date_column
         self.group_column = group_column
+        self.target_clf_thresholds = (
+            target_clf_thresholds[target_number]
+            if target_number in target_clf_thresholds.keys()
+            else None
+        )
 
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.
-        self.
-        self.data_dir = f"{self.
-        self.preprocessing_dir = f"{self.
-        self.training_target_dir = f"{self.
+        self.experiment_dir = self.experiment.path
+        self.experiment_id = self.experiment.id
+        self.data_dir = f"{self.experiment_dir}/data"
+        self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
+        self.training_target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
         self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
-        self.features = self.
-        self.all_features = self.
+        self.features = self.experiment.get_features(self.target_number)
+        self.all_features = self.experiment.get_all_features(
             date_column=self.date_column, group_column=self.group_column
         )
 
     # Main training function
     def run(
         self,
-
+        experiment_name,
         perform_hyperopt=True,
         number_of_trials=20,
         perform_crossval=False,
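Worth noting in the new constructor: `target_clf_thresholds` arrives keyed by target number, and each engine keeps only its own entry, falling back to `None`. A minimal sketch of that resolution, with hypothetical target numbers:

target_clf_thresholds = {1: {"precision": 0.8}, 3: {"recall": 0.9}}  # hypothetical
target_number = 3
resolved = (
    target_clf_thresholds[target_number]
    if target_number in target_clf_thresholds.keys()
    else None
)
# resolved == {"recall": 0.9}; a target without an entry resolves to None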
@@ -769,12 +759,12 @@ class ModelSelectionEngine:
         Selects the best models based on a target variable, optionally performing hyperparameter optimization
         and cross-validation, and manages outputs in a session-specific directory.
         """
-        self.
+        self.experiment_name = experiment_name
         self.plot = plot
         self.number_of_trials = number_of_trials
 
-        if self.
-            raise ValueError("Please provide a
+        if self.experiment_id is None:
+            raise ValueError("Please provide a experiment.")
 
         if self.data:
             train = self.data["train"]
@@ -791,7 +781,9 @@ class ModelSelectionEngine:
             train_scaled,
             val_scaled,
             test_scaled,
-        ) = load_train_data(
+        ) = load_train_data(
+            self.experiment_dir, self.target_number, self.target_clf
+        )
 
         if (
             any(all_models[i].get("recurrent") for i in self.models_idx)
@@ -819,9 +811,9 @@ class ModelSelectionEngine:
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
         model_selection = ModelSelection.upsert(
-            match_fields=["target_id", "
+            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
-
+            experiment_id=self.experiment_id,
         )
 
         # recurrent models starts at 9 # len(list_models)
@@ -994,7 +986,7 @@ class ModelSelectionEngine:
                 self.metric
             ].mean()
             logger.info(
-                f"Best model mean cross-validation score on entire
+                f"Best model mean cross-validation score on entire experiment: {cross_validation_mean_score}"
             )
 
         # Retrain on entire training set, but keep score on cross-validation folds
@@ -1023,7 +1015,7 @@ class ModelSelectionEngine:
 
         # Save validation predictions
         best_pred.to_csv(
-            f"{self.results_dir}/
+            f"{self.results_dir}/prediction.csv",
             index=True,
             header=True,
             index_label="ID",
@@ -1065,10 +1057,6 @@ class ModelSelectionEngine:
         # Store metrics in DB
         drop_cols = [
             "DATE",
-            "SESSION",
-            "TRAIN_DATA",
-            "VAL_DATA",
-            "FEATURES",
             "MODEL_NAME",
             "MODEL_PATH",
         ]
@@ -1117,6 +1105,9 @@ class ModelSelectionEngine:
 
         logger.info(f"Best model overall is : {best_score_overall}")
 
+        best_model = joblib.load(best_model_path)
+        return best_model
+
     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
         self.type_name = "hyperopts"
 
@@ -1149,11 +1140,13 @@ class ModelSelectionEngine:
                 y_val=y_val,
                 model_name=model.model_name,
                 target_type=self.target_type,
-
+                experiment_name=self.experiment_name,
                 target_number=self.target_number,
                 create_model=model.create_model,
                 type_name="hyperopts",
                 plot=model.plot,
+                log_dir=model.log_dir,
+                target_clf_thresholds=self.target_clf_thresholds,
             ),
             param_space=model.search_params,
             tune_config=TuneConfig(
@@ -1206,21 +1199,28 @@ class ModelSelectionEngine:
             y_val,
             model.model_name,
             self.target_type,
-            self.
+            self.experiment_name,
             self.target_number,
             model.create_model,
             self.type_name,
             model.plot,
+            log_dir=model.log_dir,
+            target_clf_thresholds=self.target_clf_thresholds,
         )
 
 
-def evaluate(prediction: pd.DataFrame, target_type: str):
+def evaluate(
+    prediction: pd.DataFrame,
+    target_type: str,
+    target_clf_thresholds: dict = {"precision": 0.80},
+):
     """
     Function to evaluate model performance
 
     Args:
         - prediction: the prediction dataframe containing TARGET and PRED columns, as well as predicted probablities for each class for classification tasks
         - target_type: classification or regression
+        - target_clf_thresholds: thresholds for classification tasks like {"recall": 0.9} or {"precision": 0.9}
     """
     score = {}
     y_true = prediction["TARGET"]
@@ -1286,15 +1286,37 @@ def evaluate(prediction: pd.DataFrame, target_type: str):
             average=("binary" if num_classes == 2 else "macro"),
         )
         score["ROC_AUC"] = float(roc_auc_score(y_true, y_pred_proba, multi_class="ovr"))
-
-
-
-
-
-
-
-
+
+        # Store the complete thresholds dictionary
+        if len(target_clf_thresholds.keys()) > 1:
+            raise ValueError(
+                f"Only one metric can be specified for threshold optimization. found {target_clf_thresholds.keys()}"
+            )
+        # Get the single key-value pair or use defaults
+        metric, value = (
+            next(iter(target_clf_thresholds.items()))
+            if target_clf_thresholds
+            else ("precision", 0.8)
         )
+
+        score["THRESHOLDS"] = find_best_threshold(prediction, metric, value)
+
+        # Collect valid metrics across all classes (works for both binary and multiclass)
+        valid_metrics = [
+            m for m in score["THRESHOLDS"].values() if m["threshold"] is not None
+        ]
+
+        if valid_metrics:
+            score["PRECISION_AT_THRESHOLD"] = np.mean(
+                [m["precision"] for m in valid_metrics]
+            )
+            score["RECALL_AT_THRESHOLD"] = np.mean([m["recall"] for m in valid_metrics])
+            score["F1_AT_THRESHOLD"] = np.mean([m["f1"] for m in valid_metrics])
+        else:
+            score["PRECISION_AT_THRESHOLD"] = None
+            score["RECALL_AT_THRESHOLD"] = None
+            score["F1_AT_THRESHOLD"] = None
+
     return score
 
 
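The contract `evaluate` now enforces is easy to miss in the hunk: the thresholds dict must name exactly one metric, `find_best_threshold` returns one entry per class, and the `*_AT_THRESHOLD` scores average only over classes whose search produced a threshold. A minimal sketch with illustrative numbers (not real model output):

import numpy as np

target_clf_thresholds = {"precision": 0.80}
metric, value = next(iter(target_clf_thresholds.items()))  # ("precision", 0.8)

thresholds = {  # the shape find_best_threshold returns; values invented here
    "1": {"threshold": 0.62, "precision": 0.81, "recall": 0.44, "f1": 0.57},
    "2": {"threshold": None, "precision": None, "recall": None, "f1": None},
}
valid = [m for m in thresholds.values() if m["threshold"] is not None]
precision_at_threshold = np.mean([m["precision"] for m in valid]) if valid else None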
@@ -1380,196 +1402,181 @@ def plot_confusion_matrix(y_true, y_pred):
     plt.show()
 
 
-
-
+def find_best_threshold(
+    prediction: pd.DataFrame, metric: str = "recall", target_value: float | None = None
+) -> dict:
     """
-
+    General function to find best threshold optimizing recall, precision, or f1.
 
-
-    - prediction: DataFrame with 'TARGET' and '1' (predicted probabilities) columns
-
-    Returns:
-    - best_threshold: The threshold that maximizes the F1 score
-    - best_precision: The precision at that threshold
-    - best_recall: The recall at that threshold
-    """
-    y_true = prediction["TARGET"]
-    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
-
-    # Compute precision, recall, and thresholds
-    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
-
-    # Drop the first element to align with thresholds
-    precision = precision[1:]
-    recall = recall[1:]
-
-    # Filter out trivial cases (precision or recall = 0)
-    valid = (precision > 0) & (recall > 0)
-    if not np.any(valid):
-        raise ValueError("No valid threshold with non-zero precision and recall")
-
-    precision = precision[valid]
-    recall = recall[valid]
-    thresholds = thresholds[valid]
-
-    # Compute F1 scores for each threshold
-    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
-
-    best_index = np.argmax(f1_scores)
-
-    best_threshold = thresholds[best_index]
-    best_precision = precision[best_index]
-    best_recall = recall[best_index]
-
-    return best_threshold, best_precision, best_recall
-
-
-def find_best_f1_threshold(prediction, fscore_target: float):
-    """
-    Finds the highest threshold achieving at least the given F1 score target.
+    Supports both binary and multiclass classification.
 
     Parameters:
-    - prediction
-    -
+    - prediction (pd.DataFrame): must contain 'TARGET' and class probability columns.
+    - metric (str): 'recall', 'precision', or 'f1'.
+    - target_value (float | None): minimum acceptable value for the chosen metric.
 
     Returns:
-    -
-    - best_precision: Precision at that threshold
-    - best_recall: Recall at that threshold
-    - best_f1: Actual F1 score at that threshold
+    - dict: {class_label: {'threshold', 'precision', 'recall', 'f1'}}
     """
+    assert metric in {"recall", "precision", "f1"}, "Invalid metric"
     y_true = prediction["TARGET"]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Drop the first element of precision and recall to align with thresholds
-    precision = precision[1:]
-    recall = recall[1:]
-
-    # Filter out precision == 1.0 (which might correspond to predicting only 0s)
-    valid_indices = np.where(precision < 1.0)[0]
-    if len(valid_indices) == 0:
-        raise ValueError("No valid precision values less than 1.0")
+    pred_cols = [
+        col for col in prediction.columns if col not in ["ID", "TARGET", "PRED"]
+    ]
+    classes = [1] if len(pred_cols) <= 2 else sorted(y_true.unique())
+
+    results = {}
+    for cls in classes:
+        cls_str = str(cls)
+        if cls_str not in prediction.columns and cls not in prediction.columns:
+            logger.warning(f"Missing predicted probabilities for class '{cls}'")
+            results[cls_str] = {
+                "threshold": None,
+                "precision": None,
+                "recall": None,
+                "f1": None,
+            }
+            continue
+
+        # Binarize for one-vs-rest
+        y_binary = (y_true == int(cls)).astype(int)
+        y_scores = prediction[cls] if cls in prediction.columns else prediction[cls_str]
+
+        precision, recall, thresholds = precision_recall_curve(y_binary, y_scores)
+        precision, recall = precision[1:], recall[1:]  # Align with thresholds
+        thresholds = thresholds
+
+        f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
+
+        metric_values = {"precision": precision, "recall": recall, "f1": f1}
+
+        values = metric_values[metric]
+
+        if target_value is not None:
+            if metric == "recall":
+                # Only keep recall >= target
+                valid_indices = [i for i, r in enumerate(recall) if r >= target_value]
+                if valid_indices:
+                    # Pick the highest threshold
+                    best_idx = max(valid_indices, key=lambda i: thresholds[i])
+                else:
+                    logger.warning(
+                        f"[Class {cls}] No threshold with recall ≥ {target_value}"
+                    )
+                    best_idx = int(np.argmax(recall))  # fallback
 
-
-
-
+            elif metric == "precision":
+                # Only keep precision ≥ target and recall > 0
+                valid_indices = [
+                    i
+                    for i, (p, r) in enumerate(zip(precision, recall))
+                    if p >= target_value and r > 0
+                ]
+                if valid_indices:
+                    # Among valid ones, pick the one with highest recall
+                    best_idx = max(valid_indices, key=lambda i: recall[i])
+                else:
+                    logger.warning(
+                        f"[Class {cls}] No threshold with precision ≥ {target_value}"
+                    )
+                    best_idx = int(np.argmax(precision))  # fallback
 
-
-
+            elif metric == "f1":
+                valid_indices = [i for i, val in enumerate(f1) if val >= target_value]
+                if valid_indices:
+                    best_idx = max(valid_indices, key=lambda i: f1[i])
+                else:
+                    logger.warning(
+                        f"[Class {cls}] No threshold with f1 ≥ {target_value}"
+                    )
+                    best_idx = int(np.argmax(f1))  # fallback
+        else:
+            best_idx = int(np.argmax(values))  # no constraint, get best value
 
-
-
-
-
+        results[cls_str] = {
+            "threshold": float(thresholds[best_idx]),
+            "precision": float(precision[best_idx]),
+            "recall": float(recall[best_idx]),
+            "f1": float(f1[best_idx]),
+        }
 
-    return
+    return results
 
 
-def
+def apply_thresholds(
+    pred_proba: pd.DataFrame, threshold: dict | int | float, classes
+) -> pd.DataFrame:
     """
-
+    Apply thresholds to predicted probabilities.
 
     Parameters:
-
-
+    - pred_proba (pd.DataFrame): Probabilities per class.
+    - threshold (float | dict): Global threshold (float) or per-class dict from `find_best_threshold`.
+    - classes (iterable): List or array of class labels (used for binary classification).
 
     Returns:
-
+    - pd.DataFrame with "PRED" column and original predicted probabilities.
     """
-    y_true = prediction["TARGET"]
-    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
-
-    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
 
-    #
-
-
-
+    # Case 1: Per-class thresholds
+    if isinstance(threshold, dict):
+        class_predictions = []
+        class_probabilities = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        recall_target (float): Desired minimum recall (between 0 and 1)
-
-    Returns:
-        float: Best threshold achieving the desired recall, or None if not reachable
-    """
-    y_true = prediction["TARGET"]
-    y_pred_proba = prediction[1] if 1 in prediction.columns else prediction["1"]
-
-    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
+        for class_label, metrics in threshold.items():
+            # Get threshold from structured dict
+            _threshold = (
+                metrics.get("threshold") if isinstance(metrics, dict) else metrics[0]
+            )
+            if _threshold is not None:
+                if class_label not in pred_proba.columns:
+                    continue  # skip missing class
+                col = pred_proba[class_label]
+                exceeded = col >= _threshold
+                class_predictions.append(
+                    pd.Series(
+                        np.where(exceeded, class_label, -1), index=pred_proba.index
+                    )
+                )
+                class_probabilities.append(
+                    pd.Series(np.where(exceeded, col, -np.inf), index=pred_proba.index)
+                )
 
-
-
-
+        if class_predictions:
+            preds_df = pd.concat(class_predictions, axis=1)
+            probs_df = pd.concat(class_probabilities, axis=1)
 
-
+            def select_class(row_pred, row_prob, row_orig):
+                exceeded = row_pred >= 0
+                if exceeded.any():
+                    return row_pred.iloc[row_prob.argmax()]
+                return row_orig.idxmax()
 
-
-
-
+            pred = pd.Series(
+                [
+                    select_class(
+                        preds_df.loc[idx], probs_df.loc[idx], pred_proba.loc[idx]
+                    )
+                    for idx in pred_proba.index
+                ],
+                index=pred_proba.index,
+                name="PRED",
+            )
+        else:
+            # fallback: take max probability if no thresholds apply
+            pred = pred_proba.idxmax(axis=1).rename("PRED")
 
-
+    # Case 2: Global scalar threshold (e.g., 0.5 for binary)
+    else:
+        if len(classes) == 2:
+            # Binary classification: threshold on positive class
+            pos_class = classes[1]
+            pred = (pred_proba[pos_class] >= threshold).astype(int).rename("PRED")
+        else:
+            # Multiclass: default to max probability
+            pred = pred_proba.idxmax(axis=1).rename("PRED")
 
-    return
+    return pd.concat([pred, pred_proba], axis=1)
 
 
 def plot_threshold(prediction, threshold, precision, recall):
@@ -1629,7 +1636,7 @@ def get_pred_distribution(training_target_dir: str, model_name="linear"):
     Look at prediction distributions
     """
     prediction = pd.read_csv(
-        f"{training_target_dir}/{model_name}/
+        f"{training_target_dir}/{model_name}/prediction.csv",
        index_col="ID",
     )
     prediction.describe()