PyPI - lecrapaud - Versions diffs - 0.10.2__py3-none-any.whl → 0.11.1__py3-none-any.whl - Mend

lecrapaud 0.10.2-py3-none-any.whl → 0.11.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic. Click here for more details.

Files changed (15)

lecrapaud/api.py +180 -28
lecrapaud/config.py +1 -0
lecrapaud/jobs/scheduler.py +1 -7
lecrapaud/jobs/tasks.py +3 -43
lecrapaud/misc/tabpfn_tests.ipynb +222 -0
lecrapaud/model_selection.py +25 -65
lecrapaud/utils.py +6 -3
{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/METADATA +12 -28
{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/RECORD +14 -14
lecrapaud/speed_tests/tests.ipynb +0 -145
/lecrapaud/{speed_tests → misc}/test-gpu-bilstm.ipynb +0 -0
/lecrapaud/{speed_tests → misc}/test-gpu-resnet.ipynb +0 -0
/lecrapaud/{speed_tests → misc}/test-gpu-transformers.ipynb +0 -0
{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/LICENSE +0 -0
{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/WHEEL +0 -0

lecrapaud/api.py CHANGED Viewed

@@ -1,42 +1,47 @@
-"""
-Main API class
-the way I want it to work :
-app = LeCrapaud()
-kwargs = {
-}
-experiment = app.create_experiment(**kwargs) # return a class Experiment()
-ou
-experiment = app.get_experiment(exp_id)
-best_features, artifacts, best_model = experiment.train(get_data, get_data_params)
-new_data + target_pred + target_proba (if classif) = experiment.predict(**new_data)
+"""LeCrapaud API module.
-On veut aussi pouvoir juste faire :
+This module provides the main interface for the LeCrapaud machine learning pipeline.
+It allows for end-to-end ML workflows including data preprocessing, feature engineering,
+model training, and prediction.
-experiment.feature_engineering(data) : feat eng, return data
+Basic Usage:
+    # Create a LeCrapaud instance
+    lc = LeCrapaud()
-experiment.preprocess_feature(data) : split, encoding, pcas, return train, val, test df
+    # Create a new experiment
+    experiment = lc.create_experiment(data, target_numbers=[1], target_clf=[1])
-experiment.feature_selection(train) : return features
+    # Train a model
+    best_features, artifacts, best_model = experiment.train(data)
-experiment.preprocess_model(train, val, test) : return data = dict of df
+    # Make predictions
+    predictions, scores_reg, scores_clf = experiment.predict(new_data)
-experiment.model_selection(data) : return best_model
+    # Or use individual pipeline steps:
+    processed_data = experiment.feature_engineering(data)  # Feature engineering
+    train, val, test = experiment.preprocess_feature(data)  # Data splitting and encoding
+    selected_features = experiment.feature_selection(train)  # Feature selection
+    model_data = experiment.preprocess_model(train, val, test)  # Model preprocessing
+    best_model = experiment.model_selection(model_data)  # Model selection
 """
 import joblib
 import pandas as pd
 import logging
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
 from lecrapaud.utils import logger
 from lecrapaud.db.session import init_db
 from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
-from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine, evaluate
+from lecrapaud.model_selection import (
+    ModelSelectionEngine,
+    ModelEngine,
+    evaluate,
+    load_model,
+    plot_threshold,
+    plot_evaluation_for_classification,
+)
 from lecrapaud.feature_engineering import FeatureEngineeringEngine, PreprocessFeature
 from lecrapaud.experiment import create_experiment
 from lecrapaud.db import Experiment
@@ -44,24 +49,71 @@ from lecrapaud.search_space import normalize_models_idx
 class LeCrapaud:
+    """Main class for interacting with the LeCrapaud ML pipeline.
+    This class provides methods to create and retrieve experiments.
+    Args:
+        uri (str, optional): Database connection URI. If None, uses default connection.
+    """
     def __init__(self, uri: str = None):
+        """Initialize LeCrapaud with optional database URI."""
         init_db(uri=uri)
-    def create_experiment(self, data: pd.DataFrame, **kwargs):
+    def create_experiment(self, data: pd.DataFrame, **kwargs) -> "ExperimentEngine":
+        """Create a new experiment.
+        Args:
+            data (pd.DataFrame): Input data for the experiment
+            **kwargs: Additional arguments to configure the experiment
+        Returns:
+            ExperimentEngine: A new experiment instance
+        """
         return ExperimentEngine(data=data, **kwargs)
-    def get_experiment(self, id: int, **kwargs):
+    def get_experiment(self, id: int, **kwargs) -> "ExperimentEngine":
+        """Retrieve an existing experiment by ID.
+        Args:
+            id (int): The ID of the experiment to retrieve
+            **kwargs: Additional arguments to pass to the experiment
+        Returns:
+            ExperimentEngine: The retrieved experiment instance
+        """
         return ExperimentEngine(id=id, **kwargs)
+    def list_experiments(self, limit=1000) -> list[ExperimentEngine]:
+        """List all experiments in the database."""
+        return [ExperimentEngine(id=exp.id) for exp in Experiment.get_all(limit=limit)]
 class ExperimentEngine:
-    def __init__(self, id=None, data=None, **kwargs):
+    """Engine for managing ML experiments.
+    This class handles the complete ML pipeline including feature engineering,
+    model training, and prediction. It can be initialized with either new data
+    or by loading an existing experiment by ID.
+    Args:
+        id (int, optional): ID of an existing experiment to load
+        data (pd.DataFrame, optional): Input data for a new experiment
+        **kwargs: Additional configuration parameters
+    """
+    def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
+        """Initialize the experiment engine with either new or existing experiment."""
         if id:
             self.experiment = Experiment.get(id)
             kwargs.update(self.experiment.context)
         else:
+            if data is None:
+                raise ValueError("Either id or data must be provided")
             self.experiment = create_experiment(data=data, **kwargs)
+        # Set all kwargs as instance attributes
         for key, value in kwargs.items():
             if key == "models_idx":
                 value = normalize_models_idx(value)
@@ -295,3 +347,103 @@ class ExperimentEngine:
         return joblib.load(
             f"{self.experiment.path}/TARGET_{target_number}/thresholds.pkl"
         )
+    def load_model(self, target_number: int, model_name: str = None):
+        if not model_name:
+            return load_model(f"{self.experiment.path}/TARGET_{target_number}")
+        return load_model(f"{self.experiment.path}/TARGET_{target_number}/{model_name}")
+    def plot_feature_importance(
+        self, target_number: int, model_name="linear", top_n=30
+    ):
+        """
+        Plot feature importance ranking.
+        Args:
+            target_number (int): Target variable number
+            model_name (str): Name of the model to load
+            top_n (int): Number of top features to display
+        """
+        model = self.load_model(target_number, model_name)
+        experiment = self.experiment
+        # Get feature names
+        feature_names = experiment.get_features(target_number)
+        # Get feature importances based on model type
+        if hasattr(model, "feature_importances_"):
+            # For sklearn tree models
+            importances = model.feature_importances_
+            importance_type = "Gini"
+        elif hasattr(model, "get_score"):
+            # For xgboost models
+            importance_dict = model.get_score(importance_type="weight")
+            importances = np.zeros(len(feature_names))
+            for i, feat in enumerate(feature_names):
+                if feat in importance_dict:
+                    importances[i] = importance_dict[feat]
+            importance_type = "Weight"
+        elif hasattr(model, "feature_importance"):
+            # For lightgbm models
+            importances = model.feature_importance(importance_type="split")
+            importance_type = "Split"
+        elif hasattr(model, "coef_"):
+            # For linear models
+            importances = np.abs(model.coef_.flatten())
+            importance_type = "Absolute coefficient"
+        else:
+            raise ValueError(
+                f"Model {model_name} does not support feature importance calculation"
+            )
+        # Create a DataFrame for easier manipulation
+        importance_df = pd.DataFrame(
+            {"feature": feature_names[: len(importances)], "importance": importances}
+        )
+        # Sort features by importance and take top N
+        importance_df = importance_df.sort_values("importance", ascending=False).head(
+            top_n
+        )
+        # Create the plot
+        plt.figure(figsize=(10, max(6, len(importance_df) * 0.3)))
+        ax = sns.barplot(
+            data=importance_df,
+            x="importance",
+            y="feature",
+            palette="viridis",
+            orient="h",
+        )
+        # Add value labels
+        for i, v in enumerate(importance_df["importance"]):
+            ax.text(v, i, f"{v:.4f}", color="black", ha="left", va="center")
+        plt.title(f"Feature Importance ({importance_type})")
+        plt.tight_layout()
+        plt.show()
+        return importance_df
+    def plot_evaluation_for_classification(
+        self, target_number: int, model_name="linear"
+    ):
+        prediction = self.get_prediction(target_number, model_name)
+        thresholds = self.get_threshold(target_number)
+        plot_evaluation_for_classification(prediction)
+        for class_label, metrics in thresholds.items():
+            threshold = metrics["threshold"]
+            precision = metrics["precision"]
+            recall = metrics["recall"]
+            if threshold is not None:
+                tmp_pred = prediction[["TARGET", "PRED", class_label]].copy()
+                tmp_pred.rename(columns={class_label: 1}, inplace=True)
+                logger.info(f"Class {class_label}:")
+                plot_threshold(tmp_pred, threshold, precision, recall)
+            else:
+                logger.info(f"No threshold found for class {class_label}")

lecrapaud/config.py CHANGED Viewed

@@ -32,3 +32,4 @@ DB_URI = (
 )
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
+LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)

lecrapaud/jobs/scheduler.py CHANGED Viewed

@@ -5,11 +5,6 @@ from lecrapaud.jobs.tasks import app
 def schedule_tasks():
     schedule_tasks_list = [
-        {
-            "name": "task_send_daily_emails",
-            "task": "src.jobs.tasks.task_send_daily_emails",
-            "schedule": crontab(minute=00, hour=12),
-        },
         {
             "name": "task_training_experiment",
             "task": "src.jobs.tasks.task_training_experiment",
@@ -24,8 +19,7 @@ def schedule_tasks():
 def unschedule_tasks():
     unschedule_task_keys = [
-        "redbeat:task_send_daily_emails",
-        "redbeat:task_train_models",
+        "redbeat:task_training_experiment",
     ]
     for key in unschedule_task_keys:

lecrapaud/jobs/tasks.py CHANGED Viewed

@@ -1,30 +1,5 @@
 from lecrapaud.jobs import app
-# from honeybadger import honeybadger
-from lecrapaud.send_daily_emails import send_daily_emails
-from lecrapaud.config import EXPERIMENT_ID, RECEIVER_EMAIL
-from lecrapaud.training import run_training
-from lecrapaud.constants import stock_list_3
-from lecrapaud.search_space import get_models_idx
-@app.task(
-    bind=True,
-    autoretry_for=(Exception,),
-    retry_backoff=True,
-    retry_kwargs={"max_retries": 5},
-    acks_late=True,
-)
-def task_send_daily_emails(self):
-    try:
-        print(f"[Attempt #{self.request.retries}] task_send_daily_emails")
-        experiment_id = int(EXPERIMENT_ID)
-        email = RECEIVER_EMAIL
-        return send_daily_emails(email, experiment_id)
-    except Exception as e:
-        print(e)
-        # honeybadger.notify(e)
-        raise
+from lecrapaud.utils import logger
 @app.task(
@@ -36,22 +11,7 @@ def task_send_daily_emails(self):
 )
 def task_training_experiment(self):
     try:
-        print(f"[Attempt #{self.request.retries}] task_training_experiment")
-        run_training(
-            years_of_data=20,
-            list_of_groups=stock_list_3,
-            targets_numbers=range(1, 15),
-            percentile=20,
-            corr_threshold=80,
-            max_features=25,
-            models_idx=get_models_idx("linear", "xgb"),
-            number_of_trials=20,
-            perform_hyperoptimization=True,
-            perform_crossval=False,
-            preserve_model=False,
-            experiment_name="20y_stock_list_3_linear_xgb",
-        )
+        pass
     except Exception as e:
-        print(e)
-        # honeybadger.notify(e)
+        logger.error(e)
         raise

lecrapaud/misc/tabpfn_tests.ipynb ADDED Viewed

@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/base.py:89: UserWarning: Downloading model to /Users/pierregallet/Library/Caches/tabpfn/tabpfn-v2-classifier.ckpt.\n",
+      "  model, _, config_ = load_model_criterion_config(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "df286c7a921b48439f5a97dae1985862",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tabpfn-v2-classifier.ckpt:   0%|          | 0.00/29.0M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "55c41be7cbaf4b95a670b40157536ea1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/classifier.py:432: UserWarning: Running on CPU with more than 200 samples may be slow.\n",
+      "Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client\n",
+      "  check_cpu_warning(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ROC AUC: 0.9981992797118848\n",
+      "Accuracy 0.9824561403508771\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import load_breast_cancer\n",
+    "from sklearn.metrics import accuracy_score, roc_auc_score\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from tabpfn import TabPFNClassifier\n",
+    "\n",
+    "# Load data\n",
+    "X, y = load_breast_cancer(return_X_y=True)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.5, random_state=42\n",
+    ")\n",
+    "\n",
+    "# Initialize a classifier\n",
+    "clf = TabPFNClassifier()\n",
+    "clf.fit(X_train, y_train)\n",
+    "\n",
+    "# Predict probabilities\n",
+    "prediction_probabilities = clf.predict_proba(X_test)\n",
+    "print(\"ROC AUC:\", roc_auc_score(y_test, prediction_probabilities[:, 1]))\n",
+    "\n",
+    "# Predict labels\n",
+    "predictions = clf.predict(X_test)\n",
+    "print(\"Accuracy\", accuracy_score(y_test, predictions))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training time: 0.05 seconds\n",
+      "Accuracy: 0.9561\n",
+      "\n",
+      "Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.95      0.93      0.94        43\n",
+      "           1       0.96      0.97      0.97        71\n",
+      "\n",
+      "    accuracy                           0.96       114\n",
+      "   macro avg       0.96      0.95      0.95       114\n",
+      "weighted avg       0.96      0.96      0.96       114\n",
+      "\n",
+      "\n",
+      "Feature Importance:\n",
+      "Feature 0: 0.0284\n",
+      "Feature 1: 0.0198\n",
+      "Feature 2: 0.0000\n",
+      "Feature 3: 0.0136\n",
+      "Feature 4: 0.0094\n",
+      "Feature 5: 0.0053\n",
+      "Feature 6: 0.0060\n",
+      "Feature 7: 0.3079\n",
+      "Feature 8: 0.0001\n",
+      "Feature 9: 0.0063\n",
+      "Feature 10: 0.0093\n",
+      "Feature 11: 0.0089\n",
+      "Feature 12: 0.0168\n",
+      "Feature 13: 0.0119\n",
+      "Feature 14: 0.0113\n",
+      "Feature 15: 0.0087\n",
+      "Feature 16: 0.0220\n",
+      "Feature 17: 0.0043\n",
+      "Feature 18: 0.0036\n",
+      "Feature 19: 0.0040\n",
+      "Feature 20: 0.0578\n",
+      "Feature 21: 0.0276\n",
+      "Feature 22: 0.1538\n",
+      "Feature 23: 0.0360\n",
+      "Feature 24: 0.0072\n",
+      "Feature 25: 0.0000\n",
+      "Feature 26: 0.0295\n",
+      "Feature 27: 0.1860\n",
+      "Feature 28: 0.0049\n",
+      "Feature 29: 0.0000\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [11:58:14] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:738: \n",
+      "Parameters: { \"use_label_encoder\" } are not used.\n",
+      "\n",
+      "  bst.update(dtrain, iteration=i, fobj=obj)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# XGBoost Example\n",
+    "import xgboost as xgb\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "import time\n",
+    "\n",
+    "# Load the breast cancer dataset\n",
+    "X, y = load_breast_cancer(return_X_y=True)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.2, random_state=42\n",
+    ")\n",
+    "\n",
+    "# Create and train XGBoost classifier\n",
+    "start_time = time.time()\n",
+    "xgb_clf = xgb.XGBClassifier(\n",
+    "    n_estimators=100,\n",
+    "    max_depth=3,\n",
+    "    learning_rate=0.1,\n",
+    "    use_label_encoder=False,\n",
+    "    eval_metric=\"logloss\",\n",
+    "    random_state=42,\n",
+    ")\n",
+    "\n",
+    "# Train the model\n",
+    "xgb_clf.fit(X_train, y_train)\n",
+    "\n",
+    "# Make predictions\n",
+    "y_pred = xgb_clf.predict(X_test)\n",
+    "y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]\n",
+    "\n",
+    "# Calculate metrics\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "training_time = time.time() - start_time\n",
+    "\n",
+    "print(f\"Training time: {training_time:.2f} seconds\")\n",
+    "print(f\"Accuracy: {accuracy:.4f}\")\n",
+    "print(\"\\nClassification Report:\")\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "\n",
+    "# Feature importance\n",
+    "print(\"\\nFeature Importance:\")\n",
+    "for i, importance in enumerate(xgb_clf.feature_importances_):\n",
+    "    print(f\"Feature {i}: {importance:.4f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

lecrapaud/model_selection.py CHANGED Viewed

@@ -567,40 +567,14 @@ class ModelEngine:
         if not self.path:
             raise ValueError("Path is not set, cannot load model")
-        target_dir = Path(self.path)
-        # Search for files that contain '.best' or '.keras' in the name
-        best_files = list(target_dir.glob("*.best*")) + list(
-            target_dir.glob("*.keras*")
-        )
-        # If any files are found, try loading the first one (or process as needed)
-        if best_files:
-            file_path = best_files[
-                0
-            ]  # Assuming you want to open the first matching file
-            try:
-                # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
-                self._model = joblib.load(file_path)
-            except (pickle.UnpicklingError, EOFError):
-                # If it's not a pickle file, try loading it as a Keras model
-                try:
-                    # Attempt to load the file as a Keras model
-                    self._model = keras.models.load_model(file_path)
-                except Exception as e:
-                    raise FileNotFoundError(
-                        f"Model could not be loaded from path: {file_path}: {e}"
-                    )
-        else:
-            raise FileNotFoundError(
-                f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
-            )
+        self._model = load_model(self.path)
         self.model_name = self._model.model_name
         self.target_type = self._model.target_type
         # Load threshold
         self.threshold = (
-            joblib.load(f"{target_dir}/thresholds.pkl")
+            joblib.load(f"{self.path}/thresholds.pkl")
             if self.target_type == "classification"
             else None
         )
@@ -1326,12 +1300,29 @@ def get_log_dir(target_dir: str, model_name="test_model"):
     return str(log_dir)
-def print_scores(target_dir: str):
-    """
-    Monitor scores
-    """
-    scores_tracking = pd.read_csv(f"{target_dir}/scores_tracking.csv")
-    return scores_tracking
+def load_model(target_dir: str):
+    target_dir = Path(target_dir)
+    # Search for files that contain '.best' or '.keras' in the name
+    best_files = list(target_dir.glob("*.best*")) + list(target_dir.glob("*.keras*"))
+    # If any files are found, try loading the first one (or process as needed)
+    if best_files:
+        file_path = best_files[0]  # Assuming you want to open the first matching file
+        try:
+            # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
+            return joblib.load(file_path)
+        except (pickle.UnpicklingError, EOFError):
+            # If it's not a pickle file, try loading it as a Keras model
+            try:
+                # Attempt to load the file as a Keras model
+                return keras.models.load_model(file_path)
+            except Exception as e:
+                raise FileNotFoundError(
+                    f"Model could not be loaded from path: {file_path}: {e}"
+                )
+    else:
+        raise FileNotFoundError(
+            f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
+        )
 # plots
@@ -1629,37 +1620,6 @@ def plot_threshold(prediction, threshold, precision, recall):
 # OLD - to sort out
-def get_pred_distribution(target_dir: str, model_name="linear"):
-    """
-    Look at prediction distributions
-    """
-    prediction = pd.read_csv(
-        f"{target_dir}/{model_name}/prediction.csv",
-        index_col="ID",
-    )
-    prediction.describe()
-def plot_feature_importance(target_dir: str, model_name="linear"):
-    """
-    Monitor feature importance ranking to filter out unrelevant features
-    """
-    model = joblib.load(f"{target_dir}/{model_name}/{model_name}.best")
-    if hasattr(model, "feature_importances_"):
-        feature_importances_ = model.feature_importances_.flatten()
-    elif hasattr(model, "feature_importance"):
-        feature_importances_ = model.feature_importance.flatten()
-    elif hasattr(model, "coefs_"):
-        feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
-    elif hasattr(model, "coef_"):
-        feature_importances_ = model.coef_.flatten()
-    else:
-        feature_importances_ = []
-    sns.barplot(
-        data=feature_importances_,
-        orient="h",
-    )
 def print_model_estimators(target_dir: str, model_name="linear"):

lecrapaud/utils.py CHANGED Viewed

@@ -11,7 +11,7 @@ import re
 import string
 from lecrapaud.directories import logger_dir
-from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
+from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV, LECRAPAUD_LOCAL
 _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
@@ -24,7 +24,7 @@ def setup_logger():
         return logging.getLogger("lecrapaud" if PYTHON_ENV != "Worker" else "")
     print(
-        f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
+        f"Setting up logger lecrapaud with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
     )
     # ------------------------------------------------------------------ #
     #  Real configuration happens only on the FIRST call                 #
@@ -61,7 +61,10 @@ def setup_logger():
     return logger
-logger = setup_logger()
+if LECRAPAUD_LOCAL:
+    logger = setup_logger()
+else:
+    logger = logging.getLogger(__name__)
 def get_df_name(obj, namespace):

{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.10.2
+Version: 0.11.1
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
@@ -8,43 +8,27 @@ Requires-Python: ==3.12.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: backoff (>=2.2.1)
 Requires-Dist: category-encoders (>=2.8.1)
-Requires-Dist: celery (>=5.5.1)
-Requires-Dist: curl-cffi (>=0.11.1)
-Requires-Dist: deep-translator (>=1.11.4)
-Requires-Dist: degiro-connector (>=3.0.26)
-Requires-Dist: fake-useragent (>=2.1.0)
+Requires-Dist: celery (>=5.5.3)
 Requires-Dist: ftfy (>=6.3.1)
-Requires-Dist: honeybadger (>=0.21)
-Requires-Dist: joblib (>=1.4.2)
-Requires-Dist: keras (>=3.9.0)
-Requires-Dist: keras-tcn (>=3.1.2)
+Requires-Dist: joblib (>=1.5.1)
+Requires-Dist: keras (>=3.10.0)
 Requires-Dist: lightgbm (>=4.6.0)
-Requires-Dist: matplotlib (>=3.10.1)
+Requires-Dist: matplotlib (>=3.10.3)
 Requires-Dist: mlxtend (>=0.23.4)
 Requires-Dist: numpy (>=2.1.3)
-Requires-Dist: openai (>=1.86.0)
-Requires-Dist: pandas (>=2.2.3)
-Requires-Dist: pandas-market-calendars (>=4.6.1)
-Requires-Dist: playwright (>=1.52.0)
-Requires-Dist: pydantic (>=2.10.6)
-Requires-Dist: python-dotenv (>=1.0.1)
-Requires-Dist: pytz (>=2025.1)
-Requires-Dist: ratelimit (>=2.2.1)
+Requires-Dist: openai (>=1.88.0)
+Requires-Dist: pandas (>=2.3.0)
+Requires-Dist: python-dotenv (>=1.1.0)
 Requires-Dist: scikit-learn (>=1.6.1)
-Requires-Dist: scipy (>=1.15.2)
+Requires-Dist: scipy (>=1.15.3)
 Requires-Dist: seaborn (>=0.13.2)
-Requires-Dist: sentence-transformers (>=3.4.1)
-Requires-Dist: sqlalchemy (>=2.0.39)
-Requires-Dist: tensorboardx (>=2.6.2.2)
+Requires-Dist: sqlalchemy (>=2.0.41)
+Requires-Dist: tensorboardx (>=2.6.4)
 Requires-Dist: tensorflow (>=2.19.0)
-Requires-Dist: tf-keras (>=2.19.0)
 Requires-Dist: tiktoken (>=0.9.0)
 Requires-Dist: tqdm (>=4.67.1)
-Requires-Dist: xgboost (>=3.0.0)
-Requires-Dist: yahoo-fin (>=0.8.9.1)
-Requires-Dist: yfinance (>=0.2.55)
+Requires-Dist: xgboost (>=3.0.2)
 Description-Content-Type: text/markdown
 <div align="center">

{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
-lecrapaud/api.py,sha256=hpAVsHeOaxck2ufH0BA7IsKQXG9oA8Y_q1lvaHn6liU,10563
-lecrapaud/config.py,sha256=n5qYpWyNSgxhJrmiujqRPa_EN3eLjGjtXDsboi1eeCo,993
+lecrapaud/api.py,sha256=wrMc3TaP5qCzGvmN0QsYKxUt2ZPzK3z4nmnetQo23io,16645
+lecrapaud/config.py,sha256=eYnrktVq457xMIMGcUSilJdNxCsaGP_gRAlzCSwd6Vo,1047
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
 lecrapaud/db/alembic/env.py,sha256=rseEi8oR_eKXYYW3UwOKiCMuDEwT4lxsT7llySOUpgk,2305
@@ -29,16 +29,16 @@ lecrapaud/feature_selection.py,sha256=u3TWq3G5Xh3geQevGDOZEt_rl_m6-K_CR7SttFtpwK
 lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
 lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
 lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
-lecrapaud/jobs/scheduler.py,sha256=SiYWPxokpKnR8V6btLOO6gbK0PEjSRoeG0kCbQvYPf4,990
-lecrapaud/jobs/tasks.py,sha256=jfhOCsgZlZGTnsLB_K7-Y3NgJqpzpUCFu7EfDQuIeSY,1655
-lecrapaud/model_selection.py,sha256=hKa6rQPbFBPSiQv98R89bxp-U-3Kufj9pETV0ff6KKM,61767
+lecrapaud/jobs/scheduler.py,sha256=OKXhb_gxE1-R7D1HyPns88iIS31Wd4gRqEzk4EqS0J4,774
+lecrapaud/jobs/tasks.py,sha256=sbD2_IT45DE4yQQbR6DVb9xv5x06rYDtUvSK8exYxes,332
+lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQTjqg,6676
+lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
+lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
+lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
+lecrapaud/model_selection.py,sha256=PQGEWVWN-4ZeHCqrmXBpHgq1QZi_1nOOeu5gazXGDLQ,60487
 lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
-lecrapaud/speed_tests/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
-lecrapaud/speed_tests/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
-lecrapaud/speed_tests/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
-lecrapaud/speed_tests/tests.ipynb,sha256=RjI7LDHSsbadUkea_hT14sD7ivljtIQk4NB5McXJ1bE,3835
-lecrapaud/utils.py,sha256=zM3V6WzY7XTBnbBAzk5_HKPYsH4WskjbqFwnQLG9g90,8197
-lecrapaud-0.10.2.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
-lecrapaud-0.10.2.dist-info/METADATA,sha256=WBdFSR8XKbLy8KhvJJWi48ylh7vP1nVKk-h9ga0OApU,11624
-lecrapaud-0.10.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-lecrapaud-0.10.2.dist-info/RECORD,,
+lecrapaud/utils.py,sha256=MUgDoJ31GOF8WRLn_WLzDbHw7OTKxq_ldnZT6dpxdQo,8295
+lecrapaud-0.11.1.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.11.1.dist-info/METADATA,sha256=YDEIQa4j_87wQqkW4SKzeomDWgUjKCFdEYn55nO41MI,11017
+lecrapaud-0.11.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+lecrapaud-0.11.1.dist-info/RECORD,,

lecrapaud/speed_tests/tests.ipynb DELETED Viewed

@@ -1,145 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# import autosklearn.classification\n",
-    "import sklearn.datasets\n",
-    "import sklearn.metrics\n",
-    "from pprint import pprint\n",
-    "from tabpfn import TabPFNClassifier\n",
-    "import numpy as np\n",
-    "from pathlib import Path\n",
-    "import pandas as pd\n",
-    "import time\n",
-    "from sklearn.metrics import accuracy_score\n",
-    "from sklearn.datasets import load_breast_cancer\n",
-    "from sklearn.model_selection import train_test_split"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
-    "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
-    "    X, y, random_state=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "automl = autosklearn.classification.AutoSklearnClassifier(\n",
-    "    time_left_for_this_task=120,\n",
-    "    per_run_time_limit=30,\n",
-    "    tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
-    "    include={\n",
-    "        \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
-    "        \"feature_preprocessor\": [\n",
-    "            \"no_preprocessing\",\n",
-    "            \"polynomial\",\n",
-    "            \"select_percentile_classification\",\n",
-    "        ],\n",
-    "    },\n",
-    "    ensemble_kwargs={\"ensemble_size\": 1},\n",
-    ")\n",
-    "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pprint(automl.show_models(), indent=4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = automl.predict(X_test)\n",
-    "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
-    "# more ensemble members are slower, but more accurate\n",
-    "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "classifier.fit(X_train, y_train)\n",
-    "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
-    "print(\n",
-    "    \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# We also offer the `predict_proba` interface\n",
-    "classifier.predict_proba(X_test).shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
-    "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
-    "out_table"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

/lecrapaud/{speed_tests → misc}/test-gpu-bilstm.ipynb RENAMED Viewed

File without changes

/lecrapaud/{speed_tests → misc}/test-gpu-resnet.ipynb RENAMED Viewed

File without changes

/lecrapaud/{speed_tests → misc}/test-gpu-transformers.ipynb RENAMED Viewed

File without changes

{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/WHEEL RENAMED Viewed

File without changes

lecrapaud 0.10.2__py3-none-any.whl → 0.11.1__py3-none-any.whl

Potentially problematic release.

lecrapaud 0.10.2py3-none-any.whl → 0.11.1py3-none-any.whl