lecrapaud 0.20.1__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lecrapaud has been flagged as potentially problematic.

lecrapaud/__init__.py CHANGED
@@ -1 +1,5 @@
1
1
  from lecrapaud.api import *
2
+
3
+ # Export default parameters for easy access
4
+ from lecrapaud.api import ExperimentEngine
5
+ DEFAULT_EXPERIMENT_PARAMS = ExperimentEngine.DEFAULT_PARAMS
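The new re-export makes the default experiment parameters reachable from the package root. A minimal sketch of how this could be used (assuming lecrapaud 0.21.0 is installed):

```python
# Minimal sketch: inspecting the exported defaults (lecrapaud 0.21.0)
from lecrapaud import DEFAULT_EXPERIMENT_PARAMS

print(DEFAULT_EXPERIMENT_PARAMS["val_size"])          # 0.2
print(DEFAULT_EXPERIMENT_PARAMS["number_of_trials"])  # 20
```

Note that `DEFAULT_EXPERIMENT_PARAMS` is a direct reference to `ExperimentEngine.DEFAULT_PARAMS`; `ExperimentEngine.get_default_params()` (added below in `api.py`) returns a copy that is safe to mutate.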
lecrapaud/api.py CHANGED
@@ -163,28 +163,96 @@ class ExperimentEngine:
163
163
  **kwargs: Additional configuration parameters
164
164
  """
165
165
 
166
+ # Default values for all experiment parameters
167
+ DEFAULT_PARAMS = {
168
+ # Feature Engineering
169
+ "columns_drop": [],
170
+ "columns_boolean": [],
171
+ "columns_date": [],
172
+ "columns_te_groupby": [],
173
+ "columns_te_target": [],
174
+ "for_training": True,
175
+ # Preprocessing
176
+ "time_series": False,
177
+ "val_size": 0.2,
178
+ "test_size": 0.2,
179
+ "columns_pca": [],
180
+ "pca_temporal": [],
181
+ "pca_cross_sectional": [],
182
+ "columns_onehot": [],
183
+ "columns_binary": [],
184
+ "columns_ordinal": [],
185
+ "columns_frequency": [],
186
+ # Feature Selection
187
+ "percentile": 20,
188
+ "corr_threshold": 80,
189
+ "max_features": 50,
190
+ "max_p_value_categorical": 0.05,
191
+ # Model Selection
192
+ "target_numbers": [],
193
+ "target_clf": [],
194
+ "models_idx": [],
195
+ "max_timesteps": 120,
196
+ "perform_hyperopt": True,
197
+ "number_of_trials": 20,
198
+ "perform_crossval": False,
199
+ "plot": True,
200
+ "preserve_model": True,
201
+ "target_clf_thresholds": {},
202
+ # Data structure
203
+ "date_column": None,
204
+ "group_column": None,
205
+ }
206
+
166
207
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
208
  """Initialize the experiment engine with either new or existing experiment."""
168
- # Set all kwargs as instance attributes
169
- if "models_idx" in kwargs:
170
- kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
- for key, value in kwargs.items():
172
- setattr(self, key, value)
173
209
 
174
210
  if id:
211
+ # Load existing experiment
175
212
  self.experiment = Experiment.get(id)
176
- kwargs.update(self.experiment.context)
177
- experiment_dir = f"{tmp_dir}/{self.experiment.name}"
178
- preprocessing_dir = f"{experiment_dir}/preprocessing"
179
- data_dir = f"{experiment_dir}/data"
180
- os.makedirs(preprocessing_dir, exist_ok=True)
181
- os.makedirs(data_dir, exist_ok=True)
213
+ # Context from DB takes precedence over kwargs
214
+ effective_kwargs = {
215
+ **self.DEFAULT_PARAMS,
216
+ **kwargs,
217
+ **self.experiment.context,
218
+ }
182
219
  else:
183
220
  if data is None:
184
221
  raise ValueError(
185
222
  "Either id or data must be provided. Data can be a path to a folder containing trained models"
186
223
  )
187
- self.experiment = create_experiment(data=data, **kwargs)
224
+ # New experiment: merge defaults with provided kwargs
225
+ effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}
226
+
227
+ # Normalize models_idx if present
228
+ if "models_idx" in effective_kwargs:
229
+ effective_kwargs["models_idx"] = normalize_models_idx(
230
+ effective_kwargs["models_idx"]
231
+ )
232
+
233
+ # Set all parameters as instance attributes
234
+ for key, value in effective_kwargs.items():
235
+ setattr(self, key, value)
236
+
237
+ # Create experiment if new
238
+ if not id:
239
+ self.experiment = create_experiment(data=data, **effective_kwargs)
240
+
241
+ # Create directories
242
+ experiment_dir = f"{tmp_dir}/{self.experiment.name}"
243
+ preprocessing_dir = f"{experiment_dir}/preprocessing"
244
+ data_dir = f"{experiment_dir}/data"
245
+ os.makedirs(preprocessing_dir, exist_ok=True)
246
+ os.makedirs(data_dir, exist_ok=True)
247
+
248
+ @classmethod
249
+ def get_default_params(cls):
250
+ """Get the default parameters for experiments."""
251
+ return cls.DEFAULT_PARAMS.copy()
252
+
253
+ def get_effective_context(self):
254
+ """Get the effective context (merged defaults + experiment context)."""
255
+ return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
188
256
 
189
257
  def train(self, data, best_params=None):
190
258
  logger.info("Running training...")
lecrapaud/config.py CHANGED
@@ -32,6 +32,7 @@ DB_URI: str = (
32
32
  )
33
33
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
34
  LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
35
- LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)
36
35
  LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
37
- LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray").lower()
36
+ LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
37
+ "LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
38
+ ).lower()
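With `LECRAPAUD_LOCAL` removed, the remaining settings here are plain environment variables, and the optimization backend value is lowercased at import time. A small sketch of configuring them before importing the package (values are illustrative):

```python
# Sketch: environment-based configuration read by lecrapaud/config.py
import os

os.environ["LECRAPAUD_TABLE_PREFIX"] = "lecrapaud"    # default prefix for DB tables
os.environ["LECRAPAUD_OPTIMIZATION_BACKEND"] = "RAY"  # normalized to "ray" on import

from lecrapaud.config import LECRAPAUD_OPTIMIZATION_BACKEND
print(LECRAPAUD_OPTIMIZATION_BACKEND)  # "ray"
```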
lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py CHANGED
@@ -5,38 +5,71 @@ Revises: 033e0f7eca4f
5
5
  Create Date: 2025-10-28 20:06:54.792631
6
6
 
7
7
  """
8
+
8
9
  from typing import Sequence, Union
9
10
 
10
11
  from alembic import op
11
12
  import sqlalchemy as sa
12
13
  from sqlalchemy.dialects import mysql
14
+ from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
13
15
 
14
16
  # revision identifiers, used by Alembic.
15
- revision: str = '0a8fb7826e9b'
16
- down_revision: Union[str, None] = '033e0f7eca4f'
17
+ revision: str = "0a8fb7826e9b"
18
+ down_revision: Union[str, None] = "033e0f7eca4f"
17
19
  branch_labels: Union[str, Sequence[str], None] = None
18
20
  depends_on: Union[str, Sequence[str], None] = None
19
21
 
20
22
 
21
23
  def upgrade() -> None:
22
24
  # ### commands auto generated by Alembic - please adjust! ###
23
- op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
- op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
- op.drop_column('lecrapaud_experiments', 'max_features')
26
- op.drop_column('lecrapaud_experiments', 'percentile')
27
- op.drop_column('lecrapaud_experiments', 'type')
28
- op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
- op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
25
+ op.add_column(
26
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
27
+ sa.Column("number_of_targets", sa.Integer(), nullable=True),
28
+ )
29
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "corr_threshold")
30
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "max_features")
31
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "percentile")
32
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "type")
33
+ op.drop_index(
34
+ op.f("ix_model_selection_scores_id"),
35
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
36
+ )
37
+ op.create_index(
38
+ op.f("ix_model_selection_scores_id"),
39
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
40
+ ["id"],
41
+ unique=False,
42
+ )
30
43
  # ### end Alembic commands ###
31
44
 
32
45
 
33
46
  def downgrade() -> None:
34
47
  # ### commands auto generated by Alembic - please adjust! ###
35
- op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
- op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
- op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
- op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
- op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
- op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
- op.drop_column('lecrapaud_experiments', 'number_of_targets')
48
+ op.drop_index(
49
+ op.f("ix_lecrapaud_model_selection_scores_id"),
50
+ table_name=f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
51
+ )
52
+ op.create_index(
53
+ op.f("ix_model_selection_scores_id"),
54
+ f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores",
55
+ ["id"],
56
+ unique=False,
57
+ )
58
+ op.add_column(
59
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
60
+ sa.Column("type", mysql.VARCHAR(length=50), nullable=False),
61
+ )
62
+ op.add_column(
63
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
64
+ sa.Column("percentile", mysql.FLOAT(), nullable=False),
65
+ )
66
+ op.add_column(
67
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
68
+ sa.Column("max_features", mysql.INTEGER(), autoincrement=False, nullable=False),
69
+ )
70
+ op.add_column(
71
+ f"{LECRAPAUD_TABLE_PREFIX}_experiments",
72
+ sa.Column("corr_threshold", mysql.FLOAT(), nullable=False),
73
+ )
74
+ op.drop_column(f"{LECRAPAUD_TABLE_PREFIX}_experiments", "number_of_targets")
42
75
  # ### end Alembic commands ###
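The migration now builds table names from `LECRAPAUD_TABLE_PREFIX` instead of hard-coding the `lecrapaud_` prefix, so installations running with a custom prefix migrate the right tables. Illustrative only:

```python
# Illustrative: how the prefix resolves inside the migration above
LECRAPAUD_TABLE_PREFIX = "myapp"  # hypothetical value; the default is "lecrapaud"

print(f"{LECRAPAUD_TABLE_PREFIX}_experiments")             # myapp_experiments
print(f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores")  # myapp_model_selection_scores
```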
lecrapaud/experiment.py CHANGED
@@ -16,15 +16,21 @@ from lecrapaud.db.session import get_db
16
16
 
17
17
  def create_experiment(
18
18
  data: pd.DataFrame | str,
19
- date_column,
20
- group_column,
21
19
  experiment_name,
20
+ date_column=None,
21
+ group_column=None,
22
22
  **kwargs,
23
23
  ):
24
24
  if isinstance(data, str):
25
25
  path = f"{data}/data/full.pkl"
26
26
  data = joblib.load(path)
27
27
 
28
+ if kwargs.get("time_series") and not date_column:
29
+ raise ValueError("date_column must be provided for time series experiments")
30
+
31
+ if experiment_name is None:
32
+ raise ValueError("experiment_name must be provided")
33
+
28
34
  dates = {}
29
35
  if date_column:
30
36
  dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
lecrapaud/feature_engineering.py CHANGED
@@ -94,7 +94,7 @@ class FeatureEngineeringEngine:
94
94
  self.data = data
95
95
  self.experiment = experiment
96
96
  self.for_training = for_training
97
-
97
+
98
98
  # Get all parameters from experiment context
99
99
  self.columns_drop = self.experiment.context.get("columns_drop", [])
100
100
  self.columns_boolean = self.experiment.context.get("columns_boolean", [])
@@ -330,15 +330,19 @@ class PreprocessFeature:
330
330
  self.test_size = context.get("test_size", 0.2)
331
331
  self.target_numbers = context.get("target_numbers", [])
332
332
  self.target_clf = context.get("target_clf", [])
333
-
333
+
334
334
  # Handle list parameters with uppercase conversion
335
335
  self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
336
  self.pca_temporal = context.get("pca_temporal", [])
337
337
  self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
338
  self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
339
  self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
- self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
- self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
340
+ self.columns_ordinal = [
341
+ col.upper() for col in context.get("columns_ordinal", [])
342
+ ]
343
+ self.columns_frequency = [
344
+ col.upper() for col in context.get("columns_frequency", [])
345
+ ]
342
346
 
343
347
  self.experiment_dir = self.experiment.path
344
348
  self.experiment_id = self.experiment.id
@@ -653,6 +657,221 @@ class PreprocessFeature:
653
657
 
654
658
  return df, pcas_dict
655
659
 
660
+ def add_pca_feature_cross_sectional_time_series(
661
+ self,
662
+ df: pd.DataFrame,
663
+ *,
664
+ n_components: int = 5,
665
+ pcas: dict[str, Pipeline] | None = None, # if provided: transform only
666
+ impute_strategy: str = "median",
667
+ standardize: bool = True,
668
+ lookback_days: int = 365, # number of days to look back for fitting
669
+ refresh_frequency: int = 90, # refit the PCA every X days
670
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
671
+ """
672
+ Builds a pivot (index=index_col, columns=columns_col, values=value_col),
673
+ fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
674
+ (by index_col) back into df. Returns (df_with_features, pipe).
675
+
676
+ For time series: the PCA is fitted on past data only to avoid leakage,
677
+ with periodic refreshes.
678
+
679
+ Handles panel data with several time series
680
+ (e.g. several stocks sharing the same dates).
681
+ """
682
+
683
+ pcas_dict = {}
684
+ index_saved = df.index
685
+
686
+ for pca_cross_sectional in self.pca_cross_sectional:
687
+ name, index_col, columns_col, value_col = (
688
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
689
+ )
690
+ prefix = f"CS_PC_{name}"
691
+
692
+ # Check whether this is a time series with index = date
693
+ # Dates are already ordinals after cyclic_encode_date
694
+ is_time_series = self.time_series and index_col == self.date_column
695
+
696
+ if is_time_series:
697
+ # Special case: cross-sectional PCA on panel time series data
698
+ # For example: PCA on the returns of all stocks at each date
699
+ # to capture the market regime
700
+
701
+ all_scores = []
702
+
703
+ # Dates are already ordinals
704
+ unique_dates = sorted(df[index_col].unique())
705
+
706
+ # For inference, use the provided PCA
707
+ if pcas is not None:
708
+ pipe = pcas[name]
709
+ pivot = df.pivot_table(
710
+ index=index_col, columns=columns_col, values=value_col
711
+ ).sort_index()
712
+ scores = pipe.transform(pivot)
713
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
714
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
715
+ else:
716
+ # Training: fit the PCA on an expanding window with periodic refresh
717
+ pipe = None
718
+ last_fit_date = None
719
+
720
+ for i, current_date_ordinal in enumerate(unique_dates):
721
+ # Convert the ordinal back to a date for time calculations
722
+ current_date = pd.Timestamp.fromordinal(
723
+ int(current_date_ordinal)
724
+ )
725
+
726
+ # Decide whether the PCA should be refitted
727
+ should_refit = pipe is None or ( # First time
728
+ last_fit_date is not None
729
+ and (current_date - last_fit_date).days >= refresh_frequency
730
+ )
731
+
732
+ if (
733
+ should_refit and i > 30
734
+ ): # Wait for at least 30 days of data
735
+ # Take the data from the last 'lookback_days' days
736
+ lookback_start_date = current_date - pd.Timedelta(
737
+ days=lookback_days
738
+ )
739
+ lookback_start_ordinal = pd.Timestamp.toordinal(
740
+ lookback_start_date
741
+ )
742
+
743
+ # Mask keeping past dates only (to avoid leakage)
744
+ mask_fit = (df[index_col] >= lookback_start_ordinal) & (
745
+ df[index_col] < current_date_ordinal
746
+ )
747
+ df_fit = df[mask_fit]
748
+
749
+ if len(df_fit) > 0:
750
+ # Build the pivot for the lookback period
751
+ pivot_fit = df_fit.pivot_table(
752
+ index=index_col,
753
+ columns=columns_col,
754
+ values=value_col,
755
+ ).sort_index()
756
+
757
+ # Check that there are enough dates and columns
758
+ if (
759
+ len(pivot_fit) >= n_components
760
+ and pivot_fit.shape[1] >= n_components
761
+ ):
762
+ # Create a new pipeline
763
+ steps = [
764
+ (
765
+ "imputer",
766
+ SimpleImputer(strategy=impute_strategy),
767
+ )
768
+ ]
769
+ if standardize:
770
+ steps.append(
771
+ (
772
+ "scaler",
773
+ StandardScaler(
774
+ with_mean=True, with_std=True
775
+ ),
776
+ )
777
+ )
778
+ pca = PCA(n_components=n_components, random_state=0)
779
+ steps.append(("pca", pca))
780
+ pipe = Pipeline(steps)
781
+ pipe.fit(pivot_fit)
782
+ last_fit_date = current_date
783
+
784
+ logger.debug(
785
+ f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
786
+ f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
787
+ )
788
+
789
+ # Transform the current date only
790
+ if pipe is not None:
791
+ df_current = df[df[index_col] == current_date_ordinal]
792
+ if len(df_current) > 0:
793
+ pivot_current = df_current.pivot_table(
794
+ index=index_col,
795
+ columns=columns_col,
796
+ values=value_col,
797
+ )
798
+ try:
799
+ scores_current = pipe.transform(pivot_current)
800
+ scores_dict = {
801
+ index_col: [current_date_ordinal],
802
+ **{
803
+ f"{prefix}_{j}": [scores_current[0, j]]
804
+ for j in range(n_components)
805
+ },
806
+ }
807
+ all_scores.append(pd.DataFrame(scores_dict))
808
+ except Exception as e:
809
+ # On error (e.g. new columns), fill with missing values
810
+ logger.debug(
811
+ f"PCA transform error at date {current_date}: {str(e)}"
812
+ )
813
+ scores_dict = {
814
+ index_col: [current_date_ordinal],
815
+ **{
816
+ f"{prefix}_{j}": [np.nan]
817
+ for j in range(n_components)
818
+ },
819
+ }
820
+ all_scores.append(pd.DataFrame(scores_dict))
821
+ else:
822
+ # No PCA fitted yet, emit NaNs
823
+ scores_dict = {
824
+ index_col: [current_date_ordinal],
825
+ **{
826
+ f"{prefix}_{j}": [np.nan]
827
+ for j in range(n_components)
828
+ },
829
+ }
830
+ all_scores.append(pd.DataFrame(scores_dict))
831
+
832
+ # Combine all scores
833
+ if all_scores:
834
+ scores_df = pd.concat(all_scores, ignore_index=True)
835
+ else:
836
+ # Create an empty DataFrame with the right columns
837
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
838
+ scores_df = pd.DataFrame(columns=[index_col] + cols)
839
+
840
+ # Merge the scores
841
+ df = df.merge(scores_df, on=index_col, how="left")
842
+ df.index = index_saved
843
+ pcas_dict.update({name: pipe})
844
+
845
+ else:
846
+ # Classic approach (not time series, or index != date)
847
+ pivot = df.pivot_table(
848
+ index=index_col, columns=columns_col, values=value_col
849
+ ).sort_index()
850
+
851
+ # Pipeline reused between train and test
852
+ if pcas is None:
853
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
854
+ if standardize:
855
+ steps.append(
856
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
857
+ )
858
+ pca = PCA(n_components=n_components, random_state=0)
859
+ steps.append(("pca", pca))
860
+ pipe = Pipeline(steps)
861
+ pipe.fit(pivot) # <- fit on TRAIN only
862
+ else:
863
+ pipe = pcas[name] # <- TEST: reuse the existing pipeline
864
+
865
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
866
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
867
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
868
+
869
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
870
+ df.index = index_saved
871
+ pcas_dict.update({name: pipe})
872
+
873
+ return df, pcas_dict
874
+
656
875
  # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
657
876
  def add_pca_feature_temporal(
658
877
  self,
lecrapaud/utils.py CHANGED
@@ -11,7 +11,7 @@ import re
11
11
  import string
12
12
 
13
13
  from lecrapaud.directories import logger_dir
14
- from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV, LECRAPAUD_LOCAL
14
+ from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
15
15
 
16
16
 
17
17
  _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
@@ -237,7 +237,7 @@ def serialize_for_json(obj):
237
237
  import numpy as np
238
238
  from datetime import datetime, date
239
239
  import pandas as pd
240
-
240
+
241
241
  # Handle NumPy types
242
242
  if isinstance(obj, (np.integer, np.int64, np.int32, np.int16)):
243
243
  return int(obj)
@@ -247,11 +247,11 @@ def serialize_for_json(obj):
247
247
  return obj.tolist()
248
248
  elif isinstance(obj, np.bool_):
249
249
  return bool(obj)
250
-
250
+
251
251
  # Handle datetime types
252
252
  elif isinstance(obj, (datetime, date, pd.Timestamp)):
253
253
  return obj.isoformat()
254
-
254
+
255
255
  # Handle basic Python types
256
256
  elif isinstance(obj, (str, int, float, bool, type(None))):
257
257
  return obj
lecrapaud-0.21.0.dist-info/METADATA ADDED
@@ -0,0 +1,347 @@
1
+ Metadata-Version: 2.4
2
+ Name: lecrapaud
3
+ Version: 0.21.0
4
+ Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
+ License: Apache License
6
+ License-File: LICENSE
7
+ Author: Pierre H. Gallet
8
+ Requires-Python: ==3.12.*
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Dist: catboost (>=1.2.8)
13
+ Requires-Dist: category-encoders (>=2.8.1)
14
+ Requires-Dist: celery (>=5.5.3)
15
+ Requires-Dist: celery-redbeat (>=2.3.2)
16
+ Requires-Dist: ftfy (>=6.3.1)
17
+ Requires-Dist: hyperopt (>=0.2.7)
18
+ Requires-Dist: joblib (>=1.5.1)
19
+ Requires-Dist: keras (>=3.10.0)
20
+ Requires-Dist: keras-tcn (>=3.5.6)
21
+ Requires-Dist: lightgbm (>=4.6.0)
22
+ Requires-Dist: matplotlib (>=3.10.3)
23
+ Requires-Dist: mlxtend (>=0.23.4)
24
+ Requires-Dist: numpy (>=2.1.3)
25
+ Requires-Dist: openai (>=1.88.0)
26
+ Requires-Dist: pandas (>=2.3.0)
27
+ Requires-Dist: pydantic (>=2.9.2)
28
+ Requires-Dist: python-dotenv (>=1.1.0)
29
+ Requires-Dist: scikit-learn (>=1.6.1)
30
+ Requires-Dist: scipy (<1.14.0)
31
+ Requires-Dist: seaborn (>=0.13.2)
32
+ Requires-Dist: sqlalchemy (>=2.0.41)
33
+ Requires-Dist: tensorboardx (>=2.6.4)
34
+ Requires-Dist: tensorflow (>=2.19.0)
35
+ Requires-Dist: tiktoken (>=0.9.0)
36
+ Requires-Dist: tqdm (>=4.67.1)
37
+ Requires-Dist: xgboost (>=3.0.2)
38
+ Description-Content-Type: text/markdown
39
+
40
+ <div align="center">
41
+
42
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
43
+
44
+ ## Welcome to LeCrapaud
45
+
46
+ **An all-in-one machine learning framework**
47
+
48
+ [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
49
+ [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
50
+ [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
51
+ [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
52
+ [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
53
+
54
+ </div>
55
+
56
+ ## 🚀 Introduction
57
+
58
+ LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
59
+
60
+ ## ✨ Key Features
61
+
62
+ - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
63
+ - 🤖 Automated model selection and hyperparameter optimization
64
+ - 📊 Easy integration with pandas DataFrames
65
+ - 🔬 Supports both regression and classification tasks
66
+ - 🛠️ Simple API for both full pipeline and step-by-step usage
67
+ - 📦 Ready for production and research workflows
68
+
69
+ ## ⚡ Quick Start
70
+
71
+
72
+ ### Install the package
73
+
74
+ ```sh
75
+ pip install lecrapaud
76
+ ```
77
+
78
+ ### How it works
79
+
80
+ This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
81
+
82
+ ### Typical workflow
83
+
84
+ ```python
85
+ from lecrapaud import LeCrapaud
86
+
87
+ # 1. Create the main app
88
+ app = LeCrapaud(uri=uri)
89
+
90
+ # 2. Define your experiment context (see your notebook or api.py for all options)
91
+ context = {
92
+ "data": your_dataframe,
93
+ "columns_drop": [...],
94
+ "columns_date": [...],
95
+ # ... other config options
96
+ }
97
+
98
+ # 3. Create an experiment
99
+ experiment = app.create_experiment(**context)
100
+
101
+ # 4. Run the full training pipeline
102
+ experiment.train(your_dataframe)
103
+
104
+ # 5. Make predictions on new data
105
+ predictions = experiment.predict(new_data)
106
+ ```
107
+
108
+ ### Database Configuration (Required)
109
+
110
+ LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
111
+
112
+ - Pass a valid MySQL URI to the `LeCrapaud` constructor:
113
+ ```python
114
+ app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
115
+ ```
116
+ - **OR** set the following environment variables before using the package:
117
+ - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
118
+ - Or set `DB_URI` directly with your full connection string.
119
+
120
+ If neither is provided, database operations will not work.
121
+
122
+ ### Using OpenAI Embeddings (Optional)
123
+
124
+ If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
125
+
126
+ ```sh
127
+ export OPENAI_API_KEY=sk-...
128
+ ```
129
+
130
+ If this variable is not set, features relying on OpenAI embeddings will not be available.
131
+
132
+ ### Experiment Context Arguments
133
+
134
+ The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.
135
+
136
+ #### Required Parameters
137
+
138
+ | Parameter | Type | Description | Example |
139
+ |-------------------|-----------|------------------------------------------------------|------------------------|
140
+ | `data` | DataFrame | Input dataset (required for new experiments only) | `pd.DataFrame(...)` |
141
+ | `experiment_name` | str | Unique name for the experiment | `'stock_prediction'` |
142
+ | `date_column` | str | Name of the date column (required for time series) | `'DATE'` |
143
+ | `group_column` | str | Name of the group column (required for panel data) | `'STOCK'` |
144
+
145
+ #### Feature Engineering Parameters
146
+
147
+ | Parameter | Type | Default | Description |
148
+ |-----------------------|-------|---------|--------------------------------------------------------------------------|
149
+ | `columns_drop` | list | `[]` | Columns to drop during feature engineering |
150
+ | `columns_boolean` | list | `[]` | Columns to convert to boolean features |
151
+ | `columns_date` | list | `[]` | Date columns for cyclic encoding |
152
+ | `columns_te_groupby` | list | `[]` | Groupby columns for target encoding |
153
+ | `columns_te_target` | list | `[]` | Target columns for target encoding |
154
+
155
+ #### Preprocessing Parameters
156
+
157
+ | Parameter | Type | Default | Description |
158
+ |-------------------------|-------|---------|-----------------------------------------------------------------------|
159
+ | `time_series` | bool | `False` | Whether data is time series |
160
+ | `val_size` | float | `0.2` | Validation set size (fraction) |
161
+ | `test_size` | float | `0.2` | Test set size (fraction) |
162
+ | `columns_pca` | list | `[]` | Columns for PCA transformation |
163
+ | `pca_temporal` | list | `[]` | Temporal PCA config (e.g., lag features) |
164
+ | `pca_cross_sectional` | list | `[]` | Cross-sectional PCA config (e.g., market regime) |
165
+ | `columns_onehot` | list | `[]` | Columns for one-hot encoding |
166
+ | `columns_binary` | list | `[]` | Columns for binary encoding |
167
+ | `columns_ordinal` | list | `[]` | Columns for ordinal encoding |
168
+ | `columns_frequency` | list | `[]` | Columns for frequency encoding |
169
+
170
+ #### Feature Selection Parameters
171
+
172
+ | Parameter | Type | Default | Description |
173
+ |-----------------------------|-------|---------|------------------------------------------------------------------|
174
+ | `percentile` | float | `20` | Percentage of features to keep per selection method |
175
+ | `corr_threshold` | float | `80` | Maximum correlation threshold (%) between features |
176
+ | `max_features` | int | `50` | Maximum number of final features |
177
+ | `max_p_value_categorical` | float | `0.05` | Maximum p-value for categorical feature selection (Chi2) |
178
+
179
+ #### Model Selection Parameters
180
+
181
+ | Parameter | Type | Default | Description |
182
+ |------------------------|-------|---------|-----------------------------------------------------------------------|
183
+ | `target_numbers` | list | `[]` | List of target indices to predict |
184
+ | `target_clf` | list | `[]` | Classification target indices |
185
+ | `models_idx` | list | `[]` | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
186
+ | `max_timesteps` | int | `120` | Maximum timesteps for recurrent models |
187
+ | `perform_hyperopt` | bool | `True` | Whether to perform hyperparameter optimization |
188
+ | `number_of_trials` | int | `20` | Number of hyperopt trials |
189
+ | `perform_crossval` | bool | `False` | Whether to use cross-validation during hyperopt |
190
+ | `plot` | bool | `True` | Whether to generate plots |
191
+ | `preserve_model` | bool | `True` | Whether to save the best model |
192
+ | `target_clf_thresholds`| dict | `{}` | Classification thresholds per target |
193
+
194
+ #### Example Context Configuration
195
+
196
+ ```python
197
+ context = {
198
+ # Required parameters
199
+ "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
200
+ "date_column": "DATE",
201
+ "group_column": "STOCK",
202
+
203
+ # Feature selection
204
+ "corr_threshold": 80,
205
+ "max_features": 20,
206
+ "percentile": 20,
207
+ "max_p_value_categorical": 0.05,
208
+
209
+ # Feature engineering
210
+ "columns_drop": ["SECURITY", "ISIN", "ID"],
211
+ "columns_boolean": [],
212
+ "columns_date": ["DATE"],
213
+ "columns_te_groupby": [["SECTOR", "DATE"]],
214
+ "columns_te_target": ["RET", "VOLUME"],
215
+
216
+ # Preprocessing
217
+ "time_series": True,
218
+ "val_size": 0.2,
219
+ "test_size": 0.2,
220
+ "pca_temporal": [
221
+ {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
222
+ ],
223
+ "pca_cross_sectional": [
224
+ {
225
+ "name": "MARKET_REGIME",
226
+ "index": "DATE",
227
+ "columns": "STOCK",
228
+ "value": "RET",
229
+ }
230
+ ],
231
+ "columns_onehot": ["BUY_SIGNAL"],
232
+ "columns_binary": ["SECTOR", "LOCATION"],
233
+ "columns_ordinal": ["STOCK"],
234
+
235
+ # Model selection
236
+ "target_numbers": [1, 2, 3],
237
+ "target_clf": [1],
238
+ "models_idx": ["xgb", "lgb", "catboost"],
239
+ "max_timesteps": 120,
240
+ "perform_hyperopt": True,
241
+ "number_of_trials": 50,
242
+ "perform_crossval": True,
243
+ "plot": True,
244
+ "preserve_model": True,
245
+ "target_clf_thresholds": {1: {"precision": 0.80}},
246
+ }
247
+
248
+ # Create experiment
249
+ experiment = app.create_experiment(data=your_dataframe, **context)
250
+ ```
251
+
252
+ #### Important Notes
253
+
254
+ 1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.
255
+
256
+ 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
257
+
258
+ 3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
259
+
260
+ 4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
261
+
262
+ 5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
263
+
264
+
265
+
266
+ ### Modular usage
267
+
268
+ You can also use each step independently:
269
+
270
+ ```python
271
+ data_eng = experiment.feature_engineering(data)
272
+ train, val, test = experiment.preprocess_feature(data_eng)
273
+ features = experiment.feature_selection(train)
274
+ std_data, reshaped_data = experiment.preprocess_model(train, val, test)
275
+ experiment.model_selection(std_data, reshaped_data)
276
+ ```
277
+
278
+ ## ⚠️ Using Alembic in Your Project (Important for Integrators)
279
+
280
+ If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
281
+
282
+ By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
283
+
284
+ ```python
285
+ def include_object(object, name, type_, reflected, compare_to):
286
+ if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
287
+ return False # Ignore LeCrapaud tables
288
+ return True
289
+
290
+ context.configure(
291
+ # ... other options ...
292
+ include_object=include_object,
293
+ )
294
+ ```
295
+
296
+ This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
297
+
298
+ ---
299
+
300
+ ## 🤝 Contributing
301
+
302
+ ### Reminders for Github usage
303
+
304
+ 1. Creating Github repository
305
+
306
+ ```sh
307
+ $ brew install gh
308
+ $ gh auth login
309
+ $ gh repo create
310
+ ```
311
+
312
+ 2. Initializing git and first commit to distant repository
313
+
314
+ ```sh
315
+ $ git init
316
+ $ git add .
317
+ $ git commit -m 'first commit'
318
+ $ git remote add origin <YOUR_REPO_URL>
319
+ $ git push -u origin master
320
+ ```
321
+
322
+ 3. Use conventional commits
323
+ https://www.conventionalcommits.org/en/v1.0.0/#summary
324
+
325
+ 4. Create environment
326
+
327
+ ```sh
328
+ $ pip install virtualenv
329
+ $ python -m venv .venv
330
+ $ source .venv/bin/activate
331
+ ```
332
+
333
+ 5. Install dependencies
334
+
335
+ ```sh
336
+ $ make install
337
+ ```
338
+
339
+ 6. Deactivate virtualenv (if needed)
340
+
341
+ ```sh
342
+ $ deactivate
343
+ ```
344
+
345
+ ---
346
+
347
+ Pierre Gallet © 2025
lecrapaud-0.21.0.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
1
- lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
2
- lecrapaud/api.py,sha256=IQlH3wcSzxYgvlamfICNMwNsQGoaNxBJUPTlC9M0kBk,20321
3
- lecrapaud/config.py,sha256=QK1MxWsEddXii02Rme31tCGDyMFsfHHF2Zy-lLIOuSY,1218
1
+ lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
2
+ lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
3
+ lecrapaud/config.py,sha256=0NEg61QdLxQ97bVFDDXa6OwlWFEo_z8VIhX5KrD1ik0,1170
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
6
6
  lecrapaud/db/alembic/env.py,sha256=RvTTBa3bDVBxmDtapAfzUoeWBgmVQU3s9U6HmQCAP84,2421
@@ -14,7 +14,7 @@ lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_
14
14
  lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py,sha256=g6H2Z9MwB6UEiqdGlBoHBXpO9DTaWkwHt8FS6joVOm0,1191
15
15
  lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py,sha256=FshOF1t-NWXrBtXT3wMNGFslJ4sWUxzvBEXSymu05cI,1043
16
16
  lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py,sha256=htHUD4zPJr-0z_DQfTi8r9RsFVe9m7SL0f7oRIvLIcQ,10999
17
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=o3TNHq1GTFjxfk2zHWaUbq91khMJi6Xy6HToO9i54AU,2051
17
+ lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=0NBvOwPqMXpWnDEGiEBk_IeLKmXQ5ZcU-dqHeSEgsRQ,2557
18
18
  lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
19
19
  lecrapaud/db/models/__init__.py,sha256=-XoCN1eeLihnNxBMl90lXrgrTSDkMbeqgienMqFi5f4,702
20
20
  lecrapaud/db/models/base.py,sha256=0548x4ftd6Oim9BJmtD7Er4izM6u0QCrlTG5560384w,9458
@@ -29,8 +29,8 @@ lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk
29
29
  lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,1109
30
30
  lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
31
31
  lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
32
- lecrapaud/experiment.py,sha256=TYECkPqZNVqQQaSg8u5fZ3UvxKYCzc3f-mYVlikCz4s,2234
33
- lecrapaud/feature_engineering.py,sha256=UM-EIOsgYWedqsR9uA-09eaWSb9FofVxoE0rRcDelQ8,39173
32
+ lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
33
+ lecrapaud/feature_engineering.py,sha256=lfY14RS303_izt3OcnLhTvsPbWUWZY5ES_0HNcbBezc,50017
34
34
  lecrapaud/feature_selection.py,sha256=Q9xWVmZsvRjX9mJHB_PY_KLXsEAYNLX7txSe0cniY4A,47529
35
35
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
36
36
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
@@ -43,8 +43,8 @@ lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8o
43
43
  lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
44
44
  lecrapaud/model_selection.py,sha256=o4_hOEp91_33HtMatVHU7YPc71KZ2hK7wucN63xqWkA,88017
45
45
  lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
46
- lecrapaud/utils.py,sha256=vsNBd2Nnhpjo65Ugz2GFJaRhq3U3_eWERfofpevo5Ls,8884
47
- lecrapaud-0.20.1.dist-info/METADATA,sha256=gCEqDJXok9Ti9DQ32XRqU4cH0blMCrSBAOLPTy9viXE,11137
48
- lecrapaud-0.20.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
- lecrapaud-0.20.1.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
- lecrapaud-0.20.1.dist-info/RECORD,,
46
+ lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
47
+ lecrapaud-0.21.0.dist-info/METADATA,sha256=TziJTM9CXoayu3hwlHqCIiqvWIbvTaZhRv0XbYaLuRQ,14348
48
+ lecrapaud-0.21.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
+ lecrapaud-0.21.0.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
+ lecrapaud-0.21.0.dist-info/RECORD,,
lecrapaud-0.20.1.dist-info/METADATA DELETED
@@ -1,250 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: lecrapaud
3
- Version: 0.20.1
4
- Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
- License: Apache License
6
- License-File: LICENSE
7
- Author: Pierre H. Gallet
8
- Requires-Python: ==3.12.*
9
- Classifier: License :: Other/Proprietary License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.12
12
- Requires-Dist: catboost (>=1.2.8)
13
- Requires-Dist: category-encoders (>=2.8.1)
14
- Requires-Dist: celery (>=5.5.3)
15
- Requires-Dist: ftfy (>=6.3.1)
16
- Requires-Dist: joblib (>=1.5.1)
17
- Requires-Dist: keras (>=3.10.0)
18
- Requires-Dist: lightgbm (>=4.6.0)
19
- Requires-Dist: matplotlib (>=3.10.3)
20
- Requires-Dist: mlxtend (>=0.23.4)
21
- Requires-Dist: numpy (>=2.1.3)
22
- Requires-Dist: openai (>=1.88.0)
23
- Requires-Dist: pandas (>=2.3.0)
24
- Requires-Dist: pydantic (>=2.9.2)
25
- Requires-Dist: python-dotenv (>=1.1.0)
26
- Requires-Dist: scikit-learn (>=1.6.1)
27
- Requires-Dist: scipy (<1.14.0)
28
- Requires-Dist: seaborn (>=0.13.2)
29
- Requires-Dist: sqlalchemy (>=2.0.41)
30
- Requires-Dist: tensorboardx (>=2.6.4)
31
- Requires-Dist: tensorflow (>=2.19.0)
32
- Requires-Dist: tiktoken (>=0.9.0)
33
- Requires-Dist: tqdm (>=4.67.1)
34
- Requires-Dist: xgboost (>=3.0.2)
35
- Description-Content-Type: text/markdown
36
-
37
- <div align="center">
38
-
39
- <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
40
-
41
- ## Welcome to LeCrapaud
42
-
43
- **An all-in-one machine learning framework**
44
-
45
- [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
46
- [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
47
- [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
48
- [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
49
- [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
50
-
51
- </div>
52
-
53
- ## 🚀 Introduction
54
-
55
- LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
56
-
57
- ## ✨ Key Features
58
-
59
- - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
60
- - 🤖 Automated model selection and hyperparameter optimization
61
- - 📊 Easy integration with pandas DataFrames
62
- - 🔬 Supports both regression and classification tasks
63
- - 🛠️ Simple API for both full pipeline and step-by-step usage
64
- - 📦 Ready for production and research workflows
65
-
66
- ## ⚡ Quick Start
67
-
68
-
69
- ### Install the package
70
-
71
- ```sh
72
- pip install lecrapaud
73
- ```
74
-
75
- ### How it works
76
-
77
- This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
78
-
79
- ### Typical workflow
80
-
81
- ```python
82
- from lecrapaud import LeCrapaud
83
-
84
- # 1. Create the main app
85
- app = LeCrapaud(uri=uri)
86
-
87
- # 2. Define your experiment context (see your notebook or api.py for all options)
88
- context = {
89
- "data": your_dataframe,
90
- "columns_drop": [...],
91
- "columns_date": [...],
92
- # ... other config options
93
- }
94
-
95
- # 3. Create an experiment
96
- experiment = app.create_experiment(**context)
97
-
98
- # 4. Run the full training pipeline
99
- experiment.train(your_dataframe)
100
-
101
- # 5. Make predictions on new data
102
- predictions = experiment.predict(new_data)
103
- ```
104
-
105
- ### Database Configuration (Required)
106
-
107
- LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
108
-
109
- - Pass a valid MySQL URI to the `LeCrapaud` constructor:
110
- ```python
111
- app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
112
- ```
113
- - **OR** set the following environment variables before using the package:
114
- - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
115
- - Or set `DB_URI` directly with your full connection string.
116
-
117
- If neither is provided, database operations will not work.
118
-
119
- ### Using OpenAI Embeddings (Optional)
120
-
121
- If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
122
-
123
- ```sh
124
- export OPENAI_API_KEY=sk-...
125
- ```
126
-
127
- If this variable is not set, features relying on OpenAI embeddings will not be available.
128
-
129
- ### Experiment Context Arguments
130
-
131
- Below are the main arguments you can pass to `create_experiment` (or the `Experiment` class):
132
-
133
- | Argument | Type | Description | Example/Default |
134
- | -------------------- | --------- | ---------------------------------------------------------------------------------------- | ------------------ |
135
- | `columns_binary` | list | Columns to treat as binary | `['flag']` |
136
- | `columns_boolean` | list | Columns to treat as boolean | `['is_active']` |
137
- | `columns_date` | list | Columns to treat as dates | `['date']` |
138
- | `columns_drop` | list | Columns to drop during feature engineering | `['col1', 'col2']` |
139
- | `columns_frequency` | list | Columns to frequency encode | `['category']` |
140
- | `columns_onehot` | list | Columns to one-hot encode | `['sector']` |
141
- | `columns_ordinal` | list | Columns to ordinal encode | `['grade']` |
142
- | `columns_pca` | list | Columns to use for PCA/embeddings (requires `OPENAI_API_KEY` if using OpenAI embeddings) | `['text_col']` |
143
- | `columns_te_groupby` | list | Columns for target encoding groupby | `['sector']` |
144
- | `columns_te_target` | list | Columns for target encoding target | `['target']` |
145
- | `data` | DataFrame | Your main dataset (required for new experiment) | `your_dataframe` |
146
- | `date_column` | str | Name of the date column | `'date'` |
147
- | `experiment_name` | str | Name for the training session | `'my_session'` |
148
- | `group_column` | str | Name of the group column | `'stock_id'` |
149
- | `max_timesteps` | int | Max timesteps for time series models | `30` |
150
- | `models_idx` | list | Indices of models to use for model selection | `[0, 1, 2]` |
151
- | `number_of_trials` | int | Number of trials for hyperparameter optimization | `20` |
152
- | `perform_crossval` | bool | Whether to perform cross-validation | `True`/`False` |
153
- | `perform_hyperopt` | bool | Whether to perform hyperparameter optimization | `True`/`False` |
154
- | `plot` | bool | Whether to plot results | `True`/`False` |
155
- | `preserve_model` | bool | Whether to preserve the best model | `True`/`False` |
156
- | `target_clf` | list | List of classification target column indices/names | `[1, 2, 3]` |
157
- | `target_mclf` | list | Multi-class classification targets (not yet implemented) | `[11]` |
158
- | `target_numbers` | list | List of regression target column indices/names | `[1, 2, 3]` |
159
- | `test_size` | int/float | Test set size (count or fraction) | `0.2` |
160
- | `time_series` | bool | Whether the data is time series | `True`/`False` |
161
- | `val_size` | int/float | Validation set size (count or fraction) | `0.2` |
162
-
163
- **Note:**
164
- - Not all arguments are required; defaults may exist for some.
165
- - For `columns_pca` with OpenAI embeddings, you must set the `OPENAI_API_KEY` environment variable.
166
-
167
-
168
-
169
- ### Modular usage
170
-
171
- You can also use each step independently:
172
-
173
- ```python
174
- data_eng = experiment.feature_engineering(data)
175
- train, val, test = experiment.preprocess_feature(data_eng)
176
- features = experiment.feature_selection(train)
177
- std_data, reshaped_data = experiment.preprocess_model(train, val, test)
178
- experiment.model_selection(std_data, reshaped_data)
179
- ```
180
-
181
- ## ⚠️ Using Alembic in Your Project (Important for Integrators)
182
-
183
- If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
184
-
185
- By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
186
-
187
- ```python
188
- def include_object(object, name, type_, reflected, compare_to):
189
- if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
190
- return False # Ignore LeCrapaud tables
191
- return True
192
-
193
- context.configure(
194
- # ... other options ...
195
- include_object=include_object,
196
- )
197
- ```
198
-
199
- This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
200
-
201
- ---
202
-
203
- ## 🤝 Contributing
204
-
205
- ### Reminders for Github usage
206
-
207
- 1. Creating Github repository
208
-
209
- ```sh
210
- $ brew install gh
211
- $ gh auth login
212
- $ gh repo create
213
- ```
214
-
215
- 2. Initializing git and first commit to distant repository
216
-
217
- ```sh
218
- $ git init
219
- $ git add .
220
- $ git commit -m 'first commit'
221
- $ git remote add origin <YOUR_REPO_URL>
222
- $ git push -u origin master
223
- ```
224
-
225
- 3. Use conventional commits
226
- https://www.conventionalcommits.org/en/v1.0.0/#summary
227
-
228
- 4. Create environment
229
-
230
- ```sh
231
- $ pip install virtualenv
232
- $ python -m venv .venv
233
- $ source .venv/bin/activate
234
- ```
235
-
236
- 5. Install dependencies
237
-
238
- ```sh
239
- $ make install
240
- ```
241
-
242
- 6. Deactivate virtualenv (if needed)
243
-
244
- ```sh
245
- $ deactivate
246
- ```
247
-
248
- ---
249
-
250
- Pierre Gallet © 2025