lecrapaud 0.20.0__py3-none-any.whl → 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
 
 
 class FeatureSelectionEngine:
-    def __init__(self, train, experiment, target_number, target_clf, **kwargs):
+    def __init__(self, train, experiment, target_number, **kwargs):
         self.experiment = experiment
         self.train = train
         self.target_number = target_number
-        self.target_clf = target_clf
+
+        # Get all parameters from experiment context
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
+        self.percentile = self.experiment.context.get("percentile", 20)
+        self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
+        self.max_features = self.experiment.context.get("max_features", 50)
 
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.percentile = self.experiment.percentile
-        self.corr_threshold = self.experiment.corr_threshold
-        self.max_features = self.experiment.max_features
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
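This hunk establishes the pattern that repeats throughout the release: constructor parameters move into `experiment.context` and are read back with dict-style defaults. A minimal self-contained sketch of that pattern (`FakeExperiment` is a stand-in invented here; the real `Experiment` in lecrapaud is database-backed):

```python
# Sketch of the context-lookup pattern; FakeExperiment is hypothetical.
class FakeExperiment:
    def __init__(self, **context):
        self.context = context  # every pipeline parameter lives in this dict

experiment = FakeExperiment(target_clf=[1], percentile=10)

# Engines read parameters with a default instead of taking them as arguments:
target_clf = experiment.context.get("target_clf", [])       # -> [1] (stored value)
max_features = experiment.context.get("max_features", 50)   # -> 50 (falls back to default)
```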
@@ -274,6 +277,38 @@ class FeatureSelectionEngine:
         features_selected.drop_duplicates("features", inplace=True)
 
         features_selected_list = features_selected["features"].values.tolist()
+
+        # Save ensemble features before correlation (aggregated features)
+        logger.info("Saving ensemble features before correlation...")
+        all_features_in_data = self.X.columns.tolist()
+        ensemble_rows = []
+
+        # Add global rank for selected features
+        features_selected_with_global_rank = features_selected.copy()
+        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+
+        for feature in all_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+                global_rank = None
+                if is_selected:
+                    global_rank = features_selected_with_global_rank[
+                        features_selected_with_global_rank["features"] == feature
+                    ]["global_rank"].values[0]
+
+                ensemble_rows.append({
+                    "feature_selection_id": feature_selection.id,
+                    "feature_id": feature_id,
+                    "method": "ensemble",
+                    "score": None,
+                    "pvalue": None,
+                    "support": 2 if is_selected else 0,  # 2 = in aggregated features
+                    "rank": global_rank,
+                    "training_time": 0,
+                })
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
 
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
@@ -302,12 +337,46 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]
 
         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features with categorical features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Add categorical features to ensemble if not already present
+        if target_type == "classification":
+            for cat_feature in categorical_features_selected:
+                feature_id = feature_map.get(cat_feature)
+                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
+                    ensemble_rows.append({
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": 2,  # 2 = in final selection (categorical)
+                        "rank": None,  # No rank for categorical features added at the end
+                        "training_time": 0,
+                    })
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
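Read together with the previous hunk, the `support` column for `method="ensemble"` rows appears to encode a selection lifecycle (constant names below are illustrative, not from the package):

```python
# Inferred from the inline comments in this release; not an official API.
SUPPORT_NOT_SELECTED = 0      # never entered the aggregated feature list
SUPPORT_POST_CORRELATION = 1  # survived correlation removal but was cut by max_features
SUPPORT_FINAL = 2             # written first for "in aggregated features", reused for "in final selection"
```

Note that the first pass writes `2` for every aggregated feature and the correlation pass only downgrades rows whose feature survives, so rows for features removed by correlation appear to keep `support == 2`.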
@@ -440,13 +509,18 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time
 
         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )
 
         feat_scores.to_csv(
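The new Chi2 filter combines a percentile cut with a p-value cut. The same combined condition can be reproduced standalone with scikit-learn's `SelectPercentile` and `chi2` (a sketch with synthetic data; `X`, `y`, and the thresholds are placeholders, not values from the package):

```python
import numpy as np
from sklearn.feature_selection import SelectPercentile, chi2

# Assumed inputs: non-negative feature matrix X and class labels y.
X = np.abs(np.random.randn(200, 30))
y = np.random.randint(0, 2, size=200)

selector = SelectPercentile(chi2, percentile=20).fit(X, y)

# Keep a feature only if it is BOTH in the top percentile AND
# individually significant at the chosen p-value threshold.
max_p_value = 0.05
support = selector.get_support() & (selector.pvalues_ <= max_p_value)
print(f"{support.sum()} features satisfy both conditions")
```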
@@ -803,33 +877,28 @@ class PreprocessModel:
         val,
         test,
         experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
         **kwargs,
     ):
         self.train = train
         self.val = val
         self.test = test
         self.experiment = experiment
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.max_timesteps = max_timesteps
-        self.group_column = group_column
-        self.date_column = date_column
+
+        # Get all parameters from experiment context
+        self.target_numbers = self.experiment.context.get("target_numbers", [])
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.models_idx = self.experiment.context.get("models_idx", [])
+        self.time_series = self.experiment.context.get("time_series", False)
+        self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
+        self.group_column = self.experiment.context.get("group_column", None)
+        self.date_column = self.experiment.context.get("date_column", None)
 
         self.experiment_dir = experiment.path
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
 
         self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
+            date_column=self.date_column, group_column=self.group_column
         )
 
     def run(self):
@@ -1017,24 +1017,24 @@ class ModelSelectionEngine:
         data,
         reshaped_data,
         target_number,
-        target_clf,
         experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
         **kwargs,
     ):
         self.data = data
         self.reshaped_data = reshaped_data
         self.target_number = target_number
         self.experiment = experiment
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
+
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.target_clf = context.get("target_clf", [])
+        self.models_idx = context.get("models_idx", [])
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+
+        # Handle target_clf_thresholds
+        target_clf_thresholds = context.get("target_clf_thresholds", {})
         self.target_clf_thresholds = (
             target_clf_thresholds[target_number]
             if target_number in target_clf_thresholds.keys()
@@ -1056,25 +1056,19 @@ class ModelSelectionEngine:
         )
 
     # Main training function
-    def run(
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,  # This controls CV during hyperopt, not after
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def run(self, best_params=None):
         """
         Selects the best models based on a target variable, optionally performing hyperparameter optimization
         and cross-validation, and manages outputs in a session-specific directory.
         """
-        self.experiment_name = experiment_name
-        self.plot = plot
-        self.number_of_trials = number_of_trials
-        self.perform_crossval = perform_crossval
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
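For callers, the migration implied by this hunk looks like the following (a sketch; `engine` stands for a constructed `ModelSelectionEngine`, and the 0.20.0 keyword values shown are just the old defaults):

```python
# 0.20.0 — tuning options were arguments to run():
engine.run(
    "my_experiment",          # experiment_name
    perform_hyperopt=True,
    number_of_trials=20,
    perform_crossval=False,
    plot=True,
    preserve_model=True,
)

# 0.20.2 — the same options are read from experiment.context, set when the
# experiment was created; only the warm-start parameter remains:
engine.run(best_params=None)
```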
@@ -1141,13 +1135,13 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
             logger.info(
-                f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
             # Getting data
@@ -1204,7 +1198,7 @@
 
             # Tuning hyperparameters
             start = time.time()
-            if perform_hyperopt:
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )
lecrapaud/utils.py CHANGED
@@ -11,7 +11,7 @@ import re
 import string
 
 from lecrapaud.directories import logger_dir
-from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV, LECRAPAUD_LOCAL
+from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
 
 
 _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
@@ -237,7 +237,7 @@ def serialize_for_json(obj):
     import numpy as np
    from datetime import datetime, date
     import pandas as pd
-
+
     # Handle NumPy types
     if isinstance(obj, (np.integer, np.int64, np.int32, np.int16)):
         return int(obj)
@@ -247,11 +247,11 @@ def serialize_for_json(obj):
         return obj.tolist()
     elif isinstance(obj, np.bool_):
         return bool(obj)
-
+
     # Handle datetime types
     elif isinstance(obj, (datetime, date, pd.Timestamp)):
         return obj.isoformat()
-
+
     # Handle basic Python types
     elif isinstance(obj, (str, int, float, bool, type(None))):
         return obj
lecrapaud-0.20.2.dist-info/METADATA ADDED
@@ -0,0 +1,344 @@
Metadata-Version: 2.4
Name: lecrapaud
Version: 0.20.2
Summary: Framework for machine and deep learning, with regression, classification and time series analysis
License: Apache License
License-File: LICENSE
Author: Pierre H. Gallet
Requires-Python: ==3.12.*
Classifier: License :: Other/Proprietary License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Requires-Dist: catboost (>=1.2.8)
Requires-Dist: category-encoders (>=2.8.1)
Requires-Dist: celery (>=5.5.3)
Requires-Dist: ftfy (>=6.3.1)
Requires-Dist: joblib (>=1.5.1)
Requires-Dist: keras (>=3.10.0)
Requires-Dist: lightgbm (>=4.6.0)
Requires-Dist: matplotlib (>=3.10.3)
Requires-Dist: mlxtend (>=0.23.4)
Requires-Dist: numpy (>=2.1.3)
Requires-Dist: openai (>=1.88.0)
Requires-Dist: pandas (>=2.3.0)
Requires-Dist: pydantic (>=2.9.2)
Requires-Dist: python-dotenv (>=1.1.0)
Requires-Dist: scikit-learn (>=1.6.1)
Requires-Dist: scipy (<1.14.0)
Requires-Dist: seaborn (>=0.13.2)
Requires-Dist: sqlalchemy (>=2.0.41)
Requires-Dist: tensorboardx (>=2.6.4)
Requires-Dist: tensorflow (>=2.19.0)
Requires-Dist: tiktoken (>=0.9.0)
Requires-Dist: tqdm (>=4.67.1)
Requires-Dist: xgboost (>=3.0.2)
Description-Content-Type: text/markdown

<div align="center">

<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>

## Welcome to LeCrapaud

**An all-in-one machine learning framework**

[![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
[![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
[![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
[![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
[![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)

</div>

## 🚀 Introduction

LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.

## ✨ Key Features

- 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
- 🤖 Automated model selection and hyperparameter optimization
- 📊 Easy integration with pandas DataFrames
- 🔬 Supports both regression and classification tasks
- 🛠️ Simple API for both full pipeline and step-by-step usage
- 📦 Ready for production and research workflows

## ⚡ Quick Start

### Install the package

```sh
pip install lecrapaud
```

### How it works

This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).

### Typical workflow

```python
from lecrapaud import LeCrapaud

# 1. Create the main app
app = LeCrapaud(uri=uri)

# 2. Define your experiment context (see the parameter tables below for all options)
context = {
    "data": your_dataframe,
    "columns_drop": [...],
    "columns_date": [...],
    # ... other config options
}

# 3. Create an experiment
experiment = app.create_experiment(**context)

# 4. Run the full training pipeline
experiment.train(your_dataframe)

# 5. Make predictions on new data
predictions = experiment.predict(new_data)
```

### Database Configuration (Required)

LeCrapaud requires access to a MySQL database to store experiments and results. You must either:

- Pass a valid MySQL URI to the `LeCrapaud` constructor:

  ```python
  app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
  ```

- **OR** set the following environment variables before using the package:
  - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
  - Or set `DB_URI` directly with your full connection string.

If neither is provided, database operations will not work.
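A minimal sketch of the environment-variable route, using the variable names listed above (credential values here are placeholders):

```sh
export DB_USER=lecrapaud
export DB_PASSWORD=changeme
export DB_HOST=127.0.0.1
export DB_PORT=3306
export DB_NAME=lecrapaud
# ...or provide the full connection string in a single variable:
export DB_URI="mysql+pymysql://lecrapaud:changeme@127.0.0.1:3306/lecrapaud"
```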

### Using OpenAI Embeddings (Optional)

If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:

```sh
export OPENAI_API_KEY=sk-...
```

If this variable is not set, features relying on OpenAI embeddings will not be available.
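For illustration only, a context that opts into embeddings might look like this (the `DESCRIPTION` column name is a made-up placeholder; `columns_pca` is documented in the preprocessing table below):

```python
import os

# columns_pca relies on OpenAI embeddings, so the key must be present.
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY is required for columns_pca"

context = {
    "experiment_name": "demo_with_embeddings",
    "columns_pca": ["DESCRIPTION"],  # free-text column to embed and reduce with PCA
    # ... other config options
}
```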

### Experiment Context Arguments

The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.

#### Required Parameters

| Parameter         | Type      | Description                                        | Example              |
|-------------------|-----------|----------------------------------------------------|----------------------|
| `data`            | DataFrame | Input dataset (required for new experiments only)  | `pd.DataFrame(...)`  |
| `experiment_name` | str       | Unique name for the experiment                     | `'stock_prediction'` |
| `date_column`     | str       | Name of the date column (required for time series) | `'DATE'`             |
| `group_column`    | str       | Name of the group column (required for panel data) | `'STOCK'`            |

#### Feature Engineering Parameters

| Parameter            | Type | Default | Description                                |
|----------------------|------|---------|--------------------------------------------|
| `columns_drop`       | list | `[]`    | Columns to drop during feature engineering |
| `columns_boolean`    | list | `[]`    | Columns to convert to boolean features     |
| `columns_date`       | list | `[]`    | Date columns for cyclic encoding           |
| `columns_te_groupby` | list | `[]`    | Groupby columns for target encoding        |
| `columns_te_target`  | list | `[]`    | Target columns for target encoding         |

#### Preprocessing Parameters

| Parameter             | Type  | Default | Description                                      |
|-----------------------|-------|---------|--------------------------------------------------|
| `time_series`         | bool  | `False` | Whether data is time series                      |
| `val_size`            | float | `0.2`   | Validation set size (fraction)                   |
| `test_size`           | float | `0.2`   | Test set size (fraction)                         |
| `columns_pca`         | list  | `[]`    | Columns for PCA transformation                   |
| `pca_temporal`        | list  | `[]`    | Temporal PCA config (e.g., lag features)         |
| `pca_cross_sectional` | list  | `[]`    | Cross-sectional PCA config (e.g., market regime) |
| `columns_onehot`      | list  | `[]`    | Columns for one-hot encoding                     |
| `columns_binary`      | list  | `[]`    | Columns for binary encoding                      |
| `columns_ordinal`     | list  | `[]`    | Columns for ordinal encoding                     |
| `columns_frequency`   | list  | `[]`    | Columns for frequency encoding                   |

#### Feature Selection Parameters

| Parameter                 | Type  | Default | Description                                              |
|---------------------------|-------|---------|----------------------------------------------------------|
| `percentile`              | float | `20`    | Percentage of features to keep per selection method      |
| `corr_threshold`          | float | `80`    | Maximum correlation threshold (%) between features       |
| `max_features`            | int   | `50`    | Maximum number of final features                         |
| `max_p_value_categorical` | float | `0.05`  | Maximum p-value for categorical feature selection (Chi2) |

#### Model Selection Parameters

| Parameter               | Type | Default | Description                                               |
|-------------------------|------|---------|-----------------------------------------------------------|
| `target_numbers`        | list | `[]`    | List of target indices to predict                         |
| `target_clf`            | list | `[]`    | Classification target indices                             |
| `models_idx`            | list | `[]`    | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
| `max_timesteps`         | int  | `120`   | Maximum timesteps for recurrent models                    |
| `perform_hyperopt`      | bool | `True`  | Whether to perform hyperparameter optimization            |
| `number_of_trials`      | int  | `20`    | Number of hyperopt trials                                 |
| `perform_crossval`      | bool | `False` | Whether to use cross-validation during hyperopt           |
| `plot`                  | bool | `True`  | Whether to generate plots                                 |
| `preserve_model`        | bool | `True`  | Whether to save the best model                            |
| `target_clf_thresholds` | dict | `{}`    | Classification thresholds per target                      |

#### Example Context Configuration

```python
from datetime import datetime

context = {
    # Required parameters
    "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    "date_column": "DATE",
    "group_column": "STOCK",

    # Feature selection
    "corr_threshold": 80,
    "max_features": 20,
    "percentile": 20,
    "max_p_value_categorical": 0.05,

    # Feature engineering
    "columns_drop": ["SECURITY", "ISIN", "ID"],
    "columns_boolean": [],
    "columns_date": ["DATE"],
    "columns_te_groupby": [["SECTOR", "DATE"]],
    "columns_te_target": ["RET", "VOLUME"],

    # Preprocessing
    "time_series": True,
    "val_size": 0.2,
    "test_size": 0.2,
    "pca_temporal": [
        {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
    ],
    "pca_cross_sectional": [
        {
            "name": "MARKET_REGIME",
            "index": "DATE",
            "columns": "STOCK",
            "value": "RET",
        }
    ],
    "columns_onehot": ["BUY_SIGNAL"],
    "columns_binary": ["SECTOR", "LOCATION"],
    "columns_ordinal": ["STOCK"],

    # Model selection
    "target_numbers": [1, 2, 3],
    "target_clf": [1],
    "models_idx": ["xgb", "lgb", "catboost"],
    "max_timesteps": 120,
    "perform_hyperopt": True,
    "number_of_trials": 50,
    "perform_crossval": True,
    "plot": True,
    "preserve_model": True,
    "target_clf_thresholds": {1: {"precision": 0.80}},
}

# Create experiment
experiment = app.create_experiment(data=your_dataframe, **context)
```

#### Important Notes

1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.

2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor (see the sketch after this list).

3. **PCA Time Series**: For time series data with `pca_cross_sectional` where the index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.

4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.

5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
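To make notes 1 and 2 concrete: the engines in 0.20.2 read parameters back via `experiment.context.get(key, default)` (see the diff at the top of this page). A minimal sketch, reusing only calls shown in this README and that diff:

```python
# Context parameters are persisted with the experiment at creation time...
context = {"experiment_name": "stock_prediction", "number_of_trials": 50}
experiment = app.create_experiment(data=your_dataframe, **context)

# ...and later resolved by the engines with a default fallback;
# the stored value wins over the default:
trials = experiment.context.get("number_of_trials", 20)  # -> 50
```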

### Modular usage

You can also use each step independently:

```python
data_eng = experiment.feature_engineering(data)
train, val, test = experiment.preprocess_feature(data_eng)
features = experiment.feature_selection(train)
std_data, reshaped_data = experiment.preprocess_model(train, val, test)
experiment.model_selection(std_data, reshaped_data)
```

## ⚠️ Using Alembic in Your Project (Important for Integrators)

If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).

By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:

```python
def include_object(object, name, type_, reflected, compare_to):
    if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
        return False  # Ignore LeCrapaud tables
    return True

context.configure(
    # ... other options ...
    include_object=include_object,
)
```

This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
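If your `env.py` cannot import the prefix constant, the same filter can be written against a literal (assuming, purely for illustration, that your deployment's resolved prefix is `lecrapaud`):

```python
def include_object(object, name, type_, reflected, compare_to):
    # Assumption for illustration: the resolved table prefix is "lecrapaud",
    # i.e. LeCrapaud's tables are named "lecrapaud_<something>".
    if type_ == "table" and name.startswith("lecrapaud_"):
        return False
    return True
```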

---

## 🤝 Contributing

### Reminders for GitHub usage

1. Creating the GitHub repository

   ```sh
   $ brew install gh
   $ gh auth login
   $ gh repo create
   ```

2. Initializing git and pushing a first commit to the remote repository

   ```sh
   $ git init
   $ git add .
   $ git commit -m 'first commit'
   $ git remote add origin <YOUR_REPO_URL>
   $ git push -u origin master
   ```

3. Use conventional commits: https://www.conventionalcommits.org/en/v1.0.0/#summary

4. Create a virtual environment

   ```sh
   $ pip install virtualenv
   $ python -m venv .venv
   $ source .venv/bin/activate
   ```

5. Install dependencies

   ```sh
   $ make install
   ```

6. Deactivate the virtualenv (if needed)

   ```sh
   $ deactivate
   ```

---

Pierre Gallet © 2025