lecrapaud 0.20.0__py3-none-any.whl → 0.20.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/api.py +11 -49
- lecrapaud/config.py +3 -2
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +42 -0
- lecrapaud/db/models/experiment.py +48 -75
- lecrapaud/experiment.py +13 -15
- lecrapaud/feature_engineering.py +28 -40
- lecrapaud/feature_selection.py +90 -21
- lecrapaud/model_selection.py +24 -30
- lecrapaud/utils.py +4 -4
- lecrapaud-0.20.2.dist-info/METADATA +344 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/RECORD +13 -12
- lecrapaud-0.20.0.dist-info/METADATA +0 -250
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/WHEEL +0 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/licenses/LICENSE +0 -0
lecrapaud/feature_selection.py
CHANGED
```diff
@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
 
 
 class FeatureSelectionEngine:
-    def __init__(self, train, experiment, target_number,
+    def __init__(self, train, experiment, target_number, **kwargs):
         self.experiment = experiment
         self.train = train
         self.target_number = target_number
-
+
+        # Get all parameters from experiment context
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
+        self.percentile = self.experiment.context.get("percentile", 20)
+        self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
+        self.max_features = self.experiment.context.get("max_features", 50)
 
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.percentile = self.experiment.percentile
-        self.corr_threshold = self.experiment.corr_threshold
-        self.max_features = self.experiment.max_features
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
```
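This constructor change is the pattern repeated throughout this release: per-engine keyword arguments are replaced by lookups in the experiment's stored context, with defaults. A minimal stand-in sketch of the mechanism (our toy classes, not the package's):

```python
# Minimal sketch of the refactor: parameters move from constructor arguments
# into a context dict stored on the experiment (stand-in classes, not the real ones).
class Experiment:
    def __init__(self, **context):
        self.context = context  # persisted to the DB in the real package


class FeatureSelectionEngine:
    def __init__(self, train, experiment, target_number, **kwargs):
        self.experiment = experiment
        # Missing keys fall back to defaults instead of raising TypeError
        self.percentile = experiment.context.get("percentile", 20)
        self.corr_threshold = experiment.context.get("corr_threshold", 80)
        self.max_features = experiment.context.get("max_features", 50)


engine = FeatureSelectionEngine(train=None, experiment=Experiment(percentile=10), target_number=1)
assert engine.percentile == 10 and engine.max_features == 50
```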
```diff
@@ -274,6 +277,38 @@ class FeatureSelectionEngine:
         features_selected.drop_duplicates("features", inplace=True)
 
         features_selected_list = features_selected["features"].values.tolist()
+
+        # Save ensemble features before correlation (aggregated features)
+        logger.info("Saving ensemble features before correlation...")
+        all_features_in_data = self.X.columns.tolist()
+        ensemble_rows = []
+
+        # Add global rank for selected features
+        features_selected_with_global_rank = features_selected.copy()
+        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+
+        for feature in all_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+                global_rank = None
+                if is_selected:
+                    global_rank = features_selected_with_global_rank[
+                        features_selected_with_global_rank["features"] == feature
+                    ]["global_rank"].values[0]
+
+                ensemble_rows.append({
+                    "feature_selection_id": feature_selection.id,
+                    "feature_id": feature_id,
+                    "method": "ensemble",
+                    "score": None,
+                    "pvalue": None,
+                    "support": 2 if is_selected else 0,  # 2 = in aggregated features
+                    "rank": global_rank,
+                    "training_time": 0,
+                })
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
 
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
```
```diff
@@ -302,12 +337,46 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]
 
         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features with categorical features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Add categorical features to ensemble if not already present
+        if target_type == "classification":
+            for cat_feature in categorical_features_selected:
+                feature_id = feature_map.get(cat_feature)
+                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
+                    ensemble_rows.append({
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": 2,  # 2 = in final selection (categorical)
+                        "rank": None,  # No rank for categorical features added at the end
+                        "training_time": 0,
+                    })
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
```
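Read together, the inline comments in the last two hunks give the `support` column a three-state encoding that is updated pass by pass. Summarized for reference (the constant names are ours, not the package's; values and meanings come from the diff's comments):

```python
# Reference summary of the support codes written by the ensemble passes above.
SUPPORT_NOT_SELECTED = 0   # scored, but not in the aggregated selection
SUPPORT_CORR_SURVIVOR = 1  # survived correlation removal, then cut by max_features
SUPPORT_FINAL = 2          # first pass: in aggregated features;
                           # final pass: in the final selection (incl. categorical)
```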
```diff
@@ -440,13 +509,18 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time
 
         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )
 
         feat_scores.to_csv(
```
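The new filter combines the existing percentile mask with a p-value cutoff. A self-contained sketch of the same logic using scikit-learn's `SelectPercentile` and `chi2` on example data (our setup, mirroring the hunk above rather than LeCrapaud's exact code):

```python
# Standalone sketch of percentile + p-value Chi2 filtering (assumed example data).
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2

X, y = load_iris(return_X_y=True)  # chi2 requires non-negative features

selector = SelectPercentile(chi2, percentile=50).fit(X, y)
scores, pvalues = chi2(X, y)

feat_scores = pd.DataFrame({
    "score": scores,
    "pvalue": pvalues,
    "support": selector.get_support(),  # True if within the top percentile
})
# Keep features that satisfy BOTH conditions, as in the diff
feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= 0.05)
print(feat_scores["support"].sum(), "features kept")
```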
```diff
@@ -803,33 +877,28 @@ class PreprocessModel:
         val,
         test,
         experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
         **kwargs,
     ):
         self.train = train
         self.val = val
         self.test = test
         self.experiment = experiment
-
-
-        self.
-        self.
-        self.
-        self.
-        self.
+
+        # Get all parameters from experiment context
+        self.target_numbers = self.experiment.context.get("target_numbers", [])
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.models_idx = self.experiment.context.get("models_idx", [])
+        self.time_series = self.experiment.context.get("time_series", False)
+        self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
+        self.group_column = self.experiment.context.get("group_column", None)
+        self.date_column = self.experiment.context.get("date_column", None)
 
         self.experiment_dir = experiment.path
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
 
         self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
+            date_column=self.date_column, group_column=self.group_column
         )
 
     def run(self):
```
lecrapaud/model_selection.py
CHANGED
```diff
@@ -1017,24 +1017,24 @@ class ModelSelectionEngine:
         data,
         reshaped_data,
         target_number,
-        target_clf,
         experiment,
-        models_idx,
-        time_series,
-        date_column,
-        group_column,
-        target_clf_thresholds,
         **kwargs,
     ):
         self.data = data
         self.reshaped_data = reshaped_data
         self.target_number = target_number
         self.experiment = experiment
-
-
-
-        self.
-        self.
+
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.target_clf = context.get("target_clf", [])
+        self.models_idx = context.get("models_idx", [])
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+
+        # Handle target_clf_thresholds
+        target_clf_thresholds = context.get("target_clf_thresholds", {})
         self.target_clf_thresholds = (
             target_clf_thresholds[target_number]
             if target_number in target_clf_thresholds.keys()
```
```diff
@@ -1056,25 +1056,19 @@ class ModelSelectionEngine:
         )
 
     # Main training function
-    def run(
-        self,
-        experiment_name,
-        perform_hyperopt=True,
-        number_of_trials=20,
-        perform_crossval=False,  # This controls CV during hyperopt, not after
-        plot=True,
-        clean_dir=False,  # TODO: This has been unused because now feature_selection is in the target directory
-        preserve_model=True,
-        best_params=None,
-    ):
+    def run(self, best_params=None):
         """
         Selects the best models based on a target variable, optionally performing hyperparameter optimization
         and cross-validation, and manages outputs in a session-specific directory.
         """
-
-
-        self.
-        self.
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.experiment_name = context.get("experiment_name", "")
+        self.plot = context.get("plot", True)
+        self.number_of_trials = context.get("number_of_trials", 20)
+        self.perform_crossval = context.get("perform_crossval", False)
+        self.preserve_model = context.get("preserve_model", True)
+        self.perform_hyperopt = context.get("perform_hyperopt", True)
 
         if self.experiment_id is None:
             raise ValueError("Please provide a experiment.")
```
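With this change, options such as `number_of_trials` or `perform_hyperopt` can no longer be passed to `run()`; they must already be present in the experiment context. A stand-in sketch of the new contract (our toy classes, not the real ones):

```python
# Stand-in sketch of the new run() contract: options that were previously
# run() keyword arguments are now read from the experiment context.
class Experiment:
    def __init__(self, **context):
        self.context = context


class ModelSelectionEngine:
    def __init__(self, experiment, **kwargs):
        self.experiment = experiment

    def run(self, best_params=None):
        context = self.experiment.context
        self.experiment_name = context.get("experiment_name", "")
        self.number_of_trials = context.get("number_of_trials", 20)
        self.perform_hyperopt = context.get("perform_hyperopt", True)
        return f"{self.experiment_name}: hyperopt={self.perform_hyperopt}, {self.number_of_trials} trials"


engine = ModelSelectionEngine(Experiment(experiment_name="stock_prediction", number_of_trials=50))
print(engine.run())  # stock_prediction: hyperopt=True, 50 trials
```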
```diff
@@ -1141,13 +1135,13 @@ class ModelSelectionEngine:
             self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
-            elif preserve_model and contains_best(self.results_dir):
+            elif self.preserve_model and contains_best(self.results_dir):
                 continue
-            elif perform_hyperopt:
+            elif self.perform_hyperopt:
                 clean_directory(self.results_dir)
 
             logger.info(
-                f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
+                f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
             )
 
             # Getting data
```
```diff
@@ -1204,7 +1198,7 @@ class ModelSelectionEngine:
 
             # Tuning hyperparameters
             start = time.time()
-            if perform_hyperopt:
+            if self.perform_hyperopt:
                 model_best_params = self.hyperoptimize(
                     x_train, y_train, x_val, y_val, model
                 )
```
lecrapaud/utils.py
CHANGED
```diff
@@ -11,7 +11,7 @@ import re
 import string
 
 from lecrapaud.directories import logger_dir
-from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
+from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
 
 
 _LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
```
```diff
@@ -237,7 +237,7 @@ def serialize_for_json(obj):
     import numpy as np
     from datetime import datetime, date
     import pandas as pd
-
+
     # Handle NumPy types
     if isinstance(obj, (np.integer, np.int64, np.int32, np.int16)):
         return int(obj)
```
```diff
@@ -247,11 +247,11 @@ def serialize_for_json(obj):
         return obj.tolist()
     elif isinstance(obj, np.bool_):
         return bool(obj)
-
+
     # Handle datetime types
     elif isinstance(obj, (datetime, date, pd.Timestamp)):
         return obj.isoformat()
-
+
     # Handle basic Python types
     elif isinstance(obj, (str, int, float, bool, type(None))):
         return obj
```
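The `serialize_for_json` hunks only normalize blank lines, but the surrounding branches show what the helper does: it converts NumPy and pandas scalars and datetimes into JSON-safe values. A usage sketch based solely on the branches visible above (assuming the package is installed):

```python
# Usage sketch for serialize_for_json, grounded in the branches shown in the diff
# (np.int64 -> int, np.bool_ -> bool, datetime/pd.Timestamp -> ISO string).
import numpy as np
import pandas as pd

from lecrapaud.utils import serialize_for_json

assert serialize_for_json(np.int64(7)) == 7
assert serialize_for_json(np.bool_(True)) is True
assert serialize_for_json(pd.Timestamp("2025-01-01")) == "2025-01-01T00:00:00"
assert serialize_for_json("plain") == "plain"
```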
lecrapaud-0.20.2.dist-info/METADATA
ADDED
@@ -0,0 +1,344 @@
Metadata-Version: 2.4
Name: lecrapaud
Version: 0.20.2
Summary: Framework for machine and deep learning, with regression, classification and time series analysis
License: Apache License
License-File: LICENSE
Author: Pierre H. Gallet
Requires-Python: ==3.12.*
Classifier: License :: Other/Proprietary License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Requires-Dist: catboost (>=1.2.8)
Requires-Dist: category-encoders (>=2.8.1)
Requires-Dist: celery (>=5.5.3)
Requires-Dist: ftfy (>=6.3.1)
Requires-Dist: joblib (>=1.5.1)
Requires-Dist: keras (>=3.10.0)
Requires-Dist: lightgbm (>=4.6.0)
Requires-Dist: matplotlib (>=3.10.3)
Requires-Dist: mlxtend (>=0.23.4)
Requires-Dist: numpy (>=2.1.3)
Requires-Dist: openai (>=1.88.0)
Requires-Dist: pandas (>=2.3.0)
Requires-Dist: pydantic (>=2.9.2)
Requires-Dist: python-dotenv (>=1.1.0)
Requires-Dist: scikit-learn (>=1.6.1)
Requires-Dist: scipy (<1.14.0)
Requires-Dist: seaborn (>=0.13.2)
Requires-Dist: sqlalchemy (>=2.0.41)
Requires-Dist: tensorboardx (>=2.6.4)
Requires-Dist: tensorflow (>=2.19.0)
Requires-Dist: tiktoken (>=0.9.0)
Requires-Dist: tqdm (>=4.67.1)
Requires-Dist: xgboost (>=3.0.2)
Description-Content-Type: text/markdown

<div align="center">

<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>

## Welcome to LeCrapaud

**An all-in-one machine learning framework**

[GitHub stars](https://github.com/pierregallet/lecrapaud/stargazers)
[PyPI version](https://badge.fury.io/py/lecrapaud)
[PyPI](https://pypi.org/project/lecrapaud)
[License](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
[Coverage](https://codecov.io/gh/pierregallet/lecrapaud)

</div>

## 🚀 Introduction

LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.

## ✨ Key Features

- 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
- 🤖 Automated model selection and hyperparameter optimization
- 📊 Easy integration with pandas DataFrames
- 🔬 Supports both regression and classification tasks
- 🛠️ Simple API for both full pipeline and step-by-step usage
- 📦 Ready for production and research workflows

## ⚡ Quick Start

### Install the package

```sh
pip install lecrapaud
```

### How it works

This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).

### Typical workflow

```python
from lecrapaud import LeCrapaud

# 1. Create the main app
app = LeCrapaud(uri=uri)

# 2. Define your experiment context (see your notebook or api.py for all options)
context = {
    "data": your_dataframe,
    "columns_drop": [...],
    "columns_date": [...],
    # ... other config options
}

# 3. Create an experiment
experiment = app.create_experiment(**context)

# 4. Run the full training pipeline
experiment.train(your_dataframe)

# 5. Make predictions on new data
predictions = experiment.predict(new_data)
```

### Database Configuration (Required)

LeCrapaud requires access to a MySQL database to store experiments and results. You must either:

- Pass a valid MySQL URI to the `LeCrapaud` constructor:
  ```python
  app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
  ```
- **OR** set the following environment variables before using the package:
  - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
  - Or set `DB_URI` directly with your full connection string.

If neither is provided, database operations will not work.
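For example, the same configuration might be set from Python (assuming, as this README states, that the constructor falls back to these variables when no `uri` is given):

```python
# A sketch of environment-based configuration; the no-argument constructor
# fallback is assumed from the README, not verified against the source.
import os

os.environ["DB_USER"] = "user"
os.environ["DB_PASSWORD"] = "password"
os.environ["DB_HOST"] = "localhost"
os.environ["DB_PORT"] = "3306"
os.environ["DB_NAME"] = "lecrapaud"
# or, equivalently, the full connection string:
# os.environ["DB_URI"] = "mysql+pymysql://user:password@localhost:3306/lecrapaud"

from lecrapaud import LeCrapaud

app = LeCrapaud()  # assumed to pick up the variables above
```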
### Using OpenAI Embeddings (Optional)

If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:

```sh
export OPENAI_API_KEY=sk-...
```

If this variable is not set, features relying on OpenAI embeddings will not be available.

### Experiment Context Arguments

The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.

#### Required Parameters

| Parameter         | Type      | Description                                        | Example              |
|-------------------|-----------|----------------------------------------------------|----------------------|
| `data`            | DataFrame | Input dataset (required for new experiments only)  | `pd.DataFrame(...)`  |
| `experiment_name` | str       | Unique name for the experiment                     | `'stock_prediction'` |
| `date_column`     | str       | Name of the date column (required for time series) | `'DATE'`             |
| `group_column`    | str       | Name of the group column (required for panel data) | `'STOCK'`            |

#### Feature Engineering Parameters

| Parameter            | Type | Default | Description                                |
|----------------------|------|---------|--------------------------------------------|
| `columns_drop`       | list | `[]`    | Columns to drop during feature engineering |
| `columns_boolean`    | list | `[]`    | Columns to convert to boolean features     |
| `columns_date`       | list | `[]`    | Date columns for cyclic encoding           |
| `columns_te_groupby` | list | `[]`    | Groupby columns for target encoding        |
| `columns_te_target`  | list | `[]`    | Target columns for target encoding         |

#### Preprocessing Parameters

| Parameter             | Type  | Default | Description                                      |
|-----------------------|-------|---------|--------------------------------------------------|
| `time_series`         | bool  | `False` | Whether data is time series                      |
| `val_size`            | float | `0.2`   | Validation set size (fraction)                   |
| `test_size`           | float | `0.2`   | Test set size (fraction)                         |
| `columns_pca`         | list  | `[]`    | Columns for PCA transformation                   |
| `pca_temporal`        | list  | `[]`    | Temporal PCA config (e.g., lag features)         |
| `pca_cross_sectional` | list  | `[]`    | Cross-sectional PCA config (e.g., market regime) |
| `columns_onehot`      | list  | `[]`    | Columns for one-hot encoding                     |
| `columns_binary`      | list  | `[]`    | Columns for binary encoding                      |
| `columns_ordinal`     | list  | `[]`    | Columns for ordinal encoding                     |
| `columns_frequency`   | list  | `[]`    | Columns for frequency encoding                   |

#### Feature Selection Parameters

| Parameter                 | Type  | Default | Description                                              |
|---------------------------|-------|---------|----------------------------------------------------------|
| `percentile`              | float | `20`    | Percentage of features to keep per selection method      |
| `corr_threshold`          | float | `80`    | Maximum correlation threshold (%) between features       |
| `max_features`            | int   | `50`    | Maximum number of final features                         |
| `max_p_value_categorical` | float | `0.05`  | Maximum p-value for categorical feature selection (Chi2) |

#### Model Selection Parameters

| Parameter               | Type | Default | Description                                               |
|-------------------------|------|---------|-----------------------------------------------------------|
| `target_numbers`        | list | `[]`    | List of target indices to predict                         |
| `target_clf`            | list | `[]`    | Classification target indices                             |
| `models_idx`            | list | `[]`    | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
| `max_timesteps`         | int  | `120`   | Maximum timesteps for recurrent models                    |
| `perform_hyperopt`      | bool | `True`  | Whether to perform hyperparameter optimization            |
| `number_of_trials`      | int  | `20`    | Number of hyperopt trials                                 |
| `perform_crossval`      | bool | `False` | Whether to use cross-validation during hyperopt           |
| `plot`                  | bool | `True`  | Whether to generate plots                                 |
| `preserve_model`        | bool | `True`  | Whether to save the best model                            |
| `target_clf_thresholds` | dict | `{}`    | Classification thresholds per target                      |

#### Example Context Configuration

```python
context = {
    # Required parameters
    "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    "date_column": "DATE",
    "group_column": "STOCK",

    # Feature selection
    "corr_threshold": 80,
    "max_features": 20,
    "percentile": 20,
    "max_p_value_categorical": 0.05,

    # Feature engineering
    "columns_drop": ["SECURITY", "ISIN", "ID"],
    "columns_boolean": [],
    "columns_date": ["DATE"],
    "columns_te_groupby": [["SECTOR", "DATE"]],
    "columns_te_target": ["RET", "VOLUME"],

    # Preprocessing
    "time_series": True,
    "val_size": 0.2,
    "test_size": 0.2,
    "pca_temporal": [
        {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
    ],
    "pca_cross_sectional": [
        {
            "name": "MARKET_REGIME",
            "index": "DATE",
            "columns": "STOCK",
            "value": "RET",
        }
    ],
    "columns_onehot": ["BUY_SIGNAL"],
    "columns_binary": ["SECTOR", "LOCATION"],
    "columns_ordinal": ["STOCK"],

    # Model selection
    "target_numbers": [1, 2, 3],
    "target_clf": [1],
    "models_idx": ["xgb", "lgb", "catboost"],
    "max_timesteps": 120,
    "perform_hyperopt": True,
    "number_of_trials": 50,
    "perform_crossval": True,
    "plot": True,
    "preserve_model": True,
    "target_clf_thresholds": {1: {"precision": 0.80}},
}

# Create experiment
experiment = app.create_experiment(data=your_dataframe, **context)
```

#### Important Notes

1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.

2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.

3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage (see the sketch after this list).

4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.

5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
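To make note 3 concrete, here is an illustrative expanding-window cross-sectional PCA, using the config names from the example above (`DATE`, `STOCK`, `RET`); this is our own toy code, not LeCrapaud's implementation:

```python
# Illustrative sketch of expanding-window cross-sectional PCA: the factor for
# date t is computed from a PCA fitted only on dates <= t, which is what
# prevents look-ahead leakage.
import pandas as pd
from sklearn.decomposition import PCA

def expanding_cross_sectional_pca(df, index="DATE", columns="STOCK", value="RET", min_periods=30):
    # Pivot to a (date x stock) matrix, mirroring the pca_cross_sectional config
    wide = df.pivot(index=index, columns=columns, values=value).sort_index()
    scores = pd.Series(index=wide.index, dtype=float, name="MARKET_REGIME")
    for i in range(min_periods - 1, len(wide)):
        history = wide.iloc[: i + 1].dropna(axis=1)  # only rows up to date i
        pca = PCA(n_components=1).fit(history)
        # Project the current cross-section onto the first component
        scores.iloc[i] = pca.transform(history.iloc[[-1]])[0, 0]
    return scores
```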
### Modular usage

You can also use each step independently:

```python
data_eng = experiment.feature_engineering(data)
train, val, test = experiment.preprocess_feature(data_eng)
features = experiment.feature_selection(train)
std_data, reshaped_data = experiment.preprocess_model(train, val, test)
experiment.model_selection(std_data, reshaped_data)
```

## ⚠️ Using Alembic in Your Project (Important for Integrators)

If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).

By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:

```python
def include_object(object, name, type_, reflected, compare_to):
    if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
        return False  # Ignore LeCrapaud tables
    return True

context.configure(
    # ... other options ...
    include_object=include_object,
)
```

This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.

---

## 🤝 Contributing

### Reminders for GitHub usage

1. Creating a GitHub repository

```sh
$ brew install gh
$ gh auth login
$ gh repo create
```

2. Initializing git and pushing the first commit to the remote repository

```sh
$ git init
$ git add .
$ git commit -m 'first commit'
$ git remote add origin <YOUR_REPO_URL>
$ git push -u origin master
```

3. Use conventional commits: https://www.conventionalcommits.org/en/v1.0.0/#summary

4. Create environment

```sh
$ pip install virtualenv
$ python -m venv .venv
$ source .venv/bin/activate
```

5. Install dependencies

```sh
$ make install
```

6. Deactivate virtualenv (if needed)

```sh
$ deactivate
```

---

Pierre Gallet © 2025