lecrapaud 0.20.1__tar.gz → 0.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

Files changed (54) hide show
  1. lecrapaud-0.21.0/PKG-INFO +347 -0
  2. lecrapaud-0.21.0/README.md +308 -0
  3. lecrapaud-0.21.0/lecrapaud/__init__.py +5 -0
  4. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/api.py +80 -12
  5. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/config.py +3 -2
  6. lecrapaud-0.21.0/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/experiment.py +8 -2
  8. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/feature_engineering.py +223 -4
  9. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/utils.py +4 -4
  10. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/pyproject.toml +4 -1
  11. lecrapaud-0.20.1/PKG-INFO +0 -250
  12. lecrapaud-0.20.1/README.md +0 -214
  13. lecrapaud-0.20.1/lecrapaud/__init__.py +0 -1
  14. lecrapaud-0.20.1/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +0 -42
  15. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/LICENSE +0 -0
  16. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/__init__.py +0 -0
  17. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/README +0 -0
  18. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/env.py +0 -0
  19. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  20. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  21. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  22. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  23. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  24. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  25. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
  26. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
  27. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
  28. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
  29. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/alembic.ini +0 -0
  30. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/__init__.py +0 -0
  31. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/base.py +0 -0
  32. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/experiment.py +0 -0
  33. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/feature.py +0 -0
  34. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/feature_selection.py +0 -0
  35. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  36. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/model.py +0 -0
  37. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/model_selection.py +0 -0
  38. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/model_selection_score.py +0 -0
  39. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/target.py +0 -0
  40. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/models/utils.py +0 -0
  41. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/db/session.py +0 -0
  42. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/directories.py +0 -0
  43. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/feature_selection.py +0 -0
  44. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/integrations/openai_integration.py +0 -0
  45. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/jobs/__init__.py +0 -0
  46. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/jobs/config.py +0 -0
  47. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/jobs/scheduler.py +0 -0
  48. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/jobs/tasks.py +0 -0
  49. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  50. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  51. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  52. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  53. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/model_selection.py +0 -0
  54. {lecrapaud-0.20.1 → lecrapaud-0.21.0}/lecrapaud/search_space.py +0 -0
@@ -0,0 +1,347 @@
1
+ Metadata-Version: 2.4
2
+ Name: lecrapaud
3
+ Version: 0.21.0
4
+ Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
+ License: Apache License
6
+ License-File: LICENSE
7
+ Author: Pierre H. Gallet
8
+ Requires-Python: ==3.12.*
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Dist: catboost (>=1.2.8)
13
+ Requires-Dist: category-encoders (>=2.8.1)
14
+ Requires-Dist: celery (>=5.5.3)
15
+ Requires-Dist: celery-redbeat (>=2.3.2)
16
+ Requires-Dist: ftfy (>=6.3.1)
17
+ Requires-Dist: hyperopt (>=0.2.7)
18
+ Requires-Dist: joblib (>=1.5.1)
19
+ Requires-Dist: keras (>=3.10.0)
20
+ Requires-Dist: keras-tcn (>=3.5.6)
21
+ Requires-Dist: lightgbm (>=4.6.0)
22
+ Requires-Dist: matplotlib (>=3.10.3)
23
+ Requires-Dist: mlxtend (>=0.23.4)
24
+ Requires-Dist: numpy (>=2.1.3)
25
+ Requires-Dist: openai (>=1.88.0)
26
+ Requires-Dist: pandas (>=2.3.0)
27
+ Requires-Dist: pydantic (>=2.9.2)
28
+ Requires-Dist: python-dotenv (>=1.1.0)
29
+ Requires-Dist: scikit-learn (>=1.6.1)
30
+ Requires-Dist: scipy (<1.14.0)
31
+ Requires-Dist: seaborn (>=0.13.2)
32
+ Requires-Dist: sqlalchemy (>=2.0.41)
33
+ Requires-Dist: tensorboardx (>=2.6.4)
34
+ Requires-Dist: tensorflow (>=2.19.0)
35
+ Requires-Dist: tiktoken (>=0.9.0)
36
+ Requires-Dist: tqdm (>=4.67.1)
37
+ Requires-Dist: xgboost (>=3.0.2)
38
+ Description-Content-Type: text/markdown
39
+
40
+ <div align="center">
41
+
42
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
43
+
44
+ ## Welcome to LeCrapaud
45
+
46
+ **An all-in-one machine learning framework**
47
+
48
+ [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
49
+ [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
50
+ [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
51
+ [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
52
+ [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
53
+
54
+ </div>
55
+
56
+ ## 🚀 Introduction
57
+
58
+ LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
59
+
60
+ ## ✨ Key Features
61
+
62
+ - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
63
+ - 🤖 Automated model selection and hyperparameter optimization
64
+ - 📊 Easy integration with pandas DataFrames
65
+ - 🔬 Supports both regression and classification tasks
66
+ - 🛠️ Simple API for both full pipeline and step-by-step usage
67
+ - 📦 Ready for production and research workflows
68
+
69
+ ## ⚡ Quick Start
70
+
71
+
72
+ ### Install the package
73
+
74
+ ```sh
75
+ pip install lecrapaud
76
+ ```
77
+
78
+ ### How it works
79
+
80
+ This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
81
+
82
+ ### Typical workflow
83
+
84
+ ```python
85
+ from lecrapaud import LeCrapaud
86
+
87
+ # 1. Create the main app
88
+ app = LeCrapaud(uri=uri)
89
+
90
+ # 2. Define your experiment context (see "Experiment Context Arguments" below or api.py for all options)
91
+ context = {
92
+ "data": your_dataframe,
93
+ "columns_drop": [...],
94
+ "columns_date": [...],
95
+ # ... other config options
96
+ }
97
+
98
+ # 3. Create an experiment
99
+ experiment = app.create_experiment(**context)
100
+
101
+ # 4. Run the full training pipeline
102
+ experiment.train(your_dataframe)
103
+
104
+ # 5. Make predictions on new data
105
+ predictions = experiment.predict(new_data)
106
+ ```
107
+
108
+ ### Database Configuration (Required)
109
+
110
+ LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
111
+
112
+ - Pass a valid MySQL URI to the `LeCrapaud` constructor:
113
+ ```python
114
+ app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
115
+ ```
116
+ - **OR** set the following environment variables before using the package:
117
+ - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
118
+ - Or set `DB_URI` directly with your full connection string.
119
+
120
+ If neither is provided, database operations will not work.
121
+
122
+ ### Using OpenAI Embeddings (Optional)
123
+
124
+ If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
125
+
126
+ ```sh
127
+ export OPENAI_API_KEY=sk-...
128
+ ```
129
+
130
+ If this variable is not set, features relying on OpenAI embeddings will not be available.
131
+
132
+ ### Experiment Context Arguments
133
+
134
+ The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.
135
+
136
+ #### Required Parameters
137
+
138
+ | Parameter | Type | Description | Example |
139
+ |-------------------|-----------|------------------------------------------------------|------------------------|
140
+ | `data` | DataFrame | Input dataset (required for new experiments only) | `pd.DataFrame(...)` |
141
+ | `experiment_name`| str | Unique name for the experiment | `'stock_prediction'` |
142
+ | `date_column` | str | Name of the date column (required for time series) | `'DATE'` |
143
+ | `group_column` | str | Name of the group column (required for panel data) | `'STOCK'` |
144
+
145
+ #### Feature Engineering Parameters
146
+
147
+ | Parameter | Type | Default | Description |
148
+ |-----------------------|-------|---------|--------------------------------------------------------------------------|
149
+ | `columns_drop` | list | `[]` | Columns to drop during feature engineering |
150
+ | `columns_boolean` | list | `[]` | Columns to convert to boolean features |
151
+ | `columns_date` | list | `[]` | Date columns for cyclic encoding |
152
+ | `columns_te_groupby` | list | `[]` | Groupby columns for target encoding |
153
+ | `columns_te_target` | list | `[]` | Target columns for target encoding |
154
+
155
+ #### Preprocessing Parameters
156
+
157
+ | Parameter | Type | Default | Description |
158
+ |-------------------------|-------|---------|-----------------------------------------------------------------------|
159
+ | `time_series` | bool | `False` | Whether data is time series |
160
+ | `val_size` | float | `0.2` | Validation set size (fraction) |
161
+ | `test_size` | float | `0.2` | Test set size (fraction) |
162
+ | `columns_pca` | list | `[]` | Columns for PCA transformation |
163
+ | `pca_temporal` | list | `[]` | Temporal PCA config (e.g., lag features) |
164
+ | `pca_cross_sectional` | list | `[]` | Cross-sectional PCA config (e.g., market regime) |
165
+ | `columns_onehot` | list | `[]` | Columns for one-hot encoding |
166
+ | `columns_binary` | list | `[]` | Columns for binary encoding |
167
+ | `columns_ordinal` | list | `[]` | Columns for ordinal encoding |
168
+ | `columns_frequency` | list | `[]` | Columns for frequency encoding |
169
+
170
+ #### Feature Selection Parameters
171
+
172
+ | Parameter | Type | Default | Description |
173
+ |-----------------------------|-------|---------|------------------------------------------------------------------|
174
+ | `percentile` | float | `20` | Percentage of features to keep per selection method |
175
+ | `corr_threshold` | float | `80` | Maximum correlation threshold (%) between features |
176
+ | `max_features` | int | `50` | Maximum number of final features |
177
+ | `max_p_value_categorical` | float | `0.05` | Maximum p-value for categorical feature selection (Chi2) |
178
+
179
+ #### Model Selection Parameters
180
+
181
+ | Parameter | Type | Default | Description |
182
+ |------------------------|-------|---------|-----------------------------------------------------------------------|
183
+ | `target_numbers` | list | `[]` | List of target indices to predict |
184
+ | `target_clf` | list | `[]` | Classification target indices |
185
+ | `models_idx` | list | `[]` | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
186
+ | `max_timesteps` | int | `120` | Maximum timesteps for recurrent models |
187
+ | `perform_hyperopt` | bool | `True` | Whether to perform hyperparameter optimization |
188
+ | `number_of_trials` | int | `20` | Number of hyperopt trials |
189
+ | `perform_crossval` | bool | `False` | Whether to use cross-validation during hyperopt |
190
+ | `plot` | bool | `True` | Whether to generate plots |
191
+ | `preserve_model` | bool | `True` | Whether to save the best model |
192
+ | `target_clf_thresholds`| dict | `{}` | Classification thresholds per target |
193
+
194
+ #### Example Context Configuration
195
+
196
+ ```python
197
+ context = {
198
+ # Required parameters
199
+ "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
200
+ "date_column": "DATE",
201
+ "group_column": "STOCK",
202
+
203
+ # Feature selection
204
+ "corr_threshold": 80,
205
+ "max_features": 20,
206
+ "percentile": 20,
207
+ "max_p_value_categorical": 0.05,
208
+
209
+ # Feature engineering
210
+ "columns_drop": ["SECURITY", "ISIN", "ID"],
211
+ "columns_boolean": [],
212
+ "columns_date": ["DATE"],
213
+ "columns_te_groupby": [["SECTOR", "DATE"]],
214
+ "columns_te_target": ["RET", "VOLUME"],
215
+
216
+ # Preprocessing
217
+ "time_series": True,
218
+ "val_size": 0.2,
219
+ "test_size": 0.2,
220
+ "pca_temporal": [
221
+ {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
222
+ ],
223
+ "pca_cross_sectional": [
224
+ {
225
+ "name": "MARKET_REGIME",
226
+ "index": "DATE",
227
+ "columns": "STOCK",
228
+ "value": "RET",
229
+ }
230
+ ],
231
+ "columns_onehot": ["BUY_SIGNAL"],
232
+ "columns_binary": ["SECTOR", "LOCATION"],
233
+ "columns_ordinal": ["STOCK"],
234
+
235
+ # Model selection
236
+ "target_numbers": [1, 2, 3],
237
+ "target_clf": [1],
238
+ "models_idx": ["xgb", "lgb", "catboost"],
239
+ "max_timesteps": 120,
240
+ "perform_hyperopt": True,
241
+ "number_of_trials": 50,
242
+ "perform_crossval": True,
243
+ "plot": True,
244
+ "preserve_model": True,
245
+ "target_clf_thresholds": {1: {"precision": 0.80}},
246
+ }
247
+
248
+ # Create experiment
249
+ experiment = app.create_experiment(data=your_dataframe, **context)
250
+ ```
251
+
252
+ #### Important Notes
253
+
254
+ 1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.
255
+
256
+ 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
257
+
258
+ 3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
259
+
260
+ 4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
261
+
262
+ 5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
263
+
264
+
265
+
266
+ ### Modular usage
267
+
268
+ You can also use each step independently:
269
+
270
+ ```python
271
+ data_eng = experiment.feature_engineering(data)
272
+ train, val, test = experiment.preprocess_feature(data_eng)
273
+ features = experiment.feature_selection(train)
274
+ std_data, reshaped_data = experiment.preprocess_model(train, val, test)
275
+ experiment.model_selection(std_data, reshaped_data)
276
+ ```
277
+
278
+ ## ⚠️ Using Alembic in Your Project (Important for Integrators)
279
+
280
+ If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
281
+
282
+ By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
283
+
284
+ ```python
285
+ def include_object(object, name, type_, reflected, compare_to):
286
+ if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
287
+ return False # Ignore LeCrapaud tables
288
+ return True
289
+
290
+ context.configure(
291
+ # ... other options ...
292
+ include_object=include_object,
293
+ )
294
+ ```
295
+
296
+ This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
297
+
298
+ ---
299
+
300
+ ## 🤝 Contributing
301
+
302
+ ### Reminders for GitHub usage
303
+
304
+ 1. Creating a GitHub repository
305
+
306
+ ```sh
307
+ $ brew install gh
308
+ $ gh auth login
309
+ $ gh repo create
310
+ ```
311
+
312
+ 2. Initializing git and pushing the first commit to the remote repository
313
+
314
+ ```sh
315
+ $ git init
316
+ $ git add .
317
+ $ git commit -m 'first commit'
318
+ $ git remote add origin <YOUR_REPO_URL>
319
+ $ git push -u origin master
320
+ ```
321
+
322
+ 3. Use conventional commits
323
+ https://www.conventionalcommits.org/en/v1.0.0/#summary
324
+
325
+ 4. Create environment
326
+
327
+ ```sh
328
+ $ pip install virtualenv
329
+ $ python -m venv .venv
330
+ $ source .venv/bin/activate
331
+ ```
332
+
333
+ 5. Install dependencies
334
+
335
+ ```sh
336
+ $ make install
337
+ ```
338
+
339
+ 6. Deactivate virtualenv (if needed)
340
+
341
+ ```sh
342
+ $ deactivate
343
+ ```
344
+
345
+ ---
346
+
347
+ Pierre Gallet © 2025
@@ -0,0 +1,308 @@
1
+ <div align="center">
2
+
3
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
4
+
5
+ ## Welcome to LeCrapaud
6
+
7
+ **An all-in-one machine learning framework**
8
+
9
+ [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers)
10
+ [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
11
+ [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
12
+ [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE)
13
+ [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud)
14
+
15
+ </div>
16
+
17
+ ## 🚀 Introduction
18
+
19
+ LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular data, with a focus on financial and stock datasets. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
20
+
21
+ ## ✨ Key Features
22
+
23
+ - 🧩 Modular pipeline: Feature engineering, preprocessing, selection, and modeling as independent steps
24
+ - 🤖 Automated model selection and hyperparameter optimization
25
+ - 📊 Easy integration with pandas DataFrames
26
+ - 🔬 Supports both regression and classification tasks
27
+ - 🛠️ Simple API for both full pipeline and step-by-step usage
28
+ - 📦 Ready for production and research workflows
29
+
30
+ ## ⚡ Quick Start
31
+
32
+
33
+ ### Install the package
34
+
35
+ ```sh
36
+ pip install lecrapaud
37
+ ```
38
+
39
+ ### How it works
40
+
41
+ This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
42
+
43
+ ### Typical workflow
44
+
45
+ ```python
46
+ from lecrapaud import LeCrapaud
47
+
48
+ # 1. Create the main app
49
+ app = LeCrapaud(uri=uri)
50
+
51
+ # 2. Define your experiment context (see "Experiment Context Arguments" below or api.py for all options)
52
+ context = {
53
+ "data": your_dataframe,
54
+ "columns_drop": [...],
55
+ "columns_date": [...],
56
+ # ... other config options
57
+ }
58
+
59
+ # 3. Create an experiment
60
+ experiment = app.create_experiment(**context)
61
+
62
+ # 4. Run the full training pipeline
63
+ experiment.train(your_dataframe)
64
+
65
+ # 5. Make predictions on new data
66
+ predictions = experiment.predict(new_data)
67
+ ```
68
+
69
+ ### Database Configuration (Required)
70
+
71
+ LeCrapaud requires access to a MySQL database to store experiments and results. You must either:
72
+
73
+ - Pass a valid MySQL URI to the `LeCrapaud` constructor:
74
+ ```python
75
+ app = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname")
76
+ ```
77
+ - **OR** set the following environment variables before using the package:
78
+ - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
79
+ - Or set `DB_URI` directly with your full connection string.
80
+
81
+ If neither is provided, database operations will not work.
82
+
83
+ ### Using OpenAI Embeddings (Optional)
84
+
85
+ If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
86
+
87
+ ```sh
88
+ export OPENAI_API_KEY=sk-...
89
+ ```
90
+
91
+ If this variable is not set, features relying on OpenAI embeddings will not be available.
92
+
93
+ ### Experiment Context Arguments
94
+
95
+ The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.
96
+
97
+ #### Required Parameters
98
+
99
+ | Parameter | Type | Description | Example |
100
+ |-------------------|-----------|------------------------------------------------------|------------------------|
101
+ | `data` | DataFrame | Input dataset (required for new experiments only) | `pd.DataFrame(...)` |
102
+ | `experiment_name`| str | Unique name for the experiment | `'stock_prediction'` |
103
+ | `date_column` | str | Name of the date column (required for time series) | `'DATE'` |
104
+ | `group_column` | str | Name of the group column (required for panel data) | `'STOCK'` |
105
+
106
+ #### Feature Engineering Parameters
107
+
108
+ | Parameter | Type | Default | Description |
109
+ |-----------------------|-------|---------|--------------------------------------------------------------------------|
110
+ | `columns_drop` | list | `[]` | Columns to drop during feature engineering |
111
+ | `columns_boolean` | list | `[]` | Columns to convert to boolean features |
112
+ | `columns_date` | list | `[]` | Date columns for cyclic encoding |
113
+ | `columns_te_groupby` | list | `[]` | Groupby columns for target encoding |
114
+ | `columns_te_target` | list | `[]` | Target columns for target encoding |
115
+
116
+ #### Preprocessing Parameters
117
+
118
+ | Parameter | Type | Default | Description |
119
+ |-------------------------|-------|---------|-----------------------------------------------------------------------|
120
+ | `time_series` | bool | `False` | Whether data is time series |
121
+ | `val_size` | float | `0.2` | Validation set size (fraction) |
122
+ | `test_size` | float | `0.2` | Test set size (fraction) |
123
+ | `columns_pca` | list | `[]` | Columns for PCA transformation |
124
+ | `pca_temporal` | list | `[]` | Temporal PCA config (e.g., lag features) |
125
+ | `pca_cross_sectional` | list | `[]` | Cross-sectional PCA config (e.g., market regime) |
126
+ | `columns_onehot` | list | `[]` | Columns for one-hot encoding |
127
+ | `columns_binary` | list | `[]` | Columns for binary encoding |
128
+ | `columns_ordinal` | list | `[]` | Columns for ordinal encoding |
129
+ | `columns_frequency` | list | `[]` | Columns for frequency encoding |
130
+
131
+ #### Feature Selection Parameters
132
+
133
+ | Parameter | Type | Default | Description |
134
+ |-----------------------------|-------|---------|------------------------------------------------------------------|
135
+ | `percentile` | float | `20` | Percentage of features to keep per selection method |
136
+ | `corr_threshold` | float | `80` | Maximum correlation threshold (%) between features |
137
+ | `max_features` | int | `50` | Maximum number of final features |
138
+ | `max_p_value_categorical` | float | `0.05` | Maximum p-value for categorical feature selection (Chi2) |
139
+
140
+ #### Model Selection Parameters
141
+
142
+ | Parameter | Type | Default | Description |
143
+ |------------------------|-------|---------|-----------------------------------------------------------------------|
144
+ | `target_numbers` | list | `[]` | List of target indices to predict |
145
+ | `target_clf` | list | `[]` | Classification target indices |
146
+ | `models_idx` | list | `[]` | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
147
+ | `max_timesteps` | int | `120` | Maximum timesteps for recurrent models |
148
+ | `perform_hyperopt` | bool | `True` | Whether to perform hyperparameter optimization |
149
+ | `number_of_trials` | int | `20` | Number of hyperopt trials |
150
+ | `perform_crossval` | bool | `False` | Whether to use cross-validation during hyperopt |
151
+ | `plot` | bool | `True` | Whether to generate plots |
152
+ | `preserve_model` | bool | `True` | Whether to save the best model |
153
+ | `target_clf_thresholds`| dict | `{}` | Classification thresholds per target |
154
+
155
+ #### Example Context Configuration
156
+
157
+ ```python
158
+ context = {
159
+ # Required parameters
160
+ "experiment_name": f"stock_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
161
+ "date_column": "DATE",
162
+ "group_column": "STOCK",
163
+
164
+ # Feature selection
165
+ "corr_threshold": 80,
166
+ "max_features": 20,
167
+ "percentile": 20,
168
+ "max_p_value_categorical": 0.05,
169
+
170
+ # Feature engineering
171
+ "columns_drop": ["SECURITY", "ISIN", "ID"],
172
+ "columns_boolean": [],
173
+ "columns_date": ["DATE"],
174
+ "columns_te_groupby": [["SECTOR", "DATE"]],
175
+ "columns_te_target": ["RET", "VOLUME"],
176
+
177
+ # Preprocessing
178
+ "time_series": True,
179
+ "val_size": 0.2,
180
+ "test_size": 0.2,
181
+ "pca_temporal": [
182
+ {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
183
+ ],
184
+ "pca_cross_sectional": [
185
+ {
186
+ "name": "MARKET_REGIME",
187
+ "index": "DATE",
188
+ "columns": "STOCK",
189
+ "value": "RET",
190
+ }
191
+ ],
192
+ "columns_onehot": ["BUY_SIGNAL"],
193
+ "columns_binary": ["SECTOR", "LOCATION"],
194
+ "columns_ordinal": ["STOCK"],
195
+
196
+ # Model selection
197
+ "target_numbers": [1, 2, 3],
198
+ "target_clf": [1],
199
+ "models_idx": ["xgb", "lgb", "catboost"],
200
+ "max_timesteps": 120,
201
+ "perform_hyperopt": True,
202
+ "number_of_trials": 50,
203
+ "perform_crossval": True,
204
+ "plot": True,
205
+ "preserve_model": True,
206
+ "target_clf_thresholds": {1: {"precision": 0.80}},
207
+ }
208
+
209
+ # Create experiment
210
+ experiment = app.create_experiment(data=your_dataframe, **context)
211
+ ```
212
+
213
+ #### Important Notes
214
+
215
+ 1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.
216
+
217
+ 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
218
+
219
+ 3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
220
+
221
+ 4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
222
+
223
+ 5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
224
+
225
+
226
+
227
+ ### Modular usage
228
+
229
+ You can also use each step independently:
230
+
231
+ ```python
232
+ data_eng = experiment.feature_engineering(data)
233
+ train, val, test = experiment.preprocess_feature(data_eng)
234
+ features = experiment.feature_selection(train)
235
+ std_data, reshaped_data = experiment.preprocess_model(train, val, test)
236
+ experiment.model_selection(std_data, reshaped_data)
237
+ ```
238
+
239
+ ## ⚠️ Using Alembic in Your Project (Important for Integrators)
240
+
241
+ If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
242
+
243
+ By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
244
+
245
+ ```python
246
+ def include_object(object, name, type_, reflected, compare_to):
247
+ if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
248
+ return False # Ignore LeCrapaud tables
249
+ return True
250
+
251
+ context.configure(
252
+ # ... other options ...
253
+ include_object=include_object,
254
+ )
255
+ ```
256
+
257
+ This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
258
+
259
+ ---
260
+
261
+ ## 🤝 Contributing
262
+
263
+ ### Reminders for GitHub usage
264
+
265
+ 1. Creating a GitHub repository
266
+
267
+ ```sh
268
+ $ brew install gh
269
+ $ gh auth login
270
+ $ gh repo create
271
+ ```
272
+
273
+ 2. Initializing git and pushing the first commit to the remote repository
274
+
275
+ ```sh
276
+ $ git init
277
+ $ git add .
278
+ $ git commit -m 'first commit'
279
+ $ git remote add origin <YOUR_REPO_URL>
280
+ $ git push -u origin master
281
+ ```
282
+
283
+ 3. Use conventional commits
284
+ https://www.conventionalcommits.org/en/v1.0.0/#summary
285
+
286
+ 4. Create environment
287
+
288
+ ```sh
289
+ $ pip install virtualenv
290
+ $ python -m venv .venv
291
+ $ source .venv/bin/activate
292
+ ```
293
+
294
+ 5. Install dependencies
295
+
296
+ ```sh
297
+ $ make install
298
+ ```
299
+
300
+ 6. Deactivate virtualenv (if needed)
301
+
302
+ ```sh
303
+ $ deactivate
304
+ ```
305
+
306
+ ---
307
+
308
+ Pierre Gallet © 2025
@@ -0,0 +1,5 @@
1
+ from lecrapaud.api import *
2
+
3
+ # Export default parameters for easy access
4
+ from lecrapaud.api import ExperimentEngine
5
+ DEFAULT_EXPERIMENT_PARAMS = ExperimentEngine.DEFAULT_PARAMS