lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud-0.22.6.dist-info/METADATA ADDED
@@ -0,0 +1,423 @@
+ Metadata-Version: 2.4
+ Name: lecrapaud
+ Version: 0.22.6
+ Summary: Framework for machine and deep learning, with regression, classification and time series analysis
+ License: Apache License
+ License-File: LICENSE
+ Author: Pierre H. Gallet
+ Requires-Python: ==3.12.*
+ Classifier: License :: Other/Proprietary License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: alembic (>=1.17.2)
+ Requires-Dist: bandit (>=1.9.2)
+ Requires-Dist: black (>=25.12.0)
+ Requires-Dist: catboost (>=1.2.8)
+ Requires-Dist: category-encoders (>=2.9.0)
+ Requires-Dist: codecov (>=2.1.13)
+ Requires-Dist: coverage (>=7.12.0)
+ Requires-Dist: flake8 (>=7.3.0)
+ Requires-Dist: ftfy (>=6.3.1)
+ Requires-Dist: hyperopt (>=0.2.7)
+ Requires-Dist: ipykernel (>=7.1.0)
+ Requires-Dist: ipywidgets (>=8.1.8)
+ Requires-Dist: joblib (>=1.5.2)
+ Requires-Dist: keras (>=3.12.0)
+ Requires-Dist: keras-tcn (>=3.5.6)
+ Requires-Dist: lightgbm (>=4.6.0)
+ Requires-Dist: matplotlib (>=3.10.7)
+ Requires-Dist: mlxtend (>=0.23.4)
+ Requires-Dist: mypy (>=1.19.0)
+ Requires-Dist: myst-parser (>=4.0.1)
+ Requires-Dist: numpy (>=2.1.3)
+ Requires-Dist: openai (>=2.9.0)
+ Requires-Dist: pandas (>=2.3.3)
+ Requires-Dist: pipdeptree (>=2.30.0)
+ Requires-Dist: poetry (>=2.2.1)
+ Requires-Dist: pydantic (>=2.12.5)
+ Requires-Dist: pylint (>=4.0.4)
+ Requires-Dist: pymysql (>=1.1.2)
+ Requires-Dist: pytest (>=9.0.2)
+ Requires-Dist: pytest-cov (>=7.0.0)
+ Requires-Dist: pytest-mock (>=3.15.1)
+ Requires-Dist: python-dotenv (>=1.2.1)
+ Requires-Dist: ray[tune] (>=2.52.1)
+ Requires-Dist: safety (>=3.7.0)
+ Requires-Dist: scikit-learn (>=1.6.1)
+ Requires-Dist: scipy (>=1.16.3)
+ Requires-Dist: seaborn (>=0.13.2)
+ Requires-Dist: sentry-sdk (>=2.47.0)
+ Requires-Dist: sphinx (>=8.2.3)
+ Requires-Dist: sphinxcontrib-httpdomain (>=1.8.1)
+ Requires-Dist: sphinxcontrib-openapi (>=0.8.4)
+ Requires-Dist: sqlalchemy (>=2.0.44)
+ Requires-Dist: tabulate (>=0.9.0)
+ Requires-Dist: tensorboard (<=2.19.0)
+ Requires-Dist: tensorboardx (>=2.6.4)
+ Requires-Dist: tensorflow (<=2.19.0)
+ Requires-Dist: tiktoken (>=0.12.0)
+ Requires-Dist: tqdm (>=4.67.1)
+ Requires-Dist: xgboost (>=3.1.2)
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
+
+ ## Welcome to LeCrapaud
+
+ **An all-in-one machine learning framework**
+
+ <!-- [![GitHub stars](https://img.shields.io/github/stars/pierregallet/lecrapaud.svg?style=flat&logo=github&colorB=blue&label=stars)](https://github.com/pierregallet/lecrapaud/stargazers) -->
+ [![PyPI version](https://badge.fury.io/py/lecrapaud.svg)](https://badge.fury.io/py/lecrapaud)
+ [![Python versions](https://img.shields.io/pypi/pyversions/lecrapaud.svg)](https://pypi.org/project/lecrapaud)
+ <!-- [![License](https://img.shields.io/github/license/pierregallet/lecrapaud.svg)](https://github.com/pierregallet/lecrapaud/blob/main/LICENSE) -->
+ <!-- [![codecov](https://codecov.io/gh/pierregallet/lecrapaud/branch/main/graph/badge.svg)](https://codecov.io/gh/pierregallet/lecrapaud) -->
+
+ </div>
+
+ ## 🚀 Introduction
+
+ LeCrapaud is a high-level Python library for end-to-end machine learning workflows on tabular or time series data. It provides a simple API to handle feature engineering, model selection, training, and prediction, all in a reproducible and modular way.
+
+ ## ✨ Key Features
+
+ - 👋 End-to-end machine learning training in one command, with feature engineering, feature selection, preprocessing, model selection, and prediction
+ - 🧩 Modular pipeline: feature engineering, preprocessing, selection, and modeling can also be run as independent steps
+ - 🤖 Automated model selection and hyperparameter optimization
+ - 📊 Easy integration with pandas DataFrames
+ - 🔬 Supports both regression and classification tasks
+ - 🛠️ Simple API for both full pipeline and step-by-step usage
+ - 📦 Ready for production and research workflows
+
+ ## ⚡ Quick Start
+
+ ### Install the package
+
+ ```sh
+ pip install lecrapaud
+ ```
+
+ ### How it works
+
+ This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data. It can also work with time series or panel data (multiple time series grouped by a common column).
+
+ ### Typical workflow
+
+ ```python
+ from lecrapaud import LeCrapaud
+
+ # Create a new experiment with data
+ experiment = LeCrapaud(
+     data=your_dataframe,
+     target_numbers=[1, 2],
+     target_clf=[2],  # TARGET_2 is classification
+     columns_drop=[...],
+     columns_date=[...],
+     # ... other config options
+ )
+
+ # Train the model(s)
+ experiment.fit(your_dataframe)
+
+ # Make predictions
+ predictions, reg_scores, clf_scores = experiment.predict(new_data)
+
+ # Load existing experiment by ID
+ experiment = LeCrapaud(id=123)
+
+ # Or get best experiment by name
+ best_exp = LeCrapaud.get_best_experiment_by_name('my_experiment')
+ ```
+
+ #### Expected data format
+
+ - Both `your_dataframe` and `new_data` should be pandas `DataFrame` objects.
+ - `your_dataframe` must contain all feature columns **plus one column per target** named `TARGET_i` (e.g., `TARGET_1`, `TARGET_2`). LeCrapaud trains one model per target listed in `target_numbers`; classification targets are those listed in `target_clf`.
+ - `new_data` should include only the feature columns (no `TARGET_i`, unless you want to evaluate on an extra test set: models are already hyperoptimized on train + val and evaluated on the test set during `fit`, but you may still want to keep a separate test set for final evaluation). You can reuse the same feature set or any subset consistent with training (the features selected during feature selection).
+ - `experiment.predict` returns:
+   - a `predictions` DataFrame:
+     - Regression targets: the returned DataFrame has an added column `TARGET_{i}_PRED`.
+     - Classification targets: the returned DataFrame has `TARGET_{i}_PRED` (predicted class) and one probability column per class: `TARGET_{i}_{class_value}` (e.g., `TARGET_2_0`, `TARGET_2_1` for binary).
+   - `reg_scores` and `clf_scores` DataFrames, only if `new_data` includes the `TARGET_i` columns (for instance, if you kept a test set). Otherwise they are `None`, but you still need to unpack them: `predictions, _, _ = experiment.predict(new_data)`.
+ - See the examples for end-to-end code: [`examples/basic_usage.py`](examples/basic_usage.py) and [`examples/advanced_usage.py`](examples/advanced_usage.py).
+
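+ To make the format concrete, here is a minimal sketch (the column names and two-target setup are illustrative, not required by the package):
+
+ ```python
+ import pandas as pd
+ from lecrapaud import LeCrapaud
+
+ # Toy training frame: feature columns plus one TARGET_i column per target
+ df = pd.DataFrame({
+     "FEAT_A": [0.1, 0.4, 0.3, 0.9],
+     "FEAT_B": [1, 0, 1, 1],
+     "TARGET_1": [10.2, 11.5, 9.8, 12.1],  # regression target
+     "TARGET_2": [0, 1, 0, 1],             # classification target
+ })
+
+ experiment = LeCrapaud(data=df, target_numbers=[1, 2], target_clf=[2])
+ experiment.fit(df)
+
+ # new_data carries only the feature columns, so both score frames come back as None
+ new_data = df[["FEAT_A", "FEAT_B"]]
+ predictions, reg_scores, clf_scores = experiment.predict(new_data)
+ # predictions now holds TARGET_1_PRED, TARGET_2_PRED, TARGET_2_0, TARGET_2_1
+ ```
+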
+ ### Supported models
+
+ - Classical/ensembles: `linear`, `sgd`, `naive_bayes`, `bagging_naive_bayes`, `svm`, `tree`, `forest`, `adaboost`, `xgb`, `lgb`, `catboost`.
+ - Recurrent/DL:
+   - `LSTM-1`: single-layer LSTM head on tabular sequences.
+   - `LSTM-2`: two stacked LSTM layers.
+   - `LSTM-2-Deep`: deeper head on top of stacked LSTMs.
+   - `BiLSTM-1`: bidirectional single-layer LSTM.
+   - `GRU-1`: single-layer GRU.
+   - `BiGRU-1`: bidirectional GRU.
+   - `TCN-1`: Temporal Convolutional Network baseline.
+   - `Seq2Seq`: encoder-decoder with attention for sequences.
+   - `Transformer`: transformer encoder stack for tabular sequences.
+
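+ These names are the values accepted by the `models_idx` parameter (documented below). A minimal sketch, reusing the quick-start setup:
+
+ ```python
+ # Restrict model selection to two boosted-tree models and one recurrent model
+ experiment = LeCrapaud(
+     data=your_dataframe,
+     target_numbers=[1],
+     models_idx=["xgb", "lgb", "LSTM-1"],  # names as listed above; integer indices also work
+ )
+ ```
+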
+ ### Database Configuration (Required)
+
+ LeCrapaud requires access to a MySQL database to store experiments and results. You can configure the database by:
+
+ - Passing a valid MySQL URI to the constructor:
+   ```python
+   experiment = LeCrapaud(uri="mysql+pymysql://user:password@host:port/dbname", data=df, ...)
+   ```
+ - **OR** setting environment variables:
+   - `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`
+   - Or set `DB_URI` directly with your full connection string.
+
+ If neither is provided, database operations will not work.
+
+ #### Quick MySQL setup (local, macOS)
+
+ Pick one:
+
+ - Docker (fastest):
+   ```sh
+   docker run --name lecrapaud-mysql -e MYSQL_ROOT_PASSWORD=root -e MYSQL_DATABASE=lecrapaud -p 3306:3306 -d mysql:8
+   ```
+ - Homebrew MySQL:
+   ```sh
+   brew install mysql
+   brew services start mysql
+   mysql -uroot
+   CREATE DATABASE lecrapaud;
+   CREATE USER 'lecrapaud'@'localhost' IDENTIFIED BY 'lecrapaud';
+   GRANT ALL PRIVILEGES ON lecrapaud.* TO 'lecrapaud'@'localhost';
+   FLUSH PRIVILEGES;
+   ```
+
+ Then set your env vars:
+ ```sh
+ export DB_USER=lecrapaud
+ export DB_PASSWORD=lecrapaud
+ export DB_HOST=127.0.0.1
+ export DB_PORT=3306
+ export DB_NAME=lecrapaud
+ export DB_URI="mysql+pymysql://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
+ ```
+
+ ### Using OpenAI Embeddings (Optional)
+
+ If you want to use the `columns_pca` embedding feature (for advanced feature engineering), you must set the `OPENAI_API_KEY` environment variable with your OpenAI API key:
+
+ ```sh
+ export OPENAI_API_KEY=sk-...
+ ```
+
+ If this variable is not set, features relying on OpenAI embeddings will not be available.
+
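+ For example, a minimal sketch (the `description` text column is illustrative, not a package requirement):
+
+ ```python
+ import os
+ os.environ["OPENAI_API_KEY"] = "sk-..."  # or export it in your shell
+
+ # Text columns listed in columns_pca are embedded via OpenAI, then reduced with PCA
+ experiment = LeCrapaud(
+     data=your_dataframe,
+     target_numbers=[1],
+     columns_pca=["description"],
+ )
+ ```
+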
+ ### Experiment Context Arguments
+
+ The experiment context is a dictionary containing all configuration parameters for your ML pipeline. Parameters are stored in the experiment's database record and automatically retrieved when loading an existing experiment.
+
+ #### Required Parameters
+
+ | Parameter | Type | Description | Example |
+ | ----------------- | --------- | -------------------------------------------------- | -------------------- |
+ | `data` | DataFrame | Input dataset (required for new experiments only) | `pd.DataFrame(...)` |
+ | `date_column` | str | Name of the date column (required for time series) | `'DATE'` |
+ | `experiment_name` | str | Unique name for the experiment | `'stock_prediction'` |
+ | `group_column` | str | Name of the group column (required for panel data) | `'STOCK'` |
+
+ #### Feature Engineering Parameters
+
+ | Parameter | Type | Default | Description |
+ | -------------------- | ---- | ------- | ------------------------------------------ |
+ | `columns_boolean` | list | `[]` | Columns to convert to boolean features |
+ | `columns_date` | list | `[]` | Date columns for cyclic encoding |
+ | `columns_drop` | list | `[]` | Columns to drop during feature engineering |
+ | `columns_te_groupby` | list | `[]` | Groupby columns for target encoding |
+ | `columns_te_target` | list | `[]` | Target columns for target encoding |
+
+ #### Preprocessing Parameters
+
+ | Parameter | Type | Default | Description |
+ | --------------------- | ----- | ------- | ------------------------------------------------ |
+ | `columns_binary` | list | `[]` | Columns for binary encoding |
+ | `columns_frequency` | list | `[]` | Columns for frequency encoding |
+ | `columns_onehot` | list | `[]` | Columns for one-hot encoding |
+ | `columns_ordinal` | list | `[]` | Columns for ordinal encoding |
+ | `columns_pca` | list | `[]` | Columns for PCA transformation |
+ | `pca_cross_sectional` | list | `[]` | Cross-sectional PCA config (e.g., market regime) |
+ | `pca_temporal` | list | `[]` | Temporal PCA config (e.g., lag features) |
+ | `test_size` | float | `0.2` | Test set size (fraction) |
+ | `time_series` | bool | `False` | Whether data is time series |
+ | `val_size` | float | `0.2` | Validation set size (fraction) |
+
+ #### Feature Selection Parameters
+
+ | Parameter | Type | Default | Description |
+ | ------------------------- | ----- | ------- | -------------------------------------------------------- |
+ | `corr_threshold` | float | `80` | Maximum correlation threshold (%) between features |
+ | `max_features` | int | `50` | Maximum number of final features |
+ | `max_p_value_categorical` | float | `0.05` | Maximum p-value for categorical feature selection (Chi2) |
+ | `percentile` | float | `20` | Percentage of features to keep per selection method |
+
+ #### Model Selection Parameters
+
+ | Parameter | Type | Default | Description |
+ | ----------------------- | ---- | ------- | --------------------------------------------------------- |
+ | `max_timesteps` | int | `120` | Maximum timesteps for recurrent models |
+ | `models_idx` | list | `[]` | Model indices or names to use (e.g., `[1, 'xgb', 'lgb']`) |
+ | `number_of_trials` | int | `20` | Number of hyperopt trials |
+ | `perform_crossval` | bool | `False` | Whether to use cross-validation during hyperopt |
+ | `perform_hyperopt` | bool | `True` | Whether to perform hyperparameter optimization |
+ | `plot` | bool | `True` | Whether to generate plots |
+ | `preserve_model` | bool | `True` | Whether to save the best model |
+ | `target_clf_thresholds` | dict | `{}` | Classification thresholds per target |
+ | `target_clf` | list | `[]` | Classification target indices |
+ | `target_numbers` | list | `[]` | List of target indices to predict |
+
+ #### Example context (time series)
+
+ ```python
+ context = {
+     "experiment_name": "energy_forecast_demo",
+     "date_column": "timestamp",
+     "group_column": "site_id",  # per-site time series
+     "time_series": True,
+     "val_size": 0.2,
+     "test_size": 0.2,
+
+     # Feature engineering
+     "columns_drop": ["equipment_id"],
+     "columns_boolean": ["is_weekend"],
+     "columns_date": ["timestamp"],
+     "columns_onehot": ["weather_condition"],
+     "columns_binary": ["region"],
+     "columns_ordinal": [],
+
+     # PCA on temporal blocks (auto-creates lags)
+     "pca_temporal": [
+         {"name": "LAST_48_LOAD", "column": "load_kw", "lags": 48},
+         {"name": "LAST_24_TEMP", "column": "temperature_c", "lags": 24},
+     ],
+     # Optional cross-sectional PCA across sites at each timestamp
+     "pca_cross_sectional": [
+         {"name": "SITE_LOAD_FACTORS", "index": "timestamp", "columns": "site_id", "value": "load_kw"}
+     ],
+
+     # Feature selection
+     "corr_threshold": 80,
+     "max_features": 30,
+     "percentile": 30,
+
+     # Model selection
+     "target_numbers": [1],  # expects a column TARGET_1 (e.g., next-hour load)
+     "target_clf": [],  # regression
+     "models_idx": ["lgb", "xgb"],  # boosted trees for tabular time series
+     "perform_hyperopt": True,
+     "number_of_trials": 40,
+ }
+
+ experiment = LeCrapaud(data=your_dataframe, **context)
+ ```
+
+ #### Important Notes
+
+ 1. **Context Persistence**: All context parameters are saved in the database when creating an experiment and automatically restored when loading it.
+
+ 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
+
+ 3. **PCA Time Series**:
+    - For time series data, both `pca_cross_sectional` and `pca_temporal` automatically use an expanding window approach with periodic refresh (default: every 90 days) to prevent data leakage.
+    - The system fits PCA only on historical data (lookback window of 365 days by default) and avoids look-ahead bias.
+    - For panel data (e.g., multiple stocks), lag features are created per group when using the simplified `pca_temporal` format.
+    - Missing PCA values are handled with forward-fill followed by zero-fill to ensure compatibility with downstream models.
+
+ 4. **PCA Temporal Simplified Format** (see the sketch after these notes):
+    - Instead of manually listing lag columns: `{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ..., "RET_-20"]}`
+    - Use the simplified format: `{"name": "LAST_20_RET", "column": "RET", "lags": 20}`
+    - The system automatically creates the lag columns, handling panel data correctly with `group_column`.
+
+ 5. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+
+ 6. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
+
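+ A sketch of note 4, showing that the simplified format is shorthand for listing every lag column (names and lag count illustrative):
+
+ ```python
+ # Simplified: the lag columns RET_-1 ... RET_-20 are generated automatically,
+ # per group when group_column is set
+ pca_temporal = [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
+
+ # Equivalent manual form, enumerating pre-computed lag columns yourself
+ pca_temporal_manual = [{
+     "name": "LAST_20_RET",
+     "columns": [f"RET_-{i}" for i in range(1, 21)],
+ }]
+ ```
+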
+ ### Modular usage with sklearn-compatible components
+
+ You can also use individual pipeline components:
+
+ ```python
+ from lecrapaud import FeatureEngineering, FeaturePreprocessor, FeatureSelector
+
+ # Create components with experiment context
+ feature_eng = FeatureEngineering(experiment=experiment)
+ feature_prep = FeaturePreprocessor(experiment=experiment)
+ feature_sel = FeatureSelector(experiment=experiment, target_number=1)
+
+ # Use sklearn fit/transform pattern
+ feature_eng.fit(data)
+ data_eng = feature_eng.get_data()
+
+ feature_prep.fit(data_eng)
+ data_preprocessed = feature_prep.transform(data_eng)
+
+ feature_sel.fit(data_preprocessed)
+
+ # Or use in sklearn Pipeline
+ from sklearn.pipeline import Pipeline
+ pipeline = Pipeline([
+     ('feature_eng', FeatureEngineering(experiment=experiment)),
+     ('feature_prep', FeaturePreprocessor(experiment=experiment))
+ ])
+ ```
+
+ ## ⚠️ Using Alembic in Your Project (Important for Integrators)
+
+ If you use Alembic for migrations in your own project and you share the same database with LeCrapaud, you must ensure that Alembic does **not** attempt to drop or modify LeCrapaud tables (those prefixed with `{LECRAPAUD_TABLE_PREFIX}_`).
+
+ By default, Alembic's autogenerate feature will propose to drop any table that exists in the database but is not present in your project's models. To prevent this, add the following filter to your `env.py`:
+
+ ```python
+ def include_object(object, name, type_, reflected, compare_to):
+     if type_ == "table" and name.startswith(f"{LECRAPAUD_TABLE_PREFIX}_"):
+         return False  # Ignore LeCrapaud tables
+     return True
+
+ context.configure(
+     # ... other options ...
+     include_object=include_object,
+ )
+ ```
+
+ This will ensure that Alembic ignores all tables created by LeCrapaud when generating migrations for your own project.
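+
+ The prefix itself can be imported rather than hard-coded, as LeCrapaud's own models do:
+
+ ```python
+ from lecrapaud.config import LECRAPAUD_TABLE_PREFIX  # prefix applied to all LeCrapaud tables
+ ```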
+
+ ## 🤝 Contributing
+
+ ### How we work
+
+ - Use conventional commits (e.g., `feat: add lgbm tuner`, `fix: handle missing target`).
+ - Create feature branches (`feat/…`, `fix/…`) off `main`; keep PRs focused and small.
+ - Before opening a PR: `make format && make lint && make test` (or at least run the relevant test subset). If you skip, explain why in the PR.
+ - Write/adjust tests when changing behavior or adding features; include fixtures/data updates when needed.
+ - Documentation is part of the change: update README/examples/docstrings when APIs or flows change.
+ - PRs should include:
+   - A short summary of the change and rationale.
+   - Screenshots or sample outputs when UI/notebook outputs are affected.
+   - Validation notes (commands run, datasets used).
+   - Any follow-ups or known gaps.
+
+ ### Setup (dev)
+
+ ```sh
+ python -m venv .venv
+ source .venv/bin/activate
+ make install
+ # optional gpu deps
+ make install-gpu
+ ```
+
+ When done: `deactivate`.
+
+ ---
+
+ Pierre Gallet © 2025
+
lecrapaud-0.22.6.dist-info/RECORD ADDED
@@ -0,0 +1,51 @@
+ lecrapaud/__init__.py,sha256=jl028c-fd5Si_P375QYFRlbNu1VqHzm5Ovj-1x4rLeY,730
+ lecrapaud/base.py,sha256=1cP_l8cjm5Muscry-Bvu_JYvIA9AtSCWDreegBsL0Lw,24870
+ lecrapaud/config.py,sha256=1qGL7S7OKBr8rxzp2ohbux3sNfRo0-BTSiX0FjpLFAM,1403
+ lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
+ lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
+ lecrapaud/db/alembic/env.py,sha256=RvTTBa3bDVBxmDtapAfzUoeWBgmVQU3s9U6HmQCAP84,2421
+ lecrapaud/db/alembic/script.py.mako,sha256=MEqL-2qATlST9TAOeYgscMn1uy6HUS9NFvDgl93dMj8,635
+ lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py,sha256=hyPW0Mt_B4ZAHnJYLREy7MAncNDLnEIyJQJW2pyz_LY,17228
+ lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py,sha256=6Pf36HAXEVrVlnrohAe2O7gVaXpDiv3LLIP_EEgTyA0,917
+ lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py,sha256=KjwjYvFaNqYmBLTYel8As37fyaBtNVWTqN_3M7y_2eI,1357
+ lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py,sha256=MiqooJuZ1etExl2he3MniaEv8G0LrmqY-0m22m9xKmc,943
+ lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py,sha256=gyQDFFHp1dlILuDtXSPdUU_MsLlX-UzTP-E96Aj_Hto,966
+ lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py,sha256=Bpi1zegNGX1qU-8RVzRfwjyv2cVaQ5P9cpKQ1QDJgxs,945
+ lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py,sha256=g6H2Z9MwB6UEiqdGlBoHBXpO9DTaWkwHt8FS6joVOm0,1191
+ lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py,sha256=FshOF1t-NWXrBtXT3wMNGFslJ4sWUxzvBEXSymu05cI,1043
+ lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py,sha256=htHUD4zPJr-0z_DQfTi8r9RsFVe9m7SL0f7oRIvLIcQ,10999
+ lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py,sha256=0NBvOwPqMXpWnDEGiEBk_IeLKmXQ5ZcU-dqHeSEgsRQ,2557
+ lecrapaud/db/alembic.ini,sha256=Zw2rdwsKV6c7J1SPtoFIPDX08_oTP3MuUKnNxBDiY8I,3796
+ lecrapaud/db/models/__init__.py,sha256=-XoCN1eeLihnNxBMl90lXrgrTSDkMbeqgienMqFi5f4,702
+ lecrapaud/db/models/base.py,sha256=a9s_x-HMq8GmH2PjKWID9mBl-nI_Gx7eBCVQyVsPcY8,9951
+ lecrapaud/db/models/experiment.py,sha256=t02iBv1k9juv2oHaiMHe9g6KLYGivtEt6EIpQP2Xy6o,15356
+ lecrapaud/db/models/feature.py,sha256=5o77O2FyRObnLOCGNj8kaPSGM3pLv1Ov6mXXHYkmnYY,1136
+ lecrapaud/db/models/feature_selection.py,sha256=PBNWk9QaLb7-_xyrLlOUfab0y2xEj3agAIzt1gxssZQ,3172
+ lecrapaud/db/models/feature_selection_rank.py,sha256=POo-OLdaxU3eaH6fC6fTOj7Fnv0ujvTXgYZMzjjwTfE,1773
+ lecrapaud/db/models/model.py,sha256=F0hyMjd4FFHCv6_arIWBEmBCGOfG3b6_uzU8ExtFE90,952
+ lecrapaud/db/models/model_selection.py,sha256=V2hh7aTof83GPfv4pMYkyA6zR1fiC4Cyj7Z3hzwqhQM,2014
+ lecrapaud/db/models/model_selection_score.py,sha256=7u96v90_C0G5OJDsE7sQ3V99VPQc_7ZvwNx9-y1r2Z8,2258
+ lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk,1649
+ lecrapaud/db/models/utils.py,sha256=-a-nWWmpJ2XzidIxo2COVUTrGZIPYCfBzjhcszJj_bM,1109
+ lecrapaud/db/session.py,sha256=RCbAwmnECrF8jDINbUpI4OlJBDMrnUBZXb6XM5glbh8,3785
+ lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
+ lecrapaud/experiment.py,sha256=CDGipF0nRnzPJxnGJ3TNlYEsa6AYvgtPb-jhisEZ6vc,3486
+ lecrapaud/feature_engineering.py,sha256=5lVSmddhDyNQBzaTCIuL2QtXjfwekwKaOkKM25BzzDg,15701
+ lecrapaud/feature_preprocessing.py,sha256=QRVbERRKNjlPYm_Nhw2M1eRWnn4qC8ujVIDXqtRJ-pg,48472
+ lecrapaud/feature_selection.py,sha256=RAx5SMNq_HPwncHcs8Hap8E18XlKqj1YLozDaKa74bc,37593
+ lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
+ lecrapaud/integrations/sentry_integration.py,sha256=IsYL0m4qU3bc1j38TLGT846Ykk3y8InfAdfBxAgMnv4,1060
+ lecrapaud/misc/tabpfn_tests.ipynb,sha256=fy_rP0FphlbZS_a86hv-5rLojFp0HHerC5ejfov6rGE,6681
+ lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
+ lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
+ lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
+ lecrapaud/mixins.py,sha256=TtXUEAzID11PpNf6PROAHbLUdsCbLwkh4p-qOJ94FFU,7739
+ lecrapaud/model_preprocessing.py,sha256=7Jy_RfwOGN5ONyVkZRU6uzh8rNyxMrLB2Cqeqs7CkVk,10480
+ lecrapaud/model_selection.py,sha256=UBRZxi6LfxlkTZ_baG_Vn6ofYpNKcQymxltXkc9dX5A,90720
+ lecrapaud/pipeline.py,sha256=-qOr4z6U1phr2pUsWWNZGt18gUNwJdWV3v_L8BzmxgQ,19813
+ lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
+ lecrapaud/utils.py,sha256=4c8vvJZ6Kqmxz7Uyozc4q4RHFIQi41guSBPutC0pwaM,9289
+ lecrapaud-0.22.6.dist-info/METADATA,sha256=TEh9tZa5e4_4sPuvu3JHgSE2cc-qJtQmDleZQTOUJXU,19089
+ lecrapaud-0.22.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ lecrapaud-0.22.6.dist-info/licenses/LICENSE,sha256=qp7NEYPaTK8VJoTBbJZEMRQ3wiUMJCHVBevHCghOUys,11350
+ lecrapaud-0.22.6.dist-info/RECORD,,
{lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.3
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any
{lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE
@@ -186,7 +186,7 @@
  same "printed page" as the copyright notice for easier
  identification within third-party archives.

- Copyright [2024] [Pierre H. Gallet]
+ Copyright [2025] [Pierre H. Gallet]

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
lecrapaud/db/models/model_training.py DELETED
@@ -1,64 +0,0 @@
- from sqlalchemy import (
-     Column,
-     Integer,
-     String,
-     DateTime,
-     Date,
-     Float,
-     JSON,
-     Table,
-     ForeignKey,
-     BigInteger,
-     Index,
-     TIMESTAMP,
-     UniqueConstraint,
- )
- from sqlalchemy import desc, asc, cast, text, func
-
- from sqlalchemy.orm import relationship, Mapped, mapped_column, DeclarativeBase
-
- from lecrapaud.db.session import get_db
- from lecrapaud.db.models.base import Base
- from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
-
-
- class ModelTraining(Base):
-
-     id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
-     created_at = Column(
-         TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
-     )
-     updated_at = Column(
-         TIMESTAMP(timezone=True),
-         server_default=func.now(),
-         onupdate=func.now(),
-         nullable=False,
-     )
-     best_params = Column(JSON)
-     model_path = Column(String(255))
-     training_time = Column(Integer)
-     model_id = Column(
-         BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
-     )
-     model_selection_id = Column(
-         BigInteger,
-         ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
-         nullable=False,
-     )
-
-     model = relationship("Model", lazy="selectin")
-     model_selection = relationship(
-         "ModelSelection", back_populates="model_trainings", lazy="selectin"
-     )
-     score = relationship(
-         "Score",
-         back_populates="model_trainings",
-         cascade="all, delete-orphan",
-         lazy="selectin",
-     )
-
-     __table_args__ = (
-         UniqueConstraint(
-             "model_id", "model_selection_id", name="uq_model_training_composite"
-         ),
-     )
lecrapaud/jobs/__init__.py DELETED
@@ -1,13 +0,0 @@
- from celery import Celery, signals
- from lecrapaud.jobs import config
- from lecrapaud.utils import setup_logger
-
-
- @signals.setup_logging.connect
- def configure_celery_logging(**kwargs):
-     setup_logger()
-
-
- app = Celery("src")
- app.config_from_object(config)
- app.autodiscover_tasks(["src.jobs"])
lecrapaud/jobs/config.py DELETED
@@ -1,17 +0,0 @@
- from lecrapaud.config import REDIS_URL
-
- REDIS_URL = REDIS_URL + "/1"
- broker_url = REDIS_URL
- result_backend = REDIS_URL
-
- # For RedBeat
- redbeat_redis_url = REDIS_URL
- beat_scheduler = "redbeat.RedBeatScheduler"
-
- timezone = "UTC"
-
- task_acks_late = True
- task_reject_on_worker_lost = True
- worker_prefetch_multiplier = 1
- task_acks_on_failure_or_timeout = False
- worker_concurrency = 1
lecrapaud/jobs/scheduler.py DELETED
@@ -1,30 +0,0 @@
- from redbeat.schedulers import RedBeatSchedulerEntry
- from celery.schedules import crontab
- from lecrapaud.jobs.tasks import app
-
-
- def schedule_tasks():
-     schedule_tasks_list = [
-         {
-             "name": "task_training_experiment",
-             "task": "src.jobs.tasks.task_training_experiment",
-             "schedule": crontab(minute=45, hour=00),
-         },
-     ]
-
-     for task in schedule_tasks_list:
-         entry = RedBeatSchedulerEntry(**task, app=app)
-         entry.save()
-
-
- def unschedule_tasks():
-     unschedule_task_keys = [
-         "redbeat:task_training_experiment",
-     ]
-
-     for key in unschedule_task_keys:
-         try:
-             entry = RedBeatSchedulerEntry.from_key(key, app=app)
-             entry.delete()
-         except KeyError:
-             pass
lecrapaud/jobs/tasks.py DELETED
@@ -1,17 +0,0 @@
- from lecrapaud.jobs import app
- from lecrapaud.utils import logger
-
-
- @app.task(
-     bind=True,
-     autoretry_for=(Exception,),
-     retry_backoff=True,
-     retry_kwargs={"max_retries": 5},
-     acks_late=True,
- )
- def task_training_experiment(self):
-     try:
-         pass
-     except Exception as e:
-         logger.error(e)
-         raise