lecrapaud 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/PKG-INFO +27 -20
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/README.md +26 -19
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/api.py +8 -2
- lecrapaud-0.3.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +50 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/session.py +11 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/experiment.py +1 -1
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/feature_engineering.py +11 -12
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/feature_selection.py +29 -48
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/model_selection.py +59 -59
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/utils.py +1 -1
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/pyproject.toml +1 -1
- lecrapaud-0.2.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +0 -38
- lecrapaud-0.2.0/lecrapaud/predictions.py +0 -292
- lecrapaud-0.2.0/lecrapaud/preprocessing.py +0 -984
- lecrapaud-0.2.0/lecrapaud/training.py +0 -239
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/LICENSE +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/__init__.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/config.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/__init__.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/README +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/env.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/script.py.mako +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/__init__.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/base.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/dataset.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model_selection.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model_training.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/score.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/target.py +0 -0
- /lecrapaud-0.2.0/lecrapaud/directory_management.py → /lecrapaud-0.3.0/lecrapaud/directories.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/integrations/openai_integration.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/__init__.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/config.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/scheduler.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/tasks.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/search_space.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/__init__.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/embedding_categorical.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/indicators.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/experiments.py +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-bilstm.ipynb +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-resnet.ipynb +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-transformers.ipynb +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/tests.ipynb +0 -0
- {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/trash.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: lecrapaud
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Framework for machine and deep learning, with regression, classification and time series analysis
|
|
5
5
|
License: Apache License
|
|
6
6
|
Author: Pierre H. Gallet
|
|
@@ -49,7 +49,7 @@ Description-Content-Type: text/markdown
|
|
|
49
49
|
|
|
50
50
|
<div align="center">
|
|
51
51
|
|
|
52
|
-
|
|
52
|
+
<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
|
|
53
53
|
|
|
54
54
|
## Welcome to LeCrapaud
|
|
55
55
|
|
|
@@ -72,34 +72,21 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
|
|
|
72
72
|
|
|
73
73
|
## ⚡ Quick Start
|
|
74
74
|
|
|
75
|
-
1. Create environment
|
|
76
75
|
|
|
77
|
-
|
|
78
|
-
$ pip install virtualenv
|
|
79
|
-
$ python -m venv .venv
|
|
80
|
-
$ source .venv/bin/activate
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
2. Install dependencies
|
|
76
|
+
### Install the package
|
|
84
77
|
|
|
85
78
|
```sh
|
|
86
|
-
|
|
79
|
+
pip install lecrapaud
|
|
87
80
|
```
|
|
88
81
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
```sh
|
|
92
|
-
$ deactivate
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
## 🛠️ How it works
|
|
82
|
+
### How it works
|
|
96
83
|
|
|
97
84
|
This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
|
|
98
85
|
|
|
99
86
|
### Typical workflow
|
|
100
87
|
|
|
101
88
|
```python
|
|
102
|
-
from lecrapaud
|
|
89
|
+
from lecrapaud import LeCrapaud
|
|
103
90
|
|
|
104
91
|
# 1. Create the main app
|
|
105
92
|
app = LeCrapaud()
|
|
@@ -159,6 +146,26 @@ $ git push -u origin master
|
|
|
159
146
|
3. Use conventional commits
|
|
160
147
|
https://www.conventionalcommits.org/en/v1.0.0/#summary
|
|
161
148
|
|
|
149
|
+
4. Create environment
|
|
150
|
+
|
|
151
|
+
```sh
|
|
152
|
+
$ pip install virtualenv
|
|
153
|
+
$ python -m venv .venv
|
|
154
|
+
$ source .venv/bin/activate
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
5. Install dependencies
|
|
158
|
+
|
|
159
|
+
```sh
|
|
160
|
+
$ make install
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
6. Deactivate virtualenv (if needed)
|
|
164
|
+
|
|
165
|
+
```sh
|
|
166
|
+
$ deactivate
|
|
167
|
+
```
|
|
168
|
+
|
|
162
169
|
---
|
|
163
170
|
|
|
164
|
-
Pierre Gallet ©
|
|
171
|
+
Pierre Gallet © 2025
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<div align="center">
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
|
|
4
4
|
|
|
5
5
|
## Welcome to LeCrapaud
|
|
6
6
|
|
|
@@ -23,34 +23,21 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
|
|
|
23
23
|
|
|
24
24
|
## ⚡ Quick Start
|
|
25
25
|
|
|
26
|
-
1. Create environment
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
$ pip install virtualenv
|
|
30
|
-
$ python -m venv .venv
|
|
31
|
-
$ source .venv/bin/activate
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
2. Install dependencies
|
|
27
|
+
### Install the package
|
|
35
28
|
|
|
36
29
|
```sh
|
|
37
|
-
|
|
30
|
+
pip install lecrapaud
|
|
38
31
|
```
|
|
39
32
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
```sh
|
|
43
|
-
$ deactivate
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
## 🛠️ How it works
|
|
33
|
+
### How it works
|
|
47
34
|
|
|
48
35
|
This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
|
|
49
36
|
|
|
50
37
|
### Typical workflow
|
|
51
38
|
|
|
52
39
|
```python
|
|
53
|
-
from lecrapaud
|
|
40
|
+
from lecrapaud import LeCrapaud
|
|
54
41
|
|
|
55
42
|
# 1. Create the main app
|
|
56
43
|
app = LeCrapaud()
|
|
@@ -110,6 +97,26 @@ $ git push -u origin master
|
|
|
110
97
|
3. Use conventional commits
|
|
111
98
|
https://www.conventionalcommits.org/en/v1.0.0/#summary
|
|
112
99
|
|
|
100
|
+
4. Create environment
|
|
101
|
+
|
|
102
|
+
```sh
|
|
103
|
+
$ pip install virtualenv
|
|
104
|
+
$ python -m venv .venv
|
|
105
|
+
$ source .venv/bin/activate
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
5. Install dependencies
|
|
109
|
+
|
|
110
|
+
```sh
|
|
111
|
+
$ make install
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
6. Deactivate virtualenv (if needed)
|
|
115
|
+
|
|
116
|
+
```sh
|
|
117
|
+
$ deactivate
|
|
118
|
+
```
|
|
119
|
+
|
|
113
120
|
---
|
|
114
121
|
|
|
115
|
-
Pierre Gallet ©
|
|
122
|
+
Pierre Gallet © 2025
|
|
@@ -32,6 +32,8 @@ experiment.model_selection(data) : return best_model
|
|
|
32
32
|
|
|
33
33
|
import joblib
|
|
34
34
|
import pandas as pd
|
|
35
|
+
import logging
|
|
36
|
+
from lecrapaud.utils import logger
|
|
35
37
|
from lecrapaud.db.session import init_db
|
|
36
38
|
from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
|
|
37
39
|
from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine
|
|
@@ -103,7 +105,12 @@ class Experiment:
|
|
|
103
105
|
std_data, reshaped_data = self.preprocess_model(train, val, test)
|
|
104
106
|
self.model_selection(std_data, reshaped_data)
|
|
105
107
|
|
|
106
|
-
def predict(self, new_data):
|
|
108
|
+
def predict(self, new_data, verbose: int = 0):
|
|
109
|
+
if verbose == 0:
|
|
110
|
+
logger.setLevel(logging.WARNING)
|
|
111
|
+
|
|
112
|
+
logger.warning("Running prediction...")
|
|
113
|
+
|
|
107
114
|
data = self.feature_engineering(
|
|
108
115
|
data=new_data,
|
|
109
116
|
for_training=False,
|
|
@@ -127,7 +134,6 @@ class Experiment:
|
|
|
127
134
|
else:
|
|
128
135
|
features = self.dataset.get_features(target_number)
|
|
129
136
|
model = ModelEngine(path=training_target_dir)
|
|
130
|
-
model.load()
|
|
131
137
|
|
|
132
138
|
# getting data
|
|
133
139
|
if model.recurrent:
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""make_nullablee
|
|
2
|
+
|
|
3
|
+
Revision ID: 52b809a34371
|
|
4
|
+
Revises: 339927587383
|
|
5
|
+
Create Date: 2025-05-31 18:34:58.962966
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Sequence, Union
|
|
10
|
+
|
|
11
|
+
from alembic import op
|
|
12
|
+
import sqlalchemy as sa
|
|
13
|
+
from sqlalchemy.dialects import mysql
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision: str = "52b809a34371"
|
|
17
|
+
down_revision: Union[str, None] = "339927587383"
|
|
18
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
19
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
|
|
23
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
24
|
+
op.alter_column(
|
|
25
|
+
"investment_runs",
|
|
26
|
+
"initial_portfolio",
|
|
27
|
+
existing_type=mysql.JSON(),
|
|
28
|
+
nullable=True,
|
|
29
|
+
)
|
|
30
|
+
op.create_foreign_key(
|
|
31
|
+
None,
|
|
32
|
+
"portfolios",
|
|
33
|
+
"investment_runs",
|
|
34
|
+
["investment_run_id"],
|
|
35
|
+
["id"],
|
|
36
|
+
ondelete="CASCADE",
|
|
37
|
+
)
|
|
38
|
+
# ### end Alembic commands ###
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def downgrade() -> None:
|
|
42
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
43
|
+
op.drop_constraint(None, "portfolios", type_="foreignkey")
|
|
44
|
+
op.alter_column(
|
|
45
|
+
"investment_runs",
|
|
46
|
+
"initial_portfolio",
|
|
47
|
+
existing_type=mysql.JSON(),
|
|
48
|
+
nullable=False,
|
|
49
|
+
)
|
|
50
|
+
# ### end Alembic commands ###
|
|
@@ -4,6 +4,9 @@ from contextlib import contextmanager
|
|
|
4
4
|
from sqlalchemy import create_engine, text
|
|
5
5
|
from sqlalchemy.orm import sessionmaker
|
|
6
6
|
from urllib.parse import urlparse
|
|
7
|
+
from alembic.config import Config
|
|
8
|
+
from alembic import command
|
|
9
|
+
import os
|
|
7
10
|
|
|
8
11
|
from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_URI
|
|
9
12
|
|
|
@@ -39,6 +42,14 @@ def init_db(uri: str = None):
|
|
|
39
42
|
# Step 4: Create session factory
|
|
40
43
|
_SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_engine)
|
|
41
44
|
|
|
45
|
+
# Step 5: Apply Alembic migrations programmatically
|
|
46
|
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
|
|
47
|
+
alembic_cfg_path = os.path.join(project_root, "alembic.ini")
|
|
48
|
+
|
|
49
|
+
alembic_cfg = Config(alembic_cfg_path)
|
|
50
|
+
alembic_cfg.set_main_option("sqlalchemy.url", uri or os.getenv("DATABASE_URL"))
|
|
51
|
+
command.upgrade(alembic_cfg, "head")
|
|
52
|
+
|
|
42
53
|
|
|
43
54
|
# Dependency to get a session instance
|
|
44
55
|
@contextmanager
|
|
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
|
|
6
6
|
|
|
7
7
|
# Internal
|
|
8
|
-
from lecrapaud.directory_management import tmp_dir
|
|
8
|
+
from lecrapaud.directories import tmp_dir
|
|
9
9
|
from lecrapaud.utils import logger
|
|
10
10
|
from lecrapaud.config import PYTHON_ENV
|
|
11
11
|
from lecrapaud.db import (
|
|
@@ -101,7 +101,7 @@ class FeatureEngineeringEngine:
|
|
|
101
101
|
|
|
102
102
|
def run(self) -> pd.DataFrame:
|
|
103
103
|
# drop columns
|
|
104
|
-
self.data = self.data.drop(columns=self.columns_drop)
|
|
104
|
+
self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
|
|
105
105
|
|
|
106
106
|
# convert object columns to numeric if possible
|
|
107
107
|
self.data = convert_object_columns_that_are_numeric(self.data)
|
|
@@ -324,6 +324,8 @@ class PreprocessFeature:
|
|
|
324
324
|
**kwargs,
|
|
325
325
|
):
|
|
326
326
|
self.data = data
|
|
327
|
+
self.data.columns = self.data.columns.str.upper()
|
|
328
|
+
|
|
327
329
|
self.dataset = dataset
|
|
328
330
|
self.columns_pca = columns_pca
|
|
329
331
|
self.columns_onehot = columns_onehot
|
|
@@ -350,7 +352,7 @@ class PreprocessFeature:
|
|
|
350
352
|
self.train_val_test_split_time_series()
|
|
351
353
|
if self.time_series
|
|
352
354
|
else self.train_val_test_split(
|
|
353
|
-
stratify_col=f"
|
|
355
|
+
stratify_col=f"TARGET_{self.target_numbers[0]}"
|
|
354
356
|
)
|
|
355
357
|
) # TODO: only stratifying first target for now
|
|
356
358
|
|
|
@@ -359,8 +361,7 @@ class PreprocessFeature:
|
|
|
359
361
|
val, _ = self.add_pca_features(test, pcas=pcas)
|
|
360
362
|
test, _ = self.add_pca_features(val, pcas=pcas)
|
|
361
363
|
|
|
362
|
-
|
|
363
|
-
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
364
|
+
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
364
365
|
|
|
365
366
|
# Encoding
|
|
366
367
|
train, transformer = self.encode_categorical_features(train)
|
|
@@ -373,11 +374,10 @@ class PreprocessFeature:
|
|
|
373
374
|
transformer=transformer,
|
|
374
375
|
)
|
|
375
376
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
|
|
377
|
+
joblib.dump(self.data, f"{self.data_dir}/full.pkl")
|
|
378
|
+
joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
|
|
379
|
+
summary = summarize_dataframe(train)
|
|
380
|
+
summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
|
|
381
381
|
|
|
382
382
|
return train, val, test
|
|
383
383
|
|
|
@@ -579,8 +579,8 @@ class PreprocessFeature:
|
|
|
579
579
|
columns_ordinal: list[str] = self.columns_ordinal
|
|
580
580
|
columns_frequency: list[str] = self.columns_frequency
|
|
581
581
|
|
|
582
|
-
X = df.loc[:, ~df.columns.str.contains("^
|
|
583
|
-
y = df.loc[:, df.columns.str.contains("^
|
|
582
|
+
X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
|
|
583
|
+
y = df.loc[:, df.columns.str.contains("^TARGET_")]
|
|
584
584
|
save_in_db = False
|
|
585
585
|
|
|
586
586
|
all_columns = (
|
|
@@ -643,7 +643,6 @@ class PreprocessFeature:
|
|
|
643
643
|
|
|
644
644
|
# Try to convert columns to best possible dtypes
|
|
645
645
|
X_transformed = X_transformed.convert_dtypes()
|
|
646
|
-
X_transformed.columns = X_transformed.columns.str.upper()
|
|
647
646
|
|
|
648
647
|
# Insert features in db
|
|
649
648
|
if save_in_db:
|
|
@@ -37,7 +37,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
|
|
37
37
|
from scipy.stats import spearmanr, kendalltau
|
|
38
38
|
|
|
39
39
|
# Internal
|
|
40
|
-
from lecrapaud.directory_management import tmp_dir, clean_directory
|
|
40
|
+
from lecrapaud.directories import tmp_dir, clean_directory
|
|
41
41
|
from lecrapaud.utils import logger
|
|
42
42
|
from lecrapaud.config import PYTHON_ENV
|
|
43
43
|
from lecrapaud.db import (
|
|
@@ -50,10 +50,6 @@ from lecrapaud.db import (
|
|
|
50
50
|
from lecrapaud.db.session import get_db
|
|
51
51
|
from lecrapaud.search_space import all_models
|
|
52
52
|
|
|
53
|
-
# Variables for targets handling
|
|
54
|
-
TARGETS_MCLF = [11]
|
|
55
|
-
GROUPING_COLUMN = "STOCK"
|
|
56
|
-
|
|
57
53
|
# Annoying Warnings
|
|
58
54
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
|
59
55
|
|
|
@@ -103,7 +99,7 @@ class FeatureSelectionEngine:
|
|
|
103
99
|
# Main feature selection function
|
|
104
100
|
def run(
|
|
105
101
|
self,
|
|
106
|
-
single_process: bool =
|
|
102
|
+
single_process: bool = True,
|
|
107
103
|
):
|
|
108
104
|
"""Function to do feature selection with a range of different feature selection technics
|
|
109
105
|
|
|
@@ -114,10 +110,7 @@ class FeatureSelectionEngine:
|
|
|
114
110
|
"""
|
|
115
111
|
target_number = self.target_number
|
|
116
112
|
target_type = self.target_type
|
|
117
|
-
|
|
118
|
-
fs_dir_target = self.fs_dir_target
|
|
119
|
-
else:
|
|
120
|
-
fs_dir_target = None
|
|
113
|
+
fs_dir_target = self.fs_dir_target
|
|
121
114
|
|
|
122
115
|
# Create the feature selection in db
|
|
123
116
|
target = Target.find_by(name=f"TARGET_{target_number}")
|
|
@@ -162,7 +155,7 @@ class FeatureSelectionEngine:
|
|
|
162
155
|
# handling categorical features (only if classification)
|
|
163
156
|
self.X_categorical, self.X_numerical = get_features_by_types(self.X)
|
|
164
157
|
|
|
165
|
-
if target_type == "classification":
|
|
158
|
+
if target_type == "classification" and self.X_categorical.shape[1] > 0:
|
|
166
159
|
feat_scores = self.select_categorical_features(
|
|
167
160
|
percentile=percentile, save_dir=fs_dir_target
|
|
168
161
|
)
|
|
@@ -292,24 +285,22 @@ class FeatureSelectionEngine:
|
|
|
292
285
|
f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
|
|
293
286
|
)
|
|
294
287
|
logger.debug(features_selected_by_every_methods)
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
)
|
|
288
|
+
pd.Series(features_selected_list).to_csv(
|
|
289
|
+
f"{fs_dir_target}/features_before_corr.csv",
|
|
290
|
+
index=True,
|
|
291
|
+
header=True,
|
|
292
|
+
index_label="ID",
|
|
293
|
+
)
|
|
302
294
|
|
|
303
295
|
# removing correlated features
|
|
304
296
|
self.X = self.X[features_selected_list]
|
|
305
297
|
features, features_correlated = self.remove_correlated_features(corr_threshold)
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
)
|
|
298
|
+
pd.Series(features).to_csv(
|
|
299
|
+
f"{fs_dir_target}/features_before_max.csv",
|
|
300
|
+
index=True,
|
|
301
|
+
header=True,
|
|
302
|
+
index_label="ID",
|
|
303
|
+
)
|
|
313
304
|
features = features[:max_features]
|
|
314
305
|
|
|
315
306
|
# adding categorical features selected
|
|
@@ -337,8 +328,7 @@ class FeatureSelectionEngine:
|
|
|
337
328
|
best_features_path = Path(
|
|
338
329
|
f"{self.preprocessing_dir}/features_{target_number}.pkl"
|
|
339
330
|
).resolve()
|
|
340
|
-
|
|
341
|
-
joblib.dump(features, best_features_path)
|
|
331
|
+
joblib.dump(features, best_features_path)
|
|
342
332
|
|
|
343
333
|
# save in db
|
|
344
334
|
db_features = Feature.filter(name__in=features)
|
|
@@ -798,6 +788,7 @@ class PreprocessModel:
|
|
|
798
788
|
|
|
799
789
|
self.dataset_dir = dataset.path
|
|
800
790
|
self.data_dir = f"{self.dataset_dir}/data"
|
|
791
|
+
self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
|
|
801
792
|
|
|
802
793
|
self.all_features = dataset.get_all_features(
|
|
803
794
|
date_column=date_column, group_column=group_column
|
|
@@ -819,31 +810,23 @@ class PreprocessModel:
|
|
|
819
810
|
|
|
820
811
|
def run(self):
|
|
821
812
|
# save data
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
joblib.dump(self.test, f"{self.data_dir}/test.pkl")
|
|
826
|
-
preprocessing_dir = f"{self.dataset_dir}/preprocessing"
|
|
827
|
-
else:
|
|
828
|
-
preprocessing_dir = None
|
|
813
|
+
joblib.dump(self.train, f"{self.data_dir}/train.pkl")
|
|
814
|
+
joblib.dump(self.val, f"{self.data_dir}/val.pkl")
|
|
815
|
+
joblib.dump(self.test, f"{self.data_dir}/test.pkl")
|
|
829
816
|
|
|
830
817
|
# scaling features
|
|
831
818
|
if any(t not in self.target_clf for t in self.target_numbers) and any(
|
|
832
819
|
all_models[i].get("need_scaling") for i in self.models_idx
|
|
833
820
|
):
|
|
834
821
|
logger.info("Scaling features...")
|
|
835
|
-
train_scaled, scaler_x, scalers_y = self.scale_data(
|
|
836
|
-
self.train, save_dir=preprocessing_dir
|
|
837
|
-
)
|
|
822
|
+
train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
|
|
838
823
|
val_scaled, _, _ = self.scale_data(
|
|
839
824
|
self.val,
|
|
840
|
-
save_dir=preprocessing_dir,
|
|
841
825
|
scaler_x=scaler_x,
|
|
842
826
|
scalers_y=scalers_y,
|
|
843
827
|
)
|
|
844
828
|
test_scaled, _, _ = self.scale_data(
|
|
845
829
|
self.test,
|
|
846
|
-
save_dir=preprocessing_dir,
|
|
847
830
|
scaler_x=scaler_x,
|
|
848
831
|
scalers_y=scalers_y,
|
|
849
832
|
)
|
|
@@ -853,10 +836,9 @@ class PreprocessModel:
|
|
|
853
836
|
test_scaled = None
|
|
854
837
|
|
|
855
838
|
# save data
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
|
|
839
|
+
joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
|
|
840
|
+
joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
|
|
841
|
+
joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
|
|
860
842
|
|
|
861
843
|
data = {
|
|
862
844
|
"train": self.train,
|
|
@@ -923,7 +905,6 @@ class PreprocessModel:
|
|
|
923
905
|
def scale_data(
|
|
924
906
|
self,
|
|
925
907
|
df: pd.DataFrame,
|
|
926
|
-
save_dir: str,
|
|
927
908
|
scaler_x=None,
|
|
928
909
|
scalers_y: Optional[list] = None,
|
|
929
910
|
):
|
|
@@ -939,8 +920,7 @@ class PreprocessModel:
|
|
|
939
920
|
X_scaled = pd.DataFrame(
|
|
940
921
|
scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
|
|
941
922
|
)
|
|
942
|
-
|
|
943
|
-
joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
|
|
923
|
+
joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
|
|
944
924
|
|
|
945
925
|
# Determine which targets need to be scaled
|
|
946
926
|
targets_numbers_to_scale = [
|
|
@@ -969,8 +949,9 @@ class PreprocessModel:
|
|
|
969
949
|
columns=y.columns,
|
|
970
950
|
index=y.index,
|
|
971
951
|
)
|
|
972
|
-
|
|
973
|
-
|
|
952
|
+
joblib.dump(
|
|
953
|
+
scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
|
|
954
|
+
)
|
|
974
955
|
|
|
975
956
|
scalers_y[f"scaler_y_{target_number}"] = scaler_y
|
|
976
957
|
scaled_targets[target_number] = scaled_y
|