lecrapaud-0.5.1-py3-none-any.whl → lecrapaud-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- lecrapaud/api.py +84 -61
- lecrapaud/config.py +6 -2
- lecrapaud/db/alembic/versions/{2025_06_20_1924-1edada319fd7_initial_setup.py → 2025_06_23_1748-f089dfb7e3ba_.py} +20 -20
- lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +34 -0
- lecrapaud/db/models/__init__.py +14 -2
- lecrapaud/db/models/base.py +48 -2
- lecrapaud/db/models/{dataset.py → experiment.py} +23 -25
- lecrapaud/db/models/feature_selection.py +5 -5
- lecrapaud/db/models/model_selection.py +5 -5
- lecrapaud/db/models/score.py +3 -1
- lecrapaud/db/models/target.py +4 -4
- lecrapaud/db/session.py +4 -4
- lecrapaud/directories.py +0 -2
- lecrapaud/experiment.py +25 -18
- lecrapaud/feature_engineering.py +53 -24
- lecrapaud/feature_selection.py +41 -36
- lecrapaud/jobs/tasks.py +4 -4
- lecrapaud/model_selection.py +268 -261
- lecrapaud/search_space.py +23 -4
- lecrapaud/utils.py +2 -2
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/METADATA +2 -2
- lecrapaud-0.7.0.dist-info/RECORD +43 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +0 -71
- lecrapaud/services/indicators.py +0 -309
- lecrapaud/speed_tests/experiments.py +0 -139
- lecrapaud/speed_tests/trash.py +0 -37
- lecrapaud-0.5.1.dist-info/RECORD +0 -46
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.5.1.dist-info → lecrapaud-0.7.0.dist-info}/WHEEL +0 -0
|
@@ -60,9 +60,9 @@ class FeatureSelection(Base):
|
|
|
60
60
|
)
|
|
61
61
|
training_time = Column(Integer)
|
|
62
62
|
best_features_path = Column(String(255))
|
|
63
|
-
|
|
63
|
+
experiment_id = Column(
|
|
64
64
|
BigInteger,
|
|
65
|
-
ForeignKey("
|
|
65
|
+
ForeignKey("lecrapaud_experiments.id", ondelete="CASCADE"),
|
|
66
66
|
nullable=False,
|
|
67
67
|
)
|
|
68
68
|
target_id = Column(
|
|
@@ -71,8 +71,8 @@ class FeatureSelection(Base):
|
|
|
71
71
|
nullable=False,
|
|
72
72
|
)
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
"
|
|
74
|
+
experiment = relationship(
|
|
75
|
+
"Experiment", back_populates="feature_selections", lazy="selectin"
|
|
76
76
|
)
|
|
77
77
|
target = relationship(
|
|
78
78
|
"Target", back_populates="feature_selections", lazy="selectin"
|
|
@@ -92,7 +92,7 @@ class FeatureSelection(Base):
|
|
|
92
92
|
|
|
93
93
|
__table_args__ = (
|
|
94
94
|
UniqueConstraint(
|
|
95
|
-
"
|
|
95
|
+
"experiment_id", "target_id", name="uq_feature_selection_composite"
|
|
96
96
|
),
|
|
97
97
|
)
|
|
98
98
|
|
|
@@ -43,9 +43,9 @@ class ModelSelection(Base):
|
|
|
43
43
|
ForeignKey("lecrapaud_targets.id", ondelete="CASCADE"),
|
|
44
44
|
nullable=False,
|
|
45
45
|
)
|
|
46
|
-
|
|
46
|
+
experiment_id = Column(
|
|
47
47
|
BigInteger,
|
|
48
|
-
ForeignKey("
|
|
48
|
+
ForeignKey("lecrapaud_experiments.id", ondelete="CASCADE"),
|
|
49
49
|
nullable=False,
|
|
50
50
|
)
|
|
51
51
|
|
|
@@ -56,13 +56,13 @@ class ModelSelection(Base):
|
|
|
56
56
|
cascade="all, delete-orphan",
|
|
57
57
|
lazy="selectin",
|
|
58
58
|
)
|
|
59
|
-
|
|
60
|
-
"
|
|
59
|
+
experiment = relationship(
|
|
60
|
+
"Experiment", back_populates="model_selections", lazy="selectin"
|
|
61
61
|
)
|
|
62
62
|
target = relationship("Target", back_populates="model_selections", lazy="selectin")
|
|
63
63
|
|
|
64
64
|
__table_args__ = (
|
|
65
65
|
UniqueConstraint(
|
|
66
|
-
"target_id", "
|
|
66
|
+
"target_id", "experiment_id", name="uq_model_selection_composite"
|
|
67
67
|
),
|
|
68
68
|
)
|
lecrapaud/db/models/score.py
CHANGED
|
@@ -6,6 +6,7 @@ from sqlalchemy import (
|
|
|
6
6
|
ForeignKey,
|
|
7
7
|
BigInteger,
|
|
8
8
|
TIMESTAMP,
|
|
9
|
+
JSON,
|
|
9
10
|
)
|
|
10
11
|
from sqlalchemy import func
|
|
11
12
|
from sqlalchemy.orm import relationship
|
|
@@ -43,9 +44,10 @@ class Score(Base):
|
|
|
43
44
|
recall = Column(Float)
|
|
44
45
|
f1 = Column(Float)
|
|
45
46
|
roc_auc = Column(Float)
|
|
46
|
-
|
|
47
|
+
thresholds = Column(JSON)
|
|
47
48
|
precision_at_threshold = Column(Float)
|
|
48
49
|
recall_at_threshold = Column(Float)
|
|
50
|
+
f1_at_threshold = Column(Float)
|
|
49
51
|
model_training_id = Column(
|
|
50
52
|
BigInteger,
|
|
51
53
|
ForeignKey("lecrapaud_model_trainings.id", ondelete="CASCADE"),
|
lecrapaud/db/models/target.py
CHANGED
|
@@ -19,7 +19,7 @@ from sqlalchemy.orm import relationship, Mapped, mapped_column, DeclarativeBase
|
|
|
19
19
|
|
|
20
20
|
from lecrapaud.db.session import get_db
|
|
21
21
|
from lecrapaud.db.models.base import Base
|
|
22
|
-
from lecrapaud.db.models.
|
|
22
|
+
from lecrapaud.db.models.experiment import lecrapaud_experiment_target_association
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class Target(Base):
|
|
@@ -38,9 +38,9 @@ class Target(Base):
|
|
|
38
38
|
type = Column(String(50), nullable=False)
|
|
39
39
|
description = Column(String(255))
|
|
40
40
|
|
|
41
|
-
|
|
42
|
-
"
|
|
43
|
-
secondary=
|
|
41
|
+
experiments = relationship(
|
|
42
|
+
"Experiment",
|
|
43
|
+
secondary=lecrapaud_experiment_target_association,
|
|
44
44
|
back_populates="targets",
|
|
45
45
|
lazy="selectin",
|
|
46
46
|
)
|
lecrapaud/db/session.py
CHANGED
|
@@ -14,7 +14,7 @@ _engine = None
|
|
|
14
14
|
_SessionLocal = None
|
|
15
15
|
if DB_URI:
|
|
16
16
|
DATABASE_URL = DB_URI
|
|
17
|
-
elif DB_USER
|
|
17
|
+
elif DB_USER:
|
|
18
18
|
DATABASE_URL = (
|
|
19
19
|
f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
|
|
20
20
|
)
|
|
@@ -23,11 +23,10 @@ else:
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
def init_db(uri: str = None):
|
|
26
|
-
print(f"Initializing database with URI: {uri}")
|
|
27
26
|
global _engine, _SessionLocal, DATABASE_URL
|
|
28
27
|
if uri:
|
|
29
28
|
DATABASE_URL = uri
|
|
30
|
-
elif DB_USER
|
|
29
|
+
elif DB_USER:
|
|
31
30
|
DATABASE_URL = (
|
|
32
31
|
f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
|
|
33
32
|
)
|
|
@@ -39,6 +38,7 @@ def init_db(uri: str = None):
|
|
|
39
38
|
"DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME or DB_URI, "
|
|
40
39
|
"or provide a `uri` argument to LeCrapaud"
|
|
41
40
|
)
|
|
41
|
+
print(f"Initializing database with URI: {DATABASE_URL}")
|
|
42
42
|
|
|
43
43
|
# Use urlparse for robust parsing
|
|
44
44
|
parsed = urlparse(DATABASE_URL)
|
|
@@ -77,7 +77,7 @@ def init_db(uri: str = None):
|
|
|
77
77
|
@contextmanager
|
|
78
78
|
def get_db():
|
|
79
79
|
if _SessionLocal is None:
|
|
80
|
-
|
|
80
|
+
init_db()
|
|
81
81
|
db = _SessionLocal()
|
|
82
82
|
try:
|
|
83
83
|
yield db
|
lecrapaud/directories.py
CHANGED
lecrapaud/experiment.py
CHANGED
|
@@ -1,27 +1,25 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
1
|
import os
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# Set up coverage file path
|
|
5
7
|
os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
|
|
6
8
|
|
|
7
|
-
# Internal
|
|
9
|
+
# Internal imports
|
|
8
10
|
from lecrapaud.directories import tmp_dir
|
|
9
|
-
from lecrapaud.
|
|
10
|
-
from lecrapaud.config import PYTHON_ENV
|
|
11
|
-
from lecrapaud.db import (
|
|
12
|
-
Dataset,
|
|
13
|
-
Target,
|
|
14
|
-
)
|
|
11
|
+
from lecrapaud.db import Experiment, Target
|
|
15
12
|
from lecrapaud.db.session import get_db
|
|
16
13
|
|
|
17
14
|
|
|
18
|
-
def
|
|
15
|
+
def create_experiment(
|
|
19
16
|
data: pd.DataFrame,
|
|
20
17
|
corr_threshold,
|
|
21
18
|
percentile,
|
|
22
19
|
max_features,
|
|
23
20
|
date_column,
|
|
24
21
|
group_column,
|
|
22
|
+
experiment_name,
|
|
25
23
|
**kwargs,
|
|
26
24
|
):
|
|
27
25
|
dates = {}
|
|
@@ -37,20 +35,20 @@ def create_dataset(
|
|
|
37
35
|
with get_db() as db:
|
|
38
36
|
all_targets = Target.get_all(db=db)
|
|
39
37
|
targets = [target for target in all_targets if target.name in data.columns]
|
|
40
|
-
|
|
38
|
+
experiment_name = f"{experiment_name}_{groups["number_of_groups"] if group_column else 'ng'}_{corr_threshold}_{percentile}_{max_features}_{dates['start_date'].date() if date_column else 'nd'}_{dates['end_date'].date() if date_column else 'nd'}"
|
|
41
39
|
|
|
42
|
-
|
|
43
|
-
preprocessing_dir = f"{
|
|
44
|
-
data_dir = f"{
|
|
45
|
-
os.makedirs(
|
|
40
|
+
experiment_dir = f"{tmp_dir}/{experiment_name}"
|
|
41
|
+
preprocessing_dir = f"{experiment_dir}/preprocessing"
|
|
42
|
+
data_dir = f"{experiment_dir}/data"
|
|
43
|
+
os.makedirs(experiment_dir, exist_ok=True)
|
|
46
44
|
os.makedirs(preprocessing_dir, exist_ok=True)
|
|
47
45
|
os.makedirs(data_dir, exist_ok=True)
|
|
48
46
|
|
|
49
|
-
|
|
47
|
+
experiment = Experiment.upsert(
|
|
50
48
|
match_fields=["name"],
|
|
51
49
|
db=db,
|
|
52
|
-
name=
|
|
53
|
-
path=Path(
|
|
50
|
+
name=experiment_name,
|
|
51
|
+
path=Path(experiment_dir).resolve(),
|
|
54
52
|
type="training",
|
|
55
53
|
size=data.shape[0],
|
|
56
54
|
corr_threshold=corr_threshold,
|
|
@@ -59,6 +57,15 @@ def create_dataset(
|
|
|
59
57
|
**groups,
|
|
60
58
|
**dates,
|
|
61
59
|
targets=targets,
|
|
60
|
+
context={
|
|
61
|
+
"corr_threshold": corr_threshold,
|
|
62
|
+
"percentile": percentile,
|
|
63
|
+
"max_features": max_features,
|
|
64
|
+
"date_column": date_column,
|
|
65
|
+
"group_column": group_column,
|
|
66
|
+
"experiment_name": experiment_name,
|
|
67
|
+
**kwargs,
|
|
68
|
+
},
|
|
62
69
|
)
|
|
63
70
|
|
|
64
|
-
return
|
|
71
|
+
return experiment
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -61,7 +61,7 @@ from lecrapaud.integrations.openai_integration import (
|
|
|
61
61
|
)
|
|
62
62
|
from lecrapaud.feature_selection import get_features_by_types
|
|
63
63
|
from lecrapaud.utils import logger
|
|
64
|
-
from lecrapaud.db import Target, Feature,
|
|
64
|
+
from lecrapaud.db import Target, Feature, Experiment
|
|
65
65
|
from lecrapaud.config import PYTHON_ENV
|
|
66
66
|
|
|
67
67
|
|
|
@@ -308,7 +308,7 @@ class PreprocessFeature:
|
|
|
308
308
|
def __init__(
|
|
309
309
|
self,
|
|
310
310
|
data: pd.DataFrame,
|
|
311
|
-
|
|
311
|
+
experiment,
|
|
312
312
|
time_series: bool = False,
|
|
313
313
|
date_column: str | None = None,
|
|
314
314
|
group_column: str | None = None,
|
|
@@ -326,7 +326,7 @@ class PreprocessFeature:
|
|
|
326
326
|
self.data = data
|
|
327
327
|
self.data.columns = self.data.columns.str.upper()
|
|
328
328
|
|
|
329
|
-
self.
|
|
329
|
+
self.experiment = experiment
|
|
330
330
|
self.columns_pca = columns_pca
|
|
331
331
|
self.columns_onehot = columns_onehot
|
|
332
332
|
self.columns_binary = columns_binary
|
|
@@ -341,10 +341,10 @@ class PreprocessFeature:
|
|
|
341
341
|
self.val_size = val_size
|
|
342
342
|
self.test_size = test_size
|
|
343
343
|
|
|
344
|
-
self.
|
|
345
|
-
self.
|
|
346
|
-
self.data_dir = f"{self.
|
|
347
|
-
self.preprocessing_dir = f"{self.
|
|
344
|
+
self.experiment_dir = self.experiment.path
|
|
345
|
+
self.experiment_id = self.experiment.id
|
|
346
|
+
self.data_dir = f"{self.experiment_dir}/data"
|
|
347
|
+
self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
|
|
348
348
|
|
|
349
349
|
def run(self):
|
|
350
350
|
# Split
|
|
@@ -358,8 +358,8 @@ class PreprocessFeature:
|
|
|
358
358
|
|
|
359
359
|
# PCA
|
|
360
360
|
train, pcas = self.add_pca_features(train)
|
|
361
|
-
val, _ = self.add_pca_features(
|
|
362
|
-
test, _ = self.add_pca_features(
|
|
361
|
+
val, _ = self.add_pca_features(val, pcas=pcas)
|
|
362
|
+
test, _ = self.add_pca_features(test, pcas=pcas)
|
|
363
363
|
|
|
364
364
|
joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
|
|
365
365
|
|
|
@@ -377,7 +377,7 @@ class PreprocessFeature:
|
|
|
377
377
|
joblib.dump(self.data, f"{self.data_dir}/full.pkl")
|
|
378
378
|
joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
|
|
379
379
|
summary = summarize_dataframe(train)
|
|
380
|
-
summary.to_csv(f"{self.
|
|
380
|
+
summary.to_csv(f"{self.experiment_dir}/feature_summary.csv", index=False)
|
|
381
381
|
|
|
382
382
|
return train, val, test
|
|
383
383
|
|
|
@@ -431,9 +431,9 @@ class PreprocessFeature:
|
|
|
431
431
|
f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
|
|
432
432
|
)
|
|
433
433
|
|
|
434
|
-
|
|
434
|
+
Experiment.upsert(
|
|
435
435
|
match_fields=["id"],
|
|
436
|
-
id=self.
|
|
436
|
+
id=self.experiment_id,
|
|
437
437
|
train_size=len(train),
|
|
438
438
|
val_size=len(val),
|
|
439
439
|
test_size=len(test),
|
|
@@ -454,7 +454,7 @@ class PreprocessFeature:
|
|
|
454
454
|
Splits a DataFrame into train, validation, and test sets.
|
|
455
455
|
|
|
456
456
|
Parameters:
|
|
457
|
-
df (pd.DataFrame): The full
|
|
457
|
+
df (pd.DataFrame): The full experiment
|
|
458
458
|
val_size (float): Proportion of validation set (default 0.1)
|
|
459
459
|
test_size (float): Proportion of test set (default 0.1)
|
|
460
460
|
random_state (int): Random seed for reproducibility
|
|
@@ -646,23 +646,52 @@ class PreprocessFeature:
|
|
|
646
646
|
|
|
647
647
|
# Insert features in db
|
|
648
648
|
if save_in_db:
|
|
649
|
-
#
|
|
649
|
+
# Get feature types from transformed data
|
|
650
650
|
categorical_features, numerical_features = get_features_by_types(
|
|
651
651
|
X_transformed
|
|
652
652
|
)
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
653
|
+
|
|
654
|
+
# Get column names from DataFrames
|
|
655
|
+
cat_feature_names = categorical_features.columns.tolist()
|
|
656
|
+
num_feature_names = numerical_features.columns.tolist()
|
|
657
|
+
|
|
658
|
+
# Combine all feature names and their types
|
|
659
|
+
all_feature_names = cat_feature_names + num_feature_names
|
|
660
|
+
all_feature_types = ["categorical"] * len(cat_feature_names) + [
|
|
661
|
+
"numerical"
|
|
662
|
+
] * len(num_feature_names)
|
|
663
|
+
|
|
664
|
+
# Upsert features in bulk if we have any features
|
|
665
|
+
if all_feature_names:
|
|
666
|
+
Feature.upsert_bulk(
|
|
667
|
+
match_fields=["name"],
|
|
668
|
+
name=all_feature_names,
|
|
669
|
+
type=all_feature_types,
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
# Upsert targets in bulk
|
|
673
|
+
target_names = y.columns.tolist()
|
|
674
|
+
target_types = [
|
|
675
|
+
(
|
|
660
676
|
"classification"
|
|
661
|
-
if
|
|
677
|
+
if int(target.split("_")[1]) in self.target_clf
|
|
662
678
|
else "regression"
|
|
663
679
|
)
|
|
664
|
-
|
|
665
|
-
|
|
680
|
+
for target in target_names
|
|
681
|
+
]
|
|
682
|
+
|
|
683
|
+
Target.upsert_bulk(
|
|
684
|
+
match_fields=["name"], name=target_names, type=target_types
|
|
685
|
+
)
|
|
686
|
+
|
|
687
|
+
# Get all the upserted objects
|
|
688
|
+
targets = Target.filter(name__in=target_names)
|
|
689
|
+
|
|
690
|
+
# Update experiment with targets
|
|
691
|
+
experiment = Experiment.get(self.experiment_id)
|
|
692
|
+
if experiment:
|
|
693
|
+
experiment.targets = targets
|
|
694
|
+
experiment.save()
|
|
666
695
|
|
|
667
696
|
return pd.concat([X_transformed, y], axis=1), transformer
|
|
668
697
|
|
lecrapaud/feature_selection.py
CHANGED
|
@@ -41,7 +41,7 @@ from lecrapaud.directories import tmp_dir, clean_directory
|
|
|
41
41
|
from lecrapaud.utils import logger
|
|
42
42
|
from lecrapaud.config import PYTHON_ENV
|
|
43
43
|
from lecrapaud.db import (
|
|
44
|
-
|
|
44
|
+
Experiment,
|
|
45
45
|
Target,
|
|
46
46
|
Feature,
|
|
47
47
|
FeatureSelection,
|
|
@@ -54,8 +54,8 @@ from lecrapaud.search_space import all_models
|
|
|
54
54
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
|
55
55
|
|
|
56
56
|
|
|
57
|
-
def load_train_data(
|
|
58
|
-
data_dir = f"{
|
|
57
|
+
def load_train_data(experiment_dir, target_number, target_type="regression"):
|
|
58
|
+
data_dir = f"{experiment_dir}/data"
|
|
59
59
|
|
|
60
60
|
logger.info("Loading data...")
|
|
61
61
|
train = joblib.load(f"{data_dir}/train.pkl")
|
|
@@ -74,8 +74,8 @@ def load_train_data(dataset_dir, target_number, target_type="regression"):
|
|
|
74
74
|
|
|
75
75
|
|
|
76
76
|
class FeatureSelectionEngine:
|
|
77
|
-
def __init__(self, train,
|
|
78
|
-
self.
|
|
77
|
+
def __init__(self, train, experiment, target_number, target_clf, **kwargs):
|
|
78
|
+
self.experiment = experiment
|
|
79
79
|
self.train = train
|
|
80
80
|
self.target_number = target_number
|
|
81
81
|
self.target_clf = target_clf
|
|
@@ -83,16 +83,16 @@ class FeatureSelectionEngine:
|
|
|
83
83
|
self.target_type = (
|
|
84
84
|
"classification" if self.target_number in self.target_clf else "regression"
|
|
85
85
|
)
|
|
86
|
-
self.percentile = self.
|
|
87
|
-
self.corr_threshold = self.
|
|
88
|
-
self.max_features = self.
|
|
89
|
-
|
|
90
|
-
self.
|
|
91
|
-
self.
|
|
92
|
-
self.data_dir = f"{self.
|
|
93
|
-
self.preprocessing_dir = f"{self.
|
|
86
|
+
self.percentile = self.experiment.percentile
|
|
87
|
+
self.corr_threshold = self.experiment.corr_threshold
|
|
88
|
+
self.max_features = self.experiment.max_features
|
|
89
|
+
|
|
90
|
+
self.experiment_dir = self.experiment.path
|
|
91
|
+
self.experiment_id = self.experiment.id
|
|
92
|
+
self.data_dir = f"{self.experiment_dir}/data"
|
|
93
|
+
self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
|
|
94
94
|
self.fs_dir_target = (
|
|
95
|
-
f"{self.
|
|
95
|
+
f"{self.experiment_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
|
|
96
96
|
)
|
|
97
97
|
os.makedirs(self.fs_dir_target, exist_ok=True)
|
|
98
98
|
|
|
@@ -119,12 +119,14 @@ class FeatureSelectionEngine:
|
|
|
119
119
|
max_features = self.max_features
|
|
120
120
|
|
|
121
121
|
feature_selection = FeatureSelection.upsert(
|
|
122
|
-
match_fields=["target_id", "
|
|
122
|
+
match_fields=["target_id", "experiment_id"],
|
|
123
123
|
target_id=target.id,
|
|
124
|
-
|
|
124
|
+
experiment_id=self.experiment_id,
|
|
125
125
|
)
|
|
126
126
|
|
|
127
|
-
if feature_selection.best_features_path
|
|
127
|
+
if feature_selection.best_features_path and os.path.exists(
|
|
128
|
+
feature_selection.best_features_path
|
|
129
|
+
):
|
|
128
130
|
return joblib.load(feature_selection.best_features_path)
|
|
129
131
|
|
|
130
132
|
self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
|
|
@@ -767,7 +769,7 @@ class PreprocessModel:
|
|
|
767
769
|
train,
|
|
768
770
|
val,
|
|
769
771
|
test,
|
|
770
|
-
|
|
772
|
+
experiment,
|
|
771
773
|
target_numbers,
|
|
772
774
|
target_clf,
|
|
773
775
|
models_idx,
|
|
@@ -777,7 +779,10 @@ class PreprocessModel:
|
|
|
777
779
|
date_column,
|
|
778
780
|
**kwargs,
|
|
779
781
|
):
|
|
780
|
-
self.
|
|
782
|
+
self.train = train
|
|
783
|
+
self.val = val
|
|
784
|
+
self.test = test
|
|
785
|
+
self.experiment = experiment
|
|
781
786
|
self.target_numbers = target_numbers
|
|
782
787
|
self.target_clf = target_clf
|
|
783
788
|
self.models_idx = models_idx
|
|
@@ -786,13 +791,16 @@ class PreprocessModel:
|
|
|
786
791
|
self.group_column = group_column
|
|
787
792
|
self.date_column = date_column
|
|
788
793
|
|
|
789
|
-
self.
|
|
790
|
-
self.data_dir = f"{self.
|
|
791
|
-
self.preprocessing_dir = f"{self.
|
|
794
|
+
self.experiment_dir = experiment.path
|
|
795
|
+
self.data_dir = f"{self.experiment_dir}/data"
|
|
796
|
+
self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
|
|
792
797
|
|
|
793
|
-
self.all_features =
|
|
798
|
+
self.all_features = experiment.get_all_features(
|
|
794
799
|
date_column=date_column, group_column=group_column
|
|
795
800
|
)
|
|
801
|
+
|
|
802
|
+
def run(self):
|
|
803
|
+
# save data
|
|
796
804
|
columns_to_keep = self.all_features + [
|
|
797
805
|
f"TARGET_{i}" for i in self.target_numbers
|
|
798
806
|
]
|
|
@@ -801,15 +809,9 @@ class PreprocessModel:
|
|
|
801
809
|
]
|
|
802
810
|
if duplicates:
|
|
803
811
|
raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
|
|
804
|
-
|
|
805
|
-
self.
|
|
806
|
-
|
|
807
|
-
self.val = val[columns_to_keep]
|
|
808
|
-
if isinstance(test, pd.DataFrame):
|
|
809
|
-
self.test = test[columns_to_keep]
|
|
810
|
-
|
|
811
|
-
def run(self):
|
|
812
|
-
# save data
|
|
812
|
+
self.train = self.train[columns_to_keep]
|
|
813
|
+
self.val = self.val[columns_to_keep]
|
|
814
|
+
self.test = self.test[columns_to_keep]
|
|
813
815
|
joblib.dump(self.train, f"{self.data_dir}/train.pkl")
|
|
814
816
|
joblib.dump(self.val, f"{self.data_dir}/val.pkl")
|
|
815
817
|
joblib.dump(self.test, f"{self.data_dir}/test.pkl")
|
|
@@ -870,8 +872,11 @@ class PreprocessModel:
|
|
|
870
872
|
|
|
871
873
|
def inference(self):
|
|
872
874
|
# self.train is new data here
|
|
875
|
+
columns_to_keep = self.all_features
|
|
876
|
+
self.train = self.train[columns_to_keep]
|
|
877
|
+
|
|
873
878
|
scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
|
|
874
|
-
scaled_data = scaler_x.transform(self.train)
|
|
879
|
+
scaled_data = scaler_x.transform(self.train) # TODO: utiliser scale_data
|
|
875
880
|
scaled_data = pd.DataFrame(
|
|
876
881
|
scaled_data, columns=self.train.columns, index=self.train.index
|
|
877
882
|
)
|
|
@@ -1012,7 +1017,7 @@ class PreprocessModel:
|
|
|
1012
1017
|
df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
|
|
1013
1018
|
df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
|
|
1014
1019
|
|
|
1015
|
-
logger.info("Grouping
|
|
1020
|
+
logger.info("Grouping features and creating timesteps...")
|
|
1016
1021
|
df_reshaped = (
|
|
1017
1022
|
df_reshaped.groupby(group_column)[0]
|
|
1018
1023
|
.apply(lambda x: shiftsum(x, timesteps))
|
|
@@ -1058,13 +1063,13 @@ class PreprocessModel:
|
|
|
1058
1063
|
def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
|
|
1059
1064
|
|
|
1060
1065
|
feature_selection = FeatureSelection.get(feature_selection_id)
|
|
1061
|
-
|
|
1066
|
+
experiment_dir = feature_selection.experiment.path
|
|
1062
1067
|
features = [f.name for f in feature_selection.features]
|
|
1063
1068
|
target = feature_selection.target.name
|
|
1064
1069
|
target_number = target.split("_")[1]
|
|
1065
1070
|
|
|
1066
1071
|
train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
|
|
1067
|
-
|
|
1072
|
+
experiment_dir, target_number, target_type=feature_selection.target.type
|
|
1068
1073
|
)
|
|
1069
1074
|
train = train[features + [target]]
|
|
1070
1075
|
train_scaled = train_scaled[features + [target]]
|
lecrapaud/jobs/tasks.py
CHANGED
|
@@ -2,7 +2,7 @@ from lecrapaud.jobs import app
|
|
|
2
2
|
|
|
3
3
|
# from honeybadger import honeybadger
|
|
4
4
|
from lecrapaud.send_daily_emails import send_daily_emails
|
|
5
|
-
from lecrapaud.config import
|
|
5
|
+
from lecrapaud.config import EXPERIMENT_ID, RECEIVER_EMAIL
|
|
6
6
|
from lecrapaud.training import run_training
|
|
7
7
|
from lecrapaud.constants import stock_list_3
|
|
8
8
|
from lecrapaud.search_space import get_models_idx
|
|
@@ -18,9 +18,9 @@ from lecrapaud.search_space import get_models_idx
|
|
|
18
18
|
def task_send_daily_emails(self):
|
|
19
19
|
try:
|
|
20
20
|
print(f"[Attempt #{self.request.retries}] task_send_daily_emails")
|
|
21
|
-
|
|
21
|
+
experiment_id = int(EXPERIMENT_ID)
|
|
22
22
|
email = RECEIVER_EMAIL
|
|
23
|
-
return send_daily_emails(email,
|
|
23
|
+
return send_daily_emails(email, experiment_id)
|
|
24
24
|
except Exception as e:
|
|
25
25
|
print(e)
|
|
26
26
|
# honeybadger.notify(e)
|
|
@@ -49,7 +49,7 @@ def task_training_experiment(self):
|
|
|
49
49
|
perform_hyperoptimization=True,
|
|
50
50
|
perform_crossval=False,
|
|
51
51
|
preserve_model=False,
|
|
52
|
-
|
|
52
|
+
experiment_name="20y_stock_list_3_linear_xgb",
|
|
53
53
|
)
|
|
54
54
|
except Exception as e:
|
|
55
55
|
print(e)
|