lecrapaud 0.20.0__py3-none-any.whl → 0.20.2__py3-none-any.whl
This diff shows the content changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud has been flagged as potentially problematic; consult the package's registry page for the full advisory details.
- lecrapaud/api.py +11 -49
- lecrapaud/config.py +3 -2
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +42 -0
- lecrapaud/db/models/experiment.py +48 -75
- lecrapaud/experiment.py +13 -15
- lecrapaud/feature_engineering.py +28 -40
- lecrapaud/feature_selection.py +90 -21
- lecrapaud/model_selection.py +24 -30
- lecrapaud/utils.py +4 -4
- lecrapaud-0.20.2.dist-info/METADATA +344 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/RECORD +13 -12
- lecrapaud-0.20.0.dist-info/METADATA +0 -250
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/WHEEL +0 -0
- {lecrapaud-0.20.0.dist-info → lecrapaud-0.20.2.dist-info}/licenses/LICENSE +0 -0
lecrapaud/api.py
CHANGED
|
@@ -165,6 +165,12 @@ class ExperimentEngine:
|
|
|
165
165
|
|
|
166
166
|
def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
|
|
167
167
|
"""Initialize the experiment engine with either new or existing experiment."""
|
|
168
|
+
# Set all kwargs as instance attributes
|
|
169
|
+
if "models_idx" in kwargs:
|
|
170
|
+
kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
|
|
171
|
+
for key, value in kwargs.items():
|
|
172
|
+
setattr(self, key, value)
|
|
173
|
+
|
|
168
174
|
if id:
|
|
169
175
|
self.experiment = Experiment.get(id)
|
|
170
176
|
kwargs.update(self.experiment.context)
|
|
@@ -180,12 +186,6 @@ class ExperimentEngine:
|
|
|
180
186
|
)
|
|
181
187
|
self.experiment = create_experiment(data=data, **kwargs)
|
|
182
188
|
|
|
183
|
-
# Set all kwargs as instance attributes
|
|
184
|
-
for key, value in kwargs.items():
|
|
185
|
-
if key == "models_idx":
|
|
186
|
-
value = normalize_models_idx(value)
|
|
187
|
-
setattr(self, key, value)
|
|
188
|
-
|
|
189
189
|
def train(self, data, best_params=None):
|
|
190
190
|
logger.info("Running training...")
|
|
191
191
|
|
|
@@ -309,12 +309,8 @@ class ExperimentEngine:
|
|
|
309
309
|
def feature_engineering(self, data, for_training=True):
|
|
310
310
|
app = FeatureEngineeringEngine(
|
|
311
311
|
data=data,
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
columns_date=getattr(self, "columns_date", []),
|
|
315
|
-
columns_te_groupby=getattr(self, "columns_te_groupby", []),
|
|
316
|
-
columns_te_target=getattr(self, "columns_te_target", []),
|
|
317
|
-
for_training=getattr(self, "for_training", True),
|
|
312
|
+
experiment=self.experiment,
|
|
313
|
+
for_training=for_training,
|
|
318
314
|
)
|
|
319
315
|
data = app.run()
|
|
320
316
|
return data
|
|
@@ -322,21 +318,7 @@ class ExperimentEngine:
|
|
|
322
318
|
def preprocess_feature(self, data, for_training=True):
|
|
323
319
|
app = PreprocessFeature(
|
|
324
320
|
data=data,
|
|
325
|
-
experiment=
|
|
326
|
-
time_series=getattr(self, "time_series", False),
|
|
327
|
-
date_column=getattr(self, "date_column", None),
|
|
328
|
-
group_column=getattr(self, "group_column", None),
|
|
329
|
-
val_size=getattr(self, "val_size", 0.2),
|
|
330
|
-
test_size=getattr(self, "test_size", 0.2),
|
|
331
|
-
columns_pca=getattr(self, "columns_pca", []),
|
|
332
|
-
pca_temporal=getattr(self, "pca_temporal", []),
|
|
333
|
-
pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
|
|
334
|
-
columns_onehot=getattr(self, "columns_onehot", []),
|
|
335
|
-
columns_binary=getattr(self, "columns_binary", []),
|
|
336
|
-
columns_ordinal=getattr(self, "columns_ordinal", []),
|
|
337
|
-
columns_frequency=getattr(self, "columns_frequency", []),
|
|
338
|
-
target_numbers=getattr(self, "target_numbers", []),
|
|
339
|
-
target_clf=getattr(self, "target_clf", []),
|
|
321
|
+
experiment=self.experiment,
|
|
340
322
|
)
|
|
341
323
|
if for_training:
|
|
342
324
|
train, val, test = app.run()
|
|
@@ -351,7 +333,6 @@ class ExperimentEngine:
|
|
|
351
333
|
train=train,
|
|
352
334
|
target_number=target_number,
|
|
353
335
|
experiment=self.experiment,
|
|
354
|
-
target_clf=self.target_clf,
|
|
355
336
|
)
|
|
356
337
|
app.run()
|
|
357
338
|
self.experiment = Experiment.get(self.experiment.id)
|
|
@@ -368,14 +349,7 @@ class ExperimentEngine:
|
|
|
368
349
|
train=train,
|
|
369
350
|
val=val,
|
|
370
351
|
test=test,
|
|
371
|
-
experiment=
|
|
372
|
-
target_numbers=getattr(self, "target_numbers", []),
|
|
373
|
-
target_clf=getattr(self, "target_clf", []),
|
|
374
|
-
models_idx=getattr(self, "models_idx", []),
|
|
375
|
-
time_series=getattr(self, "time_series", False),
|
|
376
|
-
max_timesteps=getattr(self, "max_timesteps", 120),
|
|
377
|
-
date_column=getattr(self, "date_column", None),
|
|
378
|
-
group_column=getattr(self, "group_column", None),
|
|
352
|
+
experiment=self.experiment,
|
|
379
353
|
)
|
|
380
354
|
if for_training:
|
|
381
355
|
data, reshaped_data = app.run()
|
|
@@ -390,25 +364,13 @@ class ExperimentEngine:
|
|
|
390
364
|
data=data,
|
|
391
365
|
reshaped_data=reshaped_data,
|
|
392
366
|
target_number=target_number,
|
|
393
|
-
experiment=
|
|
394
|
-
target_clf=getattr(self, "target_clf", []),
|
|
395
|
-
models_idx=getattr(self, "models_idx", []),
|
|
396
|
-
time_series=getattr(self, "time_series", False),
|
|
397
|
-
date_column=getattr(self, "date_column", None),
|
|
398
|
-
group_column=getattr(self, "group_column", None),
|
|
399
|
-
target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
|
|
367
|
+
experiment=self.experiment,
|
|
400
368
|
)
|
|
401
369
|
if best_params and target_number not in best_params.keys():
|
|
402
370
|
raise ValueError(
|
|
403
371
|
f"Target {target_number} not found in best_params passed as argument"
|
|
404
372
|
)
|
|
405
373
|
app.run(
|
|
406
|
-
self.experiment_name,
|
|
407
|
-
perform_hyperopt=self.perform_hyperopt,
|
|
408
|
-
number_of_trials=self.number_of_trials,
|
|
409
|
-
perform_crossval=self.perform_crossval,
|
|
410
|
-
plot=self.plot,
|
|
411
|
-
preserve_model=self.preserve_model,
|
|
412
374
|
best_params=best_params[target_number] if best_params else None,
|
|
413
375
|
)
|
|
414
376
|
|
lecrapaud/config.py
CHANGED
|
@@ -32,6 +32,7 @@ DB_URI: str = (
|
|
|
32
32
|
)
|
|
33
33
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
34
34
|
LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
|
|
35
|
-
LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)
|
|
36
35
|
LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
|
|
37
|
-
LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
|
|
36
|
+
LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
|
|
37
|
+
"LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
|
|
38
|
+
).lower()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""add number_of_targets and remove other fields from experiments
|
|
2
|
+
|
|
3
|
+
Revision ID: 0a8fb7826e9b
|
|
4
|
+
Revises: 033e0f7eca4f
|
|
5
|
+
Create Date: 2025-10-28 20:06:54.792631
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from typing import Sequence, Union
|
|
9
|
+
|
|
10
|
+
from alembic import op
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
from sqlalchemy.dialects import mysql
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision: str = '0a8fb7826e9b'
|
|
16
|
+
down_revision: Union[str, None] = '033e0f7eca4f'
|
|
17
|
+
branch_labels: Union[str, Sequence[str], None] = None
|
|
18
|
+
depends_on: Union[str, Sequence[str], None] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
|
|
24
|
+
op.drop_column('lecrapaud_experiments', 'corr_threshold')
|
|
25
|
+
op.drop_column('lecrapaud_experiments', 'max_features')
|
|
26
|
+
op.drop_column('lecrapaud_experiments', 'percentile')
|
|
27
|
+
op.drop_column('lecrapaud_experiments', 'type')
|
|
28
|
+
op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
|
|
29
|
+
op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
|
|
30
|
+
# ### end Alembic commands ###
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def downgrade() -> None:
|
|
34
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
35
|
+
op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
|
|
36
|
+
op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
|
|
37
|
+
op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
|
|
38
|
+
op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
|
|
39
|
+
op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
|
|
40
|
+
op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
|
|
41
|
+
op.drop_column('lecrapaud_experiments', 'number_of_targets')
|
|
42
|
+
# ### end Alembic commands ###
|
|
@@ -50,10 +50,43 @@ class Experiment(Base):
|
|
|
50
50
|
)
|
|
51
51
|
name = Column(String(255), nullable=False)
|
|
52
52
|
path = Column(String(255)) # we do not have this at creation time
|
|
53
|
-
type = Column(String(50), nullable=False)
|
|
54
53
|
size = Column(Integer, nullable=False)
|
|
55
54
|
train_size = Column(Integer)
|
|
56
55
|
val_size = Column(Integer)
|
|
56
|
+
test_size = Column(Integer)
|
|
57
|
+
number_of_groups = Column(Integer)
|
|
58
|
+
list_of_groups = Column(JSON)
|
|
59
|
+
number_of_targets = Column(Integer)
|
|
60
|
+
start_date = Column(DateTime)
|
|
61
|
+
end_date = Column(DateTime)
|
|
62
|
+
train_start_date = Column(DateTime)
|
|
63
|
+
train_end_date = Column(DateTime)
|
|
64
|
+
val_start_date = Column(DateTime)
|
|
65
|
+
val_end_date = Column(DateTime)
|
|
66
|
+
test_start_date = Column(DateTime)
|
|
67
|
+
test_end_date = Column(DateTime)
|
|
68
|
+
context = Column(JSON)
|
|
69
|
+
|
|
70
|
+
feature_selections = relationship(
|
|
71
|
+
"FeatureSelection",
|
|
72
|
+
back_populates="experiment",
|
|
73
|
+
cascade="all, delete-orphan",
|
|
74
|
+
lazy="selectin",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
targets = relationship(
|
|
78
|
+
"Target",
|
|
79
|
+
secondary=lecrapaud_experiment_target_association,
|
|
80
|
+
back_populates="experiments",
|
|
81
|
+
lazy="selectin",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
__table_args__ = (
|
|
85
|
+
UniqueConstraint(
|
|
86
|
+
"name",
|
|
87
|
+
name="uq_experiments_composite",
|
|
88
|
+
),
|
|
89
|
+
)
|
|
57
90
|
|
|
58
91
|
# Relationships
|
|
59
92
|
model_selections = relationship(
|
|
@@ -68,16 +101,9 @@ class Experiment(Base):
|
|
|
68
101
|
"""Best RMSE score across all model selections and trainings."""
|
|
69
102
|
# Get the minimum RMSE for each model selection
|
|
70
103
|
min_scores = [
|
|
71
|
-
min(
|
|
72
|
-
mss.rmse
|
|
73
|
-
for mss in ms.model_selection_scores
|
|
74
|
-
if mss.rmse is not None
|
|
75
|
-
)
|
|
104
|
+
min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
|
|
76
105
|
for ms in self.model_selections
|
|
77
|
-
if any(
|
|
78
|
-
mss.rmse is not None
|
|
79
|
-
for mss in ms.model_selection_scores
|
|
80
|
-
)
|
|
106
|
+
if any(mss.rmse is not None for mss in ms.model_selection_scores)
|
|
81
107
|
]
|
|
82
108
|
return min(min_scores) if min_scores else None
|
|
83
109
|
|
|
@@ -92,10 +118,7 @@ class Experiment(Base):
|
|
|
92
118
|
if mss.logloss is not None
|
|
93
119
|
)
|
|
94
120
|
for ms in self.model_selections
|
|
95
|
-
if any(
|
|
96
|
-
mss.logloss is not None
|
|
97
|
-
for mss in ms.model_selection_scores
|
|
98
|
-
)
|
|
121
|
+
if any(mss.logloss is not None for mss in ms.model_selection_scores)
|
|
99
122
|
]
|
|
100
123
|
return min(min_scores) if min_scores else None
|
|
101
124
|
|
|
@@ -104,16 +127,9 @@ class Experiment(Base):
|
|
|
104
127
|
"""Average RMSE score across all model selections and trainings."""
|
|
105
128
|
# Get the minimum RMSE for each model selection
|
|
106
129
|
min_scores = [
|
|
107
|
-
min(
|
|
108
|
-
mss.rmse
|
|
109
|
-
for mss in ms.model_selection_scores
|
|
110
|
-
if mss.rmse is not None
|
|
111
|
-
)
|
|
130
|
+
min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
|
|
112
131
|
for ms in self.model_selections
|
|
113
|
-
if any(
|
|
114
|
-
mss.rmse is not None
|
|
115
|
-
for mss in ms.model_selection_scores
|
|
116
|
-
)
|
|
132
|
+
if any(mss.rmse is not None for mss in ms.model_selection_scores)
|
|
117
133
|
]
|
|
118
134
|
return mean(min_scores) if min_scores else None
|
|
119
135
|
|
|
@@ -128,50 +144,10 @@ class Experiment(Base):
|
|
|
128
144
|
if mss.logloss is not None
|
|
129
145
|
)
|
|
130
146
|
for ms in self.model_selections
|
|
131
|
-
if any(
|
|
132
|
-
mss.logloss is not None
|
|
133
|
-
for mss in ms.model_selection_scores
|
|
134
|
-
)
|
|
147
|
+
if any(mss.logloss is not None for mss in ms.model_selection_scores)
|
|
135
148
|
]
|
|
136
149
|
return mean(min_scores) if min_scores else None
|
|
137
150
|
|
|
138
|
-
test_size = Column(Integer)
|
|
139
|
-
corr_threshold = Column(Float, nullable=False)
|
|
140
|
-
max_features = Column(Integer, nullable=False)
|
|
141
|
-
percentile = Column(Float, nullable=False)
|
|
142
|
-
number_of_groups = Column(Integer)
|
|
143
|
-
list_of_groups = Column(JSON)
|
|
144
|
-
start_date = Column(DateTime)
|
|
145
|
-
end_date = Column(DateTime)
|
|
146
|
-
train_start_date = Column(DateTime)
|
|
147
|
-
train_end_date = Column(DateTime)
|
|
148
|
-
val_start_date = Column(DateTime)
|
|
149
|
-
val_end_date = Column(DateTime)
|
|
150
|
-
test_start_date = Column(DateTime)
|
|
151
|
-
test_end_date = Column(DateTime)
|
|
152
|
-
context = Column(JSON)
|
|
153
|
-
|
|
154
|
-
feature_selections = relationship(
|
|
155
|
-
"FeatureSelection",
|
|
156
|
-
back_populates="experiment",
|
|
157
|
-
cascade="all, delete-orphan",
|
|
158
|
-
lazy="selectin",
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
targets = relationship(
|
|
162
|
-
"Target",
|
|
163
|
-
secondary=lecrapaud_experiment_target_association,
|
|
164
|
-
back_populates="experiments",
|
|
165
|
-
lazy="selectin",
|
|
166
|
-
)
|
|
167
|
-
|
|
168
|
-
__table_args__ = (
|
|
169
|
-
UniqueConstraint(
|
|
170
|
-
"name",
|
|
171
|
-
name="uq_experiments_composite",
|
|
172
|
-
),
|
|
173
|
-
)
|
|
174
|
-
|
|
175
151
|
@classmethod
|
|
176
152
|
@with_db
|
|
177
153
|
def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
|
|
@@ -354,19 +330,18 @@ class Experiment(Base):
|
|
|
354
330
|
|
|
355
331
|
# Get the best model score based on lowest logloss or rmse
|
|
356
332
|
model_scores = best_model_selection.model_selection_scores
|
|
357
|
-
|
|
333
|
+
|
|
358
334
|
# Determine if we should use logloss or rmse based on what's available
|
|
359
335
|
if any(ms.logloss is not None for ms in model_scores):
|
|
360
336
|
# Classification: find lowest logloss
|
|
361
337
|
best_score = min(
|
|
362
338
|
(ms for ms in model_scores if ms.logloss is not None),
|
|
363
|
-
key=lambda x: x.logloss
|
|
339
|
+
key=lambda x: x.logloss,
|
|
364
340
|
)
|
|
365
341
|
elif any(ms.rmse is not None for ms in model_scores):
|
|
366
342
|
# Regression: find lowest rmse
|
|
367
343
|
best_score = min(
|
|
368
|
-
(ms for ms in model_scores if ms.rmse is not None),
|
|
369
|
-
key=lambda x: x.rmse
|
|
344
|
+
(ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
|
|
370
345
|
)
|
|
371
346
|
else:
|
|
372
347
|
return {
|
|
@@ -398,12 +373,8 @@ class Experiment(Base):
|
|
|
398
373
|
|
|
399
374
|
# Get the model info
|
|
400
375
|
model_info = {
|
|
401
|
-
"model_type": (
|
|
402
|
-
|
|
403
|
-
),
|
|
404
|
-
"model_name": (
|
|
405
|
-
score.model.name if score.model else "unknown"
|
|
406
|
-
),
|
|
376
|
+
"model_type": (score.model.model_type if score.model else "unknown"),
|
|
377
|
+
"model_name": (score.model.name if score.model else "unknown"),
|
|
407
378
|
"training_time_seconds": score.training_time,
|
|
408
379
|
}
|
|
409
380
|
|
|
@@ -434,7 +405,9 @@ class Experiment(Base):
|
|
|
434
405
|
return features
|
|
435
406
|
|
|
436
407
|
@with_db
|
|
437
|
-
def get_all_features(
|
|
408
|
+
def get_all_features(
|
|
409
|
+
self, date_column: str = None, group_column: str = None, db=None
|
|
410
|
+
):
|
|
438
411
|
# Ensure we have a fresh instance attached to the session
|
|
439
412
|
self = db.merge(self)
|
|
440
413
|
target_idx = [target.id for target in self.targets]
|
lecrapaud/experiment.py
CHANGED
|
@@ -3,6 +3,7 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import joblib
|
|
6
|
+
from datetime import datetime
|
|
6
7
|
|
|
7
8
|
# Set up coverage file path
|
|
8
9
|
os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
|
|
@@ -15,18 +16,18 @@ from lecrapaud.db.session import get_db
|
|
|
15
16
|
|
|
16
17
|
def create_experiment(
|
|
17
18
|
data: pd.DataFrame | str,
|
|
18
|
-
corr_threshold,
|
|
19
|
-
percentile,
|
|
20
|
-
max_features,
|
|
21
|
-
date_column,
|
|
22
|
-
group_column,
|
|
23
19
|
experiment_name,
|
|
20
|
+
date_column=None,
|
|
21
|
+
group_column=None,
|
|
24
22
|
**kwargs,
|
|
25
23
|
):
|
|
26
24
|
if isinstance(data, str):
|
|
27
25
|
path = f"{data}/data/full.pkl"
|
|
28
26
|
data = joblib.load(path)
|
|
29
27
|
|
|
28
|
+
if kwargs.get("time_series") and not date_column:
|
|
29
|
+
raise ValueError("date_column must be provided for time series experiments")
|
|
30
|
+
|
|
30
31
|
dates = {}
|
|
31
32
|
if date_column:
|
|
32
33
|
dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
|
|
@@ -42,7 +43,10 @@ def create_experiment(
|
|
|
42
43
|
targets = [
|
|
43
44
|
target for target in all_targets if target.name in data.columns.str.upper()
|
|
44
45
|
]
|
|
45
|
-
experiment_name =
|
|
46
|
+
experiment_name = (
|
|
47
|
+
f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
48
|
+
)
|
|
49
|
+
number_of_targets = len(targets)
|
|
46
50
|
|
|
47
51
|
experiment_dir = f"{tmp_dir}/{experiment_name}"
|
|
48
52
|
preprocessing_dir = f"{experiment_dir}/preprocessing"
|
|
@@ -55,26 +59,20 @@ def create_experiment(
|
|
|
55
59
|
db=db,
|
|
56
60
|
name=experiment_name,
|
|
57
61
|
path=Path(experiment_dir).resolve(),
|
|
58
|
-
type="training",
|
|
59
62
|
size=data.shape[0],
|
|
60
|
-
|
|
61
|
-
percentile=percentile,
|
|
62
|
-
max_features=max_features,
|
|
63
|
+
number_of_targets=number_of_targets,
|
|
63
64
|
**groups,
|
|
64
65
|
**dates,
|
|
65
66
|
context={
|
|
66
|
-
"corr_threshold": corr_threshold,
|
|
67
|
-
"percentile": percentile,
|
|
68
|
-
"max_features": max_features,
|
|
69
67
|
"date_column": date_column,
|
|
70
68
|
"group_column": group_column,
|
|
71
69
|
"experiment_name": experiment_name,
|
|
72
70
|
**kwargs,
|
|
73
71
|
},
|
|
74
72
|
)
|
|
75
|
-
|
|
73
|
+
|
|
76
74
|
# Set targets relationship after creation/update
|
|
77
75
|
experiment.targets = targets
|
|
78
76
|
experiment.save(db=db)
|
|
79
|
-
|
|
77
|
+
|
|
80
78
|
return experiment
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
|
|
|
87
87
|
def __init__(
|
|
88
88
|
self,
|
|
89
89
|
data: pd.DataFrame,
|
|
90
|
-
|
|
91
|
-
columns_boolean: list[str] = [],
|
|
92
|
-
columns_date: list[str] = [],
|
|
93
|
-
columns_te_groupby: list[str] = [],
|
|
94
|
-
columns_te_target: list[str] = [],
|
|
90
|
+
experiment,
|
|
95
91
|
for_training: bool = True,
|
|
96
92
|
**kwargs,
|
|
97
93
|
):
|
|
98
94
|
self.data = data
|
|
99
|
-
self.
|
|
100
|
-
self.columns_boolean = columns_boolean
|
|
101
|
-
self.columns_date = columns_date
|
|
102
|
-
self.columns_te_groupby = columns_te_groupby
|
|
103
|
-
self.columns_te_target = columns_te_target
|
|
95
|
+
self.experiment = experiment
|
|
104
96
|
self.for_training = for_training
|
|
97
|
+
|
|
98
|
+
# Get all parameters from experiment context
|
|
99
|
+
self.columns_drop = self.experiment.context.get("columns_drop", [])
|
|
100
|
+
self.columns_boolean = self.experiment.context.get("columns_boolean", [])
|
|
101
|
+
self.columns_date = self.experiment.context.get("columns_date", [])
|
|
102
|
+
self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
|
|
103
|
+
self.columns_te_target = self.experiment.context.get("columns_te_target", [])
|
|
105
104
|
|
|
106
105
|
def run(self) -> pd.DataFrame:
|
|
107
106
|
# drop columns
|
|
@@ -316,41 +315,30 @@ class PreprocessFeature:
|
|
|
316
315
|
self,
|
|
317
316
|
data: pd.DataFrame,
|
|
318
317
|
experiment,
|
|
319
|
-
time_series: bool = False,
|
|
320
|
-
date_column: str | None = None,
|
|
321
|
-
group_column: str | None = None,
|
|
322
|
-
val_size: float = 0.2,
|
|
323
|
-
test_size: float = 0.2,
|
|
324
|
-
columns_pca: list[str] = [],
|
|
325
|
-
pca_temporal: list[dict[str, list[str]]] = [],
|
|
326
|
-
pca_cross_sectional: list[dict[str, list[str]]] = [],
|
|
327
|
-
columns_onehot: list[str] = [],
|
|
328
|
-
columns_binary: list[str] = [],
|
|
329
|
-
columns_ordinal: list[str] = [],
|
|
330
|
-
columns_frequency: list[str] = [],
|
|
331
|
-
target_numbers: list = [],
|
|
332
|
-
target_clf: list = [],
|
|
333
318
|
**kwargs,
|
|
334
319
|
):
|
|
335
320
|
self.data = data
|
|
336
321
|
self.data.columns = self.data.columns.str.upper()
|
|
337
|
-
|
|
338
322
|
self.experiment = experiment
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
self.
|
|
343
|
-
self.
|
|
344
|
-
self.
|
|
345
|
-
self.
|
|
346
|
-
self.
|
|
347
|
-
self.
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
self.
|
|
352
|
-
self.
|
|
353
|
-
self.
|
|
323
|
+
|
|
324
|
+
# Get all parameters from experiment context
|
|
325
|
+
context = self.experiment.context
|
|
326
|
+
self.time_series = context.get("time_series", False)
|
|
327
|
+
self.date_column = context.get("date_column", None)
|
|
328
|
+
self.group_column = context.get("group_column", None)
|
|
329
|
+
self.val_size = context.get("val_size", 0.2)
|
|
330
|
+
self.test_size = context.get("test_size", 0.2)
|
|
331
|
+
self.target_numbers = context.get("target_numbers", [])
|
|
332
|
+
self.target_clf = context.get("target_clf", [])
|
|
333
|
+
|
|
334
|
+
# Handle list parameters with uppercase conversion
|
|
335
|
+
self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
|
|
336
|
+
self.pca_temporal = context.get("pca_temporal", [])
|
|
337
|
+
self.pca_cross_sectional = context.get("pca_cross_sectional", [])
|
|
338
|
+
self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
|
|
339
|
+
self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
|
|
340
|
+
self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
|
|
341
|
+
self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
|
|
354
342
|
|
|
355
343
|
self.experiment_dir = self.experiment.path
|
|
356
344
|
self.experiment_id = self.experiment.id
|