lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
--- /dev/null
+++ b/lecrapaud/feature_preprocessing.py
@@ -0,0 +1,1202 @@
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from category_encoders import BinaryEncoder, CountEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

from lecrapaud.integrations.openai_integration import (
    truncate_text,
    get_openai_embeddings,
)
from lecrapaud.feature_selection import get_features_by_types
from lecrapaud.utils import logger
from lecrapaud.db import Target, Feature, Experiment
from lecrapaud.config import PYTHON_ENV
from lecrapaud.feature_engineering import convert_object_columns_that_are_numeric
from lecrapaud.mixins import LeCrapaudTransformerMixin

class FeaturePreprocessor(LeCrapaudTransformerMixin):

    def __init__(
        self,
        experiment=None,
        **kwargs,
    ):
        # The mixin will automatically set all experiment.context parameters as attributes
        super().__init__(experiment=experiment, **kwargs)

        # Set defaults for attributes not automatically set by mixin
        if not hasattr(self, "time_series"):
            self.time_series = False
        if not hasattr(self, "date_column"):
            self.date_column = None
        if not hasattr(self, "group_column"):
            self.group_column = None
        if not hasattr(self, "val_size"):
            self.val_size = 0.2
        if not hasattr(self, "test_size"):
            self.test_size = 0.2
        if not hasattr(self, "target_numbers"):
            self.target_numbers = []
        if not hasattr(self, "target_clf"):
            self.target_clf = []

        # Handle list parameters with uppercase conversion
        if not hasattr(self, "columns_pca"):
            self.columns_pca = []
        else:
            self.columns_pca = [col.upper() for col in self.columns_pca]
        if not hasattr(self, "pca_temporal"):
            self.pca_temporal = []
        if not hasattr(self, "pca_cross_sectional"):
            self.pca_cross_sectional = []
        if not hasattr(self, "columns_onehot"):
            self.columns_onehot = []
        else:
            self.columns_onehot = [col.upper() for col in self.columns_onehot]
        if not hasattr(self, "columns_binary"):
            self.columns_binary = []
        else:
            self.columns_binary = [col.upper() for col in self.columns_binary]
        if not hasattr(self, "columns_ordinal"):
            self.columns_ordinal = []
        else:
            self.columns_ordinal = [col.upper() for col in self.columns_ordinal]
        if not hasattr(self, "columns_frequency"):
            self.columns_frequency = []
        else:
            self.columns_frequency = [col.upper() for col in self.columns_frequency]

        # Set experiment-related paths if experiment is available
        if self.experiment:
            self.experiment_dir = self.experiment.path
            self.experiment_id = self.experiment.id
            self.data_dir = f"{self.experiment_dir}/data"
            self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"

    def fit(self, X, y=None):
        """
        Fit the preprocessor (learns PCA components, encoders, etc.).

        Args:
            X (pd.DataFrame): Input data
            y: Target values (ignored)

        Returns:
            self: Returns self for chaining
        """
        X, y = self._validate_data(X, y)

        # Store data and make columns uppercase
        data = X.copy()
        data.columns = data.columns.str.upper()

        joblib.dump(
            list(data.columns),
            f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
        )

        # Fit PCA components
        data, self.pcas_ = self.add_pca_features(data)
        data, self.pcas_cross_sectional_ = self.add_pca_feature_cross_sectional(data)
        data, self.pcas_temporal_ = self.add_pca_feature_temporal(data)

        # Fit encoding transformer
        data, self.transformer_ = self.encode_categorical_features(data)

        # Save fitted transformers if experiment is available
        if self.experiment:
            joblib.dump(self.pcas_, f"{self.preprocessing_dir}/pcas.pkl")
            joblib.dump(
                self.pcas_cross_sectional_,
                f"{self.preprocessing_dir}/pcas_cross_sectional.pkl",
            )
            joblib.dump(
                self.pcas_temporal_, f"{self.preprocessing_dir}/pcas_temporal.pkl"
            )
            joblib.dump(
                self.transformer_, f"{self.preprocessing_dir}/column_transformer.pkl"
            )

            # Save features and summary
            joblib.dump(
                list(data.columns),
                f"{self.preprocessing_dir}/all_features_before_selection.pkl",
            )

            if PYTHON_ENV == "Development":
                joblib.dump(X, f"{self.data_dir}/full.pkl")

            summary = summarize_dataframe(data)
            summary.to_csv(f"{self.experiment_dir}/feature_summary.csv", index=False)

        self._set_fitted()
        return self

    def transform(self, X):
        """
        Transform the input data using fitted components.

        Args:
            X (pd.DataFrame): Input data

        Returns:
            pd.DataFrame: Transformed data
        """
        # Allow loading persisted artifacts even in a fresh instance
        if not getattr(self, "is_fitted_", False) and self.experiment:
            if os.path.exists(f"{self.preprocessing_dir}/column_transformer.pkl"):
                self.is_fitted_ = True

        self._check_is_fitted()
        X, _ = self._validate_data(X, reset=False)

        # Transform data
        data = X.copy()
        data.columns = data.columns.str.upper()

        # Load fitted components if not already in memory
        if not hasattr(self, "pcas_") and self.experiment:
            if os.path.exists(f"{self.preprocessing_dir}/pcas.pkl"):
                self.pcas_ = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")

        if not hasattr(self, "pcas_cross_sectional_") and self.experiment:
            if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
                self.pcas_cross_sectional_ = joblib.load(
                    f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
                )

        if not hasattr(self, "pcas_temporal_") and self.experiment:
            if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
                self.pcas_temporal_ = joblib.load(
                    f"{self.preprocessing_dir}/pcas_temporal.pkl"
                )

        if not hasattr(self, "transformer_") and self.experiment:
            if os.path.exists(f"{self.preprocessing_dir}/column_transformer.pkl"):
                self.transformer_ = joblib.load(
                    f"{self.preprocessing_dir}/column_transformer.pkl"
                )

        # Apply PCA transformations using fitted components
        if hasattr(self, "pcas_"):
            data, _ = self.add_pca_features(data, pcas=self.pcas_)
        if hasattr(self, "pcas_cross_sectional_"):
            data, _ = self.add_pca_feature_cross_sectional(
                data, pcas=self.pcas_cross_sectional_
            )
        if hasattr(self, "pcas_temporal_"):
            data, _ = self.add_pca_feature_temporal(data, pcas=self.pcas_temporal_)

        # Apply encoding using fitted transformer
        if hasattr(self, "transformer_"):
            data, _ = self.encode_categorical_features(
                data, transformer=self.transformer_
            )

        return data

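    # Illustrative usage (a minimal sketch, not taken from the package docs; the
    # `experiment` object and the column names are assumptions):
    #
    #     preproc = FeaturePreprocessor(experiment=experiment, columns_onehot=["SECTOR"])
    #     train_encoded = preproc.fit(train_df).transform(train_df)
    #     val_encoded = preproc.transform(val_df)
    #
    # fit() learns the PCA pipelines and the ColumnTransformer on the training
    # split only; transform() reuses them, reloading the persisted artifacts from
    # `preprocessing_dir` when a fresh instance points at an existing experiment.
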
    # embedding and pca
    def add_pca_features(
        self, df: pd.DataFrame, n_components: int = 5, pcas=None
    ) -> tuple[pd.DataFrame, dict]:
        """
        Adds PCA components as new columns for each column listed in `self.columns_pca`
        (text columns are embedded first; columns that already hold arrays are used as-is).
        NEEDS A TRAIN/TEST SPLIT BEFORE BEING APPLIED - LIKE ENCODING CATEGORICAL VARIABLES

        Parameters:
            df (pd.DataFrame): Input DataFrame
            n_components (int): Number of PCA components to keep
            pcas (dict, optional): Previously fitted PCAs keyed by column; if provided, transform only

        Returns:
            tuple: (DataFrame with new PCA columns added, dict of fitted PCAs keyed by column)
        """
        columns: list[str] = self.columns_pca

        pcas_dict = {}
        for column in columns:
            # Convert text to embeddings if necessary
            if not isinstance(df[column].iloc[0], (np.ndarray, list)):
                sentences = df[column].astype(str).tolist()
                logger.info(
                    f"Total sentences to embed for column {column}: {len(sentences)}"
                )

                # Truncate each sentence
                truncate_sentences = [truncate_text(sentence) for sentence in sentences]

                # embedding
                embedding_matrix = get_openai_embeddings(truncate_sentences)
            else:
                logger.info(f"Column {column} is already embeddings")
                # Stack the vectors into a 2D array
                embedding_matrix = np.vstack(df[column].values)

            # Apply PCA
            if pcas:
                pca = pcas[column]
                pca_features = pca.transform(embedding_matrix)
            else:
                pca = PCA(n_components=n_components)
                pca_features = pca.fit_transform(embedding_matrix)

            # Add PCA columns
            for i in range(n_components):
                df[f"{column}_pca_{i+1}"] = pca_features[:, i]

            # Drop the original column
            df.drop(column, axis=1, inplace=True)
            pcas_dict.update({column: pca})

        return df, pcas_dict

    def add_pca_feature_cross_sectional_old(
        self,
        df: pd.DataFrame,
        *,
        n_components: int = 5,
        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
        impute_strategy: str = "median",
        standardize: bool = True,
    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
        """
        Builds a pivot (index=index_col, columns=columns_col, values=value_col),
        fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
        (by index_col) back into df. Returns (df_with_features, pipe).
        """

        pcas_dict = {}
        index_saved = df.index

        for pca_cross_sectional in self.pca_cross_sectional:
            name, index_col, columns_col, value_col = (
                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
            )
            prefix = f"CS_PC_{name}"

            pivot = df.pivot_table(
                index=index_col, columns=columns_col, values=value_col
            ).sort_index()

            # Pipeline reused between train and test
            if pcas is None:
                steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
                if standardize:
                    steps.append(
                        ("scaler", StandardScaler(with_mean=True, with_std=True))
                    )
                pca = PCA(n_components=n_components, random_state=0)
                steps.append(("pca", pca))
                pipe = Pipeline(steps)
                pipe.fit(pivot)  # <- fit on TRAIN only
            else:
                pipe = pcas[name]  # <- TEST: reuse the existing pipeline

            scores = pipe.transform(pivot)  # shape: (n_index, n_components)
            cols = [f"{prefix}_{i}" for i in range(n_components)]
            scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)

            df = df.merge(scores_df.reset_index(), on=index_col, how="left")
            df.index = index_saved
            pcas_dict.update({name: pipe})

        return df, pcas_dict

    def add_pca_feature_cross_sectional(
        self,
        df: pd.DataFrame,
        *,
        n_components: int = 5,
        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
        impute_strategy: str = "median",
        standardize: bool = True,
        lookback_days: int = 365,  # number of days to look back for the fit
        refresh_frequency: int = 90,  # refresh the PCA every X days
    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
        """
        Builds a pivot (index=index_col, columns=columns_col, values=value_col),
        fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
        (by index_col) back into df. Returns (df_with_features, pipe).

        For time series: fits the PCA on past data only to avoid leakage,
        with periodic refreshes.

        Handles panel data where several time series share the same dates
        (e.g. several stocks with the same dates).
        """

        pcas_dict = {}
        index_saved = df.index

        for pca_cross_sectional in self.pca_cross_sectional:
            name, index_col, columns_col, value_col = (
                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
            )
            prefix = f"CS_PC_{name}"

            # Check whether this is a time series with index = date
            # Dates are already ordinals after cyclic_encode_date
            is_time_series = self.time_series and index_col == self.date_column

            if is_time_series:
                # Special case: cross-sectional PCA on panel time series data
                # For example: PCA on the returns of all stocks at each date
                # to capture the market regime

                all_scores = []

                # Dates are already ordinals
                unique_dates = sorted(df[index_col].unique())

                # For inference, use the provided PCA
                if pcas is not None:
                    pipe = pcas[name]
                    pivot = df.pivot_table(
                        index=index_col, columns=columns_col, values=value_col
                    ).sort_index()
                    scores = pipe.transform(pivot)
                    cols = [f"{prefix}_{i}" for i in range(n_components)]
                    scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
                else:
                    # Training: fit the PCA in an expanding fashion with periodic refreshes
                    pipe = None
                    last_fit_date = None

                    for i, current_date_ordinal in enumerate(unique_dates):
                        # Convert the ordinal to a date for time computations
                        current_date = pd.Timestamp.fromordinal(
                            int(current_date_ordinal)
                        )

                        # Decide whether the PCA should be refitted
                        should_refit = pipe is None or (  # First time
                            last_fit_date is not None
                            and (current_date - last_fit_date).days >= refresh_frequency
                        )

                        if (
                            should_refit and i > 30
                        ):  # Wait for at least 30 days of data
                            # Take the data from the last `lookback_days` days
                            lookback_start_date = current_date - pd.Timedelta(
                                days=lookback_days
                            )
                            lookback_start_ordinal = pd.Timestamp.toordinal(
                                lookback_start_date
                            )

                            # Mask for past dates only (avoid leakage)
                            mask_fit = (df[index_col] >= lookback_start_ordinal) & (
                                df[index_col] < current_date_ordinal
                            )
                            df_fit = df[mask_fit]

                            if len(df_fit) > 0:
                                # Build the pivot for the lookback period
                                pivot_fit = df_fit.pivot_table(
                                    index=index_col,
                                    columns=columns_col,
                                    values=value_col,
                                ).sort_index()

                                # Check that there are enough dates and columns
                                if (
                                    len(pivot_fit) >= n_components
                                    and pivot_fit.shape[1] >= n_components
                                ):
                                    # Create a new pipeline
                                    steps = [
                                        (
                                            "imputer",
                                            SimpleImputer(strategy=impute_strategy),
                                        )
                                    ]
                                    if standardize:
                                        steps.append(
                                            (
                                                "scaler",
                                                StandardScaler(
                                                    with_mean=True, with_std=True
                                                ),
                                            )
                                        )
                                    pca = PCA(n_components=n_components, random_state=0)
                                    steps.append(("pca", pca))
                                    pipe = Pipeline(steps)
                                    pipe.fit(pivot_fit)
                                    last_fit_date = current_date

                                    logger.debug(
                                        f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
                                        f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
                                    )

                        # Transform for the current date only
                        if pipe is not None:
                            df_current = df[df[index_col] == current_date_ordinal]
                            if len(df_current) > 0:
                                pivot_current = df_current.pivot_table(
                                    index=index_col,
                                    columns=columns_col,
                                    values=value_col,
                                )
                                try:
                                    scores_current = pipe.transform(pivot_current)
                                    scores_dict = {
                                        index_col: [current_date_ordinal],
                                        **{
                                            f"{prefix}_{j}": [scores_current[0, j]]
                                            for j in range(n_components)
                                        },
                                    }
                                    all_scores.append(pd.DataFrame(scores_dict))
                                except Exception as e:
                                    # On error (e.g. new columns), produce missing values
                                    logger.debug(
                                        f"PCA transform error at date {current_date}: {str(e)}"
                                    )
                                    scores_dict = {
                                        index_col: [current_date_ordinal],
                                        **{
                                            f"{prefix}_{j}": [np.nan]
                                            for j in range(n_components)
                                        },
                                    }
                                    all_scores.append(pd.DataFrame(scores_dict))
                        else:
                            # No PCA fitted yet, produce NaN
                            scores_dict = {
                                index_col: [current_date_ordinal],
                                **{
                                    f"{prefix}_{j}": [np.nan]
                                    for j in range(n_components)
                                },
                            }
                            all_scores.append(pd.DataFrame(scores_dict))

                    # Combine all scores
                    if all_scores:
                        scores_df = pd.concat(all_scores, ignore_index=True)
                    else:
                        # Create an empty DataFrame with the right columns
                        cols = [f"{prefix}_{i}" for i in range(n_components)]
                        scores_df = pd.DataFrame(columns=[index_col] + cols)

                # Merge the scores
                df = df.merge(scores_df, on=index_col, how="left")
                df.index = index_saved

                # Forward fill then 0 to avoid NaN
                pca_cols = [col for col in df.columns if col.startswith(prefix)]
                df[pca_cols] = df[pca_cols].fillna(method="ffill").fillna(0)

                pcas_dict.update({name: pipe})

            else:
                # Classic approach (not a time series, or index != date)
                pivot = df.pivot_table(
                    index=index_col, columns=columns_col, values=value_col
                ).sort_index()

                # Pipeline reused between train and test
                if pcas is None:
                    steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
                    if standardize:
                        steps.append(
                            ("scaler", StandardScaler(with_mean=True, with_std=True))
                        )
                    pca = PCA(n_components=n_components, random_state=0)
                    steps.append(("pca", pca))
                    pipe = Pipeline(steps)
                    pipe.fit(pivot)  # <- fit on TRAIN only
                else:
                    pipe = pcas[name]  # <- TEST: reuse the existing pipeline

                scores = pipe.transform(pivot)  # shape: (n_index, n_components)
                cols = [f"{prefix}_{i}" for i in range(n_components)]
                scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)

                df = df.merge(scores_df.reset_index(), on=index_col, how="left")
                df.index = index_saved
                pcas_dict.update({name: pipe})

        return df, pcas_dict

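    # Illustrative configuration (a sketch; the column names are assumptions).
    # Each entry of `pca_cross_sectional` is read through its "name", "index",
    # "columns" and "value" keys, e.g.
    #
    #     pca_cross_sectional = [
    #         {"name": "MKT", "index": "DATE", "columns": "TICKER", "value": "RET"}
    #     ]
    #
    # which pivots returns into a date x ticker matrix and, with the default
    # n_components=5, adds the columns CS_PC_MKT_0 ... CS_PC_MKT_4.
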
    # ----------------- 2) TEMPORAL PCA (list of lag columns) ---------------------
    def add_pca_feature_temporal_old(
        self,
        df: pd.DataFrame,
        *,
        n_components: int = 5,
        pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
        impute_strategy: (
            str | None
        ) = None,  # None = require all columns to be present
        standardize: bool = True,
    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
        """
        Applies a PCA on a matrix (rows = df rows, cols = lags).
        Fits the Pipeline on TRAIN if pcas=None; otherwise uses pcas and only transforms.
        Adds the columns f"{prefix}_{i}" to df. Returns (df, pipe).
        """
        pcas_dict = {}

        for pca_temporal in self.pca_temporal:
            name, cols = (pca_temporal[k] for k in ("name", "columns"))
            prefix = f"TMP_PC_{name}"

            # Mask of usable rows
            if impute_strategy is None:
                mask = (
                    df[cols].notna().all(axis=1)
                )  # no imputation -> complete rows only
                X_fit = df.loc[mask, cols]
            else:
                mask = df[cols].notna().any(axis=1)  # will impute -> at least one value
                X_fit = df.loc[mask, cols]

            # Pipeline
            if pcas is None:
                steps = []
                if impute_strategy is not None:
                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
                if standardize:
                    steps.append(
                        ("scaler", StandardScaler(with_mean=True, with_std=True))
                    )
                pca = PCA(n_components=n_components, random_state=0)
                steps.append(("pca", pca))
                pipe = Pipeline(steps)
                if not X_fit.empty:
                    pipe.fit(X_fit)  # <- fit on TRAIN only
            else:
                pipe = pcas[name]  # <- TEST

            # Transform only on valid rows (mask)
            if not df.loc[mask, cols].empty:
                Z = pipe.transform(df.loc[mask, cols])
                for i in range(n_components):
                    df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
            else:
                # create empty columns if no valid row (schema consistency)
                for i in range(n_components):
                    df[f"{prefix}_{i}"] = pd.NA

            pcas_dict.update({name: pipe})

        return df, pcas_dict

    def add_pca_feature_temporal(
        self,
        df: pd.DataFrame,
        *,
        n_components: int = 5,
        pcas: dict[str, Pipeline] | None = None,
        impute_strategy: str = "median",
        standardize: bool = True,
        lookback_days: int = 365,
        refresh_frequency: int = 90,
    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
        """
        Temporal PCA for time series, with panel data support.
        Automatically creates the lag columns and avoids look-ahead bias.

        Simplified pca_temporal format:
        [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
        """
        pcas_dict = {}

        for pca_config in self.pca_temporal:
            # Support both old and new format
            if "columns" in pca_config:
                # Old format: use existing columns
                name = pca_config["name"]
                lag_columns = pca_config["columns"]
                base_column = None
                num_lags = len(lag_columns)
            else:
                # New format: create lag columns
                name = pca_config["name"]
                base_column = pca_config["column"].upper()
                num_lags = pca_config.get("lags", 20)

                # Create lag columns if they don't exist
                if self.group_column:
                    # Panel data: create lags by group
                    for lag in range(1, num_lags + 1):
                        lag_col = f"{base_column}_-{lag}"
                        if lag_col not in df.columns:
                            df[lag_col] = df.groupby(self.group_column)[
                                base_column
                            ].shift(lag)
                else:
                    # Simple time series
                    for lag in range(1, num_lags + 1):
                        lag_col = f"{base_column}_-{lag}"
                        if lag_col not in df.columns:
                            df[lag_col] = df[base_column].shift(lag)

                lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]

            prefix = f"TMP_PC_{name}"

            # For time series: avoid look-ahead bias
            if self.time_series and self.date_column:
                all_scores = []
                unique_dates = sorted(df[self.date_column].unique())

                if pcas is not None:
                    # transform: use provided PCA
                    pipe = pcas[name]

                    # Apply to all data at once
                    mask = df[lag_columns].notna().all(axis=1)
                    if mask.any():
                        X_transform = df.loc[mask, lag_columns]
                        scores = pipe.transform(X_transform)

                        for i in range(n_components):
                            df.loc[mask, f"{prefix}_{i}"] = scores[:, i]

                    # Fill NaN with forward fill then 0
                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
                    df[pca_cols] = df[pca_cols].fillna(method="ffill").fillna(0)

                else:
                    # Training: expanding window with periodic refresh
                    pipe = None
                    last_fit_date = None

                    for current_date_ordinal in unique_dates:
                        current_date = pd.Timestamp.fromordinal(
                            int(current_date_ordinal)
                        )

                        # Determine if we should refit
                        should_refit = pipe is None or (
                            last_fit_date is not None
                            and (current_date - last_fit_date).days >= refresh_frequency
                        )

                        if (
                            should_refit
                            and len(df[df[self.date_column] < current_date_ordinal])
                            > num_lags * 2
                        ):
                            # Get historical data for fitting
                            lookback_start = current_date - pd.Timedelta(
                                days=lookback_days
                            )
                            lookback_start_ordinal = pd.Timestamp.toordinal(
                                lookback_start
                            )

                            mask_fit = (
                                (df[self.date_column] >= lookback_start_ordinal)
                                & (df[self.date_column] < current_date_ordinal)
                                & df[lag_columns].notna().all(axis=1)
                            )

                            if mask_fit.sum() >= n_components:
                                X_fit = df.loc[mask_fit, lag_columns]

                                # Create pipeline
                                steps = []
                                if impute_strategy is not None:
                                    steps.append(
                                        (
                                            "imputer",
                                            SimpleImputer(strategy=impute_strategy),
                                        )
                                    )
                                if standardize:
                                    steps.append(("scaler", StandardScaler()))
                                steps.append(
                                    (
                                        "pca",
                                        PCA(n_components=n_components, random_state=0),
                                    )
                                )

                                pipe = Pipeline(steps)
                                pipe.fit(X_fit)
                                last_fit_date = current_date

                                logger.debug(
                                    f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
                                    f"using {len(X_fit)} samples"
                                )

                        # Transform current date data
                        if pipe is not None:
                            mask_current = (
                                df[self.date_column] == current_date_ordinal
                            ) & df[lag_columns].notna().all(axis=1)

                            if mask_current.any():
                                X_current = df.loc[mask_current, lag_columns]
                                scores = pipe.transform(X_current)

                                for i in range(n_components):
                                    df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]

                    # Fill NaN with forward fill then 0
                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
                    for col in pca_cols:
                        if col not in df.columns:
                            df[col] = 0
                    df[pca_cols] = df[pca_cols].fillna(method="ffill").fillna(0)

                pcas_dict[name] = pipe

            else:
                # Non time-series: use original approach
                mask = df[lag_columns].notna().all(axis=1)

                if pcas is None and mask.any():
                    X_fit = df.loc[mask, lag_columns]

                    steps = []
                    if impute_strategy is not None:
                        steps.append(
                            ("imputer", SimpleImputer(strategy=impute_strategy))
                        )
                    if standardize:
                        steps.append(("scaler", StandardScaler()))
                    steps.append(
                        ("pca", PCA(n_components=n_components, random_state=0))
                    )

                    pipe = Pipeline(steps)
                    pipe.fit(X_fit)
                    pcas_dict[name] = pipe
                elif pcas is not None:
                    pipe = pcas[name]
                    pcas_dict[name] = pipe
                else:
                    continue

                if mask.any():
                    X_transform = df.loc[mask, lag_columns]
                    scores = pipe.transform(X_transform)

                    for i in range(n_components):
                        df.loc[mask, f"{prefix}_{i}"] = scores[:, i]

                # Fill missing values
                pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
                for col in pca_cols:
                    if col not in df.columns:
                        df[col] = 0
                df[pca_cols] = df[pca_cols].fillna(0)

        return df, pcas_dict

    # encoding categorical features
    def encode_categorical_features(
        self,
        df: pd.DataFrame,
        transformer: ColumnTransformer | None = None,
    ) -> tuple[pd.DataFrame, ColumnTransformer]:
        """
        Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.

        Parameters:
            df (pd.DataFrame): Input DataFrame
            columns_onehot (list[str]): Creates one binary column per category; for low-cardinality categorical features
            columns_binary (list[str]): Converts categories into binary and splits the bits across columns; for mid-to-high cardinality (e.g., 10-100 unique values)
            columns_ordinal (list[str]): Assigns integer ranks to categories; for when order matters (e.g., low < medium < high)
            columns_frequency (list[str]): Replaces each category with its frequency count, normalized to a proportion; for high-cardinality features where frequency is meaningful
            transformer (ColumnTransformer, optional): if provided, applies transform only

        Returns:
            tuple: (transformed DataFrame, ColumnTransformer)
        """
        columns_onehot: list[str] = self.columns_onehot
        columns_binary: list[str] = self.columns_binary
        columns_ordinal: list[str] = self.columns_ordinal
        columns_frequency: list[str] = self.columns_frequency

        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
        y = df.loc[:, df.columns.str.contains("^TARGET_")]
        save_in_db = False

        all_columns = (
            columns_onehot + columns_binary + columns_ordinal + columns_frequency
        )

        if transformer:
            transformed = transformer.transform(X)
        else:
            transformer = ColumnTransformer(
                transformers=[
                    (
                        "onehot",
                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                        columns_onehot,
                    ),
                    (
                        "ordinal",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value", unknown_value=-1
                        ),
                        columns_ordinal,
                    ),
                    ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
                    ("freq", CountEncoder(normalize=True), columns_frequency),
                ],
                remainder="passthrough",
            )
            transformed = transformer.fit_transform(X)
            save_in_db = True

        # Build output column names
        column_names = []

        if columns_onehot:
            column_names.extend(
                transformer.named_transformers_["onehot"]
                .get_feature_names_out(columns_onehot)
                .tolist()
            )

        if columns_ordinal:
            column_names.extend(columns_ordinal)

        if columns_binary:
            column_names.extend(
                transformer.named_transformers_["binary"]
                .get_feature_names_out(columns_binary)
                .tolist()
            )

        if columns_frequency:
            column_names.extend(columns_frequency)

        # Add passthrough (non-encoded) columns
        passthrough_columns = [col for col in X.columns if col not in all_columns]
        column_names.extend(passthrough_columns)

        X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)

        # Try to convert columns to best possible dtypes
        X_transformed = X_transformed.convert_dtypes()

        # Insert features in db
        if save_in_db:
            # Get feature types from transformed data
            categorical_features, numerical_features = get_features_by_types(
                X_transformed
            )

            # Get column names from DataFrames
            cat_feature_names = categorical_features.columns.tolist()
            num_feature_names = numerical_features.columns.tolist()

            # Combine all feature names and their types
            all_feature_names = cat_feature_names + num_feature_names
            all_feature_types = ["categorical"] * len(cat_feature_names) + [
                "numerical"
            ] * len(num_feature_names)

            # Upsert features in bulk if we have any features
            if all_feature_names:
                Feature.bulk_upsert(
                    name=all_feature_names,
                    type=all_feature_types,
                )

            # Upsert targets in bulk
            target_names = y.columns.tolist()
            target_types = [
                (
                    "classification"
                    if int(target.split("_")[1]) in self.target_clf
                    else "regression"
                )
                for target in target_names
            ]

            Target.bulk_upsert(name=target_names, type=target_types)

            # Get all the upserted objects
            targets = Target.filter(name__in=target_names)

            # Update experiment with targets
            experiment = Experiment.get(self.experiment_id)
            if experiment:
                experiment.targets = targets
                experiment.save()

        return pd.concat([X_transformed, y], axis=1), transformer

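    # Illustrative behaviour (a sketch; the column names are assumptions): with
    # columns_onehot=["SECTOR"], columns_binary=["COUNTRY"], columns_ordinal=["RATING"]
    # and columns_frequency=["ISSUER"], SECTOR becomes one 0/1 column per category
    # (e.g. SECTOR_TECH), COUNTRY is split into a few binary-bit columns, RATING is
    # mapped to integer ranks (-1 for unseen categories), and ISSUER is replaced by
    # its relative frequency in the training data; TARGET_* columns pass through
    # untouched and are re-appended at the end.
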
# utils
def summarize_dataframe(
    df: pd.DataFrame, sample_categorical_threshold: int = 15
) -> pd.DataFrame:
    summary = []

    def is_hashable_series(series: pd.Series) -> bool:
        try:
            _ = series.dropna().unique()
            return True
        except TypeError:
            return False

    df = convert_object_columns_that_are_numeric(df)
    df = df.convert_dtypes()

    for col in df.columns:
        total_missing = df[col].isna().sum()
        col_data = df[col].dropna()
        dtype = col_data.dtype

        if col_data.empty:
            summary.append(
                {
                    "Column": col,
                    "Dtype": dtype,
                    "Type": "unknown",
                    "Detail": "No non-null values",
                    "Missing": total_missing,
                }
            )
            continue

        # Case 1: Numeric columns
        if pd.api.types.is_numeric_dtype(col_data):
            unique_vals = col_data.nunique()

            if set(col_data.unique()).issubset({0, 1}):
                col_type = "binary-categorical"
                detail = "0/1 values only"
            elif (
                pd.api.types.is_integer_dtype(col_data)
                and unique_vals <= sample_categorical_threshold
            ):
                col_type = "multi-categorical"
                top_vals = col_data.value_counts().head(10)
                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
            else:
                col_type = "numeric"
                q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
                detail = (
                    f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
                    f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
                )

        # Case 2: Object or other hashable columns
        elif is_hashable_series(col_data):
            unique_vals = col_data.nunique()
            if unique_vals <= sample_categorical_threshold:
                col_type = "object-categorical"
                top_vals = col_data.value_counts().head(10)
                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
            else:
                col_type = "high-cardinality-categorical"
                detail = f"{unique_vals} unique values"

        # Case 3: Unusable columns
        else:
            col_type = "non-hashable"
            detail = f"Non-hashable type: {type(col_data.iloc[0])}"

        summary.append(
            {
                "Column": col,
                "Dtype": dtype,
                "Type": col_type,
                "Detail": detail,
                "Missing": total_missing,
            }
        )

    return pd.DataFrame(summary)


# Utility functions for data splitting
# ===================================


def split_data(
    data,
    experiment=None,
    time_series=None,
    date_column=None,
    group_column=None,
    val_size=None,
    test_size=None,
    target_numbers=None,
    target_clf=None,
    experiment_id=None,
):
    """
    Utility function to split data into train, validation, and test sets.

    Args:
        data (pd.DataFrame): Input data to split
        experiment: LeCrapaud experiment instance (preferred - extracts all params automatically)
        time_series (bool): Whether to use time series splitting (overrides experiment)
        date_column (str): Date column for time series splitting (overrides experiment)
        group_column (str): Group column for time series splitting (overrides experiment)
        val_size (float): Validation set size (0.0-1.0) (overrides experiment)
        test_size (float): Test set size (0.0-1.0) (overrides experiment)
        target_numbers (list): List of target numbers for stratification (overrides experiment)
        target_clf (list): List of classification target numbers (overrides experiment)
        experiment_id (int): Optional experiment ID to update sizes in database

    Returns:
        tuple: (train, val, test) DataFrames
    """
    # Extract parameters from experiment if provided
    if experiment is not None:
        # Check if it's a BaseExperiment or just the experiment database object
        if hasattr(experiment, "context") and experiment.context:
            # It's a database experiment object with context
            context = experiment.context
            if time_series is None:
                time_series = context.get("time_series", False)
            if date_column is None:
                date_column = context.get("date_column")
            if group_column is None:
                group_column = context.get("group_column")
            if val_size is None:
                val_size = context.get("val_size", 0.2)
            if test_size is None:
                test_size = context.get("test_size", 0.2)
            if target_numbers is None:
                target_numbers = context.get("target_numbers", [])
            if target_clf is None:
                target_clf = context.get("target_clf", [])
            if experiment_id is None:
                experiment_id = experiment.id

    # Set defaults if still None
    if time_series is None:
        time_series = False
    if val_size is None:
        val_size = 0.2
    if test_size is None:
        test_size = 0.2
    if target_numbers is None:
        target_numbers = []
    if target_clf is None:
        target_clf = []

    dates = {}
    if time_series:
        (train, val, test), dates = _split_time_series(
            data, date_column, group_column, val_size, test_size
        )
    else:
        # Use first target for stratification if it's a classification target
        stratify_col = None
        if target_numbers and target_clf and target_numbers[0] in target_clf:
            stratify_col = f"TARGET_{target_numbers[0]}"
        train, val, test = _split_standard(data, val_size, test_size, stratify_col)

    # Update experiment with sizes if experiment_id provided
    if experiment_id:
        Experiment.update(
            id=experiment_id,
            train_size=len(train),
            val_size=len(val),
            test_size=len(test),
            **dates,
        )

    return train, val, test

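# Illustrative usage of split_data (a sketch; the experiment object and the
# "DATE" column are assumptions):
#
#     train, val, test = split_data(df, experiment=experiment)
#     # or, without an experiment, a chronological 60/20/20 split on ordinal dates:
#     train, val, test = split_data(
#         df, time_series=True, date_column="DATE", val_size=0.2, test_size=0.2
#     )
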

def _split_time_series(data, date_column, group_column, val_size, test_size):
    """Time series splitting preserving temporal order."""
    if not date_column:
        raise ValueError("Please specify a date_column for time series")

    df = data.copy()
    if group_column:
        df.sort_values([date_column, group_column], inplace=True)
    else:
        df.sort_values(date_column, inplace=True)

    dates = df[date_column].unique()

    val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
    test_first_id = int(len(dates) * (1 - test_size)) + 1

    train = df[df[date_column].isin(dates[:val_first_id])]
    val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
    test = df[df[date_column].isin(dates[test_first_id:])]

    dates = {}
    for name, data in zip(["train", "val", "test"], [train, val, test]):
        dates[f"{name}_start_date"] = (
            data[date_column].map(pd.Timestamp.fromordinal).iat[0]
        )
        dates[f"{name}_end_date"] = (
            data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
        )

        logger.info(
            f"{data.shape} {name} data from {dates[f'{name}_start_date'].strftime('%d/%m/%Y')} to {dates[f'{name}_end_date'].strftime('%d/%m/%Y')}"
        )

    return (
        train.reset_index(drop=True),
        val.reset_index(drop=True),
        test.reset_index(drop=True),
    ), dates


def _split_standard(data, val_size, test_size, stratify_col=None, random_state=42):
    """Standard random splitting with optional stratification."""
    from sklearn.model_selection import train_test_split

    df = data.copy()

    stratify_vals = (
        df[stratify_col] if stratify_col and stratify_col in df.columns else None
    )

    # First split: train + (val + test)
    train, temp = train_test_split(
        df,
        test_size=val_size + test_size,
        random_state=random_state,
        stratify=stratify_vals,
    )

    # Adjust stratify target for val/test split
    stratify_temp = (
        temp[stratify_col] if stratify_col and stratify_col in df.columns else None
    )

    # Compute val and test sizes relative to temp
    val_ratio = val_size / (val_size + test_size)

    val, test = train_test_split(
        temp,
        test_size=1 - val_ratio,
        random_state=random_state,
        stratify=stratify_temp,
    )

    for name, data in zip(["train", "val", "test"], [train, val, test]):
        logger.info(f"{data.shape} {name} data")

    return (
        train.reset_index(drop=True),
        val.reset_index(drop=True),
        test.reset_index(drop=True),
    )