lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
  5. lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
  6. lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
  7. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  8. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  9. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  10. lecrapaud/db/models/__init__.py +2 -4
  11. lecrapaud/db/models/base.py +122 -67
  12. lecrapaud/db/models/experiment.py +196 -183
  13. lecrapaud/db/models/feature_selection.py +0 -3
  14. lecrapaud/db/models/feature_selection_rank.py +0 -18
  15. lecrapaud/db/models/model_selection.py +2 -2
  16. lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
  17. lecrapaud/db/session.py +33 -4
  18. lecrapaud/experiment.py +44 -17
  19. lecrapaud/feature_engineering.py +45 -674
  20. lecrapaud/feature_preprocessing.py +1202 -0
  21. lecrapaud/feature_selection.py +145 -332
  22. lecrapaud/integrations/sentry_integration.py +46 -0
  23. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  24. lecrapaud/mixins.py +247 -0
  25. lecrapaud/model_preprocessing.py +295 -0
  26. lecrapaud/model_selection.py +725 -249
  27. lecrapaud/pipeline.py +548 -0
  28. lecrapaud/search_space.py +38 -1
  29. lecrapaud/utils.py +36 -3
  30. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  31. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  32. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  33. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  34. lecrapaud/db/models/model_training.py +0 -64
  35. lecrapaud/jobs/__init__.py +0 -13
  36. lecrapaud/jobs/config.py +0 -17
  37. lecrapaud/jobs/scheduler.py +0 -30
  38. lecrapaud/jobs/tasks.py +0 -17
  39. lecrapaud-0.18.7.dist-info/METADATA +0 -248
  40. lecrapaud-0.18.7.dist-info/RECORD +0 -46
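
The largest addition is lecrapaud/feature_preprocessing.py (diff below). A minimal usage sketch of the new FeaturePreprocessor and split_data, based only on the signatures visible in this diff — the DataFrame, column names, and experiment object are hypothetical placeholders:

  from lecrapaud.feature_preprocessing import FeaturePreprocessor, split_data

  # `experiment` is assumed to be a lecrapaud experiment created elsewhere
  train, val, test = split_data(data, experiment=experiment)
  preprocessor = FeaturePreprocessor(experiment=experiment)
  preprocessor.fit(train)                    # learns PCA components and categorical encoders
  train_enc = preprocessor.transform(train)
  val_enc = preprocessor.transform(val)
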
@@ -0,0 +1,1202 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import os
5
+
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.preprocessing import StandardScaler
10
+ from sklearn.pipeline import Pipeline
11
+ from category_encoders import BinaryEncoder, CountEncoder
12
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
13
+ from sklearn.model_selection import train_test_split
14
+
15
+ from lecrapaud.integrations.openai_integration import (
16
+ truncate_text,
17
+ get_openai_embeddings,
18
+ )
19
+ from lecrapaud.feature_selection import get_features_by_types
20
+ from lecrapaud.utils import logger
21
+ from lecrapaud.db import Target, Feature, Experiment
22
+ from lecrapaud.config import PYTHON_ENV
23
+ from lecrapaud.feature_engineering import convert_object_columns_that_are_numeric
24
+ from lecrapaud.mixins import LeCrapaudTransformerMixin
25
+
26
+
27
+ class FeaturePreprocessor(LeCrapaudTransformerMixin):
28
+
29
+ def __init__(
30
+ self,
31
+ experiment=None,
32
+ **kwargs,
33
+ ):
34
+ # The mixin will automatically set all experiment.context parameters as attributes
35
+ super().__init__(experiment=experiment, **kwargs)
36
+
37
+ # Set defaults for attributes not automatically set by mixin
38
+ if not hasattr(self, "time_series"):
39
+ self.time_series = False
40
+ if not hasattr(self, "date_column"):
41
+ self.date_column = None
42
+ if not hasattr(self, "group_column"):
43
+ self.group_column = None
44
+ if not hasattr(self, "val_size"):
45
+ self.val_size = 0.2
46
+ if not hasattr(self, "test_size"):
47
+ self.test_size = 0.2
48
+ if not hasattr(self, "target_numbers"):
49
+ self.target_numbers = []
50
+ if not hasattr(self, "target_clf"):
51
+ self.target_clf = []
52
+
53
+ # Handle list parameters with uppercase conversion
54
+ if not hasattr(self, "columns_pca"):
55
+ self.columns_pca = []
56
+ else:
57
+ self.columns_pca = [col.upper() for col in self.columns_pca]
58
+ if not hasattr(self, "pca_temporal"):
59
+ self.pca_temporal = []
60
+ if not hasattr(self, "pca_cross_sectional"):
61
+ self.pca_cross_sectional = []
62
+ if not hasattr(self, "columns_onehot"):
63
+ self.columns_onehot = []
64
+ else:
65
+ self.columns_onehot = [col.upper() for col in self.columns_onehot]
66
+ if not hasattr(self, "columns_binary"):
67
+ self.columns_binary = []
68
+ else:
69
+ self.columns_binary = [col.upper() for col in self.columns_binary]
70
+ if not hasattr(self, "columns_ordinal"):
71
+ self.columns_ordinal = []
72
+ else:
73
+ self.columns_ordinal = [col.upper() for col in self.columns_ordinal]
74
+ if not hasattr(self, "columns_frequency"):
75
+ self.columns_frequency = []
76
+ else:
77
+ self.columns_frequency = [col.upper() for col in self.columns_frequency]
78
+
79
+ # Set experiment-related paths if experiment is available
80
+ if self.experiment:
81
+ self.experiment_dir = self.experiment.path
82
+ self.experiment_id = self.experiment.id
83
+ self.data_dir = f"{self.experiment_dir}/data"
84
+ self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
85
+
86
+ def fit(self, X, y=None):
87
+ """
88
+ Fit the preprocessor (learns PCA components, encoders, etc.).
89
+
90
+ Args:
91
+ X (pd.DataFrame): Input data
92
+ y: Target values (ignored)
93
+
94
+ Returns:
95
+ self: Returns self for chaining
96
+ """
97
+ X, y = self._validate_data(X, y)
98
+
99
+ # Store data and make columns uppercase
100
+ data = X.copy()
101
+ data.columns = data.columns.str.upper()
102
+
103
+ joblib.dump(
104
+ list(data.columns),
105
+ f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
106
+ )
107
+
108
+ # Fit PCA components
109
+ data, self.pcas_ = self.add_pca_features(data)
110
+ data, self.pcas_cross_sectional_ = self.add_pca_feature_cross_sectional(data)
111
+ data, self.pcas_temporal_ = self.add_pca_feature_temporal(data)
112
+
113
+ # Fit encoding transformer
114
+ data, self.transformer_ = self.encode_categorical_features(data)
115
+
116
+ # Save fitted transformers if experiment is available
117
+ if self.experiment:
118
+ joblib.dump(self.pcas_, f"{self.preprocessing_dir}/pcas.pkl")
119
+ joblib.dump(
120
+ self.pcas_cross_sectional_,
121
+ f"{self.preprocessing_dir}/pcas_cross_sectional.pkl",
122
+ )
123
+ joblib.dump(
124
+ self.pcas_temporal_, f"{self.preprocessing_dir}/pcas_temporal.pkl"
125
+ )
126
+ joblib.dump(
127
+ self.transformer_, f"{self.preprocessing_dir}/column_transformer.pkl"
128
+ )
129
+
130
+ # Save features and summary
131
+ joblib.dump(
132
+ list(data.columns),
133
+ f"{self.preprocessing_dir}/all_features_before_selection.pkl",
134
+ )
135
+
136
+ if PYTHON_ENV == "Development":
137
+ joblib.dump(X, f"{self.data_dir}/full.pkl")
138
+
139
+ summary = summarize_dataframe(data)
140
+ summary.to_csv(f"{self.experiment_dir}/feature_summary.csv", index=False)
141
+
142
+ self._set_fitted()
143
+ return self
144
+
145
+ def transform(self, X):
146
+ """
147
+ Transform the input data using fitted components.
148
+
149
+ Args:
150
+ X (pd.DataFrame): Input data
151
+
152
+ Returns:
153
+ pd.DataFrame: Transformed data
154
+ """
155
+ # Allow loading persisted artifacts even in a fresh instance
156
+ if not getattr(self, "is_fitted_", False) and self.experiment:
157
+ if os.path.exists(f"{self.preprocessing_dir}/column_transformer.pkl"):
158
+ self.is_fitted_ = True
159
+
160
+ self._check_is_fitted()
161
+ X, _ = self._validate_data(X, reset=False)
162
+
163
+ # Transform data
164
+ data = X.copy()
165
+ data.columns = data.columns.str.upper()
166
+
167
+ # Load fitted components if not already in memory
168
+ if not hasattr(self, "pcas_") and self.experiment:
169
+ if os.path.exists(f"{self.preprocessing_dir}/pcas.pkl"):
170
+ self.pcas_ = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
171
+
172
+ if not hasattr(self, "pcas_cross_sectional_") and self.experiment:
173
+ if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
174
+ self.pcas_cross_sectional_ = joblib.load(
175
+ f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
176
+ )
177
+
178
+ if not hasattr(self, "pcas_temporal_") and self.experiment:
179
+ if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
180
+ self.pcas_temporal_ = joblib.load(
181
+ f"{self.preprocessing_dir}/pcas_temporal.pkl"
182
+ )
183
+
184
+ if not hasattr(self, "transformer_") and self.experiment:
185
+ if os.path.exists(f"{self.preprocessing_dir}/column_transformer.pkl"):
186
+ self.transformer_ = joblib.load(
187
+ f"{self.preprocessing_dir}/column_transformer.pkl"
188
+ )
189
+
190
+ # Apply PCA transformations using fitted components
191
+ if hasattr(self, "pcas_"):
192
+ data, _ = self.add_pca_features(data, pcas=self.pcas_)
193
+ if hasattr(self, "pcas_cross_sectional_"):
194
+ data, _ = self.add_pca_feature_cross_sectional(
195
+ data, pcas=self.pcas_cross_sectional_
196
+ )
197
+ if hasattr(self, "pcas_temporal_"):
198
+ data, _ = self.add_pca_feature_temporal(data, pcas=self.pcas_temporal_)
199
+
200
+ # Apply encoding using fitted transformer
201
+ if hasattr(self, "transformer_"):
202
+ data, _ = self.encode_categorical_features(
203
+ data, transformer=self.transformer_
204
+ )
205
+
206
+ return data
207
+
208
+ # embedding and pca
209
+ def add_pca_features(
210
+ self, df: pd.DataFrame, n_components: int = 5, pcas=None
211
+ ) -> tuple[pd.DataFrame, dict]:
212
+ """
213
+ Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
214
+ NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
215
+
216
+ Parameters:
217
+ df (pd.DataFrame): Input DataFrame
218
+             n_components (int): Number of PCA components to keep per column (columns come from self.columns_pca)
+             pcas (dict, optional): Fitted PCA objects keyed by column name; if provided, transform only
220
+
221
+ Returns:
222
+ pd.DataFrame: DataFrame with new PCA columns added
223
+ """
224
+ columns: list[str] = self.columns_pca
225
+
226
+ pcas_dict = {}
227
+ for column in columns:
228
+ # Convert text to embeddings if necessary
229
+ if not isinstance(df[column].iloc[0], (np.ndarray, list)):
230
+ sentences = df[column].astype(str).tolist()
231
+ logger.info(
232
+ f"Total sentences to embed for column {column}: {len(sentences)}"
233
+ )
234
+
235
+ # Truncate each sentence
236
+ truncate_sentences = [truncate_text(sentence) for sentence in sentences]
237
+
238
+ # embedding
239
+ embedding_matrix = get_openai_embeddings(truncate_sentences)
240
+ else:
241
+                 logger.info(f"Column {column} already contains embeddings")
242
+ # Stack the vectors into a 2D array
243
+ embedding_matrix = np.vstack(df[column].values)
244
+
245
+ # Apply PCA
246
+ if pcas:
247
+ pca = pcas[column]
248
+ pca_features = pca.transform(embedding_matrix)
249
+ else:
250
+ pca = PCA(n_components=n_components)
251
+ pca_features = pca.fit_transform(embedding_matrix)
252
+
253
+ # Add PCA columns
254
+ for i in range(n_components):
255
+ df[f"{column}_pca_{i+1}"] = pca_features[:, i]
256
+
257
+ # Drop the original column
258
+ df.drop(column, axis=1, inplace=True)
259
+ pcas_dict.update({column: pca})
260
+
261
+ return df, pcas_dict
262
+
263
+ def add_pca_feature_cross_sectional_old(
264
+ self,
265
+ df: pd.DataFrame,
266
+ *,
267
+ n_components: int = 5,
268
+         pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
269
+ impute_strategy: str = "median",
270
+ standardize: bool = True,
271
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
272
+         """
+         Builds a pivot (index=index_col, columns=columns_col, values=value_col),
+         fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
+         (by index_col) back into df. Returns (df_with_features, pipe).
+         """
277
+
278
+ pcas_dict = {}
279
+ index_saved = df.index
280
+
281
+ for pca_cross_sectional in self.pca_cross_sectional:
282
+ name, index_col, columns_col, value_col = (
283
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
284
+ )
285
+ prefix = f"CS_PC_{name}"
286
+
287
+ pivot = df.pivot_table(
288
+ index=index_col, columns=columns_col, values=value_col
289
+ ).sort_index()
290
+
291
+             # Pipeline reused between train and test
292
+ if pcas is None:
293
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
294
+ if standardize:
295
+ steps.append(
296
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
297
+ )
298
+ pca = PCA(n_components=n_components, random_state=0)
299
+ steps.append(("pca", pca))
300
+ pipe = Pipeline(steps)
301
+                 pipe.fit(pivot)  # <- fit on TRAIN only
302
+ else:
303
+                 pipe = pcas[name]  # <- TEST: reuse the existing pipe
304
+
305
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
306
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
307
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
308
+
309
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
310
+ df.index = index_saved
311
+ pcas_dict.update({name: pipe})
312
+
313
+ return df, pcas_dict
314
+
315
+ def add_pca_feature_cross_sectional(
316
+ self,
317
+ df: pd.DataFrame,
318
+ *,
319
+ n_components: int = 5,
320
+         pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
321
+ impute_strategy: str = "median",
322
+ standardize: bool = True,
323
+         lookback_days: int = 365,  # number of days to look back for the fit
324
+         refresh_frequency: int = 90,  # refresh the PCA every X days
325
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
326
+         """
+         Builds a pivot (index=index_col, columns=columns_col, values=value_col),
+         fits (or reuses) an Imputer(+Scaler)+PCA Pipeline, then merges the scores
+         (by index_col) back into df. Returns (df_with_features, pipe).
+
+         For time series: fits the PCA on past data only to avoid leakage,
+         with periodic refreshes.
+
+         Handles panel data with several time series sharing the same dates
+         (e.g. several stocks).
+         """
337
+
338
+ pcas_dict = {}
339
+ index_saved = df.index
340
+
341
+ for pca_cross_sectional in self.pca_cross_sectional:
342
+ name, index_col, columns_col, value_col = (
343
+ pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
344
+ )
345
+ prefix = f"CS_PC_{name}"
346
+
347
+             # Check whether this is a time series indexed by date
+             # Dates are already ordinal after cyclic_encode_date
349
+ is_time_series = self.time_series and index_col == self.date_column
350
+
351
+ if is_time_series:
352
+                 # Special case: cross-sectional PCA on panel time-series data,
+                 # e.g. PCA on the returns of all stocks at each date
+                 # to capture the market regime
355
+
356
+ all_scores = []
357
+
358
+                 # Dates are already ordinal
359
+ unique_dates = sorted(df[index_col].unique())
360
+
361
+                 # For inference, use the provided PCA
362
+ if pcas is not None:
363
+ pipe = pcas[name]
364
+ pivot = df.pivot_table(
365
+ index=index_col, columns=columns_col, values=value_col
366
+ ).sort_index()
367
+ scores = pipe.transform(pivot)
368
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
369
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
370
+ else:
371
+                     # Training: fit the PCA on an expanding basis with periodic refreshes
372
+ pipe = None
373
+ last_fit_date = None
374
+
375
+ for i, current_date_ordinal in enumerate(unique_dates):
376
+                         # Convert the ordinal to a date for time arithmetic
377
+ current_date = pd.Timestamp.fromordinal(
378
+ int(current_date_ordinal)
379
+ )
380
+
381
+                         # Decide whether the PCA should be refitted
+                         should_refit = pipe is None or (  # first time
383
+ last_fit_date is not None
384
+ and (current_date - last_fit_date).days >= refresh_frequency
385
+ )
386
+
387
+ if (
388
+ should_refit and i > 30
389
+                         ):  # wait for at least 30 days of data
+                             # Take the data from the last 'lookback_days' days
391
+ lookback_start_date = current_date - pd.Timedelta(
392
+ days=lookback_days
393
+ )
394
+ lookback_start_ordinal = pd.Timestamp.toordinal(
395
+ lookback_start_date
396
+ )
397
+
398
+                             # Mask on past dates only (to avoid leakage)
399
+ mask_fit = (df[index_col] >= lookback_start_ordinal) & (
400
+ df[index_col] < current_date_ordinal
401
+ )
402
+ df_fit = df[mask_fit]
403
+
404
+ if len(df_fit) > 0:
405
+                                 # Build the pivot for the lookback period
406
+ pivot_fit = df_fit.pivot_table(
407
+ index=index_col,
408
+ columns=columns_col,
409
+ values=value_col,
410
+ ).sort_index()
411
+
412
+                                 # Check that there are enough dates and columns
413
+ if (
414
+ len(pivot_fit) >= n_components
415
+ and pivot_fit.shape[1] >= n_components
416
+ ):
417
+                                     # Create a new pipeline
418
+ steps = [
419
+ (
420
+ "imputer",
421
+ SimpleImputer(strategy=impute_strategy),
422
+ )
423
+ ]
424
+ if standardize:
425
+ steps.append(
426
+ (
427
+ "scaler",
428
+ StandardScaler(
429
+ with_mean=True, with_std=True
430
+ ),
431
+ )
432
+ )
433
+ pca = PCA(n_components=n_components, random_state=0)
434
+ steps.append(("pca", pca))
435
+ pipe = Pipeline(steps)
436
+ pipe.fit(pivot_fit)
437
+ last_fit_date = current_date
438
+
439
+ logger.debug(
440
+ f"PCA {name} refitted at date {current_date.strftime('%Y-%m-%d')} "
441
+ f"using {len(pivot_fit)} dates and {pivot_fit.shape[1]} columns"
442
+ )
443
+
444
+                         # Transform the current date only
445
+ if pipe is not None:
446
+ df_current = df[df[index_col] == current_date_ordinal]
447
+ if len(df_current) > 0:
448
+ pivot_current = df_current.pivot_table(
449
+ index=index_col,
450
+ columns=columns_col,
451
+ values=value_col,
452
+ )
453
+ try:
454
+ scores_current = pipe.transform(pivot_current)
455
+ scores_dict = {
456
+ index_col: [current_date_ordinal],
457
+ **{
458
+ f"{prefix}_{j}": [scores_current[0, j]]
459
+ for j in range(n_components)
460
+ },
461
+ }
462
+ all_scores.append(pd.DataFrame(scores_dict))
463
+ except Exception as e:
464
+                                     # On error (e.g. unseen columns), fill with missing values
465
+ logger.debug(
466
+ f"PCA transform error at date {current_date}: {str(e)}"
467
+ )
468
+ scores_dict = {
469
+ index_col: [current_date_ordinal],
470
+ **{
471
+ f"{prefix}_{j}": [np.nan]
472
+ for j in range(n_components)
473
+ },
474
+ }
475
+ all_scores.append(pd.DataFrame(scores_dict))
476
+ else:
477
+                                 # No PCA fitted yet, create NaNs
478
+ scores_dict = {
479
+ index_col: [current_date_ordinal],
480
+ **{
481
+ f"{prefix}_{j}": [np.nan]
482
+ for j in range(n_components)
483
+ },
484
+ }
485
+ all_scores.append(pd.DataFrame(scores_dict))
486
+
487
+                 # Combine all the scores
488
+ if all_scores:
489
+ scores_df = pd.concat(all_scores, ignore_index=True)
490
+ else:
491
+                     # Create an empty DataFrame with the expected columns
492
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
493
+ scores_df = pd.DataFrame(columns=[index_col] + cols)
494
+
495
+                 # Merge the scores
496
+ df = df.merge(scores_df, on=index_col, how="left")
497
+ df.index = index_saved
498
+
499
+                 # Forward fill, then 0, to avoid NaNs
500
+ pca_cols = [col for col in df.columns if col.startswith(prefix)]
501
+                 df[pca_cols] = df[pca_cols].ffill().fillna(0)
502
+
503
+ pcas_dict.update({name: pipe})
504
+
505
+ else:
506
+                 # Classic approach (not a time series, or index != date)
507
+ pivot = df.pivot_table(
508
+ index=index_col, columns=columns_col, values=value_col
509
+ ).sort_index()
510
+
511
+                 # Pipeline reused between train and test
512
+ if pcas is None:
513
+ steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
514
+ if standardize:
515
+ steps.append(
516
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
517
+ )
518
+ pca = PCA(n_components=n_components, random_state=0)
519
+ steps.append(("pca", pca))
520
+ pipe = Pipeline(steps)
521
+                     pipe.fit(pivot)  # <- fit on TRAIN only
522
+ else:
523
+                     pipe = pcas[name]  # <- TEST: reuse the existing pipe
524
+
525
+ scores = pipe.transform(pivot) # shape: (n_index, n_components)
526
+ cols = [f"{prefix}_{i}" for i in range(n_components)]
527
+ scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
528
+
529
+ df = df.merge(scores_df.reset_index(), on=index_col, how="left")
530
+ df.index = index_saved
531
+ pcas_dict.update({name: pipe})
532
+
533
+ return df, pcas_dict
534
+
535
+     # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
536
+ def add_pca_feature_temporal_old(
537
+ self,
538
+ df: pd.DataFrame,
539
+ *,
540
+ n_components: int = 5,
541
+         pcas: dict[str, Pipeline] | None = None,  # if provided: transform only
542
+ impute_strategy: (
543
+ str | None
544
+         ) = None,  # None = require all lag columns to be present
545
+ standardize: bool = True,
546
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
547
+         """
+         Applies PCA to a matrix (rows = df rows, cols = lags).
+         Fits the Pipeline on TRAIN if pcas=None; otherwise uses pcas and only transforms.
+         Adds the f"{prefix}_{i}" columns to df. Returns (df, pipe).
+         """
552
+ pcas_dict = {}
553
+
554
+ for pca_temporal in self.pca_temporal:
555
+ name, cols = (pca_temporal[k] for k in ("name", "columns"))
556
+ prefix = f"TMP_PC_{name}"
557
+
558
+             # Mask of usable rows
559
+ if impute_strategy is None:
560
+ mask = (
561
+ df[cols].notna().all(axis=1)
562
+                 )  # no imputation → only complete rows
563
+ X_fit = df.loc[mask, cols]
564
+ else:
565
+                 mask = df[cols].notna().any(axis=1)  # will impute → at least one value
566
+ X_fit = df.loc[mask, cols]
567
+
568
+ # Pipeline
569
+ if pcas is None:
570
+ steps = []
571
+ if impute_strategy is not None:
572
+ steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
573
+ if standardize:
574
+ steps.append(
575
+ ("scaler", StandardScaler(with_mean=True, with_std=True))
576
+ )
577
+ pca = PCA(n_components=n_components, random_state=0)
578
+ steps.append(("pca", pca))
579
+ pipe = Pipeline(steps)
580
+ if not X_fit.empty:
581
+                     pipe.fit(X_fit)  # <- fit on TRAIN only
582
+ else:
583
+ pipe = pcas[name] # <- TEST
584
+
585
+             # Transform only on valid rows (mask)
586
+ if not df.loc[mask, cols].empty:
587
+ Z = pipe.transform(df.loc[mask, cols])
588
+ for i in range(n_components):
589
+ df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
590
+ else:
591
+                 # create empty columns if no valid rows (schema consistency)
592
+ for i in range(n_components):
593
+ df[f"{prefix}_{i}"] = pd.NA
594
+
595
+ pcas_dict.update({name: pipe})
596
+
597
+ return df, pcas_dict
598
+
599
+ def add_pca_feature_temporal(
600
+ self,
601
+ df: pd.DataFrame,
602
+ *,
603
+ n_components: int = 5,
604
+ pcas: dict[str, Pipeline] | None = None,
605
+ impute_strategy: str = "median",
606
+ standardize: bool = True,
607
+ lookback_days: int = 365,
608
+ refresh_frequency: int = 90,
609
+ ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
610
+ """
611
+         Temporal PCA for time series, with panel-data support.
+         Automatically creates the lag columns and avoids look-ahead bias.
+
+         Simplified pca_temporal format:
615
+ [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
616
+ """
617
+ pcas_dict = {}
618
+
619
+ for pca_config in self.pca_temporal:
620
+ # Support both old and new format
621
+ if "columns" in pca_config:
622
+ # Old format: use existing columns
623
+ name = pca_config["name"]
624
+ lag_columns = pca_config["columns"]
625
+ base_column = None
626
+ num_lags = len(lag_columns)
627
+ else:
628
+ # New format: create lag columns
629
+ name = pca_config["name"]
630
+ base_column = pca_config["column"].upper()
631
+ num_lags = pca_config.get("lags", 20)
632
+
633
+ # Create lag columns if they don't exist
634
+ if self.group_column:
635
+ # Panel data: create lags by group
636
+ for lag in range(1, num_lags + 1):
637
+ lag_col = f"{base_column}_-{lag}"
638
+ if lag_col not in df.columns:
639
+ df[lag_col] = df.groupby(self.group_column)[
640
+ base_column
641
+ ].shift(lag)
642
+ else:
643
+ # Simple time series
644
+ for lag in range(1, num_lags + 1):
645
+ lag_col = f"{base_column}_-{lag}"
646
+ if lag_col not in df.columns:
647
+ df[lag_col] = df[base_column].shift(lag)
648
+
649
+ lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
650
+
651
+ prefix = f"TMP_PC_{name}"
652
+
653
+ # For time series: avoid look-ahead bias
654
+ if self.time_series and self.date_column:
655
+ all_scores = []
656
+ unique_dates = sorted(df[self.date_column].unique())
657
+
658
+ if pcas is not None:
659
+ # transform: use provided PCA
660
+ pipe = pcas[name]
661
+
662
+ # Apply to all data at once
663
+ mask = df[lag_columns].notna().all(axis=1)
664
+ if mask.any():
665
+ X_transform = df.loc[mask, lag_columns]
666
+ scores = pipe.transform(X_transform)
667
+
668
+ for i in range(n_components):
669
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
670
+
671
+ # Fill NaN with forward fill then 0
672
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
673
+                     df[pca_cols] = df[pca_cols].ffill().fillna(0)
674
+
675
+ else:
676
+ # Training: expanding window with periodic refresh
677
+ pipe = None
678
+ last_fit_date = None
679
+
680
+ for current_date_ordinal in unique_dates:
681
+ current_date = pd.Timestamp.fromordinal(
682
+ int(current_date_ordinal)
683
+ )
684
+
685
+ # Determine if we should refit
686
+ should_refit = pipe is None or (
687
+ last_fit_date is not None
688
+ and (current_date - last_fit_date).days >= refresh_frequency
689
+ )
690
+
691
+ if (
692
+ should_refit
693
+ and len(df[df[self.date_column] < current_date_ordinal])
694
+ > num_lags * 2
695
+ ):
696
+ # Get historical data for fitting
697
+ lookback_start = current_date - pd.Timedelta(
698
+ days=lookback_days
699
+ )
700
+ lookback_start_ordinal = pd.Timestamp.toordinal(
701
+ lookback_start
702
+ )
703
+
704
+ mask_fit = (
705
+ (df[self.date_column] >= lookback_start_ordinal)
706
+ & (df[self.date_column] < current_date_ordinal)
707
+ & df[lag_columns].notna().all(axis=1)
708
+ )
709
+
710
+ if mask_fit.sum() >= n_components:
711
+ X_fit = df.loc[mask_fit, lag_columns]
712
+
713
+ # Create pipeline
714
+ steps = []
715
+ if impute_strategy is not None:
716
+ steps.append(
717
+ (
718
+ "imputer",
719
+ SimpleImputer(strategy=impute_strategy),
720
+ )
721
+ )
722
+ if standardize:
723
+ steps.append(("scaler", StandardScaler()))
724
+ steps.append(
725
+ (
726
+ "pca",
727
+ PCA(n_components=n_components, random_state=0),
728
+ )
729
+ )
730
+
731
+ pipe = Pipeline(steps)
732
+ pipe.fit(X_fit)
733
+ last_fit_date = current_date
734
+
735
+ logger.debug(
736
+ f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
737
+ f"using {len(X_fit)} samples"
738
+ )
739
+
740
+ # Transform current date data
741
+ if pipe is not None:
742
+ mask_current = (
743
+ df[self.date_column] == current_date_ordinal
744
+ ) & df[lag_columns].notna().all(axis=1)
745
+
746
+ if mask_current.any():
747
+ X_current = df.loc[mask_current, lag_columns]
748
+ scores = pipe.transform(X_current)
749
+
750
+ for i in range(n_components):
751
+ df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
752
+
753
+ # Fill NaN with forward fill then 0
754
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
755
+ for col in pca_cols:
756
+ if col not in df.columns:
757
+ df[col] = 0
758
+                 df[pca_cols] = df[pca_cols].ffill().fillna(0)
759
+
760
+ pcas_dict[name] = pipe
761
+
762
+ else:
763
+ # Non time-series: use original approach
764
+ mask = df[lag_columns].notna().all(axis=1)
765
+
766
+ if pcas is None and mask.any():
767
+ X_fit = df.loc[mask, lag_columns]
768
+
769
+ steps = []
770
+ if impute_strategy is not None:
771
+ steps.append(
772
+ ("imputer", SimpleImputer(strategy=impute_strategy))
773
+ )
774
+ if standardize:
775
+ steps.append(("scaler", StandardScaler()))
776
+ steps.append(
777
+ ("pca", PCA(n_components=n_components, random_state=0))
778
+ )
779
+
780
+ pipe = Pipeline(steps)
781
+ pipe.fit(X_fit)
782
+ pcas_dict[name] = pipe
783
+ elif pcas is not None:
784
+ pipe = pcas[name]
785
+ pcas_dict[name] = pipe
786
+ else:
787
+ continue
788
+
789
+ if mask.any():
790
+ X_transform = df.loc[mask, lag_columns]
791
+ scores = pipe.transform(X_transform)
792
+
793
+ for i in range(n_components):
794
+ df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
795
+
796
+ # Fill missing values
797
+ pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
798
+ for col in pca_cols:
799
+ if col not in df.columns:
800
+ df[col] = 0
801
+ df[pca_cols] = df[pca_cols].fillna(0)
802
+
803
+ return df, pcas_dict
804
+
805
+ # encoding categorical features
806
+ def encode_categorical_features(
807
+ self,
808
+ df: pd.DataFrame,
809
+ transformer: ColumnTransformer | None = None,
810
+ ) -> tuple[pd.DataFrame, ColumnTransformer]:
811
+ """
812
+ Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
813
+
814
+ Parameters:
815
+ df (pd.DataFrame): Input DataFrame
816
+ columns_onehot (list[str]) Creates one binary column per category forLow-cardinality categorical features
817
+             columns_onehot (list[str]): one binary column per category; for low-cardinality categorical features
+             columns_binary (list[str]): encodes categories as binary digits split across columns; for mid-to-high cardinality (e.g. 10–100 unique values)
+             columns_ordinal (list[str]): assigns integer ranks to categories; for when order matters (e.g. low < medium < high)
+             columns_frequency (list[str]): replaces each category with its frequency, normalized to a proportion; for high-cardinality features where frequency is informative
821
+
822
+ Returns:
823
+ tuple: (transformed DataFrame, ColumnTransformer)
824
+ """
825
+ columns_onehot: list[str] = self.columns_onehot
826
+ columns_binary: list[str] = self.columns_binary
827
+ columns_ordinal: list[str] = self.columns_ordinal
828
+ columns_frequency: list[str] = self.columns_frequency
829
+
830
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
831
+ y = df.loc[:, df.columns.str.contains("^TARGET_")]
832
+ save_in_db = False
833
+
834
+ all_columns = (
835
+ columns_onehot + columns_binary + columns_ordinal + columns_frequency
836
+ )
837
+
838
+ if transformer:
839
+ transformed = transformer.transform(X)
840
+ else:
841
+ transformer = ColumnTransformer(
842
+ transformers=[
843
+ (
844
+ "onehot",
845
+ OneHotEncoder(handle_unknown="ignore", sparse_output=False),
846
+ columns_onehot,
847
+ ),
848
+ (
849
+ "ordinal",
850
+ OrdinalEncoder(
851
+ handle_unknown="use_encoded_value", unknown_value=-1
852
+ ),
853
+ columns_ordinal,
854
+ ),
855
+ ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
856
+ ("freq", CountEncoder(normalize=True), columns_frequency),
857
+ ],
858
+ remainder="passthrough",
859
+ )
860
+ transformed = transformer.fit_transform(X)
861
+ save_in_db = True
862
+
863
+ # Build output column names
864
+ column_names = []
865
+
866
+ if columns_onehot:
867
+ column_names.extend(
868
+ transformer.named_transformers_["onehot"]
869
+ .get_feature_names_out(columns_onehot)
870
+ .tolist()
871
+ )
872
+
873
+ if columns_ordinal:
874
+ column_names.extend(columns_ordinal)
875
+
876
+ if columns_binary:
877
+ column_names.extend(
878
+ transformer.named_transformers_["binary"]
879
+ .get_feature_names_out(columns_binary)
880
+ .tolist()
881
+ )
882
+
883
+ if columns_frequency:
884
+ column_names.extend(columns_frequency)
885
+
886
+ # Add passthrough (non-encoded) columns
887
+ passthrough_columns = [col for col in X.columns if col not in all_columns]
888
+ column_names.extend(passthrough_columns)
889
+
890
+ X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
891
+
892
+ # Try to convert columns to best possible dtypes
893
+ X_transformed = X_transformed.convert_dtypes()
894
+
895
+ # Insert features in db
896
+ if save_in_db:
897
+ # Get feature types from transformed data
898
+ categorical_features, numerical_features = get_features_by_types(
899
+ X_transformed
900
+ )
901
+
902
+ # Get column names from DataFrames
903
+ cat_feature_names = categorical_features.columns.tolist()
904
+ num_feature_names = numerical_features.columns.tolist()
905
+
906
+ # Combine all feature names and their types
907
+ all_feature_names = cat_feature_names + num_feature_names
908
+ all_feature_types = ["categorical"] * len(cat_feature_names) + [
909
+ "numerical"
910
+ ] * len(num_feature_names)
911
+
912
+ # Upsert features in bulk if we have any features
913
+ if all_feature_names:
914
+ Feature.bulk_upsert(
915
+ name=all_feature_names,
916
+ type=all_feature_types,
917
+ )
918
+
919
+ # Upsert targets in bulk
920
+ target_names = y.columns.tolist()
921
+ target_types = [
922
+ (
923
+ "classification"
924
+ if int(target.split("_")[1]) in self.target_clf
925
+ else "regression"
926
+ )
927
+ for target in target_names
928
+ ]
929
+
930
+ Target.bulk_upsert(name=target_names, type=target_types)
931
+
932
+ # Get all the upserted objects
933
+ targets = Target.filter(name__in=target_names)
934
+
935
+ # Update experiment with targets
936
+ experiment = Experiment.get(self.experiment_id)
937
+ if experiment:
938
+ experiment.targets = targets
939
+ experiment.save()
940
+
941
+ return pd.concat([X_transformed, y], axis=1), transformer
942
+
943
+
944
+ # utils
945
+ def summarize_dataframe(
946
+ df: pd.DataFrame, sample_categorical_threshold: int = 15
947
+ ) -> pd.DataFrame:
948
+ summary = []
949
+
950
+ def is_hashable_series(series: pd.Series) -> bool:
951
+ try:
952
+ _ = series.dropna().unique()
953
+ return True
954
+ except TypeError:
955
+ return False
956
+
957
+ df = convert_object_columns_that_are_numeric(df)
958
+ df = df.convert_dtypes()
959
+
960
+ for col in df.columns:
961
+ total_missing = df[col].isna().sum()
962
+ col_data = df[col].dropna()
963
+ dtype = col_data.dtype
964
+
965
+ if col_data.empty:
966
+ summary.append(
967
+ {
968
+ "Column": col,
969
+ "Dtype": dtype,
970
+ "Type": "unknown",
971
+ "Detail": "No non-null values",
972
+ "Missing": total_missing,
973
+ }
974
+ )
975
+ continue
976
+
977
+ # Case 1: Numeric columns
978
+ if pd.api.types.is_numeric_dtype(col_data):
979
+ unique_vals = col_data.nunique()
980
+
981
+ if set(col_data.unique()).issubset({0, 1}):
982
+ col_type = "binary-categorical"
983
+ detail = "0/1 values only"
984
+ elif (
985
+ pd.api.types.is_integer_dtype(col_data)
986
+ and unique_vals <= sample_categorical_threshold
987
+ ):
988
+ col_type = "multi-categorical"
989
+ top_vals = col_data.value_counts().head(10)
990
+ detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
991
+ else:
992
+ col_type = "numeric"
993
+ q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
994
+ detail = (
995
+ f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
996
+ f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
997
+ )
998
+
999
+ # Case 2: Object or other hashable columns
1000
+ elif is_hashable_series(col_data):
1001
+ unique_vals = col_data.nunique()
1002
+ if unique_vals <= sample_categorical_threshold:
1003
+ col_type = "object-categorical"
1004
+ top_vals = col_data.value_counts().head(10)
1005
+ detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
1006
+ else:
1007
+ col_type = "high-cardinality-categorical"
1008
+ detail = f"{unique_vals} unique values"
1009
+
1010
+ # Case 3: Unusable columns
1011
+ else:
1012
+ col_type = "non-hashable"
1013
+ detail = f"Non-hashable type: {type(col_data.iloc[0])}"
1014
+
1015
+ summary.append(
1016
+ {
1017
+ "Column": col,
1018
+ "Dtype": dtype,
1019
+ "Type": col_type,
1020
+ "Detail": detail,
1021
+ "Missing": total_missing,
1022
+ }
1023
+ )
1024
+
1025
+ return pd.DataFrame(summary)
1026
+
1027
+
1028
+ # Utility functions for data splitting
1029
+ # ===================================
1030
+
1031
+
1032
+ def split_data(
1033
+ data,
1034
+ experiment=None,
1035
+ time_series=None,
1036
+ date_column=None,
1037
+ group_column=None,
1038
+ val_size=None,
1039
+ test_size=None,
1040
+ target_numbers=None,
1041
+ target_clf=None,
1042
+ experiment_id=None,
1043
+ ):
1044
+ """
1045
+ Utility function to split data into train, validation, and test sets.
1046
+
1047
+ Args:
1048
+ data (pd.DataFrame): Input data to split
1049
+ experiment: LeCrapaud experiment instance (preferred - extracts all params automatically)
1050
+ time_series (bool): Whether to use time series splitting (overrides experiment)
1051
+ date_column (str): Date column for time series splitting (overrides experiment)
1052
+ group_column (str): Group column for time series splitting (overrides experiment)
1053
+ val_size (float): Validation set size (0.0-1.0) (overrides experiment)
1054
+ test_size (float): Test set size (0.0-1.0) (overrides experiment)
1055
+ target_numbers (list): List of target numbers for stratification (overrides experiment)
1056
+ target_clf (list): List of classification target numbers (overrides experiment)
1057
+ experiment_id (int): Optional experiment ID to update sizes in database
1058
+
1059
+ Returns:
1060
+ tuple: (train, val, test) DataFrames
1061
+ """
1062
+ # Extract parameters from experiment if provided
1063
+ if experiment is not None:
1064
+ # Check if it's a BaseExperiment or just the experiment database object
1065
+ if hasattr(experiment, "context") and experiment.context:
1066
+ # It's a database experiment object with context
1067
+ context = experiment.context
1068
+ if time_series is None:
1069
+ time_series = context.get("time_series", False)
1070
+ if date_column is None:
1071
+ date_column = context.get("date_column")
1072
+ if group_column is None:
1073
+ group_column = context.get("group_column")
1074
+ if val_size is None:
1075
+ val_size = context.get("val_size", 0.2)
1076
+ if test_size is None:
1077
+ test_size = context.get("test_size", 0.2)
1078
+ if target_numbers is None:
1079
+ target_numbers = context.get("target_numbers", [])
1080
+ if target_clf is None:
1081
+ target_clf = context.get("target_clf", [])
1082
+ if experiment_id is None:
1083
+ experiment_id = experiment.id
1084
+
1085
+ # Set defaults if still None
1086
+ if time_series is None:
1087
+ time_series = False
1088
+ if val_size is None:
1089
+ val_size = 0.2
1090
+ if test_size is None:
1091
+ test_size = 0.2
1092
+ if target_numbers is None:
1093
+ target_numbers = []
1094
+ if target_clf is None:
1095
+ target_clf = []
1096
+
1097
+ dates = {}
1098
+ if time_series:
1099
+ (train, val, test), dates = _split_time_series(
1100
+ data, date_column, group_column, val_size, test_size
1101
+ )
1102
+ else:
1103
+ # Use first target for stratification if it's a classification target
1104
+ stratify_col = None
1105
+ if target_numbers and target_clf and target_numbers[0] in target_clf:
1106
+ stratify_col = f"TARGET_{target_numbers[0]}"
1107
+ train, val, test = _split_standard(data, val_size, test_size, stratify_col)
1108
+
1109
+ # Update experiment with sizes if experiment_id provided
1110
+ if experiment_id:
1111
+ Experiment.update(
1112
+ id=experiment_id,
1113
+ train_size=len(train),
1114
+ val_size=len(val),
1115
+ test_size=len(test),
1116
+ **dates,
1117
+ )
1118
+
1119
+ return train, val, test
1120
+
1121
+
1122
+ def _split_time_series(data, date_column, group_column, val_size, test_size):
1123
+ """Time series splitting preserving temporal order."""
1124
+ if not date_column:
1125
+ raise ValueError("Please specify a date_column for time series")
1126
+
1127
+ df = data.copy()
1128
+ if group_column:
1129
+ df.sort_values([date_column, group_column], inplace=True)
1130
+ else:
1131
+ df.sort_values(date_column, inplace=True)
1132
+
1133
+ dates = df[date_column].unique()
1134
+
1135
+ val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
1136
+ test_first_id = int(len(dates) * (1 - test_size)) + 1
1137
+
1138
+ train = df[df[date_column].isin(dates[:val_first_id])]
1139
+ val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
1140
+ test = df[df[date_column].isin(dates[test_first_id:])]
1141
+
1142
+ dates = {}
1143
+ for name, data in zip(["train", "val", "test"], [train, val, test]):
1144
+ dates[f"{name}_start_date"] = (
1145
+ data[date_column].map(pd.Timestamp.fromordinal).iat[0]
1146
+ )
1147
+ dates[f"{name}_end_date"] = (
1148
+ data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
1149
+ )
1150
+
1151
+ logger.info(
1152
+ f"{data.shape} {name} data from {dates[f'{name}_start_date'].strftime('%d/%m/%Y')} to {dates[f'{name}_end_date'].strftime('%d/%m/%Y')}"
1153
+ )
1154
+
1155
+ return (
1156
+ train.reset_index(drop=True),
1157
+ val.reset_index(drop=True),
1158
+ test.reset_index(drop=True),
1159
+ ), dates
1160
+
1161
+
1162
+ def _split_standard(data, val_size, test_size, stratify_col=None, random_state=42):
1163
+ """Standard random splitting with optional stratification."""
1164
+ from sklearn.model_selection import train_test_split
1165
+
1166
+ df = data.copy()
1167
+
1168
+ stratify_vals = (
1169
+ df[stratify_col] if stratify_col and stratify_col in df.columns else None
1170
+ )
1171
+
1172
+ # First split: train + (val + test)
1173
+ train, temp = train_test_split(
1174
+ df,
1175
+ test_size=val_size + test_size,
1176
+ random_state=random_state,
1177
+ stratify=stratify_vals,
1178
+ )
1179
+
1180
+ # Adjust stratify target for val/test split
1181
+ stratify_temp = (
1182
+ temp[stratify_col] if stratify_col and stratify_col in df.columns else None
1183
+ )
1184
+
1185
+ # Compute val and test sizes relative to temp
1186
+ val_ratio = val_size / (val_size + test_size)
1187
+
1188
+ val, test = train_test_split(
1189
+ temp,
1190
+ test_size=1 - val_ratio,
1191
+ random_state=random_state,
1192
+ stratify=stratify_temp,
1193
+ )
1194
+
1195
+ for name, data in zip(["train", "val", "test"], [train, val, test]):
1196
+ logger.info(f"{data.shape} {name} data")
1197
+
1198
+ return (
1199
+ train.reset_index(drop=True),
1200
+ val.reset_index(drop=True),
1201
+ test.reset_index(drop=True),
1202
+ )
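
For reference, the two-stage split in _split_standard works out as follows: with the defaults val_size=0.2 and test_size=0.2, the first split holds out 40% of the rows, then val_ratio = 0.2 / (0.2 + 0.2) = 0.5, so the held-out portion is divided evenly between validation and test. A small standalone sketch of split_data (the DataFrame and target settings are illustrative, not taken from the package):

  import numpy as np
  import pandas as pd
  from lecrapaud.feature_preprocessing import split_data

  # Toy frame with one classification target; no experiment object, so sizes are passed explicitly
  df = pd.DataFrame({
      "FEATURE_A": np.random.rand(100),
      "TARGET_1": np.random.randint(0, 2, size=100),
  })
  train, val, test = split_data(
      df, val_size=0.2, test_size=0.2, target_numbers=[1], target_clf=[1]
  )
  print(len(train), len(val), len(test))  # roughly 60 / 20 / 20, stratified on TARGET_1
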