lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/feature_engineering.py
CHANGED
@@ -47,30 +47,17 @@ Development
 import pandas as pd
 import numpy as np
 from itertools import product
-import joblib
-import os
-
-from sklearn.compose import ColumnTransformer
-from sklearn.decomposition import PCA
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler
-from sklearn.pipeline import Pipeline
-from category_encoders import BinaryEncoder, CountEncoder
-from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
-from sklearn.model_selection import train_test_split
 
 from lecrapaud.integrations.openai_integration import (
     truncate_text,
     get_openai_embeddings,
 )
-from lecrapaud.feature_selection import get_features_by_types
 from lecrapaud.utils import logger
-from lecrapaud.
-from lecrapaud.config import PYTHON_ENV
+from lecrapaud.mixins import LeCrapaudEstimatorMixin
 
 
 # main function
-class FeatureEngineeringEngine:
+class FeatureEngineering(LeCrapaudEstimatorMixin):
     """
     Feature engineering pipeline
 
@@ -86,24 +73,39 @@ class FeatureEngineeringEngine:
 
     def __init__(
         self,
-
-        columns_drop: list[str] = [],
-        columns_boolean: list[str] = [],
-        columns_date: list[str] = [],
-        columns_te_groupby: list[str] = [],
-        columns_te_target: list[str] = [],
+        experiment=None,
         for_training: bool = True,
         **kwargs,
     ):
-
-
-
-
-
-        self
-
-
-
+        # The mixin will automatically set all experiment.context parameters as attributes
+        # and kwargs will override them if provided
+        super().__init__(experiment=experiment, for_training=for_training, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, 'columns_drop'):
+            self.columns_drop = []
+        if not hasattr(self, 'columns_boolean'):
+            self.columns_boolean = []
+        if not hasattr(self, 'columns_date'):
+            self.columns_date = []
+        if not hasattr(self, 'columns_te_groupby'):
+            self.columns_te_groupby = []
+        if not hasattr(self, 'columns_te_target'):
+            self.columns_te_target = []
+
+    def fit(self, X, y=None):
+        """
+        Fit the feature engineering estimator.
+
+        Args:
+            X (pd.DataFrame): Input data
+            y: Target values (ignored)
+
+        Returns:
+            Transformed data (for compatibility with existing code)
+        """
+        self.data = X.copy()
+
         # drop columns
         self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
 
@@ -126,6 +128,17 @@ class FeatureEngineeringEngine:
         # Cyclic encode dates
         self.data = self.cyclic_encode_date()
 
+        self._set_fitted()
+        return self
+
+    def get_data(self):
+        """
+        Get the transformed data after feature engineering.
+
+        Returns:
+            pd.DataFrame: The transformed data with engineered features
+        """
+        self._check_is_fitted()
         return self.data
 
     def cyclic_encode_date(self) -> pd.DataFrame:
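The hunks above replace the old constructor-driven setup with an estimator-style API: the LeCrapaudEstimatorMixin copies parameters from experiment.context onto the instance, keyword arguments override them, and fit()/get_data() gate access to the engineered frame. A minimal usage sketch under those assumptions; the column names and the experiment-less construction are illustrative, not documented package behaviour:

import pandas as pd

from lecrapaud.feature_engineering import FeatureEngineering

# Toy input; the column names are purely illustrative.
train_df = pd.DataFrame(
    {"RAW_ID": [1, 2], "IS_ACTIVE": [True, False], "PRICE": [9.5, 7.2]}
)

# Assumption: constructing without an Experiment is allowed, since the new
# signature defaults experiment to None; kwargs override experiment.context.
fe = FeatureEngineering(columns_drop=["RAW_ID"], columns_boolean=["IS_ACTIVE"])
fe.fit(train_df)            # copies X, runs the engineering steps, marks the estimator fitted
engineered = fe.get_data()  # only valid after fit(); returns the engineered DataFrame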
@@ -220,7 +233,7 @@ class FeatureEngineeringEngine:
         Returns:
             pd.DataFrame: Original dataframe with new encoded columns added
         """
-        # TODO: target encoding needs to be fit / transform based at
+        # TODO: target encoding needs to be fit / transform based at transform time.
         df: pd.DataFrame = self.data
         columns_te_groupby: list[list[str]] = self.columns_te_groupby
         columns_te_target: list[str] = self.columns_te_target
@@ -299,7 +312,7 @@ class FeatureEngineeringEngine:
         non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]
 
         logger.warning(
-            f"Missing values found in
+            f"Missing values found in transform data."
             f"Filling with 0 for numeric columns: {numeric_cols}, "
             f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
         )
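The reworded warning above describes the fallback applied when data seen at transform time still contains missing values. A minimal sketch of that fill rule, assuming it mirrors what the surrounding (unshown) code does:

import pandas as pd

df = pd.DataFrame({"PRICE": [1.0, None], "SECTOR": ["tech", None]})
numeric_cols = df.select_dtypes(include="number").columns
non_numeric_cols = df.columns.difference(numeric_cols)

# Numeric gaps become 0, everything else becomes the string "unknown",
# matching the behaviour announced by the warning message.
df[numeric_cols] = df[numeric_cols].fillna(0)
df[non_numeric_cols] = df[non_numeric_cols].fillna("unknown")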
@@ -310,649 +323,7 @@ class FeatureEngineeringEngine:
         return df
 
 
-class PreprocessFeature:
-
-    def __init__(
-        self,
-        data: pd.DataFrame,
-        experiment,
-        time_series: bool = False,
-        date_column: str | None = None,
-        group_column: str | None = None,
-        val_size: float = 0.2,
-        test_size: float = 0.2,
-        columns_pca: list[str] = [],
-        pca_temporal: list[dict[str, list[str]]] = [],
-        pca_cross_sectional: list[dict[str, list[str]]] = [],
-        columns_onehot: list[str] = [],
-        columns_binary: list[str] = [],
-        columns_ordinal: list[str] = [],
-        columns_frequency: list[str] = [],
-        target_numbers: list = [],
-        target_clf: list = [],
-        **kwargs,
-    ):
-        self.data = data
-        self.data.columns = self.data.columns.str.upper()
-
-        self.experiment = experiment
-        self.columns_pca = [col.upper() for col in columns_pca]
-        self.pca_temporal = pca_temporal
-        self.pca_cross_sectional = pca_cross_sectional
-        self.columns_onehot = [col.upper() for col in columns_onehot]
-        self.columns_binary = [col.upper() for col in columns_binary]
-        self.columns_ordinal = [col.upper() for col in columns_ordinal]
-        self.columns_frequency = [col.upper() for col in columns_frequency]
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.val_size = val_size
-        self.test_size = test_size
-
-        self.experiment_dir = self.experiment.path
-        self.experiment_id = self.experiment.id
-        self.data_dir = f"{self.experiment_dir}/data"
-        self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-
-    def run(self):
-        # Split
-        train, val, test = (
-            self.train_val_test_split_time_series()
-            if self.time_series
-            else self.train_val_test_split(
-                stratify_col=f"TARGET_{self.target_numbers[0]}"
-            )
-        )  # TODO: only stratifying first target for now
-
-        # PCA
-        train, pcas = self.add_pca_features(train)
-        val, _ = self.add_pca_features(val, pcas=pcas)
-        test, _ = self.add_pca_features(test, pcas=pcas)
-
-        joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
-
-        train, pcas_cross_sectional = self.add_pca_feature_cross_sectional(train)
-        val, _ = self.add_pca_feature_cross_sectional(val, pcas=pcas_cross_sectional)
-        test, _ = self.add_pca_feature_cross_sectional(test, pcas=pcas_cross_sectional)
-
-        joblib.dump(
-            pcas_cross_sectional, f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
-        )
-
-        train, pcas_temporal = self.add_pca_feature_temporal(train)
-        val, _ = self.add_pca_feature_temporal(val, pcas=pcas_temporal)
-        test, _ = self.add_pca_feature_temporal(test, pcas=pcas_temporal)
-
-        joblib.dump(pcas_temporal, f"{self.preprocessing_dir}/pcas_temporal.pkl")
-
-        # Save all features before encoding
-        joblib.dump(
-            list(train.columns),
-            f"{self.preprocessing_dir}/all_features_before_encoding.pkl",
-        )
-
-        # Encoding
-        train, transformer = self.encode_categorical_features(train)
-        val, _ = self.encode_categorical_features(
-            val,
-            transformer=transformer,
-        )
-        test, _ = self.encode_categorical_features(
-            test,
-            transformer=transformer,
-        )
-
-        joblib.dump(self.data, f"{self.data_dir}/full.pkl")
-        joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
-        summary = summarize_dataframe(train)
-        summary.to_csv(f"{self.experiment_dir}/feature_summary.csv", index=False)
-
-        # Save all features before selection
-        joblib.dump(
-            list(train.columns),
-            f"{self.preprocessing_dir}/all_features_before_selection.pkl",
-        )
-
-        return train, val, test
-
-    def inference(self):
-        data = self.data
-
-        # PCA
-        if os.path.exists(f"{self.preprocessing_dir}/pcas.pkl"):
-            pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
-            data, _ = self.add_pca_features(data, pcas=pcas)
-
-        if os.path.exists(f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"):
-            pcas_cross_sectional = joblib.load(
-                f"{self.preprocessing_dir}/pcas_cross_sectional.pkl"
-            )
-            data, _ = self.add_pca_feature_cross_sectional(
-                data, pcas=pcas_cross_sectional
-            )
-
-        if os.path.exists(f"{self.preprocessing_dir}/pcas_temporal.pkl"):
-            pcas_temporal = joblib.load(f"{self.preprocessing_dir}/pcas_temporal.pkl")
-            data, _ = self.add_pca_feature_temporal(data, pcas=pcas_temporal)
-
-        # Encoding
-        transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
-        data, _ = self.encode_categorical_features(
-            data,
-            transformer=transformer,
-        )
-        return data
-
-    def train_val_test_split_time_series(self):
-        df: pd.DataFrame = self.data
-        date_column: str = self.date_column
-        group_column: str = self.group_column
-        val_size: float = self.val_size
-        test_size: float = self.test_size
-
-        if not date_column:
-            ValueError("Please specify a date_column for time series")
-
-        if group_column:
-            df.sort_values([date_column, group_column], inplace=True)
-        else:
-            df.sort_values(date_column, inplace=True)
-
-        dates = df[date_column].unique()
-
-        val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
-        test_first_id = int(len(dates) * (1 - test_size)) + 1
-
-        train = df[df[date_column].isin(dates[:val_first_id])]
-        val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
-        test = df[df[date_column].isin(dates[test_first_id:])]
-
-        dates = {}
-        for name, data in zip(["train", "val", "test"], [train, val, test]):
-            dates[f"{name}_start_date"] = (
-                data[date_column].map(pd.Timestamp.fromordinal).iat[0]
-            )
-            dates[f"{name}_end_date"] = (
-                data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
-            )
-
-            logger.info(
-                f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
-            )
-
-        Experiment.upsert(
-            match_fields=["id"],
-            id=self.experiment_id,
-            train_size=len(train),
-            val_size=len(val),
-            test_size=len(test),
-            **dates,
-        )
-        return (
-            train.reset_index(drop=True),
-            val.reset_index(drop=True),
-            test.reset_index(drop=True),
-        )
-
-    def train_val_test_split(
-        self,
-        random_state: int = 42,
-        stratify_col: str | None = None,
-    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """
-        Splits a DataFrame into train, validation, and test sets.
-
-        Parameters:
-            df (pd.DataFrame): The full experiment
-            val_size (float): Proportion of validation set (default 0.1)
-            test_size (float): Proportion of test set (default 0.1)
-            random_state (int): Random seed for reproducibility
-            stratify_col (str | None): Optional column to stratify on (for classification tasks)
-
-        Returns:
-            Tuple of (train_df, val_df, test_df)
-        """
-        df: pd.DataFrame = self.data
-        val_size: float = self.val_size
-        test_size: float = self.test_size
-
-        stratify_vals = df[stratify_col] if stratify_col else None
-
-        # First split: train + (val + test)
-        train, temp = train_test_split(
-            df,
-            test_size=val_size + test_size,
-            random_state=random_state,
-            stratify=stratify_vals,
-        )
-
-        # Adjust stratify target for val/test split
-        stratify_temp = temp[stratify_col] if stratify_col else None
-
-        # Compute val and test sizes relative to temp
-        val_ratio = val_size / (val_size + test_size)
-
-        val, test = train_test_split(
-            temp,
-            test_size=1 - val_ratio,
-            random_state=random_state,
-            stratify=stratify_temp,
-        )
-
-        for name, data in zip(["train", "val", "test"], [train, val, test]):
-            logger.info(f"{data.shape} {name} data")
-
-        Experiment.upsert(
-            match_fields=["id"],
-            id=self.experiment_id,
-            train_size=len(train),
-            val_size=len(val),
-            test_size=len(test),
-        )
-        return (
-            train.reset_index(drop=True),
-            val.reset_index(drop=True),
-            test.reset_index(drop=True),
-        )
-
-    # embedding and pca
-    def add_pca_features(
-        self, df: pd.DataFrame, n_components: int = 5, pcas=None
-    ) -> tuple[pd.DataFrame, dict]:
-        """
-        Adds PCA components as new columns to a DataFrame from a column containing numpy arrays.
-        NEED TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
-
-        Parameters:
-            df (pd.DataFrame): Input DataFrame
-            column (str): Name of the column containing np.ndarray
-            n_components (int): Number of PCA components to keep
-
-        Returns:
-            pd.DataFrame: DataFrame with new PCA columns added
-        """
-        columns: list[str] = self.columns_pca
-
-        pcas_dict = {}
-        for column in columns:
-            # Convert text to embeddings if necessary
-            if not isinstance(df[column].iloc[0], (np.ndarray, list)):
-                sentences = df[column].astype(str).tolist()
-                logger.info(
-                    f"Total sentences to embed for column {column}: {len(sentences)}"
-                )
-
-                # Truncate each sentence
-                truncate_sentences = [truncate_text(sentence) for sentence in sentences]
-
-                # embedding
-                embedding_matrix = get_openai_embeddings(truncate_sentences)
-            else:
-                logger.info(f"Column {column} is already embeddings")
-                # Stack the vectors into a 2D array
-                embedding_matrix = np.vstack(df[column].values)
-
-            # Apply PCA
-            if pcas:
-                pca = pcas[column]
-                pca_features = pca.transform(embedding_matrix)
-            else:
-                pca = PCA(n_components=n_components)
-                pca_features = pca.fit_transform(embedding_matrix)
-
-            # Add PCA columns
-            for i in range(n_components):
-                df[f"{column}_pca_{i+1}"] = pca_features[:, i]
-
-            # Drop the original column
-            df.drop(column, axis=1, inplace=True)
-            pcas_dict.update({column: pca})
-
-        return df, pcas_dict
-
-    def add_pca_feature_cross_sectional(
-        self,
-        df: pd.DataFrame,
-        *,
-        n_components: int = 5,
-        pcas: dict[str, Pipeline] | None = None,  # si fourni: transform only
-        impute_strategy: str = "median",
-        standardize: bool = True,
-    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
-        """
-        Construit un pivot (index=index_col, columns=columns_col, values=value_col),
-        fit (ou réutilise) un Pipeline Imputer(+Scaler)+PCA, puis merge les scores
-        (par index_col) dans df. Renvoie (df_avec_features, pipe).
-        """
-
-        pcas_dict = {}
-
-        for pca_cross_sectional in self.pca_cross_sectional:
-            name, index_col, columns_col, value_col = (
-                pca_cross_sectional[k] for k in ("name", "index", "columns", "value")
-            )
-            prefix = f"CS_PC_{name}"
-
-            pivot = df.pivot_table(
-                index=index_col, columns=columns_col, values=value_col
-            ).sort_index()
-
-            # Pipeline à réutiliser entre train et test
-            if pcas is None:
-                steps = [("imputer", SimpleImputer(strategy=impute_strategy))]
-                if standardize:
-                    steps.append(
-                        ("scaler", StandardScaler(with_mean=True, with_std=True))
-                    )
-                pca = PCA(n_components=n_components, random_state=0)
-                steps.append(("pca", pca))
-                pipe = Pipeline(steps)
-                pipe.fit(pivot)  # <- fit sur TRAIN uniquement
-            else:
-                pipe = pcas[name]  # <- TEST : on réutilise le pipe existant
-
-            scores = pipe.transform(pivot)  # shape: (n_index, n_components)
-            cols = [f"{prefix}_{i}" for i in range(n_components)]
-            scores_df = pd.DataFrame(scores, index=pivot.index, columns=cols)
-
-            df = df.merge(scores_df.reset_index(), on=index_col, how="left")
-            pcas_dict.update({name: pipe})
-
-        return df, pcas_dict
-
-    # ----------------- 2) PCA TEMPORELLE (liste de colonnes lags) ----------------
-    def add_pca_feature_temporal(
-        self,
-        df: pd.DataFrame,
-        *,
-        n_components: int = 5,
-        pcas: dict[str, Pipeline] | None = None,  # si fourni: transform only
-        impute_strategy: (
-            str | None
-        ) = None,  # None = on exige toutes les colonnes présentes
-        standardize: bool = True,
-    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
-        """
-        Applique une PCA sur une matrice (rows = lignes df, cols = lags).
-        Fit le Pipeline sur TRAIN si pcas=None; sinon, utilise pcas et fait transform.
-        Ajoute les colonnes f"{prefix}_{i}" dans df. Renvoie (df, pipe).
-        """
-        pcas_dict = {}
-
-        for pca_temporal in self.pca_temporal:
-            name, cols = (pca_temporal[k] for k in ("name", "columns"))
-            prefix = f"TMP_PC_{name}"
-
-            # Masque des lignes utilisables
-            if impute_strategy is None:
-                mask = (
-                    df[cols].notna().all(axis=1)
-                )  # on n'impute pas → lignes complètes
-                X_fit = df.loc[mask, cols]
-            else:
-                mask = df[cols].notna().any(axis=1)  # on imputera → au moins une valeur
-                X_fit = df.loc[mask, cols]
-
-            # Pipeline
-            if pcas is None:
-                steps = []
-                if impute_strategy is not None:
-                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
-                if standardize:
-                    steps.append(
-                        ("scaler", StandardScaler(with_mean=True, with_std=True))
-                    )
-                pca = PCA(n_components=n_components, random_state=0)
-                steps.append(("pca", pca))
-                pipe = Pipeline(steps)
-                if not X_fit.empty:
-                    pipe.fit(X_fit)  # <- fit sur TRAIN uniquement
-            else:
-                pipe = pcas[name]  # <- TEST
-
-            # Transform uniquement sur lignes valides (mask)
-            if not df.loc[mask, cols].empty:
-                Z = pipe.transform(df.loc[mask, cols])
-                for i in range(n_components):
-                    df.loc[mask, f"{prefix}_{i}"] = Z[:, i]
-            else:
-                # crée les colonnes vides si aucune ligne valide (cohérence de schéma)
-                for i in range(n_components):
-                    df[f"{prefix}_{i}"] = pd.NA
-
-            pcas_dict.update({name: pipe})
-
-        return df, pcas_dict
-
-    # encoding categorical features
-    def encode_categorical_features(
-        self,
-        df: pd.DataFrame,
-        transformer: ColumnTransformer | None = None,
-    ) -> tuple[pd.DataFrame, ColumnTransformer]:
-        """
-        Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
-
-        Parameters:
-            df (pd.DataFrame): Input DataFrame
-            columns_onehot (list[str]) Creates one binary column per category forLow-cardinality categorical features
-            columns_binary (list[str]) Converts categories into binary and splits bits across columns for Mid-to-high cardinality (e.g., 10–100 unique values)
-            columns_ordinal (list[str]) Assigns integer ranks to categories When order matters (e.g., low < medium < high)
-            columns_frequency (list[str]) Replaces each category with its frequency count, normalized to proportion. High-cardinality features with meaning in frequency
-            transformer (ColumnTransformer, optional): if provided, applies transform only
-
-        Returns:
-            tuple: (transformed DataFrame, ColumnTransformer)
-        """
-        columns_onehot: list[str] = self.columns_onehot
-        columns_binary: list[str] = self.columns_binary
-        columns_ordinal: list[str] = self.columns_ordinal
-        columns_frequency: list[str] = self.columns_frequency
-
-        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
-        y = df.loc[:, df.columns.str.contains("^TARGET_")]
-        save_in_db = False
-
-        all_columns = (
-            columns_onehot + columns_binary + columns_ordinal + columns_frequency
-        )
-
-        if transformer:
-            transformed = transformer.transform(X)
-        else:
-            transformer = ColumnTransformer(
-                transformers=[
-                    (
-                        "onehot",
-                        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
-                        columns_onehot,
-                    ),
-                    (
-                        "ordinal",
-                        OrdinalEncoder(
-                            handle_unknown="use_encoded_value", unknown_value=-1
-                        ),
-                        columns_ordinal,
-                    ),
-                    ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
-                    ("freq", CountEncoder(normalize=True), columns_frequency),
-                ],
-                remainder="passthrough",
-            )
-            transformed = transformer.fit_transform(X)
-            save_in_db = True
-
-        # Build output column names
-        column_names = []
-
-        if columns_onehot:
-            column_names.extend(
-                transformer.named_transformers_["onehot"]
-                .get_feature_names_out(columns_onehot)
-                .tolist()
-            )
-
-        if columns_ordinal:
-            column_names.extend(columns_ordinal)
-
-        if columns_binary:
-            column_names.extend(
-                transformer.named_transformers_["binary"]
-                .get_feature_names_out(columns_binary)
-                .tolist()
-            )
-
-        if columns_frequency:
-            column_names.extend(columns_frequency)
-
-        # Add passthrough (non-encoded) columns
-        passthrough_columns = [col for col in X.columns if col not in all_columns]
-        column_names.extend(passthrough_columns)
-
-        X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
-
-        # Try to convert columns to best possible dtypes
-        X_transformed = X_transformed.convert_dtypes()
-
-        # Insert features in db
-        if save_in_db:
-            # Get feature types from transformed data
-            categorical_features, numerical_features = get_features_by_types(
-                X_transformed
-            )
-
-            # Get column names from DataFrames
-            cat_feature_names = categorical_features.columns.tolist()
-            num_feature_names = numerical_features.columns.tolist()
-
-            # Combine all feature names and their types
-            all_feature_names = cat_feature_names + num_feature_names
-            all_feature_types = ["categorical"] * len(cat_feature_names) + [
-                "numerical"
-            ] * len(num_feature_names)
-
-            # Upsert features in bulk if we have any features
-            if all_feature_names:
-                Feature.upsert_bulk(
-                    match_fields=["name"],
-                    name=all_feature_names,
-                    type=all_feature_types,
-                )
-
-            # Upsert targets in bulk
-            target_names = y.columns.tolist()
-            target_types = [
-                (
-                    "classification"
-                    if int(target.split("_")[1]) in self.target_clf
-                    else "regression"
-                )
-                for target in target_names
-            ]
-
-            Target.upsert_bulk(
-                match_fields=["name"], name=target_names, type=target_types
-            )
-
-            # Get all the upserted objects
-            targets = Target.filter(name__in=target_names)
-
-            # Update experiment with targets
-            experiment = Experiment.get(self.experiment_id)
-            if experiment:
-                experiment.targets = targets
-                experiment.save()
-
-        return pd.concat([X_transformed, y], axis=1), transformer
-
-
 # analysis & utils
-def summarize_dataframe(
-    df: pd.DataFrame, sample_categorical_threshold: int = 15
-) -> pd.DataFrame:
-    summary = []
-
-    def is_hashable_series(series: pd.Series) -> bool:
-        try:
-            _ = series.dropna().unique()
-            return True
-        except TypeError:
-            return False
-
-    df = convert_object_columns_that_are_numeric(df)
-    df = df.convert_dtypes()
-
-    for col in df.columns:
-        total_missing = df[col].isna().sum()
-        col_data = df[col].dropna()
-        dtype = col_data.dtype
-
-        if col_data.empty:
-            summary.append(
-                {
-                    "Column": col,
-                    "Dtype": dtype,
-                    "Type": "unknown",
-                    "Detail": "No non-null values",
-                    "Missing": total_missing,
-                }
-            )
-            continue
-
-        # Case 1: Numeric columns
-        if pd.api.types.is_numeric_dtype(col_data):
-            unique_vals = col_data.nunique()
-
-            if set(col_data.unique()).issubset({0, 1}):
-                col_type = "binary-categorical"
-                detail = "0/1 values only"
-            elif (
-                pd.api.types.is_integer_dtype(col_data)
-                and unique_vals <= sample_categorical_threshold
-            ):
-                col_type = "multi-categorical"
-                top_vals = col_data.value_counts().head(10)
-                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
-            else:
-                col_type = "numeric"
-                q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
-                detail = (
-                    f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
-                    f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
-                )
-
-        # Case 2: Object or other hashable columns
-        elif is_hashable_series(col_data):
-            unique_vals = col_data.nunique()
-            if unique_vals <= sample_categorical_threshold:
-                col_type = "object-categorical"
-                top_vals = col_data.value_counts().head(10)
-                detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
-            else:
-                col_type = "high-cardinality-categorical"
-                detail = f"{unique_vals} unique values"
-
-        # Case 3: Unusable columns
-        else:
-            col_type = "non-hashable"
-            detail = f"Non-hashable type: {type(col_data.iloc[0])}"
-
-        summary.append(
-            {
-                "Column": col,
-                "Dtype": dtype,
-                "Type": col_type,
-                "Detail": detail,
-                "Missing": total_missing,
-            }
-        )
-
-    return pd.DataFrame(summary)
-
-
 def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> list:
     """
     Detect object columns that can be safely converted to numeric (float or int).